From: cmzmasek@gmail.com Date: Wed, 9 Feb 2011 01:23:45 +0000 (+0000) Subject: initial commit X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;ds=sidebyside;h=7f39f89a230adf39809c80c9540c2f5fe33f3b48;p=jalview.git initial commit --- diff --git a/forester/archive/RIO/C/Makefile b/forester/archive/RIO/C/Makefile new file mode 100644 index 0000000..552adfe --- /dev/null +++ b/forester/archive/RIO/C/Makefile @@ -0,0 +1,16 @@ +# Makefile for bootstrap_cz +# Last modified 06/07/01 + +# For GNU gcc +CFLAGS = -O2 -Wall -pedantic + +# This one specifies the "cc" C compiler +#CC = cc $(CFLAGS) +# +# To use "gcc" instead +CC = gcc $(CFLAGS) + + +bootstrap_cz: bootstrap_cz.c + $(CC) -o bootstrap_cz bootstrap_cz.c + diff --git a/forester/archive/RIO/C/bootstrap_cz.c b/forester/archive/RIO/C/bootstrap_cz.c new file mode 100644 index 0000000..6f59a30 --- /dev/null +++ b/forester/archive/RIO/C/bootstrap_cz.c @@ -0,0 +1,484 @@ +/* +# bootstrap_cz +# ------------ +# Copyright (C) 1999-2002 Washington University School of Medicine +# and Howard Hughes Medical Institute +# All rights reserved +# +# Author: Christian M. Zmasek +# zmasek@genetics.wustl.edu +# http://www.genetics.wustl.edu/eddy/people/zmasek/ +# +# Created: 06/06/01 +# +# Last modified: 01/27/02 +# +# Purpose: +# Bootstrap resamples an alignment in PHYLIP sequential format <bootstraps> times. +# Bootstrapping is not done randomly but according to a BSP (bootstrap +# positions) file. +# The BSP file can be created with the Perl program "bootstrap_cz.pl" +# in mode 0. +# This program has the same functionality as "bootstrap_cz.pl" in mode 1. +# Sequence names are normalized to LENGTH_OF_NAME characters. +# The output alignment is in PHYLIP's sequential or interleaved format. +# (These two are the same in this case, since all the seqs will be one +# line in length (no returns in seq).) 
+# +# Usage: bootstrap_cz +# [number of processors] +*/ + + + +#include +#include +#include +#include + + +#define LENGTH_OF_NAME 26 + + +static char **names, /* This stores the sequence names */ + **sequences; /* This stores the sequences */ +static int number_of_seqs, + number_of_colm; + + +void readInAlignmnet( const char * ); +void bootstrapAccordingToBSPfile( int, const char *, const char * ); +void checkForMemAllocFailure( void * ); +int fileExists( const char *); +void errorInCommandLine(); + + + + + +/* Reads the seqs and seq-names from inalignment */ +/* into **sequences and **sequences. */ +/* Inalignment must be in PHYLIP sequential format. */ +/* Last modified: 06/25/01 */ +void readInAlignment( const char *inalignment ) { + + FILE *inalignment_fp = NULL; + char *str = NULL; + int max_length = 0; + register char c = ' '; + register int i = 0, + ii = 0, + z = 0, + seq = 0; + + number_of_seqs = 0; + number_of_colm = 0; + + inalignment_fp = fopen( inalignment, "r" ); + if ( inalignment_fp == NULL ) { + printf( "\nbootstrap_cz: Error: Could not open alignment file for reading.\n" ); + exit( -1 ); + } + + if ( fscanf( inalignment_fp, "%d", &number_of_seqs ) != 1 ) { + printf( "\nbootstrap_cz: Error: Could not read in number of seqs.\n" ); + exit( -1 ); + } + if ( fscanf( inalignment_fp, "%d", &number_of_colm ) != 1 ) { + printf( "\nbootstrap_cz: Error: Could not read in number of columns.\n" ); + exit( -1 ); + } + + names = malloc( number_of_seqs * sizeof( char *) ); + checkForMemAllocFailure( names ); + for ( i = 0; i < number_of_seqs; ++i ) { + names[ i ] = malloc( LENGTH_OF_NAME * sizeof( char ) ); + checkForMemAllocFailure( names[ i ] ); + } + + sequences = malloc( number_of_seqs * sizeof( char * ) ); + checkForMemAllocFailure( sequences ); + for ( i = 0; i < number_of_seqs; ++i ) { + sequences[ i ] = malloc( number_of_colm * sizeof( char ) ); + checkForMemAllocFailure( sequences[ i ] ); + } + + max_length = ( 30 * LENGTH_OF_NAME ) + number_of_colm; + + 
str = malloc( max_length * sizeof( char * ) ); + checkForMemAllocFailure( str ); + + while ( fgets( str, max_length, inalignment_fp ) != NULL ) { + + if ( !isspace( str[ 0 ] ) != 0 ) { + + i = 0; + while ( str[ i ] != ' ' ) { + names[ seq ][ i ] = str[ i ]; + i++; + } + + ii = i; + while ( ii < LENGTH_OF_NAME ) { + names[ seq ][ ii ] = ' '; + ii++; + } + + z = 0; + + while ( str[ i ] != '\n' && str[ i ] != '\r' && str[ i ] != '\0' ) { + c = str[ i ]; + if ( c != ' ' ) { + if ( isupper( c ) != 0 || c == '-' ) { + sequences[ seq ][ z++ ] = c; + } + else { + printf( "\nbootstrap_cz: Error: Sequence must be represented by uppercase letters A-Z and \"-\" only.\n" ); + exit( -1 ); + } + } + i++; + if ( z > number_of_colm ) { + printf( "\nbootstrap_cz: Error: line in \"%s\" contains more than %d columns.\n", + inalignment, number_of_colm ); + exit( -1 ); + } + } + if ( z != number_of_colm ) { + printf( "\nbootstrap_cz: Error: line in \"%s\" contains a incorrect number of columns.\n", + inalignment ); + exit( -1 ); + } + + seq++; + + if ( seq > number_of_seqs ) { + printf( "\nbootstrap_cz: Error: \"%s\" contains more than %d seqs.\n", + inalignment, number_of_seqs ); + exit( -1 ); + } + } + + + } /* while ( fgets ) */ + + if ( seq != number_of_seqs ) { + printf( "\nbootstrap_cz: Error: \"%s\" contains a incorrect number of seqs.\n", + inalignment ); + exit( -1 ); + } + + fclose( inalignment_fp ); + + return; + +} /* readInAlignment */ + + + +/* Rearrenges the aa in sequences according to */ +/* the bsp (bootstrap positions) file bsp_file. 
*/
/* Writes the results to outfile.                    */
/*                                                   */
/* bootstraps: number of resampled alignments to     */
/*             write                                 */
/* bsp_file:   text file holding, per bootstrap,     */
/*             number_of_colm whitespace-separated   */
/*             column indices; each index is used    */
/*             directly as a 0-based offset into a   */
/*             stored sequence                       */
/* outfile:    file to create (truncated if present) */
/*                                                   */
/* For each bootstrap one PHYLIP block is written:   */
/* a header line with number_of_seqs and             */
/* number_of_colm, then one line per sequence (name  */
/* of LENGTH_OF_NAME chars followed by the resampled */
/* residues).                                        */
/* Exits with -1 if a file cannot be opened, if the  */
/* bsp file holds too few or too many numbers, if a  */
/* position lies outside the alignment, or if        */
/* writing the outfile fails.                        */
/* Last modified: 06/07/01                           */
void bootstrapAccordingToBSPfile( int         bootstraps,
                                  const char *bsp_file,
                                  const char *outfile ) {

    FILE *bsp_file_fp  = NULL,
         *outfile_fp   = NULL;
    int  *positions    = NULL,
          p            = 0,
          write_failed = 0;
    int   boot         = 0,
          seq          = 0,
          i            = 0;

    positions = malloc( ( size_t ) number_of_colm * sizeof *positions );
    checkForMemAllocFailure( positions );

    bsp_file_fp = fopen( bsp_file, "r" );
    if ( bsp_file_fp == NULL ) {
        printf( "\nbootstrap_cz: Error: could not open file \"%s\" for reading.\n",
                bsp_file );
        exit( -1 );
    }

    outfile_fp = fopen( outfile, "w" );
    if ( outfile_fp == NULL ) {
        printf( "\nbootstrap_cz: Error: could not open file \"%s\" for writing.\n",
                outfile );
        exit( -1 );
    }

    for ( boot = 0; boot < bootstraps; ++boot ) {

        /* Read the column order for this bootstrap. */
        for ( i = 0; i < number_of_colm; ++i ) {
            if ( fscanf( bsp_file_fp, "%d", &p ) != 1 ) {
                printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment.\n",
                        bsp_file );
                exit( -1 );
            }
            /* p indexes sequences[ seq ][ ... ] below; reject anything */
            /* that would read outside of the stored alignment.         */
            if ( p < 0 || p >= number_of_colm ) {
                printf( "\nbootstrap_cz: Error: file \"%s\" contains position %d which is outside of the alignment (0 to %d).\n",
                        bsp_file, p, number_of_colm - 1 );
                exit( -1 );
            }
            positions[ i ] = p;
        }

        fprintf( outfile_fp, " %d %d\n", number_of_seqs, number_of_colm );
        for ( seq = 0; seq < number_of_seqs; ++seq ) {
            /* Names are fixed-width and not '\0'-terminated. */
            fwrite( names[ seq ], sizeof( char ), LENGTH_OF_NAME, outfile_fp );
            for ( i = 0; i < number_of_colm; ++i ) {
                fputc( sequences[ seq ][ positions[ i ] ], outfile_fp );
            }
            fputc( '\n', outfile_fp );
        }
    }

    /* Now, the bsp file must not contain any more numbers */
    if ( fscanf( bsp_file_fp, "%d", &p ) == 1 ) {
        printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment (too long).\n",
                bsp_file );
        printf( ">%d<\n", p );
        printf( "number of seqs=%d\n", number_of_seqs );
        exit( -1 );
    }

    fclose( bsp_file_fp );

    /* A failed write may only surface on the stream's error flag or */
    /* when the buffer is flushed by fclose, so look at both.        */
    write_failed = ferror( outfile_fp );
    if ( fclose( outfile_fp ) != 0 ) {
        write_failed = 1;
    }
    if ( write_failed ) {
        printf( "\nbootstrap_cz: Error: could not write to file \"%s\".\n",
                outfile );
        exit( -1 );
    }

    free( positions );
    return;

} /* bootstrapAccordingToBSPfile */



/* Rearranges the aa in sequences according to       */
/* the bsp (bootstrap positions) file bsp_file.
*/
/* Writes the results to outfile                     */
/* split over several files: the bootstraps are      */
/* distributed over "processors" files named         */
/* outfile0, outfile1, ... .  With                   */
/* block_size = bootstraps / processors, the first   */
/* ( bootstraps % processors ) files receive         */
/* block_size + 1 bootstraps, the others block_size. */
/*                                                   */
/* bootstraps: number of resampled alignments        */
/* processors: number of output files; values < 1    */
/*             are treated as 1                      */
/* bsp_file:   bootstrap positions file (0-based     */
/*             column indices, number_of_colm per    */
/*             bootstrap)                            */
/* outfile:    base name of the output files         */
/*                                                   */
/* Exits with -1 if a file cannot be opened, if an   */
/* output file already exists, if the bsp file does  */
/* not match the alignment, if a position lies       */
/* outside the alignment, or if writing fails.       */
/* Last modified: 01/25/02                           */
void bootstrapAccordingToBSPfileP( int         bootstraps,
                                   int         processors,
                                   const char *bsp_file,
                                   const char *outfile ) {

    FILE *bsp_file_fp   = NULL,
         *outfile_fp    = NULL;
    int  *positions     = NULL,
          p             = 0,
          write_failed  = 0;
    char *outfile_      = NULL;
    int   boot          = 0,
          seq           = 0,
          i             = 0,
          j             = 0,
          z             = 0,
          flag          = 0;
    int   block_size    = 0,
          larger_blocks = 0;

    /* Guard the division below. */
    if ( processors < 1 ) {
        processors = 1;
    }

    block_size    = bootstraps / processors;
    larger_blocks = bootstraps - ( block_size * processors ); /* number of blocks which have a size of
                                                                 block_size + 1 */

    positions = malloc( ( size_t ) number_of_colm * sizeof *positions );
    checkForMemAllocFailure( positions );

    /* Base name plus decimal file index; 20 extra chars are more */
    /* than any int printed with %d needs.                        */
    outfile_ = malloc( ( strlen( outfile ) + 20 ) * sizeof *outfile_ );
    checkForMemAllocFailure( outfile_ );

    bsp_file_fp = fopen( bsp_file, "r" );
    if ( bsp_file_fp == NULL ) {
        printf( "\nbootstrap_cz: Error: could not open file \"%s\" for reading.\n",
                bsp_file );
        exit( -1 );
    }

    j    = -1;  /* bootstraps already written to the current file, minus one */
    flag = 1;   /* 1: a new output file has to be opened                     */
    z    = 0;   /* index of the next output file                             */

    for ( boot = 0; boot < bootstraps; ++boot ) {

        /* Read the column order for this bootstrap. */
        for ( i = 0; i < number_of_colm; ++i ) {
            if ( fscanf( bsp_file_fp, "%d", &p ) != 1 ) {
                printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment.\n",
                        bsp_file );
                exit( -1 );
            }
            /* p indexes sequences[ seq ][ ... ] below; reject anything */
            /* that would read outside of the stored alignment.         */
            if ( p < 0 || p >= number_of_colm ) {
                printf( "\nbootstrap_cz: Error: file \"%s\" contains position %d which is outside of the alignment (0 to %d).\n",
                        bsp_file, p, number_of_colm - 1 );
                exit( -1 );
            }
            positions[ i ] = p;
        }

        j++;

        /* Has the current file received its share of bootstraps? */
        if ( larger_blocks > 0 ) {
            if ( j >= block_size + 1 ) {
                flag = 1;
                j    = 0;
                larger_blocks--;
            }
        }
        else if ( j >= block_size ) {
            flag = 1;
            j    = 0;
        }

        if ( flag == 1 ) {
            if ( outfile_fp != NULL ) {
                /* outfile_ still holds the name of the file being closed. */
                write_failed = ferror( outfile_fp );
                if ( fclose( outfile_fp ) != 0 ) {
                    write_failed = 1;
                }
                if ( write_failed ) {
                    printf( "\nbootstrap_cz: Error: could not write to file \"%s\".\n",
                            outfile_ );
                    exit( -1 );
                }
                outfile_fp = NULL;
            }
            sprintf( outfile_, "%s%d", outfile, z++ );
            if ( fileExists( outfile_ ) == 1 ) {
                printf( "\nbootstrap_cz: Error: outfile \"%s\" already exists.\n",
                        outfile_ );
                exit( -1 );
            }
            outfile_fp = fopen( outfile_, "w" );
            if ( outfile_fp == NULL ) {
                printf( "\nbootstrap_cz: Error: could not open file \"%s\" for writing.\n",
                        outfile_ );
                exit( -1 );
            }
            flag = 0;
        }

        fprintf( outfile_fp, " %d %d\n", number_of_seqs, number_of_colm );
        for ( seq = 0; seq < number_of_seqs; ++seq ) {
            /* Names are fixed-width and not '\0'-terminated. */
            fwrite( names[ seq ], sizeof( char ), LENGTH_OF_NAME, outfile_fp );
            for ( i = 0; i < number_of_colm; ++i ) {
                fputc( sequences[ seq ][ positions[ i ] ], outfile_fp );
            }
            fputc( '\n', outfile_fp );
        }
    }

    /* Now, the bsp file must not contain any more numbers */
    if ( fscanf( bsp_file_fp, "%d", &p ) == 1 ) {
        printf( "\nbootstrap_cz: Error: file \"%s\" does not correspond to alignment (too long).\n",
                bsp_file );
        printf( ">%d<\n", p );
        printf( "number of seqs=%d\n", number_of_seqs );
        exit( -1 );
    }

    fclose( bsp_file_fp );

    /* No file was ever opened if bootstraps < 1. */
    if ( outfile_fp != NULL ) {
        write_failed = ferror( outfile_fp );
        if ( fclose( outfile_fp ) != 0 ) {
            write_failed = 1;
        }
        if ( write_failed ) {
            printf( "\nbootstrap_cz: Error: could not write to file \"%s\".\n",
                    outfile_ );
            exit( -1 );
        }
    }

    free( positions );
    free( outfile_ );

    return;

} /* bootstrapAccordingToBSPfileP */




/* Exits (with a message on stdout and status -1)    */
/* if p is NULL; returns normally otherwise.         */
/* Meant to be called on the result of malloc.       */
/* Last modified: 06/06/01                           */
void checkForMemAllocFailure( void *p ) {
    if ( p == NULL ) {
        printf( "\nbootstrap_cz: Memory allocation failed.\n" );
        exit( -1 );
    }
} /* checkForMemAllocFailure */



/* Returns 1 if filename can be opened.              */
/* Returns 0 otherwise.
*/ +/* Last modified: 06/07/01 */ +int fileExists( const char *filename ) { + FILE *fp = NULL; + if ( ( fp = fopen( filename, "r" ) ) != NULL ) { + fclose( fp ); + return 1; + } + else { + return 0; + } +} /* fileExists */ + + + +void errorInCommandLine() { + printf( "\n" ); + printf( " bootstrap_cz version 3.000\n" ); + printf( " ---------------------------\n\n" ); + printf( " Purpose:\n" ); + printf( " Bootstrap resamples an alignment in PHYLIP sequential format times.\n" ); + printf( " Bootstrapping is not done randomly but according to a BSP (bootstrap\n" ); + printf( " positions) file.\n" ); + printf( " The BSP file can be created with the Perl program \"bootstrap_cz.pl\"\n" ); + printf( " in mode 0.\n" ); + printf( " This prgram has the same functionality as \"bootstrap_cz.pl\" in mode 1.\n" ); + printf( " Sequence names are normalized to LENGTH_OF_NAME characters.\n" ); + printf( " The output alignment is in PHYLIP's sequential or interleaved format.\n" ); + printf( " (These two are the same in this case, since all the seqs will be one\n" ); + printf( " line in length (no returns in seq).)\n\n" ); + printf( " Usage: bootstrap_cz \n" ); + printf( " [number of processors]\n\n" ); +} /* errorInCommandLine */ + + + +int main( int argc, char *argv[] ) { + + char *inalign = NULL, + *bsp_file = NULL, + *outfile = NULL; + int bootstraps = 0, + processors = 0; + + + if ( argc != 5 && argc != 6 ) { + errorInCommandLine(); + exit( -1 ); + } + + bootstraps = atoi( argv[ 1 ] ); + inalign = argv[ 2 ]; + bsp_file = argv[ 3 ]; + outfile = argv[ 4 ]; + + if ( bootstraps < 1 ) { + errorInCommandLine(); + exit( -1 ); + } + + if ( argc == 6 ) { + processors = atoi( argv[ 5 ] ); + if ( processors < 1 ) { + errorInCommandLine(); + exit( -1 ); + } + if ( processors > bootstraps ) { + processors = bootstraps; + } + } + + if ( argc == 5 && fileExists( outfile ) == 1 ) { + printf( "\nbootstrap_cz: Error: outfile \"%s\" already exists.\n", + outfile ); + exit( -1 ); + } + + 
readInAlignment( inalign ); + + if ( argc == 5 ) { + bootstrapAccordingToBSPfile( bootstraps, + bsp_file, + outfile ); + } + else { + bootstrapAccordingToBSPfileP( bootstraps, + processors, + bsp_file, + outfile ); + } + + return 0; + +} /* main */ diff --git a/forester/archive/RIO/COPYRIGHT b/forester/archive/RIO/COPYRIGHT new file mode 100644 index 0000000..eae6883 --- /dev/null +++ b/forester/archive/RIO/COPYRIGHT @@ -0,0 +1,32 @@ +RIO - Phylogenomic Protein Function Analysis +Copyright (C) 2002 Washington University School of Medicine +and Howard Hughes Medical Institute +All rights reserved +---------------------------------------------------------------- + +This pipeline of programs is free software. You can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +In other words, you are free to modify, copy, or redistribute this +source code and its documentation in any way you like, but you must +distribute all derivative versions as free software under the same +terms that I've provided my code to you (i.e. the GNU General Public +License). This precludes any use of the code in proprietary or +commercial software unless your source code is made freely available. + +In contrast to RIO as a whole, the JAVA programs in directory "java" +are under the BSD license. + +This software is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this RIO release, in the file LICENSE; if not, write to +the Free Software Foundation, Inc., 675 Mass. Ave, Cambridge, MA 02139 +USA. 
+ + diff --git a/forester/archive/RIO/IMPORTANT_NOTICE b/forester/archive/RIO/IMPORTANT_NOTICE new file mode 100644 index 0000000..9040c6f --- /dev/null +++ b/forester/archive/RIO/IMPORTANT_NOTICE @@ -0,0 +1,48 @@ +RIO - Phylogenomic Protein Function Analysis +---------------------------------------------------------------- + + +RIO contains modified versions of programs written by others: + +1. TREE-PUZZLE + (Strimmer, K., and A. von Haeseler. 1996. Quartet puzzling: A quartet maximum + likelihood method for reconstructing tree topologies. Mol. Biol. Evol. 13: 964-969.) + + +2. PHYLIP + (Felsenstein, J. 1993. PHYLIP (Phylogeny Inference Package) version 3.5c. + Distributed by the author. + Department of Genetics, University of Washington, Seattle.) + + +Please note: +------------ + +1. RIO uses modifications of these programs, the original versions were + written by others: + + TREE-PUZZLE: Heiko A. Schmidt, Korbinian Strimmer, Martin Vingron, Arndt von Haeseler + + PHYLIP: Joseph Felsenstein, see also http://evolution.genetics.washington.edu/phylip/credits.html + + +2. The programs in the RIO distribution have been modified specifically + to work within RIO and cannot be used for any other purpose. + + +3. I am responsible for any accidentally introduced errors. + + +4. The original can be downloaded from the following sites: + TREE-PUZZLE: http://www.tree-puzzle.de/ + PHYLIP: http://evolution.genetics.washington.edu/phylip.html + + +RIO also contains hmmer (version 2.2g). +hmmer can be downloaded at: http://hmmer.wustl.edu/ + + + +Christian Zmasek, 03/09/02 + + diff --git a/forester/archive/RIO/LICENSE b/forester/archive/RIO/LICENSE new file mode 100644 index 0000000..a43ea21 --- /dev/null +++ b/forester/archive/RIO/LICENSE @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. 
+ 675 Mass Ave, Cambridge, MA 02139, USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + Appendix: How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) 19yy <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. 
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/forester/archive/RIO/RIO_INSTALL b/forester/archive/RIO/RIO_INSTALL new file mode 100644 index 0000000..7d4f2ec --- /dev/null +++ b/forester/archive/RIO/RIO_INSTALL @@ -0,0 +1,459 @@ + +RIO - Phylogenomic Protein Function Analysis + +____________________________________________ + + + + +RIO/FORESTER : http://www.genetics.wustl.edu/eddy/forester/ +RIO webserver: http://www.rio.wustl.edu/ + +Reference: Zmasek C.M. and Eddy S.R. (2002) + RIO: Analyzing proteomes by automated phylogenomics using + resampled inference of orthologs. + BMC Bioinformatics 3:14 + http://www.biomedcentral.com/1471-2105/3/14/ + + It is highly recommended that you read this paper before + installing and/or using RIO. (Included in the RIO + distribution as PDF: "RIO.pdf".) + + +Preconditions: A Unix system, Java 1.2 or higher, Perl, gcc or cc, + ... and some experience with Perl and Unix. + + + +1. Compilation +______________ + + +This describes how to compile the various components of RIO. 
+ + + "gunzip RIO1.x.tar.gz" + + "tar -xvf RIO1.x.tar" + + + + +in directory "RIO1.x/C": + + "make" + + + +in directory "RIO1.x/hmmer" (version of HMMER is "2.2g"): + +(if you already have a local copy of HMMER 2.2g installed, this step +is not necessary, but in this case you need to change variables "$HMMALIGN", +"$HMMSEARCH", "$HMMBUILD", "$HMMFETCH", and "$SFE" to point to the +corresponding HMMER programs) + + "./configure" + + "make" + + + +in directory "RIO1.x/java" (requires JDK 1.2 or greater): + + "javac forester/tools/*java" + + "javac ATVapp.java" + + + + +in directory "RIO1.x/puzzle_dqo": + + "./configure" + + "make" + + + +in directory "RIO1.x/puzzle_mod": + + "./configure" + + "make" + + + +in directory "RIO1.x/phylip_mod/src": + + "make install" + + + + +2. Setting the variables in "RIO1.x/perl/rio_module.pm" +_______________________________________________________ + + +Most global variables used in "RIO1.x/perl/rio.pl" are set in +the perl module "RIO1.x/perl/rio_module.pm". +This module pretty much "controls everything". + +It is necessary to set the variables which point to: + +-- the rio directory itself: $PATH_TO_FORESTER + + (example: $PATH_TO_FORESTER = "/home/czmasek/linux/RIO1.1/";) + + +-- your Java virtual machine: $JAVA + + (example: $JAVA = "/home/czmasek/linux/j2sdk1.4.0/bin/java";) + + +-- a directory where temporary files can be created: $TEMP_DIR_DEFAULT + + + + Example: + Now that $PATH_TO_FORESTER, $JAVA, $TEMP_DIR_DEFAULT are set, + it is possible to run rio.pl based on the example precalculated distances + in "/example_data/": + + % RIO1.1/perl/rio.pl 1 A=aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out0 p I + + To use RIO to analyze your protein sequences, please continue setting + variables and preparing data......
+ + + +-- your local copy of the Pfam database (see http://pfam.wustl.edu/) + (if only precalculated distances are being used, these variables do not + matter): + + $PFAM_FULL_DIRECTORY -- the directory containing the "full" alignments + (Pfam-A.full) see below (3.) + + $PFAM_SEED_DIRECTORY -- the directory containing the "seed" alignments + (Pfam-A.seed) see below (3.) + + $PFAM_HMM_DB -- the Pfam HMM library file (Pfam_ls) + see below (3.) + + +-- $TREMBL_ACDEOS_FILE and $SWISSPROT_ACDEOS_FILE: see below (4. and 5.). + + +-- list of species (SWISS-PROT codes) which can be analyzed: $SPECIES_NAMES_FILE + (for most purposes $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-4_species_list" + should be sufficient, hence this variable does not necessarily need to be changed) + + +-- a default species tree in NHX format: $SPECIES_TREE_FILE_DEFAULT + (for most purposes $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-4.nhx" + should be sufficient, hence this variable does not necessarily need to be changed) + + +-- Only if precalculated distances are being used: + $MATRIX_FOR_PWD, $RIO_PWD_DIRECTORY, $RIO_BSP_DIRECTORY, + $RIO_NBD_DIRECTORY, $RIO_ALN_DIRECTORY, and $RIO_HMM_DIRECTORY: + please see below (6.) + + + + + + +IMPORTANT: Need to redo steps 3., 4., 5., and 6. if species + in the master species tree and/or the species list + are added and/or changed or if a new version of Pfam is used!! + + + + + +3. Downloading and processing of Pfam +_____________________________________ + + + +Please note: Even if you already have a local copy of the +Pfam database, you still need to perform steps c. through k. + +a. download + - "Pfam_ls" (PFAM HMM library, glocal alignment models) + - "Pfam-A.full" (full alignments of the curated families) + - "Pfam-A.seed" (seed alignments of the curated families) + [and ideally "prior.tar.gz"] + from http://pfam.wustl.edu/ or ftp.genetics.wustl.edu/pub/eddy/pfam-x/ + +b. "gunzip" and "tar -xvf" these downloaded files, if necessary + +c.
create a new directory named "Full" and move "Pfam-A.full" into it + +d. in directory "Full" execute "RIO1.x/perl/pfam2slx.pl Pfam-A.full" + +e. set variable $PFAM_FULL_DIRECTORY in "RIO1.x/perl/rio_module.pm" + to point to this "Full" directory + +f. create a new directory named "Seed" and move "Pfam-A.seed" into it + +g. in directory "Seed" execute "RIO1.x/perl/pfam2slx.pl Pfam-A.seed" + +h. set variable $PFAM_SEED_DIRECTORY in "RIO1.x/perl/rio_module.pm" + to point to this "Seed" directory + +i. execute "RIO1.x/hmmer/binaries/hmmindex Pfam_ls" (in same + directory as "Pfam_ls") resulting in "Pfam_ls.ssi" + +j. set environment variable HMMERDB to point to the directory where + "Pfam_ls" and "Pfam_ls.ssi" reside + (for example "setenv HMMERDB /home/czmasek/PFAM7.3/") + +k. set variable $PFAM_HMM_DB in "RIO1.x/perl/rio_module.pm" + to point to the "Pfam_ls" file + (for example $PFAM_HMM_DB = "/home/czmasek/PFAM7.3/Pfam_ls";) + + + + +4. Extraction of ID, DE, and species from a SWISS-PROT sprot.dat file +_____________________________________________________________________ + + +This creates the file from which RIO will get the sequence descriptions for +sequences from SWISS-PROT. +(RIO1.x/data/ does not contain an example for this, since SWISS-PROT is +copyrighted.) + + +a. download SWISS-PROT "sprotXX.dat" from + "ftp://ca.expasy.org/databases/swiss-prot/release/" + +b. "extractSWISS-PROT.pl [species list]" + + ("extractSWISS-PROT.pl" is in "RIO1.x/perl") + + example: + "extractSWISS-PROT.pl sprot40.dat sp40_ACDEOS RIO1.x/data/species/tree_of_life_bin_1-4_species_list" + +c. the output file should be placed in "RIO1.x/data" and the + variable $SWISSPROT_ACDEOS_FILE in "RIO1.x/perl/rio_module.pm" should point + to this output. + + + + +5. 
Extraction of AC, DE, and species from a TrEMBL trembl.dat file +__________________________________________________________________ + + +This creates the file from which RIO will get the sequence descriptions for +sequences from TrEMBL. +(RIO1.x/data/ already contains an example: "trembl20_ACDEOS_1-4") + +a. download TrEMBL "trembl.dat.gz" from + "ftp://ca.expasy.org/databases/sp_tr_nrdb/" + +b. "gunzip trembl.dat.gz" + +c. "extractTrembl.pl [species list]" + + ("extractTrembl.pl" is in "RIO1.x/perl") + + example: + "extractTrembl.pl trembl.dat trembl17.7_ACDEOS_1-4 RIO1.x/data/species/tree_of_life_bin_1-4_species_list" + +d. the output file should be placed in "RIO1.x/data/" and the + variable $TREMBL_ACDEOS_FILE in "RIO1.x/perl/rio_module.pm" should point + to this output. + + + +Now, you could go directly to 7. to run the examples...... + + + +6. Precalculation of pairwise distances (optional): pfam2pwd.pl +_______________________________________________________________ + + +This step is of course only necessary if you want to use RIO on +precalculated pairwise distances. The precalculation is time consuming +(range of one or two weeks on ten processors). +It is best to run it on a few machines, dividing up the input data. + +The program to do this, is "RIO1.x/perl/pfam2pwd.pl". + +Please note: "pfam2pwd.pl" creates a logfile in the same directory + where it places the pairwise distance output ($MY_RIO_PWD_DIRECTORY). + + + +The following variables in "RIO1.x/perl/pfam2pwd.pl" need to be set +("pfam2pwd.pl" gets most of its information from "rio_module.pm"): + + +"$MY_PFAM_FULL_DIRECTORY": + This is the directory where the Pfam full alignments reside, processed + as described in 3.a to 3.d. + + + +"$ALGNS_TO_USE_LIST_FILE": + If left empty, all alignments in $MY_PFAM_FULL_DIRECTORY are being + used to calculate pairwise distances from. + If this points to a file listing names of Pfam alignments, + only those listed are being used.
+ The file can either be a simple newline-delimited list, or can have + the same format as the "Summary of changes" list + ("FI PF03214 RGP NEW SEED HMM_ls HMM_fs FULL DESC") + which is part of the Pfam distribution. + One purpose of this is to use the list of "too large" alignments + in the logfile produced by "pfam2pwd.pl" to run "pfam2pwd.pl" with + a smaller species list (as can be set with "$MY_SPECIES_NAMES_FILE") + on large alignments. + + + +"$MY_SPECIES_NAMES_FILE" -- Dealing with too large alignments: + + This is most important. It determines the species whose sequences + are being used (sequences from species not listed in $MY_SPECIES_NAMES_FILE + are ignored). Normally, one would use the same list as RIO uses + ($SPECIES_NAMES_FILE in "rio_module.pm"): + + my $MY_SPECIES_NAMES_FILE = $SPECIES_NAMES_FILE; + + For certain large families (such as protein kinases), one must use + a species file which contains fewer species in order to be able to finish + the calculations in reasonable time: + + my $MY_SPECIES_NAMES_FILE = $PATH_TO_FORESTER."data/tree_of_life_bin_1-4_species_list_NO_RAT_RABBIT_MONKEYS_APES_SHEEP_GOAT_HAMSTER"; + + An additional way to reduce the number of sequences in an alignment is + to only use sequences originating from SWISS-PROT. This is done by + placing the following line of code into pfam2pwd.pl: + + $TREMBL_ACDEOS_FILE = $PATH_TO_FORESTER."data/NO_TREMBL"; + + + +"$MY_RIO_PWD_DIRECTORY", +"$MY_RIO_BSP_DIRECTORY", +"$MY_RIO_NBD_DIRECTORY", +"$MY_RIO_ALN_DIRECTORY", +"$MY_RIO_HMM_DIRECTORY": + These determine where to place the output. + After all the data has been calculated, the corresponding variables + in RIO1.x/perl/rio_module.pm ("$RIO_PWD_DIRECTORY", etc.) need to be set + so that they point to the appropriate values. Having different variables + allows one to precalculate distances and at the same time use RIO on + previously precalculated distances. + + + +"$MY_TEMP_DIR": + A directory to create temporary files in.
+ + + +"$MIN_SEQS": + Alignments in which the number of sequences after pruning (determined + by "$MY_SPECIES_NAMES_FILE") is lower than $MIN_SEQS, are ignored + (no calculation of pwds). + + + +"$MAX_SEQS": + Alignments in which the number of sequences after pruning (determined + by "$MY_SPECIES_NAMES_FILE") is greater than $MAX_SEQS, are ignored + (no calculation of pwds). + + + +"$MY_SEED": + Seed for the random number generator for bootstrapping (must be 4n+1). + + + +"$MY_MATRIX": + This is used to choose the model to be used for the (ML) + distance calculation: + 0 = JTT + 2 = BLOSUM 62 + 3 = mtREV24 + 5 = VT + 6 = WAG + PAM otherwise + After all the data has been calculated, variable "$MATRIX_FOR_PWD" + in RIO1.x/perl/rio_module.pm needs to be set to the same value. + + + +Once pairwise distances are calculated, the following variables in +"rio_module.pm" need to be set accordingly: +$MATRIX_FOR_PWD : corresponds to $MY_MATRIX in pfam2pwd.pl +$RIO_PWD_DIRECTORY : corresponds to $MY_RIO_PWD_DIRECTORY in pfam2pwd.pl +$RIO_BSP_DIRECTORY : corresponds to $MY_RIO_BSP_DIRECTORY in pfam2pwd.pl +$RIO_NBD_DIRECTORY : corresponds to $MY_RIO_NBD_DIRECTORY in pfam2pwd.pl +$RIO_ALN_DIRECTORY : corresponds to $MY_RIO_ALN_DIRECTORY in pfam2pwd.pl +$RIO_HMM_DIRECTORY : corresponds to $MY_RIO_HMM_DIRECTORY in pfam2pwd.pl +...of course, if Pfam has been updated, the corresponding variables in rio_module.pm +($PFAM_FULL_DIRECTORY, etc.) need to be updated, too. + + + + + + +IMPORTANT: Need to redo steps 3., 4., 5., and 6. if species + in the master species tree and/or the species list + are added and/or changed or if a new version of Pfam is used! + + + + +7. 
Example of a phylogenomic analysis using "rio.pl" +____________________________________________________ + + +Without using precalculated distances (for this, all the variables above +need to point to the correct locations, in particular to your local and processed +Pfam database): + + % RIO1.1/perl/rio.pl 3 A=/path/to/my/pfam/Full/aconitase H=aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out3 p I C E + + + +Without using precalculated distances (for this, all the variables above +need to point to the correct locations, in particular to your local and processed +Pfam database) using a query sequence which is already in the alignment: + + % RIO1.1/perl/rio.pl 4 A=/path/to/my/pfam/Full/aconitase N=LEU2_LACLA/5-449 O=out4 p I C E + + + +Using the example precalculated distances in "/example_data/" +($RIO_PWD_DIRECTORY, etc. need to point to $PATH_TO_FORESTER."example_data/"): + + % RIO1.1/perl/rio.pl 1 A=aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out1 p I C E + + + +Using a query sequence which is already in the precalculated distances in "/example_data/" +($RIO_PWD_DIRECTORY, etc.
need to point to $PATH_TO_FORESTER."example_data/"): + + % RIO1.1/perl/rio.pl 2 A=aconitase N=LEU2_LACLA/5-449 O=out2 p I C E + + + +for detailed instructions on how to use rio.pl see the source code, +or type "rio.pl" without any arguments + + + + +Christian Zmasek +zmasek@genetics.wustl.edu +05/26/02 + diff --git a/forester/archive/RIO/data.tar.bz2 b/forester/archive/RIO/data.tar.bz2 new file mode 100644 index 0000000..d5a86af Binary files /dev/null and b/forester/archive/RIO/data.tar.bz2 differ diff --git a/forester/archive/RIO/docs/RIO.pdf b/forester/archive/RIO/docs/RIO.pdf new file mode 100644 index 0000000..546b02d Binary files /dev/null and b/forester/archive/RIO/docs/RIO.pdf differ diff --git a/forester/archive/RIO/others/hmmer/00README b/forester/archive/RIO/others/hmmer/00README new file mode 100644 index 0000000..71bff7e --- /dev/null +++ b/forester/archive/RIO/others/hmmer/00README @@ -0,0 +1,57 @@ +HMMER - profile hidden Markov models for biological sequence analysis +Version 2.2 (August 2001) +Copyright (C) 1992-2001 Washington University School of Medicine +------------------------------------------------------------------ + +o About this software... + HMMER is an implementation of profile HMM methods for + sensitive database searches using multiple sequence alignments as queries. + + Basically, you give HMMER a multiple sequence alignment as input; + it builds a statistical model called a "hidden Markov model" + which you can then use as a query into a sequence database + to find (and/or align) additional homologues of the sequence family. + +o Getting HMMER + WWW home: http://hmmer.wustl.edu/ + Distribution: ftp://ftp.genetics.wustl.edu/pub/eddy/hmmer/ + +o Installing HMMER + See the file INSTALL for brief instructions. + See the chapter Installation in the HMMER User's Guide for more + detailed instructions. 
+ + You should also read the following files: + COPYING -- copyright notice, and information on the free software license + LICENSE -- Full text of the GNU Public License, version 2 (see COPYING) + + If you have obtained HMMER from Washington University under + a non-GPL license as part of a special licensing agreement, + COPYING and LICENSE will refer to the terms of that agreement. + +o Getting started with HMMER + The User's Guide is in Userguide/Userguide.pdf [Adobe PDF format]. + It is also available online as hypertext: + http://hmmer.wustl.edu/hmmer-html/ + + A quick tutorial intro is given as the first chapter of + the User's Guide. + +o Registering HMMER + Email eddy@genetics.wustl.edu to register and get on my + infrequent mailing list of HMMER news, patches, and updates. + +o Reporting bugs + These programs are under active development. Though this + release has been tested and appears to be stable, bugs may crop up. If + you use these programs, please help me out and e-mail me with + suggestions, comments, and bug reports. (eddy@genetics.wustl.edu) + + +Sean Eddy +Howard Hughes Medical Institute and Dept. of Genetics +Washington University School of Medicine, St. Louis, Missouri, USA +------------------------------------------------------------------- + + + diff --git a/forester/archive/RIO/others/hmmer/COPYRIGHT b/forester/archive/RIO/others/hmmer/COPYRIGHT new file mode 100644 index 0000000..f50acd7 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/COPYRIGHT @@ -0,0 +1,36 @@ +HMMER - Biological sequence analysis with profile hidden Markov models +Copyright (C) 1992-2001 Washington University School of Medicine + +This suite of programs is free software. You can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. 
+ +In other words, you are free to modify, copy, or redistribute this +source code and its documentation in any way you like, but you must +distribute all derivative versions as free software under the same +terms that I've provided my code to you (i.e. the GNU General Public +License). This precludes any use of the code in proprietary or +commercial software unless your source code is made freely available. + +If you wish to use HMMER code under a different Open Source license +that's not compatible with the GPL (like the Artistic License, BSD +license, or the Netscape Public License), please contact me +(eddy@genetics.wustl.edu) for permission. + +Incorporation into commercial software under non-GPL terms is possible +by obtaining a specially licensed version from Washington University +School of Medicine. Contact Jack Pincus (jhpincus@cris.com) to arrange +licensing terms. + +This software is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this HMMER release, in the file LICENSE; if not, write to +the Free Software Foundation, Inc., 675 Mass. Ave, Cambridge, MA 02139 +USA. + + diff --git a/forester/archive/RIO/others/hmmer/INSTALL b/forester/archive/RIO/others/hmmer/INSTALL new file mode 100644 index 0000000..268dd24 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/INSTALL @@ -0,0 +1,45 @@ +Brief installation instructions for HMMER 2.2 +SRE, Sun Aug 5 16:22:52 2001 +------------------------------------------ + +For a source distribution (hmmer-2.2.tar.gz): + + uncompress hmmer-2.2.tar.gz Uncompresses the archive. + tar xf hmmer-2.2.tar Unpacks the archive. (makes a new directory, hmmer-2.2) + cd hmmer-2.2 Moves into the distribution toplevel directory. + ./configure Configures the software for your system. 
+ make Builds the binaries. + make check Runs the test suite to test the new binaries. + make install Installs the software. (You may need to be root.) + + It should build cleanly on just about any UNIX machine. + +For a binary distribution (hmmer-2.2.bin.-.tar.gz), +for example, the sun-solaris distro: + + uncompress hmmer-2.2.bin.sun-solaris.tar.Z + tar xf hmmer-2.2.bin.sun-solaris.tar + cd hmmer-2.2 + ./configure + make install + + Note that "make" and "make check" aren't necessary, since you + have a precompiled distribution. + + If your machine doesn't have a C compiler, the ./configure will + fail, and you won't be able to do a "make install". This is + an artifact of the way I have the configure script built. + If this happens, just copy the man pages and binaries wherever + you want them, for instance: + + cp Man/* /usr/local/man/man1/ + cp binaries/* /usr/local/bin/ + + HMMER doesn't depend on any external data files, so the installation is + simple - the binaries are free standing, and you can install + them anywhere you like. + +For more detail, see the Installation chapter in the HMMER User's +Guide. + + diff --git a/forester/archive/RIO/others/hmmer/LICENSE b/forester/archive/RIO/others/hmmer/LICENSE new file mode 100644 index 0000000..a43ea21 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/LICENSE @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 675 Mass Ave, Cambridge, MA 02139, USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users.
This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. 
We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. 
+ +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) 
+ +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. 
Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. 
Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + Appendix: How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. 
It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) 19yy <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary.
Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/forester/archive/RIO/others/hmmer/Makefile.in b/forester/archive/RIO/others/hmmer/Makefile.in new file mode 100644 index 0000000..a8e4cb4 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/Makefile.in @@ -0,0 +1,284 @@ +################################################################# +# Makefile for HMMER: Main project directory: installation, documentation +# CVS $Id: Makefile.in,v 1.1.1.1 2005/03/22 08:33:51 cmzmasek Exp $ +########### +# HMMER - Biological sequence analysis with profile HMMs +# Copyright (C) 1992-1999 Washington University School of Medicine +# All Rights Reserved +# +# This source code is distributed under the terms of the +# GNU General Public License. See the files COPYING and LICENSE +# for details. +########### + +# On most Unices, you can build the package without modifying anything +# by just typing "./configure; make". +# +# You may want to modify the following make variables: +# BINDIR - where the executables will be installed by a 'make install' +# MANDIR - where the man pages will be installed by a 'make install' +# CC - which compiler to use +# CFLAGS - compiler flags to use + +# where you want things installed +# Sort of uses GNU coding standards. ${prefix} might be /usr/local. +# ${exec_prefix} gives you some flexibility for installing architecture +# dependent files (e.g. 
the programs): an example ${exec_prefix} might be +# /nfs/share/irix64/ +# +prefix = @prefix@ +exec_prefix = @exec_prefix@ +BINDIR = @bindir@ +MANDIR = @mandir@ + +# how to install the man pages; +# cp is generally fine, unless you preformat your pages. +# +INSTMAN = cp +MANSUFFIX = 1 + +# your compiler and compiler flags +# +CC = @CC@ +CFLAGS = @CFLAGS@ + +####### +## You should not need to modify below this line. +## Much of it is concerned with maintenance of the development version +## and building the release (indeed, several commands will only work in +## St. Louis) +####### +SHELL = /bin/sh +BASENAME = "hmmer" +PACKAGE = "HMMER" +RELEASE = "2.2g" +RELCODE = "hmmer2_2g" +RELEASEDATE = "August 2001" +COPYRIGHT = "Copyright \(C\) 1992-2001 HHMI/Washington University School of Medicine" +LICENSE = "Freely distributed under the GNU General Public License \(GPL\)" +LICENSETAG = gnu +COMPRESS = gzip + +# The program lists below for HMMER and SQUID are not necessarily +# a complete manifest. They are the list of stable programs that the +# package will install. There must be a man page for each one of them +# in the appropriate places (documentation/man for HMMER, squid/Man +# for the SQUID library). 
+# +PROGS = hmmalign\ + hmmbuild\ + hmmcalibrate\ + hmmconvert\ + hmmemit\ + hmmfetch\ + hmmindex\ + hmmpfam\ + hmmsearch + +PVMPROGS = @PVMPROGS@ + +SQUIDPROGS = afetch\ + alistat\ + seqstat\ + sfetch\ + sindex\ + shuffle\ + sreformat + +# all: Compile everything (except the testsuite), +# and stick the supported programs in binaries/ +# +all: version.h + @if test -d binaries; then\ + echo "You appear to already have a compiled HMMER distribution.";\ + echo "You don't need to make.";\ + echo "";\ + echo "If I'm wrong, it's because I'm only looking to see if you have";\ + echo "a 'binaries' directory; do 'make distclean' to revert HMMER";\ + echo "to a pristine source distribution.";\ + else\ + (cd squid; make CC="$(CC)" CFLAGS="$(CFLAGS)"; make module);\ + (cd src; make CC="$(CC)" CFLAGS="$(CFLAGS)"; make module);\ + mkdir binaries;\ + for prog in $(PROGS) $(PVMPROGS); do\ + mv src/$$prog binaries/;\ + done;\ + for prog in $(SQUIDPROGS); do\ + mv squid/$$prog binaries/;\ + done;\ + fi + +# version.h: +# create the version.h file that will define stamps used by +# squidcore.c's Banner(), which is called by all executables to +# print a standard package/copyright/license banner; +# then puts copies of version.h in all directories that are +# going to need it -- HMMER src/ and any modules of mine that +# also produce installed executables (squid/) +# +version.h: + echo "/* version.h -- automatically generated by a Makefile. DO NOT EDIT. */" > version.h + echo "#define PACKAGE \"$(PACKAGE)\"" >> version.h + echo "#define RELEASE \"$(RELEASE)\"" >> version.h + echo "#define RELEASEDATE \"$(RELEASEDATE)\"" >> version.h + echo "#define COPYRIGHT \"$(COPYRIGHT)\"" >> version.h + echo "#define LICENSE \"$(LICENSE)\"" >> version.h + cp version.h squid/ + cp version.h src/ + +# check: compiles and runs test suite in testsuite/ +# These are public tests, distributed with the package. 
+# +check: squid/libsquid.a src/libhmmer.a + (cd testsuite; make CC="$(CC)" CFLAGS="$(CFLAGS)") + (cd testsuite; make check) + +squid/libsquid.a: version.h + (cd squid; make CC="$(CC)" CFLAGS="$(CFLAGS)"; make module) + +src/libhmmer.a: version.h + (cd src; make CC="$(CC)" CFLAGS="$(CFLAGS)"; make module) + + +# install: installs the binaries in BINDIR/ +# installs man pages in MANDIR/man1/ (e.g. if MANSUFFIX is 1) +install: + for file in $(PROGS) $(SQUIDPROGS) $(PVMPROGS); do\ + cp binaries/$$file $(BINDIR)/;\ + done + for file in hmmer $(PROGS) $(SQUIDPROGS); do\ + $(INSTMAN) documentation/man/$$file.man $(MANDIR)/man$(MANSUFFIX)/$$file.$(MANSUFFIX);\ + done + for file in $(SQUIDPROGS); do\ + $(INSTMAN) squid/Man/$$file.man $(MANDIR)/man$(MANSUFFIX)/$$file.$(MANSUFFIX);\ + done + +# "make clean" removes almost everything except configuration files +# and binaries. +clean: + -rm -f *.o *~ Makefile.bak core TAGS gmon.out + (cd src; make clean) + (cd squid; make clean) + (cd testsuite; make clean) + +# "make distclean" leaves a pristine source distribution. +# +distclean: + -rm -rf binaries + -rm config.cache config.log config.status + -rm version.h + make clean + (cd src; make distclean) + (cd squid; make distclean) + (cd testsuite; make distclean) + -rm Makefile + +# verify: consistency checks on the package +# These are private tests, not distributed with HMMER +verify: + @echo Checking options for consistency and documentation... + @for prog in $(PROGS) $(SQUIDPROGS); do\ + ./checkoptions.pl $$prog;\ + done + +# doc: build the Userguide and on-line manual +# +doc: + (cd Userguide; make) + +# dist: build a new distribution directory in hmmer-$RELEASE +# Exports from the CVS repository. +# tags RCS files with $(RELCODE) for later reconstruction +# squid RCS files are tagged with hmmer$(RELCODE). +# Adds a license statement to each file that has a @ LICENSE @ line. +# Virtually identical to squid's make dist -- keep them in sync! 
+dist: + # Delete old versions of the same release + # + @if test -d ${BASENAME}-$(RELEASE); then rm -rf ${BASENAME}-$(RELEASE); fi + @if test -e ${BASENAME}-$(RELEASE).tar; then rm -f ${BASENAME}-$(RELEASE).tar; fi + @if test -e ${BASENAME}-$(RELEASE).tar.Z; then rm -f ${BASENAME}-$(RELEASE).tar.Z; fi + @if test -e ${BASENAME}-$(RELEASE).tar.gz; then rm -f ${BASENAME}-$(RELEASE).tar.gz; fi + # + # CVS tag and extract. -c: make sure we committed; + # -F: allow more than one "make dist" per rel + # prep: must have done "cvs commit", and CVSROOT must be set + # We also need the squid library, so tag and export it too. + # + cvs tag -F $(RELCODE) + cvs export -r $(RELCODE) -d ${BASENAME}-${RELEASE} ${BASENAME} + cvs rtag -F ${RELCODE} squid + (cd ${BASENAME}-${RELEASE}; cvs export -r ${RELCODE} -d squid squid) + # + # Make the configure scripts from configure.in + # + (cd ${BASENAME}-${RELEASE}; autoconf) + (cd ${BASENAME}-${RELEASE}/squid; autoconf) + # + # Include the appropriate license files and release notes + # + cp Licenses/LICENSE.$(LICENSETAG) ${BASENAME}-$(RELEASE)/LICENSE + cp Licenses/COPYING.$(LICENSETAG) ${BASENAME}-$(RELEASE)/COPYRIGHT + -cp Release-Notes/RELEASE-$(RELEASE) ${BASENAME}-$(RELEASE)/NOTES + # + # Attach license stamps on files that need 'em (replace LICENSE keyword) + # licenseadd.pl is basically harmless, so we can overannotate here by + # trying to licenseadd everything. + # + find ${BASENAME}-${RELEASE} -type f -exec licenseadd.pl Licenses/$(LICENSETAG) {} \; + # + # Compilation of the documentation. + # Documentation is not provided in source form. 
+ # + (cd documentation/userguide; make clean) + (cd documentation/userguide; make pdf) + cp -f documentation/userguide/Userguide.pdf build/hmmer-$(RELEASE)/ + # + # Remove CVS-controlled files/directories that don't belong in + # the distro + # + -rm -rf ${BASENAME}-${RELEASE}/Bugs + -rm -rf ${BASENAME}-${RELEASE}/CHECKLIST + -rm -rf ${BASENAME}-${RELEASE}/Internal-Notes + -rm -rf ${BASENAME}-${RELEASE}/Licenses + -rm -rf ${BASENAME}-${RELEASE}/MAILING_LIST + -rm -rf ${BASENAME}-${RELEASE}/Release-Notes + -rm -rf ${BASENAME}-${RELEASE}/configure.in + -rm -rf ${BASENAME}-${RELEASE}/checkoptions.pl + -rm -rf ${BASENAME}-${RELEASE}/licenseadd.pl + -rm -rf ${BASENAME}-${RELEASE}/nodebuild + -rm -rf ${BASENAME}-${RELEASE}/rootbuild + -rm -rf ${BASENAME}-${RELEASE}/buildall + -rm -rf ${BASENAME}-${RELEASE}/documentation/userguide + -rm -rf ${BASENAME}-${RELEASE}/squid/Licenses + -rm -rf ${BASENAME}-${RELEASE}/squid/LOG + -rm -rf ${BASENAME}-${RELEASE}/squid/configure.in + # + # Packaging commands + # + tar cvf ${BASENAME}-${RELEASE}.tar ${BASENAME}-${RELEASE} + $(COMPRESS) ${BASENAME}-$(RELEASE).tar + + +# make ftpdist: install FTP distribution. Assumes a "make dist" +# has already succeeded. Doesn't do the symlink; +# doesn't install the on-line manual for the Web site. +# +ftpdist: + cp -f $(READMES) $(FTPDIR) + cp -f build/hmmer-$(RELEASE)/COPYING $(FTPDIR) + cp -f build/hmmer-$(RELEASE)/LICENSETAG $(FTPDIR) + cp -f build/hmmer-$(RELEASE)/NOTES $(FTPDIR) + cp -f build/hmmer-$(RELEASE).tar.Z $(FTPDIR) + cp -f build/hmmer-$(RELEASE)-*.tar.Z $(FTPDIR) + cp -f Userguide/Userguide.ps $(FTPDIR) + cp -f Userguide/Userguide.pdf $(FTPDIR) + cp -f Userguide/hmmer-html.tar.Z $(FTPDIR) + + +# make stable: Set up the FTP site symlink to the current stable HMMER release. 
+# +stable: + ln -sf $(FTPDIR)/hmmer-$(RELEASE).tar.Z $(FTPDIR)/hmmer.tar.Z + + diff --git a/forester/archive/RIO/others/hmmer/NOTES b/forester/archive/RIO/others/hmmer/NOTES new file mode 100644 index 0000000..12c069b --- /dev/null +++ b/forester/archive/RIO/others/hmmer/NOTES @@ -0,0 +1,197 @@ +HMMER 2.2 release notes +http://hmmer.wustl.edu/ +SRE, Fri May 4 13:00:33 2001 +--------------------------------------------------------------- + +As it has been more than 2 years since the last HMMER release, this is +unlikely to be a comprehensive list of changes. + +HMMER is now maintained under CVS. Anonymous read-only access to the +development code is permitted. To download the current snapshot: + > setenv CVSROOT :pserver:anonymous@skynet.wustl.edu:/repository/sre + > cvs login + [password is "anonymous"] + > cvs checkout hmmer + > cd hmmer + > cvs checkout squid + > cvs logout + +The following programs were added to the distribution: + + - The program "afetch" can fetch an alignment from + a Stockholm format multiple alignment database (e.g. Pfam). + "afetch --index" creates the index files for such + a database. + + - The program "shuffle" makes "randomized" sequences. + It supports a variety of sequence randomization methods, + including an implementation of Altschul/Erickson's + shuffling-while-preserving-digram-composition algorithm. + + - The program "sindex" creates SSI indices from sequence + files, that "sfetch" can use to rapidly retrieve sequences + from databases. Previously, index files were constructed + with Perl scripts that were not supported as part of the + HMMER distribution. + +The following features were added: + + - hmmsearch and hmmpfam can now use Pfam GA, TC, NC cutoffs, + if these have been picked up in the HMM file (by hmmbuild). + See the --cut_ga, --cut_tc, and --cut_nc options. + + - "Stockholm format" alignments are supported, and have replaced + SELEX format as the default alignment format. 
Stockholm format + is the alignment format agreed upon by the Pfam Consortium, + providing extensible markup and annotation capabilities. HMMER + writes Stockholm format alignments by default. The program + sreformat can reformat alignments to other formats, including + Clustal and GCG MSF formats. + + - To improve robustness, particularly in high-throughput annotation + pipelines, all programs now accept an option --informat <format>, + where <format> is the name of a sequence file format (FASTA, for + example). The format autodetection code that is used by default + is almost always right, and is very helpful in interactive use + (HMMER reads almost anything without you worrying much about + format issues). --informat bypasses the autodetector, asserts + a particular format, and decreases the likelihood that HMMER + misparses a sequence file. + + - new options: + hmmpfam --acc reports HMM accession numbers instead of + HMM names in output files. [Pfam infrastructure] + + sreformat --nogap, when reformatting an alignment, + removes all columns containing any gap symbols; useful + as a prefilter for phylogenetic analysis. + + - The real software version of HMMER is logged into + the HMMER2.0 line of ASCII save files, for better + version control (e.g. bug tracking, but there are + no bugs in HMMER). + + - GCG MSF format reading/writing is now much more robust, + thanks to assistance from Steve Smith at GCG. + + - The PVM implementation of hmmcalibrate is now + parallelized in a finer grained fashion; single models + can be accelerated. (The previous version parallelized + by assigning models to processors, so could not + accelerate a single model calibration.) + + - hmmemit can now take HMM libraries as input, not just + a single HMM at a time - useful for instance for producing + "consensus sequences" for every model in Pfam with one + command.
+ +The following changes may affect HMMER-compatible software: + + - The name of the sequence retrieval program "getseq" was + changed to "sfetch" in this release. The name "getseq" + clashes with a Genetics Computer Group package program + of similar functionality. + + - The output format for the headers of hmmsearch and hmmpfam + were changed. The accessions and descriptions of query + HMMs or sequences, respectively, are reported on separate + lines. An option ("--compat") is provided for reverting + to the previous format, if you don't want to rewrite your + parser(s) right away. + + - hmmpfam now calculates E-values based on the actual + number of HMMs in the database that is searched, unless + overridden with the -Z option from the command line. + It used to use Z=59021 semi-arbitrarily to make results + jibe with a typical hmmsearch, but this just confused + people more than it helped. hmmpfam E-values will therefore + become more significant in this release by about 37x, + for a typical Pfam search (59021/1600 = 37). + +The following major bugs were fixed: + [none] + +The following minor bugs were fixed: + - more argument casting to silence compiler warnings + [M. Regelson, Paracel ] + + - a potential reentrancy problem with setting the + alphabet type in the threads version was + fixed, but this problem is unlikely to have ever affected + anyone. [M. Sievers, Paracel]. + + - fixed a bug where hmmbuild on Solaris machines would crash + when presented with an alignment with an #=ID line. + Same bug caused a crash when building a model from a single + sequence FASTA file [A. Bateman, Sanger] + + - The configure script was modified to deal better with + different vendor's implementations of pthreads, in response + to a DEC Digital UNIX compilation problem [W. Pearson, + U. Virginia] + + - Automatic sequence file format detection was slightly + improved, fixing a bug in detecting GCG-reformatted + Swissprot files [reported by J. 
Holzwarth] + + - hmmpfam-pvm and hmmindex had a bad interaction if an HMM file had + accession numbers as well as names (e.g., Pfam). The phenotype was + that hmmpfam-pvm would search each model twice: once for its name, + and once for its accession. hmmindex now uses a new + indexing scheme (SSI, replacing GSI). [multiple reports; + often manifested as a failure of the StL Pfam server to + install, because of an hmmindex --one2one option in the Makefile; this was + a local hack, never distributed in HMMER]. + + - a rare floating exception bug in ExtremeValueP() was fixed; + range-checking protections in the function were in error, and + a range error in a log() calculation appeared on + Digital Unix platforms for a *very* tiny set of scores + for any given mu, lambda. + + - The default null2 score correction was applied in + a way that was justifiable, but differed between per-seq + and per-domain scores; thus per-domain scores did not + necessarily add up to per-seq scores. In certain cases + this produced counterintuitive results. null2 is now + applied in a way that is still justifiable, and also + consistent; per-domain scores add up to the per-seq score. + [first reported by David Kerk] + + - --domE and --domT did not work correctly in hmmpfam, because + the code assumed that E-values are monotonic with score. + In some cases, this could cause HMMER to fail to report some + significant domains. [Christiane VanSchlun, GCG] + +The following obscure bugs were fixed (i.e., there were no reports of +anyone but me detecting these bugs): + + - sreformat no longer core dumps when reformatting a + single sequence to an alignment format. + + - Banner() was printing a line to stdout instead of its + file handle... but Banner is always called w/ stdout as + its filehandle in the current implementation. + [M. Regelson, Paracel] + + - .gz file reading is only supported on POSIX OS's. 
A compile + time define, SRE_STRICT_ANSI, may be defined to allow compiling + on ANSI compliant but non-POSIX operating systems. + + - Several problems with robustness w.r.t. unexpected + combinations of command line options were detected by + GCG quality control testing. [Christiane VanSchlun] + +(At least) the following projects remain incomplete: + + - Ian Holmes' posterior probability routines (POSTAL) are + partially assimilated; see postprob.c, display.c + + - CPU times can now be reported for serial, threaded, + and PVM executions; this is only supported by hmmcalibrate + right now. + + - Mixture Dirichlet priors now include some ongoing work + in collaboration with Michael Asman and Erik Sonnhammer + in Stockholm; also #=GC X-PRM, X-PRT, X-PRI support in + hmmbuild/Stockholm annotation. diff --git a/forester/archive/RIO/others/hmmer/Userguide.pdf b/forester/archive/RIO/others/hmmer/Userguide.pdf new file mode 100644 index 0000000..74eab16 Binary files /dev/null and b/forester/archive/RIO/others/hmmer/Userguide.pdf differ diff --git a/forester/archive/RIO/others/hmmer/config.guess b/forester/archive/RIO/others/hmmer/config.guess new file mode 100755 index 0000000..2960d6e --- /dev/null +++ b/forester/archive/RIO/others/hmmer/config.guess @@ -0,0 +1,951 @@ +#! /bin/sh +# Attempt to guess a canonical system name. +# Copyright (C) 1992, 93, 94, 95, 96, 97, 1998 Free Software Foundation, Inc. +# +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# Written by Per Bothner <bothner@cygnus.com>. +# The master version of this file is at the FSF in /home/gd/gnu/lib. +# +# This script attempts to guess a canonical system name similar to +# config.sub. If it succeeds, it prints the system name on stdout, and +# exits with 0. Otherwise, it exits with 1. +# +# The plan is that this can be called by configure scripts if you +# don't specify an explicit system type (host/target name). +# +# Only a few systems have been added to this list; please add others +# (but try to keep the structure clean). +# + +# This is needed to find uname on a Pyramid OSx when run in the BSD universe. +# (ghazi@noc.rutgers.edu 8/24/94.) +if (test -f /.attbin/uname) >/dev/null 2>&1 ; then + PATH=$PATH:/.attbin ; export PATH +fi + +UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown +UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown +UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown +UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown + +trap 'rm -f dummy.c dummy.o dummy; exit 1' 1 2 15 + +# Note: order is significant - the case branches are not exclusive. + +case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in + alpha:OSF1:*:*) + if test $UNAME_RELEASE = "V4.0"; then + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` + fi + # A Vn.n version is a released version. + # A Tn.n version is a released field test version. + # A Xn.n version is an unreleased experimental baselevel.
+ # 1.2 uses "1.2" for uname -r. + cat <dummy.s + .globl main + .ent main +main: + .frame \$30,0,\$26,0 + .prologue 0 + .long 0x47e03d80 # implver $0 + lda \$2,259 + .long 0x47e20c21 # amask $2,$1 + srl \$1,8,\$2 + sll \$2,2,\$2 + sll \$0,3,\$0 + addl \$1,\$0,\$0 + addl \$2,\$0,\$0 + ret \$31,(\$26),1 + .end main +EOF + ${CC-cc} dummy.s -o dummy 2>/dev/null + if test "$?" = 0 ; then + ./dummy + case "$?" in + 7) + UNAME_MACHINE="alpha" + ;; + 15) + UNAME_MACHINE="alphaev5" + ;; + 14) + UNAME_MACHINE="alphaev56" + ;; + 10) + UNAME_MACHINE="alphapca56" + ;; + 16) + UNAME_MACHINE="alphaev6" + ;; + esac + fi + rm -f dummy.s dummy + echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[VTX]//' | tr [[A-Z]] [[a-z]]` + exit 0 ;; + 21064:Windows_NT:50:3) + echo alpha-dec-winnt3.5 + exit 0 ;; + Amiga*:UNIX_System_V:4.0:*) + echo m68k-cbm-sysv4 + exit 0;; + amiga:NetBSD:*:*) + echo m68k-cbm-netbsd${UNAME_RELEASE} + exit 0 ;; + amiga:OpenBSD:*:*) + echo m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + *:[Aa]miga[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-amigaos + exit 0 ;; + arc64:OpenBSD:*:*) + echo mips64el-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + arc:OpenBSD:*:*) + echo mipsel-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + hkmips:OpenBSD:*:*) + echo mips-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + pmax:OpenBSD:*:*) + echo mipsel-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + sgi:OpenBSD:*:*) + echo mips-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + wgrisc:OpenBSD:*:*) + echo mipsel-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) + echo arm-acorn-riscix${UNAME_RELEASE} + exit 0;; + arm32:NetBSD:*:*) + echo arm-unknown-netbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` + exit 0 ;; + SR2?01:HI-UX/MPP:*:*) + echo hppa1.1-hitachi-hiuxmpp + exit 0;; + Pyramid*:OSx*:*:*|MIS*:OSx*:*:*|MIS*:SMP_DC-OSx*:*:*) + # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. 
+ if test "`(/bin/universe) 2>/dev/null`" = att ; then + echo pyramid-pyramid-sysv3 + else + echo pyramid-pyramid-bsd + fi + exit 0 ;; + NILE:*:*:dcosx) + echo pyramid-pyramid-svr4 + exit 0 ;; + sun4H:SunOS:5.*:*) + echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit 0 ;; + sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) + echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit 0 ;; + i86pc:SunOS:5.*:*) + echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit 0 ;; + sun4*:SunOS:6*:*) + # According to config.sub, this is the proper way to canonicalize + # SunOS6. Hard to guess exactly what SunOS6 will be like, but + # it's likely to be more like Solaris than SunOS4. + echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit 0 ;; + sun4*:SunOS:*:*) + case "`/usr/bin/arch -k`" in + Series*|S4*) + UNAME_RELEASE=`uname -v` + ;; + esac + # Japanese Language versions have a version number like `4.1.3-JL'. + echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` + exit 0 ;; + sun3*:SunOS:*:*) + echo m68k-sun-sunos${UNAME_RELEASE} + exit 0 ;; + sun*:*:4.2BSD:*) + UNAME_RELEASE=`(head -1 /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` + test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 + case "`/bin/arch`" in + sun3) + echo m68k-sun-sunos${UNAME_RELEASE} + ;; + sun4) + echo sparc-sun-sunos${UNAME_RELEASE} + ;; + esac + exit 0 ;; + aushp:SunOS:*:*) + echo sparc-auspex-sunos${UNAME_RELEASE} + exit 0 ;; + atari*:NetBSD:*:*) + echo m68k-atari-netbsd${UNAME_RELEASE} + exit 0 ;; + atari*:OpenBSD:*:*) + echo m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + sun3*:NetBSD:*:*) + echo m68k-sun-netbsd${UNAME_RELEASE} + exit 0 ;; + sun3*:OpenBSD:*:*) + echo m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + mac68k:NetBSD:*:*) + echo m68k-apple-netbsd${UNAME_RELEASE} + exit 0 ;; + mac68k:OpenBSD:*:*) + echo m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + mvme68k:OpenBSD:*:*) + echo 
m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + mvme88k:OpenBSD:*:*) + echo m88k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + powerpc:machten:*:*) + echo powerpc-apple-machten${UNAME_RELEASE} + exit 0 ;; + macppc:NetBSD:*:*) + echo powerpc-apple-netbsd${UNAME_RELEASE} + exit 0 ;; + RISC*:Mach:*:*) + echo mips-dec-mach_bsd4.3 + exit 0 ;; + RISC*:ULTRIX:*:*) + echo mips-dec-ultrix${UNAME_RELEASE} + exit 0 ;; + VAX*:ULTRIX*:*:*) + echo vax-dec-ultrix${UNAME_RELEASE} + exit 0 ;; + 2020:CLIX:*:*) + echo clipper-intergraph-clix${UNAME_RELEASE} + exit 0 ;; + mips:*:*:UMIPS | mips:*:*:RISCos) + sed 's/^ //' << EOF >dummy.c + int main (argc, argv) int argc; char **argv; { + #if defined (host_mips) && defined (MIPSEB) + #if defined (SYSTYPE_SYSV) + printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_SVR4) + printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) + printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); + #endif + #endif + exit (-1); + } +EOF + ${CC-cc} dummy.c -o dummy \ + && ./dummy `echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` \ + && rm dummy.c dummy && exit 0 + rm -f dummy.c dummy + echo mips-mips-riscos${UNAME_RELEASE} + exit 0 ;; + Night_Hawk:Power_UNIX:*:*) + echo powerpc-harris-powerunix + exit 0 ;; + m88k:CX/UX:7*:*) + echo m88k-harris-cxux7 + exit 0 ;; + m88k:*:4*:R4*) + echo m88k-motorola-sysv4 + exit 0 ;; + m88k:*:3*:R3*) + echo m88k-motorola-sysv3 + exit 0 ;; + AViiON:dgux:*:*) + # DG/UX returns AViiON for all architectures + UNAME_PROCESSOR=`/usr/bin/uname -p` + if [ $UNAME_PROCESSOR = mc88100 -o $UNAME_PROCESSOR = mc88110 ] ; then + if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx \ + -o ${TARGET_BINARY_INTERFACE}x = x ] ; then + echo m88k-dg-dgux${UNAME_RELEASE} + else + echo m88k-dg-dguxbcs${UNAME_RELEASE} + fi + else echo i586-dg-dgux${UNAME_RELEASE} + fi + exit 0 ;; + M88*:DolphinOS:*:*) # DolphinOS (SVR3) + echo 
m88k-dolphin-sysv3 + exit 0 ;; + M88*:*:R3*:*) + # Delta 88k system running SVR3 + echo m88k-motorola-sysv3 + exit 0 ;; + XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) + echo m88k-tektronix-sysv3 + exit 0 ;; + Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) + echo m68k-tektronix-bsd + exit 0 ;; + *:IRIX*:*:*) + echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` + exit 0 ;; + ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. + echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id + exit 0 ;; # Note that: echo "'`uname -s`'" gives 'AIX ' + i?86:AIX:*:*) + echo i386-ibm-aix + exit 0 ;; + *:AIX:2:3) + if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then + sed 's/^ //' << EOF >dummy.c + #include + + main() + { + if (!__power_pc()) + exit(1); + puts("powerpc-ibm-aix3.2.5"); + exit(0); + } +EOF + ${CC-cc} dummy.c -o dummy && ./dummy && rm dummy.c dummy && exit 0 + rm -f dummy.c dummy + echo rs6000-ibm-aix3.2.5 + elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then + echo rs6000-ibm-aix3.2.4 + else + echo rs6000-ibm-aix3.2 + fi + exit 0 ;; + *:AIX:*:4) + IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | head -1 | awk '{ print $1 }'` + if /usr/sbin/lsattr -EHl ${IBM_CPU_ID} | grep POWER >/dev/null 2>&1; then + IBM_ARCH=rs6000 + else + IBM_ARCH=powerpc + fi + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=4.${UNAME_RELEASE} + fi + echo ${IBM_ARCH}-ibm-aix${IBM_REV} + exit 0 ;; + *:AIX:*:*) + echo rs6000-ibm-aix + exit 0 ;; + ibmrt:4.4BSD:*|romp-ibm:BSD:*) + echo romp-ibm-bsd4.4 + exit 0 ;; + ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC NetBSD and + echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to + exit 0 ;; # report: romp-ibm BSD 4.3 + *:BOSX:*:*) + echo rs6000-bull-bosx + exit 0 ;; + DPX/2?00:B.O.S.:*:*) + echo m68k-bull-sysv3 + exit 0 ;; + 9000/[34]??:4.3bsd:1.*:*) + echo m68k-hp-bsd + exit 0 ;; + hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) + echo 
m68k-hp-bsd4.4 + exit 0 ;; + 9000/[34678]??:HP-UX:*:*) + case "${UNAME_MACHINE}" in + 9000/31? ) HP_ARCH=m68000 ;; + 9000/[34]?? ) HP_ARCH=m68k ;; + 9000/6?? | 9000/7?? | 9000/80[24] | 9000/8?[13679] | 9000/892 ) + sed 's/^ //' << EOF >dummy.c + #include + #include + + int main () + { + #if defined(_SC_KERNEL_BITS) + long bits = sysconf(_SC_KERNEL_BITS); + #endif + long cpu = sysconf (_SC_CPU_VERSION); + + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1"); break; + case CPU_PA_RISC2_0: + #if defined(_SC_KERNEL_BITS) + switch (bits) + { + case 64: puts ("hppa2.0w"); break; + case 32: puts ("hppa2.0n"); break; + default: puts ("hppa2.0"); break; + } break; + #else /* !defined(_SC_KERNEL_BITS) */ + puts ("hppa2.0"); break; + #endif + default: puts ("hppa1.0"); break; + } + exit (0); + } +EOF + (${CC-cc} dummy.c -o dummy 2>/dev/null ) && HP_ARCH=`./dummy` + rm -f dummy.c dummy + esac + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + echo ${HP_ARCH}-hp-hpux${HPUX_REV} + exit 0 ;; + 3050*:HI-UX:*:*) + sed 's/^ //' << EOF >dummy.c + #include + int + main () + { + long cpu = sysconf (_SC_CPU_VERSION); + /* The order matters, because CPU_IS_HP_MC68K erroneously returns + true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct + results, however. 
*/ + if (CPU_IS_PA_RISC (cpu)) + { + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; + case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; + default: puts ("hppa-hitachi-hiuxwe2"); break; + } + } + else if (CPU_IS_HP_MC68K (cpu)) + puts ("m68k-hitachi-hiuxwe2"); + else puts ("unknown-hitachi-hiuxwe2"); + exit (0); + } +EOF + ${CC-cc} dummy.c -o dummy && ./dummy && rm dummy.c dummy && exit 0 + rm -f dummy.c dummy + echo unknown-hitachi-hiuxwe2 + exit 0 ;; + 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) + echo hppa1.1-hp-bsd + exit 0 ;; + 9000/8??:4.3bsd:*:*) + echo hppa1.0-hp-bsd + exit 0 ;; + hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) + echo hppa1.1-hp-osf + exit 0 ;; + hp8??:OSF1:*:*) + echo hppa1.0-hp-osf + exit 0 ;; + i?86:OSF1:*:*) + if [ -x /usr/sbin/sysversion ] ; then + echo ${UNAME_MACHINE}-unknown-osf1mk + else + echo ${UNAME_MACHINE}-unknown-osf1 + fi + exit 0 ;; + parisc*:Lites*:*:*) + echo hppa1.1-hp-lites + exit 0 ;; + C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) + echo c1-convex-bsd + exit 0 ;; + C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit 0 ;; + C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) + echo c34-convex-bsd + exit 0 ;; + C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) + echo c38-convex-bsd + exit 0 ;; + C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) + echo c4-convex-bsd + exit 0 ;; + CRAY*X-MP:*:*:*) + echo xmp-cray-unicos + exit 0 ;; + CRAY*Y-MP:*:*:*) + echo ymp-cray-unicos${UNAME_RELEASE} + exit 0 ;; + CRAY*[A-Z]90:*:*:*) + echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ + | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ + -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ + exit 0 ;; + CRAY*TS:*:*:*) + echo t90-cray-unicos${UNAME_RELEASE} + exit 0 ;; + CRAY-2:*:*:*) + echo cray2-cray-unicos + exit 0 ;; + F300:UNIX_System_V:*:*) + FUJITSU_SYS=`uname -p | tr [A-Z] 
[a-z] | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` + echo "f300-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit 0 ;; + F301:UNIX_System_V:*:*) + echo f301-fujitsu-uxpv`echo $UNAME_RELEASE | sed 's/ .*//'` + exit 0 ;; + hp3[0-9][05]:NetBSD:*:*) + echo m68k-hp-netbsd${UNAME_RELEASE} + exit 0 ;; + hp300:OpenBSD:*:*) + echo m68k-unknown-openbsd${UNAME_RELEASE} + exit 0 ;; + sparc*:BSD/OS:*:*) + echo sparc-unknown-bsdi${UNAME_RELEASE} + exit 0 ;; + i?86:BSD/386:*:* | *:BSD/OS:*:*) + echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} + exit 0 ;; + *:FreeBSD:*:*) + echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` + exit 0 ;; + *:NetBSD:*:*) + echo ${UNAME_MACHINE}-unknown-netbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` + exit 0 ;; + *:OpenBSD:*:*) + echo ${UNAME_MACHINE}-unknown-openbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` + exit 0 ;; + i*:CYGWIN*:*) + echo ${UNAME_MACHINE}-pc-cygwin + exit 0 ;; + i*:MINGW*:*) + echo ${UNAME_MACHINE}-pc-mingw32 + exit 0 ;; + p*:CYGWIN*:*) + echo powerpcle-unknown-cygwin + exit 0 ;; + prep*:SunOS:5.*:*) + echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit 0 ;; + *:GNU:*:*) + echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` + exit 0 ;; + *:Linux:*:*) + # uname on the ARM produces all sorts of strangeness, and we need to + # filter it out. + case "$UNAME_MACHINE" in + arm* | sa110*) UNAME_MACHINE="arm" ;; + esac + + # The BFD linker knows what the default object file format is, so + # first see if it will tell us. 
+ ld_help_string=`ld --help 2>&1` + ld_supported_emulations=`echo $ld_help_string \ + | sed -ne '/supported emulations:/!d + s/[ ][ ]*/ /g + s/.*supported emulations: *// + s/ .*// + p'` + case "$ld_supported_emulations" in + i?86linux) echo "${UNAME_MACHINE}-pc-linux-gnuaout" ; exit 0 ;; + i?86coff) echo "${UNAME_MACHINE}-pc-linux-gnucoff" ; exit 0 ;; + sparclinux) echo "${UNAME_MACHINE}-unknown-linux-gnuaout" ; exit 0 ;; + armlinux) echo "${UNAME_MACHINE}-unknown-linux-gnuaout" ; exit 0 ;; + m68klinux) echo "${UNAME_MACHINE}-unknown-linux-gnuaout" ; exit 0 ;; + elf32ppc) echo "powerpc-unknown-linux-gnu" ; exit 0 ;; + esac + + if test "${UNAME_MACHINE}" = "alpha" ; then + sed 's/^ //' <dummy.s + .globl main + .ent main + main: + .frame \$30,0,\$26,0 + .prologue 0 + .long 0x47e03d80 # implver $0 + lda \$2,259 + .long 0x47e20c21 # amask $2,$1 + srl \$1,8,\$2 + sll \$2,2,\$2 + sll \$0,3,\$0 + addl \$1,\$0,\$0 + addl \$2,\$0,\$0 + ret \$31,(\$26),1 + .end main +EOF + LIBC="" + ${CC-cc} dummy.s -o dummy 2>/dev/null + if test "$?" = 0 ; then + ./dummy + case "$?" in + 7) + UNAME_MACHINE="alpha" + ;; + 15) + UNAME_MACHINE="alphaev5" + ;; + 14) + UNAME_MACHINE="alphaev56" + ;; + 10) + UNAME_MACHINE="alphapca56" + ;; + 16) + UNAME_MACHINE="alphaev6" + ;; + esac + + objdump --private-headers dummy | \ + grep ld.so.1 > /dev/null + if test "$?" = 0 ; then + LIBC="libc1" + fi + fi + rm -f dummy.s dummy + echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} ; exit 0 + elif test "${UNAME_MACHINE}" = "mips" ; then + cat >dummy.c </dev/null && ./dummy "${UNAME_MACHINE}" && rm dummy.c dummy && exit 0 + rm -f dummy.c dummy + else + # Either a pre-BFD a.out linker (linux-gnuoldld) + # or one that does not give us useful --help. + # GCC wants to distinguish between linux-gnuoldld and linux-gnuaout. + # If ld does not provide *any* "supported emulations:" + # that means it is gnuoldld. + echo "$ld_help_string" | grep >/dev/null 2>&1 "supported emulations:" + test $? 
!= 0 && echo "${UNAME_MACHINE}-pc-linux-gnuoldld" && exit 0 + + case "${UNAME_MACHINE}" in + i?86) + VENDOR=pc; + ;; + *) + VENDOR=unknown; + ;; + esac + # Determine whether the default compiler is a.out or elf + cat >dummy.c < +main(argc, argv) + int argc; + char *argv[]; +{ +#ifdef __ELF__ +# ifdef __GLIBC__ +# if __GLIBC__ >= 2 + printf ("%s-${VENDOR}-linux-gnu\n", argv[1]); +# else + printf ("%s-${VENDOR}-linux-gnulibc1\n", argv[1]); +# endif +# else + printf ("%s-${VENDOR}-linux-gnulibc1\n", argv[1]); +# endif +#else + printf ("%s-${VENDOR}-linux-gnuaout\n", argv[1]); +#endif + return 0; +} +EOF + ${CC-cc} dummy.c -o dummy 2>/dev/null && ./dummy "${UNAME_MACHINE}" && rm dummy.c dummy && exit 0 + rm -f dummy.c dummy + fi ;; +# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. earlier versions +# are messed up and put the nodename in both sysname and nodename. + i?86:DYNIX/ptx:4*:*) + echo i386-sequent-sysv4 + exit 0 ;; + i?86:UNIX_SV:4.2MP:2.*) + # Unixware is an offshoot of SVR4, but it has its own version + # number series starting with 2... + # I am not positive that other SVR4 systems won't match this, + # I just have to hope. -- rms. + # Use sysv4.2uw... so that sysv4* matches it. 
+ echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} + exit 0 ;; + i?86:*:4.*:* | i?86:SYSTEM_V:4.*:*) + if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then + echo ${UNAME_MACHINE}-univel-sysv${UNAME_RELEASE} + else + echo ${UNAME_MACHINE}-pc-sysv${UNAME_RELEASE} + fi + exit 0 ;; + i?86:*:3.2:*) + if test -f /usr/options/cb.name; then + UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then + UNAME_REL=`(/bin/uname -X|egrep Release|sed -e 's/.*= //')` + (/bin/uname -X|egrep i80486 >/dev/null) && UNAME_MACHINE=i486 + (/bin/uname -X|egrep '^Machine.*Pentium' >/dev/null) \ + && UNAME_MACHINE=i586 + echo ${UNAME_MACHINE}-pc-sco$UNAME_REL + else + echo ${UNAME_MACHINE}-pc-sysv32 + fi + exit 0 ;; + i?86:UnixWare:*:*) + if /bin/uname -X 2>/dev/null >/dev/null ; then + (/bin/uname -X|egrep '^Machine.*Pentium' >/dev/null) \ + && UNAME_MACHINE=i586 + fi + echo ${UNAME_MACHINE}-unixware-${UNAME_RELEASE}-${UNAME_VERSION} + exit 0 ;; + pc:*:*:*) + # uname -m prints for DJGPP always 'pc', but it prints nothing about + # the processor, so we play safe by assuming i386. + echo i386-pc-msdosdjgpp + exit 0 ;; + Intel:Mach:3*:*) + echo i386-pc-mach3 + exit 0 ;; + paragon:*:*:*) + echo i860-intel-osf1 + exit 0 ;; + i860:*:4.*:*) # i860-SVR4 + if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then + echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 + else # Add other i860-SVR4 vendors below as they are discovered. 
+ echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 + fi + exit 0 ;; + mini*:CTIX:SYS*5:*) + # "miniframe" + echo m68010-convergent-sysv + exit 0 ;; + M68*:*:R3V[567]*:*) + test -r /sysV68 && echo 'm68k-motorola-sysv' && exit 0 ;; + 3[34]??:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 4850:*:4.0:3.0) + OS_REL='' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && echo i486-ncr-sysv4.3${OS_REL} && exit 0 + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && echo i586-ncr-sysv4.3${OS_REL} && exit 0 ;; + 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && echo i486-ncr-sysv4 && exit 0 ;; + m68*:LynxOS:2.*:*) + echo m68k-unknown-lynxos${UNAME_RELEASE} + exit 0 ;; + mc68030:UNIX_System_V:4.*:*) + echo m68k-atari-sysv4 + exit 0 ;; + i?86:LynxOS:2.*:*) + echo i386-unknown-lynxos${UNAME_RELEASE} + exit 0 ;; + TSUNAMI:LynxOS:2.*:*) + echo sparc-unknown-lynxos${UNAME_RELEASE} + exit 0 ;; + rs6000:LynxOS:2.*:* | PowerPC:LynxOS:2.*:*) + echo rs6000-unknown-lynxos${UNAME_RELEASE} + exit 0 ;; + SM[BE]S:UNIX_SV:*:*) + echo mips-dde-sysv${UNAME_RELEASE} + exit 0 ;; + RM*:SINIX-*:*:*) + echo mips-sni-sysv4 + exit 0 ;; + *:SINIX-*:*:*) + if uname -p 2>/dev/null >/dev/null ; then + UNAME_MACHINE=`(uname -p) 2>/dev/null` + echo ${UNAME_MACHINE}-sni-sysv4 + else + echo ns32k-sni-sysv + fi + exit 0 ;; + PENTIUM:CPunix:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort + # says + echo i586-unisys-sysv4 + exit 0 ;; + *:UNIX_System_V:4*:FTX*) + # From Gerald Hewes . + # How about differentiating between stratus architectures? -djm + echo hppa1.1-stratus-sysv4 + exit 0 ;; + *:*:*:FTX*) + # From seanf@swdc.stratus.com. 
+ echo i860-stratus-sysv4 + exit 0 ;; + mc68*:A/UX:*:*) + echo m68k-apple-aux${UNAME_RELEASE} + exit 0 ;; + news*:NEWS-OS:*:6*) + echo mips-sony-newsos6 + exit 0 ;; + R3000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R4000:UNIX_SV:*:*) + if [ -d /usr/nec ]; then + echo mips-nec-sysv${UNAME_RELEASE} + else + echo mips-unknown-sysv${UNAME_RELEASE} + fi + exit 0 ;; + BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. + echo powerpc-be-beos + exit 0 ;; + BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. + echo powerpc-apple-beos + exit 0 ;; + BePC:BeOS:*:*) # BeOS running on Intel PC compatible. + echo i586-pc-beos + exit 0 ;; +esac + +#echo '(No uname command or uname output not recognized.)' 1>&2 +#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2 + +cat >dummy.c < +# include +#endif +main () +{ +#if defined (sony) +#if defined (MIPSEB) + /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, + I don't know.... */ + printf ("mips-sony-bsd\n"); exit (0); +#else +#include + printf ("m68k-sony-newsos%s\n", +#ifdef NEWSOS4 + "4" +#else + "" +#endif + ); exit (0); +#endif +#endif + +#if defined (__arm) && defined (__acorn) && defined (__unix) + printf ("arm-acorn-riscix"); exit (0); +#endif + +#if defined (hp300) && !defined (hpux) + printf ("m68k-hp-bsd\n"); exit (0); +#endif + +#if defined (NeXT) +#if !defined (__ARCHITECTURE__) +#define __ARCHITECTURE__ "m68k" +#endif + int version; + version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; + printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); + exit (0); +#endif + +#if defined (MULTIMAX) || defined (n16) +#if defined (UMAXV) + printf ("ns32k-encore-sysv\n"); exit (0); +#else +#if defined (CMU) + printf ("ns32k-encore-mach\n"); exit (0); +#else + printf ("ns32k-encore-bsd\n"); exit (0); +#endif +#endif +#endif + +#if defined (__386BSD__) + printf ("i386-pc-bsd\n"); exit (0); +#endif + +#if defined (sequent) +#if defined (i386) 
+ printf ("i386-sequent-dynix\n"); exit (0); +#endif +#if defined (ns32000) + printf ("ns32k-sequent-dynix\n"); exit (0); +#endif +#endif + +#if defined (_SEQUENT_) + struct utsname un; + + uname(&un); + + if (strncmp(un.version, "V2", 2) == 0) { + printf ("i386-sequent-ptx2\n"); exit (0); + } + if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */ + printf ("i386-sequent-ptx1\n"); exit (0); + } + printf ("i386-sequent-ptx\n"); exit (0); + +#endif + +#if defined (vax) +#if !defined (ultrix) + printf ("vax-dec-bsd\n"); exit (0); +#else + printf ("vax-dec-ultrix\n"); exit (0); +#endif +#endif + +#if defined (alliant) && defined (i860) + printf ("i860-alliant-bsd\n"); exit (0); +#endif + + exit (1); +} +EOF + +${CC-cc} dummy.c -o dummy 2>/dev/null && ./dummy && rm dummy.c dummy && exit 0 +rm -f dummy.c dummy + +# Apollos put the system type in the environment. + +test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit 0; } + +# Convex versions that predate uname can use getsysinfo(1) + +if [ -x /usr/convex/getsysinfo ] +then + case `getsysinfo -f cpu_type` in + c1*) + echo c1-convex-bsd + exit 0 ;; + c2*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit 0 ;; + c34*) + echo c34-convex-bsd + exit 0 ;; + c38*) + echo c38-convex-bsd + exit 0 ;; + c4*) + echo c4-convex-bsd + exit 0 ;; + esac +fi + +#echo '(Unable to guess system type)' 1>&2 + +exit 1 diff --git a/forester/archive/RIO/others/hmmer/config.sub b/forester/archive/RIO/others/hmmer/config.sub new file mode 100755 index 0000000..00bea6e --- /dev/null +++ b/forester/archive/RIO/others/hmmer/config.sub @@ -0,0 +1,955 @@ +#! /bin/sh +# Configuration validation subroutine script, version 1.1. +# Copyright (C) 1991, 92-97, 1998 Free Software Foundation, Inc. +# This file is (in principle) common to ALL GNU software. +# The presence of a machine in this file suggests that SOME GNU software +# can handle that machine. It does not imply ALL GNU software can. 
+# +# This file is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, +# Boston, MA 02111-1307, USA. + +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + +# Configuration subroutine to validate and canonicalize a configuration type. +# Supply the specified configuration type as an argument. +# If it is invalid, we print an error message on stderr and exit with code 1. +# Otherwise, we print the canonical config type on stdout and succeed. + +# This file is supposed to be the same for all GNU packages +# and recognize all the CPU types, system types and aliases +# that are meaningful with *any* GNU software. +# Each package is responsible for reporting which valid configurations +# it does not support. The user should be able to distinguish +# a failure to support a valid configuration from a meaningless +# configuration. 
+ +# The goal of this file is to map all the various variations of a given +# machine specification into a single specification in the form: +# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM +# or in some cases, the newer four-part form: +# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM +# It is wrong to echo any other type of specification. + +if [ x$1 = x ] +then + echo Configuration name missing. 1>&2 + echo "Usage: $0 CPU-MFR-OPSYS" 1>&2 + echo "or $0 ALIAS" 1>&2 + echo where ALIAS is a recognized configuration type. 1>&2 + exit 1 +fi + +# First pass through any local machine types. +case $1 in + *local*) + echo $1 + exit 0 + ;; + *) + ;; +esac + +# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). +# Here we must recognize all the valid KERNEL-OS combinations. +maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` +case $maybe_os in + linux-gnu*) + os=-$maybe_os + basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` + ;; + *) + basic_machine=`echo $1 | sed 's/-[^-]*$//'` + if [ $basic_machine != $1 ] + then os=`echo $1 | sed 's/.*-/-/'` + else os=; fi + ;; +esac + +### Let's recognize common machines as not being operating systems so +### that things like config.sub decstation-3100 work. We also +### recognize some manufacturers as not being operating systems, so we +### can provide default operating systems below. +case $os in + -sun*os*) + # Prevent following clause from handling this invalid input. 
+		;;
+	-dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \
+	-att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \
+	-unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \
+	-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
+	-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
+	-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
+	-apple)
+		os=			# last word names a machine/manufacturer, not an OS
+		basic_machine=$1	# so the whole argument is the machine spec
+		;;
+	-hiux*)
+		os=-hiuxwe2
+		;;
+	-sco5)
+		os=-sco3.2v5		# leading '-' is required: $os is later matched against -sco* and appended verbatim to the machine
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`	# i?86-<anything> becomes i?86-pc
+		;;
+	-sco4)
+		os=-sco3.2v4
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco3.2.[4-9]*)
+		os=`echo $os | sed -e 's/sco3.2./sco3.2v/'`	# sco3.2.N is rewritten as sco3.2vN
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco3.2v[4-9]*)
+		# Don't forget version if it is 3.2v4 or newer.
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-sco*)
+		os=-sco3.2v2		# any other sco spelling is mapped to 3.2v2
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-isc)
+		os=-isc2.2
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-clix*)
+		basic_machine=clipper-intergraph
+		;;
+	-isc*)
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+		;;
+	-lynx*)
+		os=-lynxos
+		;;
+	-ptx*)
+		basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'`
+		;;
+	-windowsnt*)
+		os=`echo $os | sed -e 's/windowsnt/winnt/'`
+		;;
+	-psos*)
+		os=-psos
+		;;
+esac
+
+# Decode aliases for certain CPU-COMPANY combinations.
+case $basic_machine in
+	# Recognize the basic CPU types without company name.
+	# Some are omitted here because they have special meanings below.
+ tahoe | i860 | m32r | m68k | m68000 | m88k | ns32k | arc | arm \ + | arme[lb] | pyramid | mn10200 | mn10300 | tron | a29k \ + | 580 | i960 | h8300 | hppa | hppa1.0 | hppa1.1 | hppa2.0 \ + | alpha | alphaev5 | alphaev56 | we32k | ns16k | clipper \ + | i370 | sh | powerpc | powerpcle | 1750a | dsp16xx | pdp11 \ + | mips64 | mipsel | mips64el | mips64orion | mips64orionel \ + | mipstx39 | mipstx39el \ + | sparc | sparclet | sparclite | sparc64 | v850) + basic_machine=$basic_machine-unknown + ;; + # We use `pc' rather than `unknown' + # because (1) that's what they normally are, and + # (2) the word "unknown" tends to confuse beginning users. + i[34567]86) + basic_machine=$basic_machine-pc + ;; + # Object if more than one company name word. + *-*-*) + echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + exit 1 + ;; + # Recognize the basic CPU types with company name. + vax-* | tahoe-* | i[34567]86-* | i860-* | m32r-* | m68k-* | m68000-* \ + | m88k-* | sparc-* | ns32k-* | fx80-* | arc-* | arm-* | c[123]* \ + | mips-* | pyramid-* | tron-* | a29k-* | romp-* | rs6000-* \ + | power-* | none-* | 580-* | cray2-* | h8300-* | i960-* \ + | xmp-* | ymp-* | hppa-* | hppa1.0-* | hppa1.1-* | hppa2.0-* \ + | alpha-* | alphaev5-* | alphaev56-* | we32k-* | cydra-* \ + | ns16k-* | pn-* | np1-* | xps100-* | clipper-* | orion-* \ + | sparclite-* | pdp11-* | sh-* | powerpc-* | powerpcle-* \ + | sparc64-* | mips64-* | mipsel-* \ + | mips64el-* | mips64orion-* | mips64orionel-* \ + | mipstx39-* | mipstx39el-* \ + | f301-*) + ;; + # Recognize the various machine names and aliases which stand + # for a CPU type and a company and sometimes even an OS. 
+ 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) + basic_machine=m68000-att + ;; + 3b*) + basic_machine=we32k-att + ;; + alliant | fx80) + basic_machine=fx80-alliant + ;; + altos | altos3068) + basic_machine=m68k-altos + ;; + am29k) + basic_machine=a29k-none + os=-bsd + ;; + amdahl) + basic_machine=580-amdahl + os=-sysv + ;; + amiga | amiga-*) + basic_machine=m68k-cbm + ;; + amigaos | amigados) + basic_machine=m68k-cbm + os=-amigaos + ;; + amigaunix | amix) + basic_machine=m68k-cbm + os=-sysv4 + ;; + apollo68) + basic_machine=m68k-apollo + os=-sysv + ;; + aux) + basic_machine=m68k-apple + os=-aux + ;; + balance) + basic_machine=ns32k-sequent + os=-dynix + ;; + convex-c1) + basic_machine=c1-convex + os=-bsd + ;; + convex-c2) + basic_machine=c2-convex + os=-bsd + ;; + convex-c32) + basic_machine=c32-convex + os=-bsd + ;; + convex-c34) + basic_machine=c34-convex + os=-bsd + ;; + convex-c38) + basic_machine=c38-convex + os=-bsd + ;; + cray | ymp) + basic_machine=ymp-cray + os=-unicos + ;; + cray2) + basic_machine=cray2-cray + os=-unicos + ;; + [ctj]90-cray) + basic_machine=c90-cray + os=-unicos + ;; + crds | unos) + basic_machine=m68k-crds + ;; + da30 | da30-*) + basic_machine=m68k-da30 + ;; + decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) + basic_machine=mips-dec + ;; + delta | 3300 | motorola-3300 | motorola-delta \ + | 3300-motorola | delta-motorola) + basic_machine=m68k-motorola + ;; + delta88) + basic_machine=m88k-motorola + os=-sysv3 + ;; + dpx20 | dpx20-*) + basic_machine=rs6000-bull + os=-bosx + ;; + dpx2* | dpx2*-bull) + basic_machine=m68k-bull + os=-sysv3 + ;; + ebmon29k) + basic_machine=a29k-amd + os=-ebmon + ;; + elxsi) + basic_machine=elxsi-elxsi + os=-bsd + ;; + encore | umax | mmax) + basic_machine=ns32k-encore + ;; + fx2800) + basic_machine=i860-alliant + ;; + genix) + basic_machine=ns32k-ns + ;; + gmicro) + basic_machine=tron-gmicro + os=-sysv + ;; + h3050r* | hiux*) + basic_machine=hppa1.1-hitachi + os=-hiuxwe2 
+ ;; + h8300hms) + basic_machine=h8300-hitachi + os=-hms + ;; + harris) + basic_machine=m88k-harris + os=-sysv3 + ;; + hp300-*) + basic_machine=m68k-hp + ;; + hp300bsd) + basic_machine=m68k-hp + os=-bsd + ;; + hp300hpux) + basic_machine=m68k-hp + os=-hpux + ;; + hp9k2[0-9][0-9] | hp9k31[0-9]) + basic_machine=m68000-hp + ;; + hp9k3[2-9][0-9]) + basic_machine=m68k-hp + ;; + hp9k7[0-9][0-9] | hp7[0-9][0-9] | hp9k8[0-9]7 | hp8[0-9]7) + basic_machine=hppa1.1-hp + ;; + hp9k8[0-9][0-9] | hp8[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hppa-next) + os=-nextstep3 + ;; + i370-ibm* | ibm*) + basic_machine=i370-ibm + os=-mvs + ;; +# I'm not sure what "Sysv32" means. Should this be sysv3.2? + i[34567]86v32) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv32 + ;; + i[34567]86v4*) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv4 + ;; + i[34567]86v) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv + ;; + i[34567]86sol2) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-solaris2 + ;; + iris | iris4d) + basic_machine=mips-sgi + case $os in + -irix*) + ;; + *) + os=-irix4 + ;; + esac + ;; + isi68 | isi) + basic_machine=m68k-isi + os=-sysv + ;; + m88k-omron*) + basic_machine=m88k-omron + ;; + magnum | m3230) + basic_machine=mips-mips + os=-sysv + ;; + merlin) + basic_machine=ns32k-utek + os=-sysv + ;; + miniframe) + basic_machine=m68000-convergent + ;; + mipsel*-linux*) + basic_machine=mipsel-unknown + os=-linux-gnu + ;; + mips*-linux*) + basic_machine=mips-unknown + os=-linux-gnu + ;; + mips3*-*) + basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` + ;; + mips3*) + basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown + ;; + ncr3000) + basic_machine=i486-ncr + os=-sysv4 + ;; + news | news700 | news800 | news900) + basic_machine=m68k-sony + os=-newsos + ;; + news1000) + basic_machine=m68030-sony + os=-newsos + ;; + news-3600 | risc-news) + basic_machine=mips-sony + os=-newsos + ;; + next | m*-next ) + 
basic_machine=m68k-next + case $os in + -nextstep* ) + ;; + -ns2*) + os=-nextstep2 + ;; + *) + os=-nextstep3 + ;; + esac + ;; + nh3000) + basic_machine=m68k-harris + os=-cxux + ;; + nh[45]000) + basic_machine=m88k-harris + os=-cxux + ;; + nindy960) + basic_machine=i960-intel + os=-nindy + ;; + np1) + basic_machine=np1-gould + ;; + pa-hitachi) + basic_machine=hppa1.1-hitachi + os=-hiuxwe2 + ;; + paragon) + basic_machine=i860-intel + os=-osf + ;; + pbd) + basic_machine=sparc-tti + ;; + pbb) + basic_machine=m68k-tti + ;; + pc532 | pc532-*) + basic_machine=ns32k-pc532 + ;; + pentium | p5 | k5 | nexen) + basic_machine=i586-pc + ;; + pentiumpro | p6 | k6 | 6x86) + basic_machine=i686-pc + ;; + pentiumii | pentium2) + basic_machine=i786-pc + ;; + pentium-* | p5-* | k5-* | nexen-*) + basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentiumpro-* | p6-* | k6-* | 6x86-*) + basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentiumii-* | pentium2-*) + basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pn) + basic_machine=pn-gould + ;; + power) basic_machine=rs6000-ibm + ;; + ppc) basic_machine=powerpc-unknown + ;; + ppc-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ppcle | powerpclittle | ppc-le | powerpc-little) + basic_machine=powerpcle-unknown + ;; + ppcle-* | powerpclittle-*) + basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ps2) + basic_machine=i386-ibm + ;; + rm[46]00) + basic_machine=mips-siemens + ;; + rtpc | rtpc-*) + basic_machine=romp-ibm + ;; + sequent) + basic_machine=i386-sequent + ;; + sh) + basic_machine=sh-hitachi + os=-hms + ;; + sps7) + basic_machine=m68k-bull + os=-sysv2 + ;; + spur) + basic_machine=spur-unknown + ;; + sun2) + basic_machine=m68000-sun + ;; + sun2os3) + basic_machine=m68000-sun + os=-sunos3 + ;; + sun2os4) + basic_machine=m68000-sun + os=-sunos4 + ;; + sun3os3) + basic_machine=m68k-sun + os=-sunos3 + ;; + sun3os4) + basic_machine=m68k-sun 
+ os=-sunos4 + ;; + sun4os3) + basic_machine=sparc-sun + os=-sunos3 + ;; + sun4os4) + basic_machine=sparc-sun + os=-sunos4 + ;; + sun4sol2) + basic_machine=sparc-sun + os=-solaris2 + ;; + sun3 | sun3-*) + basic_machine=m68k-sun + ;; + sun4) + basic_machine=sparc-sun + ;; + sun386 | sun386i | roadrunner) + basic_machine=i386-sun + ;; + symmetry) + basic_machine=i386-sequent + os=-dynix + ;; + tx39) + basic_machine=mipstx39-unknown + ;; + tx39el) + basic_machine=mipstx39el-unknown + ;; + tower | tower-32) + basic_machine=m68k-ncr + ;; + udi29k) + basic_machine=a29k-amd + os=-udi + ;; + ultra3) + basic_machine=a29k-nyu + os=-sym1 + ;; + vaxv) + basic_machine=vax-dec + os=-sysv + ;; + vms) + basic_machine=vax-dec + os=-vms + ;; + vpp*|vx|vx-*) + basic_machine=f301-fujitsu + ;; + vxworks960) + basic_machine=i960-wrs + os=-vxworks + ;; + vxworks68) + basic_machine=m68k-wrs + os=-vxworks + ;; + vxworks29k) + basic_machine=a29k-wrs + os=-vxworks + ;; + xmp) + basic_machine=xmp-cray + os=-unicos + ;; + xps | xps100) + basic_machine=xps100-honeywell + ;; + none) + basic_machine=none-none + os=-none + ;; + +# Here we handle the default manufacturer of certain CPU types. It is in +# some cases the only manufacturer, in others, it is the most popular. + mips) + if [ x$os = x-linux-gnu ]; then + basic_machine=mips-unknown + else + basic_machine=mips-mips + fi + ;; + romp) + basic_machine=romp-ibm + ;; + rs6000) + basic_machine=rs6000-ibm + ;; + vax) + basic_machine=vax-dec + ;; + pdp11) + basic_machine=pdp11-dec + ;; + we32k) + basic_machine=we32k-att + ;; + sparc) + basic_machine=sparc-sun + ;; + cydra) + basic_machine=cydra-cydrome + ;; + orion) + basic_machine=orion-highlevel + ;; + orion105) + basic_machine=clipper-highlevel + ;; + *) + echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + exit 1 + ;; +esac + +# Here we canonicalize certain aliases for manufacturers. 
+case $basic_machine in + *-digital*) + basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` + ;; + *-commodore*) + basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` + ;; + *) + ;; +esac + +# Decode manufacturer-specific aliases for certain operating systems. + +if [ x"$os" != x"" ] +then +case $os in + # First match some system type aliases + # that might get confused with valid system types. + # -solaris* is a basic system type, with this one exception. + -solaris1 | -solaris1.*) + os=`echo $os | sed -e 's|solaris1|sunos4|'` + ;; + -solaris) + os=-solaris2 + ;; + -svr4*) + os=-sysv4 + ;; + -unixware*) + os=-sysv4.2uw + ;; + -gnu/linux*) + os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` + ;; + # First accept the basic system types. + # The portable systems comes first. + # Each alternative MUST END IN A *, to match a version number. + # -sysv* is not here because it comes later, after sysvr4. + -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ + | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\ + | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \ + | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ + | -aos* \ + | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ + | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ + | -hiux* | -386bsd* | -netbsd* | -openbsd* | -freebsd* | -riscix* \ + | -lynxos* | -bosx* | -nextstep* | -cxux* | -aout* | -elf* \ + | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ + | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ + | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ + | -mingw32* | -linux-gnu* | -uxpv* | -beos*) + # Remember, each alternative MUST END IN *, to match a version number. 
+ ;; + -linux*) + os=`echo $os | sed -e 's|linux|linux-gnu|'` + ;; + -sunos5*) + os=`echo $os | sed -e 's|sunos5|solaris2|'` + ;; + -sunos6*) + os=`echo $os | sed -e 's|sunos6|solaris3|'` + ;; + -osfrose*) + os=-osfrose + ;; + -osf*) + os=-osf + ;; + -utek*) + os=-bsd + ;; + -dynix*) + os=-bsd + ;; + -acis*) + os=-aos + ;; + -ctix* | -uts*) + os=-sysv + ;; + -ns2 ) + os=-nextstep2 + ;; + # Preserve the version number of sinix5. + -sinix5.*) + os=`echo $os | sed -e 's|sinix|sysv|'` + ;; + -sinix*) + os=-sysv4 + ;; + -triton*) + os=-sysv3 + ;; + -oss*) + os=-sysv3 + ;; + -svr4) + os=-sysv4 + ;; + -svr3) + os=-sysv3 + ;; + -sysvr4) + os=-sysv4 + ;; + # This must come after -sysvr4. + -sysv*) + ;; + -xenix) + os=-xenix + ;; + -none) + ;; + *) + # Get rid of the `-' at the beginning of $os. + os=`echo $os | sed 's/[^-]*-//'` + echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 + exit 1 + ;; +esac +else + +# Here we handle the default operating systems that come with various machines. +# The value should be what the vendor currently ships out the door with their +# machine or put another way, the most popular os provided with the machine. + +# Note that if you're going to try to match "-MANUFACTURER" here (say, +# "-sun"), then you have to tell the case statement up towards the top +# that MANUFACTURER isn't an operating system. Otherwise, code above +# will signal an error saying that MANUFACTURER isn't an operating +# system, and we'll never get to this point. + +case $basic_machine in + *-acorn) + os=-riscix1.2 + ;; + arm*-semi) + os=-aout + ;; + pdp11-*) + os=-none + ;; + *-dec | vax-*) + os=-ultrix4.2 + ;; + m68*-apollo) + os=-domain + ;; + i386-sun) + os=-sunos4.0.2 + ;; + m68000-sun) + os=-sunos3 + # This also exists in the configure program, but was not the + # default. + # os=-sunos4 + ;; + *-tti) # must be before sparc entry or we get the wrong os. 
+ os=-sysv3 + ;; + sparc-* | *-sun) + os=-sunos4.1.1 + ;; + *-be) + os=-beos + ;; + *-ibm) + os=-aix + ;; + *-hp) + os=-hpux + ;; + *-hitachi) + os=-hiux + ;; + i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) + os=-sysv + ;; + *-cbm) + os=-amigaos + ;; + *-dg) + os=-dgux + ;; + *-dolphin) + os=-sysv3 + ;; + m68k-ccur) + os=-rtu + ;; + m88k-omron*) + os=-luna + ;; + *-next ) + os=-nextstep + ;; + *-sequent) + os=-ptx + ;; + *-crds) + os=-unos + ;; + *-ns) + os=-genix + ;; + i370-*) + os=-mvs + ;; + *-next) + os=-nextstep3 + ;; + *-gould) + os=-sysv + ;; + *-highlevel) + os=-bsd + ;; + *-encore) + os=-bsd + ;; + *-sgi) + os=-irix + ;; + *-siemens) + os=-sysv4 + ;; + *-masscomp) + os=-rtu + ;; + f301-fujitsu) + os=-uxpv + ;; + *) + os=-none + ;; +esac +fi + +# Here we handle the case where we know the os, and the CPU type, but not the +# manufacturer. We pick the logical manufacturer. +vendor=unknown +case $basic_machine in + *-unknown) + case $os in + -riscix*) + vendor=acorn + ;; + -sunos*) + vendor=sun + ;; + -aix*) + vendor=ibm + ;; + -hpux*) + vendor=hp + ;; + -hiux*) + vendor=hitachi + ;; + -unos*) + vendor=crds + ;; + -dgux*) + vendor=dg + ;; + -luna*) + vendor=omron + ;; + -genix*) + vendor=ns + ;; + -mvs*) + vendor=ibm + ;; + -ptx*) + vendor=sequent + ;; + -vxsim* | -vxworks*) + vendor=wrs + ;; + -aux*) + vendor=apple + ;; + esac + basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` + ;; +esac + +echo $basic_machine$os diff --git a/forester/archive/RIO/others/hmmer/configure b/forester/archive/RIO/others/hmmer/configure new file mode 100755 index 0000000..e5b79e4 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/configure @@ -0,0 +1,2509 @@ +#! /bin/sh + +# Guess values for system-dependent variables and create Makefiles. +# Generated automatically using autoconf version 2.13 +# Copyright (C) 1992, 93, 94, 95, 96 Free Software Foundation, Inc. 
+# +# This configure script is free software; the Free Software Foundation +# gives unlimited permission to copy, distribute and modify it. + +# Defaults: +ac_help= +ac_default_prefix=/usr/local +# Any additions from configure.in: +ac_help="$ac_help + --with-pvm enable PVM, Parallel Virtual Machine" +ac_help="$ac_help + --disable-threads disable POSIX threads support" + +# Initialize some variables set by options. +# The variables have the same names as the options, with +# dashes changed to underlines. +build=NONE +cache_file=./config.cache +exec_prefix=NONE +host=NONE +no_create= +nonopt=NONE +no_recursion= +prefix=NONE +program_prefix=NONE +program_suffix=NONE +program_transform_name=s,x,x, +silent= +site= +srcdir= +target=NONE +verbose= +x_includes=NONE +x_libraries=NONE +bindir='${exec_prefix}/bin' +sbindir='${exec_prefix}/sbin' +libexecdir='${exec_prefix}/libexec' +datadir='${prefix}/share' +sysconfdir='${prefix}/etc' +sharedstatedir='${prefix}/com' +localstatedir='${prefix}/var' +libdir='${exec_prefix}/lib' +includedir='${prefix}/include' +oldincludedir='/usr/include' +infodir='${prefix}/info' +mandir='${prefix}/man' + +# Initialize some other variables. +subdirs= +MFLAGS= MAKEFLAGS= +SHELL=${CONFIG_SHELL-/bin/sh} +# Maximum number of lines to put in a shell here document. +ac_max_here_lines=12 + +ac_prev= +for ac_option +do + + # If the previous option needs an argument, assign it. + if test -n "$ac_prev"; then + eval "$ac_prev=\$ac_option" + ac_prev= + continue + fi + + case "$ac_option" in + -*=*) ac_optarg=`echo "$ac_option" | sed 's/[-_a-zA-Z0-9]*=//'` ;; + *) ac_optarg= ;; + esac + + # Accept the important Cygnus configure options, so we can diagnose typos. 
+ + case "$ac_option" in + + -bindir | --bindir | --bindi | --bind | --bin | --bi) + ac_prev=bindir ;; + -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) + bindir="$ac_optarg" ;; + + -build | --build | --buil | --bui | --bu) + ac_prev=build ;; + -build=* | --build=* | --buil=* | --bui=* | --bu=*) + build="$ac_optarg" ;; + + -cache-file | --cache-file | --cache-fil | --cache-fi \ + | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) + ac_prev=cache_file ;; + -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ + | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) + cache_file="$ac_optarg" ;; + + -datadir | --datadir | --datadi | --datad | --data | --dat | --da) + ac_prev=datadir ;; + -datadir=* | --datadir=* | --datadi=* | --datad=* | --data=* | --dat=* \ + | --da=*) + datadir="$ac_optarg" ;; + + -disable-* | --disable-*) + ac_feature=`echo $ac_option|sed -e 's/-*disable-//'` + # Reject names that are not valid shell variable names. + if test -n "`echo $ac_feature| sed 's/[-a-zA-Z0-9_]//g'`"; then + { echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; } + fi + ac_feature=`echo $ac_feature| sed 's/-/_/g'` + eval "enable_${ac_feature}=no" ;; + + -enable-* | --enable-*) + ac_feature=`echo $ac_option|sed -e 's/-*enable-//' -e 's/=.*//'` + # Reject names that are not valid shell variable names. 
+ if test -n "`echo $ac_feature| sed 's/[-_a-zA-Z0-9]//g'`"; then + { echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; } + fi + ac_feature=`echo $ac_feature| sed 's/-/_/g'` + case "$ac_option" in + *=*) ;; + *) ac_optarg=yes ;; + esac + eval "enable_${ac_feature}='$ac_optarg'" ;; + + -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ + | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ + | --exec | --exe | --ex) + ac_prev=exec_prefix ;; + -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ + | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ + | --exec=* | --exe=* | --ex=*) + exec_prefix="$ac_optarg" ;; + + -gas | --gas | --ga | --g) + # Obsolete; use --with-gas. + with_gas=yes ;; + + -help | --help | --hel | --he) + # Omit some internal or obsolete options to make the list less imposing. + # This message is too long to be a string in the A/UX 3.1 sh. + cat << EOF +Usage: configure [options] [host] +Options: [defaults in brackets after descriptions] +Configuration: + --cache-file=FILE cache test results in FILE + --help print this message + --no-create do not create output files + --quiet, --silent do not print \`checking...' 
messages + --version print the version of autoconf that created configure +Directory and file names: + --prefix=PREFIX install architecture-independent files in PREFIX + [$ac_default_prefix] + --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX + [same as prefix] + --bindir=DIR user executables in DIR [EPREFIX/bin] + --sbindir=DIR system admin executables in DIR [EPREFIX/sbin] + --libexecdir=DIR program executables in DIR [EPREFIX/libexec] + --datadir=DIR read-only architecture-independent data in DIR + [PREFIX/share] + --sysconfdir=DIR read-only single-machine data in DIR [PREFIX/etc] + --sharedstatedir=DIR modifiable architecture-independent data in DIR + [PREFIX/com] + --localstatedir=DIR modifiable single-machine data in DIR [PREFIX/var] + --libdir=DIR object code libraries in DIR [EPREFIX/lib] + --includedir=DIR C header files in DIR [PREFIX/include] + --oldincludedir=DIR C header files for non-gcc in DIR [/usr/include] + --infodir=DIR info documentation in DIR [PREFIX/info] + --mandir=DIR man documentation in DIR [PREFIX/man] + --srcdir=DIR find the sources in DIR [configure dir or ..] 
+ --program-prefix=PREFIX prepend PREFIX to installed program names + --program-suffix=SUFFIX append SUFFIX to installed program names + --program-transform-name=PROGRAM + run sed PROGRAM on installed program names +EOF + cat << EOF +Host type: + --build=BUILD configure for building on BUILD [BUILD=HOST] + --host=HOST configure for HOST [guessed] + --target=TARGET configure for TARGET [TARGET=HOST] +Features and packages: + --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no) + --enable-FEATURE[=ARG] include FEATURE [ARG=yes] + --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] + --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) + --x-includes=DIR X include files are in DIR + --x-libraries=DIR X library files are in DIR +EOF + if test -n "$ac_help"; then + echo "--enable and --with options recognized:$ac_help" + fi + exit 0 ;; + + -host | --host | --hos | --ho) + ac_prev=host ;; + -host=* | --host=* | --hos=* | --ho=*) + host="$ac_optarg" ;; + + -includedir | --includedir | --includedi | --included | --include \ + | --includ | --inclu | --incl | --inc) + ac_prev=includedir ;; + -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ + | --includ=* | --inclu=* | --incl=* | --inc=*) + includedir="$ac_optarg" ;; + + -infodir | --infodir | --infodi | --infod | --info | --inf) + ac_prev=infodir ;; + -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) + infodir="$ac_optarg" ;; + + -libdir | --libdir | --libdi | --libd) + ac_prev=libdir ;; + -libdir=* | --libdir=* | --libdi=* | --libd=*) + libdir="$ac_optarg" ;; + + -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ + | --libexe | --libex | --libe) + ac_prev=libexecdir ;; + -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ + | --libexe=* | --libex=* | --libe=*) + libexecdir="$ac_optarg" ;; + + -localstatedir | --localstatedir | --localstatedi | --localstated \ + | --localstate | --localstat | --localsta | 
--localst \ + | --locals | --local | --loca | --loc | --lo) + ac_prev=localstatedir ;; + -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ + | --localstate=* | --localstat=* | --localsta=* | --localst=* \ + | --locals=* | --local=* | --loca=* | --loc=* | --lo=*) + localstatedir="$ac_optarg" ;; + + -mandir | --mandir | --mandi | --mand | --man | --ma | --m) + ac_prev=mandir ;; + -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) + mandir="$ac_optarg" ;; + + -nfp | --nfp | --nf) + # Obsolete; use --without-fp. + with_fp=no ;; + + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c) + no_create=yes ;; + + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) + no_recursion=yes ;; + + -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ + | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ + | --oldin | --oldi | --old | --ol | --o) + ac_prev=oldincludedir ;; + -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ + | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ + | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) + oldincludedir="$ac_optarg" ;; + + -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) + ac_prev=prefix ;; + -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) + prefix="$ac_optarg" ;; + + -program-prefix | --program-prefix | --program-prefi | --program-pref \ + | --program-pre | --program-pr | --program-p) + ac_prev=program_prefix ;; + -program-prefix=* | --program-prefix=* | --program-prefi=* \ + | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) + program_prefix="$ac_optarg" ;; + + -program-suffix | --program-suffix | --program-suffi | --program-suff \ + | --program-suf | --program-su | --program-s) + ac_prev=program_suffix ;; + -program-suffix=* | 
--program-suffix=* | --program-suffi=* \ + | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) + program_suffix="$ac_optarg" ;; + + -program-transform-name | --program-transform-name \ + | --program-transform-nam | --program-transform-na \ + | --program-transform-n | --program-transform- \ + | --program-transform | --program-transfor \ + | --program-transfo | --program-transf \ + | --program-trans | --program-tran \ + | --progr-tra | --program-tr | --program-t) + ac_prev=program_transform_name ;; + -program-transform-name=* | --program-transform-name=* \ + | --program-transform-nam=* | --program-transform-na=* \ + | --program-transform-n=* | --program-transform-=* \ + | --program-transform=* | --program-transfor=* \ + | --program-transfo=* | --program-transf=* \ + | --program-trans=* | --program-tran=* \ + | --progr-tra=* | --program-tr=* | --program-t=*) + program_transform_name="$ac_optarg" ;; + + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + silent=yes ;; + + -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) + ac_prev=sbindir ;; + -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ + | --sbi=* | --sb=*) + sbindir="$ac_optarg" ;; + + -sharedstatedir | --sharedstatedir | --sharedstatedi \ + | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ + | --sharedst | --shareds | --shared | --share | --shar \ + | --sha | --sh) + ac_prev=sharedstatedir ;; + -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ + | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ + | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ + | --sha=* | --sh=*) + sharedstatedir="$ac_optarg" ;; + + -site | --site | --sit) + ac_prev=site ;; + -site=* | --site=* | --sit=*) + site="$ac_optarg" ;; + + -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) + ac_prev=srcdir ;; + -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) + 
srcdir="$ac_optarg" ;; + + -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ + | --syscon | --sysco | --sysc | --sys | --sy) + ac_prev=sysconfdir ;; + -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ + | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*) + sysconfdir="$ac_optarg" ;; + + -target | --target | --targe | --targ | --tar | --ta | --t) + ac_prev=target ;; + -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) + target="$ac_optarg" ;; + + -v | -verbose | --verbose | --verbos | --verbo | --verb) + verbose=yes ;; + + -version | --version | --versio | --versi | --vers) + echo "configure generated by autoconf version 2.13" + exit 0 ;; + + -with-* | --with-*) + ac_package=`echo $ac_option|sed -e 's/-*with-//' -e 's/=.*//'` + # Reject names that are not valid shell variable names. + if test -n "`echo $ac_package| sed 's/[-_a-zA-Z0-9]//g'`"; then + { echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; } + fi + ac_package=`echo $ac_package| sed 's/-/_/g'` + case "$ac_option" in + *=*) ;; + *) ac_optarg=yes ;; + esac + eval "with_${ac_package}='$ac_optarg'" ;; + + -without-* | --without-*) + ac_package=`echo $ac_option|sed -e 's/-*without-//'` + # Reject names that are not valid shell variable names. + if test -n "`echo $ac_package| sed 's/[-a-zA-Z0-9_]//g'`"; then + { echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; } + fi + ac_package=`echo $ac_package| sed 's/-/_/g'` + eval "with_${ac_package}=no" ;; + + --x) + # Obsolete; use --with-x. 
+ with_x=yes ;; + + -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ + | --x-incl | --x-inc | --x-in | --x-i) + ac_prev=x_includes ;; + -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ + | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) + x_includes="$ac_optarg" ;; + + -x-libraries | --x-libraries | --x-librarie | --x-librari \ + | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) + ac_prev=x_libraries ;; + -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ + | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) + x_libraries="$ac_optarg" ;; + + -*) { echo "configure: error: $ac_option: invalid option; use --help to show usage" 1>&2; exit 1; } + ;; + + *) + if test -n "`echo $ac_option| sed 's/[-a-z0-9.]//g'`"; then + echo "configure: warning: $ac_option: invalid host type" 1>&2 + fi + if test "x$nonopt" != xNONE; then + { echo "configure: error: can only configure for one host and one target at a time" 1>&2; exit 1; } + fi + nonopt="$ac_option" + ;; + + esac +done + +if test -n "$ac_prev"; then + { echo "configure: error: missing argument to --`echo $ac_prev | sed 's/_/-/g'`" 1>&2; exit 1; } +fi + +trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15 + +# File descriptor usage: +# 0 standard input +# 1 file creation +# 2 errors and warnings +# 3 some systems may open it to /dev/tty +# 4 used on the Kubota Titan +# 6 checking for... messages and results +# 5 compiler messages saved in config.log +if test "$silent" = yes; then + exec 6>/dev/null +else + exec 6>&1 +fi +exec 5>./config.log + +echo "\ +This file contains any messages produced by compilers while +running configure, to aid debugging if configure makes a mistake. +" 1>&5 + +# Strip out --no-create and --no-recursion so they do not pile up. +# Also quote any args containing shell metacharacters. 
+ac_configure_args= +for ac_arg +do + case "$ac_arg" in + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c) ;; + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) ;; + *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*) + ac_configure_args="$ac_configure_args '$ac_arg'" ;; + *) ac_configure_args="$ac_configure_args $ac_arg" ;; + esac +done + +# NLS nuisances. +# Only set these to C if already set. These must not be set unconditionally +# because not all systems understand e.g. LANG=C (notably SCO). +# Fixing LC_MESSAGES prevents Solaris sh from translating var values in `set'! +# Non-C LC_CTYPE values break the ctype check. +if test "${LANG+set}" = set; then LANG=C; export LANG; fi +if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi +if test "${LC_MESSAGES+set}" = set; then LC_MESSAGES=C; export LC_MESSAGES; fi +if test "${LC_CTYPE+set}" = set; then LC_CTYPE=C; export LC_CTYPE; fi + +# confdefs.h avoids OS command line length limits that DEFS can exceed. +rm -rf conftest* confdefs.h +# AIX cpp loses on an empty file, so make sure it contains at least a newline. +echo > confdefs.h + +# A filename unique to this package, relative to the directory that +# configure is in, which we can look for to find out if srcdir is correct. +ac_unique_file=src/hmmpfam.c + +# Find the source files, if location was not specified. +if test -z "$srcdir"; then + ac_srcdir_defaulted=yes + # Try the directory containing this script, then its parent. + ac_prog=$0 + ac_confdir=`echo $ac_prog|sed 's%/[^/][^/]*$%%'` + test "x$ac_confdir" = "x$ac_prog" && ac_confdir=. + srcdir=$ac_confdir + if test ! -r $srcdir/$ac_unique_file; then + srcdir=.. + fi +else + ac_srcdir_defaulted=no +fi +if test ! -r $srcdir/$ac_unique_file; then + if test "$ac_srcdir_defaulted" = yes; then + { echo "configure: error: can not find sources in $ac_confdir or .." 
1>&2; exit 1; } + else + { echo "configure: error: can not find sources in $srcdir" 1>&2; exit 1; } + fi +fi +srcdir=`echo "${srcdir}" | sed 's%\([^/]\)/*$%\1%'` + +# Prefer explicitly selected file to automatically selected ones. +if test -z "$CONFIG_SITE"; then + if test "x$prefix" != xNONE; then + CONFIG_SITE="$prefix/share/config.site $prefix/etc/config.site" + else + CONFIG_SITE="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site" + fi +fi +for ac_site_file in $CONFIG_SITE; do + if test -r "$ac_site_file"; then + echo "loading site script $ac_site_file" + . "$ac_site_file" + fi +done + +if test -r "$cache_file"; then + echo "loading cache $cache_file" + . $cache_file +else + echo "creating cache $cache_file" + > $cache_file +fi + +ac_ext=c +# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. +ac_cpp='$CPP $CPPFLAGS' +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +cross_compiling=$ac_cv_prog_cc_cross + +ac_exeext= +ac_objext=o +if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then + # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu. + if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then + ac_n= ac_c=' +' ac_t=' ' + else + ac_n=-n ac_c= ac_t= + fi +else + ac_n= ac_c='\c' ac_t= +fi + + + +echo " Welcome to HMMER... configuring for your system." + + + + + + + + +# Extract the first word of "gcc", so it can be a program name with args. +set dummy gcc; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:542: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. 
+else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_prog_CC="gcc" + break + fi + done + IFS="$ac_save_ifs" +fi +fi +CC="$ac_cv_prog_CC" +if test -n "$CC"; then + echo "$ac_t""$CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +if test -z "$CC"; then + # Extract the first word of "cc", so it can be a program name with args. +set dummy cc; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:572: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_prog_rejected=no + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + if test "$ac_dir/$ac_word" = "/usr/ucb/cc"; then + ac_prog_rejected=yes + continue + fi + ac_cv_prog_CC="cc" + break + fi + done + IFS="$ac_save_ifs" +if test $ac_prog_rejected = yes; then + # We found a bogon in the path, so make sure we never use it. + set dummy $ac_cv_prog_CC + shift + if test $# -gt 0; then + # We chose a different compiler from the bogus one. + # However, it has the same basename, so the bogon will be chosen + # first if we set CC to just the basename; use the full file name. + shift + set dummy "$ac_dir/$ac_word" "$@" + shift + ac_cv_prog_CC="$@" + fi +fi +fi +fi +CC="$ac_cv_prog_CC" +if test -n "$CC"; then + echo "$ac_t""$CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + + if test -z "$CC"; then + case "`uname -s`" in + *win32* | *WIN32*) + # Extract the first word of "cl", so it can be a program name with args. +set dummy cl; ac_word=$2 +echo $ac_n "checking for $ac_word""... 
$ac_c" 1>&6 +echo "configure:623: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_prog_CC="cl" + break + fi + done + IFS="$ac_save_ifs" +fi +fi +CC="$ac_cv_prog_CC" +if test -n "$CC"; then + echo "$ac_t""$CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + ;; + esac + fi + test -z "$CC" && { echo "configure: error: no acceptable cc found in \$PATH" 1>&2; exit 1; } +fi + +echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works""... $ac_c" 1>&6 +echo "configure:655: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5 + +ac_ext=c +# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. +ac_cpp='$CPP $CPPFLAGS' +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +cross_compiling=$ac_cv_prog_cc_cross + +cat > conftest.$ac_ext << EOF + +#line 666 "configure" +#include "confdefs.h" + +main(){return(0);} +EOF +if { (eval echo configure:671: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + ac_cv_prog_cc_works=yes + # If we can't run a trivial program, we are probably using a cross compiler. + if (./conftest; exit) 2>/dev/null; then + ac_cv_prog_cc_cross=no + else + ac_cv_prog_cc_cross=yes + fi +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + ac_cv_prog_cc_works=no +fi +rm -fr conftest* +ac_ext=c +# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. 
+ac_cpp='$CPP $CPPFLAGS' +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +cross_compiling=$ac_cv_prog_cc_cross + +echo "$ac_t""$ac_cv_prog_cc_works" 1>&6 +if test $ac_cv_prog_cc_works = no; then + { echo "configure: error: installation or configuration problem: C compiler cannot create executables." 1>&2; exit 1; } +fi +echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler""... $ac_c" 1>&6 +echo "configure:697: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5 +echo "$ac_t""$ac_cv_prog_cc_cross" 1>&6 +cross_compiling=$ac_cv_prog_cc_cross + +echo $ac_n "checking whether we are using GNU C""... $ac_c" 1>&6 +echo "configure:702: checking whether we are using GNU C" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_gcc'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.c <&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then + ac_cv_prog_gcc=yes +else + ac_cv_prog_gcc=no +fi +fi + +echo "$ac_t""$ac_cv_prog_gcc" 1>&6 + +if test $ac_cv_prog_gcc = yes; then + GCC=yes +else + GCC= +fi + +ac_test_CFLAGS="${CFLAGS+set}" +ac_save_CFLAGS="$CFLAGS" +CFLAGS= +echo $ac_n "checking whether ${CC-cc} accepts -g""... 
$ac_c" 1>&6 +echo "configure:730: checking whether ${CC-cc} accepts -g" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_cc_g'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + echo 'void f(){}' > conftest.c +if test -z "`${CC-cc} -g -c conftest.c 2>&1`"; then + ac_cv_prog_cc_g=yes +else + ac_cv_prog_cc_g=no +fi +rm -f conftest* + +fi + +echo "$ac_t""$ac_cv_prog_cc_g" 1>&6 +if test "$ac_test_CFLAGS" = set; then + CFLAGS="$ac_save_CFLAGS" +elif test $ac_cv_prog_cc_g = yes; then + if test "$GCC" = yes; then + CFLAGS="-g -O2" + else + CFLAGS="-g" + fi +else + if test "$GCC" = yes; then + CFLAGS="-O2" + else + CFLAGS= + fi +fi + +echo $ac_n "checking whether ln -s works""... $ac_c" 1>&6 +echo "configure:762: checking whether ln -s works" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_LN_S'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + rm -f conftestdata +if ln -s X conftestdata 2>/dev/null +then + rm -f conftestdata + ac_cv_prog_LN_S="ln -s" +else + ac_cv_prog_LN_S=ln +fi +fi +LN_S="$ac_cv_prog_LN_S" +if test "$ac_cv_prog_LN_S" = "ln -s"; then + echo "$ac_t""yes" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +# Extract the first word of "ranlib", so it can be a program name with args. +set dummy ranlib; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:785: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$RANLIB"; then + ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. 
+ if test -f $ac_dir/$ac_word; then + ac_cv_prog_RANLIB="ranlib" + break + fi + done + IFS="$ac_save_ifs" + test -z "$ac_cv_prog_RANLIB" && ac_cv_prog_RANLIB=":" +fi +fi +RANLIB="$ac_cv_prog_RANLIB" +if test -n "$RANLIB"; then + echo "$ac_t""$RANLIB" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + + + + + echo $ac_n "checking whether your make is GNU make""... $ac_c" 1>&6 +echo "configure:816: checking whether your make is GNU make" >&5 + foundGNUmake='nope, assuming sysv make.' ; + EXEC_DEPENDENCY=\$\$\@.o ; + if ( make --version nothing 2> /dev/null | grep GNU > /dev/null ) ; then + foundGNUmake='yes, it is.' ; + EXEC_DEPENDENCY='%: %.o' ; + fi + echo "$ac_t""$foundGNUmake" 1>&6 + + + +# Check whether --with-pvm or --without-pvm was given. +if test "${with_pvm+set}" = set; then + withval="$with_pvm" + case $with_pvm in + yes) echo 'Configuring for PVM' + PVMLIBDIR="-L${PVM_ROOT}/lib/${PVM_ARCH}" + PVMINCDIR="-I${PVM_ROOT}/include" + PVMFLAG="-DHMMER_PVM" + PVMPROGS="hmmcalibrate-pvm hmmpfam-pvm hmmsearch-pvm" + PVMLIBS="-lpvm3" + ;; + no) ;; + *) echo "Ignoring unknown argument to --with-pvm: $with_pvm" + ;; +esac +fi + + +ac_aux_dir= +for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do + if test -f $ac_dir/install-sh; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install-sh -c" + break + elif test -f $ac_dir/install.sh; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install.sh -c" + break + fi +done +if test -z "$ac_aux_dir"; then + { echo "configure: error: can not find install-sh or install.sh in $srcdir $srcdir/.. $srcdir/../.." 1>&2; exit 1; } +fi +ac_config_guess=$ac_aux_dir/config.guess +ac_config_sub=$ac_aux_dir/config.sub +ac_configure=$ac_aux_dir/configure # This should be Cygnus configure. + + +# Make sure we can run config.sub. +if ${CONFIG_SHELL-/bin/sh} $ac_config_sub sun4 >/dev/null 2>&1; then : +else { echo "configure: error: can not run $ac_config_sub" 1>&2; exit 1; } +fi + +echo $ac_n "checking host system type""... 
$ac_c" 1>&6 +echo "configure:871: checking host system type" >&5 + +host_alias=$host +case "$host_alias" in +NONE) + case $nonopt in + NONE) + if host_alias=`${CONFIG_SHELL-/bin/sh} $ac_config_guess`; then : + else { echo "configure: error: can not guess host type; you must specify one" 1>&2; exit 1; } + fi ;; + *) host_alias=$nonopt ;; + esac ;; +esac + +host=`${CONFIG_SHELL-/bin/sh} $ac_config_sub $host_alias` +host_cpu=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'` +host_vendor=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'` +host_os=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'` +echo "$ac_t""$host" 1>&6 + +echo $ac_n "checking how to run the C preprocessor""... $ac_c" 1>&6 +echo "configure:892: checking how to run the C preprocessor" >&5 +# On Suns, sometimes $CPP names a directory. +if test -n "$CPP" && test -d "$CPP"; then + CPP= +fi +if test -z "$CPP"; then +if eval "test \"`echo '$''{'ac_cv_prog_CPP'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + # This must be in double quotes, not single quotes, because CPP may get + # substituted into the Makefile and "${CC-cc}" will confuse make. + CPP="${CC-cc} -E" + # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. 
+ cat > conftest.$ac_ext < +Syntax Error +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:913: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + : +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + CPP="${CC-cc} -E -traditional-cpp" + cat > conftest.$ac_ext < +Syntax Error +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:930: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + : +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + CPP="${CC-cc} -nologo -E" + cat > conftest.$ac_ext < +Syntax Error +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:947: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + : +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + CPP=/lib/cpp +fi +rm -f conftest* +fi +rm -f conftest* +fi +rm -f conftest* + ac_cv_prog_CPP="$CPP" +fi + CPP="$ac_cv_prog_CPP" +else + ac_cv_prog_CPP="$CPP" +fi +echo "$ac_t""$CPP" 1>&6 + +# Check whether --enable-threads or --disable-threads was given. +if test "${enable_threads+set}" = set; then + enableval="$enable_threads" + case $enable_threads in + yes) echo "Enabling POSIX threads support" + + +acx_pthread_ok=no + +# First, check if the POSIX threads header, pthread.h, is available. +# If it isn't, don't bother looking for the threads libraries. +ac_safe=`echo "pthread.h" | sed 'y%./+-%__p_%'` +echo $ac_n "checking for pthread.h""... 
$ac_c" 1>&6 +echo "configure:984: checking for pthread.h" >&5 +if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:994: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + rm -rf conftest* + eval "ac_cv_header_$ac_safe=yes" +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_header_$ac_safe=no" +fi +rm -f conftest* +fi +if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then + echo "$ac_t""yes" 1>&6 + : +else + echo "$ac_t""no" 1>&6 +acx_pthread_ok=noheader +fi + + +# We must check for the threads library under a number of different +# names; the ordering is very important because some systems +# (e.g. DEC) have both -lpthread and -lpthreads, where one of the +# libraries is broken (non-POSIX). + +# First of all, check if the user has set any of the PTHREAD_LIBS, +# etcetera environment variables, and if threads linking works using +# them: +if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then + save_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + save_LIBS="$LIBS" + LIBS="$PTHREAD_LIBS $LIBS" + echo $ac_n "checking for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS""... 
$ac_c" 1>&6 +echo "configure:1031: checking for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS" >&5 + cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + acx_pthread_ok=yes +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 +fi +rm -f conftest* + echo "$ac_t""$acx_pthread_ok" 1>&6 + if test x"$acx_pthread_ok" = xno; then + PTHREAD_LIBS="" + PTHREAD_CFLAGS="" + fi + LIBS="$save_LIBS" + CFLAGS="$save_CFLAGS" +fi + +# Create a list of thread flags to try. Items starting with a "-" are +# C compiler flags, and other items are library names, except for "none" +# which indicates that we try without any flags at all. + +acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt" + +# The ordering *is* (sometimes) important. Some notes on the +# individual items follow: + +# pthreads: AIX (must check this before -lpthread) +# none: in case threads are in libc; should be tried before -Kthread and +# other compiler flags to prevent continual compiler warnings +# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h) +# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) +# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) +# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads) +# -pthreads: Solaris/gcc +# -mthreads: Mingw32/gcc, Lynx/gcc +# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it +# doesn't hurt to check since this sometimes defines pthreads too; +# also defines -D_REENTRANT) +# pthread: Linux, etcetera +# --thread-safe: KAI C++ + +case "${host_cpu}-${host_os}" in + *solaris*) + + # On Solaris (at least, for some versions), libc contains stubbed + # (non-functional) versions of the pthreads routines, so link-based + # tests will erroneously succeed. (We need to link with -pthread or + # -lpthread.) 
(The stubs are missing pthread_cleanup_push, or rather + # a function called by this macro, so we could check for that, but + # who knows whether they'll stub that too in a future libc.) So, + # we'll just look for -pthreads and -lpthread first: + + acx_pthread_flags="-pthread -pthreads pthread -mt $acx_pthread_flags" + ;; +esac + +if test x"$acx_pthread_ok" = xno; then +for flag in $acx_pthread_flags; do + + case $flag in + none) + echo $ac_n "checking whether pthreads work without any flags""... $ac_c" 1>&6 +echo "configure:1106: checking whether pthreads work without any flags" >&5 + ;; + + -*) + echo $ac_n "checking whether pthreads work with $flag""... $ac_c" 1>&6 +echo "configure:1111: checking whether pthreads work with $flag" >&5 + PTHREAD_CFLAGS="$flag" + ;; + + *) + echo $ac_n "checking for the pthreads library -l$flag""... $ac_c" 1>&6 +echo "configure:1117: checking for the pthreads library -l$flag" >&5 + PTHREAD_LIBS="-l$flag" + ;; + esac + + save_LIBS="$LIBS" + save_CFLAGS="$CFLAGS" + LIBS="$PTHREAD_LIBS $LIBS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + + # Check for various functions. We must include pthread.h, + # since some functions may be macros. (On the Sequent, we + # need a special flag -Kthread to make this header compile.) + # We check for pthread_join because it is in -lpthread on IRIX + # while pthread_create is in libc. We check for pthread_attr_init + # due to DEC craziness with -lpthreads. We check for + # pthread_cleanup_push because it is one of the few pthread + # functions on Solaris that doesn't have a non-functional libc stub. + # We try pthread_create on general principles. 
+ cat > conftest.$ac_ext < +int main() { +pthread_t th; pthread_join(th, 0); + pthread_attr_init(0); pthread_cleanup_push(0, 0); + pthread_create(0,0,0,0); pthread_cleanup_pop(0); +; return 0; } +EOF +if { (eval echo configure:1146: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + acx_pthread_ok=yes +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 +fi +rm -f conftest* + + LIBS="$save_LIBS" + CFLAGS="$save_CFLAGS" + + echo "$ac_t""$acx_pthread_ok" 1>&6 + if test "x$acx_pthread_ok" = xyes; then + break; + fi + + PTHREAD_LIBS="" + PTHREAD_CFLAGS="" +done +fi + +# Various other checks: +if test "x$acx_pthread_ok" = xyes; then + save_LIBS="$LIBS" + LIBS="$PTHREAD_LIBS $LIBS" + save_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + + # Detect AIX lossage: threads are created detached by default + # and the JOINABLE attribute has a nonstandard name (UNDETACHED). + echo $ac_n "checking for joinable pthread attribute""... 
$ac_c" 1>&6 +echo "configure:1178: checking for joinable pthread attribute" >&5 + cat > conftest.$ac_ext < +int main() { +int attr=PTHREAD_CREATE_JOINABLE; +; return 0; } +EOF +if { (eval echo configure:1187: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + ok=PTHREAD_CREATE_JOINABLE +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ok=unknown +fi +rm -f conftest* + if test x"$ok" = xunknown; then + cat > conftest.$ac_ext < +int main() { +int attr=PTHREAD_CREATE_UNDETACHED; +; return 0; } +EOF +if { (eval echo configure:1206: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + ok=PTHREAD_CREATE_UNDETACHED +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ok=unknown +fi +rm -f conftest* + fi + if test x"$ok" != xPTHREAD_CREATE_JOINABLE; then + cat >> confdefs.h <<\EOF +#define PTHREAD_CREATE_JOINABLE $ok +EOF + + fi + echo "$ac_t""${ok}" 1>&6 + if test x"$ok" = xunknown; then + echo "configure: warning: we do not know how to create joinable pthreads" 1>&2 + fi + + echo $ac_n "checking if more special flags are required for pthreads""... $ac_c" 1>&6 +echo "configure:1229: checking if more special flags are required for pthreads" >&5 + flag=no + case "${host_cpu}-${host_os}" in + *-aix* | *-freebsd*) flag="-D_THREAD_SAFE";; + *solaris* | alpha*-osf*) flag="-D_REENTRANT";; + esac + echo "$ac_t""${flag}" 1>&6 + if test "x$flag" != xno; then + PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS" + fi + + LIBS="$save_LIBS" + CFLAGS="$save_CFLAGS" + + # More AIX lossage: must compile with cc_r + # Extract the first word of "cc_r", so it can be a program name with args. +set dummy cc_r; ac_word=$2 +echo $ac_n "checking for $ac_word""... 
$ac_c" 1>&6 +echo "configure:1247: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_PTHREAD_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$PTHREAD_CC"; then + ac_cv_prog_PTHREAD_CC="$PTHREAD_CC" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_prog_PTHREAD_CC="cc_r" + break + fi + done + IFS="$ac_save_ifs" + test -z "$ac_cv_prog_PTHREAD_CC" && ac_cv_prog_PTHREAD_CC="${CC}" +fi +fi +PTHREAD_CC="$ac_cv_prog_PTHREAD_CC" +if test -n "$PTHREAD_CC"; then + echo "$ac_t""$PTHREAD_CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +else + PTHREAD_CC="$CC" +fi + + + + + +# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: +if test x"$acx_pthread_ok" = xyes; then + MDEFS="${MDEFS} -DHMMER_THREADS" + : +else + acx_pthread_ok=no + +fi + + + ;; + no) echo "POSIX threads support disabled" + ;; + *) echo "Ignoring unknown argument to --disable-threads: $enable_threads" + ;; +esac +else + + echo " Trying to enable default POSIX threads support" + + +acx_pthread_ok=no + +# First, check if the POSIX threads header, pthread.h, is available. +# If it isn't, don't bother looking for the threads libraries. +ac_safe=`echo "pthread.h" | sed 'y%./+-%__p_%'` +echo $ac_n "checking for pthread.h""... 
$ac_c" 1>&6 +echo "configure:1309: checking for pthread.h" >&5 +if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1319: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + rm -rf conftest* + eval "ac_cv_header_$ac_safe=yes" +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_header_$ac_safe=no" +fi +rm -f conftest* +fi +if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then + echo "$ac_t""yes" 1>&6 + : +else + echo "$ac_t""no" 1>&6 +acx_pthread_ok=noheader +fi + + +# We must check for the threads library under a number of different +# names; the ordering is very important because some systems +# (e.g. DEC) have both -lpthread and -lpthreads, where one of the +# libraries is broken (non-POSIX). + +# First of all, check if the user has set any of the PTHREAD_LIBS, +# etcetera environment variables, and if threads linking works using +# them: +if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then + save_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + save_LIBS="$LIBS" + LIBS="$PTHREAD_LIBS $LIBS" + echo $ac_n "checking for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS""... 
$ac_c" 1>&6 +echo "configure:1356: checking for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS" >&5 + cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + acx_pthread_ok=yes +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 +fi +rm -f conftest* + echo "$ac_t""$acx_pthread_ok" 1>&6 + if test x"$acx_pthread_ok" = xno; then + PTHREAD_LIBS="" + PTHREAD_CFLAGS="" + fi + LIBS="$save_LIBS" + CFLAGS="$save_CFLAGS" +fi + +# Create a list of thread flags to try. Items starting with a "-" are +# C compiler flags, and other items are library names, except for "none" +# which indicates that we try without any flags at all. + +acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt" + +# The ordering *is* (sometimes) important. Some notes on the +# individual items follow: + +# pthreads: AIX (must check this before -lpthread) +# none: in case threads are in libc; should be tried before -Kthread and +# other compiler flags to prevent continual compiler warnings +# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h) +# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) +# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) +# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads) +# -pthreads: Solaris/gcc +# -mthreads: Mingw32/gcc, Lynx/gcc +# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it +# doesn't hurt to check since this sometimes defines pthreads too; +# also defines -D_REENTRANT) +# pthread: Linux, etcetera +# --thread-safe: KAI C++ + +case "${host_cpu}-${host_os}" in + *solaris*) + + # On Solaris (at least, for some versions), libc contains stubbed + # (non-functional) versions of the pthreads routines, so link-based + # tests will erroneously succeed. (We need to link with -pthread or + # -lpthread.) 
(The stubs are missing pthread_cleanup_push, or rather + # a function called by this macro, so we could check for that, but + # who knows whether they'll stub that too in a future libc.) So, + # we'll just look for -pthreads and -lpthread first: + + acx_pthread_flags="-pthread -pthreads pthread -mt $acx_pthread_flags" + ;; +esac + +if test x"$acx_pthread_ok" = xno; then +for flag in $acx_pthread_flags; do + + case $flag in + none) + echo $ac_n "checking whether pthreads work without any flags""... $ac_c" 1>&6 +echo "configure:1431: checking whether pthreads work without any flags" >&5 + ;; + + -*) + echo $ac_n "checking whether pthreads work with $flag""... $ac_c" 1>&6 +echo "configure:1436: checking whether pthreads work with $flag" >&5 + PTHREAD_CFLAGS="$flag" + ;; + + *) + echo $ac_n "checking for the pthreads library -l$flag""... $ac_c" 1>&6 +echo "configure:1442: checking for the pthreads library -l$flag" >&5 + PTHREAD_LIBS="-l$flag" + ;; + esac + + save_LIBS="$LIBS" + save_CFLAGS="$CFLAGS" + LIBS="$PTHREAD_LIBS $LIBS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + + # Check for various functions. We must include pthread.h, + # since some functions may be macros. (On the Sequent, we + # need a special flag -Kthread to make this header compile.) + # We check for pthread_join because it is in -lpthread on IRIX + # while pthread_create is in libc. We check for pthread_attr_init + # due to DEC craziness with -lpthreads. We check for + # pthread_cleanup_push because it is one of the few pthread + # functions on Solaris that doesn't have a non-functional libc stub. + # We try pthread_create on general principles. 
+ cat > conftest.$ac_ext < +int main() { +pthread_t th; pthread_join(th, 0); + pthread_attr_init(0); pthread_cleanup_push(0, 0); + pthread_create(0,0,0,0); pthread_cleanup_pop(0); +; return 0; } +EOF +if { (eval echo configure:1471: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + acx_pthread_ok=yes +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 +fi +rm -f conftest* + + LIBS="$save_LIBS" + CFLAGS="$save_CFLAGS" + + echo "$ac_t""$acx_pthread_ok" 1>&6 + if test "x$acx_pthread_ok" = xyes; then + break; + fi + + PTHREAD_LIBS="" + PTHREAD_CFLAGS="" +done +fi + +# Various other checks: +if test "x$acx_pthread_ok" = xyes; then + save_LIBS="$LIBS" + LIBS="$PTHREAD_LIBS $LIBS" + save_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + + # Detect AIX lossage: threads are created detached by default + # and the JOINABLE attribute has a nonstandard name (UNDETACHED). + echo $ac_n "checking for joinable pthread attribute""... 
$ac_c" 1>&6 +echo "configure:1503: checking for joinable pthread attribute" >&5 + cat > conftest.$ac_ext < +int main() { +int attr=PTHREAD_CREATE_JOINABLE; +; return 0; } +EOF +if { (eval echo configure:1512: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + ok=PTHREAD_CREATE_JOINABLE +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ok=unknown +fi +rm -f conftest* + if test x"$ok" = xunknown; then + cat > conftest.$ac_ext < +int main() { +int attr=PTHREAD_CREATE_UNDETACHED; +; return 0; } +EOF +if { (eval echo configure:1531: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + ok=PTHREAD_CREATE_UNDETACHED +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ok=unknown +fi +rm -f conftest* + fi + if test x"$ok" != xPTHREAD_CREATE_JOINABLE; then + cat >> confdefs.h <<\EOF +#define PTHREAD_CREATE_JOINABLE $ok +EOF + + fi + echo "$ac_t""${ok}" 1>&6 + if test x"$ok" = xunknown; then + echo "configure: warning: we do not know how to create joinable pthreads" 1>&2 + fi + + echo $ac_n "checking if more special flags are required for pthreads""... $ac_c" 1>&6 +echo "configure:1554: checking if more special flags are required for pthreads" >&5 + flag=no + case "${host_cpu}-${host_os}" in + *-aix* | *-freebsd*) flag="-D_THREAD_SAFE";; + *solaris* | alpha*-osf*) flag="-D_REENTRANT";; + esac + echo "$ac_t""${flag}" 1>&6 + if test "x$flag" != xno; then + PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS" + fi + + LIBS="$save_LIBS" + CFLAGS="$save_CFLAGS" + + # More AIX lossage: must compile with cc_r + # Extract the first word of "cc_r", so it can be a program name with args. +set dummy cc_r; ac_word=$2 +echo $ac_n "checking for $ac_word""... 
$ac_c" 1>&6 +echo "configure:1572: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_PTHREAD_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$PTHREAD_CC"; then + ac_cv_prog_PTHREAD_CC="$PTHREAD_CC" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_prog_PTHREAD_CC="cc_r" + break + fi + done + IFS="$ac_save_ifs" + test -z "$ac_cv_prog_PTHREAD_CC" && ac_cv_prog_PTHREAD_CC="${CC}" +fi +fi +PTHREAD_CC="$ac_cv_prog_PTHREAD_CC" +if test -n "$PTHREAD_CC"; then + echo "$ac_t""$PTHREAD_CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +else + PTHREAD_CC="$CC" +fi + + + + + +# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: +if test x"$acx_pthread_ok" = xyes; then + MDEFS="${MDEFS} -DHMMER_THREADS" + : +else + acx_pthread_ok=no + +fi + + + +fi + + + +for ac_func in pthread_setconcurrency +do +echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 +echo "configure:1625: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. */ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. 
*/ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:1653: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +fi +done + +for ac_func in pthread_attr_setscope +do +echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 +echo "configure:1680: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. */ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. 
*/ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:1708: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +fi +done + + +for ac_func in ntohs +do +echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 +echo "configure:1736: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. */ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. 
*/ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:1764: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +echo $ac_n "checking for ntohs in -lsocket""... $ac_c" 1>&6 +echo "configure:1786: checking for ntohs in -lsocket" >&5 +ac_lib_var=`echo socket'_'ntohs | sed 'y%./+-%__p_%'` +if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_save_LIBS="$LIBS" +LIBS="-lsocket $LIBS" +cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=no" +fi +rm -f conftest* +LIBS="$ac_save_LIBS" + +fi +if eval "test \"`echo '$ac_cv_lib_'$ac_lib_var`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_lib=HAVE_LIB`echo socket | sed -e 's/^a-zA-Z0-9_/_/g' \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/'` + cat >> confdefs.h <&6 +fi + +fi +done + +for ac_func in ntohl +do +echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 +echo "configure:1838: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. 
*/ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. */ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:1866: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +echo $ac_n "checking for ntohl in -lsocket""... 
$ac_c" 1>&6 +echo "configure:1888: checking for ntohl in -lsocket" >&5 +ac_lib_var=`echo socket'_'ntohl | sed 'y%./+-%__p_%'` +if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_save_LIBS="$LIBS" +LIBS="-lsocket $LIBS" +cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=no" +fi +rm -f conftest* +LIBS="$ac_save_LIBS" + +fi +if eval "test \"`echo '$ac_cv_lib_'$ac_lib_var`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_lib=HAVE_LIB`echo socket | sed -e 's/^a-zA-Z0-9_/_/g' \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/'` + cat >> confdefs.h <&6 +fi + +fi +done + +for ac_func in htons +do +echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 +echo "configure:1940: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. */ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. 
*/ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:1968: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +echo $ac_n "checking for htons in -lsocket""... $ac_c" 1>&6 +echo "configure:1990: checking for htons in -lsocket" >&5 +ac_lib_var=`echo socket'_'htons | sed 'y%./+-%__p_%'` +if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_save_LIBS="$LIBS" +LIBS="-lsocket $LIBS" +cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=no" +fi +rm -f conftest* +LIBS="$ac_save_LIBS" + +fi +if eval "test \"`echo '$ac_cv_lib_'$ac_lib_var`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_lib=HAVE_LIB`echo socket | sed -e 's/^a-zA-Z0-9_/_/g' \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/'` + cat >> confdefs.h <&6 +fi + +fi +done + +for ac_func in htonl +do +echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 +echo "configure:2042: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. 
*/ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. */ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:2070: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +echo $ac_n "checking for htonl in -lsocket""... $ac_c" 1>&6 +echo "configure:2092: checking for htonl in -lsocket" >&5 +ac_lib_var=`echo socket'_'htonl | sed 'y%./+-%__p_%'` +if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_save_LIBS="$LIBS" +LIBS="-lsocket $LIBS" +cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=no" +fi +rm -f conftest* +LIBS="$ac_save_LIBS" + +fi +if eval "test \"`echo '$ac_cv_lib_'$ac_lib_var`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_lib=HAVE_LIB`echo socket | sed -e 's/^a-zA-Z0-9_/_/g' \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/'` + cat >> confdefs.h <&6 +fi + +fi +done + + +subdirs="squid" + + +echo " Configuration complete. 
Writing Makefiles and such..." +trap '' 1 2 15 +cat > confcache <<\EOF +# This file is a shell script that caches the results of configure +# tests run on this system so they can be shared between configure +# scripts and configure runs. It is not useful on other systems. +# If it contains results you don't want to keep, you may remove or edit it. +# +# By default, configure uses ./config.cache as the cache file, +# creating it if it does not exist already. You can give configure +# the --cache-file=FILE option to use a different cache file; that is +# what configure does when it calls configure scripts in +# subdirectories, so they share the cache. +# Giving --cache-file=/dev/null disables caching, for debugging configure. +# config.status only pays attention to the cache file if you give it the +# --recheck option to rerun configure. +# +EOF +# The following way of writing the cache mishandles newlines in values, +# but we know of no workaround that is simple, portable, and efficient. +# So, don't put newlines in cache variables' values. +# Ultrix sh set writes to stderr and can't be redirected directly, +# and sets the high bit in the cache file unless we assign to the vars. +(set) 2>&1 | + case `(ac_space=' '; set | grep ac_space) 2>&1` in + *ac_space=\ *) + # `set' does not quote correctly, so add quotes (double-quote substitution + # turns \\\\ into \\, and sed turns \\ into \). + sed -n \ + -e "s/'/'\\\\''/g" \ + -e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p" + ;; + *) + # `set' quotes correctly as required by POSIX, so do not add quotes. 
+ sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p' + ;; + esac >> confcache +if cmp -s $cache_file confcache; then + : +else + if test -w $cache_file; then + echo "updating cache $cache_file" + cat confcache > $cache_file + else + echo "not updating unwritable cache $cache_file" + fi +fi +rm -f confcache + +trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15 + +test "x$prefix" = xNONE && prefix=$ac_default_prefix +# Let make expand exec_prefix. +test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' + +# Any assignment to VPATH causes Sun make to only execute +# the first set of double-colon rules, so remove it if not needed. +# If there is a colon in the path, we need to keep it. +if test "x$srcdir" = x.; then + ac_vpsub='/^[ ]*VPATH[ ]*=[^:]*$/d' +fi + +trap 'rm -f $CONFIG_STATUS conftest*; exit 1' 1 2 15 + +# Transform confdefs.h into DEFS. +# Protect against shell expansion while executing Makefile rules. +# Protect against Makefile macro expansion. +cat > conftest.defs <<\EOF +s%#define \([A-Za-z_][A-Za-z0-9_]*\) *\(.*\)%-D\1=\2%g +s%[ `~#$^&*(){}\\|;'"<>?]%\\&%g +s%\[%\\&%g +s%\]%\\&%g +s%\$%$$%g +EOF +DEFS=`sed -f conftest.defs confdefs.h | tr '\012' ' '` +rm -f conftest.defs + + +# Without the "./", some shells look in PATH for config.status. +: ${CONFIG_STATUS=./config.status} + +echo creating $CONFIG_STATUS +rm -f $CONFIG_STATUS +cat > $CONFIG_STATUS </dev/null | sed 1q`: +# +# $0 $ac_configure_args +# +# Compiler output produced by configure, useful for debugging +# configure, is in ./config.log if it exists. 
+ +ac_cs_usage="Usage: $CONFIG_STATUS [--recheck] [--version] [--help]" +for ac_option +do + case "\$ac_option" in + -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + echo "running \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion" + exec \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion ;; + -version | --version | --versio | --versi | --vers | --ver | --ve | --v) + echo "$CONFIG_STATUS generated by autoconf version 2.13" + exit 0 ;; + -help | --help | --hel | --he | --h) + echo "\$ac_cs_usage"; exit 0 ;; + *) echo "\$ac_cs_usage"; exit 1 ;; + esac +done + +ac_given_srcdir=$srcdir + +trap 'rm -fr `echo "Makefile src/Makefile testsuite/Makefile" | sed "s/:[^ ]*//g"` conftest*; exit 1' 1 2 15 +EOF +cat >> $CONFIG_STATUS < conftest.subs <<\\CEOF +$ac_vpsub +$extrasub +s%@SHELL@%$SHELL%g +s%@CFLAGS@%$CFLAGS%g +s%@CPPFLAGS@%$CPPFLAGS%g +s%@CXXFLAGS@%$CXXFLAGS%g +s%@FFLAGS@%$FFLAGS%g +s%@DEFS@%$DEFS%g +s%@LDFLAGS@%$LDFLAGS%g +s%@LIBS@%$LIBS%g +s%@exec_prefix@%$exec_prefix%g +s%@prefix@%$prefix%g +s%@program_transform_name@%$program_transform_name%g +s%@bindir@%$bindir%g +s%@sbindir@%$sbindir%g +s%@libexecdir@%$libexecdir%g +s%@datadir@%$datadir%g +s%@sysconfdir@%$sysconfdir%g +s%@sharedstatedir@%$sharedstatedir%g +s%@localstatedir@%$localstatedir%g +s%@libdir@%$libdir%g +s%@includedir@%$includedir%g +s%@oldincludedir@%$oldincludedir%g +s%@infodir@%$infodir%g +s%@mandir@%$mandir%g +s%@MDEFS@%$MDEFS%g +s%@PVMLIBDIR@%$PVMLIBDIR%g +s%@PVMINCDIR@%$PVMINCDIR%g +s%@PVMFLAG@%$PVMFLAG%g +s%@PVMPROGS@%$PVMPROGS%g +s%@PVMLIBS@%$PVMLIBS%g +s%@CC@%$CC%g +s%@LN_S@%$LN_S%g +s%@RANLIB@%$RANLIB%g +s%@EXEC_DEPENDENCY@%$EXEC_DEPENDENCY%g +s%@host@%$host%g +s%@host_alias@%$host_alias%g +s%@host_cpu@%$host_cpu%g +s%@host_vendor@%$host_vendor%g +s%@host_os@%$host_os%g +s%@CPP@%$CPP%g +s%@PTHREAD_CC@%$PTHREAD_CC%g +s%@PTHREAD_LIBS@%$PTHREAD_LIBS%g +s%@PTHREAD_CFLAGS@%$PTHREAD_CFLAGS%g +s%@subdirs@%$subdirs%g + +CEOF +EOF 
+ +cat >> $CONFIG_STATUS <<\EOF + +# Split the substitutions into bite-sized pieces for seds with +# small command number limits, like on Digital OSF/1 and HP-UX. +ac_max_sed_cmds=90 # Maximum number of lines to put in a sed script. +ac_file=1 # Number of current file. +ac_beg=1 # First line for current file. +ac_end=$ac_max_sed_cmds # Line after last line for current file. +ac_more_lines=: +ac_sed_cmds="" +while $ac_more_lines; do + if test $ac_beg -gt 1; then + sed "1,${ac_beg}d; ${ac_end}q" conftest.subs > conftest.s$ac_file + else + sed "${ac_end}q" conftest.subs > conftest.s$ac_file + fi + if test ! -s conftest.s$ac_file; then + ac_more_lines=false + rm -f conftest.s$ac_file + else + if test -z "$ac_sed_cmds"; then + ac_sed_cmds="sed -f conftest.s$ac_file" + else + ac_sed_cmds="$ac_sed_cmds | sed -f conftest.s$ac_file" + fi + ac_file=`expr $ac_file + 1` + ac_beg=$ac_end + ac_end=`expr $ac_end + $ac_max_sed_cmds` + fi +done +if test -z "$ac_sed_cmds"; then + ac_sed_cmds=cat +fi +EOF + +cat >> $CONFIG_STATUS <> $CONFIG_STATUS <<\EOF +for ac_file in .. $CONFIG_FILES; do if test "x$ac_file" != x..; then + # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in". + case "$ac_file" in + *:*) ac_file_in=`echo "$ac_file"|sed 's%[^:]*:%%'` + ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;; + *) ac_file_in="${ac_file}.in" ;; + esac + + # Adjust a relative srcdir, top_srcdir, and INSTALL for subdirectories. + + # Remove last slash and all that follows it. Not all systems have dirname. + ac_dir=`echo $ac_file|sed 's%/[^/][^/]*$%%'` + if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then + # The file is in a subdirectory. + test ! -d "$ac_dir" && mkdir "$ac_dir" + ac_dir_suffix="/`echo $ac_dir|sed 's%^\./%%'`" + # A "../" for each directory in $ac_dir_suffix. + ac_dots=`echo $ac_dir_suffix|sed 's%/[^/]*%../%g'` + else + ac_dir_suffix= ac_dots= + fi + + case "$ac_given_srcdir" in + .) srcdir=. + if test -z "$ac_dots"; then top_srcdir=. 
+ else top_srcdir=`echo $ac_dots|sed 's%/$%%'`; fi ;; + /*) srcdir="$ac_given_srcdir$ac_dir_suffix"; top_srcdir="$ac_given_srcdir" ;; + *) # Relative path. + srcdir="$ac_dots$ac_given_srcdir$ac_dir_suffix" + top_srcdir="$ac_dots$ac_given_srcdir" ;; + esac + + + echo creating "$ac_file" + rm -f "$ac_file" + configure_input="Generated automatically from `echo $ac_file_in|sed 's%.*/%%'` by configure." + case "$ac_file" in + *Makefile*) ac_comsub="1i\\ +# $configure_input" ;; + *) ac_comsub= ;; + esac + + ac_file_inputs=`echo $ac_file_in|sed -e "s%^%$ac_given_srcdir/%" -e "s%:% $ac_given_srcdir/%g"` + sed -e "$ac_comsub +s%@configure_input@%$configure_input%g +s%@srcdir@%$srcdir%g +s%@top_srcdir@%$top_srcdir%g +" $ac_file_inputs | (eval "$ac_sed_cmds") > $ac_file +fi; done +rm -f conftest.s* + +EOF +cat >> $CONFIG_STATUS <> $CONFIG_STATUS <<\EOF + +exit 0 +EOF +chmod +x $CONFIG_STATUS +rm -fr confdefs* $ac_clean_files +test "$no_create" = yes || ${CONFIG_SHELL-/bin/sh} $CONFIG_STATUS || exit 1 + +if test "$no_recursion" != yes; then + + # Remove --cache-file and --srcdir arguments so they do not pile up. + ac_sub_configure_args= + ac_prev= + for ac_arg in $ac_configure_args; do + if test -n "$ac_prev"; then + ac_prev= + continue + fi + case "$ac_arg" in + -cache-file | --cache-file | --cache-fil | --cache-fi \ + | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) + ac_prev=cache_file ;; + -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ + | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) + ;; + -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) + ac_prev=srcdir ;; + -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) + ;; + *) ac_sub_configure_args="$ac_sub_configure_args $ac_arg" ;; + esac + done + + for ac_config_dir in squid; do + + # Do not complain, so a configure script can configure whichever + # parts of a large source tree are present. + if test ! 
-d $srcdir/$ac_config_dir; then + continue + fi + + echo configuring in $ac_config_dir + + case "$srcdir" in + .) ;; + *) + if test -d ./$ac_config_dir || mkdir ./$ac_config_dir; then :; + else + { echo "configure: error: can not create `pwd`/$ac_config_dir" 1>&2; exit 1; } + fi + ;; + esac + + ac_popdir=`pwd` + cd $ac_config_dir + + # A "../" for each directory in /$ac_config_dir. + ac_dots=`echo $ac_config_dir|sed -e 's%^\./%%' -e 's%[^/]$%&/%' -e 's%[^/]*/%../%g'` + + case "$srcdir" in + .) # No --srcdir option. We are building in place. + ac_sub_srcdir=$srcdir ;; + /*) # Absolute path. + ac_sub_srcdir=$srcdir/$ac_config_dir ;; + *) # Relative path. + ac_sub_srcdir=$ac_dots$srcdir/$ac_config_dir ;; + esac + + # Check for guested configure; otherwise get Cygnus style configure. + if test -f $ac_sub_srcdir/configure; then + ac_sub_configure=$ac_sub_srcdir/configure + elif test -f $ac_sub_srcdir/configure.in; then + ac_sub_configure=$ac_configure + else + echo "configure: warning: no configuration information is in $ac_config_dir" 1>&2 + ac_sub_configure= + fi + + # The recursion is here. + if test -n "$ac_sub_configure"; then + + # Make the cache file name correct relative to the subdirectory. + case "$cache_file" in + /*) ac_sub_cache_file=$cache_file ;; + *) # Relative path. + ac_sub_cache_file="$ac_dots$cache_file" ;; + esac + + echo "running ${CONFIG_SHELL-/bin/sh} $ac_sub_configure $ac_sub_configure_args --cache-file=$ac_sub_cache_file --srcdir=$ac_sub_srcdir" + # The eval makes quoting arguments work. 
+ if eval ${CONFIG_SHELL-/bin/sh} $ac_sub_configure $ac_sub_configure_args --cache-file=$ac_sub_cache_file --srcdir=$ac_sub_srcdir + then : + else + { echo "configure: error: $ac_sub_configure failed for $ac_config_dir" 1>&2; exit 1; } + fi + fi + + cd $ac_popdir + done +fi + + diff --git a/forester/archive/RIO/others/hmmer/documentation/man/hmmalign.man b/forester/archive/RIO/others/hmmer/documentation/man/hmmalign.man new file mode 100644 index 0000000..dc08445 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/documentation/man/hmmalign.man @@ -0,0 +1,154 @@ +.TH "hmmalign" 1 @RELEASEDATE@ "HMMER @RELEASE@" "HMMER Manual" + +.SH NAME +.TP +hmmalign - align sequences to an HMM profile + +.SH SYNOPSIS +.B hmmalign +.I [options] +.I hmmfile +.I seqfile + +.SH DESCRIPTION + +.B hmmalign +reads an HMM file from +.I hmmfile +and a set of sequences from +.I seqfile, +aligns the sequences to the profile HMM, +and outputs a multiple sequence alignment. + +.PP +.I seqfile +may be in any unaligned or aligned file format +accepted by HMMER. If it is in a multiple alignment format +(e.g. Stockholm, MSF, SELEX, ClustalW), the existing alignment +is ignored (i.e., the sequences are read as if they were +unaligned - hmmalign will align them the way it wants). + +.SH OPTIONS + +.TP +.B -h +Print brief help; includes version number and summary of +all options, including expert options. + +.TP +.B -m +Include in the alignment only those symbols aligned to match states. +Do not show symbols assigned to insert states. + +.TP +.BI -o " <f>" +Save alignment to file +.I <f> +instead of to standard output. + +.TP +.B -q +quiet; suppress all output except the alignment itself. +Useful for piping or redirecting the output. + +.SH EXPERT OPTIONS + +.TP +.BI --informat " <s>" +Assert that the input +.I seqfile +is in format +.I <s>; +do not run Babelfish format autodetection. 
This increases +the reliability of the program somewhat, because +the Babelfish can make mistakes; particularly +recommended for unattended, high-throughput runs +of HMMER. Valid format strings include FASTA, +GENBANK, EMBL, GCG, PIR, STOCKHOLM, SELEX, MSF, +CLUSTAL, and PHYLIP. See the User's Guide for a complete +list. + +.TP +.BI --mapali " <f>" +Reads an alignment from file +.I <f> +and aligns it as a single object to the HMM; i.e. the alignment in +.I <f> +is held fixed. +This allows you to align sequences to a model with +.B hmmalign +and view them in the context of an existing trusted +multiple alignment. +The alignment to the alignment is defined by a "map" kept +in the HMM, and so is fast and guaranteed to be consistent +with the way the HMM was constructed from the alignment. +The alignment in the file +.I <f> +must be exactly the alignment that the HMM was built from. +Compare the +.B --withali +option. + +.TP +.BI --withali " <f>" +Reads an alignment from file +.I <f> +and aligns it as a single object to the HMM; i.e. the alignment in +.I <f> +is held fixed. +This allows you to align sequences to a model with +.B hmmalign +and view them in the context of an existing trusted +multiple alignment. The alignment to the alignment is +done with a heuristic (nonoptimal) dynamic programming procedure, +which may be somewhat slow and is not guaranteed to +be completely consistent with the way the HMM was +constructed (though it should be quite close). +However, any alignment can be used, not just the alignment that +the HMM was built from. Compare the +.B --mapali +option. + +.SH SEE ALSO + +.PP +Master man page, with full list of and guide to the individual man +pages: see +.B hmmer(1). +.PP +A User guide and tutorial came with the distribution: +.B Userguide.ps +[Postscript] and/or +.B Userguide.pdf +[PDF]. 
+.PP +Finally, all documentation is also available online via WWW: +.B http://hmmer.wustl.edu/ + +.SH AUTHOR + +This software and documentation is: +.nf +@COPYRIGHT@ +HMMER - Biological sequence analysis with profile HMMs +Copyright (C) 1992-1999 Washington University School of Medicine +All Rights Reserved + + This source code is distributed under the terms of the + GNU General Public License. See the files COPYING and LICENSE + for details. +.fi +See the file COPYING in your distribution for complete details. + +.nf +Sean Eddy +HHMI/Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave. +St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi + + diff --git a/forester/archive/RIO/others/hmmer/documentation/man/hmmbuild.man b/forester/archive/RIO/others/hmmer/documentation/man/hmmbuild.man new file mode 100644 index 0000000..f4856b8 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/documentation/man/hmmbuild.man @@ -0,0 +1,476 @@ +.TH "hmmbuild" 1 @RELEASEDATE@ "HMMER @RELEASE@" "HMMER Manual" + +.SH NAME +.TP +hmmbuild - build a profile HMM from an alignment + +.SH SYNOPSIS +.B hmmbuild +.I [options] +.I hmmfile +.I alignfile + +.SH DESCRIPTION + +.B hmmbuild +reads a multiple sequence alignment file +.I alignfile +, builds a new profile HMM, and saves the HMM in +.I hmmfile. + +.PP +.I alignfile +may be in ClustalW, GCG MSF, SELEX, Stockholm, or aligned FASTA +alignment format. The format is automatically detected. + +.PP +By default, the model is configured to find one or more +nonoverlapping alignments to the complete model: multiple +global alignments with respect to the model, and local with +respect to the sequence. +This +is analogous to the behavior of the +.B hmmls +program of HMMER 1. +To configure the model for multiple +.I local +alignments +with respect to the model and local with respect to +the sequence, +a la the old program +.B hmmfs, +use the +.B -f +(fragment) option. 
More rarely, you may want to +configure the model for a single +global alignment (global with respect to both +model and sequence), using the +.B -g +option; +or to configure the model for a single local/local alignment +(a la standard Smith/Waterman, or the old +.B hmmsw +program), use the +.B -s +option. + +.SH OPTIONS + +.TP +.B -f +Configure the model for finding multiple domains per sequence, +where each domain can be a local (fragmentary) alignment. This +is analogous to the old +.B hmmfs +program of HMMER 1. + +.TP +.B -g +Configure the model for finding a single global alignment to +a target sequence, analogous to +the old +.B hmms +program of HMMER 1. + +.TP +.B -h +Print brief help; includes version number and summary of +all options, including expert options. + +.TP +.BI -n " <s>" +Name this HMM +.I <s>. +.I <s> +can be any string of non-whitespace characters (i.e. one "word"). +There is no length limit (at least not one imposed by HMMER; +your shell will complain about command line lengths first). + +.TP +.BI -o " <f>" +Re-save the starting alignment to +.I <f>, +in Stockholm format. +The columns which were assigned to match states will be +marked with x's in an #=RF annotation line. +If either the +.B --hand +or +.B --fast +construction options were chosen, the alignment may have +been slightly altered to be compatible with Plan 7 transitions, +so saving the final alignment and comparing to the +starting alignment can let you view these alterations. +See the User's Guide for more information on this arcane +side effect. + +.TP +.B -s +Configure the model for finding a single local alignment per +target sequence. This is analogous to the standard Smith/Waterman +algorithm or the +.B hmmsw +program of HMMER 1. + +.TP +.B -A +Append this model to an existing +.I hmmfile +rather than creating +.I hmmfile. +Useful for building HMM libraries (like Pfam). + +.TP +.B -F +Force overwriting of an existing +.I hmmfile. 
+Otherwise HMMER will refuse to clobber your existing HMM files, +for safety's sake. + +.SH EXPERT OPTIONS + +.TP +.B --amino +Force the sequence alignment to be interpreted as amino acid +sequences. Normally HMMER autodetects whether the alignment is +protein or DNA, but sometimes alignments are so small that +autodetection is ambiguous. See +.B --nucleic. + +.TP +.BI --archpri " <x>" +Set the "architecture prior" used by MAP architecture construction to +.I <x>, +where +.I <x> +is a probability between 0 and 1. This parameter governs a geometric +prior distribution over model lengths. As +.I <x> +increases, longer models are favored a priori. +As +.I <x> +decreases, it takes more residue conservation in a column to +make a column a "consensus" match column in the model architecture. +The 0.85 default has been chosen empirically as a reasonable setting. + +.TP +.B --binary +Write the HMM to +.I hmmfile +in HMMER binary format instead of readable ASCII text. + +.TP +.BI --cfile " <f>" +Save the observed emission and transition counts to +.I <f> +after the architecture has been determined (i.e. after residues/gaps +have been assigned to match, delete, and insert states). +This option is used in HMMER development for generating data files +useful for training new Dirichlet priors. The format of +count files is documented in the User's Guide. + +.TP +.B --fast +Quickly and heuristically determine the architecture of the model by +assigning all columns with more than a certain fraction of gap +characters to insert states. By default this fraction is 0.5, and it +can be changed using the +.B --gapmax +option. +The default construction algorithm is a maximum a posteriori (MAP) +algorithm, which is slower. + +.TP +.BI --gapmax " <x>" +Controls the +.I --fast +model construction algorithm, but if +.I --fast +is not being used, has no effect. +If a column has more than a fraction +.I <x> +of gap symbols in it, it gets assigned to an insert column. 
+.I <x> +is a frequency from 0 to 1, and by default is set +to 0.5. Higher values of +.I <x> +mean more columns get assigned to consensus, and models get +longer; smaller values of +.I <x> +mean fewer columns get assigned to consensus, and models get +smaller. +.I + +.TP +.B --hand +Specify the architecture of the model by hand: the alignment file must +be in SELEX or Stockholm format, and the reference annotation +line (#=RF in SELEX, #=GC RF in Stockholm) is used to specify +the architecture. Any column marked with a non-gap symbol (such +as an 'x', for instance) is assigned as a consensus (match) column in +the model. + +.TP +.BI --idlevel " <x>" +Controls both the determination of effective sequence number and +the behavior of the +.I --wblosum +weighting option. The sequence alignment is clustered by percent +identity, and the number of clusters at a cutoff threshold of +.I <x> +is used to determine the effective sequence number. +Higher values of +.I <x> +give more clusters and higher effective sequence +numbers; lower values of +.I <x> +give fewer clusters and lower effective sequence numbers. +.I <x> +is a fraction from 0 to 1, and +by default is set to 0.62 (corresponding to the clustering level used +in constructing the BLOSUM62 substitution matrix). + +.TP +.BI --informat " <s>" +Assert that the input +.I alignfile +is in format +.I <s>; +do not run Babelfish format autodetection. This increases +the reliability of the program somewhat, because +the Babelfish can make mistakes; particularly +recommended for unattended, high-throughput runs +of HMMER. Valid format strings include FASTA, +GENBANK, EMBL, GCG, PIR, STOCKHOLM, SELEX, MSF, +CLUSTAL, and PHYLIP. See the User's Guide for a complete +list. + +.TP +.B --noeff +Turn off the effective sequence number calculation, and use the +true number of sequences instead. This will usually reduce the +sensitivity of the final model (so don't do it without good reason!) 
+ +.TP +.B --nucleic +Force the alignment to be interpreted as nucleic acid sequence, +either RNA or DNA. Normally HMMER autodetects whether the alignment is +protein or DNA, but sometimes alignments are so small that +autodetection is ambiguous. See +.B --amino. + +.TP +.BI --null " <f>" +Read a null model from +.I <f>. +The default for protein is to use average amino acid frequencies from +Swissprot 34 and p1 = 350/351; for nucleic acid, the default is +to use 0.25 for each base and p1 = 1000/1001. For documentation +of the format of the null model file and further explanation +of how the null model is used, see the User's Guide. + +.TP +.BI --pam " <f>" +Apply a heuristic PAM- (substitution matrix-) based prior on match +emission probabilities instead of +the default mixture Dirichlet. The substitution matrix is read +from +.I <f>. +See +.B --pamwgt. + +The default Dirichlet state transition prior and insert emission prior +are unaffected. Therefore in principle you could combine +.B --prior +with +.B --pam +but this isn't recommended, as it hasn't been tested. ( +.B --pam +itself hasn't been tested much!) + +.TP +.BI --pamwgt " <x>" +Controls the weight on a PAM-based prior. Only has effect if +.B --pam +option is also in use. +.I <x> +is a positive real number, 20.0 by default. +.I <x> +is the number of "pseudocounts" contributed by the heuristic +prior. Very high values of +.I <x> +can force a scoring system that is entirely driven by the +substitution matrix, making +HMMER somewhat approximate Gribskov profiles. + +.TP +.BI --pbswitch " <n>" +For alignments with a very large number of sequences, +the GSC, BLOSUM, and Voronoi weighting schemes are slow; +they're O(N^2) for N sequences. Henikoff position-based +weights (PB weights) are more efficient. At or above a certain +threshold sequence number +.I <n> +.B hmmbuild +will switch from GSC, BLOSUM, or Voronoi weights to +PB weights. 
To disable this switching behavior (at the cost +of compute time), set +.I <n> +to be something larger than the number of sequences in +your alignment. +.I <n> +is a positive integer; the default is 1000. + +.TP +.BI --prior " <f>" +Read a Dirichlet prior from +.I <f>, +replacing the default mixture Dirichlet. +The format of prior files is documented in the User's Guide, +and an example is given in the Demos directory of the HMMER +distribution. + +.TP +.BI --swentry " <x>" +Controls the total probability that is distributed to local entries +into the model, versus starting at the beginning of the model +as in a global alignment. +.I <x> +is a probability from 0 to 1, and by default is set to 0.5. +Higher values of +.I <x> +mean that hits that are fragments on their left (N or 5'-terminal) side will be +penalized less, but complete global alignments will be penalized more. +Lower values of +.I <x> +mean that fragments on the left will be penalized more, and +global alignments on this side will be favored. +This option only affects the configurations that allow local +alignments, +e.g. +.B -s +and +.B -f; +unless one of these options is also activated, this option has no effect. +You have independent control over local/global alignment behavior for +the N/C (5'/3') termini of your target sequences using +.B --swentry +and +.B --swexit. + +.TP +.BI --swexit " <x>" +Controls the total probability that is distributed to local exits +from the model, versus ending an alignment at the end of the model +as in a global alignment. +.I <x> +is a probability from 0 to 1, and by default is set to 0.5. +Higher values of +.I <x> +mean that hits that are fragments on their right (C or 3'-terminal) side will be +penalized less, but complete global alignments will be penalized more. +Lower values of +.I <x> +mean that fragments on the right will be penalized more, and +global alignments on this side will be favored. +This option only affects the configurations that allow local +alignments, +e.g. 
+.B -s +and +.B -f; +unless one of these options is also activated, this option has no effect. +You have independent control over local/global alignment behavior for +the N/C (5'/3') termini of your target sequences using +.B --swentry +and +.B --swexit. + +.TP +.B --verbose +Print more possibly useful stuff, such as the individual scores for +each sequence in the alignment. + +.TP +.B --wblosum +Use the BLOSUM filtering algorithm to weight the sequences, +instead of the default. +Cluster the sequences at a given percentage identity +(see +.B --idlevel); +assign each cluster a total weight of 1.0, distributed equally +amongst the members of that cluster. + + +.TP +.B --wgsc +Use the Gerstein/Sonnhammer/Chothia ad hoc sequence weighting +algorithm. This is already the default, so this option has no effect +(unless it follows another option in the --w family, in which case it +overrides it). + +.TP +.B --wme +Use the Krogh/Mitchison maximum entropy algorithm to "weight" +the sequences. This supercedes the Eddy/Mitchison/Durbin +maximum discrimination algorithm, which gives almost +identical weights but is less robust. ME weighting seems +to give a marginal increase in sensitivity +over the default GSC weights, but takes a fair amount of time. + +.TP +.B --wnone +Turn off all sequence weighting. + +.TP +.B --wpb +Use the Henikoff position-based weighting scheme. + +.TP +.B --wvoronoi +Use the Sibbald/Argos Voronoi sequence weighting algorithm +in place of the default GSC weighting. + +.SH SEE ALSO + +.PP +Master man page, with full list of and guide to the individual man +pages: see +.B hmmer(1). +.PP +A User guide and tutorial came with the distribution: +.B Userguide.ps +[Postscript] and/or +.B Userguide.pdf +[PDF]. 
+.PP +Finally, all documentation is also available online via WWW: +.B http://hmmer.wustl.edu/ + +.SH AUTHOR + +This software and documentation is: +.nf +@COPYRIGHT@ +HMMER - Biological sequence analysis with profile HMMs +Copyright (C) 1992-1999 Washington University School of Medicine +All Rights Reserved + + This source code is distributed under the terms of the + GNU General Public License. See the files COPYING and LICENSE + for details. +.fi +See the file COPYING in your distribution for complete details. + +.nf +Sean Eddy +HHMI/Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave. +St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi + + diff --git a/forester/archive/RIO/others/hmmer/documentation/man/hmmcalibrate.man b/forester/archive/RIO/others/hmmer/documentation/man/hmmcalibrate.man new file mode 100644 index 0000000..e472b92 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/documentation/man/hmmcalibrate.man @@ -0,0 +1,172 @@ +.TH "hmmcalibrate" 1 @RELEASEDATE@ "HMMER @RELEASE@" "HMMER Manual" + +.SH NAME +.TP +hmmcalibrate - calibrate HMM search statistics + +.SH SYNOPSIS +.B hmmcalibrate +.I [options] +.I hmmfile + +.SH DESCRIPTION + +.B hmmcalibrate +reads an HMM file from +.I hmmfile, +scores a large number of synthesized random sequences with it, fits an +extreme value distribution (EVD) to the histogram of those scores, and +re-saves +.I hmmfile +now including the EVD parameters. + +.PP +.B hmmcalibrate +may take several minutes (or longer) to run. +While it is running, a temporary file called +.I hmmfile.xxx +is generated in your working directory. +If you abort +.B hmmcalibrate +prematurely (ctrl-C, for instance), your original +.I hmmfile +will be untouched, and you should delete the +.I hmmfile.xxx +temporary file. + +.SH OPTIONS + +.TP +.B -h +Print brief help; includes version number and summary of +all options, including expert options. 
+ +.SH EXPERT OPTIONS + +.TP +.BI --cpu " <n>" +Sets the maximum number of CPUs that the program +will run on. The default is to use all CPUs +in the machine. Overrides the HMMER_NCPU +environment variable. Only affects threaded +versions of HMMER (the default on most systems). + +.TP +.BI --fixed " <n>" +Fix the length of the random sequences to +.I <n>, +where +.I <n> +is a positive (and reasonably sized) integer. +The default is instead to generate sequences with +a variety of different lengths, controlled by a Gaussian +(normal) distribution. + +.TP +.BI --histfile " <f>" +Save a histogram of the scores and the fitted theoretical curve +to file +.I <f>. + +.TP +.BI --mean " <x>" +Set the mean length of the synthetic sequences to +.I <x>, +where +.I <x> +is a positive real number. The default is 350. + +.TP +.BI --num " <n>" +Set the number of synthetic sequences to +.I <n>, +where +.I <n> +is a positive integer. If +.I <n> is less than about 1000, the fit to the EVD may fail. +Higher numbers of +.I <n> +will give better determined EVD parameters. The default +is 5000; it was empirically chosen as +a tradeoff between accuracy and computation time. + +.TP +.B --pvm +Run on a Parallel Virtual Machine (PVM). The PVM must +already be running. The client program +.B hmmcalibrate-pvm +must be installed on all the PVM nodes. +Optional PVM support must have been compiled into +HMMER. + +.TP +.BI --sd " <x>" +Set the standard deviation of the synthetic sequence +length distribution to +.I <x>, +where +.I <x> +is a positive real number. The default is 350. Note that the +Gaussian is left-truncated so that no sequences have lengths +<= 0. + +.TP +.BI --seed " <n>" +Set the random seed to +.I <n>, +where +.I <n> +is a positive integer. The default is to use +.B time() +to generate a different seed for each run, which +means that two different runs of +.B hmmcalibrate +on the same HMM will give slightly different +results. 
You can use +this option to generate reproducible results for +different +.B hmmcalibrate +runs on the same HMM. 
+ +.SH SEE ALSO + +.PP +Master man page, with full list of and guide to the individual man +pages: see +.B hmmer(1). +.PP +A User guide and tutorial came with the distribution: +.B Userguide.ps +[Postscript] and/or +.B Userguide.pdf +[PDF]. +.PP +Finally, all documentation is also available online via WWW: +.B http://hmmer.wustl.edu/ + +.SH AUTHOR + +This software and documentation is: +.nf +@COPYRIGHT@ +HMMER - Biological sequence analysis with profile HMMs +Copyright (C) 1992-1999 Washington University School of Medicine +All Rights Reserved + + This source code is distributed under the terms of the + GNU General Public License. See the files COPYING and LICENSE + for details. +.fi +See the file COPYING in your distribution for complete details. + +.nf +Sean Eddy +HHMI/Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave. +St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi + diff --git a/forester/archive/RIO/others/hmmer/documentation/man/hmmconvert.man b/forester/archive/RIO/others/hmmer/documentation/man/hmmconvert.man new file mode 100644 index 0000000..58cb3a6 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/documentation/man/hmmconvert.man @@ -0,0 +1,124 @@ +.TH "hmmconvert" 1 "@RELEASEDATE@" "HMMER @RELEASE@" "HMMER Manual" + +.SH NAME +.TP +hmmconvert - convert between profile HMM file formats + +.SH SYNOPSIS +.B hmmconvert +.I [options] +.I oldhmmfile +.I newhmmfile + +.SH DESCRIPTION + +.B hmmconvert +reads an HMM file from +.I oldhmmfile +in any HMMER format, and writes it to a new file +.I newhmmfile +in a new format. +.I oldhmmfile +and +.I newhmmfile +must be different files; you can't reliably overwrite +the old file. +By default, the new HMM file is written in HMMER 2 +ASCII format. + +Available formats are HMMER 2 ASCII (default), HMMER 2 binary +.I (-b) +GCG profile +.I (-p) +, and Compugen XSW extended profile +.I (-P). 
+ +.SH OPTIONS + +.TP +.B -a +Convert to HMMER 2 ASCII file. This is the default, so this option +is unnecessary. + +.TP +.B -b +Convert to HMMER 2 binary file. + +.TP +.B -h +Print brief help; includes version number and summary of +all options, including expert options. + +.TP +.B -p +Convert to GCG profile .prf format. + +.TP +.B -A +Append mode; append to +.I newhmmfile +rather than creating a new file. + +.TP +.B -F +Force; if +.I newhmmfile +already exists, and +.I -A +is not being used to append to the file, +hmmconvert will refuse to clobber the existing +file unless +.I -F +is used. + +.TP +.B -P +Convert the HMM to Compugen XSW extended profile format, +which is similar to GCG profile format but has two +extra columns for delete-open and delete-extend costs. +(I do not believe that Compugen publicly supports this +format; it may be undocumented.) + +.SH SEE ALSO + +.PP +Master man page, with full list of and guide to the individual man +pages: see +.B hmmer(1). +.PP +A User guide and tutorial came with the distribution: +.B Userguide.ps +[Postscript] and/or +.B Userguide.pdf +[PDF]. +.PP +Finally, all documentation is also available online via WWW: +.B http://hmmer.wustl.edu/ + +.SH AUTHOR + +This software and documentation is: +.nf +@COPYRIGHT@ +HMMER - Biological sequence analysis with profile HMMs +Copyright (C) 1992-1999 Washington University School of Medicine +All Rights Reserved + + This source code is distributed under the terms of the + GNU General Public License. See the files COPYING and LICENSE + for details. +.fi +See the file COPYING in your distribution for complete details. + +.nf +Sean Eddy +HHMI/Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave. 
+St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi + + diff --git a/forester/archive/RIO/others/hmmer/documentation/man/hmmemit.man b/forester/archive/RIO/others/hmmer/documentation/man/hmmemit.man new file mode 100644 index 0000000..bfc61c9 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/documentation/man/hmmemit.man @@ -0,0 +1,130 @@ +.TH "hmmemit" 1 @RELEASEDATE@ "HMMER @RELEASE@" "HMMER Manual" + +.SH NAME +.TP +hmmemit - generate sequences from a profile HMM + +.SH SYNOPSIS +.B hmmemit +.I [options] +.I hmmfile + +.SH DESCRIPTION + +.B hmmemit +reads an HMM file from +.I hmmfile +containing one or more HMMs, +and generates a number of sequences from each HMM; +or, if the +.B -c +option is selected, generates a single majority-rule consensus. +This can be useful for various applications in which one needs a simulation +of sequences consistent with a sequence family consensus. + +.PP +By default, +.B hmmemit +generates 10 sequences and outputs them in FASTA (unaligned) format. + +.SH OPTIONS + +.TP +.B -a +Write the generated sequences in an aligned format (SELEX) rather than +FASTA. + +.TP +.B -c +Predict a single majority-rule consensus sequence instead of sampling +sequences from the HMM's probability distribution. Highly conserved +residues (p >= 0.9 for DNA, p >= 0.5 for protein) are shown in upper +case; others are shown in lower case. Some insert states may become +part of the majority rule consensus, because they are used in >= 50% +of generated sequences; when this happens, insert-generated residues +are simply shown as "x". + +.TP +.B -h +Print brief help; includes version number and summary of +all options, including expert options. + +.TP +.BI -n " " +Generate +.I +sequences. Default is 10. + +.TP +.BI -o " " +Save the synthetic sequences to file +.I +rather than writing them to stdout. + +.TP +.B -q +Quiet; suppress all output except for the sequences themselves. 
+Useful for piping or directing the output. + +.SH EXPERT OPTIONS + +.TP +.BI --seed " " +Set the random seed to +.I , +where +.I +is a positive integer. The default is to use +.B time() +to generate a different seed for each run, which +means that two different runs of +.B hmmemit +on the same HMM will give slightly different +results. You can use +this option to generate reproducible results. + + + +.SH SEE ALSO + +.PP +Master man page, with full list of and guide to the individual man +pages: see +.B hmmer(1). +.PP +A User guide and tutorial came with the distribution: +.B Userguide.ps +[Postscript] and/or +.B Userguide.pdf +[PDF]. +.PP +Finally, all documentation is also available online via WWW: +.B http://hmmer.wustl.edu/ + +.SH AUTHOR + +This software and documentation is: +.nf +@COPYRIGHT@ +HMMER - Biological sequence analysis with profile HMMs +Copyright (C) 1992-1999 Washington University School of Medicine +All Rights Reserved + + This source code is distributed under the terms of the + GNU General Public License. See the files COPYING and LICENSE + for details. +.fi +See the file COPYING in your distribution for complete details. + +.nf +Sean Eddy +HHMI/Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave. +St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi + + diff --git a/forester/archive/RIO/others/hmmer/documentation/man/hmmer.man b/forester/archive/RIO/others/hmmer/documentation/man/hmmer.man new file mode 100644 index 0000000..70e3ce9 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/documentation/man/hmmer.man @@ -0,0 +1,168 @@ +.TH "hmmer" 1 @RELEASEDATE@ "HMMER @RELEASE@" "HMMER Manual" + +.SH NAME +.TP +HMMER - profile hidden Markov model software + +.SH SYNOPSIS +.TP +.B hmmalign +Align multiple sequences to a profile HMM. + +.TP +.B hmmbuild +Build a profile HMM from a given multiple sequence alignment. 
+ +.TP +.B hmmcalibrate +Determine appropriate statistical significance parameters +for a profile HMM prior to doing database searches. + +.TP +.B hmmconvert +Convert HMMER profile HMMs to other formats, such as GCG profiles. + +.TP +.B hmmemit +Generate sequences probabilistically from a profile HMM. + +.TP +.B hmmfetch +Retrieve an HMM from an HMM database + +.TP +.B hmmindex +Create a binary SSI index for an HMM database + +.TP +.B hmmpfam +Search a profile HMM database with a sequence (i.e., annotate various +kinds of domains in the query sequence). + +.TP +.B hmmsearch +Search a sequence database with a profile HMM (i.e., find additional +homologues of a modeled family). + +.SH DESCRIPTION + +These programs use profile hidden Markov models (profile HMMs) to +model the primary structure consensus of a family of protein or +nucleic acid sequences. + +.SH OPTIONS + +.PP +All +.B HMMER +programs give a brief summary of their command-line syntax and options +if invoked without any arguments. +When invoked with the single argument, +.B -h +(i.e., help), a program will report more verbose command-line usage +information, including rarely used, experimental, and expert options. +.B -h +will report version numbers which are useful if +you need to report a bug or problem to me. + +.PP +Each +.B HMMER +program has its own man page briefly summarizing command line usage. +There is also a user's guide that came +with the software distribution, which includes a tutorial introduction +and more detailed descriptions of the programs. + +See http://hmmer.wustl.edu/ for on-line documentation and +the current HMMER release. + +.PP +In general, no command line options should be needed by beginning users. +The defaults are set up for optimum performance in most situations. +Options that are single lowercase letters (e.g. +.B -a +) are "common" options that are expected to be frequently used +and will be important in many applications. 
+Options that are single uppercase letters (e.g. +.B -B +) are usually less common options, but also may be important +in some applications. +Options that are full words (e.g. +.B --verbose +) are either rarely used, experimental, or expert options. +Some experimental options are only there for my own ongoing experiments +with HMMER, and may not be supported or documented adequately. + + +.SH SEQUENCE FILE FORMATS + +In general, +.B HMMER +attempts to read most common biological sequence file formats. +It autodetects the format of the file. It also autodetects +whether the sequences are protein or nucleic acid. +Standard IUPAC degeneracy codes are allowed in addition +to the usual 4-letter or 20-letter codes. + +.TP +.B Unaligned sequences +Unaligned sequence files may be in FASTA, Swissprot, EMBL, GenBank, +PIR, Intelligenetics, Strider, or GCG format. +These formats +are documented in the User's Guide. + +.TP +.B Sequence alignments +Multiple sequence alignments may be in CLUSTALW, SELEX, or GCG MSF +format. These formats +are documented in the User's Guide. + +.SH ENVIRONMENT VARIABLES + +For ease of using large stable sequence and HMM databases, +.B HMMER +looks for sequence files and HMM files in the current +working directory as well as in system directories specified +by environment variables. + +.TP +.B BLASTDB +Specifies the directory location of sequence databases. Example: +.B /seqlibs/blast-db/. +In installations that use BLAST software, this environment variable +is likely to already be set. + +.TP +.B HMMERDB +Specifies the directory location of HMM databases. Example: +.B /seqlibs/pfam/. + +.SH SEE ALSO + +.PP +@SEEALSO@ +.PP +User guide and tutorial: Userguide.ps +.PP +WWW: +.B http://hmmer.wustl.edu/ + +.SH AUTHOR + +This software and documentation is Copyright (C) 1992-1998 Washington +University School of Medicine. It is freely distributable under terms +of the GNU General Public License. 
See COPYING in the source code +distribution for more details, or contact me. + +.nf +Sean Eddy +Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave. +St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi + + diff --git a/forester/archive/RIO/others/hmmer/documentation/man/hmmfetch.man b/forester/archive/RIO/others/hmmer/documentation/man/hmmfetch.man new file mode 100644 index 0000000..c8e8051 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/documentation/man/hmmfetch.man @@ -0,0 +1,83 @@ +.TH "hmmfetch" 1 @RELEASEDATE@ "HMMER @RELEASE@" "HMMER Manual" + +.SH NAME +.TP +hmmfetch - retrieve an HMM from an HMM database + +.SH SYNOPSIS +.B hmmfetch +.I [options] +.I database +.I name + +.SH DESCRIPTION + +.B hmmfetch +is a small utility that retrieves an HMM called +.I name +from a HMMER model database called +.I database +and prints that model +to standard output. +For example, +.I hmmfetch Pfam rrm +retrieves the RRM (RNA recognition motif) model from +Pfam, if the environment variable HMMERDB is +set to the location of the Pfam database. +The retrieved HMM file is written in HMMER 2 ASCII format. + +.PP +The database must have an associated SSI index file. +To index an HMM database, use the program +.B hmmindex. + +.SH OPTIONS + +.TP +.B -h +Print brief help; includes version number and summary of +all options, including expert options. + + + +.SH SEE ALSO + +.PP +Master man page, with full list of and guide to the individual man +pages: see +.B hmmer(1). +.PP +A User guide and tutorial came with the distribution: +.B Userguide.ps +[Postscript] and/or +.B Userguide.pdf +[PDF]. 
+.PP +Finally, all documentation is also available online via WWW: +.B http://hmmer.wustl.edu/ + +.SH AUTHOR + +This software and documentation is: +.nf +@COPYRIGHT@ +HMMER - Biological sequence analysis with profile HMMs +Copyright (C) 1992-1999 Washington University School of Medicine +All Rights Reserved + + This source code is distributed under the terms of the + GNU General Public License. See the files COPYING and LICENSE + for details. +.fi +See the file COPYING in your distribution for complete details. + +.nf +Sean Eddy +HHMI/Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave. +St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi diff --git a/forester/archive/RIO/others/hmmer/documentation/man/hmmindex.man b/forester/archive/RIO/others/hmmer/documentation/man/hmmindex.man new file mode 100644 index 0000000..3bf171b --- /dev/null +++ b/forester/archive/RIO/others/hmmer/documentation/man/hmmindex.man @@ -0,0 +1,73 @@ +.TH "hmmindex" 1 @RELEASEDATE@ "HMMER @RELEASE@" "HMMER Manual" + +.SH NAME +.TP +hmmindex - create a binary SSI index for an HMM database + +.SH SYNOPSIS +.B hmmindex +.I [options] +.I database + +.SH DESCRIPTION + +.B hmmindex +is a utility that creates a binary SSI ("squid sequence index" +format) index for an HMM database file called +.I database. +The new index file is named +.IR database.ssi. +An SSI index file is required for +.B hmmfetch +to work, and also for the PVM implementation of +.B hmmpfam. + +.SH OPTIONS + +.TP +.B -h +Print brief help; includes version number and summary of +all options, including expert options. + + +.SH SEE ALSO + +.PP +Master man page, with full list of and guide to the individual man +pages: see +.B hmmer(1). +.PP +A User guide and tutorial came with the distribution: +.B Userguide.ps +[Postscript] and/or +.B Userguide.pdf +[PDF]. 
+.PP +Finally, all documentation is also available online via WWW: +.B http://hmmer.wustl.edu/ + +.SH AUTHOR + +This software and documentation is: +.nf +@COPYRIGHT@ +HMMER - Biological sequence analysis with profile HMMs +Copyright (C) 1992-1999 Washington University School of Medicine +All Rights Reserved + + This source code is distributed under the terms of the + GNU General Public License. See the files COPYING and LICENSE + for details. +.fi +See the file COPYING in your distribution for complete details. + +.nf +Sean Eddy +HHMI/Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave. +St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi diff --git a/forester/archive/RIO/others/hmmer/documentation/man/hmmpfam.man b/forester/archive/RIO/others/hmmer/documentation/man/hmmpfam.man new file mode 100644 index 0000000..a3e8c8a --- /dev/null +++ b/forester/archive/RIO/others/hmmer/documentation/man/hmmpfam.man @@ -0,0 +1,320 @@ +.TH "hmmpfam" 1 @RELEASEDATE@ "HMMER @RELEASE@" "HMMER Manual" + +.SH NAME +.TP +hmmpfam - search one or more sequences against an HMM database + +.SH SYNOPSIS +.B hmmpfam +.I [options] +.I hmmfile +.I seqfile + +.SH DESCRIPTION + +.B hmmpfam +reads a sequence file +.I seqfile +and compares each sequence in it, one at a time, against all the HMMs in +.I hmmfile +looking for significantly similar sequence matches. + +.PP +.I hmmfile +will be looked for first in the current working directory, +then in a directory named by the environment variable +.I HMMERDB. +This lets administrators install HMM library(s) such as +Pfam in a common location. + +.PP +There is a separate output report for each sequence in +.I seqfile. +This report consists of three sections: a ranked list +of the best scoring HMMs, a list of the +best scoring domains in order of their occurrence +in the sequence, and alignments for all the best scoring +domains. 
+A sequence score may be higher than a domain score for +the same sequence if there is more than one domain in the sequence; +the sequence score takes into account all the domains. +All sequences scoring above the +.I -E +and +.I -T +cutoffs are shown in the first list, then +.I every +domain found in this list is +shown in the second list of domain hits. +If desired, E-value and bit score thresholds may also be applied +to the domain list using the +.I --domE +and +.I --domT +options. + +.SH OPTIONS + +.TP +.B -h +Print brief help; includes version number and summary of +all options, including expert options. + +.TP +.B -n +Specify that models and sequence are nucleic acid, not protein. +Other HMMER programs autodetect this; but because of the order in +which +.B hmmpfam +accesses data, it can't reliably determine the correct "alphabet" +by itself. + +.TP +.BI -A " " +Limits the alignment output to the +.I +best scoring domains. +.B -A0 +shuts off the alignment output and can be used to reduce +the size of output files. + +.TP +.BI -E " " +Set the E-value cutoff for the per-sequence ranked hit list to +.I , +where +.I +is a positive real number. The default is 10.0. Hits with E-values +better than (less than) this threshold will be shown. + +.TP +.BI -T " " +Set the bit score cutoff for the per-sequence ranked hit list to +.I , +where +.I +is a real number. +The default is negative infinity; by default, the threshold +is controlled by E-value and not by bit score. +Hits with bit scores better than (greater than) this threshold +will be shown. + +.TP +.BI -Z " " +Calculate the E-value scores as if we had seen a sequence database of +.I +sequences. The default is arbitrarily set to 59021, the size of +Swissprot 34. + +.SH EXPERT OPTIONS + +.TP +.B --acc +Report HMM accessions instead of names in the output reports. +Useful for high-throughput annotation, where the data are being +parsed for storage in a relational database. 
+ +.TP +.B --compat +Use the output format of HMMER 2.1.1, the 1998-2001 public +release; provided so 2.1.1 parsers don't have to be rewritten. + +.TP +.BI --cpu " " +Sets the maximum number of CPUs that the program +will run on. The default is to use all CPUs +in the machine. Overrides the HMMER_NCPU +environment variable. Only affects threaded +versions of HMMER (the default on most systems). + +.TP +.B --cut_ga +Use Pfam GA (gathering threshold) score cutoffs. +Equivalent +to --globT --domT , but the GA1 and GA2 cutoffs +are read from each HMM in +.I hmmfile +individually. hmmbuild puts these cutoffs there +if the alignment file was annotated in a Pfam-friendly +alignment format (extended SELEX or Stockholm format) and +the optional GA annotation line was present. If these +cutoffs are not set in the HMM file, +.B --cut_ga +doesn't work. + +.TP +.B --cut_tc +Use Pfam TC (trusted cutoff) score cutoffs. Equivalent +to --globT --domT , but the TC1 and TC2 cutoffs +are read from each HMM in +.I hmmfile +individually. hmmbuild puts these cutoffs there +if the alignment file was annotated in a Pfam-friendly +alignment format (extended SELEX or Stockholm format) and +the optional TC annotation line was present. If these +cutoffs are not set in the HMM file, +.B --cut_tc +doesn't work. + +.TP +.B --cut_nc +Use Pfam NC (noise cutoff) score cutoffs. Equivalent +to --globT --domT , but the NC1 and NC2 cutoffs +are read from each HMM in +.I hmmfile +individually. hmmbuild puts these cutoffs there +if the alignment file was annotated in a Pfam-friendly +alignment format (extended SELEX or Stockholm format) and +the optional NC annotation line was present. If these +cutoffs are not set in the HMM file, +.B --cut_nc +doesn't work. + +.TP +.BI --domE " " +Set the E-value cutoff for the per-domain ranked hit list to +.I , +where +.I +is a positive real number. 
+The default is infinity; by default, all domains in the sequences +that passed the first threshold will be reported in the second list, +so that the number of domains reported in the per-sequence list is +consistent with the number that appear in the per-domain list. + +.TP +.BI --domT " " +Set the bit score cutoff for the per-domain ranked hit list to +.I , +where +.I +is a real number. The default is negative infinity; +by default, all domains in the sequences +that passed the first threshold will be reported in the second list, +so that the number of domains reported in the per-sequence list is +consistent with the number that appear in the per-domain list. +.I Important note: +only one domain in a sequence is absolutely controlled by this +parameter, or by +.B --domE. +The second and subsequent domains in a sequence have a de facto +bit score threshold of 0 because of the details of how HMMER +works. HMMER requires at least one pass through the main model +per sequence; to do more than one pass (more than one domain) +the multidomain alignment must have a better score than the +single domain alignment, and hence the extra domains must contribute +positive score. See the User's Guide for more detail. + +.TP +.BI --forward +Use the Forward algorithm instead of the Viterbi algorithm +to determine the per-sequence scores. Per-domain scores are +still determined by the Viterbi algorithm. Some have argued that +Forward is a more sensitive algorithm for detecting remote +sequence homologues; my experiments with HMMER have not +confirmed this, however. + +.TP +.BI --informat " " +Assert that the input +.I seqfile +is in format +.I ; +do not run Babelfish format autodetection. This increases +the reliability of the program somewhat, because +the Babelfish can make mistakes; particularly +recommended for unattended, high-throughput runs +of HMMER. Valid format strings include FASTA, +GENBANK, EMBL, GCG, PIR, STOCKHOLM, SELEX, MSF, +CLUSTAL, and PHYLIP. 
See the User's Guide for a complete +list. + +.TP +.B --null2 +Turn off the post hoc second null model. By default, each alignment +is rescored by a postprocessing step that takes into account possible +biased composition in either the HMM or the target sequence. +This is almost essential in database searches, especially with +local alignment models. There is a very small chance that this +postprocessing might remove real matches, and +in these cases +.B --null2 +may improve sensitivity at the expense of reducing +specificity by letting biased composition hits through. + +.TP +.B --pvm +Run on a Parallel Virtual Machine (PVM). The PVM must +already be running. The client program +.B hmmpfam-pvm +must be installed on all the PVM nodes. +The HMM database +.I hmmfile +and an associated SSI index file +.IR hmmfile .ssi +must also be installed on all the PVM nodes. +(The SSI index is produced by the program +.BR hmmindex .) +Because the PVM implementation is I/O bound, +it is highly recommended that each node have a +local copy of +.I hmmfile +rather than NFS mounting a shared copy. +Optional PVM support must have been compiled into +HMMER for +.B --pvm +to function. + +.TP +.B --xnu +Turn on XNU filtering of target protein sequences. Has no effect +on nucleic acid sequences. In trial experiments, +.B --xnu +appears to perform less well than the default +post hoc null2 model. + + + +.SH SEE ALSO + +.PP +Master man page, with full list of and guide to the individual man +pages: see +.B hmmer(1). +.PP +A User guide and tutorial came with the distribution: +.B Userguide.ps +[Postscript] and/or +.B Userguide.pdf +[PDF]. 
+.PP +Finally, all documentation is also available online via WWW: +.B http://hmmer.wustl.edu/ + +.SH AUTHOR + +This software and documentation is: +.nf +@COPYRIGHT@ +HMMER - Biological sequence analysis with profile HMMs +Copyright (C) 1992-1999 Washington University School of Medicine +All Rights Reserved + + This source code is distributed under the terms of the + GNU General Public License. See the files COPYING and LICENSE + for details. +.fi +See the file COPYING in your distribution for complete details. + +.nf +Sean Eddy +HHMI/Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave. +St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi + + diff --git a/forester/archive/RIO/others/hmmer/documentation/man/hmmsearch.man b/forester/archive/RIO/others/hmmer/documentation/man/hmmsearch.man new file mode 100644 index 0000000..0073a07 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/documentation/man/hmmsearch.man @@ -0,0 +1,289 @@ +.TH "hmmsearch" 1 @RELEASEDATE@ "HMMER @RELEASE@" "HMMER Manual" + +.SH NAME +.TP +hmmsearch - search a sequence database with a profile HMM + +.SH SYNOPSIS +.B hmmsearch +.I [options] +.I hmmfile +.I seqfile + +.SH DESCRIPTION + +.B hmmsearch +reads an HMM from +.I hmmfile +and searches +.I seqfile +for significantly similar sequence matches. + +.PP +.I seqfile +will be looked for first in the current working directory, +then in a directory named by the environment variable +.I BLASTDB. +This lets users use existing BLAST databases, if BLAST +has been configured for the site. + +.PP +.B hmmsearch +may take minutes or even hours to run, depending +on the size of the sequence database. It is a good +idea to redirect the output to a file. + +.PP +The output consists of four sections: a ranked list +of the best scoring sequences, a ranked list of the +best scoring domains, alignments for all the best scoring +domains, and a histogram of the scores. 
+A sequence score may be higher than a domain score for +the same sequence if there is more than one domain in the sequence; +the sequence score takes into account all the domains. +All sequences scoring above the +.I -E +and +.I -T +cutoffs are shown in the first list, then +.I every +domain found in this list is +shown in the second list of domain hits. +If desired, E-value and bit score thresholds may also be applied +to the domain list using the +.I --domE +and +.I --domT +options. + +.SH OPTIONS + +.TP +.B -h +Print brief help; includes version number and summary of +all options, including expert options. + +.TP +.BI -A " " +Limits the alignment output to the +.I +best scoring domains. +.B -A0 +shuts off the alignment output and can be used to reduce +the size of output files. + +.TP +.BI -E " " +Set the E-value cutoff for the per-sequence ranked hit list to +.I , +where +.I +is a positive real number. The default is 10.0. Hits with E-values +better than (less than) this threshold will be shown. + +.TP +.BI -T " " +Set the bit score cutoff for the per-sequence ranked hit list to +.I , +where +.I +is a real number. +The default is negative infinity; by default, the threshold +is controlled by E-value and not by bit score. +Hits with bit scores better than (greater than) this threshold +will be shown. + +.TP +.BI -Z " " +Calculate the E-value scores as if we had seen a sequence database of +.I +sequences. The default is the number of sequences seen in your +database file +.I . + +.SH EXPERT OPTIONS + +.TP +.B --compat +Use the output format of HMMER 2.1.1, the 1998-2001 public +release; provided so 2.1.1 parsers don't have to be rewritten. + +.TP +.BI --cpu " " +Sets the maximum number of CPUs that the program +will run on. The default is to use all CPUs +in the machine. Overrides the HMMER_NCPU +environment variable. Only affects threaded +versions of HMMER (the default on most systems). + +.TP +.B --cut_ga +Use Pfam GA (gathering threshold) score cutoffs. 
+Equivalent +to --globT --domT , but the GA1 and GA2 cutoffs +are read from the HMM file. hmmbuild puts these cutoffs there +if the alignment file was annotated in a Pfam-friendly +alignment format (extended SELEX or Stockholm format) and +the optional GA annotation line was present. If these +cutoffs are not set in the HMM file, +.B --cut_ga +doesn't work. + +.TP +.B --cut_tc +Use Pfam TC (trusted cutoff) score cutoffs. Equivalent +to --globT --domT , but the TC1 and TC2 cutoffs +are read from the HMM file. hmmbuild puts these cutoffs there +if the alignment file was annotated in a Pfam-friendly +alignment format (extended SELEX or Stockholm format) and +the optional TC annotation line was present. If these +cutoffs are not set in the HMM file, +.B --cut_tc +doesn't work. + +.TP +.B --cut_nc +Use Pfam NC (noise cutoff) score cutoffs. Equivalent +to --globT --domT , but the NC1 and NC2 cutoffs +are read from the HMM file. hmmbuild puts these cutoffs there +if the alignment file was annotated in a Pfam-friendly +alignment format (extended SELEX or Stockholm format) and +the optional NC annotation line was present. If these +cutoffs are not set in the HMM file, +.B --cut_nc +doesn't work. + +.TP +.BI --domE " " +Set the E-value cutoff for the per-domain ranked hit list to +.I , +where +.I +is a positive real number. +The default is infinity; by default, all domains in the sequences +that passed the first threshold will be reported in the second list, +so that the number of domains reported in the per-sequence list is +consistent with the number that appear in the per-domain list. + +.TP +.BI --domT " " +Set the bit score cutoff for the per-domain ranked hit list to +.I , +where +.I +is a real number. 
The default is negative infinity; +by default, all domains in the sequences +that passed the first threshold will be reported in the second list, +so that the number of domains reported in the per-sequence list is +consistent with the number that appear in the per-domain list. +.I Important note: +only one domain in a sequence is absolutely controlled by this +parameter, or by +.B --domE. +The second and subsequent domains in a sequence have a de facto +bit score threshold of 0 because of the details of how HMMER +works. HMMER requires at least one pass through the main model +per sequence; to do more than one pass (more than one domain) +the multidomain alignment must have a better score than the +single domain alignment, and hence the extra domains must contribute +positive score. See the User's Guide for more detail. + +.TP +.BI --forward +Use the Forward algorithm instead of the Viterbi algorithm +to determine the per-sequence scores. Per-domain scores are +still determined by the Viterbi algorithm. Some have argued that +Forward is a more sensitive algorithm for detecting remote +sequence homologues; my experiments with HMMER have not +confirmed this, however. + +.TP +.BI --informat " " +Assert that the input +.I seqfile +is in format +.I ; +do not run Babelfish format autodetection. This increases +the reliability of the program somewhat, because +the Babelfish can make mistakes; particularly +recommended for unattended, high-throughput runs +of HMMER. Valid format strings include FASTA, +GENBANK, EMBL, GCG, PIR, STOCKHOLM, SELEX, MSF, +CLUSTAL, and PHYLIP. See the User's Guide for a complete +list. + +.TP +.B --null2 +Turn off the post hoc second null model. By default, each alignment +is rescored by a postprocessing step that takes into account possible +biased composition in either the HMM or the target sequence. +This is almost essential in database searches, especially with +local alignment models. 
There is a very small chance that this +postprocessing might remove real matches, and +in these cases +.B --null2 +may improve sensitivity at the expense of reducing +specificity by letting biased composition hits through. + +.TP +.B --pvm +Run on a Parallel Virtual Machine (PVM). The PVM must +already be running. The client program +.B hmmsearch-pvm +must be installed on all the PVM nodes. +Optional PVM support must have been compiled into +HMMER. + +.TP +.B --xnu +Turn on XNU filtering of target protein sequences. Has no effect +on nucleic acid sequences. In trial experiments, +.B --xnu +appears to perform less well than the default +post hoc null2 model. + + + +.SH SEE ALSO + +.PP +Master man page, with full list of and guide to the individual man +pages: see +.B hmmer(1). +.PP +A User guide and tutorial came with the distribution: +.B Userguide.ps +[Postscript] and/or +.B Userguide.pdf +[PDF]. +.PP +Finally, all documentation is also available online via WWW: +.B http://hmmer.wustl.edu/ + +.SH AUTHOR + +This software and documentation is: +.nf +@COPYRIGHT@ +HMMER - Biological sequence analysis with profile HMMs +Copyright (C) 1992-1999 Washington University School of Medicine +All Rights Reserved + + This source code is distributed under the terms of the + GNU General Public License. See the files COPYING and LICENSE + for details. +.fi +See the file COPYING in your distribution for complete details. + +.nf +Sean Eddy +HHMI/Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave. +St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi + + diff --git a/forester/archive/RIO/others/hmmer/install-sh b/forester/archive/RIO/others/hmmer/install-sh new file mode 100755 index 0000000..e9de238 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/install-sh @@ -0,0 +1,251 @@ +#!/bin/sh +# +# install - install a program, script, or datafile +# This comes from X11R5 (mit/util/scripts/install.sh). 
+# +# Copyright 1991 by the Massachusetts Institute of Technology +# +# Permission to use, copy, modify, distribute, and sell this software and its +# documentation for any purpose is hereby granted without fee, provided that +# the above copyright notice appear in all copies and that both that +# copyright notice and this permission notice appear in supporting +# documentation, and that the name of M.I.T. not be used in advertising or +# publicity pertaining to distribution of the software without specific, +# written prior permission. M.I.T. makes no representations about the +# suitability of this software for any purpose. It is provided "as is" +# without express or implied warranty. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. It can only install one file at a time, a restriction +# shared with many OS's install programs. + + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit="${DOITPROG-}" + + +# put in absolute paths if you don't have them in your path; or use env. vars. 
+ +mvprog="${MVPROG-mv}" +cpprog="${CPPROG-cp}" +chmodprog="${CHMODPROG-chmod}" +chownprog="${CHOWNPROG-chown}" +chgrpprog="${CHGRPPROG-chgrp}" +stripprog="${STRIPPROG-strip}" +rmprog="${RMPROG-rm}" +mkdirprog="${MKDIRPROG-mkdir}" + +transformbasename="" +transform_arg="" +instcmd="$mvprog" +chmodcmd="$chmodprog 0755" +chowncmd="" +chgrpcmd="" +stripcmd="" +rmcmd="$rmprog -f" +mvcmd="$mvprog" +src="" +dst="" +dir_arg="" + +while [ x"$1" != x ]; do + case $1 in + -c) instcmd="$cpprog" + shift + continue;; + + -d) dir_arg=true + shift + continue;; + + -m) chmodcmd="$chmodprog $2" + shift + shift + continue;; + + -o) chowncmd="$chownprog $2" + shift + shift + continue;; + + -g) chgrpcmd="$chgrpprog $2" + shift + shift + continue;; + + -s) stripcmd="$stripprog" + shift + continue;; + + -t=*) transformarg=`echo $1 | sed 's/-t=//'` + shift + continue;; + + -b=*) transformbasename=`echo $1 | sed 's/-b=//'` + shift + continue;; + + *) if [ x"$src" = x ] + then + src=$1 + else + # this colon is to work around a 386BSD /bin/sh bug + : + dst=$1 + fi + shift + continue;; + esac +done + +if [ x"$src" = x ] +then + echo "install: no input file specified" + exit 1 +else + true +fi + +if [ x"$dir_arg" != x ]; then + dst=$src + src="" + + if [ -d $dst ]; then + instcmd=: + chmodcmd="" + else + instcmd=mkdir + fi +else + +# Waiting for this to be detected by the "$instcmd $src $dsttmp" command +# might cause directories to be created, which would be especially bad +# if $src (and thus $dsttmp) contains '*'. 
+ + if [ -f $src -o -d $src ] + then + true + else + echo "install: $src does not exist" + exit 1 + fi + + if [ x"$dst" = x ] + then + echo "install: no destination specified" + exit 1 + else + true + fi + +# If destination is a directory, append the input filename; if your system +# does not like double slashes in filenames, you may need to add some logic + + if [ -d $dst ] + then + dst="$dst"/`basename $src` + else + true + fi +fi + +## this sed command emulates the dirname command +dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` + +# Make sure that the destination directory exists. +# this part is taken from Noah Friedman's mkinstalldirs script + +# Skip lots of stat calls in the usual case. +if [ ! -d "$dstdir" ]; then +defaultIFS=' +' +IFS="${IFS-${defaultIFS}}" + +oIFS="${IFS}" +# Some sh's can't handle IFS=/ for some reason. +IFS='%' +set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'` +IFS="${oIFS}" + +pathcomp='' + +while [ $# -ne 0 ] ; do + pathcomp="${pathcomp}${1}" + shift + + if [ ! -d "${pathcomp}" ] ; + then + $mkdirprog "${pathcomp}" + else + true + fi + + pathcomp="${pathcomp}/" +done +fi + +if [ x"$dir_arg" != x ] +then + $doit $instcmd $dst && + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi +else + +# If we're going to rename the final executable, determine the name now. + + if [ x"$transformarg" = x ] + then + dstfile=`basename $dst` + else + dstfile=`basename $dst $transformbasename | + sed $transformarg`$transformbasename + fi + +# don't allow the sed command to completely eliminate the filename + + if [ x"$dstfile" = x ] + then + dstfile=`basename $dst` + else + true + fi + +# Make a temp file name in the proper directory. 
+ + dsttmp=$dstdir/#inst.$$# + +# Move or copy the file name to the temp name + + $doit $instcmd $src $dsttmp && + + trap "rm -f ${dsttmp}" 0 && + +# and set any options; do chmod last to preserve setuid bits + +# If any of these fail, we abort the whole thing. If we want to +# ignore errors from any of these, just make sure not to ignore +# errors from the above "$doit $instcmd $src $dsttmp" command. + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi && + +# Now rename the file to the real destination. + + $doit $rmcmd -f $dstdir/$dstfile && + $doit $mvcmd $dsttmp $dstdir/$dstfile + +fi && + + +exit 0 diff --git a/forester/archive/RIO/others/hmmer/squid/00README b/forester/archive/RIO/others/hmmer/squid/00README new file mode 100644 index 0000000..fa03d67 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/00README @@ -0,0 +1,39 @@ +SQUID - library of functions for biological sequence analysis +Copyright (C) 1992-2001 Washington University School of Medicine + +SQUID is a freely redistributable library of C code functions for +sequence analysis. SQUID also includes a number of small utility +programs. 
+ +To install squid, see the file: + INSTALL -- instructions for installing the programs + +If you have any questions about redistributing squid or using +squid code in your own work, see the files: + COPYRIGHT -- copyright notice, and information on my distribution policy + LICENSE -- version 2 of the GNU Public License (see COPYRIGHT) + +For a web page with more information on squid, see: + http://www.genetics.wustl.edu/eddy/software/#squid + +You can always download the latest stable release of squid from: + ftp://ftp.genetics.wustl.edu/pub/eddy/software/squid.tar.gz + +The development codebase is available by anonymous CVS: + cvs -d :pserver:anonymous@skynet.wustl.edu:/repository/sre login + (password "anonymous") + cvs -d :pserver:anonymous@skynet.wustl.edu:/repository/sre checkout squid + +If you encounter any bugs in this library, or you have any questions +or comments, please e-mail me at the address below. Due to limited +personal time, I may not respond, but I do read all my mail. + + Sean Eddy + eddy@genetics.wustl.edu + + HHMI/Dept. of Genetics + Washington University School of Medicine + 660 South Euclid Box 8232 + Saint Louis Missouri 63110 + USA + diff --git a/forester/archive/RIO/others/hmmer/squid/Docs/abstract.tex b/forester/archive/RIO/others/hmmer/squid/Docs/abstract.tex new file mode 100644 index 0000000..d27df57 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Docs/abstract.tex @@ -0,0 +1,7 @@ +\begin{abstract} + +The {\tt squid} library is an evolving collection of C functions for +nucleic acid and protein sequence analysis. 
+ +\end {abstract} + + diff --git a/forester/archive/RIO/others/hmmer/squid/Docs/formats.tex b/forester/archive/RIO/others/hmmer/squid/Docs/formats.tex new file mode 100644 index 0000000..f775cc9 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Docs/formats.tex @@ -0,0 +1,517 @@ +% -------------------------------------------------------------- +% squid:formats.tex +% SRE, Wed Jul 14 17:54:59 1999 +% $CVS Id$ +% -------------------------------------------------------------- + +\chapter {Sequence file formats} + +\section{Summary} + +The software can handle a number of different file formats. By +default, it autodetects the file format, so you don't have to worry +about converting formats. Most common file formats are recognized, +including FASTA, Genbank, EMBL, Swissprot, PIR, and GCG for +unaligned sequences, and GCG MSF, Clustal, Phylip, and Stockholm +format for multiple sequence alignments. Some parts of the source code +call the autodetector the ``Babelfish''. + +The Babelfish has three drawbacks. First, it takes a small amount of +time to do the autodetection. Second, the Babelfish is aggressive, and +it makes mistakes when a file isn't one of the known formats -- in +particular, it can recognize plain text files as SELEX alignments, +because the SELEX format is so free-form. Third, because the Babelfish +works by reading the first part of the file then rewinding it before +starting to process it, you can't use the Babelfish on a nonrewindable +stream: e.g. when you're taking sequence input from a UNIX pipe +instead of a file, or when the file is gzipped and has to be +decompressed before processing. In normal use, when you're using the +software interactively from the command line on sequence files that +you're familiar with, the Babelfish is very convenient and +(relatively) safe. 
+ +However, you'll find that there are times when you want to override +the Babelfish -- particularly in high-throughput analysis, when you +know the format your files are supposed to be in, and you'd rather +increase robustness and sacrifice interactive flexibility. All the +programs have an \verb+--informat <format>+ option that lets you +specify the format and shut off the Babelfish. You \emph{must} use +\verb+--informat+ to use compressed files, or to read sequence from a +UNIX pipe... see below for more details on these tricks. + +\section{Formats recognized by the Babelfish} + +Recognized unaligned sequence file formats: + +\begin{tabular}{ll}\hline +Format name & Note \\ \hline +fasta & BLAST flatfile databases, etc.\\ +genbank & NCBI Genbank flat file format.\\ +embl & Includes both EMBL (DNA) and SWISSPROT (protein) databases.\\ +pir & Protein Information Resource database (NBRF/Georgetown)\\ +gcg & Wisconsin Genetics Computer Group; only allows one sequence per file.\\ +gcgdata & I think this GCG database format is obsolete now.\\ \hline +\end{tabular} + +Recognized multiple sequence alignment file formats: + +\begin{tabular}{ll}\hline +Format name & Note \\ \hline +stockholm & Pfam format. Allows databases of more than one alignment per file\\ +selex & Old NeXagen RNA alignment format, adopted by early HMMER releases.\\ +msf & GCG's alignment format.\\ +clustal & ClustalV, ClustalW, and friends.\\ +a2m & Aligned FASTA format; see comment below.\\ +phylip & Format used by Felsenstein's PHYLIP phylogenetic inference software\\\hline +\end{tabular} + +Aligned FASTA format (here called ``A2M'', though I believe that what +Haussler's group at UCSC started calling A2M is yet another variant of +aligned FASTA that's incompatible with this A2M) is only autodetected +when an alignment file is expected. Otherwise an A2M file will be +recognized as unaligned FASTA, and its gap characters (if any) will be +parsed as sequence characters -- often not what you want. 
+ +Alignment files may be used when unaligned files are expected -- the +sequences will silently be de-aligned and read sequentially. The +converse is not true; you can't give an unaligned sequence format when +an alignment is expected (makes sense, right?). + +There is no provision for enforcing that single unaligned sequence +formats really do contain just a single sequence. An attempt to +convert a multisequence file to GCG format will silently ``succeed'', +and the file may look ok to your eye, but that multisequence ``GCG'' +file is illegal. The data will be corrupted if you try to read that +file back in, possibly without generating any error messages. + +It turns out that other formats work too, but they're undocumented, +not subjected to any quality control testing at software release time, +and prone to change without notice at my slightest whim. (In other +words, even less supported than the software already is.) The brave, +curious, or desperate are invited to peruse +\prog{seqio.c} and \prog{squid.h}. + +\section{Special tricks} + +\subsection{Reading from standard input (probably UNIX-only)} + +If you give ``-'' as a sequence filename, the software will read the +sequences from standard input rather than from a file. You will need +to specify the format of the incoming data using the +\verb+--informat+ option. +Any format except SELEX can be read from standard input. This lets you +use any program downstream in a standard UNIX pipe. + +There is one limitation: you can't use ``-'' more than once on a +command line, for obvious reasons. (How is it supposed to read more +than one file from one standard input stream?) If you do, behavior of +the software is undefined -- in other words, the software doesn't check +for whether you're making this mistake, so God help you if you do. + +\subsection{Reading from gzip'ed files (probably UNIX-only)} + +A sequence file in any format except SELEX can be compressed by gzip, +and read in its compressed form. 
The software looks for the suffix +\prog{.gz} to detect gzip'ed files. This allows you to save disk space +by keeping sequence files gzip'ed, if you like. gzip is not built in; +the software needs to find a gzip executable in your current PATH. + +If for some reason you name a file with a \prog{.gz} suffix and it's +\emph{not} a gzip-compressed file, the software will still try to +decompress it, and peculiar things may happen. + +\section{FASTA format, the recommended unaligned format} + +FASTA is probably the simplest of formats for unaligned sequences. +FASTA files are easily created in a text editor. Each sequence is +preceded by a line starting with \verb+>+. The first word on this line +is the name of the sequence. The rest of the line is a description of +the sequence (free format). The remaining lines contain the sequence +itself. You can put as many letters on a sequence line as you want. + +\textbf{Example of a simple FASTA file:} +\begin{verbatim} +>seq1 This is the description of my first sequence. +AGTACGTAGTAGCTGCTGCTACGTGCGCTAGCTAGTACGTCA CGACGTAGATGCTAGCTGACTCGATGC +>seq2 This is a description of my second sequence. +CGATCGATCGTACGTCGACTGATCGTAGCTACGTCGTACGTAG CATCGTCAGTTACTGCATGCTCG +\end{verbatim} + +For better or worse, FASTA is not a documented standard. Minor (and +major) variants are in widespread use in the bioinformatics community, +all of which are called ``FASTA format''. My software attempts to +cater to all of them, and is tolerant of common deviations in FASTA +format. Certainly anything that is accepted by the database formatting +programs in NCBI BLAST or WU-BLAST (e.g. setdb, pressdb, xdformat) +will also be accepted by my software. Blank lines in a FASTA file are +ignored, and so are spaces or other gap symbols (dashes, underscores, +periods) in a sequence. 
Other non-amino or non-nucleic acid symbols in +the sequence are also silently ignored, mostly because some people +seem to think that ``*'' or ``.'' should be added to protein sequences +to (redundantly) indicate the end of the sequence. The parser will +also accept unlimited line lengths, which allows it to accommodate the +enormous description lines in the NCBI NR databases. + +On the other hand, any FASTA files \emph{generated} by my software +adhere closely to community standards, and should be usable by other +software packages (BLAST, FASTA, etc.) that are more picky about +parsing their input files. That means you can run a sloppy FASTA file +thru \prog{sreformat} to clean it up. + +Partly because of this tolerance, the software may have a difficult +time dealing with files that are \textit{not} in FASTA format, +especially if you're relying on the Babelfish to do format +autodetection. Some (now mercifully uncommon) file formats are so +similar to FASTA format that they may be erroneously called FASTA by the +Babelfish and then quietly but lethally misparsed. An example is the +old NBRF file format. If you're using \verb+--informat+, things will +be more robust, and the software should simply refuse to accept a +non-FASTA file -- but you shouldn't count on this, because files +perversely similar to FASTA will still confuse the parser. (The gist +of these caveats applies to all formats, not just FASTA.) + +\section{SELEX, the quick and dirty alignment format} + +An example of a simple SELEX alignment file: + +\begin{verbatim} +# Example selex file + +seq1 ACGACGACGACG. +seq2 ..GGGAAAGG.GA +seq3 UUU..AAAUUU.A + +seq1 ..ACG +seq2 AAGGG +seq3 AA...UUU +\end{verbatim} + +SELEX is an interleaved multiple alignment format that arose as a +simple, intuitive format that was easy to write and manipulate +manually in a text editor. 
It is usually easy to convert other +alignment formats into SELEX format, even with a couple of lines of +Perl, but it can be harder to go the other way, since SELEX is more +free-format than other alignment formats. For instance, GCG's MSF +format and the output of the CLUSTALV multiple alignment program are +similar interleaved formats that can be converted to SELEX just by +stripping a small number of non-sequence lines out. Because SELEX +evolved to accommodate different user input styles, it is very tolerant +of various inconsistencies such as different gap symbols, varying line +lengths, etc. + +Each line contains a name, followed by the aligned sequence. A space, +dash, underscore, or period denotes a gap. If the alignment is too +long to fit on one line, the alignment is split into multiple blocks, +separated by blank lines. The number of sequences, their order, and +their names must be the same in every block (even if a sequence has no +residues in a given block!) Other blank lines are ignored. You can add +comments to the file on lines starting with a \verb+#+. + +SELEX stands for ``Systematic Evolution of Ligands by Exponential +Enrichment'' -- it refers to the Tuerk and Gold technology for +evolving families of small RNAs for particular functions +\cite{Tuerk90b}. SELEX files were what we used to keep track of +alignments of these small RNA families, at a company then called +NeXagen, in Boulder. It's an interesting piece of historical baggage. +With the development of HMMER and more need for annotated alignments +in Pfam, SELEX format later evolved into ``extended SELEX'', with a +reserved comment style that allowed structural markup and other +annotations, but that became unwieldy. We now use Stockholm format +(see below) for highly annotated alignments. (Extended SELEX is +deprecated and undocumented.) Still, the basic SELEX format remains a +useful ``lowest common denominator'' alignment format, and has been +retained. 
+ +\subsubsection {Detailed specification of a SELEX file} + +\begin{enumerate} +\item +Any line beginning with a \verb+#=+ as the first two characters is a +parsed machine comment in extended SELEX, and is now deprecated. + +\item +All other lines beginning with a \verb+%+ or \verb+#+ as the first +character are user comments. User comments are ignored by all +software. Anything may appear on these lines. Any number of comments +may be included in a SELEX file, and at any point. + +\item +Lines of data consist of a name followed by a sequence. The total +length of the line must be smaller than 4096 characters. + +\item +Names must be a single word. Any non-whitespace characters are +accepted. No spaces are tolerated in names: names MUST be a +single word. Names must be less than 32 characters long. + +\item In the sequence, any of the characters \verb+-_.+ or a space are +recognized as gaps. Any other characters are interpreted as sequence. +Sequence is case-sensitive. There is a common assumption by my +software that upper-case symbols are used for consensus (match) +positions and lower-case symbols are used for inserts. This language +of ``match'' versus ``insert'' comes from the hidden Markov model +formalism \cite{Krogh94}. To almost all of my software, this isn't +important, and it immediately converts the sequence to all upper-case +after it's read. + +\item +Multiple different sequences are grouped in a block of data lines. +Blocks are separated by blank lines. No blank lines are tolerated +between the sequence lines in a block. Each block in a multi-block +file of a long alignment must have its sequences in the same order in +each block. The names are checked to verify that this is the case; if +not, only a warning is generated. (In manually constructed files, some +users may wish to use shorthand names in subsequent blocks after an +initial block with full names -- but this isn't recommended.) 
+\end{enumerate} + +\section{Stockholm, the recommended multiple sequence alignment format} + +While we recommend a community standard format (FASTA) for unaligned +sequence files, the recommended multiple alignment file format is not +a community standard. The Pfam Consortium developed a format (based +on extended SELEX) called ``Stockholm format''. The reasons for this +are two-fold. First, there really is no standard accepted format for +multiple sequence alignment files, so we don't feel guilty about +inventing a new one. Second, the formats of popular multiple alignment +software (e.g. CLUSTAL, GCG MSF, PHYLIP) do not support rich +documentation and markup of the alignment. Stockholm format was +developed to support extensible markup of multiple sequence +alignments, and we use this capability extensively in both RNA work +(with structural markup) and the Pfam database (with extensive use of +both annotation and markup). + +\subsection{A minimal Stockholm file} +\begin{verbatim} +# STOCKHOLM 1.0 + +seq1 ACDEF...GHIKL +seq2 ACDEF...GHIKL +seq3 ...EFMNRGHIKL + +seq1 MNPQTVWY +seq2 MNPQTVWY +seq3 MNPQT... + +\end{verbatim} + +The simplest Stockholm file is pretty intuitive, easily generated in a +text editor. It is usually easy to convert alignment formats into a +``least common denominator'' Stockholm format. For instance, SELEX, +GCG's MSF format, and the output of the CLUSTALV multiple alignment +program are all similar interleaved formats. + +The first line in the file must be \verb+# STOCKHOLM 1.x+, where +\verb+x+ is a minor version number for the format specification +(and which currently has no effect on my parsers). This line allows a +parser to instantly identify the file format. + +In the alignment, each line contains a name, followed by the aligned +sequence. A dash or period denotes a gap. If the alignment is too long +to fit on one line, the alignment may be split into multiple blocks, +with blocks separated by blank lines. 
The number of sequences, their +order, and their names must be the same in every block. Within a given +block, each (sub)sequence (and any associated \verb+#=GR+ and +\verb+#=GC+ markup, see below) is of equal length, called the +\textit{block length}. Block lengths may differ from block to block; +the block length must be at least one residue, and there is no +maximum. + +Other blank lines are ignored. You can add comments to the file on +lines starting with a \verb+#+. + +All other annotation is added using a tag/value comment style. The +tag/value format is inherently extensible, and readily made +backwards-compatible; unrecognized tags will simply be ignored. Extra +annotation includes consensus and individual RNA or protein secondary +structure, sequence weights, a reference coordinate system for the +columns, and database source information including name, accession +number, and coordinates (for subsequences extracted from a longer +source sequence). See below for details. + +\subsection{Syntax of Stockholm markup} + +There are four types of Stockholm markup annotation, for per-file, +per-sequence, per-column, and per-residue annotation: + +\begin{wideitem} +\item {\emprog{#=GF <tag> <s>}} + Per-file annotation. \prog{<s>} is a free format text line + of annotation type \prog{<tag>}. For example, \prog{#=GF DATE + April 1, 2000}. Can occur anywhere in the file, but usually + all the \prog{#=GF} markups occur in a header. + +\item {\emprog{#=GS <seqname> <tag> <s>}} + Per-sequence annotation. \prog{<s>} is a free format text line + of annotation type \prog{<tag>} associated with the sequence + named \prog{<seqname>}. For example, \prog{#=GS seq1 + SPECIES_SOURCE Caenorhabditis elegans}. Can occur anywhere + in the file, but in single-block formats (e.g. the Pfam + distribution) will typically follow on the line after the + sequence itself, and in multi-block formats (e.g. HMMER + output), will typically occur in the header preceding the + alignment but following the \prog{#=GF} annotation. 
+ +\item {\emprog{#=GC <tag> <...s...>}} + Per-column annotation. \prog{<...s...>} is an aligned text line + of annotation type \prog{<tag>}. + \verb+#=GC+ lines are + associated with a sequence alignment block; \prog{<...s...>} + is aligned to the residues in the alignment block, and has + the same length as the rest of the block. + Typically \verb+#=GC+ lines are placed at the end of each block. + +\item {\emprog{#=GR <seqname> <tag> <.....s.....>}} + Per-residue annotation. \prog{<...s...>} is an aligned text line + of annotation type \prog{<tag>}, associated with the sequence + named \prog{<seqname>}. + \verb+#=GR+ lines are + associated with one sequence in a sequence alignment block; + \prog{<...s...>} + is aligned to the residues in that sequence, and has + the same length as the rest of the block. + Typically + \verb+#=GR+ lines are placed immediately following the + aligned sequence they annotate. +\end{wideitem} + +\subsection{Semantics of Stockholm markup} + +Any Stockholm parser will accept syntactically correct files, but is +not obligated to do anything with the markup lines. It is up to the +application whether it will attempt to interpret the meaning (the +semantics) of the markup in a useful way. At the two extremes are the +Belvu alignment viewer and the HMMER profile hidden Markov model +software package. + +Belvu simply reads Stockholm markup and displays it, without trying to +interpret it at all. The tag types (\prog{#=GF}, etc.) are sufficient +to tell Belvu how to display the markup: whether it is attached to the +whole file, sequences, columns, or residues. + +HMMER uses Stockholm markup to pick up a variety of information from +the Pfam multiple alignment database. The Pfam consortium therefore +agrees on additional syntax for certain tag types, so HMMER can parse +some markups for useful information. This additional syntax is imposed +by Pfam, HMMER, and other software of mine, not by Stockholm format +per se. 
You can think of Stockholm as akin to XML, and what my +software reads as akin to an XML DTD, if you're into that sort of +structured data format lingo. + +The Stockholm markup tags that are parsed semantically by my software +are as follows: + +\subsubsection{Recognized #=GF annotations} +\begin{wideitem} +\item [\emprog{ID <s>}] + Identifier. \emprog{<s>} is a name for the alignment; + e.g. ``rrm''. One word. Unique in file. + +\item [\emprog{AC <s>}] + Accession. \emprog{<s>} is a unique accession number for the + alignment; e.g. + ``PF00001''. Used by the Pfam database, for instance. + Often an alphabetical prefix indicating the database + (e.g. ``PF'') followed by a unique numerical accession. + One word. Unique in file. + +\item [\emprog{DE <s>}] + Description. \emprog{<s>} is a free format line giving + a description of the alignment; e.g. + ``RNA recognition motif proteins''. One line. Unique in file. + +\item [\emprog{AU <s>}] + Author. \emprog{<s>} is a free format line listing the + authors responsible for an alignment; e.g. + ``Bateman A''. One line. Unique in file. + +\item [\emprog{GA <f> <f>}] + Gathering thresholds. Two real numbers giving HMMER bit score + per-sequence and per-domain cutoffs used in gathering the + members of Pfam full alignments. See Pfam and HMMER + documentation for more detail. + +\item [\emprog{NC <f> <f>}] + Noise cutoffs. Two real numbers giving HMMER bit score + per-sequence and per-domain cutoffs, set according to the + highest scores seen for unrelated sequences when gathering + members of Pfam full alignments. See Pfam and HMMER + documentation for more detail. + +\item [\emprog{TC <f> <f>}] + Trusted cutoffs. Two real numbers giving HMMER bit score + per-sequence and per-domain cutoffs, set according to the + lowest scores seen for true homologous sequences that + were above the GA gathering thresholds, when gathering + members of Pfam full alignments. See Pfam and HMMER + documentation for more detail. 
+\end{wideitem} + +\subsection{Recognized #=GS annotations} + +\begin{wideitem} +\item [\emprog{WT <f>}] + Weight. \emprog{<f>} is a positive real number giving the + relative weight for a sequence, usually used to compensate + for biased representation by downweighting similar sequences. + Usually the weights average 1.0 (i.e. the weights sum to + the number of sequences in the alignment) but this is not + required. Either every sequence must have a weight annotated, + or none of them can. + +\item [\emprog{AC <s>}] + Accession. \emprog{<s>} is a database accession number for + this sequence. (Compare the \prog{#=GF AC} markup, which gives + an accession for the whole alignment.) One word. + +\item [\emprog{DE <s>}] + Description. \emprog{<s>} is one line giving a description for + this sequence. (Compare the \prog{#=GF DE} markup, which gives + a description for the whole alignment.) +\end{wideitem} + + +\subsection{Recognized #=GC annotations} + +\begin{wideitem} +\item [\emprog{RF}] + Reference line. Any character is accepted as a markup for a + column. The intent is to allow labeling the columns with some + sort of mark. + +\item [\emprog{SS_cons}] + Secondary structure consensus. For protein alignments, + DSSP codes or gaps are accepted as markup: [HGIEBTSCX.-_], where + H is alpha helix, G is 3/10-helix, I is p-helix, E is extended + strand, B is a residue in an isolated b-bridge, T is a turn, + S is a bend, C is a random coil or loop, and X is unknown + (for instance, a residue that was not resolved in a crystal + structure). For RNA alignments + the symbols \verb+>+ and \verb+<+ are + used for base pairs (pairs point at each other). \verb-+- indicate + definitely single-stranded positions, and any gap symbol indicates + unassigned bases or single-stranded positions. This description + roughly follows \cite{Konings89}. 
+ RNA pseudoknots are represented by alphabetic characters, with upper + case letters representing the 5' side of the helix and lower case + letters representing the 3' side. Note that this limits the + annotation to a maximum of 26 pseudoknots per sequence. + + +\item [\emprog{SA_cons}] + Surface accessibility consensus. 0-9, gap symbols, or X are + accepted as markup. 0 means <10\% accessible residue surface + area, 1 means <20\%, 9 means <100\%, etc. X means unknown + structure. +\end{wideitem} + +\subsection{Recognized #=GR annotations} + +\begin{wideitem} +\item [\emprog{SS}] + Secondary structure consensus. See \prog{#=GC SS_cons} above. +\item [\emprog{SA}] + Surface accessibility consensus. See \prog{#=GC SA_cons} above. +\end{wideitem} + + diff --git a/forester/archive/RIO/others/hmmer/squid/Docs/gsi-format.tex b/forester/archive/RIO/others/hmmer/squid/Docs/gsi-format.tex new file mode 100644 index 0000000..3170824 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Docs/gsi-format.tex @@ -0,0 +1,87 @@ +% Mon Dec 5 15:23:18 1994 + +\section{GSI format} + +{\tt GSI} (``generic sequence index'') is a format for indexing +sequence databases. Database retrieval programs such as {\tt sfetch} +can read GSI files when they are available to enable fast retrieval of +a sequence from large databases. + +GSI files are created from sequence databases by Perl scripts. +Scripts are currently provided for indexing GenBank, SwissProt, +GenPept, FASTA, and PIR formatted databases. + +\subsection {GSI programmatic details} + +A single GSI file indexes one or more files in a sequence database. +It is a binary file consisting of a number of fixed-length records. +There are three types of records: one header record, one file record +for every file in the database, and one keyword record for every +sequence retrieval key. (The retrieval key is usually the sequence +name, but may also be a database accession number.) 
+ +Every GSI record is 38 bytes long and contains three fields: 32 bytes +of text (31 bytes plus a trailing NUL byte), a 2 byte network short, +and a 4 byte network long. (``Network short'' and ``network long'' +refer to portable integer variables of fixed size and byte order. See +Perl manuals for a few more details.) + +The first record is a header. It contains a short identifying text +string (``GSI''), then the number of files indexed ({\tt nfiles}), and +the number of keywords indexed ({\tt nkeys}). + +The next {\tt nfiles} records (records 1..{\tt nfiles}) map file +numbers onto file names. The three fields are \verb+ +. These records must be in numerical order +according to their file numbers. Because of the 31-character +restriction on filename lengths, the sequence files will generally +have to be in the same directory as the GSI index file. The file +format number is defined in {\tt squid.h}: + +\begin{tabular}{rl} +0 & Unknown \\ +1 & Intelligenetics\\ +2 & Genbank\\ +4 & EMBL\\ +5 & GCG single sequence\\ +6 & Strider \\ +7 & FASTA\\ +8 & Zuker\\ +9 & Idraw\\ +12 & PIR\\ +13 & Raw\\ +14 & SQUID\\ +16 & GCG data library \\ +101& Stockholm alignment\\ +102& SELEX alignment\\ +103& GCG MSF alignment\\ +104& Clustal alignment\\ +105& A2M (aligned FASTA) alignment\\ +106& Phylip\\ +\end{tabular} + +The remaining records ({\tt nfiles}+1..{\tt nfiles+nkeys}) are for +mapping keys onto files and disk offsets. The three fields are +\verb+ +. These records must be +sorted in alphabetic order by their retrieval keys, because the +function GSIGetOffset() locates a keyword in the index file by a +binary search. + +\subsection{Relevant functions} +\begin{description} +\item[GSIOpen()] + Opens a GSI index file. +\item[GSIGetRecord()] + Gets three fields from the current record. +\item[GSIGetOffset()] + Looks up a keyword in a GSI index and returns a filename, + file format, and disk offset in the file. 
+\item[SeqfilePosition()] + Repositions an open sequence file to a given disk offset. +\item[GSIClose()] + Closes an open GSI index file. +\end{description} + + + + diff --git a/forester/archive/RIO/others/hmmer/squid/Docs/intro.tex b/forester/archive/RIO/others/hmmer/squid/Docs/intro.tex new file mode 100644 index 0000000..166c382 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Docs/intro.tex @@ -0,0 +1,2 @@ +\section {Introduction} + diff --git a/forester/archive/RIO/others/hmmer/squid/Docs/main.tex b/forester/archive/RIO/others/hmmer/squid/Docs/main.tex new file mode 100644 index 0000000..e469d1f --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Docs/main.tex @@ -0,0 +1,35 @@ + +\documentstyle[jmb]{article} +\setcounter{secnumdepth}{0} +\input{psfig} + +\addtolength{\oddsidemargin}{-.5in} +\addtolength{\textwidth}{1in} +\addtolength{\topmargin}{-.5in} +\addtolength{\textheight}{1in} +\renewcommand{\baselinestretch}{1.2} + +\title{The SQUID sequence function library} + +\author{Sean R. Eddy \\ +MRC Laboratory of Molecular Biology\\ +Hills Road\\ +Cambridge CB2 2QH\\ +England\\ +sre@mrc-lmb.cam.ac.uk} + +\begin{document} +\bibliographystyle{jmb} +\nocite{TitlesOn} + +\maketitle + +\input{abstract} +\input{formats} +\input{selex} +\input{squid-format} +\input{gsi-format} + +\bibliography{master} + +\end{document} diff --git a/forester/archive/RIO/others/hmmer/squid/Docs/selex.tex b/forester/archive/RIO/others/hmmer/squid/Docs/selex.tex new file mode 100644 index 0000000..c802e16 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Docs/selex.tex @@ -0,0 +1,153 @@ +\section{ SELEX alignment file format } + +\subsection{ Example of a simple SELEX format file} + +\begin{verbatim} +# Example selex file + +seq1 ACGACGACGACG. +seq2 ..GGGAAAGG.GA +seq3 UUU..AAAUUU.A + +seq1 ..ACG +seq2 AAGGG +seq3 AA...UUU +\end{verbatim} + +SELEX is an interleaved multiple alignment format that evolved as an +intuitive format. 
SELEX files are easy to write and manipulate +manually with a text editor. It is usually easy to convert other +alignment formats into SELEX format; the output of the CLUSTALV +multiple alignment program and GCG's MSF format are similar +interleaved formats. Because it evolved to accommodate different user +input styles, it is very tolerant of various inconsistencies such as +different gap symbols, varying line lengths, etc. + +As the format evolved, more features have been added. To maintain +compatibility with past alignment files, the new features are added +using a reserved comment style. These extra features are usually +maintained by automated SELEX-generating software, such as the {\tt +koala} sequence alignment editor or my {\tt cove} and {\tt hmm} sequence +analysis packages. This extra information includes consensus and +individual RNA or protein secondary structure, per-sequence weights, a +reference coordinate system for the columns, and database source +information including name, accession number, and coordinates (for +subsequences extracted from a longer source sequence). + +\subsection {Specification of a SELEX file} + +\begin{enumerate} +\item +Any line beginning with a \verb+#=+ as the first two characters is a +machine ``comment''. \verb+#=+ comments are reserved for additional +data about the alignment. Usually these features are maintained by +software such as the {\tt koala} editor, not by hand. + +\item +Any other line beginning with a \verb+%+ or \verb+#+ as the first +character is a user comment. User comments are ignored by all +software. Any number of comments may be included. + +\item +Lines of data consist of a name followed by a sequence. The total +length of the line must be smaller than 1024 characters. + +\item +Names must be a single word. Any non-whitespace characters are +accepted. No spaces are tolerated in names: names MUST be a +single word.
+ +\item +In the sequence, any of the characters \verb+-_.+ or a space are +recognized as gaps. Gaps are converted to a '.'. Any other characters +are interpreted as sequence. Sequence is case-sensitive. There is a +common assumption by my software that upper-case symbols are used for +consensus (match) positions and lower-case symbols are used for +inserts. This language of ``match'' versus ``insert'' comes from the +hidden Markov model formalism \cite{Krogh94}. To almost all of my +software, this isn't important, and it immediately converts the +sequence to all upper-case after it's read. + +\item +Multiple different sequences are grouped in a block of data lines. +Blocks are separated by blank lines. No blank lines are tolerated +between the sequence lines in a block. Each block in a multi-block +file of a long alignment must have its sequences in the same order in +each block. The names are checked to verify that this is the case; if +not, only a warning is generated. (In manually constructed files, some +users may wish to use shorthand names after the first block with full +names, but this isn't recommended.) +\end{enumerate} + +\subsection {Special comments} + +\subsubsection {Secondary structure} + +I use one-letter codes to indicate secondary structures. Secondary +structure strings are aligned to sequence blocks just like additional +sequences. + +For RNA secondary structure, the symbols \verb+>+ and \verb+<+ are +used for base pairs (pairs point at each other). \verb-+- indicate +other single-stranded positions, {\tt .} indicates unassigned bases. +This description follows \cite{Konings89}. For protein secondary +structure, I use {\tt E} to indicate residues in $\beta$-sheet, {\tt +H} for those in $\alpha$-helix, {\tt L} for those in loops, and {\tt +.} for unassigned residues. + +RNA pseudoknots are represented by alphabetic characters, with upper +case letters representing the 5' side of the helix and lower case +letters representing the 3' side. 
Note that this restricts the +annotation to a maximum of 26 pseudoknots per sequence. + +Lines beginning with \verb+#=SS+ or \verb+#=CS+ are individual or +consensus secondary structure data, respectively. \verb+#=SS+ +individual secondary structure lines must immediately follow the +sequence they are associated with. There can only be one \verb+#=SS+ +per sequence. \verb+#=CS+ consensus secondary structure predictions +precede all the sequences in each block. There can only be one +\verb+#=CS+ per file. + +\subsubsection {Reference coordinate system} + +Alignments are usually numbered by some reference coordinate system, +often a canonical molecule. For instance, tRNA positions are numbered +by reference to the positions of yeast tRNA-Phe. + +A line beginning with \verb+#=RF+ preceding the sequences in a block +gives a reference coordinate system. Any non-gap symbol in the +\verb+#=RF+ line indicates that sequence positions in its columns are +numbered. For instance, the \verb+#=RF+ lines for a tRNA alignment +would have 76 non-gap symbols for the canonical numbered columns; they +might be the aligned tRNA-Phe sequence itself, or they might be just +X's. + +\subsubsection {Sequence header} + +Additional per-sequence information can be placed in a header before +any blocks appear. These lines, one per sequence and in exactly the +same order as the sequences appear in the alignment, are formatted +like \verb+#=SQ ++. + +This information includes a sequence weight (for compensating for +biased representation of subfamilies of sequences in the alignment); +source information, if the sequence came from a database, consisting +of identifier, accession number, and source coordinates; and a +description of the sequence. + +If a \verb+#=SQ+ line is present, all the fields must be present. If +no information is available for a field, use '-' for all the fields +except the source coordinates, which would be given as '0'. 
+ +\subsubsection {Author} + +The first non-comment, non-blank line of the file may be a \verb+#=AU+ +``author'' line. There is a programmatic interface for +alignment-generating programs to record a short comment like \verb+11 +November 1993, by Feng-Doolittle v. 2.1.1+, and this comment will be +recorded on the \verb+#=AU+ line by \verb+WriteSELEX()+. + + + diff --git a/forester/archive/RIO/others/hmmer/squid/Docs/squid-format.tex b/forester/archive/RIO/others/hmmer/squid/Docs/squid-format.tex new file mode 100644 index 0000000..e318534 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Docs/squid-format.tex @@ -0,0 +1,80 @@ +\newpage +\section {SQUID format} + +SQUID format is a sequence database format similar to the PIR, +GenBank, and EMBL formats. The primary difference is that SQUID format +may optionally contain secondary structure annotation information for +the sequence. No other sequence format allows secondary structure +annotation, which is why SQUID format became necessary. + +An example SQUID format file: + +\begin{verbatim} +NAM DY9990 +SRC HSTGYA M27547 76..169::196 +DES Human Tyr-tRNA gene, clone pM6. +SEQ +SS + 1 ccttcgatagctcagctggtagagcggaggactgtagactgcggaaacgt + >>>>>>>..>>>>........<<<<.>>>>>................... + 51 ttgtggacatccttaggtcgctggttcaattccggctcgaagga + .........<<<<<.....>>>>>.......<<<<<<<<<<<<. +++ +NAM DY9991 +SRC HSTRNAYE M55611 1..93::93 +DES Human Tyr-tRNA precursor. +SEQ +SS + 1 ccttcgatagctcagctggtagagcggaggactgtagcctgtagaaacat + >>>>>>>..>>>>........<<<<.>>>>>................... + 51 ttgtggacatccttaggtcgctggttcgattccggctcgaagg + .........<<<<<.....>>>>>.......<<<<<<<<<<<< +++ +NAM DA0260 +SEQ + 1 GGGCGAAUAGUGUCAGCGGGAGCACACCAGACUUGCAAUCUGGUAGGGAG + 51 GGUUCGAGUCCCUCUUUGUCCACCA +++ +\end{verbatim} + + +\subsection {Specification of a SQUID file} + +\begin{enumerate} +\item There must be a line of the form \verb+NAM +. 
+ +\item There may be an optional line \verb+SRC +..::+, which specified a database source for this +sequence, giving the database identifier (name), accession number, +start and end position in the database sequence, and the original +length of the database sequence, respectively. If a \verb+SRC+ line +is present, all of these values must be specified. If any values are +unknown, they may be set to \verb+-+ in the case of \verb++ and +\verb++ and \verb+0+ in the case of \verb++, \verb+, +and \verb++, and in these cases the values will be ignored. + +\item There may be an optional line \verb+DES + giving +a one-line description of the sequence. + +\item There must be a line of the form \verb-SEQ +SS- or \verb-SEQ-. +If the line contains \verb-+SS-, it means that the record contains +secondary structure annotation interleaved with the sequence. + +\item The sequence (and optional structure) immediately follow. There may be +optional numbering either before or after the sequence. The number of +characters per line is unimportant. Spaces and tabs are ignored. +There must be no non-numeric non-space characters on any lines except +sequence or structure annotation characters. Structure annotation is +fairly free-form; any alphabetic character or character in the set +\verb/_.-*?<>{}[]()!@#$%^&=+;:'|`~"\/ is accepted. There must +be one such character for every sequence character (preferably aligned +to the sequence, but in fact this is not checked for). Note that +spaces in the secondary structure annotation are not permitted, +except where they are aligned to gaps in the sequence. + +\item Sequence records are separated by a line of the form \verb-++-. 
+\end{enumerate} + + + + + + diff --git a/forester/archive/RIO/others/hmmer/squid/Docs/ssi-format.tex b/forester/archive/RIO/others/hmmer/squid/Docs/ssi-format.tex new file mode 100644 index 0000000..b236d13 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Docs/ssi-format.tex @@ -0,0 +1,641 @@ +% SRE, Mon Dec 25 13:00:46 2000 + +\documentclass[12pt]{report} +\usepackage{fullpage} +\usepackage{times} +\usepackage{epsfig} +%\usepackage{html} % From the LaTeX2html translator +\usepackage{apalike} +\setcounter{secnumdepth}{2} + +\input{macros} + +\begin{document} +\bibliographystyle{apalike} + +\section{SSI format} + +SSI format (Sequence/Subsequence Index format) indexes flatfile +databases by names and/or accessions, enabling fast retrieval. + +An SSI index is a binary file that stores sequence names or accessions +as \emph{keys} that it can look up rapidly. It differentiates between +\emph{primary keys} and \emph{secondary keys}. There is one and only +one primary key per sequence. There can be more than one secondary key +per sequence. Both primary and secondary keys must be unique +identifiers (no two records have the same key). A program (like +HMMER's distributed PVM implementation) that needs to step through +each sequence one at a time can refer to the list of primary keys. A +program solely concerned with flexible sequence retrieval (such as +SQUID's \prog{sfetch}) might consult an SSI index with accessions as +primary keys, and names as secondary keys. + +A single SSI file can index multiple sequence data files. This allows +indexing multifile databases (e.g. Genbank flatfile distributions). + +The SSI format is relatively simple and may prove useful for other +indexing tasks besides sequence names. HMMER uses SSI format to index +HMM files. 
+ +\subsection{Special features of SSI} + +SSI superseded 1994's GSI format after human genome sequence files +started exceeding 2 GB filesystem limitations, and after problems in +the HMMER PVM implementation had to be hacked around. SSI has the +following additional features compared to GSI. + +\begin{description} +\item[Separate primary key section] +Primary keys are set apart in a separate section, enabling programs to +step through a guaranteed one-to-one mapping of keys to sequences. A +secondary key section adds many-to-one mapping of keys to sequences. + +\item[Arbitrary filename and key lengths] +File name lengths and key name lengths are effectively unlimited. + +\item[64-bit indexing] +For sequence files exceeding 2 GB, on architectures that support 64-bit +filesystems (such as IRIX, Solaris, Tru64 UNIX, FreeBSD...), SSI +supports 64-bit indexing; depending on the system, file sizes may +theoretically be allowed to range up to millions of terabytes. + +\item[Fast subsequence extraction] +SSI can be used to greatly accelerate \emph{subsequence} extraction +from very long sequences (example: human chromosome contigs). The +sequence file must meet certain formatting conditions for this to +work; see below for details. +\end{description} + +\subsection{SSI API in SQUID} + +\subsubsection{Functions for using a SSI index file:} + +\begin{sreapi} +\item[int SSIOpen(char *filename, SSIFILE **ret\_sfp)] + +Opens the SSI index file \prog{filename} and returns a \prog{SSIFILE +*} stream through \prog{ret\_sfp}. Returns 0 on success, nonzero on +failure. The caller must eventually close this stream using +\prog{SSIClose()}. More than one index can be open at once.
+ +Error codes:\\ +\begin{tabular}{ll} +\prog{SSI\_ERR\_NOFILE} & failed to open file; doesn't exist or not readable\\ +\prog{SSI\_ERR\_BADMAGIC} & not a SSI file \\ +\prog{SSI\_ERR\_NO64BIT} & it has 64-bit offsets, and we can't support that\\ +\prog{SSI\_ERR\_FORMAT} & file appears to be corrupted\\ +\prog{SSI\_ERR\_MALLOC} & malloc failed \\ +\end{tabular} + +\item[int SSIGetOffsetByName(SSIFILE *sfp, char *key, int *ret\_fh, SSIOFFSET *ret\_offset)] + +Looks up the string \prog{key} in the open index \prog{sfp}. +\prog{key} can be either a primary or secondary key. If \prog{key} is +found, \prog{*ret\_fh} contains a unique handle on the file +that contains {key} (suitable for an \prog{SSIFileInfo()} call, or for +comparison to the handle of the last file that was opened for +retrieval), and \prog{offset} is filled in with the offset in that +file. Returns 0 on success, non-zero on error. + +Error codes:\\ +\begin{tabular}{ll} +\prog{SSI\_ERR\_NO\_SUCH\_KEY} & key not found \\ +\prog{SSI\_ERR\_NODATA} & fread() failed, file appears to be corrupted\\ +\end{tabular} + +\item[int SSIGetOffsetByNumber(SSIFILE *sfp, int nkey, int +*ret\_fh, SSIOFFSET *offset)] + +Retrieves information for primary key number \prog{nkey}. \prog{nkey} +ranges from 0..\prog{nprimary-1}. When the key is found, +\prog{*ret\_fh} contains a unique handle on the file that +contains {key} (suitable for an SSIFileInfo() call, or for comparison +to the handle of the last file that was opened for retrieval), and +\prog{offset} is filled in with the offset in that file. Returns 0 on +success, non-zero on error. 
+ +Error codes:\\ +\begin{tabular}{ll} +\prog{SSI\_ERR\_SEEK\_FAILED} & failed to reposition in index file\\ +\prog{SSI\_ERR\_NO\_SUCH\_KEY} & key not found \\ +\prog{SSI\_ERR\_NODATA} & fread() failed, file appears to be corrupted\\ +\end{tabular} + +\item[int SSIGetSubseqOffset(SSIFILE *sfp, char *key, int +requested\_start, int *ret\_fh, +SSIOFFSET *record\_offset, SSIOFFSET *data\_offset, int *ret\_actual\_start)] + +Implements \prog{SSI\_FAST\_SUBSEQ}. + +Looks up the string \prog{key} in the open index \prog{sfp}, and +asks for the nearest offset to a subsequence starting at position +\prog{requested\_start} in the sequence (numbering the sequence 1..L). +\prog{key} can be either a primary or secondary key. If \prog{key} is +found, \prog{*ret\_fh} contains a unique handle on the file that +contains \prog{key} (suitable for an \prog{SSIFileInfo()} call, or for comparison +to the handle of the last file that was opened for retrieval); +\prog{record\_offset} contains the disk offset to the start of the +record; \prog{data\_offset} contains the disk offset either exactly at +the requested residue, or at the start of the line containing the +requested residue; \prog{ret\_actual\_start} contains the coordinate +(1..L) of the first valid residue at or after +\prog{data\_offset}. \prog{ret\_actual\_start} is $\leq$ +\prog{requested\_start}. Returns 0 on success, non-zero on failure. + +Error codes:\\ +\begin{tabular}{ll} +\prog{SSI\_ERR\_NO\_SUBSEQS} & this file or key doesn't allow subseq lookup\\ +\prog{SSI\_ERR\_NO\_SUCH\_KEY} & key not found \\ +\prog{SSI\_ERR\_RANGE} & the requested\_start is out of bounds\\ +\prog{SSI\_ERR\_NODATA} & fread() failed, file appears to be corrupted\\ +\end{tabular} + +\item[int SSISetFilePosition(FILE *fp, SSIOFFSET *offset)] + +Uses \prog{offset} to set the file position for \prog{fp} (usually an +open sequence file) relative to the start of the file.
Hides the +details of system-dependent shenanigans necessary for file positioning +in large ($>2$ GB) files. Behaves just like \prog{fseek(fp, offset, +SEEK\_SET)} for 32 bit offsets and $<2$ GB files. Returns 0 on +success, nonzero on error. + +Error codes:\\ +\begin{tabular}{ll} +\prog{SSI\_ERR\_SEEK\_FAILED} & failed to reposition the file\\ +\end{tabular} + +\item[int SSIFileInfo(SSIFILE *sfp, int fh, char **ret\_filename, int *ret\_format)] + +Given a file handle \prog{fh} in an open index file \prog{sfp}, +retrieve file name \prog{ret\_filename} and the file format +\prog{ret\_format}. \prog{ret\_filename} is a pointer to a string +maintained internally by \prog{sfp}. It should not be free'd; +\prog{SSIClose(sfp)} will take care of it. + +Error codes:\\ +\begin{tabular}{ll} +\prog{SSI\_ERR\_BADARG} & no such file handle \prog{fh}\\ +\end{tabular} + +\item[void SSIClose(SSIFILE *sfp)] + +Close an open \prog{SSIFILE *}. +\end{sreapi} + +\subsubsection{Skeleton example code for using a SSI index file:} + +\small\begin{verbatim} + SSIFILE *sfp; + FILE *fp; + int fh; + char *seqfile; + int fmt; + SSIOFFSET offset; + + SSIOpen("foo.ssi", &sfp); + + /* Finding an entry by name + * (by number, with SSIGetOffsetByNumber(), is analogous) + */ + SSIGetOffsetByName(sfp, "important_key", &fh, &offset); + SSIFileInfo(sfp, fh, &seqfile, &fmt); + fp = fopen(seqfile, "r"); /* more usually SeqfileOpen(), using fmt */ + SSISetFilePosition(fp, &offset); + /* read the entry from there, do whatever... */ + /* don't free(seqfile): it belongs to sfp; SSIClose() takes care of it */ + fclose(fp); + + SSIClose(sfp); +\end{verbatim}\normalsize + +\subsubsection{Functions for creating a SSI index file:} + +\begin{sreapi} +\item[int SSIRecommendMode(char *file)] + +Examines the file and determines whether it should be indexed with +large file support or not; returns \prog{SSI\_OFFSET\_I32} for most +files, \prog{SSI\_OFFSET\_I64} for large files, or -1 on failure.
+ +\item[SSIINDEX *SSICreateIndex(int mode)] + +Creates and initializes a SSI index structure. Sequence file offset +type to be used is specified by \prog{mode}, which may be either +\prog{SSI\_OFFSET\_I32} or \prog{SSI\_OFFSET\_I64}. Returns a +pointer to the new structure, or NULL on failure. The caller must free +this structure with \prog{SSIFreeIndex()} when done. + +\item[int SSIGetFilePosition(FILE *fp, int mode, SSIOFFSET *ret\_offset)] + +Fills \prog{ret\_offset} with the current disk offset of \prog{fp}, +relative to the start of the file. {mode} is the type of offset to +use; it must be either \prog{SSI\_OFFSET\_I32} or +\prog{SSI\_OFFSET\_I64}. Returns 0 on success, non-zero on error. + +Error codes:\\ +\begin{tabular}{ll} +\prog{SSI\_ERR\_NO64BIT} & 64-bit mode unsupported on this system\\ +\prog{SSI\_ERR\_TELL\_FAILED} & failed to determine position in file\\ +\end{tabular} + +\item[int SSIAddFileToIndex(SSIINDEX *g, char *filename, int fmt, +int *ret\_fh)] + +Adds the sequence file \prog{filename}, which is known to be in format +\prog{fmt}, to the index \prog{g}. Creates and returns a unique +filehandle \prog{ret\_fh} for associating primary keys with this file +using \prog{SSIAddPrimaryKeyToIndex()}. Returns 0 on success, non-zero +on failure. + +Error codes:\\ +\begin{tabular}{ll} +\prog{SSI\_ERR\_TOOMANY\_FILES} & exceeded file number limit\\ +\prog{SSI\_ERR\_MALLOC} & a malloc() failed\\ +\end{tabular} + +\item[int SSISetFileForSubseq(SSIINDEX *g, int fh, int bpl, int rpl)] + +Set \prog{SSI\_FAST\_SUBSEQ} for the file indicated by filehandle +\prog{fh} in the index \prog{g}, setting parameters \prog{bpl} and +\prog{rpl} to the values given. \prog{bpl} is the number of bytes per +sequence data line. \prog{rpl} is the number of residues per sequence +data line. Caller must be sure that \prog{bpl} and \prog{rpl} do not +change on any line of any sequence record in the file (except for the +last data line of each record). 
If this is not the case in this file, +\prog{SSI\_FAST\_SUBSEQ} will not work, and this routine should not be +called. Returns 0 on success, non-zero on failure. + +\item[int SSIAddPrimaryKeyToIndex(SSIINDEX *g, char *key, int +fh, SSIOFFSET *r\_off, SSIOFFSET *d\_off, int L)] + +Puts a primary key \prog{key} in the index \prog{g}, while telling the +index that this primary key is in the file associated with filehandle +\prog{fh} and its record starts at position \prog{r\_off} in that +file. + +\prog{d\_off} and \prog{L} are optional; they may be left unset by +passing NULL and 0, respectively. (If one is provided, both must be +provided.) If they are provided, \prog{d\_off} gives the position of +the first line of sequence data in the record, and \prog{L} gives +the length of the sequence in residues. They are used when +\prog{SSI\_FAST\_SUBSEQ} is set for the sequence file. If +\prog{SSI\_FAST\_SUBSEQ} is not set for the file, \prog{d\_off} and +\prog{L} will be ignored even if they are available, so it doesn't +hurt for the indexing program to provide them; typically it won't know +whether it's safe to set \prog{SSI\_FAST\_SUBSEQ} for the whole file +until the whole file has been read and every key has already been +added to the index. + +Through \prog{ret\_kh} it provides a ``handle'' - a unique +identifier for the primary key - that any subsequent calls to +\prog{SSIAddSecondaryKeyToIndex()} will use to associate one or more +secondary keys with this primary key. + +Returns 0 on success, non-zero on error. 
+ +Error codes:\\ +\begin{tabular}{ll} +\prog{SSI\_ERR\_TOOMANY\_KEYS} & exceeded primary key limit\\ +\prog{SSI\_ERR\_TOOMANY\_FILES} & filenum exceeds file limit\\ +\prog{SSI\_ERR\_MALLOC} & a malloc() failed\\ +\end{tabular} + + +\item[int SSIAddSecondaryKeyToIndex(SSIINDEX *g, char *key, char *pkey)] + +Puts a secondary key \prog{key} in the index \prog{g}, associating it +with a primary key \prog{pkey} that has already been added to the index +by \prog{SSIAddPrimaryKeyToIndex()}. +Returns 0 on success, non-zero on error. + +Error codes:\\ +\begin{tabular}{ll} +\prog{SSI\_ERR\_TOOMANY\_KEYS} & exceeded secondary key limit\\ +\prog{SSI\_ERR\_MALLOC} & a malloc() failed\\ +\end{tabular} + + + +\item[int SSIWriteIndex(char *file, SSIINDEX *g)] + +Writes complete index \prog{g} in SSI format to a binary file +\prog{file}. Handles all the overhead of sorting the primary and secondary +keys, and maintaining the association of secondary keys with primary +keys during and after the sort. Returns 0 on success, non-zero on +error. + +Error codes:\\ +\begin{tabular}{ll} +\prog{SSI\_ERR\_NOFILE} & an fopen() failed\\ +\prog{SSI\_ERR\_FWRITE} & an fwrite() failed\\ +\prog{SSI\_ERR\_MALLOC} & a malloc() failed\\ +\end{tabular} + + +\item[void SSIFreeIndex(SSIINDEX *g)] + +Free an index structure. +\end{sreapi} + + +\subsubsection{Other SSI functions:} + +\begin{sreapi} +\item[char *SSIErrorString(int n)] + +Returns a pointer to an internal string corresponding to error +\prog{n}, a return code from any of the functions in the API that +return non-zero on error. +\end{sreapi} + +\subsection{Detailed specification of SSI binary format} + +There are four sections to the SSI file: +\begin{sreitems}{\textbf{Secondary keys}} +\item[\textbf{Header}] +Contains a magic number indicating the SSI version number, and +various information about the number and sizes of things in the index. + +\item[\textbf{Files}] +Contains one or more \emph{file records}, one per sequence file that's +indexed.
These contain information about the individual files. + +\item[\textbf{Primary keys}] +Contains one or more \emph{primary key records}, one per primary key. + +\item[\textbf{Secondary keys}] +Contains one or more \emph{secondary key records}, one per secondary key. +\end{sreitems} + +All numeric quantities are stored as unsigned integers of known size +in network (big-endian) order, for maximum cross-platform portability of +the index files. \prog{sqd\_uint16}, \prog{sqd\_uint32}, and +\prog{sqd\_uint64} are typically typedef'd as \prog{unsigned short}, +\prog{unsigned int}, and \prog{unsigned long long} or \prog{unsigned +long} at SQUID compile-time. Values may need to be cast to signed +quantities, so only half of their dynamic range is valid +(e.g. 0..32,767 for values of type \prog{sqd\_uint16}; +0..2,147,483,647 (2 billion) for \prog{sqd\_uint32}; and 0..9.22e18 (9 +million trillion) for \prog{sqd\_uint64}). These typedef's are +handled automatically by the \prog{./configure} script (see +\prog{squidconf.h.in} before configuration, \prog{squidconf.h} after +configuration). If necessary, \prog{./configure}'s guess can be +overridden in \prog{squidconf.h} after configuration. + +\subsubsection{Header section} + +The header section contains: + +\vspace{1em} +\begin{tabular}{llrr} +Variable & Description & Bytes & Type \\\hline +\prog{magic} & SSI version magic number. & 4 & \prog{sqd\_uint32}\\ +\prog{flags} & Optional behavior flags (see below) & 4 & \prog{sqd\_uint32}\\ +\prog{nfiles} & Number of files in file section. & 2 & \prog{sqd\_uint16}\\ +\prog{nprimary} & Number of primary keys. & 4 & \prog{sqd\_uint32}\\ +\prog{nsecondary} & Number of secondary keys. & 4 & \prog{sqd\_uint32}\\ +\prog{flen} & Length of filenames (incl. '\verb+\0+') & 4 & \prog{sqd\_uint32}\\ +\prog{plen} & Length of primary key names (incl. '\verb+\0+') & 4 & \prog{sqd\_uint32}\\ +\prog{slen} & Length of sec. key names (incl.
'\verb+\0+') & 4 & \prog{sqd\_uint32}\\ +\prog{frecsize} & \# of bytes in a file record & 4 & \prog{sqd\_uint32}\\ +\prog{precsize} & \# of bytes in a primary key record & 4 & \prog{sqd\_uint32}\\ +\prog{srecsize} & \# of bytes in a sec. key record & 4 & \prog{sqd\_uint32}\\ +\prog{foffset} & disk offset, start of file records & \dag & \dag\\ +\prog{poffset} & disk offset, start of primary key recs & \dag & \dag\\ +\prog{soffset} & disk offset, start of sec. key records & \dag & \dag\\ +\end{tabular} +\vspace{1em} + +The optional behavior flags are: + +\vspace{1em} +\begin{tabular}{lll} +Flag & Value& Note\\ \hline +\prog{SSI\_USE64} & $1 \ll 0$ & Large sequence files; all key offsets 64 bit.\\ +\prog{SSI\_USE64\_INDEX} & $1 \ll 1$ & Large index; SSI file itself uses 64-bit offsets.\\\hline +\end{tabular} +\vspace{1em} + +The optional behavior flags define whether the SSI file uses large +file (64-bit) offsets. This issue is discussed in greater detail +below (see ``Large sequence files and large indices''). Briefly: if +\prog{SSI\_USE64} is set, the sequence file is large, and all sequence +file offsets are 64-bit integers. If \prog{SSI\_USE64\_INDEX} is +set, the index file itself is large, and \prog{foffset}, +\prog{poffset}, and \prog{soffset} (that is, all offsets within the +index file itself, indicated as \dag\ in the above table) are 64-bit +integers. \footnote{In the current API it is not expected that +\prog{SSI\_USE64\_INDEX} would ever be set. The current index-writing +API keeps the entire index in RAM (it has to sort the keys), and would +presumably have to be modified or replaced to be able to generate very +large indices.} + +The reason to explicitly record various record sizes (\prog{frecsize}, +\prog{precsize}, \prog{srecsize}) and index file positions +(\prog{foffset}, \prog{poffset}, \prog{soffset}) is to allow future +extensibility. More fields might be added without breaking older SSI +parsers.
The format is meant to be both forwards- and +backwards-compatible. + +\subsubsection{File section} + +The file section consists of \prog{nfiles} file records. Each record +is \prog{frecsize} bytes long, and contains: + +\vspace{1em} +\begin{tabular}{llrr} +Variable & Description & Bytes & Type \\\hline +\prog{filename} & Name of file (possibly including full path) & \prog{flen} & char *\\ +\prog{format} & Format code for file; see squid.h for definitions & 4 & \prog{sqd\_uint32} \\ +\prog{flags} & Optional behavior flags & 4 & \prog{sqd\_uint32} \\ +\prog{bpl} & Bytes per sequence data line & 4 & \prog{sqd\_uint32} \\ +\prog{rpl} & Residues per sequence data line & 4 & \prog{sqd\_uint32} \\\hline +\end{tabular} +\vspace{1em} + +When a SSI file is written, \prog{frecsize} is equal to the sum of +the sizes above. When a SSI file is read by a parser, it is possible +that \prog{frecsize} is larger than the parser expects, if the parser +is expecting an older version of the SSI format: additional fields +may be present, which increases \prog{frecsize}. The parser will only +try to understand the data up to the \prog{frecsize} it expected to +see, but still knows the absolutely correct \prog{frecsize} for +purposes of skipping around in the index file. + +Normally the SSI index resides in the same directory as the sequence +data file(s), so \prog{filename} is relative to the location of the +SSI index. In the event this is not true, \prog{filename} can contain +a full path. + +\prog{format} is a SQUID sequence file format code; e.g. something like +\prog{SQFILE\_FASTA} or \prog{MSAFILE\_STOCKHOLM}. These constants are defined +in \prog{squid.h}. + +Only one possible optional behavior flag is defined: + +\vspace{1em} +\begin{tabular}{lll} +Flag & Value& Note\\ \hline +\prog{SSI\_FAST\_SUBSEQ} & $1 \ll 0$ & Fast subseq retrieval is possible for this file.\\\hline +\end{tabular} +\vspace{1em} + +When \prog{SSI\_FAST\_SUBSEQ} is set, \prog{bpl} and \prog{rpl} are +nonzero. 
They can be used to calculate the offset of subsequence +positions in the data file. This is described in the optional behavior +section below. + +\subsubsection{Primary key section} + +The primary key section consists of \prog{nprimary} records. Each +record is \prog{precsize} bytes long, and contains: + +\vspace{1em} +\begin{tabular}{llrr} +Variable & Description & Bytes & Type \\\hline +\prog{key} & Key name (seq name, identifier, accession) & \prog{plen}& char *\\ +\prog{fnum} & File number (0..nfiles-1) & 2 & \prog{sqd\_uint16}\\ +\prog{offset1} & Offset to start of record & \ddag & \ddag \\ +\prog{offset2} & Offset to start of sequence data & \ddag & \ddag \\ +\prog{len} & Length of data (e.g. seq length, residues) & 4 & \prog{sqd\_uint32} \\\hline +\end{tabular} +\vspace{1em} + +The offsets are sequence file offsets (indicated by \ddag). They are +4 bytes of type \prog{sqd\_uint32} normally, 8 bytes of type +\prog{sqd\_uint64} if \prog{SSI\_USE64} is set, and \prog{sizeof(fpos\_t)} +bytes of type \prog{fpos\_t} if \prog{SSI\_FPOS\_T} is set. + +\prog{offset2} and \prog{len} are only meaningful if \prog{SSI\_FAST\_SUBSEQ} +is set on this key's file. \prog{offset2} gives the absolute disk +position of line 0 in the sequence data. \prog{len} is necessary for +bounds checking in a subsequence retrieval, to be sure we don't try to +reposition the disk outside the valid data. + +\subsubsection{Secondary key section} + +The secondary key section consists of \prog{nsecondary} records. Each +record is \prog{srecsize} bytes long, and contains: + +\vspace{1em} +\begin{tabular}{llrr} +Variable & Description & Bytes & Type \\\hline +\prog{key} & Key name (seq name, identifier, accession) & \prog{slen}& char *\\ +\prog{pkey} & Primary key & +\prog{plen}& char *\\\hline +\end{tabular} +\vspace{1em} + +All data are kept with the primary key records. Secondary keys are +simply translated to primary keys, then the primary key has to be +looked up.
+ +\subsection{Optional behaviors} + +\subsubsection{Large sequence files and large indices: 64-bit operation} + +Normally a SSI index file can be no larger than 2 GB, and can index +sequence files that are no larger than 2 GB each. This is due to +limitations in the ANSI C/POSIX standards, which were developed for +32-bit operating systems and filesystems. Most modern operating +systems allow larger 64-bit file sizes, but as far as I'm aware (Dec +2000), there are no standard interfaces yet for working with positions +(offsets) in large files. On many platforms, SSI can extend to full +64-bit capabilities, but on some platforms, it cannot. To understand +the limitations (of SSI, and possibly of my understanding) you need +to understand some details about what's happening behind the SSI API +and how I understand C API's to modern 64-bit OS's and hardware. + +First, some information on ANSI C APIs for file positioning. ANSI C +provides the portable functions \prog{fseek()} and \prog{ftell()} for +manipulating simple offsets in a file. They store the offset in a +\prog{long} (which ranges up to 2 GB). The Standard says we're allowed +to do arithmetic on this value if the file is binary. ANSI C also +provides \prog{fgetpos()} and \prog{fsetpos()} which store file +positions in an opaque data type called \prog{fpos\_t}. Modern +operating systems with large file support define \prog{fpos\_t} in a +way that permits files $>$2 GB. However, \prog{fpos\_t} is an opaque +type. It has two disadvantages compared to a simple arithmetic type +like \prog{long}: first, we're not allowed to do arithmetic on it, and +second, we can't store it in a binary file in an +architecture-independent manner. We need both features for SSI, +unfortunately. \footnote{Surely the professional C community has the +same problem; does \emph{everyone} hack around \prog{fpos\_t}?} + +Therefore we have to rely on system dependent features. 
Most operating +systems provide a non-compliant library call that returns an +arithmetic offset. Fully 64-bit systems typically give us a 64-bit +\prog{off\_t} and functions \prog{ftello()}/\prog{fseeko()} that work +with that offset. Many systems provide a ``transitional interface'' +where all normally named functions are 32-bit, but specially named +64-bit varieties are available: e.g. \prog{off\_t} is 32 bits, but +\prog{off64\_t} is 64 bits and we have functions \prog{ftello64()} and +\prog{fseeko64()}. Some systems provide a \prog{ftell64()} and +\prog{fseek64()} that work on offsets of type \prog{long long}. Many +popular systems may even provide more than one of these models, +depending on compiler flags. + +And, unfortunately, some systems provide none of these models (FreeBSD +for example). There, we will exploit the fact that most systems +(including FreeBSD) do in fact implement \prog{fpos\_t} as a simple +arithmetic type, such as an \prog{off\_t}, so we can misuse it. + +At compile time, SQUID's \prog{./configure} script tests for the +system's capabilities for 64-bit file offsets, and configures a +section in the \prog{squidconf.h} file. (The configuration includes a +custom autoconf macro, \prog{SQ\_ARITHMETIC\_FPOS\_T()}, to test whether +\prog{fpos\_t} is an arithmetic type and define \prog{ARITHMETIC\_FPOS\_T} if it is.) Four +possible 64-bit models are tested in the following order; if one of +them is possible, it will be used, and the constant +\prog{HAS\_64BIT\_FILE\_OFFSETS} is set. + +\begin{enumerate} +\item has \prog{ftello()}, \prog{fseeko()}; sizeof(\prog{off\_t}) $= 8$. +\item has \prog{ftello64()}, \prog{fseeko64()}; sizeof(\prog{off64\_t}) $= 8$. +\item has \prog{ftell64()}, \prog{fseek64()}. +\item \prog{fpos\_t} is an arithmetic 64-bit type; (mis)use +\prog{fgetpos()}, \prog{fsetpos()}. +\end{enumerate} + + + +\subsubsection{Fast subsequence retrieval} + +In some files (notably vertebrate chromosome contigs) the size of each +sequence is large.
It may be slow to extract a subsequence by first +reading the whole sequence into memory -- or even prohibitive, if the +sequence is so large that it can't be stored in memory. + +If the sequence data file is very consistently formatted so that each +line in each record (except the last one) is of the same length, in +both bytes and residues, we can determine a disk offset of the start +of any subsequence by direct calculation. +For example, a simple well-formatted FASTA +file with 50 residues per line would have 51 bytes per sequence line +(counting the '\verb+\n+') (\prog{bpl}=51, \prog{rpl}=50). Position $i$ in a sequence +$1..L$ will be on line $l = (i-1)/\mbox{\prog{rpl}}$ (integer division; lines are numbered from 0), and line $l$ starts at +disk offset $l * \mbox{\prog{bpl}}$ relative to the start of the sequence +data. If there are no nonsequence characters in the data line except +the terminal '\verb+\n+' (which is true iff \prog{bpl} = \prog{rpl}+1 and 1 residue = 1 +byte), position $i$ can be precisely found: + +\[ +\mbox{relative offset of residue $i$} = +\left((i-1)/\mbox{\prog{rpl}}\right)*\mbox{\prog{bpl}} + (i-1) \% \mbox{ \prog{rpl}} +\] + +Even for sequence data lines with extra characters (e.g. spaces, +coordinates, whatever), fast subsequence retrieval is possible; a +parser can be positioned at the beginning of the appropriate line $l$, +which starts at residue $(l*\mbox{\prog{rpl}}) + 1$, and it can start reading +from there (i.e. the line that $i$ is on) rather than the beginning of +the whole sequence record. + +The program that creates the index is responsible for determining if +\prog{bpl} and \prog{rpl} are consistent throughout a file; if so, it +may set the \prog{SSI\_FAST\_SUBSEQ} flag for the file. Then any record +whose primary key carries the optional data offset (\prog{offset2}) +and sequence length data is available for subsequence position +calculations by \prog{SSIGetSubseqOffset()}.
+ +\end{document} \ No newline at end of file diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/a2m b/forester/archive/RIO/others/hmmer/squid/Formats/a2m new file mode 100644 index 0000000..5001742 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/a2m @@ -0,0 +1,200 @@ +>GLB2_MORMR +...PIVD..SGSVSPLSDAEKNKIRAAW.DIVYKNYEKNGVDILVKFFTGTPAAQAFFPK +FKGLTTADALKKSSDVRWHAERIINAVNDAVKSMDDTEKMSMKLQELSVKHAQSFYVDRQ +YFKVLAGII.........ADTTAPGDAGFEKLMSMICILLSSAY....... +>GLBZ_CHITH +MKFIILALCVAAASALSGDQIGLVQST.YGKVKG....DSVGILYAVFKADPTIQAAFPQ +FVGK.DLDAIKGGAEFSTHAGRIVGFLGGVIDDLP...NIGKHVDALVATH.KPRGVTHA +QFNNFRAAFIAYLKGHV..DYTAAVEAAWGATFDAFFGAVFAK.......M +>HBA2_BOSMU +...V...........LSAADKGNVKAAW.GKVGGHAAEYGAEALERMFLSFPTTKTYFPH +FD.LSH.....GSAQVKGHGAKVAAALTKAVGHLDD...LPGALSELSDLHAHKLRVDPV +NFKLLSHSLLVTLASHLPSDFTPAVHASLDKFLANVSTVLTSKYR...... +>HBA2_GALCR +...V...........LSPTDKSNVKAAW.EKVGAHAGDYGAEALERMFLSFPTTKTYFPH +FD.LSH.....GSTQVKGHGKKVADALTNAVLHVDD...MPSALSALSDLHAHKLRVDPV +NFKLLRHCLLVTLACHHPAEFTPAVHASLDKFMASVSTVLTSKYR...... +>HBA4_SALIR +...S...........LSAKDKANVKAIW.GKILPKSDEIGEQALSRMLVVYPQTKAYFSH +WASVAP.....GSAPVKKHGITIMNQIDDCVGHMDD...LFGFLTKLSELHATKLRVDPT +NFKILAHNLIVVIAAYFPAEFTPEIHLSVDKFLQQLALALAEKYR...... +>HBAD_CHLME +...M...........LTADDKKLLTQLW.EKVAGHQEEFGSEALQRMFLTYPQTKTYFPH +FD.LHP.....GSEQVRGHGKKVAAALGNAVKSLDN...LSQALSELSNLHAYNLRVDPA +NFKLLAQCFQVVLATHLGKDYSPEMHAAFDKFLSAVAAVLAEKYR...... +>HBAD_PASMO +...M...........LTAEDKKLIQQIW.GKLGGAEEEIGADALWRMFHSYPSTKTYFPH +FD.LSQ.....GSDQIRGHGKKVVAALSNAIKNLDN...LSQALSELSNLHAYNLRVDPV +NFKFLSQCLQVSLATRLGKEYSPEVHSAVDKFMSAVASVLAEKYR...... +>HBAZ_HORSE +...S...........LTKAERTMVVSIW.GKISMQADAVGTEALQRLFSSYPQTKTYFPH +FD.LHE.....GSPQLRAHGSKVAAAVGDAVKSIDN...VAGALAKLSELHAYILRVDPV +NFKFLSHCLLVTLASRLPADFTADAHAAWDKFLSIVSSVLTEKYR...... 
+>HBA_AILME +...V...........LSPADKTNVKATW.DKIGGHAGEYGGEALERTFASFPTTKTYFPH +FD.LSP.....GSAQVKAHGKKVADALTTAVGHLDD...LPGALSALSDLHAHKLRVDPV +NFKLLSHCLLVTLASHHPAEFTPAVHASLDKFFSAVSTVLTSKYR...... +>HBA_ANSSE +...V...........LSAADKGNVKTVF.GKIGGHAEEYGAETLQRMFQTFPQTKTYFPH +FD.LQP.....GSAQIKAHGKKVAAALVEAANHIDD...IAGALSKLSDLHAQKLRVDPV +NFKFLGHCFLVVLAIHHPSLLTPEVHASMDKFLCAVATVLTAKYR...... +>HBA_COLLI +...V...........LSANDKSNVKAVF.AKIGGQAGDLGGEALERLFITYPQTKTYFPH +FD.LSH.....GSAQIKGHGKKVAEALVEAANHIDD...IAGALSKLSDLHAQKLRVDPV +NFKLLGHCFLVVVAVHFPSLLTPEVHASLDKFVLAVGTVLTAKYR...... +>HBA_ERIEU +...V...........LSATDKANVKTFW.GKLGGHGGEYGGEALDRMFQAHPTTKTYFPH +FD.LNP.....GSAQVKGHGKKVADALTTAVNNLDD...VPGALSALSDLHAHKLRVDPV +NFKLLSHCLLVTLALHHPADFTPAVHASLDKFLATVATVLTSKYR...... +>HBA_FRAPO +...V...........LSAADKNNVKGIF.GKISSHAEDYGAEALERMFITYPSTKTYFPH +FD.LSH.....GSAQVKGHGKKVVAALIEAANHIDD...IAGTLSKLSDLHAHKLRVDPV +NFKLLGQCFLVVVAIHHPSALTPEVHASLDKFLCAVGNVLTAKYR...... +>HBA_MACFA +...V...........LSPADKTNVKAAW.GKVGGHAGEYGAEALERMFLSFPTTKTYFPH +FD.LSH.....GSAQVKGHGKKVADALTLAVGHVDD...MPQALSALSDLHAHKLRVDPV +NFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR...... +>HBA_MACSI +...V...........LSPADKTNVKDAW.GKVGGHAGEYGAEALERMFLSFPTTKTYFPH +FD.LSH.....GSAQVKGHGKKVADALTLAVGHVDD...MPQALSALSDLHAHKLRVDPV +NFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR...... +>HBA_MESAU +...V...........LSAKDKTNISEAW.GKIGGHAGEYGAEALERMFFVYPTTKTYFPH +FD.VSH.....GSAQVKGHGKKVADALTNAVGHLDD...LPGALSALSDLHAHKLRVDPV +NFKLLSHCLLVTLANHHPADFTPAVHASLDKFFASVSTVLTSKYR...... +>HBA_PAGLA +...V...........LSSADKNNIKATW.DKIGSHAGEYGAEALERTFISFPTTKTYFPH +FD.LSH.....GSAQVKAHGKKVADALTLAVGHLED...LPNALSALSDLHAYKLRVDPV +NFKLLSHCLLVTLACHHPAEFTPAVHSALDKFFSAVSTVLTSKYR...... +>HBA_PHACO +...V...........LSAADKNNVKGIF.TKIAGHAEEYGAEALERMFITYPSTKTYFPH +FD.LSH.....GSAQIKGHGKKVVAALIEAVNHIDD...ITGTLSKLSDLHAHKLRVDPV +NFKLLGQCFLVVVAIHHPSALTPEVHASLDKFLCAVGTVLTAKYR...... 
+>HBA_PONPY +...V...........LSPADKTNVKTAW.GKVGAHAGDYGAEALERMFLSFPTTKTYFPH +FD.LSH.....GSAQVKDHGKKVADALTNAVAHVDD...MPNALSALSDLHAHKLRVDPV +NFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR...... +>HBA_PROLO +...V...........LSPADKANIKATW.DKIGGHAGEYGGEALERTFASFPTTKTYFPH +FD.LSP.....GSAQVKAHGKKVADALTLAVGHLDD...LPGALSALSDLHAYKLRVDPV +NFKLLSHCLLVTLACHHPAEFTPAVHASLDKFFTSVSTVLTSKYR...... +>HBA_TRIOC +...V...........LSANDKTNVKTVF.TKITGHAEDYGAETLERMFITYPPTKTYFPH +FD.LHH.....GSAQIKAHGKKVVGALIEAVNHIDD...IAGALSKLSDLHAQKLRVDPV +NFKLLGQCFLVVVAIHHPSVLTPEVHASLDKFLCAVGNVLSAKYR...... +>HBB1_VAREX +...V..........HWTAEEKQLICSLW.GKI..DVGLIGGETLAGLLVIYPWTQRQFSH +FGNLSSPTAIAGNPRVKAHGKKVLTSFGDAIKNLDN...IKDTFAKLSELHCDKLHVDPT +NFKLLGNVLVIVLADHHGKEFTPAHHAAYQKLVNVVSHSLARRYH...... +>HBB2_TRICR +...V..........HLTAEDRKEIAAIL.GKV..NVDSLGGQCLARLIVVNPWSRRYFHD +FGDLSSCDAICRNPKVLAHGAKVMRSIVEATKHLDN...LREYYADLSVTHSLKFYVDPE +NFKLFSGIVIVCLALTLQTDFSCHKQLAFEKLMKGVSHALGHGY....... +>HBB2_XENTR +...V..........HWTAEEKATIASVW.GKV..DIEQDGHDALSRLLVVYPWTQRYFSS +FGNLSNVSAVSGNVKVKAHGNKVLSAVGSAIQHLDD...VKSHLKGLSKSHAEDLHVDPE +NFKRLADVLVIVLAAKLGSAFTPQVQAVWEKLNATLVAALSHGYF...... +>HBBL_RANCA +...V..........HWTAEEKAVINSVW.QKV..DVEQDGHEALTRLFIVYPWTQRYFST +FGDLSSPAAIAGNPKVHAHGKKILGAIDNAIHNLDD...VKGTLHDLSEEHANELHVDPE +NFRRLGEVLIVVLGAKLGKAFSPQVQHVWEKFIAVLVDALSHSYH...... +>HBB_CALAR +...V..........HLTGEEKSAVTALW.GKV..NVDEVGGEALGRLLVVYPWTQRFFES +FGDLSTPDAVMNNPKVKAHGKKVLGAFSDGLTHLDN...LKGTFAHLSELHCDKLHVDPE +NFRLLGNVLVCVLAHHFGKEFTPVVQAAYQKVVAGVANALAHKYH...... +>HBB_COLLI +...V..........HWSAEEKQLITSIW.GKV..NVADCGAEALARLLIVYPWTQRFFSS +FGNLSSATAISGNPNVKAHGKKVLTSFGDAVKNLDN...IKGTFAQLSELHCDKLHVDPE +NFRLLGDILVIILAAHFGKDFTPECQAAWQKLVRVVAHALARKYH...... +>HBB_EQUHE +...V..........QLSGEEKAAVLALW.DKV..NEEEVGGEALGRLLVVYPWTQRFFDS +FGDLSNPAAVMGNPKVKAHGKKVLHSFGEGVHHLDN...LKGTFAQLSELHCDKLHVDPE +NFRLLGNVLVVVLARHFGKDFTPELQASYQKVVAGVANALAHKYH...... 
+>HBB_LARRI +...V..........HWSAEEKQLITGLW.GKV..NVADCGAEALARLLIVYPWTQRFFAS +FGNLSSPTAINGNPMVRAHGKKVLTSFGEAVKNLDN...IKNTFAQLSELHCDKLHVDPE +NFRLLGDILIIVLAAHFAKDFTPDSQAAWQKLVRVVAHALARKYH...... +>HBB_MANSP +...V..........HLTPEEKTAVTTLW.GKV..NVDEVGGEALGRLLVVYPWTQRFFDS +FGDLSSPDAVMGNPKVKAHGKKVLGAFSDGLNHLDN...LKGTFAQLSELHCDKLHVDPE +NFKLLGNVLVCVLAHHFGKEFTPQVQAAYQKVVAGVANALAHKYH...... +>HBB_ORNAN +...V..........HLSGGEKSAVTNLW.GKV..NINELGGEALGRLLVVYPWTQRFFEA +FGDLSSAGAVMGNPKVKAHGAKVLTSFGDALKNLDD...LKGTFAKLSELHCDKLHVDPE +NFNRLGNVLIVVLARHFSKDFSPEVQAAWQKLVSGVAHALGHKYH...... +>HBB_RABIT +...V..........HLSSEEKSAVTALW.GKV..NVEEVGGEALGRLLVVYPWTQRFFES +FGDLSSANAVMNNPKVKAHGKKVLAAFSEGLSHLDN...LKGTFAKLSELHCDKLHVDPE +NFRLLGNVLVIVLSHHFGKEFTPQVQAAYQKVVAGVANALAHKYH...... +>HBB_SPECI +...V..........HLSDGEKNAISTAW.GKV..HAAEVGAEALGRLLVVYPWTQRFFDS +FGDLSSASAVMGNAKVKAHGKKVIDSFSNGLKHLDN...LKGTFASLSELHCDKLHVDPE +NFKLLGNMIVIVMAHHLGKDFTPEAQAAFQKVVAGVANALAHKYH...... +>HBB_SPETO +...V..........HLTDGEKNAISTAW.GKV..NAAEIGAEALGRLLVVYPWTQRFFDS +FGDLSSASAVMGNAKVKAHGKKVIDSFSNGLKHLDN...LKGTFASLSELHCDKLHVDPE +NFKLLGNMIVIVMAHHLGKDFTPEAQAAFQKVVAGVANALSHKYH...... +>HBB_SUNMU +...V..........HLSGEEKACVTGLW.GKV..NEDEVGAEALGRLLVVYPWTQRFFDS +FGDLSSASAVMGNPKVKAHGKKVLHSLGEGVANLDN...LKGTFAKLSELHCDKLHVDPE +NFRLLGNVLVVVLASKFGKEFTPPVQAAFQKVVAGVANALAHKYH...... +>HBB_TACAC +...V..........HLSGSEKTAVTNLW.GHV..NVNELGGEALGRLLVVYPWTQRFFES +FGDLSSADAVMGNAKVKAHGAKVLTSFGDALKNLDN...LKGTFAKLSELHCDKLHVDPE +NFNRLGNVLVVVLARHFSKEFTPEAQAAWQKLVSGVSHALAHKYH...... +>HBB_TRIIN +...V..........HLTPEEKALVIGLW.AKV..NVKEYGGEALGRLLVVYPWTQRFFEH +FGDLSSASAIMNNPKVKAHGEKVFTSFGDGLKHLED...LKGAFAELSELHCDKLHVDPE +NFRLLGNVLVCVLARHFGKEFSPEAQAAYQKVVAGVANALAHKYH...... +>HBB_TUPGL +...V..........HLSGEEKAAVTGLW.GKV..DLEKVGGQSLGSLLIVYPWTQRFFDS +FGDLSSPSAVMSNPKVKAHGKKVLTSFSDGLNHLDN...LKGTFAKLSELHCDKLHVDPE +NFRLLGNVLVRVLACNFGPEFTPQVQAAFQKVVAGVANALAHKYH...... 
+>HBB_URSMA +...V..........HLTGEEKSLVTGLW.GKV..NVDEVGGEALGRLLVVYPWTQRFFDS +FGDLSSADAIMNNPKVKAHGKKVLNSFSDGLKNLDN...LKGTFAKLSELHCDKLHVDPE +NFKLLGNVLVCVLAHHFGKEFTPQVQAAYQKVVAGVANALAHKYH...... +>HBE_PONPY +...V..........HFTAEEKAAVTSLW.SKM..NVEEAGGEALGRLLVVYPWTQRFFDS +FGNLSSPSAILGNPKVKAHGKKVLTSFGDAIKNMDN...LKTTFAKLSELHCDKLHVDPE +NFKLLGNVMVIILATHFGKEFTPEVQAAWQKLVSAVAIALAHKYH...... +>HBF1_URECA +..............GLTTAQIKAIQDHWFLNIKGCLQAAADSIFFKYLTAYPGDLAFFHK +FSSV.PLYGLRSNPAYKAQTLTVINYLDKVVDALGG..NAGALMKAKVPSH.DAMGITPK +HFGQLLKLVGGVFQEEF..SADPTTVAAWGDAAGVLVAAM..........K +>LGB1_PEA +GFTDKQEALVNSSSE.FKQNLPGYSILFYTIVLEKAP..AAKGL................ +FSFLKDTAGVEDSPKLQAHAEQVFGLVRDSAAQLRTKGEVVLGNATLGAIHVQKGVTNP. +HFVVVKEALLQTIKKASGNNWSEELNTAWEVAYDGLATAIKKAMKT....A +>LGB1_VICFA +GFTEKQEALVNSSSQLFKQNPSNYSVLFYTIILQKAP..TAKAM................ +FSFLKDSAGVVDSPKLGAHAEKVFGMVRDSAVQLRATGEVVLDGKD.GSIHIQKGVLDP. +HFVVVKEALLKTIKEASGDKWSEELSAAWEVAYDGLATAIK....A....A +>MYG_ESCGI +...V...........LSDAEWQLVLNIW.AKVEADVAGHGQDILIRLFKGHPETLEKFDK +FKHLKTEAEMKASEDLKKHGNTVLTALGGILKKKGH...HEAELKPLAQSHATKHKIPIK +YLEFISDAIIHVLHSRHPGDFGADAQAAMNKALELFRKDIAAKYKELGFQG +>MYG_HORSE +...G...........LSDGEWQQVLNVW.GKVEADIAGHGQEVLIRLFTGHPETLEKFDK +FKHLKTEAEMKASEDLKKHGTVVLTALGGILKKKGH...HEAELKPLAQSHATKHKIPIK +YLEFISDAIIHVLHSKHPGNFGADAQGAMTKALELFRNDIAAKYKELGFQG +>MYG_LYCPI +...G...........LSDGEWQIVLNIW.GKVETDLAGHGQEVLIRLFKNHPETLDKFDK +FKHLKTEDEMKGSEDLKKHGNTVLTALGGILKKKGH...HEAELKPLAQSHATKHKIPVK +YLEFISDAIIQVLQNKHSGDFHADTEAAMKKALELFRNDIAAKYKELGFQG +>MYG_MOUSE +...G...........LSDGEWQLVLNVW.GKVEADLAGHGQEVLIGLFKTHPETLDKFDK +FKNLKSEEDMKGSEDLKKHGCTVLTALGTILKKKGQ...HAAEIQPLAQSHATKHKIPVK +YLEFISEIIIEVLKKRHSGDFGADAQGAMSKALELFRNDIAAKYKELGFQG +>MYG_MUSAN +..................VDWEKVNSVW.SAVESDLTAIGQNILLRLFEQYPESQNHFPK +FKN.KSLGELKDTADIKAQADTVLSALGNIVKKKGS...HSQPVKALAATHITTHKIPPH +YFTKITTIAVDVLSEMYPSEMNAQVQAAFSGAFKIICSDIEKEYKAANFQG +>MYG_PROGU +...G...........LSDGEWQLVLNVW.GKVEGDLSGHGQEVLIRLFKGHPETLEKFDK 
+FKHLKAEDEMRASEELKKHGTTVLTALGGILKKKGQ...HAAELAPLAQSHATKHKIPVK +YLEFISEAIIQVLQSKHPGDFGADAQGAMSKALELFRNDIAAKYKELGFQG +>MYG_SAISC +...G...........LSDGEWQLVLNIW.GKVEADIPSHGQEVLISLFKGHPETLEKFDK +FKHLKSEDEMKASEELKKHGTTVLTALGGILKKKGQ...HEAELKPLAQSHATKHKIPVK +YLELISDAIVHVLQKKHPGDFGADAQGAMKKALELFRNDMAAKYKELGFQG diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/clustal b/forester/archive/RIO/others/hmmer/squid/Formats/clustal new file mode 100644 index 0000000..cebd347 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/clustal @@ -0,0 +1,47 @@ +CLUSTAL W(1.5) multiple sequence alignment + + +REF -----GCGGATTTAGCTCAGTTGGGAGAGCGCCAGACTGAAAATCTGGAGGTC-CTGTGT +A0380 -----GGGCTCGTAGATCAG-CGGTAGATCGCTTCCTTCGCAAGGAAGAGGCC-CTGGGT +A0500 -----GGGCTCGTAGATCAG-TGGCAGATCGCTTCCTTCGCAAGGAAGAGGCC-CGGGGT +A0501 -----GGGCTCGTAGATCAG-GGGTAGATCACTCCCTTGGCATGGGAGAGGCC-CCGGGT +A0502 -----GGGCCCATAGCTCAG-TGGTAGAGTGCCTCCTTTGCAAGGAGGATGCC-CAGGGT +A1140 -----GGGCCCTAAGCTCAGCTGGGAGAGCACCTGCCTTGCACGCAGGGGGTC-GACGGT +A1180 -----GGGCCCTTAGCTCAGCTGGGAGAGCACCTGCCTTGCACGCAGGGGGTC-GACGGT +A1540 -----GGAGCCTTAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGTC-AGCGGT +A1660 -----GGGGCTATAGCTCAGCTGGGAGAGCGCTTGCATGGCATGCAAGAGGTC-AGCGGT +A1661 -----GGGGGCATAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGTC-TGCGGT +A1662 -----GGGGCTATAGCTCAGCTGGGAGAGCGCCTGCTTTGCACGCAGGAGGTC-TGCGGT +A3920 -----GGGGGTATAGTATAATTGGTAGTACAGCAATCTTGCTCAATGCTTGTC--AAGGT +A6360 -----GGGCGTGTGGCGTAGTTGGTAGCGCGTTCGCTTAGCATGCGAAAGGTC-TCCGGT +A6400 -----GGGCGTGTGGCGTAGTCGGTAGCGCGCTCCCTTAGCATGGGAGAGGTC-TCCGGT +A7680 -----GGGGGCGTAGCTCAGATGGTAGAGCGCTCGCTTAGCATGTGAGAGGTA-CCGGGA +A7681 -----GGGGGCGTAGCTCAGATGGTAGAGCGCTCGCTTAGCATGCGAGAGGTA-CCGGGA +A9990 -----GGGGGATTAGCTCAAATGGTAGAGCGCTCGCTTAGCATGCGAGAGGTA-GCGGGA +A9991 -----GGGGAATTAGCTCAAATGGTAGAGCGCTCGCTTAGCATGCGAGAGGTA-GCGGGA +C0500 GCCAAGGTGGCAGAATTCGGC--CCAACGCATCCGCCTGCAGAGCGGAACCCCCGCCGGT +C1140 -----GGCAACAAGGCCAAGCGGCTAAGGCATGGGTCTGCAACACCCTGATC--ATCGGT + * * * * + +REF 
TCGATCCACAGAATTCGCACCA +A0380 TCAAATCCCAGCGAGTCCACCA +A0500 TCAAATCCCCGCGAGTCCACCA +A0501 TCAAATCCCGGCGAGTCCACCA +A0502 TCGAATCCCTGTGGGTCCACCA +A1140 TCGATCCCGTTAGGGTCCACCA +A1180 TCGATCCCGTTAGGGTCCACCA +A1540 TCGATCCCGCTAGGCTCCACCA +A1660 TCGATCCCGCTTAGCTCCACCA +A1661 TCGATCCCGCGCGCTCCCACCA +A1662 TCGATCCCGCATAGCTCCACCA +A3920 TCAAATCCTTGTATCTCCACCA +A6360 TCGACTCCGGACTCGTCCACCA +A6400 TCGATTCCGGACTCGTCCACCA +A7680 TCGATACCCGGCGCCTCCACCA +A7681 TCGATACCCGGCGCCTCCACCA +A9990 TCGATGCCCGCATCCTCCACCA +A9991 TCGATGCCCGCATTCTCCACCA +C0500 TCAAATCCGGCCCTTGGCTCCA +C1140 TCGAATCCGATTGTTGCCTCCA + ** * * * *** + diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/embl b/forester/archive/RIO/others/hmmer/squid/Formats/embl new file mode 100644 index 0000000..717a99e --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/embl @@ -0,0 +1,155 @@ +ID XXPHA21 standard; DNA; PHG; 1635 BP. +XX +AC X02501; M23775; +XX +SV X02501.1 +XX +DT 28-JAN-1986 (Rel. 08, Created) +DT 12-SEP-1993 (Rel. 36, Last updated, Version 3) +XX +DE Bacteriophage 21 DNA for left end sequence with genes 1 and 2 +XX +KW overlapping genes. +XX +OS Bacteriophage 21 +OC Viruses; dsDNA viruses, no RNA stage; Tailed phages; Siphoviridae. +XX +RN [1] +RP 1-1635 +RX MEDLINE; 85237525. +RA Miller G., Feiss M.; +RT "Sequence of the left end of phage 21 DNA"; +RL J. Mol. Biol. 183:246-249(1985). +XX +DR SWISS-PROT; P36693; TERL_BPP21. +DR SWISS-PROT; P36694; TERS_BPP21. +XX +CC Data kindly reviewed (06-MAR-1986) by M. Feiss +XX +FH Key Location/Qualifiers +FH +FT source 1..1635 +FT /db_xref="taxon:10743" +FT /organism="Bacteriophage 21" +FT misc_feature 1..177 +FT /note="bacteriophage 21 cos segment" +FT misc_feature complement(31..41) +FT /note="integrative host factor (IHF) binding sequence 1" +FT misc_feature 75..85 +FT /note="IHF binding sequence 2" +FT misc_feature 175..185 +FT /note="IHF binding sequence 3" +FT RBS 178..181 +FT /note="pot. 
SD-sequence" +FT CDS 189..737 +FT /db_xref="SWISS-PROT:P36694" +FT /note="gp 1 (aa 1-182)" +FT /transl_table=11 +FT /protein_id="CAA26342.1" +FT /translation="MKVNKKRLAEIFNVDPRTIERWQSQGLPCASKGSKGIESVFDTAM +FT AIQWYAQRETDIENEKLRKELDDLRAAAESDLQPGTIDYERYRLTKAQADAQELKNARE +FT DGVVLETELFTFILQRVAQEISGILVRVPLTLQRKYPDISPSHLDVVKTEIAKASNVAA +FT KAGENVGGWIDDFRRAEGS" +FT RBS 699..702 +FT /note="pot. SD-sequence" +FT CDS 709..>1635 +FT /db_xref="SWISS-PROT:P36693" +FT /note="gp 2 (aa 1-309)" +FT /transl_table=11 +FT /protein_id="CAA26343.1" +FT /translation="MISDAQKAANAAGAIATGLLSLIIPVPLTTVQWANKHYYLPKESS +FT YTPGRWETLPFQVGIMNCMGNDLIRTVNLIKSARVGYTKMLLGVEAYFIEHKSRNSLLF +FT QPTDSAAEDFMKSHVEPTIRDVPALLELAPWFGRKHRDNTLTLKRFSSGVGFWCLGGAA +FT AKNYREKSVDVVCYDELSSFEPDVEKEGSPTLLGDKRIEGSVWPKSIRGSTPKIKGSCQ +FT IEKAANESAHFMRFYVPCPHCGEEQYLKFGDDASPFGLKWEKNKPESVFYLCEHHGCVI +FT HQSELDQSNGRWICENTGMWTRDGLMFF" +XX +SQ Sequence 1635 BP; 411 A; 356 C; 436 G; 432 T; 0 other; + gggcggcgac ctcgcggttt ttcactattt atgaaaattt ttcagggaaa atcgtgtcgg 60 + tacttctcga atataacttt ttgttttttt taatattgca tccgtaaagg tccgacatga 120 + aagtgtccga aaatgccttt ttctggcgtt ttcatgtcgg gccttgtatt tgataatggg 180 + ttgttttcat gaaggttaat aaaaagaggc ttgccgaaat tttcaacgtg gacccgcgga 240 + cgattgaacg ctggcagtct cagggactcc cttgcgcctc caaaggtagt aagggcattg 300 + aatctgtatt tgatactgcc atggcaattc agtggtatgc gcagagggaa actgatatcg 360 + aaaacgaaaa gctccgcaaa gaactggacg atttgcgtgc ggcagcggag tcagatttac 420 + aacccggcac cattgactat gaacgctacc ggctcacaaa agcgcaggca gatgcgcagg 480 + aactgaaaaa tgcccgtgaa gacggagtag tgctggaaac tgaactgttt accttcattc 540 + tgcaacgtgt ggcacaggag atttcgggga tacttgtgcg tgtgccgttg acattacagc 600 + gtaaatatcc ggacatttca ccatcacacc ttgatgtggt gaaaactgaa atcgcgaaag 660 + cctccaatgt tgcagctaag gccggtgaaa acgtgggcgg gtggatcgat gatttcagac 720 + gcgcagaagg cagctaatgc agccggtgcg atagctacag ggcttttatc tctcattatt 780 + cctgttccac tgacgacagt tcagtgggcc aataaacatt attaccttcc taaagagtcg 840 + tcttataccc cggggcgatg ggaaacactg 
ccgtttcagg ttggcatcat gaactgtatg 900 + ggcaacgatc tgattcgcac ggttaacctg attaaatctg cccgtgttgg ttatacaaag 960 + atgttgctgg gagtggaggc ttattttatt gagcataaat cacgcaacag ccttcttttt 1020 + cagcccacgg actcagctgc tgaagatttt atgaaatctc atgttgagcc aacgataagg 1080 + gatgttcctg cattgctgga gctggctcca tggttcggaa gaaaacaccg cgataatacg 1140 + ctcaccctga agcgtttttc ctccggtgtg gggttctggt gtctgggtgg tgcggcagca 1200 + aaaaactacc gtgaaaaatc cgtggatgtg gtctgttatg acgagctttc ctcgttcgaa 1260 + ccggatgttg aaaaagaggg ttcgccaacc ctgctggggg ataaacgtat tgagggctct 1320 + gtatggccaa aatccattcg cggctcgacg ccaaaaatca aaggctcctg tcagatcgaa 1380 + aaagccgcta acgagtcggc acacttcatg cgtttttatg tgccctgtcc gcactgtggg 1440 + gaggagcagt atctgaaatt tggcgatgat gcctcgcctt tcggtcttaa gtgggagaag 1500 + aataagccag aaagtgtttt ctacctttgc gagcatcatg gctgtgtgat ccatcagtct 1560 + gagcttgacc agagtaacgg gcggtggatc tgtgaaaaca cgggcatgtg gacccgtgac 1620 + ggcctgatgt ttttc 1635 +// +ID XXPHI80 standard; DNA; PHG; 233 BP. +XX +AC X01639; +XX +SV X01639.1 +XX +DT 02-JUL-1986 (Rel. 09, Created) +DT 02-JUL-1986 (Rel. 09, Last updated, Version 1) +XX +DE Bacteriophage phi 80 DNA-fragment with replication origin +XX +KW origin of replication. +XX +OS Bacteriophage phi-80 +OC Viruses; dsDNA viruses, no RNA stage; Tailed phages; Siphoviridae; +OC Lambda phage group; bacteriophage lambda. +XX +RN [1] +RP 1-233 +RX MEDLINE; 79135017. +RA Grosschedl R., Hobom G.; +RT "DNA sequences and structural homologies of the replication origins of +RT lambdoid bacteriophages"; +RL Nature 277:621-627(1979). +XX +FH Key Location/Qualifiers +FH +FT source 1..233 +FT /db_xref="taxon:10713" +FT /organism="Bacteriophage phi-80" +FT rep_origin 40..187 +FT /note="origin of replication of phi 21" +FT misc_feature 40..128 +FT /note="pot. 
binding site for initiator protein" +FT repeat_region 50..55 +FT /note="multiple repeated sequence I" +FT misc_feature 61..66 +FT /note="inverted repeat of sequence I" +FT repeat_region 71..76 +FT /note="direct repeat of I" +FT misc_feature 82..87 +FT /note="inverted repeat of I" +FT repeat_region 92..97 +FT /note="direct repeat of I" +FT misc_feature 103..108 +FT /note="inverted repeat of I" +FT repeat_region 113..117 +FT /note="imp. direct repeat of I" +FT misc_feature 129..155 +FT /note="pot. region of replicational primer start site" +FT misc_feature 156..187 +FT /note="pot. binding site for initiator protein" +XX +SQ Sequence 233 BP; 91 A; 51 C; 48 G; 43 T; 0 other; + ggaccaaata aaaacatctc agaatggtgc atcctcaaaa cgagggaaaa tcccctaaaa 60 + cgagggataa aacatccctc aaattggggg attgctatcc ctcaaaacag ggggacacaa 120 + aagacactat tacaaaagaa aaaagaaaag attattcgtc agagaattct ggcgaatcct 180 + ctgaccagcc agaaaacgac ctttctgtgg tgaaaccgga tgctgcaatt cag 233 +// diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/fasta b/forester/archive/RIO/others/hmmer/squid/Formats/fasta new file mode 100644 index 0000000..783cf51 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/fasta @@ -0,0 +1,26 @@ +>AC3.1 CE05131 (CAMBRIDGE) +MAQTLLSRHFRGVNCYFIFLNFAGWLMDLHLSTFMQFIPLFPVFGGYCTGLLTQIFRIDDSFQTTYTAFTICLVASALNS +CFVRKHQAISKISSKYLLDNVTYCIVIFLLNIYPVIAASLLYLSMLNKSEQVELVKSVYPNLVDKFASLPNYVVFDSNIW +AIVFFAFIFFGCTYTLVLIVTTTYQMFKILDDNRKHISASNYAKHRATLRSLLAQFTTCFLIVGPASLFSLLVVIRYEHS +QVATHWTIVALTLHSSANAIIIPLRIISISTVYSPEYKNSNGAKYCSNNNNPLTSNFPSFKLVNKLFRLLMILFYYFKLK +VPDLLSFCDFTHFDNPESKFYKIHIAKIRLNCSLIF +>AC3.2 CE05132 UDP-GLUCURONOSYLTRANSFERASE (CAMBRIDGE) +MLHFLSVLRSEETNFLKISKLKKLKTCILNFSIKYGLFEFVKVNHQISILGMYTFLFLLLSLLAVDAGKILVYSPSISRS +HLISNGRIADALVDAGHDVVMFITEYEPLTEFTGTKKAKVITMKGFSTKFAEDMDGIGEYLLSSSRLSFLERLMFEKTCT +GACDDLMTRREELEQLRAYNFDVAFSEQIDLCGVGIVRYLGIKNHLWISTTPIMDAVSYNLGIPAPSSYVPTIEENDNGD +KMDFWQRTFNLYMKIGSILIHRYGTDGTTEVFRKYIPDFPNVREIAANSSLCFVNSDEVLDLPRPTITKAIYVGGLGIPK 
+VSKPLDKKFTNIMSKGKEGVVIISLGSIIPFGDLPAAAKEGVLRAIQEISDYHFLIKIAKGDNNTKKLVEGIKNVDVAEW +LPQVDILSHPRLKLFVMHGGINGLVETAIQAVPTVIVPVFADQFRNGRMVEKRGIGKVLLKLDIGYESFKNTVLTVLNTP +SYKKNAIRIGKMMRDKPFSPEERLTKWTQFAIDHGVLEELHVEGSRLNTIIYYNLDVIAFVLFVFVAVLHVFIYAFKFLC +CDCYDLISYSSPSSCSFSSILVYSPSISRSHLISNGRIADALVDAGHDVVMFITEYEPLTEFTGTKKAKVRSTMIIQWTI +LGSTLLLIQEQIFWKGLCTKNGSLIFVMVICFKILNTKSNILNLDLMARREELEQLRAYNFDVAFSEQIDLCGVRIVRYL +GIKNHLWISTTPIMDAVSYNLGIPAPSSYVPTIEENDNGDKMDFWQRTFSLYMKIGAILIHRYATDSTTEVFRKYIPDFP +NVREIAANSSLCFVNSDEVLDLPRLTITKTIYVGGLGTPNISQHLDNVFAKIMSKGKRGVIIISLGSFVQFGDFPVNIKK +EVFRAISELSEYHFLIKISKDDTNTKTLTKEISNVDLVHWFPQVDLLSNPRLKLFIMHGGINGLVEKFF +>AC3.3 CE05133 (CAMBRIDGE) +MRFIAIAALIASSVLLAEATTIRDKRQSCGCAPRVQPSCSCQRTTYTQPQQYSCSCQNTAPVQKSCSCAQPVQQQTYQIQ +ASQCAPACQQSCQNQCQSAPSVSQCQSTCQQSCQTSSCYTPTTPAPVQCQPSCMPACEQSCVVQTPAPVQCVPQCQQQCQ +QQCVQTQPIQQCQPQCQQQCVQQCAPTTTAAPQIIKINMEISAQCVPQCQQSCQQQCVQQQVPAQQCNQQCTQQCQTTCQ +QAVPQCQQQCAPQCQQPSAPQCQQCQNTCQQAAPVCQQQCAPQCQQQSAPACQQCQTSCQQTQQCQQQCTPQCQQPSAPQ +CQQCQSACQAPVATTAAPQVVTIILEASVSQSAQCEPQCQQSCQQQCVQQQQPMQQCAPACTQSCSQSCSAAQPAQMPCQ +TQSVNSCSCQQNYSPCGNGQCCKRK diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/formattest.pl b/forester/archive/RIO/others/hmmer/squid/Formats/formattest.pl new file mode 100755 index 0000000..9eafaac --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/formattest.pl @@ -0,0 +1,97 @@ +#! /usr/local/bin/perl + +$binpath = shift; + +# Suck in the regression data on our file format test suite. 
+# + +print "Format test suite...\t"; + +open(DAT,"regression.dat") || die "failed to open regression.dat"; +$nfiles = 0; +while (<DAT>) { # read regression.dat one record per line + if (/^\#/) { next; } + if (/^(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)/) { + $filename[$nfiles] = $1; + $format[$nfiles] = $2; + $seqtype[$nfiles] = $3; + $nseq[$nfiles] = $4; + $nres[$nfiles] = $5; + $shortest[$nfiles] = $6; + $longest[$nfiles] = $7; + if ($8 eq "yes") { $autodetect[$nfiles] = 1; } else { $autodetect[$nfiles] = 0; } + if ($9 eq "yes") { $is_alignment[$nfiles] = 1; } else { $is_alignment[$nfiles] = 0; } + if ($10 eq "yes") { $is_singleseq[$nfiles] = 1; } else { $is_singleseq[$nfiles] = 0; } + $nfiles++; + } +} +close(DAT); + +# Test 1. +# Run seqstat on every file in two modes; +# autodetecting (if format allows it), then forcing a format with --informat. +# +for ($i = 0; $i < $nfiles; $i++) { + if ($autodetect[$i]) { + $output = `$binpath/seqstat $filename[$i]`; + if ($? != 0) { die "seqstat failed, autodetecting, on $filename[$i]"; } + ($ns, $nr, $fr, $to) = &parse_seqstat($output); + if ($ns != $nseq[$i] || + $nr != $nres[$i] || + $fr != $shortest[$i] || + $to != $longest[$i]) + { die "seqstat regression failed, autodetecting, on $filename[$i]"; } + } + $output = `$binpath/seqstat --informat $format[$i] $filename[$i]`; + if ($? != 0) { die "seqstat failed, using --informat, on $filename[$i]"; } + ($ns, $nr, $fr, $to) = &parse_seqstat($output); + if ($ns != $nseq[$i] || + $nr != $nres[$i] || + $fr != $shortest[$i] || + $to != $longest[$i]) + { die "seqstat regression failed, using --informat, on $filename[$i]"; } +} + +# Test 2. +# Reformatting tests. +# +for ($i = 0; $i < $nfiles; $i++) { + for ($j = 0; $j < $nfiles; $j++) { + if (! $is_alignment[$i] && $is_alignment[$j]) { next; } # can't convert unaligned to aligned + if (!
$is_singleseq[$i] && $is_singleseq[$j]) { next; } # can't convert multiple seqs to single seq format + + `$binpath/sreformat --informat $format[$i] $format[$j] $filename[$i] > formattest.tmp`; + if ($? != 0) { die "sreformat failed ($format[$i] to $format[$j]) on $filename[$i]"; } + $output = `$binpath/seqstat --informat $format[$j] formattest.tmp`; + if ($? != 0) { die "seqstat failed after sreformat ($format[$i] to $format[$j]) on $filename[$i]"; } + ($ns, $nr, $fr, $to) = &parse_seqstat($output); + if ($ns != $nseq[$i] || + $nr != $nres[$i] || + $fr != $shortest[$i] || + $to != $longest[$i]) + { die "seqstat regression failed after sreformat ($format[$i] to $format[$j]) on $filename[$i]"; } + } +} + +print "passed.\n"; +unlink "formattest.tmp"; + + +# Function: parse_seqstat(output) +# +# Parses the text output of 'seqstat'. Returns a list of four values: +# number of sequences, total residues, smallest and largest sequence length. +# A value is left undefined if its line is not found in the output. +# +sub parse_seqstat { + local($output) = shift; + my ($nseq, $nres, $fromlen, $tolen); + + if ($output =~ /Number of sequences:\s+(\d+)/) {$nseq = $1; } + if ($output =~ /Total # residues:\s+(\d+)/) {$nres = $1; } + if ($output =~ /Smallest:\s+(\d+)/) {$fromlen = $1; } + if ($output =~ /Largest:\s+(\d+)/) {$tolen = $1; } + ($nseq, $nres, $fromlen, $tolen); +} + + diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/gcg b/forester/archive/RIO/others/hmmer/squid/Formats/gcg new file mode 100644 index 0000000..0affa1f --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/gcg @@ -0,0 +1,397 @@ + Free format documentation can precede a GCG file. + In principle it's possible for this documentation to confuse + Babelfish autodetection, if it looks like the header of + a different sequence file format. + +HIVHXB2CG Length: 9718 March 10, 1993 14:11 Type: N Check: 1730 ..
+ + 1 TGGAAGGGCT AATTCACTCC CAACGAAGAC AAGATATCCT TGATCTGTGG + + 51 ATCTACCACA CACAAGGCTA CTTCCCTGAT TAGCAGAACT ACACACCAGG + + 101 GCCAGGGATC AGATATCCAC TGACCTTTGG ATGGTGCTAC AAGCTAGTAC + + 151 CAGTTGAGCC AGAGAAGTTA GAAGAAGCCA ACAAAGGAGA GAACACCAGC + + 201 TTGTTACACC CTGTGAGCCT GCATGGAATG GATGACCCGG AGAGAGAAGT + + 251 GTTAGAGTGG AGGTTTGACA GCCGCCTAGC ATTTCATCAC ATGGCCCGAG + + 301 AGCTGCATCC GGAGTACTTC AAGAACTGCT GACATCGAGC TTGCTACAAG + + 351 GGACTTTCCG CTGGGGACTT TCCAGGGAGG CGTGGCCTGG GCGGGACTGG + + 401 GGAGTGGCGA GCCCTCAGAT CCTGCATATA AGCAGCTGCT TTTTGCCTGT + + 451 ACTGGGTCTC TCTGGTTAGA CCAGATCTGA GCCTGGGAGC TCTCTGGCTA + + 501 ACTAGGGAAC CCACTGCTTA AGCCTCAATA AAGCTTGCCT TGAGTGCTTC + + 551 AAGTAGTGTG TGCCCGTCTG TTGTGTGACT CTGGTAACTA GAGATCCCTC + + 601 AGACCCTTTT AGTCAGTGTG GAAAATCTCT AGCAGTGGCG CCCGAACAGG + + 651 GACCTGAAAG CGAAAGGGAA ACCAGAGCTC TCTCGACGCA GGACTCGGCT + + 701 TGCTGAAGCG CCCGCACGGC AAGAGGCGAG GGGCGGCGAC TGGTGAGTAC + + 751 GCCAAAAATT TTGACTAGCG GAGGCTAGAA GGAGAGAGAT GGGTGCGAGA + + 801 GCGTCAGTAT TAAGCGGGGG AGAATTAGAT CGATGGGAAA AAATTCGGTT + + 851 AAGGCCAGGG GGAAAGAAAA AATATAAATT AAAACATATA GTATGGGCAA + + 901 GCAGGGAGCT AGAACGATTC GCAGTTAATC CTGGCCTGTT AGAAACATCA + + 951 GAAGGCTGTA GACAAATACT GGGACAGCTA CAACCATCCC TTCAGACAGG + + 1001 ATCAGAAGAA CTTAGATCAT TATATAATAC AGTAGCAACC CTCTATTGTG + + 1051 TGCATCAAAG GATAGAGATA AAAGACACCA AGGAAGCTTT AGACAAGATA + + 1101 GAGGAAGAGC AAAACAAAAG TAAGAAAAAA GCACAGCAAG CAGCAGCTGA + + 1151 CACAGGACAC AGCAATCAGG TCAGCCAAAA TTACCCTATA GTGCAGAACA + + 1201 TCCAGGGGCA AATGGTACAT CAGGCCATAT CACCTAGAAC TTTAAATGCA + + 1251 TGGGTAAAAG TAGTAGAAGA GAAGGCTTTC AGCCCAGAAG TGATACCCAT + + 1301 GTTTTCAGCA TTATCAGAAG GAGCCACCCC ACAAGATTTA AACACCATGC + + 1351 TAAACACAGT GGGGGGACAT CAAGCAGCCA TGCAAATGTT AAAAGAGACC + + 1401 ATCAATGAGG AAGCTGCAGA ATGGGATAGA GTGCATCCAG TGCATGCAGG + + 1451 GCCTATTGCA CCAGGCCAGA TGAGAGAACC AAGGGGAAGT GACATAGCAG + + 1501 GAACTACTAG TACCCTTCAG GAACAAATAG GATGGATGAC AAATAATCCA + + 1551 CCTATCCCAG TAGGAGAAAT 
TTATAAAAGA TGGATAATCC TGGGATTAAA + + 1601 TAAAATAGTA AGAATGTATA GCCCTACCAG CATTCTGGAC ATAAGACAAG + + 1651 GACCAAAGGA ACCCTTTAGA GACTATGTAG ACCGGTTCTA TAAAACTCTA + + 1701 AGAGCCGAGC AAGCTTCACA GGAGGTAAAA AATTGGATGA CAGAAACCTT + + 1751 GTTGGTCCAA AATGCGAACC CAGATTGTAA GACTATTTTA AAAGCATTGG + + 1801 GACCAGCGGC TACACTAGAA GAAATGATGA CAGCATGTCA GGGAGTAGGA + + 1851 GGACCCGGCC ATAAGGCAAG AGTTTTGGCT GAAGCAATGA GCCAAGTAAC + + 1901 AAATTCAGCT ACCATAATGA TGCAGAGAGG CAATTTTAGG AACCAAAGAA + + 1951 AGATTGTTAA GTGTTTCAAT TGTGGCAAAG AAGGGCACAC AGCCAGAAAT + + 2001 TGCAGGGCCC CTAGGAAAAA GGGCTGTTGG AAATGTGGAA AGGAAGGACA + + 2051 CCAAATGAAA GATTGTACTG AGAGACAGGC TAATTTTTTA GGGAAGATCT + + 2101 GGCCTTCCTA CAAGGGAAGG CCAGGGAATT TTCTTCAGAG CAGACCAGAG + + 2151 CCAACAGCCC CACCAGAAGA GAGCTTCAGG TCTGGGGTAG AGACAACAAC + + 2201 TCCCCCTCAG AAGCAGGAGC CGATAGACAA GGAACTGTAT CCTTTAACTT + + 2251 CCCTCAGGTC ACTCTTTGGC AACGACCCCT CGTCACAATA AAGATAGGGG + + 2301 GGCAACTAAA GGAAGCTCTA TTAGATACAG GAGCAGATGA TACAGTATTA + + 2351 GAAGAAATGA GTTTGCCAGG AAGATGGAAA CCAAAAATGA TAGGGGGAAT + + 2401 TGGAGGTTTT ATCAAAGTAA GACAGTATGA TCAGATACTC ATAGAAATCT + + 2451 GTGGACATAA AGCTATAGGT ACAGTATTAG TAGGACCTAC ACCTGTCAAC + + 2501 ATAATTGGAA GAAATCTGTT GACTCAGATT GGTTGCACTT TAAATTTTCC + + 2551 CATTAGCCCT ATTGAGACTG TACCAGTAAA ATTAAAGCCA GGAATGGATG + + 2601 GCCCAAAAGT TAAACAATGG CCATTGACAG AAGAAAAAAT AAAAGCATTA + + 2651 GTAGAAATTT GTACAGAGAT GGAAAAGGAA GGGAAAATTT CAAAAATTGG + + 2701 GCCTGAAAAT CCATACAATA CTCCAGTATT TGCCATAAAG AAAAAAGACA + + 2751 GTACTAAATG GAGAAAATTA GTAGATTTCA GAGAACTTAA TAAGAGAACT + + 2801 CAAGACTTCT GGGAAGTTCA ATTAGGAATA CCACATCCCG CAGGGTTAAA + + 2851 AAAGAAAAAA TCAGTAACAG TACTGGATGT GGGTGATGCA TATTTTTCAG + + 2901 TTCCCTTAGA TGAAGACTTC AGGAAGTATA CTGCATTTAC CATACCTAGT + + 2951 ATAAACAATG AGACACCAGG GATTAGATAT CAGTACAATG TGCTTCCACA + + 3001 GGGATGGAAA GGATCACCAG CAATATTCCA AAGTAGCATG ACAAAAATCT + + 3051 TAGAGCCTTT TAGAAAACAA AATCCAGACA TAGTTATCTA TCAATACATG + + 3101 GATGATTTGT ATGTAGGATC TGACTTAGAA 
ATAGGGCAGC ATAGAACAAA + + 3151 AATAGAGGAG CTGAGACAAC ATCTGTTGAG GTGGGGACTT ACCACACCAG + + 3201 ACAAAAAACA TCAGAAAGAA CCTCCATTCC TTTGGATGGG TTATGAACTC + + 3251 CATCCTGATA AATGGACAGT ACAGCCTATA GTGCTGCCAG AAAAAGACAG + + 3301 CTGGACTGTC AATGACATAC AGAAGTTAGT GGGGAAATTG AATTGGGCAA + + 3351 GTCAGATTTA CCCAGGGATT AAAGTAAGGC AATTATGTAA ACTCCTTAGA + + 3401 GGAACCAAAG CACTAACAGA AGTAATACCA CTAACAGAAG AAGCAGAGCT + + 3451 AGAACTGGCA GAAAACAGAG AGATTCTAAA AGAACCAGTA CATGGAGTGT + + 3501 ATTATGACCC ATCAAAAGAC TTAATAGCAG AAATACAGAA GCAGGGGCAA + + 3551 GGCCAATGGA CATATCAAAT TTATCAAGAG CCATTTAAAA ATCTGAAAAC + + 3601 AGGAAAATAT GCAAGAATGA GGGGTGCCCA CACTAATGAT GTAAAACAAT + + 3651 TAACAGAGGC AGTGCAAAAA ATAACCACAG AAAGCATAGT AATATGGGGA + + 3701 AAGACTCCTA AATTTAAACT GCCCATACAA AAGGAAACAT GGGAAACATG + + 3751 GTGGACAGAG TATTGGCAAG CCACCTGGAT TCCTGAGTGG GAGTTTGTTA + + 3801 ATACCCCTCC CTTAGTGAAA TTATGGTACC AGTTAGAGAA AGAACCCATA + + 3851 GTAGGAGCAG AAACCTTCTA TGTAGATGGG GCAGCTAACA GGGAGACTAA + + 3901 ATTAGGAAAA GCAGGATATG TTACTAATAG AGGAAGACAA AAAGTTGTCA + + 3951 CCCTAACTGA CACAACAAAT CAGAAGACTG AGTTACAAGC AATTTATCTA + + 4001 GCTTTGCAGG ATTCGGGATT AGAAGTAAAC ATAGTAACAG ACTCACAATA + + 4051 TGCATTAGGA ATCATTCAAG CACAACCAGA TCAAAGTGAA TCAGAGTTAG + + 4101 TCAATCAAAT AATAGAGCAG TTAATAAAAA AGGAAAAGGT CTATCTGGCA + + 4151 TGGGTACCAG CACACAAAGG AATTGGAGGA AATGAACAAG TAGATAAATT + + 4201 AGTCAGTGCT GGAATCAGGA AAGTACTATT TTTAGATGGA ATAGATAAGG + + 4251 CCCAAGATGA ACATGAGAAA TATCACAGTA ATTGGAGAGC AATGGCTAGT + + 4301 GATTTTAACC TGCCACCTGT AGTAGCAAAA GAAATAGTAG CCAGCTGTGA + + 4351 TAAATGTCAG CTAAAAGGAG AAGCCATGCA TGGACAAGTA GACTGTAGTC + + 4401 CAGGAATATG GCAACTAGAT TGTACACATT TAGAAGGAAA AGTTATCCTG + + 4451 GTAGCAGTTC ATGTAGCCAG TGGATATATA GAAGCAGAAG TTATTCCAGC + + 4501 AGAAACAGGG CAGGAAACAG CATATTTTCT TTTAAAATTA GCAGGAAGAT + + 4551 GGCCAGTAAA AACAATACAT ACTGACAATG GCAGCAATTT CACCGGTGCT + + 4601 ACGGTTAGGG CCGCCTGTTG GTGGGCGGGA ATCAAGCAGG AATTTGGAAT + + 4651 TCCCTACAAT CCCCAAAGTC AAGGAGTAGT AGAATCTATG 
AATAAAGAAT + + 4701 TAAAGAAAAT TATAGGACAG GTAAGAGATC AGGCTGAACA TCTTAAGACA + + 4751 GCAGTACAAA TGGCAGTATT CATCCACAAT TTTAAAAGAA AAGGGGGGAT + + 4801 TGGGGGGTAC AGTGCAGGGG AAAGAATAGT AGACATAATA GCAACAGACA + + 4851 TACAAACTAA AGAATTACAA AAACAAATTA CAAAAATTCA AAATTTTCGG + + 4901 GTTTATTACA GGGACAGCAG AAATTCACTT TGGAAAGGAC CAGCAAAGCT + + 4951 CCTCTGGAAA GGTGAAGGGG CAGTAGTAAT ACAAGATAAT AGTGACATAA + + 5001 AAGTAGTGCC AAGAAGAAAA GCAAAGATCA TTAGGGATTA TGGAAAACAG + + 5051 ATGGCAGGTG ATGATTGTGT GGCAAGTAGA CAGGATGAGG ATTAGAACAT + + 5101 GGAAAAGTTT AGTAAAACAC CATATGTATG TTTCAGGGAA AGCTAGGGGA + + 5151 TGGTTTTATA GACATCACTA TGAAAGCCCT CATCCAAGAA TAAGTTCAGA + + 5201 AGTACACATC CCACTAGGGG ATGCTAGATT GGTAATAACA ACATATTGGG + + 5251 GTCTGCATAC AGGAGAAAGA GACTGGCATT TGGGTCAGGG AGTCTCCATA + + 5301 GAATGGAGGA AAAAGAGATA TAGCACACAA GTAGACCCTG AACTAGCAGA + + 5351 CCAACTAATT CATCTGTATT ACTTTGACTG TTTTTCAGAC TCTGCTATAA + + 5401 GAAAGGCCTT ATTAGGACAC ATAGTTAGCC CTAGGTGTGA ATATCAAGCA + + 5451 GGACATAACA AGGTAGGATC TCTACAATAC TTGGCACTAG CAGCATTAAT + + 5501 AACACCAAAA AAGATAAAGC CACCTTTGCC TAGTGTTACG AAACTGACAG + + 5551 AGGATAGATG GAACAAGCCC CAGAAGACCA AGGGCCACAG AGGGAGCCAC + + 5601 ACAATGAATG GACACTAGAG CTTTTAGAGG AGCTTAAGAA TGAAGCTGTT + + 5651 AGACATTTTC CTAGGATTTG GCTCCATGGC TTAGGGCAAC ATATCTATGA + + 5701 AACTTATGGG GATACTTGGG CAGGAGTGGA AGCCATAATA AGAATTCTGC + + 5751 AACAACTGCT GTTTATCCAT TTTCAGAATT GGGTGTCGAC ATAGCAGAAT + + 5801 AGGCGTTACT CGACAGAGGA GAGCAAGAAA TGGAGCCAGT AGATCCTAGA + + 5851 CTAGAGCCCT GGAAGCATCC AGGAAGTCAG CCTAAAACTG CTTGTACCAA + + 5901 TTGCTATTGT AAAAAGTGTT GCTTTCATTG CCAAGTTTGT TTCATAACAA + + 5951 AAGCCTTAGG CATCTCCTAT GGCAGGAAGA AGCGGAGACA GCGACGAAGA + + 6001 GCTCATCAGA ACAGTCAGAC TCATCAAGCT TCTCTATCAA AGCAGTAAGT + + 6051 AGTACATGTA ACGCAACCTA TACCAATAGT AGCAATAGTA GCATTAGTAG + + 6101 TAGCAATAAT AATAGCAATA GTTGTGTGGT CCATAGTAAT CATAGAATAT + + 6151 AGGAAAATAT TAAGACAAAG AAAAATAGAC AGGTTAATTG ATAGACTAAT + + 6201 AGAAAGAGCA GAAGACAGTG GCAATGAGAG TGAAGGAGAA ATATCAGCAC + + 
6251 TTGTGGAGAT GGGGGTGGAG ATGGGGCACC ATGCTCCTTG GGATGTTGAT + + 6301 GATCTGTAGT GCTACAGAAA AATTGTGGGT CACAGTCTAT TATGGGGTAC + + 6351 CTGTGTGGAA GGAAGCAACC ACCACTCTAT TTTGTGCATC AGATGCTAAA + + 6401 GCATATGATA CAGAGGTACA TAATGTTTGG GCCACACATG CCTGTGTACC + + 6451 CACAGACCCC AACCCACAAG AAGTAGTATT GGTAAATGTG ACAGAAAATT + + 6501 TTGACATGTG GAAAAATGAC ATGGTAGAAC AGATGCATGA GGATATAATC + + 6551 AGTTTATGGG ATCAAAGCCT AAAGCCATGT GTAAAATTAA CCCCACTCTG + + 6601 TGTTAGTTTA AAGTGCACTG ATTTGAAGAA TGATACTAAT ACCAATAGTA + + 6651 GTAGCGGGAG AATGATAATG GAGAAAGGAG AGATAAAAAA CTGCTCTTTC + + 6701 AATATCAGCA CAAGCATAAG AGGTAAGGTG CAGAAAGAAT ATGCATTTTT + + 6751 TTATAAACTT GATATAATAC CAATAGATAA TGATACTACC AGCTATAGCT + + 6801 TGACAAGTTG TAACACCTCA GTCATTACAC AGGCCTGTCC AAAGGTATCC + + 6851 TTTGAGCCAA TTCCCATACA TTATTGTGCC CCGGCTGGTT TTGCGATTCT + + 6901 AAAATGTAAT AATAAGACGT TCAATGGAAC AGGACCATGT ACAAATGTCA + + 6951 GCACAGTACA ATGTACACAT GGAATTAGGC CAGTAGTATC AACTCAACTG + + 7001 CTGTTAAATG GCAGTCTAGC AGAAGAAGAG GTAGTAATTA GATCTGTCAA + + 7051 TTTCACGGAC AATGCTAAAA CCATAATAGT ACAGCTGAAC ACATCTGTAG + + 7101 AAATTAATTG TACAAGACCC AACAACAATA CAAGAAAAAG AATCCGTATC + + 7151 CAGAGAGGAC CAGGGAGAGC ATTTGTTACA ATAGGAAAAA TAGGAAATAT + + 7201 GAGACAAGCA CATTGTAACA TTAGTAGAGC AAAATGGAAT AACACTTTAA + + 7251 AACAGATAGA TAGCAAATTA AGAGAACAAT TCGGAAATAA TAAAACAATA + + 7301 ATCTTTAAGC AATCCTCAGG AGGGGACCCA GAAATTGTAA CGCACAGTTT + + 7351 TAATTGTGGA GGGGAATTTT TCTACTGTAA TTCAACACAA CTGTTTAATA + + 7401 GTACTTGGTT TAATAGTACT TGGAGTACTG AAGGGTCAAA TAACACTGAA + + 7451 GGAAGTGACA CAATCACCCT CCCATGCAGA ATAAAACAAA TTATAAACAT + + 7501 GTGGCAGAAA GTAGGAAAAG CAATGTATGC CCCTCCCATC AGTGGACAAA + + 7551 TTAGATGTTC ATCAAATATT ACAGGGCTGC TATTAACAAG AGATGGTGGT + + 7601 AATAGCAACA ATGAGTCCGA GATCTTCAGA CTTGGAGGAG GAGATATGAG + + 7651 GGACAATTGG AGAAGTGAAT TATATAAATA TAAAGTAGTA AAAATTGAAC + + 7701 CATTAGGAGT AGCACCCACC AAGGCAAAGA GAAGAGTGGT GCAGAGAGAA + + 7751 AAAAGAGCAG TGGGAATAGG AGCTTTGTTC CTTGGGTTCT TGGGAGCAGC + + 7801 AGGAAGCACT 
ATGGGCGCAG CCTCAATGAC GCTGACGGTA CAGGCCAGAC + + 7851 AATTATTGTC TGGTATAGTG CAGCAGCAGA ACAATTTGCT GAGGGCTATT + + 7901 GAGGCGCAAC AGCATCTGTT GCAACTCACA GTCTGGGGCA TCAAGCAGCT + + 7951 CCAAGCAAGA ATCCTAGCTG TGGAAAGATA CCTAAAGGAT CAACAGCTCC + + 8001 TAGGGATTTG GGGTTGCTCT GGAAAACTCA TTTGCACCAC TGCTGTGCCT + + 8051 TGGAATGCTA GTTGGAGTAA TAAATCTCTG GAACAGATCT GGAATCACAC + + 8101 GACCTGGATG GAGTGGGACA GAGAAATTAA CAATTACACA AGCTTAATAC + + 8151 ACTCCTTAAT TGAAGAATCG CAAAACCAGC AAGAAAAGAA TGAACAAGAA + + 8201 TTATTGGAAT TAGATAAATG GGCAAGTTTG TGGAATTGGT TTAACATAAC + + 8251 AAATTGGCTG TGGTATATAA AATTATTCAT AATGATAGTA GGAGGCTTGG + + 8301 TAGGTTTAAG AATAGTTTTT GCTGTACTTT CTATAGTGAA TAGAGTTAGG + + 8351 CAGGGATATT CACCATTATC GTTTCAGACC CACCTCCCAA TCCCGAGGGG + + 8401 ACCCGACAGG CCCGAAGGAA TAGAAGAAGA AGGTGGAGAG AGAGACAGAG + + 8451 ACAGATCCAT TCGATTAGTG AACGGATCCT TGGCACTTAT CTGGGACGAT + + 8501 CTGCGGAGCC TGTGCCTCTT CAGCTACCAC CGCTTGAGAG ACTTACTCTT + + 8551 GATTGTAACG AGGATTGTGG AACTTCTGGG ACGCAGGGGG TGGGAAGCCC + + 8601 TCAAATATTG GTGGAATCTC CTACAGTATT GGAGTCAGGA ACTAAAGAAT + + 8651 AGTGCTGTTA GCTTGCTCAA TGCCACAGCC ATAGCAGTAG CTGAGGGGAC + + 8701 AGATAGGGTT ATAGAAGTAG TACAAGGAGC TTGTAGAGCT ATTCGCCACA + + 8751 TACCTAGAAG AATAAGACAG GGCTTGGAAA GGATTTTGCT ATAAGATGGG + + 8801 TGGCAAGTGG TCAAAAAGTA GTGTGATTGG ATGGCTTACT GTAAGGGAAA + + 8851 GAATGAGACG AGCTGAGCCA GCAGCAGATG GGGTGGGAGC AGCATCTCGA + + 8901 GACCTGGAAA AACATGGAGC AATCACAAGT AGCAACACAG CAGCTACCAA + + 8951 TGCTGCTTGT GCCTGGCTAG AAGCACAAGA GGAGGAGGAG GTGGGTTTTC + + 9001 CAGTCACACC TCAGGTACCT TTAAGACCAA TGACTTACAA GGCAGCTGTA + + 9051 GATCTTAGCC ACTTTTTAAA AGAAAAGGGG GGACTGGAAG GGCTAATTCA + + 9101 CTCCCAAAGA AGACAAGATA TCCTTGATCT GTGGATCTAC CACACACAAG + + 9151 GCTACTTCCC TGATTGACAG AACTACACAC CAGGGCCAGG GGTCAGATAT + + 9201 CCACTGACCT TTGGATGGTG CTACAAGCTA GTACCAGTTG AGCCAGATAA + + 9251 GATAGAAGAG GCCAATAAAG GAGAGAACAC CAGCTTGTTA CACCCTGTGA + + 9301 GCCTGCATGG GATGGATGAC CCGGAGAGAG AAGTGTTAGA GTGGAGGTTT + + 9351 GACAGCCGCC TAGCATTTCA 
TCACGTGGCC CGAGAGCTGC ATCCGGAGTA + + 9401 CTTCAAGAAC TGCTGACATC GAGCTTGCTA CAAGGGACTT TCCGCTGGGG + + 9451 ACTTTCCAGG GAGGCGTGGC CTGGGCGGGA CTGGGGAGTG GCGAGCCCTC + + 9501 AGATCCTGCA TATAAGCAGC TGCTTTTTGC CTGTACTGGG TCTCTCTGGT + + 9551 TAGACCAGAT CTGAGCCTGG GAGCTCTCTG GCTAACTAGG GAACCCACTG + + 9601 CTTAAGCCTC AATAAAGCTT GCCTTGAGTG CTTCAAGTAG TGTGTGCCCG + + 9651 TCTGTTGTGT GACTCTGGTA ACTAGAGATC CCTCAGACCC TTTTAGTCAG + + 9701 TGTGGAAAAT CTCTAGCA + diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/gcgdata.1 b/forester/archive/RIO/others/hmmer/squid/Formats/gcgdata.1 new file mode 100644 index 0000000..93c01ff --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/gcgdata.1 @@ -0,0 +1,60 @@ +>>>>104K_THEPA 8/92 ASCII Len: 924 +P15711 theileria parva. 104 kd microneme-rhoptry antigen. 8/92 +MKFLILLFNILCLFPVLAADNHGVGPQGASGVDPITFDINSNQTGPAFLTAVEMAGVKYLQVQHGSNVNIHRLVEGNVVIWENASTPLYTGAIVTNNDGPYMAYVEVLGDPNLQFFIKSGDAWVTLSEHEYLAKLQEIRQAVHIESVFSLNMAFQLENNKYEVETHAKNGANMVTFIPRNGHICKMVYHKNVRIYKATGNDTVTSVVGFFRGLRLLLINVFSIDDNGMMSNRYFQHVDDKYVPISQKNYETGIVKLKDYKHAYHPVDLDIKDIDYTMFHLADATYHEPCFKIIPNTGFCITKLFDGDQVLYESFNPLIHCINEVHIYDRNNGSIICLHLNYSPPSYKAYLVLKDTGWEATTHPLLEEKIEELQDQRACELDVNFISDKDLYVAALTNADLNYTMVTPRPHRDVIRVSDGSEVLWYYEGLDNFLVCAWIYVSDGVASLVHLRIKDRIPANNDIYVLKGDLYWTRITKIQFTQEIKRLVKKSKKKLAPITEEDSDKHDEPPEGPGASGLPPKAPGDKEGSEGHKGPSKGSDSSKEGKKPGSGKKPGPAREHKPSKIPTLSKKPSGPKDPKHPRDPKEPRKSKSPRTASPTRRPSPKLPQLSKLPKSTSPRSPPPPTRPSSPERPEGTKIIKTSKPPSPKPPFDPSFKEKFYDDYSKAASRSKETKTTVVLDESFESILKETLPETPGTPFTTPRPVPPKRPRTPESPFEPPKDPDSPSTSPSEFFTPPESKRTRFHETPADTPLPDVTAELFKEPDVTAETKSPDEAMKRPRSPSEYEDTSPGDYPSLPMKRHRLERLRLTTTEMETDPGRMAKDASGKPVKLKRSKSFDDLTTVELAPEPKASRIVVDDEGTEADDEETHPPEERQKTEVRRRRPPKKPSKSPRPSKPKKPKKPDSAYIPSILAILVVSLIVGIL +>>>>10KD_VIGUN 2/95 ASCII Len: 75 +P18646 vigna unguiculata (cowpea). 10 kd protein precursor (clone psas10). 
2/95 +MEKKSIAGLCFLFLVLFVAQEVVVQSEAKTCENLVDTYRGPCFTTGSCDDHCKNKEHLLSGRCRDDVRCWCTRNC +>>>>110K_PLAKN 2/94 ASCII Len: 296 +P13813 plasmodium knowlesi. 110 kd antigen (pk110) (fragment). 2/94 +FNSNMLRGSVCEEDVSLMTSIDNMIEEIDFYEKEIYKGSHSGGVIKGMDYDLEDDENDEDEMTEQMVEEVADHITQDMIDEVAHHVLDNITHDMAHMEEIVHGLSGDVTQIKEIVQKVNVAVEKVKHIVETEETQKTVEPEQIEETQNTVEPEQTEETQKTVEPEQTEETQNTVEPEQIEETQKTVEPEQTEEAQKTVEPEQTEETQKTVEPEQTEETQKTVEPEQTEETQKTVEPEQTEETQKTVEPEQTEETQKTVEPEQTEETQKTVEPEQTEETQNTVEPEPTQETQNTVEP +>>>>11S3_HELAN 2/94 ASCII Len: 493 +P19084 helianthus annuus (common sunflower). 11s globulin seed storage protein g3 precursor (helianthinin g3). 2/94 +MASKATLLLAFTLLFATCIARHQQRQQQQNQCQLQNIEALEPIEVIQAEAGVTEIWDAYDQQFQCAWSILFDTGFNLVAFSCLPTSTPLFWPSSREGVILPGCRRTYEYSQEQQFSGEGGRRGGGEGTFRTVIRKLENLKEGDVVAIPTGTAHWLHNDGNTELVVVFLDTQNHENQLDENQRRFFLAGNPQAQAQSQQQQQRQPRQQSPQRQRQRQRQGQGQNAGNIFNGFTPELIAQSFNVDQETAQKLQGQNDQRGHIVNVGQDLQIVRPPQDRRSPRQQQEQATSPRQQQEQQQGRRGGWSNGVEETICSMKFKVNIDNPSQADFVNPQAGSIANLNSFKFPILEHLRLSVERGELRPNAIQSPHWTINAHNLLYVTEGALRVQIVDNQGNSVFDNELREGQVVVIPQNFAVIKRANEQGSRWVSFKTNDNAMIANLAGRVSASAASPLTLWANRYQLSREEAQQLKFSQRETVLFAPSFSRGQGIRASR +>>>>11SB_CUCMA 11/90 ASCII Len: 480 +P13744 cucurbita maxima (pumpkin) (winter squash). 11s globulin beta subunit precursor. 11/90 +MARSSLFTFLCLAVFINGCLSQIEQQSPWEFQGSEVWQQHRYQSPRACRLENLRAQDPVRRAEAEAIFTEVWDQDNDEFQCAGVNMIRHTIRPKGLLLPGFSNAPKLIFVAQGFGIRGIAIPGCAETYQTDLRRSQSAGSAFKDQHQKIRPFREGDLLVVPAGVSHWMYNRGQSDLVLIVFADTRNVANQIDPYLRKFYLAGRPEQVERGVEEWERSSRKGSSGEKSGNIFSGFADEFLEEAFQIDGGLVRKLKGEDDERDRIVQVDEDFEVLLPEKDEEERSRGRYIESESESENGLEETICTLRLKQNIGRSVRADVFNPRGGRISTANYHTLPILRQVRLSAERGVLYSNAMVAPHYTVNSHSVMYATRGNARVQVVDNFGQSVFDGEVREGQVLMIPQNFVVIKRASDRGFEWIAFKTNDNAITNLLAGRVSQMRMLPLGVLSNMYRISREEAQRLKYGQQEMRVLSPGRSQGRRE +>>>>120K_RICRI 10/94 ASCII Len: 1299 +P14914 rickettsia rickettsii. 120 kd surface-exposed protein. 
10/94 +MVIQSANATGQVNFRHIVDVGADGTTAFKTAASKVTITQDSNFGNTDFGNLAAQIKVPNAITLTGNFTGDASNPGNTAGVITFDANGTLESASADANVAVTNNITAIEASGAGVVQLSGTHAAELRLGNAGSIFKLADGTVINGKVNQTALVGGALAAGTITLDGSATITGDIGNAGGAAALQRITLANDAKKTLTLGGANIIGAGGGTIDLQANGGTIKLTSTQNNIVVDFDLAIATDQTGVVDASSLTNAQTLTINGKIGTIGANNKTLGQFNIGSSKTVLSNGNVAINELVIGNDGAVQFAHDTYLITRTTNAAGQGKIIFNPVVNNGTTLAAGTNLGSATNPLAEINFGSKGVNVDTVLNVGEGVNLYATNITTTDANVGSFVFNAGGTNIVSGTVGGQQGNKFNTVALENGTTVKFLGNATFNGNTTIAANSTLQIGGNYTADCVASADGTGIVEFVNTGPITVTLNKEAAPVNALKQITVSGPGNVVINEIGNAGNHHGAVTDTIAFENSSLGAVVFLPRGIPFNDAGNTMPLTIKSTVGNKTAKGFDVPSVVVLGVDSVIADGQVIVDQNNIVGLGLGSDNGIIVNATTLYAGISTLNNNQGTVTLSGGVPNTPGTVYGLGTGIGASKFKQVTFTTDYNNLGNIIATNATINDGVTVTTGGIAGIGFDGKITLGSVNGNGNVRFADGILSNSTSMIGTTKANNGTVTYLGNAFVGNIGDSDTPVASVRFTGSDSGAGLQGNIYSQVIDFGTYNLGIVNSNIILGGGTTAINGKIDLVTNTLTFASGTSTWGNNTSIETTLTLANGNIGHIVILEGAQVNTTTTGTTTIKVQDNANANFSGTQTYTLIQGGARFNGTLGSPNFAVTGSNRFVNYSLIRAANQDYVITRTNNAENVVTNDIANSPFGGAPGVDQNVTTFVNATNTAAYNNLLLAKNSANSANFVGAIVTDTSAAITNVQLDLAKDIQAQLGNRLGALRYLGTPETAEMADLKLEHIGSVAAGDEAIDNVAYGIWAKPFYTDAHQSKKGGLAGYKAKTTGVVIGLDTLANDNLMIGAAIGITKTDIKHQDYKKGDKTDVNGFSFSLYGAQQLVKNFFAQGSAIFSLNQVKNKSQRYFFDANGNMSKQIAAGHYDNMTFGGNLTVGYDYNAMQGVLVTPMAGLSYLKSSDENYKETGTTVANKQVNSKFSDRTDLIVGAKVAGSTMNRTDLAVYPEVHAFVVHKVTGRLSKTQSVLDGQVTPCINQPDRTTKTSYNLGLSASIRSDAKMEYGIGYDAQISSKYTAHQGTLKVRVNF +>>>>128U_DROME 2/94 ASCII Len: 368 +P32234 drosophila melanogaster (fruit fly). gtp-binding protein 128up. 2/94 +MITILEKISAIESEMARTQKNKATSAHLGLLKANVAKLRRELISPKGGGGGTGEAGFEVAKTGDARVGFVGFPSVGKSTLLSNLAGVYSEVAAYEFTTLTTVPGCIKYKGAKIQLLDLPGIIEGAKDGKGRGRQVIAVARTCNLIFMVLDCLKPLGHKKLLEHELEGFGIRLNKKPPNIYYKRKDKGGINLNSMVPQSELDTDLVKTILSEYKIHNADITLRYDATSDDLIDVIEGNRIYIPCIYLLNKIDQISIEELDVIYKIPHCVPISAHHHWNFDDLLELMWEYLRLQRIYTKPKGQLPDYNSPVVLHNERTSIEDFCNKLHRSIAKEFKYALVWGSSVKHQPQKVGIEHVLNDEDVVQIVKKV +>>>>12AH_CLOS4 8/91 ASCII Len: 29 +P21215 clostridium sp. (strain c 48-50). 12-alpha-hydroxysteroid dehydrogenase (ec 1.1.1.176) (fragment). 
8/91 +MIFDGKVAIITGGGKAKSIGYGIAVAYAK +>>>>12KD_MYCLE 2/95 ASCII Len: 156 +P15878 mycobacterium leprae. 12 kd protein. 2/95 +MNDIIALKFHISLNATTWIGRIGMVILPLLVYFITYRWCIGLQRSDRAVLEHGIETGIIKRLPHGAYIELHQPLGPVDDHGHPIPLEYQGTAVPKRMNKLGSAGSPSSGSFLFADPVSEDAALREATHVAEQRALTALREHQDSIASSPNGERGKH +>>>>12S1_ARATH 4/90 ASCII Len: 472 +P15455 arabidopsis thaliana (mouse-ear cress). 12s seed storage protein. 4/90 +MARVSSLLSFCLTLLILFHGYAAQQGQQGQQFPNECQLDQLNALEPSHVLKSEAGRIEVWDHHAPQLRCSGVSFARYIIESKGLYLPSFFNTAKLSFVAKGRGLMGKVIPGCAETFQDSSEFQPRFEGQGQSQRFRDMHQKVEHIRSGDTIATTPGVAQWFYNDGQQPLVIVSVFDLASHQNQLDRNPRPFYLAGNNPQGQVWLQGREQQPQKNIFNGFGPEVIAQALKIDLQTAQQLQNQDDNRGNIVRVQGPFGVIRPPLRGQRPQEEEEEEGRHGRHGNGLEETICSARCTDNLDDPSRADVYKPQLGYISTLNSYDLPILRFIRLSALRGSIRQNAMVLPQWNANANAILYETDGEAQIQIVNDNGNRVFDGQVSQGQLIAVPQGFSVVKRATSNRFQWVEFKTNANAQINTLAGRTSVLRGLPLEVITNGFQISPEEARRVKFNTLETTLTHSSGPASYGRPRVAAA +>>>>12S2_ARATH 4/90 ASCII Len: 455 +P15456 arabidopsis thaliana (mouse-ear cress). 12s seed storage protein. 4/90 +MGRVSSIISFSLTLLILFNGYTAQQWPNECQLDQLNALEPSQIIKSEGGRIEVWDHHAPQLRCSGFAFERFVIEPQGLFLPTFLNAGKLTFVVHGRGLMGRVIPGCAETFMESPVFGEGQGQGQSQGFRDMHQKVEHLRCGDTIATPSGVAQWFYNNGNEPLILVAAADLASNQNQLDRNLRPFLIAGNNPQGQEWLQGRKQQKQNNIFNGFAPEILAQAFKINVETAQQLQNQQDNRGNIVKVNGPFGVIRPPLRRGEGGQQPHEIANGLEETLCTMRCTENLDDPSDADVYKPSLGYISTLNSYNLPILRLLRLSALRGSIRKNAMVLPQWNVNANAALYVTNGKAHIQMVNDNGERVFDQEISSGQLLVVPQGFSVMKHRIGEQFEWIEFKTNENAQVNTLAGRTSVMRGLPLEVITNGYQISPEEAKRVKFSTIETTLTHSSPMSYGRPRA +>>>>1433_DROME 12/92 ASCII Len: 248 +P29310 drosophila melanogaster (fruit fly). 14-3-3-like protein. 12/92 +MSTVDKEELVQKAKLAEQSERYDDMAQAMKSVTETGVELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTEASARKQQLAREYRERVEKELREICYEVLGLLDKYLIPKASNPESKVFYLKMKGDYYRYLAEVATGDARNTVVDDSQTAYQDAFDISKGKMQPTHPIRLGLALNFSVFYYEILNSPDKACQLAKQAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDTQGDEAEPQEGGDN +>>>>1433_HORVU 2/94 ASCII Len: 262 +P29305 hordeum vulgare (barley). 14-3-3-like protein. 
2/94 +MSTAEATREENVYMAKLAEQAERYEEMVEFMEKVAKTADVGELTVEERNLLSVAYKNVIGARRASWRIISSIEQKEESRGNEAYVASIKEYRTRIETELSKICDGILKLLDSHLVPSATAAESKVFYLKMKGDYHRYLAEFKAGAERKEAAENTLVAYKSAQDIALADLPTTHPIRLGLALNFSVFYYEILNSPDRACNLAKQAFDEAIAELDSLGEESYKDSTLIMQLLRDNLTLWTSDNAEEGGDEIKEAASKPEGEGHS +>>>>1433_MAIZE 6/94 ASCII Len: 61 +P29306 zea mays (maize). 14-3-3-like protein (fragment). 6/94 +ILNSPDRACNLAKQAFDEAISELDSLGEESYKDSTLIMQLLXDNLTLWTSDTNEDGGDEIK +>>>>1433_OENHO 12/92 ASCII Len: 260 +P29307 oenothera hookeri (hooker's evening primrose). 14-3-3-like protein. 12/92 +MATAPSPREENVYLAKLAEQAERYEEMVEFMEKVCAAADSEELTVEERNLLSVAYKNVIGARRASWRIISSIEQKEESRGNDDHVSTIRDYRSKIETELSNICGGILKLLDSRLIPSAASGDSKVFYLKMKGDYHRYLAEFKTGAERKEAAESTLSAYKAAQDIANAELAPTHPIRLGLALNFSVFYYEILNSPDRACNLANEAFDEAIAELDTLEEESYKDSTLIMQLLRDNLTLWTSDNQDDGGDEIKEAAPKPDEQY +>>>>1433_ORYSA 6/94 ASCII Len: 260 +Q06967 oryza sativa (rice). 14-3-3-like protein s94. 6/94 +MSPAEASREENVYMAKLAEQAERYEEMVEFMEKVAKTTDVGELTVEERNLLSVAYKNVIGARRASWRIISSIEQKEESRGNEAYVASIKEYRSRIETELSKICDGILKLLDSHLVPSATAAESNVFYLKMKGDYHRYLAEFKSGAERKEAAENTLVAYKSAQDIALADLPTTHPIRLGLALNLSVFYYEILNSPDRACNLAKQAFDDAIAELDTLGEESYKDSTLIMQLLRDNLTLWTSDNAEDGGDEIKEAAKPEGEGH +>>>>1433_SPIOL 12/92 ASCII Len: 220 +P29308 spinacia oleracea (spinach). 14-3-3-like protein (fragment). 12/92 +RNLLSVAYKNVVGARRASWRIISSIEQKEESRGNEDHVSVIRDYRSRIEKELSDNCDGILKLLDTKLVPAASSGDSKVFYLKMKGDYHRYLAEFKTGAQRKEAAESTLTAYKAAQDIANAELAPTHPIRLGLALNFSVFYYEILNSPDRACNLAKQAFVEAIAELDTLGEDSYKDSTLIMQLLRDNLTLWTSDMQDEAADEITEEAAKQQKAVNNNKIAY +>>>>1433_XENLA 12/92 ASCII Len: 235 +P29309 xenopus laevis (african clawed frog). 14-3-3-like protein (fragment). 12/92 +AKLSEQAERYDDMAASMKAVTELGAELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTEGNDKRQQMAREYREKVETELQDICKDVLDLLDRFLVPNATPPESKVFYLKMKGDYYRYLSEVASGDSKQETVASSQQAYQEAFEISKSEMQPTHPIRLGLALNFSVFYYEILNSPEKACSLAKSAFDEAIRELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGEEADNVEGDN +>>>>1434_ARATH 2/94 ASCII Len: 259 +Q01525 arabidopsis thaliana (mouse-ear cress). 14-3-3-like protein gf14. 
2/94 +MASGREELVYMAKLAEQAERYEEMVEFMEKVSAAVDGDELTVEERNLLSVAYKNVIGARRASWRIISSIEQKEESRGNDDHVTAIREYRSKIETELSGICDGILKLLDSRLIPAAASGDSKVFYLKMKGDYHRYLAEFKTGQERKDAAEHTLAAYKSAQDIANAELAPTHPIRLGLALNFSVFYYEILNSPDRACNLAKQAFDEAIAELDTLGEESYKDSTLIMQLLRDNLTLWTSDMQDDAADEIKEAAAPKPTEEQQ +>>>>1434_MAIZE 6/94 ASCII Len: 248 +Q01526 zea mays (maize). 14-3-3-like protein gf14-12. 6/94 +MAKLAEQAERYEEMVEFMEKVAKTVDSEELTVEERNLLSVAYKNVIGARRASWRIISSIEQKEEGRGNEDRVTLIKDYRGKIETELTKICDGILKLLESHLVPSSTAPESKVFYLKMKGDYYRYLAEFKTGAERKDAAENTMVAYKAAQDIALAELAPTHPIRLGLALNFSVFYYEILNSPDRACSLAKQAFDEAISELDTLSEESYKDSTLIMQLLHDNLTLWTSDISEDPAEEIREAPKHDLSEGQ diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/gcgdata.2 b/forester/archive/RIO/others/hmmer/squid/Formats/gcgdata.2 new file mode 100644 index 0000000..b512585 Binary files /dev/null and b/forester/archive/RIO/others/hmmer/squid/Formats/gcgdata.2 differ diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/genbank b/forester/archive/RIO/others/hmmer/squid/Formats/genbank new file mode 100644 index 0000000..d504b58 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/genbank @@ -0,0 +1,310 @@ +GBPRI1.SEQ Genetic Sequence Data Bank + 15 August 2000 + + NCBI-GenBank Flat File Release 119.0 + + Primate Sequences (Part 1) + + 6395 loci, 170956138 bases, from 6395 reported sequences + + +LOCUS AAB2MCG1 289 bp DNA PRI 06-JUL-1998 +DEFINITION Aotus azarai beta-2-microglobulin precursor, gene, exon 1. +ACCESSION AF032092 +VERSION AF032092.1 GI:3265027 +KEYWORDS . +SEGMENT 1 of 2 +SOURCE Azara's night monkey. + ORGANISM Aotus azarai + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Primates; Platyrrhini; Cebidae; Aotinae; Aotus. +REFERENCE 1 (bases 1 to 289) + AUTHORS Canavez,F.C., Ladasky,J.J., Muniz,J.A.C., Seuanez,H.N., Parham,P. + and Cavanez,F.C. 
+ TITLE beta2-Microglobulin in neotropical primates (Platyrrhini) + JOURNAL Immunogenetics 48 (2), 133-140 (1998) + MEDLINE 98298008 +REFERENCE 2 (bases 1 to 289) + AUTHORS Canavez,F.C., Ladasky,J.J., Seuanez,H.N. and Parham,P. + TITLE Direct Submission + JOURNAL Submitted (31-OCT-1997) Structural Biology, Stanford University, + Fairchild Building Campus West Dr. Room D-100, Stanford, CA + 94305-5126, USA +FEATURES Location/Qualifiers + source 1..289 + /organism="Aotus azarai" + /db_xref="taxon:30591" + exon <134..200 + /number=1 + sig_peptide 134..193 + intron 201..>289 + /number=1 +BASE COUNT 30 a 99 c 80 g 80 t +ORIGIN + 1 gtccccgcgg gccttgtcct gattggctgt ccctgcgggc cttgtcctga ttggctgtgc + 61 ccgactccgt ataacataaa tagaggcgtc gagtcgcgcg ggcattactg cagcggacta + 121 cacttgggtc gagatggctc gcttcgtggt ggtggccctg ctcgtgctac tctctctgtc + 181 tggcctggag gctatccagc gtaagtctct cctcccgtcc ggcgctggtc cttcccctcc + 241 cgctcccacc ctctgtagcc gtctctgtgc tctctggttt cgttacctc +// + +LOCUS AAB2MCG2 1276 bp DNA PRI 06-JUL-1998 +DEFINITION Aotus azarai beta-2-microglobulin precursor, gene, exons 2 and 3 + and complete cds. +ACCESSION AF032093 AF032094 +VERSION AF032093.1 GI:3287308 +KEYWORDS . +SEGMENT 2 of 2 +SOURCE Azara's night monkey. + ORGANISM Aotus azarai + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Primates; Platyrrhini; Cebidae; Aotinae; Aotus. +REFERENCE 1 (bases 1 to 1276) + AUTHORS Canavez,F.C., Ladasky,J.J., Muniz,J.A.C., Seuanez,H.N., Parham,P. + and Cavanez,F.C. + TITLE beta2-Microglobulin in neotropical primates (Platyrrhini) + JOURNAL Immunogenetics 48 (2), 133-140 (1998) + MEDLINE 98298008 +REFERENCE 2 (bases 1 to 1276) + AUTHORS Canavez,F.C., Ladasky,J.J., Seuanez,H.N. and Parham,P. + TITLE Direct Submission + JOURNAL Submitted (31-OCT-1997) Structural Biology, Stanford University, + Fairchild Building Campus West Dr. 
Room D-100, Stanford, CA + 94305-5126, USA +COMMENT On Jul 2, 1998 this sequence version replaced gi:3265029 + gi:3265028. +FEATURES Location/Qualifiers + source 1..1276 + /organism="Aotus azarai" + /db_xref="taxon:30591" + mRNA join(AF032092.1:<134..200,66..344,1023..>1050) + /product="beta-2-microglobulin precursor" + CDS join(AF032092.1:134..200,66..344,1023..1036) + /codon_start=1 + /product="beta-2-microglobulin precursor" + /protein_id="AAC52107.1" + /db_xref="GI:3289965" + /translation="MARFVVVALLVLLSLSGLEAIQRXPKIQVYSRHPAENGKPNFLN + CYVSGFHPSDIEVDLLKNGKKIEKVEHSDLSFSKDWSFYLLYYTEFTPNEKDEYACRV + SHVTLSTPKTVKWDRNM" + mat_peptide join(AF032092.1:194..200,66..344,1023..1033) + /product="beta-2-microglobulin" + intron <1..65 + /number=1 + variation 3 + /note="allele 1" + /replace="g" + exon 66..344 + /number=2 + intron 345..1022 + /number=2 + exon 1023..1050 + /number=3 + intron 1051..>1276 + /number=3 +BASE COUNT 353 a 253 c 269 g 400 t 1 others +ORIGIN + 1 caagttatcc gtaattgaaa taccctggta attaatattc atttgtcttt tcctgatttt + 61 ttcaggtrct ccaaagattc aggtttactc acgtcatccg gcagagaatg gaaagccaaa + 121 ttttctgaat tgctatgtgt ctgggtttca tccgtccgac attgaagttg acttactgaa + 181 gaatggaaag aaaattgaaa aagtggagca ttcagacttg tctttcagca aggactggtc + 241 tttctatctc ttgtactaca ccgagtttac ccccaatgaa aaagatgagt atgcctgccg + 301 tgtgagccat gtgactttat caacacccaa gacagtaaag tggggtaagt cttacgttct + 361 tttgtaggct gctgaaagtt gtgtatgggt agtcatgtca taaagctgct ttgatataaa + 421 aaaaattcgt ctatggccat actgccctga atgagtccca tcccgtctga taaaaaaaaa + 481 tcttcatatt gggattgtca gggaatgtgc ttaaagatca gattagagac aacggctgag + 541 agagcgctgc acagcattct tctgaaccag cagtttccct gcagctgagc agggagcagc + 601 agcagcagtt gcacaaatac atatgcactc ctaacacttc ttacctactg acttcctcag + 661 ctttcgtggc agctttaggt atatttagca ctaatgaaca tcaggaaggt ataggccttt + 721 ctttgtaaat ccttctatcc tagcatccta taatcctgga ctcctccagt actctctggc + 781 tggattggta tctgaggcta gtaggtgggg cttgttcctg ctgggtagct ccaaacaagg + 841 tattcatgga 
taggaacagc agcctatttt gccagcctta tttcttaata gttttagaaa + 901 tctgttagta cgtggtgttt tttgttttgt tttgttttaa cacagtgtaa acaaaaagta + 961 catgtatttt aaaagtaaaa cttaatgtct tcctttttct ttctccactg tctttttcat + 1021 agatcgaaac atgtaaccag catcatggag gtaagttctt gaccttaatt aaatgttttt + 1081 tgtttcactg gggactattt atagacagcc ctaacatgat aaccctcact atgtggagaa + 1141 cattgacaga gtagcatttt agcaggcaaa gaggaatcct atagggttac attccctttt + 1201 cctgtggagt ggcatgaaaa aggtatgtgg ccccagctgt ggccacatta ctgactctac + 1261 agggagggca aaggaa +// +LOCUS AACCOSIV1 168 bp DNA PRI 26-JAN-1998 +DEFINITION Aotus azarai cytochrome c oxidase subunit IV gene, exon 3. +ACCESSION AF042765 +VERSION AF042765.1 GI:2809514 +KEYWORDS . +SEGMENT 1 of 3 +SOURCE Azara's night monkey. + ORGANISM Aotus azarai + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Primates; Platyrrhini; Cebidae; Aotinae; Aotus. +REFERENCE 1 (bases 1 to 168) + AUTHORS Wu,W., Goodman,M., Lomax,M.I. and Grossman,L.I. + TITLE Molecular evolution of cytochrome c oxidase subunit IV: evidence + for positive selection in simian primates + JOURNAL J. Mol. Evol. 44 (5), 477-491 (1997) + MEDLINE 97277139 +REFERENCE 2 (bases 1 to 168) + AUTHORS Wu,W., Goodman,M., Lomax,M.I. and Grossman,L.I. + TITLE Direct Submission + JOURNAL Submitted (14-JAN-1998) CMMG, Wayne State University, 540 E. + Canfield, Detroit, MI 48201, USA +FEATURES Location/Qualifiers + source 1..168 + /organism="Aotus azarai" + /db_xref="taxon:30591" + exon 1..168 + /number=3 +BASE COUNT 40 a 42 c 54 g 32 t +ORIGIN + 1 gaagtgttgt gaagagcgaa gactatgcgc tcccaagtta tgtggatcgg cgtgactatc + 61 ccttgcccga cgtggcccat gtcaggcacc tgtcggccag ccagaaggcc ttgaaggaga + 121 aggagaaggc ctcctggagc agcctctcca tggatgagaa agtcgagt +// + +LOCUS AACCOSIV2 132 bp DNA PRI 26-JAN-1998 +DEFINITION Aotus azarai cytochrome c oxidase subunit IV gene, exon 4. +ACCESSION AF042766 +VERSION AF042766.1 GI:2809515 +KEYWORDS . +SEGMENT 2 of 3 +SOURCE Azara's night monkey. 
+ ORGANISM Aotus azarai + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Primates; Platyrrhini; Cebidae; Aotinae; Aotus. +REFERENCE 1 (bases 1 to 132) + AUTHORS Wu,W., Goodman,M., Lomax,M.I. and Grossman,L.I. + TITLE Molecular evolution of cytochrome c oxidase subunit IV: evidence + for positive selection in simian primates + JOURNAL J. Mol. Evol. 44 (5), 477-491 (1997) + MEDLINE 97277139 +REFERENCE 2 (bases 1 to 132) + AUTHORS Wu,W., Goodman,M., Lomax,M.I. and Grossman,L.I. + TITLE Direct Submission + JOURNAL Submitted (14-JAN-1998) CMMG, Wayne State University, 540 E. + Canfield, Detroit, MI 48201, USA +FEATURES Location/Qualifiers + source 1..132 + /organism="Aotus azarai" + /db_xref="taxon:30591" + exon 1..132 + /number=4 +BASE COUNT 30 a 25 c 38 g 39 t +ORIGIN + 1 tgtatcgtat tcagttcaag gagagctttg ctgagatgaa caggggctcc aatgagtgga + 61 agacggttgt gggtgctgcc atgttcttca tcggcttcac agcaattctt atcatcttgg + 121 agaagcgcta tg +// + +LOCUS AACCOSIV3 137 bp DNA PRI 26-JAN-1998 +DEFINITION Aotus azarai cytochrome c oxidase subunit IV gene, exon 5; and + partial cds. +ACCESSION AF042767 +VERSION AF042767.1 GI:2809516 +KEYWORDS . +SEGMENT 3 of 3 +SOURCE Azara's night monkey. + ORGANISM Aotus azarai + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Primates; Platyrrhini; Cebidae; Aotinae; Aotus. +REFERENCE 1 (bases 1 to 137) + AUTHORS Wu,W., Goodman,M., Lomax,M.I. and Grossman,L.I. + TITLE Molecular evolution of cytochrome c oxidase subunit IV: evidence + for positive selection in simian primates + JOURNAL J. Mol. Evol. 44 (5), 477-491 (1997) + MEDLINE 97277139 +REFERENCE 2 (bases 1 to 137) + AUTHORS Wu,W., Goodman,M., Lomax,M.I. and Grossman,L.I. + TITLE Direct Submission + JOURNAL Submitted (14-JAN-1998) CMMG, Wayne State University, 540 E. 
+ Canfield, Detroit, MI 48201, USA +FEATURES Location/Qualifiers + source 1..137 + /organism="Aotus azarai" + /db_xref="taxon:30591" + mRNA join(AF042765.1:<1..168,AF042766.1:1..132,1..>137) + /product="cytochrome c oxidase subunit IV" + CDS join(AF042765.1:<1..168,AF042766.1:1..132,1..137) + /codon_start=3 + /product="cytochrome c oxidase subunit IV" + /protein_id="AAB97755.1" + /db_xref="GI:2809518" + /translation="SVVKSEDYALPSYVDRRDYPLPDVAHVRHLSASQKALKEKEKAS + WSSLSMDEKVELYRIQFKESFAEMNRGSNEWKTVVGAAMFFIGFTAILIILEKRYVYG + PLPHTFDKEWVAMQTKRMLDLKVNPVDGLASKWDYDKKEWKK" + exon 1..137 + /number=5 +BASE COUNT 36 a 36 c 43 g 22 t +ORIGIN + 1 tgtacggccc cctcccgcac acctttgaca aagagtgggt ggccatgcag accaagagga + 61 tgctggacct gaaggtgaac cctgtcgatg gcctcgcctc caagtgggac tacgacaaga + 121 aggagtggaa gaagtga +// +LOCUS AAU18601 1771 bp DNA PRI 17-JAN-1997 +DEFINITION Aotus azarae interphotoreceptor retinoid-binding protein (IRBP) + gene, intron 1, complete sequence. +ACCESSION U18601 +VERSION U18601.1 GI:624187 +KEYWORDS . +SOURCE Azara's night monkey. + ORGANISM Aotus azarai + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; + Mammalia; Eutheria; Primates; Platyrrhini; Cebidae; Aotinae; Aotus. +REFERENCE 1 (bases 1 to 1771) + AUTHORS Harada,M.L., Schneider,H., Schneider,M.P., Sampaio,I., + Czelusniak,J. and Goodman,M. + TITLE DNA evidence on the phylogenetic systematics of New World monkeys: + support for the sister-grouping of Cebus and Saimiri from two + unlinked nuclear genes + JOURNAL Mol. Phylogenet. Evol. 4 (3), 331-349 (1995) + MEDLINE 96111507 +REFERENCE 2 (bases 1 to 1771) + AUTHORS Harada,M.L. 
+ TITLE Direct Submission + JOURNAL Submitted (13-DEC-1994) Universidade Federal do Para, Departamento + de Genetica, Campus Universitario do Guama, Belem, Para, 66075-900, + Brazil +FEATURES Location/Qualifiers + source 1..1771 + /organism="Aotus azarai" + /db_xref="taxon:30591" + /tissue_type="lymphocytes" + /dev_stage="adult" + gene 1..1771 + /gene="IRBP" + intron 1..1771 + /gene="IRBP" + /note="interphotoreceptor retinoid-binding protein" + /number=1 +BASE COUNT 384 a 443 c 533 g 411 t +ORIGIN + 1 gtgagaccca agggagacct ggccgggccc agtcctggga gtgagttgac ctgtcgtttg + 61 cacatgcagg gctctgtgca caatgcgtga caatggcttt tagatttgtt ctcatgctta + 121 agttgtggcc agttgagtcc tttcctcttt ccatccactg ttccatccac tctctgggac + 181 cctggtgctg ctgtagaacc tccgtagaac attcatgtta ggttggtgtg aaagtacttt + 241 taattgcaaa acccacaatt acttttgcat caacctcacg ggaagccagt ttggaagcct + 301 cgggatagac agagtttcag ccttggctgg gtggaaggtg agcgttggcg gggcttctca + 361 tcgtcagtgt gggagaagag gccaacatgt ggcagaggtg gcggtgggct tcaccgcgtg + 421 ccccaccgca ggccgagagc tccgcccggg cagcactcac tccacgctgt tctcctacct + 481 gtggctttgc tgcattgtca cagttgggca gggcagcatg tgtcatgaat cccttgcaag + 541 gagggtctga gactggggtt gggtgcaggc agtttgtctg ggaggtggtt gctgaagcag + 601 gtgtgaagga gggagcaggg agagtgagat aggaaggtga caggcaggtc cctcaaagct + 661 gttctgctga agccaggacg ctgacaagtg tggggatgct cccaggcaca gttctctgcg + 721 ggcgggcccc agggctcctg tcccgctttg gccaagagtt gccctgagga cataactcgg + 781 ggtggggcag gctcccctct cttggagaag gcctgagctg agggtggaaa gacaggatgg + 841 tgctgtggga gagcctgtca gtggggccag gtgcagctga aatcagaggg ggctgagagt + 901 gccaacggca tctgttacag aattctcatc cccattttgc ataactgagg cccagagagg + 961 tgcagagggg agtggcctgg agccagagag ctgtgactga aggcagggca gggcctggag + 1021 ggcagtgtct ctgtcagcac aggctccttg ccccagtcca gctcaccaag tcctgccgcc + 1081 ctcccgcagc cttagagagg gaggaagagg tgcatccaca tggaagtagc ctgtgctagg + 1141 ctttcagaat acccagtttc caaattaatt gcttcttcct ttctggtata gccaaggttc + 1201 acaatttgga gtcagatgtg gattcagatg ctggctccac 
cacttattga ctgtgtaacc + 1261 tgggactagt tacttaatct cactgtgctt cagtttttcc gtggaaaaga tggggaccat + 1321 gttatctcct gtacaggtgg ctgtgaggat gacgataagc tctgcaaagt gcttagtaca + 1381 gggccaggca cctgttaaag gtaactaaca tcttccaatc ctgccccagt ggaggggaag + 1441 ataagcttag agatgttggg aagtatctgg cgaggttgga cgaatcagag aggagaccat + 1501 tcctgggcct tccagctctg aacaccagag cagacaggag catcctctgc aaggaggctt + 1561 cccatggatc acacatgtcc cagtggcatg tcacatccca gacatgccac tgggaaagtc + 1621 ccaggtgcct actgactcct tcagaaatgt cagttcctgt cccatgccct taatatttcc + 1681 catgacataa aggcgatcca tggcacctgc tttcctgggc tcgaaaaccg gctgccctcc + 1741 tgacactgag caggacctcc aactcttgca g +// diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/msf b/forester/archive/RIO/others/hmmer/squid/Formats/msf new file mode 100644 index 0000000..8ef0bcd --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/msf @@ -0,0 +1,266 @@ + + + MSF: 171 Type: P Check: 4694 .. + + Name: GLB2_MORMR oo Len: 171 Check: 6522 Weight: 2.7687 + Name: GLBZ_CHITH oo Len: 171 Check: 6733 Weight: 2.9329 + Name: HBA2_BOSMU oo Len: 171 Check: 5006 Weight: 0.6394 + Name: HBA2_GALCR oo Len: 171 Check: 6652 Weight: 0.5183 + Name: HBA4_SALIR oo Len: 171 Check: 5434 Weight: 1.9511 + Name: HBAD_CHLME oo Len: 171 Check: 6621 Weight: 1.2927 + Name: HBAD_PASMO oo Len: 171 Check: 8118 Weight: 1.2927 + Name: HBAZ_HORSE oo Len: 171 Check: 8382 Weight: 1.6223 + Name: HBA_AILME oo Len: 171 Check: 5402 Weight: 0.4145 + Name: HBA_ANSSE oo Len: 171 Check: 3688 Weight: 0.8315 + Name: HBA_COLLI oo Len: 171 Check: 4420 Weight: 0.8557 + Name: HBA_ERIEU oo Len: 171 Check: 5528 Weight: 0.8390 + Name: HBA_FRAPO oo Len: 171 Check: 4136 Weight: 0.5014 + Name: HBA_MACFA oo Len: 171 Check: 5986 Weight: 0.2233 + Name: HBA_MACSI oo Len: 171 Check: 6064 Weight: 0.2233 + Name: HBA_MESAU oo Len: 171 Check: 5499 Weight: 0.6722 + Name: HBA_PAGLA oo Len: 171 Check: 6189 Weight: 0.5388 + Name: HBA_PHACO oo Len: 171 Check: 5129 Weight: 0.5014 + Name: HBA_PONPY oo 
Len: 171 Check: 5894 Weight: 0.3907 + Name: HBA_PROLO oo Len: 171 Check: 5810 Weight: 0.4145 + Name: HBA_TRIOC oo Len: 171 Check: 6427 Weight: 0.6883 + Name: HBB1_VAREX oo Len: 171 Check: 7239 Weight: 1.1252 + Name: HBB2_TRICR oo Len: 171 Check: 7790 Weight: 1.9629 + Name: HBB2_XENTR oo Len: 171 Check: 9537 Weight: 1.4685 + Name: HBBL_RANCA oo Len: 171 Check: 7490 Weight: 1.4685 + Name: HBB_CALAR oo Len: 171 Check: 6568 Weight: 0.4226 + Name: HBB_COLLI oo Len: 171 Check: 5043 Weight: 0.7672 + Name: HBB_EQUHE oo Len: 171 Check: 6101 Weight: 0.6734 + Name: HBB_LARRI oo Len: 171 Check: 5673 Weight: 0.7672 + Name: HBB_MANSP oo Len: 171 Check: 7148 Weight: 0.4226 + Name: HBB_ORNAN oo Len: 171 Check: 6249 Weight: 0.6615 + Name: HBB_RABIT oo Len: 171 Check: 7043 Weight: 0.5259 + Name: HBB_SPECI oo Len: 171 Check: 3680 Weight: 0.5422 + Name: HBB_SPETO oo Len: 171 Check: 4246 Weight: 0.5422 + Name: HBB_SUNMU oo Len: 171 Check: 5601 Weight: 0.6734 + Name: HBB_TACAC oo Len: 171 Check: 7133 Weight: 0.6615 + Name: HBB_TRIIN oo Len: 171 Check: 4009 Weight: 0.8445 + Name: HBB_TUPGL oo Len: 171 Check: 7197 Weight: 0.7375 + Name: HBB_URSMA oo Len: 171 Check: 7200 Weight: 0.4695 + Name: HBE_PONPY oo Len: 171 Check: 5902 Weight: 1.0101 + Name: HBF1_URECA oo Len: 171 Check: 6462 Weight: 2.9329 + Name: LGB1_PEA oo Len: 171 Check: 4791 Weight: 2.0005 + Name: LGB1_VICFA oo Len: 171 Check: 7426 Weight: 2.0005 + Name: MYG_ESCGI oo Len: 171 Check: 9170 Weight: 0.7432 + Name: MYG_HORSE oo Len: 171 Check: 1290 Weight: 0.7432 + Name: MYG_LYCPI oo Len: 171 Check: 1107 Weight: 0.8773 + Name: MYG_MOUSE oo Len: 171 Check: 1320 Weight: 1.0018 + Name: MYG_MUSAN oo Len: 171 Check: 5461 Weight: 2.3158 + Name: MYG_PROGU oo Len: 171 Check: 1450 Weight: 0.7629 + Name: MYG_SAISC oo Len: 171 Check: 1728 Weight: 0.7629 + +// + + + +GLB2_MORMR ...PIVD..S GSVSPLSDAE KNKIRAAW.D IVYKNYEKNG VDILVKFFTG +GLBZ_CHITH MKFIILALCV AAASALSGDQ IGLVQST.YG KVKG....DS VGILYAVFKA +HBA2_BOSMU ...V...... 
.....LSAAD KGNVKAAW.G KVGGHAAEYG AEALERMFLS +HBA2_GALCR ...V...... .....LSPTD KSNVKAAW.E KVGAHAGDYG AEALERMFLS +HBA4_SALIR ...S...... .....LSAKD KANVKAIW.G KILPKSDEIG EQALSRMLVV +HBAD_CHLME ...M...... .....LTADD KKLLTQLW.E KVAGHQEEFG SEALQRMFLT +HBAD_PASMO ...M...... .....LTAED KKLIQQIW.G KLGGAEEEIG ADALWRMFHS +HBAZ_HORSE ...S...... .....LTKAE RTMVVSIW.G KISMQADAVG TEALQRLFSS +HBA_AILME ...V...... .....LSPAD KTNVKATW.D KIGGHAGEYG GEALERTFAS +HBA_ANSSE ...V...... .....LSAAD KGNVKTVF.G KIGGHAEEYG AETLQRMFQT +HBA_COLLI ...V...... .....LSAND KSNVKAVF.A KIGGQAGDLG GEALERLFIT +HBA_ERIEU ...V...... .....LSATD KANVKTFW.G KLGGHGGEYG GEALDRMFQA +HBA_FRAPO ...V...... .....LSAAD KNNVKGIF.G KISSHAEDYG AEALERMFIT +HBA_MACFA ...V...... .....LSPAD KTNVKAAW.G KVGGHAGEYG AEALERMFLS +HBA_MACSI ...V...... .....LSPAD KTNVKDAW.G KVGGHAGEYG AEALERMFLS +HBA_MESAU ...V...... .....LSAKD KTNISEAW.G KIGGHAGEYG AEALERMFFV +HBA_PAGLA ...V...... .....LSSAD KNNIKATW.D KIGSHAGEYG AEALERTFIS +HBA_PHACO ...V...... .....LSAAD KNNVKGIF.T KIAGHAEEYG AEALERMFIT +HBA_PONPY ...V...... .....LSPAD KTNVKTAW.G KVGAHAGDYG AEALERMFLS +HBA_PROLO ...V...... .....LSPAD KANIKATW.D KIGGHAGEYG GEALERTFAS +HBA_TRIOC ...V...... .....LSAND KTNVKTVF.T KITGHAEDYG AETLERMFIT +HBB1_VAREX ...V...... ....HWTAEE KQLICSLW.G KI..DVGLIG GETLAGLLVI +HBB2_TRICR ...V...... ....HLTAED RKEIAAIL.G KV..NVDSLG GQCLARLIVV +HBB2_XENTR ...V...... ....HWTAEE KATIASVW.G KV..DIEQDG HDALSRLLVV +HBBL_RANCA ...V...... ....HWTAEE KAVINSVW.Q KV..DVEQDG HEALTRLFIV +HBB_CALAR ...V...... ....HLTGEE KSAVTALW.G KV..NVDEVG GEALGRLLVV +HBB_COLLI ...V...... ....HWSAEE KQLITSIW.G KV..NVADCG AEALARLLIV +HBB_EQUHE ...V...... ....QLSGEE KAAVLALW.D KV..NEEEVG GEALGRLLVV +HBB_LARRI ...V...... ....HWSAEE KQLITGLW.G KV..NVADCG AEALARLLIV +HBB_MANSP ...V...... ....HLTPEE KTAVTTLW.G KV..NVDEVG GEALGRLLVV +HBB_ORNAN ...V...... ....HLSGGE KSAVTNLW.G KV..NINELG GEALGRLLVV +HBB_RABIT ...V...... ....HLSSEE KSAVTALW.G KV..NVEEVG GEALGRLLVV +HBB_SPECI ...V...... 
....HLSDGE KNAISTAW.G KV..HAAEVG AEALGRLLVV +HBB_SPETO ...V...... ....HLTDGE KNAISTAW.G KV..NAAEIG AEALGRLLVV +HBB_SUNMU ...V...... ....HLSGEE KACVTGLW.G KV..NEDEVG AEALGRLLVV +HBB_TACAC ...V...... ....HLSGSE KTAVTNLW.G HV..NVNELG GEALGRLLVV +HBB_TRIIN ...V...... ....HLTPEE KALVIGLW.A KV..NVKEYG GEALGRLLVV +HBB_TUPGL ...V...... ....HLSGEE KAAVTGLW.G KV..DLEKVG GQSLGSLLIV +HBB_URSMA ...V...... ....HLTGEE KSLVTGLW.G KV..NVDEVG GEALGRLLVV +HBE_PONPY ...V...... ....HFTAEE KAAVTSLW.S KM..NVEEAG GEALGRLLVV +HBF1_URECA .......... ....GLTTAQ IKAIQDHWFL NIKGCLQAAA DSIFFKYLTA +LGB1_PEA GFTDKQEALV NSSSE.FKQN LPGYSILFYT IVLEKAP..A AKGL...... +LGB1_VICFA GFTEKQEALV NSSSQLFKQN PSNYSVLFYT IILQKAP..T AKAM...... +MYG_ESCGI ...V...... .....LSDAE WQLVLNIW.A KVEADVAGHG QDILIRLFKG +MYG_HORSE ...G...... .....LSDGE WQQVLNVW.G KVEADIAGHG QEVLIRLFTG +MYG_LYCPI ...G...... .....LSDGE WQIVLNIW.G KVETDLAGHG QEVLIRLFKN +MYG_MOUSE ...G...... .....LSDGE WQLVLNVW.G KVEADLAGHG QEVLIGLFKT +MYG_MUSAN .......... ........VD WEKVNSVW.S AVESDLTAIG QNILLRLFEQ +MYG_PROGU ...G...... .....LSDGE WQLVLNVW.G KVEGDLSGHG QEVLIRLFKG +MYG_SAISC ...G...... .....LSDGE WQLVLNIW.G KVEADIPSHG QEVLISLFKG + + +GLB2_MORMR TPAAQAFFPK FKGLTTADAL KKSSDVRWHA ERIINAVNDA VKSMDDTEKM +GLBZ_CHITH DPTIQAAFPQ FVGK.DLDAI KGGAEFSTHA GRIVGFLGGV IDDLP...NI +HBA2_BOSMU FPTTKTYFPH FD.LSH.... .GSAQVKGHG AKVAAALTKA VGHLDD...L +HBA2_GALCR FPTTKTYFPH FD.LSH.... .GSTQVKGHG KKVADALTNA VLHVDD...M +HBA4_SALIR YPQTKAYFSH WASVAP.... .GSAPVKKHG ITIMNQIDDC VGHMDD...L +HBAD_CHLME YPQTKTYFPH FD.LHP.... .GSEQVRGHG KKVAAALGNA VKSLDN...L +HBAD_PASMO YPSTKTYFPH FD.LSQ.... .GSDQIRGHG KKVVAALSNA IKNLDN...L +HBAZ_HORSE YPQTKTYFPH FD.LHE.... .GSPQLRAHG SKVAAAVGDA VKSIDN...V +HBA_AILME FPTTKTYFPH FD.LSP.... .GSAQVKAHG KKVADALTTA VGHLDD...L +HBA_ANSSE FPQTKTYFPH FD.LQP.... .GSAQIKAHG KKVAAALVEA ANHIDD...I +HBA_COLLI YPQTKTYFPH FD.LSH.... .GSAQIKGHG KKVAEALVEA ANHIDD...I +HBA_ERIEU HPTTKTYFPH FD.LNP.... 
.GSAQVKGHG KKVADALTTA VNNLDD...V +HBA_FRAPO YPSTKTYFPH FD.LSH.... .GSAQVKGHG KKVVAALIEA ANHIDD...I +HBA_MACFA FPTTKTYFPH FD.LSH.... .GSAQVKGHG KKVADALTLA VGHVDD...M +HBA_MACSI FPTTKTYFPH FD.LSH.... .GSAQVKGHG KKVADALTLA VGHVDD...M +HBA_MESAU YPTTKTYFPH FD.VSH.... .GSAQVKGHG KKVADALTNA VGHLDD...L +HBA_PAGLA FPTTKTYFPH FD.LSH.... .GSAQVKAHG KKVADALTLA VGHLED...L +HBA_PHACO YPSTKTYFPH FD.LSH.... .GSAQIKGHG KKVVAALIEA VNHIDD...I +HBA_PONPY FPTTKTYFPH FD.LSH.... .GSAQVKDHG KKVADALTNA VAHVDD...M +HBA_PROLO FPTTKTYFPH FD.LSP.... .GSAQVKAHG KKVADALTLA VGHLDD...L +HBA_TRIOC YPPTKTYFPH FD.LHH.... .GSAQIKAHG KKVVGALIEA VNHIDD...I +HBB1_VAREX YPWTQRQFSH FGNLSSPTAI AGNPRVKAHG KKVLTSFGDA IKNLDN...I +HBB2_TRICR NPWSRRYFHD FGDLSSCDAI CRNPKVLAHG AKVMRSIVEA TKHLDN...L +HBB2_XENTR YPWTQRYFSS FGNLSNVSAV SGNVKVKAHG NKVLSAVGSA IQHLDD...V +HBBL_RANCA YPWTQRYFST FGDLSSPAAI AGNPKVHAHG KKILGAIDNA IHNLDD...V +HBB_CALAR YPWTQRFFES FGDLSTPDAV MNNPKVKAHG KKVLGAFSDG LTHLDN...L +HBB_COLLI YPWTQRFFSS FGNLSSATAI SGNPNVKAHG KKVLTSFGDA VKNLDN...I +HBB_EQUHE YPWTQRFFDS FGDLSNPAAV MGNPKVKAHG KKVLHSFGEG VHHLDN...L +HBB_LARRI YPWTQRFFAS FGNLSSPTAI NGNPMVRAHG KKVLTSFGEA VKNLDN...I +HBB_MANSP YPWTQRFFDS FGDLSSPDAV MGNPKVKAHG KKVLGAFSDG LNHLDN...L +HBB_ORNAN YPWTQRFFEA FGDLSSAGAV MGNPKVKAHG AKVLTSFGDA LKNLDD...L +HBB_RABIT YPWTQRFFES FGDLSSANAV MNNPKVKAHG KKVLAAFSEG LSHLDN...L +HBB_SPECI YPWTQRFFDS FGDLSSASAV MGNAKVKAHG KKVIDSFSNG LKHLDN...L +HBB_SPETO YPWTQRFFDS FGDLSSASAV MGNAKVKAHG KKVIDSFSNG LKHLDN...L +HBB_SUNMU YPWTQRFFDS FGDLSSASAV MGNPKVKAHG KKVLHSLGEG VANLDN...L +HBB_TACAC YPWTQRFFES FGDLSSADAV MGNAKVKAHG AKVLTSFGDA LKNLDN...L +HBB_TRIIN YPWTQRFFEH FGDLSSASAI MNNPKVKAHG EKVFTSFGDG LKHLED...L +HBB_TUPGL YPWTQRFFDS FGDLSSPSAV MSNPKVKAHG KKVLTSFSDG LNHLDN...L +HBB_URSMA YPWTQRFFDS FGDLSSADAI MNNPKVKAHG KKVLNSFSDG LKNLDN...L +HBE_PONPY YPWTQRFFDS FGNLSSPSAI LGNPKVKAHG KKVLTSFGDA IKNMDN...L +HBF1_URECA YPGDLAFFHK FSSV.PLYGL RSNPAYKAQT LTVINYLDKV VDALGG..NA +LGB1_PEA .......... 
FSFLKDTAGV EDSPKLQAHA EQVFGLVRDS AAQLRTKGEV +LGB1_VICFA .......... FSFLKDSAGV VDSPKLGAHA EKVFGMVRDS AVQLRATGEV +MYG_ESCGI HPETLEKFDK FKHLKTEAEM KASEDLKKHG NTVLTALGGI LKKKGH...H +MYG_HORSE HPETLEKFDK FKHLKTEAEM KASEDLKKHG TVVLTALGGI LKKKGH...H +MYG_LYCPI HPETLDKFDK FKHLKTEDEM KGSEDLKKHG NTVLTALGGI LKKKGH...H +MYG_MOUSE HPETLDKFDK FKNLKSEEDM KGSEDLKKHG CTVLTALGTI LKKKGQ...H +MYG_MUSAN YPESQNHFPK FKN.KSLGEL KDTADIKAQA DTVLSALGNI VKKKGS...H +MYG_PROGU HPETLEKFDK FKHLKAEDEM RASEELKKHG TTVLTALGGI LKKKGQ...H +MYG_SAISC HPETLEKFDK FKHLKSEDEM KASEELKKHG TTVLTALGGI LKKKGQ...H + + +GLB2_MORMR SMKLQELSVK HAQSFYVDRQ YFKVLAGII. ........AD TTAPGDAGFE +GLBZ_CHITH GKHVDALVAT H.KPRGVTHA QFNNFRAAFI AYLKGHV..D YTAAVEAAWG +HBA2_BOSMU PGALSELSDL HAHKLRVDPV NFKLLSHSLL VTLASHLPSD FTPAVHASLD +HBA2_GALCR PSALSALSDL HAHKLRVDPV NFKLLRHCLL VTLACHHPAE FTPAVHASLD +HBA4_SALIR FGFLTKLSEL HATKLRVDPT NFKILAHNLI VVIAAYFPAE FTPEIHLSVD +HBAD_CHLME SQALSELSNL HAYNLRVDPA NFKLLAQCFQ VVLATHLGKD YSPEMHAAFD +HBAD_PASMO SQALSELSNL HAYNLRVDPV NFKFLSQCLQ VSLATRLGKE YSPEVHSAVD +HBAZ_HORSE AGALAKLSEL HAYILRVDPV NFKFLSHCLL VTLASRLPAD FTADAHAAWD +HBA_AILME PGALSALSDL HAHKLRVDPV NFKLLSHCLL VTLASHHPAE FTPAVHASLD +HBA_ANSSE AGALSKLSDL HAQKLRVDPV NFKFLGHCFL VVLAIHHPSL LTPEVHASMD +HBA_COLLI AGALSKLSDL HAQKLRVDPV NFKLLGHCFL VVVAVHFPSL LTPEVHASLD +HBA_ERIEU PGALSALSDL HAHKLRVDPV NFKLLSHCLL VTLALHHPAD FTPAVHASLD +HBA_FRAPO AGTLSKLSDL HAHKLRVDPV NFKLLGQCFL VVVAIHHPSA LTPEVHASLD +HBA_MACFA PQALSALSDL HAHKLRVDPV NFKLLSHCLL VTLAAHLPAE FTPAVHASLD +HBA_MACSI PQALSALSDL HAHKLRVDPV NFKLLSHCLL VTLAAHLPAE FTPAVHASLD +HBA_MESAU PGALSALSDL HAHKLRVDPV NFKLLSHCLL VTLANHHPAD FTPAVHASLD +HBA_PAGLA PNALSALSDL HAYKLRVDPV NFKLLSHCLL VTLACHHPAE FTPAVHSALD +HBA_PHACO TGTLSKLSDL HAHKLRVDPV NFKLLGQCFL VVVAIHHPSA LTPEVHASLD +HBA_PONPY PNALSALSDL HAHKLRVDPV NFKLLSHCLL VTLAAHLPAE FTPAVHASLD +HBA_PROLO PGALSALSDL HAYKLRVDPV NFKLLSHCLL VTLACHHPAE FTPAVHASLD +HBA_TRIOC AGALSKLSDL HAQKLRVDPV NFKLLGQCFL VVVAIHHPSV LTPEVHASLD +HBB1_VAREX KDTFAKLSEL 
HCDKLHVDPT NFKLLGNVLV IVLADHHGKE FTPAHHAAYQ +HBB2_TRICR REYYADLSVT HSLKFYVDPE NFKLFSGIVI VCLALTLQTD FSCHKQLAFE +HBB2_XENTR KSHLKGLSKS HAEDLHVDPE NFKRLADVLV IVLAAKLGSA FTPQVQAVWE +HBBL_RANCA KGTLHDLSEE HANELHVDPE NFRRLGEVLI VVLGAKLGKA FSPQVQHVWE +HBB_CALAR KGTFAHLSEL HCDKLHVDPE NFRLLGNVLV CVLAHHFGKE FTPVVQAAYQ +HBB_COLLI KGTFAQLSEL HCDKLHVDPE NFRLLGDILV IILAAHFGKD FTPECQAAWQ +HBB_EQUHE KGTFAQLSEL HCDKLHVDPE NFRLLGNVLV VVLARHFGKD FTPELQASYQ +HBB_LARRI KNTFAQLSEL HCDKLHVDPE NFRLLGDILI IVLAAHFAKD FTPDSQAAWQ +HBB_MANSP KGTFAQLSEL HCDKLHVDPE NFKLLGNVLV CVLAHHFGKE FTPQVQAAYQ +HBB_ORNAN KGTFAKLSEL HCDKLHVDPE NFNRLGNVLI VVLARHFSKD FSPEVQAAWQ +HBB_RABIT KGTFAKLSEL HCDKLHVDPE NFRLLGNVLV IVLSHHFGKE FTPQVQAAYQ +HBB_SPECI KGTFASLSEL HCDKLHVDPE NFKLLGNMIV IVMAHHLGKD FTPEAQAAFQ +HBB_SPETO KGTFASLSEL HCDKLHVDPE NFKLLGNMIV IVMAHHLGKD FTPEAQAAFQ +HBB_SUNMU KGTFAKLSEL HCDKLHVDPE NFRLLGNVLV VVLASKFGKE FTPPVQAAFQ +HBB_TACAC KGTFAKLSEL HCDKLHVDPE NFNRLGNVLV VVLARHFSKE FTPEAQAAWQ +HBB_TRIIN KGAFAELSEL HCDKLHVDPE NFRLLGNVLV CVLARHFGKE FSPEAQAAYQ +HBB_TUPGL KGTFAKLSEL HCDKLHVDPE NFRLLGNVLV RVLACNFGPE FTPQVQAAFQ +HBB_URSMA KGTFAKLSEL HCDKLHVDPE NFKLLGNVLV CVLAHHFGKE FTPQVQAAYQ +HBE_PONPY KTTFAKLSEL HCDKLHVDPE NFKLLGNVMV IILATHFGKE FTPEVQAAWQ +HBF1_URECA GALMKAKVPS H.DAMGITPK HFGQLLKLVG GVFQEEF..S ADPTTVAAWG +LGB1_PEA VLGNATLGAI HVQKGVTNP. HFVVVKEALL QTIKKASGNN WSEELNTAWE +LGB1_VICFA VLDGKD.GSI HIQKGVLDP. HFVVVKEALL KTIKEASGDK WSEELSAAWE +MYG_ESCGI EAELKPLAQS HATKHKIPIK YLEFISDAII HVLHSRHPGD FGADAQAAMN +MYG_HORSE EAELKPLAQS HATKHKIPIK YLEFISDAII HVLHSKHPGN FGADAQGAMT +MYG_LYCPI EAELKPLAQS HATKHKIPVK YLEFISDAII QVLQNKHSGD FHADTEAAMK +MYG_MOUSE AAEIQPLAQS HATKHKIPVK YLEFISEIII EVLKKRHSGD FGADAQGAMS +MYG_MUSAN SQPVKALAAT HITTHKIPPH YFTKITTIAV DVLSEMYPSE MNAQVQAAFS +MYG_PROGU AAELAPLAQS HATKHKIPVK YLEFISEAII QVLQSKHPGD FGADAQGAMS +MYG_SAISC EAELKPLAQS HATKHKIPVK YLELISDAIV HVLQKKHPGD FGADAQGAMK + + +GLB2_MORMR KLMSMICILL SSAY...... . +GLBZ_CHITH ATFDAFFGAV FAK....... 
M +HBA2_BOSMU KFLANVSTVL TSKYR..... . +HBA2_GALCR KFMASVSTVL TSKYR..... . +HBA4_SALIR KFLQQLALAL AEKYR..... . +HBAD_CHLME KFLSAVAAVL AEKYR..... . +HBAD_PASMO KFMSAVASVL AEKYR..... . +HBAZ_HORSE KFLSIVSSVL TEKYR..... . +HBA_AILME KFFSAVSTVL TSKYR..... . +HBA_ANSSE KFLCAVATVL TAKYR..... . +HBA_COLLI KFVLAVGTVL TAKYR..... . +HBA_ERIEU KFLATVATVL TSKYR..... . +HBA_FRAPO KFLCAVGNVL TAKYR..... . +HBA_MACFA KFLASVSTVL TSKYR..... . +HBA_MACSI KFLASVSTVL TSKYR..... . +HBA_MESAU KFFASVSTVL TSKYR..... . +HBA_PAGLA KFFSAVSTVL TSKYR..... . +HBA_PHACO KFLCAVGTVL TAKYR..... . +HBA_PONPY KFLASVSTVL TSKYR..... . +HBA_PROLO KFFTSVSTVL TSKYR..... . +HBA_TRIOC KFLCAVGNVL SAKYR..... . +HBB1_VAREX KLVNVVSHSL ARRYH..... . +HBB2_TRICR KLMKGVSHAL GHGY...... . +HBB2_XENTR KLNATLVAAL SHGYF..... . +HBBL_RANCA KFIAVLVDAL SHSYH..... . +HBB_CALAR KVVAGVANAL AHKYH..... . +HBB_COLLI KLVRVVAHAL ARKYH..... . +HBB_EQUHE KVVAGVANAL AHKYH..... . +HBB_LARRI KLVRVVAHAL ARKYH..... . +HBB_MANSP KVVAGVANAL AHKYH..... . +HBB_ORNAN KLVSGVAHAL GHKYH..... . +HBB_RABIT KVVAGVANAL AHKYH..... . +HBB_SPECI KVVAGVANAL AHKYH..... . +HBB_SPETO KVVAGVANAL SHKYH..... . +HBB_SUNMU KVVAGVANAL AHKYH..... . +HBB_TACAC KLVSGVSHAL AHKYH..... . +HBB_TRIIN KVVAGVANAL AHKYH..... . +HBB_TUPGL KVVAGVANAL AHKYH..... . +HBB_URSMA KVVAGVANAL AHKYH..... . +HBE_PONPY KLVSAVAIAL AHKYH..... . +HBF1_URECA DAAGVLVAAM .......... K +LGB1_PEA VAYDGLATAI KKAMKT.... A +LGB1_VICFA VAYDGLATAI K....A.... 
A +MYG_ESCGI KALELFRKDI AAKYKELGFQ G +MYG_HORSE KALELFRNDI AAKYKELGFQ G +MYG_LYCPI KALELFRNDI AAKYKELGFQ G +MYG_MOUSE KALELFRNDI AAKYKELGFQ G +MYG_MUSAN GAFKIICSDI EKEYKAANFQ G +MYG_PROGU KALELFRNDI AAKYKELGFQ G +MYG_SAISC KALELFRNDM AAKYKELGFQ G + diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/phylip b/forester/archive/RIO/others/hmmer/squid/Formats/phylip new file mode 100644 index 0000000..32646bd --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/phylip @@ -0,0 +1,204 @@ + 50 171 +GLB2_MORMR...PIVD..SGSVSPLSDAEKNKIRAAW.DIVYKNYEKNGVDILVKFFTG +GLBZ_CHITHMKFIILALCVAAASALSGDQIGLVQST.YGKVKG....DSVGILYAVFKA +HBA2_BOSMU...V...........LSAADKGNVKAAW.GKVGGHAAEYGAEALERMFLS +HBA2_GALCR...V...........LSPTDKSNVKAAW.EKVGAHAGDYGAEALERMFLS +HBA4_SALIR...S...........LSAKDKANVKAIW.GKILPKSDEIGEQALSRMLVV +HBAD_CHLME...M...........LTADDKKLLTQLW.EKVAGHQEEFGSEALQRMFLT +HBAD_PASMO...M...........LTAEDKKLIQQIW.GKLGGAEEEIGADALWRMFHS +HBAZ_HORSE...S...........LTKAERTMVVSIW.GKISMQADAVGTEALQRLFSS +HBA_AILME ...V...........LSPADKTNVKATW.DKIGGHAGEYGGEALERTFAS +HBA_ANSSE ...V...........LSAADKGNVKTVF.GKIGGHAEEYGAETLQRMFQT +HBA_COLLI ...V...........LSANDKSNVKAVF.AKIGGQAGDLGGEALERLFIT +HBA_ERIEU ...V...........LSATDKANVKTFW.GKLGGHGGEYGGEALDRMFQA +HBA_FRAPO ...V...........LSAADKNNVKGIF.GKISSHAEDYGAEALERMFIT +HBA_MACFA ...V...........LSPADKTNVKAAW.GKVGGHAGEYGAEALERMFLS +HBA_MACSI ...V...........LSPADKTNVKDAW.GKVGGHAGEYGAEALERMFLS +HBA_MESAU ...V...........LSAKDKTNISEAW.GKIGGHAGEYGAEALERMFFV +HBA_PAGLA ...V...........LSSADKNNIKATW.DKIGSHAGEYGAEALERTFIS +HBA_PHACO ...V...........LSAADKNNVKGIF.TKIAGHAEEYGAEALERMFIT +HBA_PONPY ...V...........LSPADKTNVKTAW.GKVGAHAGDYGAEALERMFLS +HBA_PROLO ...V...........LSPADKANIKATW.DKIGGHAGEYGGEALERTFAS +HBA_TRIOC ...V...........LSANDKTNVKTVF.TKITGHAEDYGAETLERMFIT +HBB1_VAREX...V..........HWTAEEKQLICSLW.GKI..DVGLIGGETLAGLLVI +HBB2_TRICR...V..........HLTAEDRKEIAAIL.GKV..NVDSLGGQCLARLIVV 
+HBB2_XENTR...V..........HWTAEEKATIASVW.GKV..DIEQDGHDALSRLLVV +HBBL_RANCA...V..........HWTAEEKAVINSVW.QKV..DVEQDGHEALTRLFIV +HBB_CALAR ...V..........HLTGEEKSAVTALW.GKV..NVDEVGGEALGRLLVV +HBB_COLLI ...V..........HWSAEEKQLITSIW.GKV..NVADCGAEALARLLIV +HBB_EQUHE ...V..........QLSGEEKAAVLALW.DKV..NEEEVGGEALGRLLVV +HBB_LARRI ...V..........HWSAEEKQLITGLW.GKV..NVADCGAEALARLLIV +HBB_MANSP ...V..........HLTPEEKTAVTTLW.GKV..NVDEVGGEALGRLLVV +HBB_ORNAN ...V..........HLSGGEKSAVTNLW.GKV..NINELGGEALGRLLVV +HBB_RABIT ...V..........HLSSEEKSAVTALW.GKV..NVEEVGGEALGRLLVV +HBB_SPECI ...V..........HLSDGEKNAISTAW.GKV..HAAEVGAEALGRLLVV +HBB_SPETO ...V..........HLTDGEKNAISTAW.GKV..NAAEIGAEALGRLLVV +HBB_SUNMU ...V..........HLSGEEKACVTGLW.GKV..NEDEVGAEALGRLLVV +HBB_TACAC ...V..........HLSGSEKTAVTNLW.GHV..NVNELGGEALGRLLVV +HBB_TRIIN ...V..........HLTPEEKALVIGLW.AKV..NVKEYGGEALGRLLVV +HBB_TUPGL ...V..........HLSGEEKAAVTGLW.GKV..DLEKVGGQSLGSLLIV +HBB_URSMA ...V..........HLTGEEKSLVTGLW.GKV..NVDEVGGEALGRLLVV +HBE_PONPY ...V..........HFTAEEKAAVTSLW.SKM..NVEEAGGEALGRLLVV +HBF1_URECA..............GLTTAQIKAIQDHWFLNIKGCLQAAADSIFFKYLTA +LGB1_PEA GFTDKQEALVNSSSE.FKQNLPGYSILFYTIVLEKAP..AAKGL...... +LGB1_VICFAGFTEKQEALVNSSSQLFKQNPSNYSVLFYTIILQKAP..TAKAM...... 
+MYG_ESCGI ...V...........LSDAEWQLVLNIW.AKVEADVAGHGQDILIRLFKG +MYG_HORSE ...G...........LSDGEWQQVLNVW.GKVEADIAGHGQEVLIRLFTG +MYG_LYCPI ...G...........LSDGEWQIVLNIW.GKVETDLAGHGQEVLIRLFKN +MYG_MOUSE ...G...........LSDGEWQLVLNVW.GKVEADLAGHGQEVLIGLFKT +MYG_MUSAN ..................VDWEKVNSVW.SAVESDLTAIGQNILLRLFEQ +MYG_PROGU ...G...........LSDGEWQLVLNVW.GKVEGDLSGHGQEVLIRLFKG +MYG_SAISC ...G...........LSDGEWQLVLNIW.GKVEADIPSHGQEVLISLFKG + +TPAAQAFFPKFKGLTTADALKKSSDVRWHAERIINAVNDAVKSMDDTEKM +DPTIQAAFPQFVGK.DLDAIKGGAEFSTHAGRIVGFLGGVIDDLP...NI +FPTTKTYFPHFD.LSH.....GSAQVKGHGAKVAAALTKAVGHLDD...L +FPTTKTYFPHFD.LSH.....GSTQVKGHGKKVADALTNAVLHVDD...M +YPQTKAYFSHWASVAP.....GSAPVKKHGITIMNQIDDCVGHMDD...L +YPQTKTYFPHFD.LHP.....GSEQVRGHGKKVAAALGNAVKSLDN...L +YPSTKTYFPHFD.LSQ.....GSDQIRGHGKKVVAALSNAIKNLDN...L +YPQTKTYFPHFD.LHE.....GSPQLRAHGSKVAAAVGDAVKSIDN...V +FPTTKTYFPHFD.LSP.....GSAQVKAHGKKVADALTTAVGHLDD...L +FPQTKTYFPHFD.LQP.....GSAQIKAHGKKVAAALVEAANHIDD...I +YPQTKTYFPHFD.LSH.....GSAQIKGHGKKVAEALVEAANHIDD...I +HPTTKTYFPHFD.LNP.....GSAQVKGHGKKVADALTTAVNNLDD...V +YPSTKTYFPHFD.LSH.....GSAQVKGHGKKVVAALIEAANHIDD...I +FPTTKTYFPHFD.LSH.....GSAQVKGHGKKVADALTLAVGHVDD...M +FPTTKTYFPHFD.LSH.....GSAQVKGHGKKVADALTLAVGHVDD...M +YPTTKTYFPHFD.VSH.....GSAQVKGHGKKVADALTNAVGHLDD...L +FPTTKTYFPHFD.LSH.....GSAQVKAHGKKVADALTLAVGHLED...L +YPSTKTYFPHFD.LSH.....GSAQIKGHGKKVVAALIEAVNHIDD...I +FPTTKTYFPHFD.LSH.....GSAQVKDHGKKVADALTNAVAHVDD...M +FPTTKTYFPHFD.LSP.....GSAQVKAHGKKVADALTLAVGHLDD...L +YPPTKTYFPHFD.LHH.....GSAQIKAHGKKVVGALIEAVNHIDD...I +YPWTQRQFSHFGNLSSPTAIAGNPRVKAHGKKVLTSFGDAIKNLDN...I +NPWSRRYFHDFGDLSSCDAICRNPKVLAHGAKVMRSIVEATKHLDN...L +YPWTQRYFSSFGNLSNVSAVSGNVKVKAHGNKVLSAVGSAIQHLDD...V +YPWTQRYFSTFGDLSSPAAIAGNPKVHAHGKKILGAIDNAIHNLDD...V +YPWTQRFFESFGDLSTPDAVMNNPKVKAHGKKVLGAFSDGLTHLDN...L +YPWTQRFFSSFGNLSSATAISGNPNVKAHGKKVLTSFGDAVKNLDN...I +YPWTQRFFDSFGDLSNPAAVMGNPKVKAHGKKVLHSFGEGVHHLDN...L +YPWTQRFFASFGNLSSPTAINGNPMVRAHGKKVLTSFGEAVKNLDN...I +YPWTQRFFDSFGDLSSPDAVMGNPKVKAHGKKVLGAFSDGLNHLDN...L 
+YPWTQRFFEAFGDLSSAGAVMGNPKVKAHGAKVLTSFGDALKNLDD...L +YPWTQRFFESFGDLSSANAVMNNPKVKAHGKKVLAAFSEGLSHLDN...L +YPWTQRFFDSFGDLSSASAVMGNAKVKAHGKKVIDSFSNGLKHLDN...L +YPWTQRFFDSFGDLSSASAVMGNAKVKAHGKKVIDSFSNGLKHLDN...L +YPWTQRFFDSFGDLSSASAVMGNPKVKAHGKKVLHSLGEGVANLDN...L +YPWTQRFFESFGDLSSADAVMGNAKVKAHGAKVLTSFGDALKNLDN...L +YPWTQRFFEHFGDLSSASAIMNNPKVKAHGEKVFTSFGDGLKHLED...L +YPWTQRFFDSFGDLSSPSAVMSNPKVKAHGKKVLTSFSDGLNHLDN...L +YPWTQRFFDSFGDLSSADAIMNNPKVKAHGKKVLNSFSDGLKNLDN...L +YPWTQRFFDSFGNLSSPSAILGNPKVKAHGKKVLTSFGDAIKNMDN...L +YPGDLAFFHKFSSV.PLYGLRSNPAYKAQTLTVINYLDKVVDALGG..NA +..........FSFLKDTAGVEDSPKLQAHAEQVFGLVRDSAAQLRTKGEV +..........FSFLKDSAGVVDSPKLGAHAEKVFGMVRDSAVQLRATGEV +HPETLEKFDKFKHLKTEAEMKASEDLKKHGNTVLTALGGILKKKGH...H +HPETLEKFDKFKHLKTEAEMKASEDLKKHGTVVLTALGGILKKKGH...H +HPETLDKFDKFKHLKTEDEMKGSEDLKKHGNTVLTALGGILKKKGH...H +HPETLDKFDKFKNLKSEEDMKGSEDLKKHGCTVLTALGTILKKKGQ...H +YPESQNHFPKFKN.KSLGELKDTADIKAQADTVLSALGNIVKKKGS...H +HPETLEKFDKFKHLKAEDEMRASEELKKHGTTVLTALGGILKKKGQ...H +HPETLEKFDKFKHLKSEDEMKASEELKKHGTTVLTALGGILKKKGQ...H + +SMKLQELSVKHAQSFYVDRQYFKVLAGII.........ADTTAPGDAGFE +GKHVDALVATH.KPRGVTHAQFNNFRAAFIAYLKGHV..DYTAAVEAAWG +PGALSELSDLHAHKLRVDPVNFKLLSHSLLVTLASHLPSDFTPAVHASLD +PSALSALSDLHAHKLRVDPVNFKLLRHCLLVTLACHHPAEFTPAVHASLD +FGFLTKLSELHATKLRVDPTNFKILAHNLIVVIAAYFPAEFTPEIHLSVD +SQALSELSNLHAYNLRVDPANFKLLAQCFQVVLATHLGKDYSPEMHAAFD +SQALSELSNLHAYNLRVDPVNFKFLSQCLQVSLATRLGKEYSPEVHSAVD +AGALAKLSELHAYILRVDPVNFKFLSHCLLVTLASRLPADFTADAHAAWD +PGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLASHHPAEFTPAVHASLD +AGALSKLSDLHAQKLRVDPVNFKFLGHCFLVVLAIHHPSLLTPEVHASMD +AGALSKLSDLHAQKLRVDPVNFKLLGHCFLVVVAVHFPSLLTPEVHASLD +PGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLALHHPADFTPAVHASLD +AGTLSKLSDLHAHKLRVDPVNFKLLGQCFLVVVAIHHPSALTPEVHASLD +PQALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLD +PQALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLD +PGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLANHHPADFTPAVHASLD +PNALSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPAEFTPAVHSALD +TGTLSKLSDLHAHKLRVDPVNFKLLGQCFLVVVAIHHPSALTPEVHASLD 
+PNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLD +PGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPAEFTPAVHASLD +AGALSKLSDLHAQKLRVDPVNFKLLGQCFLVVVAIHHPSVLTPEVHASLD +KDTFAKLSELHCDKLHVDPTNFKLLGNVLVIVLADHHGKEFTPAHHAAYQ +REYYADLSVTHSLKFYVDPENFKLFSGIVIVCLALTLQTDFSCHKQLAFE +KSHLKGLSKSHAEDLHVDPENFKRLADVLVIVLAAKLGSAFTPQVQAVWE +KGTLHDLSEEHANELHVDPENFRRLGEVLIVVLGAKLGKAFSPQVQHVWE +KGTFAHLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPVVQAAYQ +KGTFAQLSELHCDKLHVDPENFRLLGDILVIILAAHFGKDFTPECQAAWQ +KGTFAQLSELHCDKLHVDPENFRLLGNVLVVVLARHFGKDFTPELQASYQ +KNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFAKDFTPDSQAAWQ +KGTFAQLSELHCDKLHVDPENFKLLGNVLVCVLAHHFGKEFTPQVQAAYQ +KGTFAKLSELHCDKLHVDPENFNRLGNVLIVVLARHFSKDFSPEVQAAWQ +KGTFAKLSELHCDKLHVDPENFRLLGNVLVIVLSHHFGKEFTPQVQAAYQ +KGTFASLSELHCDKLHVDPENFKLLGNMIVIVMAHHLGKDFTPEAQAAFQ +KGTFASLSELHCDKLHVDPENFKLLGNMIVIVMAHHLGKDFTPEAQAAFQ +KGTFAKLSELHCDKLHVDPENFRLLGNVLVVVLASKFGKEFTPPVQAAFQ +KGTFAKLSELHCDKLHVDPENFNRLGNVLVVVLARHFSKEFTPEAQAAWQ +KGAFAELSELHCDKLHVDPENFRLLGNVLVCVLARHFGKEFSPEAQAAYQ +KGTFAKLSELHCDKLHVDPENFRLLGNVLVRVLACNFGPEFTPQVQAAFQ +KGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFGKEFTPQVQAAYQ +KTTFAKLSELHCDKLHVDPENFKLLGNVMVIILATHFGKEFTPEVQAAWQ +GALMKAKVPSH.DAMGITPKHFGQLLKLVGGVFQEEF..SADPTTVAAWG +VLGNATLGAIHVQKGVTNP.HFVVVKEALLQTIKKASGNNWSEELNTAWE +VLDGKD.GSIHIQKGVLDP.HFVVVKEALLKTIKEASGDKWSEELSAAWE +EAELKPLAQSHATKHKIPIKYLEFISDAIIHVLHSRHPGDFGADAQAAMN +EAELKPLAQSHATKHKIPIKYLEFISDAIIHVLHSKHPGNFGADAQGAMT +EAELKPLAQSHATKHKIPVKYLEFISDAIIQVLQNKHSGDFHADTEAAMK +AAEIQPLAQSHATKHKIPVKYLEFISEIIIEVLKKRHSGDFGADAQGAMS +SQPVKALAATHITTHKIPPHYFTKITTIAVDVLSEMYPSEMNAQVQAAFS +AAELAPLAQSHATKHKIPVKYLEFISEAIIQVLQSKHPGDFGADAQGAMS +EAELKPLAQSHATKHKIPVKYLELISDAIVHVLQKKHPGDFGADAQGAMK + +KLMSMICILLSSAY....... +ATFDAFFGAVFAK.......M +KFLANVSTVLTSKYR...... +KFMASVSTVLTSKYR...... +KFLQQLALALAEKYR...... +KFLSAVAAVLAEKYR...... +KFMSAVASVLAEKYR...... +KFLSIVSSVLTEKYR...... +KFFSAVSTVLTSKYR...... +KFLCAVATVLTAKYR...... +KFVLAVGTVLTAKYR...... +KFLATVATVLTSKYR...... +KFLCAVGNVLTAKYR...... +KFLASVSTVLTSKYR...... 
+KFLASVSTVLTSKYR...... +KFFASVSTVLTSKYR...... +KFFSAVSTVLTSKYR...... +KFLCAVGTVLTAKYR...... +KFLASVSTVLTSKYR...... +KFFTSVSTVLTSKYR...... +KFLCAVGNVLSAKYR...... +KLVNVVSHSLARRYH...... +KLMKGVSHALGHGY....... +KLNATLVAALSHGYF...... +KFIAVLVDALSHSYH...... +KVVAGVANALAHKYH...... +KLVRVVAHALARKYH...... +KVVAGVANALAHKYH...... +KLVRVVAHALARKYH...... +KVVAGVANALAHKYH...... +KLVSGVAHALGHKYH...... +KVVAGVANALAHKYH...... +KVVAGVANALAHKYH...... +KVVAGVANALSHKYH...... +KVVAGVANALAHKYH...... +KLVSGVSHALAHKYH...... +KVVAGVANALAHKYH...... +KVVAGVANALAHKYH...... +KVVAGVANALAHKYH...... +KLVSAVAIALAHKYH...... +DAAGVLVAAM..........K +VAYDGLATAIKKAMKT....A +VAYDGLATAIK....A....A +KALELFRKDIAAKYKELGFQG +KALELFRNDIAAKYKELGFQG +KALELFRNDIAAKYKELGFQG +KALELFRNDIAAKYKELGFQG +GAFKIICSDIEKEYKAANFQG +KALELFRNDIAAKYKELGFQG +KALELFRNDMAAKYKELGFQG diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/pir b/forester/archive/RIO/others/hmmer/squid/Formats/pir new file mode 100644 index 0000000..0999703 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/pir @@ -0,0 +1,249 @@ + + P R O T E I N S E Q U E N C E D A T A B A S E + of PIR-International + + Section 1. Fully Classified Entries + Release 63.00, December 30, 1999 + 20032 sequences, 7820966 residues + + Protein Information Resource (PIR)* + National Biomedical Research Foundation + 3900 Reservoir Road, N.W., + Washington, DC 20007, USA + +International Protein Information Munich Information Center for + Database in Japan (JIPID) Protein Sequences (MIPS) + Science University of Tokyo GSF-Forschungszentrum f. Umwelt und Gesundheit + 2669 Yamazaki, Noda 278, Japan am Max-Planck-Instut f. Biochemie + Am Klopferspitz 18, D-82152 Martinsried, FRG + + This database may be redistributed without prior consent, provided that + this notice be given to each user and that the words "Derived from" shall + precede this notice if the database has been altered by the redistributor. + + *PIR is a registered mark of NBRF. 
+ + +\\\ +ENTRY CCHU #type complete +TITLE cytochrome c - human +ORGANISM #formal_name Homo sapiens #common_name man +DATE 24-Apr-1984 #sequence_revision 30-Sep-1991 #text_change + 28-Jun-1999 +ACCESSIONS A31764; A05676; I55192; A00001 +REFERENCE A31764 + #authors Evans, M.J.; Scarpulla, R.C. + #journal Proc. Natl. Acad. Sci. U.S.A. (1988) 85:9625-9629 + #title The human somatic cytochrome c gene: two classes of processed + pseudogenes demarcate a period of rapid molecular + evolution. + #cross-references MUID:89071748 + #accession A31764 + ##molecule_type DNA + ##residues 1-105 ##label EVA + ##cross-references GB:M22877; NID:g181241; PIDN:AAA35732.1; PID:g181242 +REFERENCE A05676 + #authors Matsubara, H.; Smith, E.L. + #journal J. Biol. Chem. (1963) 238:2732-2753 + #title Human heart cytochrome c. Chymotryptic peptides, tryptic + peptides, and the complete amino acid sequence. + #accession A05676 + ##molecule_type protein + ##residues 2-28;29-46;47-100;101-105 ##label MATS +REFERENCE A00001 + #authors Matsubara, H.; Smith, E.L. + #journal J. Biol. Chem. (1962) 237:3575-3576 + #title The amino acid sequence of human heart cytochrome c. + #contents annotation + #note 66-Leu is found in 10% of the molecules in pooled protein +REFERENCE I55192 + #authors Tanaka, Y.; Ashikari, T.; Shibano, Y.; Amachi, T.; Yoshizumi, + H.; Matsubara, H. + #journal J. Biochem. (1988) 103:954-961 + #title Construction of a human cytochrome c gene and its functional + expression in Saccharomyces cerevisiae. 
+ #cross-references MUID:89008207 + #accession I55192 + ##status translated from GB/EMBL/DDBJ + ##molecule_type mRNA + ##residues 78-105 ##label RES + ##cross-references GB:D00265; NID:g2897691; PIDN:BAA00187.1; + PID:d1000635; PID:g219557 +GENETICS + #introns 57/1 +CLASSIFICATION #superfamily cytochrome c; cytochrome c homology +KEYWORDS acetylated amino end; chromoprotein; electron transfer; heme; + iron; mitochondrion; oxidative phosphorylation; + polymorphism; respiratory chain +FEATURE + 2-105 #product cytochrome c #status experimental #label MAT\ + 5-99 #domain cytochrome c homology #label CYC\ + 2 #modified_site acetylated amino end (Gly) (in mature + form) #status experimental\ + 15,18 #binding_site heme (Cys) (covalent) #status + experimental\ + 19,81 #binding_site heme iron (His, Met) (axial ligands) + #status predicted +SUMMARY #length 105 #molecular-weight 11749 #checksum 3247 +SEQUENCE + 5 10 15 20 25 30 + 1 M G D V E K G K K I F I M K C S Q C H T V E K G G K H K T G + 31 P N L H G L F G R K T G Q A P G Y S Y T A A N K N K G I I W + 61 G E D T L M E Y L E N P K K Y I P G T K M I F V G I K K K E + 91 E R A D L I A Y L K K A T N E +/// +ENTRY CCCZ #type complete +TITLE cytochrome c - chimpanzee (tentative sequence) +ORGANISM #formal_name Pan troglodytes #common_name chimpanzee +DATE 17-Mar-1987 #sequence_revision 17-Mar-1987 #text_change + 25-Apr-1997 +ACCESSIONS A00002 +REFERENCE A94601 + #authors Needleman, S.B. + #submission submitted to the Atlas, October 1968 + #accession A00002 + ##molecule_type protein + ##residues 1-104 ##label NEE +REFERENCE A94455 + #authors Needleman, S.B.; Margoliash, E. + #citation unpublished results, 1966, cited by Margoliash, E., and + Fitch, W.M., Ann. N.Y. Acad. Sci. 
151, 359-381, 1968 + #contents annotation; compositions of chymotryptic peptides +CLASSIFICATION #superfamily cytochrome c; cytochrome c homology +KEYWORDS acetylated amino end; chromoprotein; electron transfer; heme; + iron; mitochondrion; oxidative phosphorylation; respiratory + chain +FEATURE + 4-98 #domain cytochrome c homology #label CYC\ + 1 #modified_site acetylated amino end (Gly) #status + predicted\ + 14,17 #binding_site heme (Cys) (covalent) #status predicted\ + 18,80 #binding_site heme iron (His, Met) (axial ligands) + #status predicted +SUMMARY #length 104 #molecular-weight 11617 #checksum 9501 +SEQUENCE + 5 10 15 20 25 30 + 1 G D V E K G K K I F I M K C S Q C H T V E K G G K H K T G P + 31 N L H G L F G R K T G Q A P G Y S Y T A A N K N K G I I W G + 61 E D T L M E Y L E N P K K Y I P G T K M I F V G I K K K E E + 91 R A D L I A Y L K K A T N E +/// +ENTRY CCMQR #type complete +TITLE cytochrome c - rhesus macaque (tentative sequence) +ORGANISM #formal_name Macaca mulatta #common_name rhesus macaque +DATE 17-Mar-1987 #sequence_revision 17-Mar-1987 #text_change + 25-Apr-1997 +ACCESSIONS A00003 +REFERENCE A00003 + #authors Rothfus, J.A.; Smith, E.L. + #journal J. Biol. Chem. (1965) 240:4277-4283 + #title Amino acid sequence of rhesus monkey heart cytochrome c. 
+ #cross-references MUID:66045191 + #contents compositions of chymotryptic peptides; sequences of residues + 55-61 and 68-70 + #accession A00003 + ##molecule_type protein + ##residues 1-104 ##label ROT +CLASSIFICATION #superfamily cytochrome c; cytochrome c homology +KEYWORDS acetylated amino end; chromoprotein; electron transfer; heme; + iron; mitochondrion; oxidative phosphorylation; respiratory + chain +FEATURE + 4-98 #domain cytochrome c homology #label CYC\ + 1 #modified_site acetylated amino end (Gly) #status + experimental\ + 14,17 #binding_site heme (Cys) (covalent) #status predicted\ + 18,80 #binding_site heme iron (His, Met) (axial ligands) + #status predicted +SUMMARY #length 104 #molecular-weight 11605 #checksum 9512 +SEQUENCE + 5 10 15 20 25 30 + 1 G D V E K G K K I F I M K C S Q C H T V E K G G K H K T G P + 31 N L H G L F G R K T G Q A P G Y S Y T A A N K N K G I T W G + 61 E D T L M E Y L E N P K K Y I P G T K M I F V G I K K K E E + 91 R A D L I A Y L K K A T N E +/// +ENTRY CCMKP #type complete +TITLE cytochrome c - spider monkey +ORGANISM #formal_name Ateles sp. #common_name spider monkey +DATE 17-Dec-1982 #sequence_revision 17-Dec-1982 #text_change + 25-Apr-1997 +ACCESSIONS A00004 +REFERENCE A00004 + #authors Margoliash, E. 
+ #citation unpublished results, cited by Shelnutt, J.A., Rousseau, D.L., + Dethmers, J.K., and Margoliash, E., Biochemistry 20, + 6485-6497, 1981 + #accession A00004 + ##molecule_type protein + ##residues 1-104 ##label MAR +CLASSIFICATION #superfamily cytochrome c; cytochrome c homology +KEYWORDS acetylated amino end; chromoprotein; electron transfer; heme; + iron; mitochondrion; oxidative phosphorylation; respiratory + chain +FEATURE + 4-98 #domain cytochrome c homology #label CYC\ + 1 #modified_site acetylated amino end (Gly) #status + predicted\ + 14,17 #binding_site heme (Cys) (covalent) #status predicted\ + 18,80 #binding_site heme iron (His, Met) (axial ligands) + #status predicted +SUMMARY #length 104 #molecular-weight 11710 #checksum 9066 +SEQUENCE + 5 10 15 20 25 30 + 1 G D V F K G K R I F I M K C S Q C H T V E K G G K H K T G P + 31 N L H G L F G R K T G Q A S G F T Y T E A N K N K G I I W G + 61 E D T L M E Y L E N P K K Y I P G T K M I F V G I K K K E E + 91 R A D L I A Y L K K A T N E +/// +ENTRY CCMS #type complete +TITLE cytochrome c - mouse +ORGANISM #formal_name Mus musculus #common_name house mouse +DATE 31-Dec-1990 #sequence_revision 30-Sep-1991 #text_change + 11-Jun-1999 +ACCESSIONS A23057; A04604; A00009 +REFERENCE A23057 + #authors Limbach, K.J.; Wu, R. + #journal Nucleic Acids Res. (1985) 13:617-630 + #title Characterization of a mouse somatic cytochrome c gene and + three cytochrome c pseudogenes. + #cross-references MUID:85215501 + #accession A23057 + ##molecule_type DNA + ##residues 1-105 ##label LIM + ##cross-references EMBL:X01756; NID:g50618; PIDN:CAA25899.1; PID:g50619 + ##experimental_source strain BALB/c +REFERENCE A04604 + #authors Carlson, S.S.; Mross, G.A.; Wilson, A.C.; Mead, R.T.; Wolin, + L.D.; Bowers, S.F.; Foley, N.T.; Muijsers, A.O.; + Margoliash, E. + #journal Biochemistry (1977) 16:1437-1442 + #title Primary structure of mouse, rat, and guinea pig cytochrome c. 
+ #cross-references MUID:77134768 + #accession A04604 + ##molecule_type protein + ##residues 2-105 ##label CAR + ##experimental_source strain BALB/c +GENETICS + #introns 57/1 +CLASSIFICATION #superfamily cytochrome c; cytochrome c homology +KEYWORDS acetylated amino end; chromoprotein; electron transfer; heme; + iron; mitochondrion; oxidative phosphorylation; respiratory + chain +FEATURE + 2-105 #product cytochrome c #status experimental #label MAT\ + 5-99 #domain cytochrome c homology #label CYC\ + 2 #modified_site acetylated amino end (Gly) (in mature + form) #status experimental\ + 15,18 #binding_site heme (Cys) (covalent) #status + experimental\ + 19,81 #binding_site heme iron (His, Met) (axial ligands) + #status predicted +SUMMARY #length 105 #molecular-weight 11605 #checksum 1273 +SEQUENCE + 5 10 15 20 25 30 + 1 M G D V E K G K K I F V Q K C A Q C H T V E K G G K H K T G + 31 P N L H G L F G R K T G Q A A G F S Y T D A N K N K G I T W + 61 G E D T L M E Y L E N P K K Y I P G T K M I F A G I K K K G + 91 E R A D L I A Y L K K A T N E +/// diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/regression.dat b/forester/archive/RIO/others/hmmer/squid/Formats/regression.dat new file mode 100644 index 0000000..5743930 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/regression.dat @@ -0,0 +1,20 @@ +# filename format type nseq nres short long autodetect? alignment? singleseq? 
+# -------- -------- ------- ---- ---- ----- ---- ---------- ---------- ---------- +fasta fasta Protein 3 1730 356 949 yes no no +genbank genbank DNA 6 3773 132 1771 yes no no +embl embl DNA 2 1868 233 1635 yes no no +swissprot embl Protein 6 2779 75 924 yes no no +gcg gcg DNA 1 9718 9718 9718 yes no yes +gcgdata.1 gcgdata Protein 20 7100 29 1299 yes no no +gcgdata.2 gcgdata DNA 7 5867 105 2886 yes no no +pir pir Protein 5 522 104 105 yes no no +stockholm.1 stockholm Protein 7 266 38 38 yes yes no +stockholm.2 stockholm Protein 16 3769 227 239 yes yes no +msf msf Protein 50 7251 141 153 yes yes no +clustal clustal DNA 20 1518 75 80 yes yes no +selex.1 selex RNA 11 201 11 29 yes yes no +selex.2 selex RNA 11 201 11 29 yes yes no +phylip phylip Protein 50 7251 141 153 yes yes no +a2m a2m Protein 50 7251 141 153 no yes no + + diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/selex.1 b/forester/archive/RIO/others/hmmer/squid/Formats/selex.1 new file mode 100644 index 0000000..bf2f905 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/selex.1 @@ -0,0 +1,37 @@ +#=ID r17 +#=AC PF99999 +#=DE Test of extended SELEX format +#=AU SRE, Tue Dec 1 19:17:48 1998 +#=GA -1.0 -2.0 +#=TC -3.0 -4.0 +#=NC -5.0 -6.0 + +# Derived from r17.slx +#tag name weight source src acc # from,to,olen description +#=SQ 28 1.0000 IDENT: ACCESSION 1..29::29 Sequence one +#=SQ longname 102.0000 SWISS NUMBER; 3..19::100 A long name +#=SQ 2 2.0000 OR OR 0..0::0 two +#=SQ 3 3.0000 PIR - 0..0::0 three +#=SQ 4 4.0000 OR FOR 0..0::0 - means no data +#=SQ 5 5.0000 WHATEVER MISSING 0..0::0 - +#=SQ 6 6.0000 - DATA 0..0::0 foo +#=SQ 7 7 - - 0..0::0 bar +#=SQ 8 8.0000 - - 0..0::0 baz +#=SQ 9 9.0000 - - 0..0::0 a description of several words. 
+#=SQ 10 10.0000 - - 0..0::0 - + +#=RF xxxxxxx xxxx xxxxxx +#=CS >>>>+>> ^^^^ <<<<<< +28 gGAGUAAGAUAGC AUCA GCAUCUUGUUCC +#=SS +++++>>>>>+>> ^^^^ <<<<<<<+++++ +longname GUUCACC AUCA GGGGAc +#=SS >>>>+>> ^^^^ <<<<<< +2 AUGGAUGCGCACC AUCA GGGCGUaucuau +3 GAUCACC AUCA GGGauc +4 GGUCACC AUCA GGGauc +5 GGACACC AUCA GGGucu +6 CACC AUCA GGG +7 GAUCACC AUCA GGGauc +8 CUCACC AUCA GGGGG +9 AUGCACC AUCA GGGCAU +10 CUCACC AUCA GGGGG diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/selex.2 b/forester/archive/RIO/others/hmmer/squid/Formats/selex.2 new file mode 100644 index 0000000..1f8f6c7 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/selex.2 @@ -0,0 +1,11 @@ +28 gGAGUAAGAUAGC AUCA GCAUCUUGUUCC +longname GUUCACC AUCA GGGGAc +2 AUGGAUGCGCACC AUCA GGGCGUaucuau +3 GAUCACC AUCA GGGauc +4 GGUCACC AUCA GGGauc +5 GGACACC AUCA GGGucu +6 CACC AUCA GGG +7 GAUCACC AUCA GGGauc +8 CUCACC AUCA GGGGG +9 AUGCACC AUCA GGGCAU +10 CUCACC AUCA GGGGG diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/stockholm.1 b/forester/archive/RIO/others/hmmer/squid/Formats/stockholm.1 new file mode 100644 index 0000000..c557d53 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/stockholm.1 @@ -0,0 +1,94 @@ +# STOCKHOLM 1.0 +# +# This is an example of a Stockholm multiple sequence alignment +# file. It is deliberately designed to exercise many of the +# features of Stockholm format, in order to test a parser. +# +#=GF ID 14-3-3 +#=GF AC PF00244 +#=GF DE 14-3-3 proteins +#=GF AU Finn RD +#=GF AL Clustalw +#=GF SE Prosite +#=GF GA 25 25 +#=GF TC 35.40 35.40 +#=GF NC 8.80 8.80 +#=GF BM hmmbuild -f HMM SEED +#=GF BM hmmcalibrate --seed 0 HMM +#=GF RN [1] +#=GF RM 95327195 +#=GF RT Structure of a 14-3-3 protein and implications for +#=GF RT coordination of multiple signalling pathways. +#=GF RA Xiao B, Smerdon SJ, Jones DH, Dodson GG, Soneji Y, Aitken +#=GF RA A, Gamblin SJ; +#=GF RL Nature 1995;376:188-191. 
+#=GF RN [2] +#=GF RM 95327196 +#=GF RT Crystal structure of the zeta isoform of the 14-3-3 +#=GF RT protein. +#=GF RA Liu D, Bienkowska J, Petosa C, Collier RJ, Fu H, Liddington +#=GF RA R; +#=GF RL Nature 1995;376:191-194. +#=GF DR PROSITE; PDOC00633; +#=GF DR SMART; 14_3_3; +#=GF DR PRINTS; PR00305; +#=GF SQ 119 + +#=GS 1431_ENTHI/4-239 WT 0.42 +#=GS seq1 WT 0.40 +#=GS seq2 WT 0.41 +#=GS seq3 WT 0.43 +#=GS seq4 WT 0.44 +#=GS seq5 WT 0.45 +#=GS seq6 WT 0.46 + +#=GS seq4 AC PF00001 +#=GS seq4 DE A description of seq4. + +#=GS seq1 NEWTAG foo +#=GS seq2 NEWTAG bar +#=GS seq3 NEWTAG baz + +#=GS seq3 TAG2 foo2 +#=GS seq4 TAG2 foo3 +#=GS seq5 TAG2 foo4 + +#=GC SS_cons xxxxxxxxxxxxxxxxxxx +#=GC SA_cons xxxxxxxxxxxxxxxxxxx +#=GC New_long_tag_thingie xxxxxxxxxxxxxxxxxxx +1431_ENTHI/4-239 ACDEFGHKLMNPQRSTVWY +#=GR seq1 SS ................... +#=GR seq1 SA 0000000000000000000 +seq1 ACDEFGHKLMNPQRSTVWY +seq2 ACDEFGHKLMNPQRSTVWY +seq3 ACDEFGHKLMNPQRSTVWY +seq4 ACDEFGHKLMNPQRSTVWY +seq5 ACDEFGHKLMNPQRSTVWY +seq6 ACDEFGHKLMNPQRSTVWY +#=GR seq6 SS ................... +#=GR seq6 SA 9999999999999999999 +#=GR seq6 Invented_tag ******************* + + +#=GC SS_cons xxxxxxxxxxxxxxxxxxx +#=GC SA_cons xxxxxxxxxxxxxxxxxxx +#=GC New_long_tag_thingie xxxxxxxxxxxxxxxxxxx +1431_ENTHI/4-239 ACDEFGHKLMNPQRSTVWY +#=GR seq1 SS ................... +#=GR seq1 SA 0000000000000000000 +seq1 ACDEFGHKLMNPQRSTVWY +seq2 ACDEFGHKLMNPQRSTVWY +seq3 ACDEFGHKLMNPQRSTVWY +seq4 ACDEFGHKLMNPQRSTVWY +seq5 ACDEFGHKLMNPQRSTVWY +seq6 ACDEFGHKLMNPQRSTVWY +#=GR seq6 SS ................... +#=GR seq6 SA 9999999999999999999 +#=GR seq6 Invented_tag ******************* + +# +# And here's some trailing comments, just to +# try to confuse a parser. 
+# + +// \ No newline at end of file diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/stockholm.2 b/forester/archive/RIO/others/hmmer/squid/Formats/stockholm.2 new file mode 100644 index 0000000..ba224d4 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/stockholm.2 @@ -0,0 +1,366 @@ +# STOCKHOLM 1.0 +#=GF ID 14-3-3 +#=GF AC PF00244 +#=GF DE 14-3-3 proteins +#=GF AU Finn RD +#=GF AL Clustalw +#=GF SE Prosite +#=GF GA 25 25 +#=GF TC 35.40 35.40 +#=GF NC 8.80 8.80 +#=GF BM hmmbuild -f HMM SEED +#=GF BM hmmcalibrate --seed 0 HMM +#=GF RN [1] +#=GF RM 95327195 +#=GF RT Structure of a 14-3-3 protein and implications for +#=GF RT coordination of multiple signalling pathways. +#=GF RA Xiao B, Smerdon SJ, Jones DH, Dodson GG, Soneji Y, Aitken A, +#=GF RA Gamblin SJ; +#=GF RL Nature 1995;376:188-191. +#=GF RN [2] +#=GF RM 95327196 +#=GF RT Crystal structure of the zeta isoform of the 14-3-3 protein. +#=GF RA Liu D, Bienkowska J, Petosa C, Collier RJ, Fu H, Liddington +#=GF RA R; +#=GF RL Nature 1995;376:191-194. +#=GF RN [3] +#=GF RM 96182649 +#=GF RT Interaction of 14-3-3 with signaling proteins is mediated by +#=GF RT the recognition of phosphoserine. +#=GF RA Muslin AJ, Tanner JW, Allen PM, Shaw AS; +#=GF RL Cell 1996;84:889-897. +#=GF RN [4] +#=GF RM 97424374 +#=GF RT The 14-3-3 protein binds its target proteins with a common +#=GF RT site located towards the C-terminus. +#=GF RA Ichimura T, Ito M, Itagaki C, Takahashi M, Horigome T, Omata +#=GF RA S, Ohno S, Isobe T +#=GF RL FEBS Lett 1997;413:273-276. +#=GF RN [5] +#=GF RM 96394689 +#=GF RT Molecular evolution of the 14-3-3 protein family. +#=GF RA Wang W, Shakes DC +#=GF RL J Mol Evol 1996;43:384-398. +#=GF RN [6] +#=GF RM 96300316 +#=GF RT Function of 14-3-3 proteins. +#=GF RA Jin DY, Lyu MS, Kozak CA, Jeang KT +#=GF RL Nature 1996;382:308-308. 
+#=GF DR PROSITE; PDOC00633; +#=GF DR SMART; 14_3_3; +#=GF DR PRINTS; PR00305; +#=GF SQ 16 +1431_ENTHI/4-239 REDCVYTAKLAEQSERYDEMVQCMKQVAEMEA...ELSIEERNLLSVAYKNVIGAKRASWRIISSLEQKEQAKG.NDKHVEIIKGYRAKIEKELSTCCDDVLKVIQENLLPKA..STSESKVFFKKMEGDYYRYFAEFTVDEKRKEVADKSLAAYTEATEISNAELAPTHPIRLGLALNFSVFYFEIMNDADKACQLAKQAFDDAIAKLDEVPENMYKDSTLIMQLLRDNLTLWTSDACDEE +#=GS 1431_ENTHI/4-239 AC P42648 +1432_ENTHI/4-238 REDLVYLSKLAEQSERYEEMVQYMKQVAEMGT...ELSVEERNLISVAYKNVVGSRRASWRIISSLEQKEQAKG.NTQRVELIKTYRAKIEQELSQKCDDVLKIITEFLLKNS..TSIESKVFFKKMEGDYYRYYAEFTVDEKRKEVADKSLAAYQEATDTA.ASLVPTHPIRLGLALNFSVFYYQIMNDADKACQLAKEAFDEAIQKLDEVPEESYKESTLIMQLLRDNLTLWTSDMGDDE +#=GS 1432_ENTHI/4-238 AC P42649 +1433_CAEEL/5-237 VEELVQRAKLAEQAERYDDMAAAMKKVTEQGQ...ELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTEG...SEKKQQLAKEYRVKVEQELNDICQDVLKLLDEFLIVKA..GAAESKAFYLKMKGDYYRYLAEVAS.EDRAAVVEKSQKAYQEALDIAKDKMQPTHPIRLGLALNFSVFYYEILNTPEHACQLAKQAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDVGAED +#=GS 1433_CAEEL/5-237 AC P41932 +1433_LYCES/9-246 REENVYMAKLADRAESDEEMVEFMEKVSNSLGS.EELTVEERNLLSVAYKNVIGARRASWRIISSIEQKEESRG.NEEHVNSIREYRSKIENELSKICDGILKLLDSKLIPSA..TSGDSKVFYLKMKGDYHRYLAEFKTGAERKEAAESTLTAYKAAQDIASAELAPTHPIRLGLALNFSVFYYEILNSPDRACNLAKQAFDEAIAELDTLGEESYKDSTLIMQLLRDNLTLWTSDMQDDG +#=GS 1433_LYCES/9-246 AC P93209 +1433_XENLA/1-227 .......AKLSEQAERYDDMAASMKAVTELGA...ELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTEG...NDKRQQMAREYREKVETELQDICKDVLDLLDRFLVPNA..TPPESKVFYLKMKGDYYRYLSEVASGDSKQETVASSQQAYQEAFEISKSEMQPTHPIRLGLALNFSVFYYEILNSPEKACSLAKSAFDEAIRELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGEE +#=GS 1433_XENLA/1-227 AC P29309 +1434_LYCES/6-243 REENVYLAKLAEQAERYEEMIEFMEKVAKTADV.EELTVEERNLLSVAYKNVIGARRASWRIISSIEQKEESRG.NEDHVNTIKEYRSKIEADLSKICDGILSLLESNLIPSA..STAESKVFHLKMKGDYHRYLAEFKTGTERKEAAENTLLAYKSAQDIALAELAPTHPIRLGLALNFSVFYYEILNSPDRACNLAKQAFDEAISELDTLGEESYKDSTLIMQLLRDNLTLWTSDNADDV +#=GS 1434_LYCES/6-243 AC P42652 +143B_VICFA/7-242 
RENFVYIAKLAEQAERYEEMVDSMKNVANLDV...ELTIEERNLLSVGYKNVIGARRASWRILSSIEQKEESKG.NDVNAKRIKEYRHKVETELSNICIDVMRVIDEHLIPSA..AAGESTVFYYKMKGDYYRYLAEFKTGNEKKEAGDQSMKAYESATTAAEAELPPTHPIRLGLALNFSVFYYEILNSPERACHLAKQAFDEAISELDTLNEESYKDSTLIMQLLRDNLTLWTSDIPEDG +#=GS 143B_VICFA/7-242 AC P42654 +143E_HUMAN/4-239 REDLVYQAKLAEQAERYDEMVESMKKVAGMDV...ELTVEERNLLSVAYKNVIGARRASWRIISSIEQKEENKG.GEDKLKMIREYRQMVETELKLICCDILDVLDKHLIPAA..NTGESKVFYYKMKGDYHRYLAEFATGNDRKEAAENSLVAYKAASDIAMTELPPTHPIRLGLALNFSVFYYEILNSPDRACRLAKAAFDDAIAELDTLSEESYKDSTLIMQLLRDNLTLWTSDMQGDG +#=GS 143E_HUMAN/4-239 AC P42655 +143F_MOUSE/3-240 REQLLQRARLAEQAERYDDMASAMKAVTELNE...PLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADG.NEKKLEKVKAYREKIEKELETVCNDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEISKEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDEE +#=GS 143F_MOUSE/3-240 AC P11576 +143R_ARATH/7-245 RDQYVYMAKLAEQAERYEEMVQFMEQLVTGATPAEELTVEERNLLSVAYKNVIGSLRAAWRIVSSIEQKEESRK.NDEHVSLVKDYRSKVESELSSVCSGILKLLDSHLIPSA..GASESKVFYLKMKGDYHRYMAEFKSGDERKTAAEDTMLAYKAAQDIAAADMAPTHPIRLGLALNFSVFYYEILNSSDKACNMAKQAFEEAIAELDTLGEESYKDSTLIMQLLRDNLTLWTSDYAGAD +#=GS 143R_ARATH/7-245 AC P42647 +143S_HUMAN/3-238 RASLIQKAKLAEQAERYEDMAAFMKGAVEKGE...ELSCEERNLLSVAYKNVVGGQRAAWRVLSSIEQKSNEEG.SEEKGPEVREYREKVETELQGVCDTVLGLLDSHLIKEA..GDAESRVFYLKMKGDYYRYLAEVATGDDKKRIIDSARSAYQEAMDISKKEMPPTNPIRLGLALNFSVFHYEIANSPEEAISLAKTTFDEAMADLHTLSEDSYKDSTLIMQLLRDNLTLWTADNAGEE +#=GS 143S_HUMAN/3-238 AC P31947 +143T_HUMAN/3-236 KTELIQKAKLAEQAERYDDMATCMKAVTEQGA...ELSNEERNLLSVAYKNVVGGRRSAWRVISSIEQKTDT...SDKKLQLIKDYREKVESELRSICTTVLELLDKYLIANA..TNPESKVFYLKMKGDYFRYLAEVACGDDRKQTIDNSQGAYQEAFDISKKEMQPTHPIRLGLALNFSVFYYEILNNPELACTLAKTAFDEAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDSAGEE +#=GS 143T_HUMAN/3-236 AC P27348 +143Z_DROME/6-239 
KEELVQKAKLAEQSERYDDMAQAMKSVTETGV...ELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTEA...SARKQQLAREYRERVEKELREICYEVLGLLDKYLIPKA..SNPESKVFYLKMKGDYYRYLAEVATGDARNTVVDDSQTAYQDAFDISKGKMQPTHPIRLGLALNFSVFYYEILNSPDKACQLAKQAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDTQGDE +#=GS 143Z_DROME/6-239 AC P29310 +BMH1_YEAST/4-240 REDSVYLAKLAEQAERYEEMVENMKTVASSGQ...ELSVEERNLLSVAYKNVIGARRASWRIVSSIEQKEESKEKSEHQVELICSYRSKIETELTKISDDILSVLDSHLIPSA..TTGESKVFYYKMKGDYHRYLAEFSSGDAREKATNASLEAYKTASEIATTELPPTHPIRLGLALNFSVFYYEIQNSPDKACHLAKQAFDDAIAELDTLSEESYKDSTLIMQLLRDNLTLWTSDMSESG +#=GS BMH1_YEAST/4-240 AC P29311 +RA24_SCHPO/6-241 REDAVYLAKLAEQAERYEGMVENMKSVASTDQ...ELTVEERNLLSVAYKNVIGARRASWRIVSSIEQKEESKG.NTAQVELIKEYRQKIEQELDTICQDILTVLEKHLIPNA..ASAESKVFYYKMKGDYYRYLAEFAVGEKRQHSADQSLEGYKAASEIATAELAPTHPIRLGLALNFSVFYYEILNSPDRACYLAKQAFDEAISELDSLSEESYKDSTLIMQLLRDNLTLWTSDAEYSA +#=GS RA24_SCHPO/6-241 AC P42656 +RA25_SCHPO/5-240 RENSVYLAKLAEQAERYEEMVENMKKVACSND...KLSVEERNLLSVAYKNIIGARRASWRIISSIEQKEESRG.NTRQAALIKEYRKKIEDELSDICHDVLSVLEKHLIPAA..TTGESKVFYYKMKGDYYRYLAEFTVGEVCKEAADSSLEAYKAASDIAVAELPPTDPMRLGLALNFSVFYYEILDSPESACHLAKQVFDEAISELDSLSEESYKDSTLIMQLLRDNLTLWTSDAEYNQ +#=GS RA25_SCHPO/5-240 AC P42657 +// +# STOCKHOLM 1.0 +#=GF ID rrm +#=GF AC PF00076 +#=GF DE RNA recognition motif. (a.k.a. RRM, RBD, or RNP domain) +#=GF GA 14.6 0.0 +#=GF TC 14.5 14.5 +#=GF TC 14.6 0.1 + +ARP2_PLAFA/364-438 VEVTYLF....STYLVNGQTL..IYS.....N.ISVV....LVILY.... +CABA_MOUSE/77-147 MFVGGL......SWDTSKKDLKDYFT.....K.FGEV..VDCTIKMD... +GR10_BRANA/8-79 CFVGGL......AWATGDAELERTFS.....Q.FGEV..IDSKIIND... +NSR1_YEAST/170-241 IFVGRL......SWSIDDEWLKKEFE.....H.IGGV..IGARVIYE... +RT19_ARATH/33-104 LYIGGL......SPGTDEHSLKDAFS.....S.FNGV..TEARVMTN... +RO28_NICSY/99-170 LFVGNL......PYDIDSEGLAQLFQ.....Q.AGVV..EIAEVIYN... +RO33_NICSY/116-187 LYVGNL......PFSMTSSQLSEIFA.....E.AGTV..ANVEIVYD... +RO33_NICSY/219-290 LYVANL......SWALTSQGLRDAFA.....D.QPGF..MSAKVIYD... +GBP2_YEAST/221-291 VFIINL......PYSMNWQSLKDMFK.....E.CGHV..LRADVELD... 
+HUD_HUMAN/48-119 LIVNYL......PQNMTQEEFRSLFG.....S.IGEI..ESCKLVRD... +SXLF_DROME/127-198 LIVNYL......PQDMTDRELYALFR.....A.IGPI..NTCRIMRD... +PABP_DROME/4-75 LYVGDL......PQDVNESGLFDKFS.....S.AGPV..LSIRVCRD... +NAM8_YEAST/165-237 IFVGDL......APNVTESQLFELFI.....NRYAST..SHAKIVHD... +PUB1_YEAST/163-234 LFVGDL......NVNVDDETLRNAFK.....D.FPSY..LSGHVMWD... +TIA1_HUMAN/108-179 VFVGDL......SPQITTEDIKAAFA.....P.FGRI..SDARVVKD... +PES4_YEAST/93-164 LFIGDL......HETVTEETLKGIFK.....K.YPSF..VSAKVCLD... +NOP4_YEAST/28-98 LFVRSI......PQDVTDEQLADFFS.....N.FAPI..KHAVVVKD... +CST2_HUMAN/18-89 VFVGNI......PYEATEEQLKDIFS.....E.VGPV..VSFRLVYD... +RN15_YEAST/20-91 VYLGSI......PYDQTEEQILDLCS.....N.VGPV..INLKMMFD... +YIS1_YEAST/66-136 IFVGNI......TPDVTPEQIEDHFK.....D.CGQI..KRITLLYD... +IF4B_HUMAN/98-168 AFLGNL......PYDVTEESIKEFFR.....G.LNIS...AVRLPR.... +NSR1_YEAST/269-340 LFLGNL......SFNADRDAIFELFA.....K.HGEV..VSVRIPTH... +GBP2_YEAST/124-193 IFVRNL......TFDCTPEDLKELFG.....T.VGEV..VEADIIT.... +NOP3_YEAST/127-190 LFVRPF......PLDVQESELNEIFG.....P.FGPM..KEVKILN.... +U2AF_HUMAN/261-332 LFIGGL......PNYLNDDQVKELLT.....S.FGPL..KAFNLVKD... +U2AF_SCHPO/312-383 IYISNL......PLNLGEDQVVELLK.....P.FGDL..LSFQLIKN... +ELAV_DROME/250-322 LYVSGL......PKTMTQQELEAIFA.....P.FGAI..ITSRILQN... +SXLF_DROME/213-285 LYVTNL......PRTITDDQLDTIFG.....K.YGSI..VQKNILRD... +ELAV_DROME/404-475 IFIYNL......APETEEAALWQLFG.....P.FGAV..QSVKIVKD... +MSSP_HUMAN/31-102 LYIRGL......PPHTTDQDLVKLCQ.....P.YGKI..VSTKAILD... +NONA_DROME/304-369 LYVGNL......TNDITDDELREMFK.....P.YGEI..SEIFSNLD... +PABP_DROME/92-162 VFIKNL......DRAIDNKAIYDTFS.....A.FGNI..LSCKVATD... +PABP_DROME/183-254 VYVKNF......TEDFDDEKLKEFFE.....P.YGKI..TSYKVMS.... +PABP_SCHPO/263-333 VYIKNL......DTEITEQEFSDLFG.....Q.FGEI..TSLSLVKD... +PUB1_YEAST/342-407 AYIGNI......PHFATEADLIPLFQ.....N.FGFI..LDFKHYPE... +PUB1_YEAST/76-146 LYVGNL......DKAITEDILKQYFQ.....V.GGPI..ANIKIMID... +TIA1_HUMAN/9-78 LYVGNL......SRDVTEALILQLFS.....Q.IGPC..KNCKMIMD... 
+TIA1_HUMAN/216-281 VYCGGV......TSGLTEQLMRQTFS.....P.FGQI..MEIRVFPD... +EWS_HUMAN/363-442 IYVQGL......NDSVTLDDLADFFK.....Q.CGVV..K.MNKRTG... +PTB_HUMAN/186-253 IIVENL......FYPVTLDVLHQIFS.....K.FGTV....LKIIT.... +ROC_HUMAN/18-82 VFIGNL.....NTLVVKKSDVEAIFS.....K.YGKI..VGCSVHK.... +YIS5_YEAST/33-104 IYIGNL......NRELTEGDILTVFS.....E.YGVP..VDVILSRD... +RU1A_HUMAN/12-84 IYINNLNE..KIKKDELKKSLYAIFS.....Q.FGQI..LDILVSR.... +RU2B_HUMAN/9-81 IYINNMND..KIKKEELKRSLYALFS.....Q.FGHV..VDIVALK.... +CABA_MOUSE/161-231 IFVGGL......NPEATEEKIREYFG.....Q.FGEI..EAIELPID... +ROA1_BOVIN/106-176 IFVGGI......KEDTEEHHLRDYFE.....Q.YGKI..EVIEIMTD... +SQD_DROME/138-208 IFVGGL......TTEISDEEIKTYFG.....Q.FGNI..VEVEMPLD... +RB97_DROME/34-104 LFIGGL......APYTTEENLKLFYG.....Q.WGKV..VDVVVMRD... +SQD_DROME/58-128 LFVGGL......SWETTEKELRDHFG.....K.YGEI..ESINVKTD... +ROG_HUMAN/10-81 LFIGGL......NTETNEKALEAVFG.....K.YGRI..VEVLLMKD... +SFR2_CHICK/16-87 LKVDNL......TYRTSPDTLRRVFE.....K.YGRV..GDVYIPRD... +SFR1_HUMAN/17-85 IYVGNL......PPDIRTKDIEDVFY.....K.YGAI..RDIDLKNR... +SR55_DROME/5-68 VYVGGL......PYGVRERDLERFFK.....G.YGRT..RDILIKN.... +SFR3_HUMAN/12-78 VYVGNL......GNNGNKTELERAFG.....Y.YGPL..RSVWVARN... +TRA2_DROME/99-170 IGVFGL......NTNTSQHKVRELFN.....K.YGPI..ERIQMVID... +RU17_DROME/104-175 LFIARI......NYDTSESKLRREFE.....F.YGPI..KKIVLIHD... +GBP2_YEAST/351-421 IYCSNL......PFSTARSDLFDLFG.....P.IGKI..NNAELKP.... +RNP1_YEAST/37-109 LYVGNL......PKNCRKQDLRDLFE.....PNYGKI..TINMLKKK... +PES4_YEAST/305-374 IFIKNL......PTITTRDDILNFFS.....E.VGPI..KSIYLSN.... +YHH5_YEAST/315-384 ILVKNL......PSDTTQEEVLDYFS.....T.IGPI..KSVFISEK... +YHC4_YEAST/348-415 IFVGQL......DKETTREELNRRFS.....T.HGKI..QDINLIFK... +IF39_YEAST/79-157 IVVNGAPVIPSAKVPVLKKALTSLFS.....K.AGKV..VNMEFPID... +MEI2_SCHPO/197-265 LFVTNL......PRIVPYATLLELFS.....K.LGDV..KGIDTSSL... +NOP4_YEAST/292-363 VFVRNV......PYDATEESLAPHFS.....K.FGSV..KYALPVID... +MODU_DROME/260-326 VVVGLI......GPNITKDDLKTFFE.....K.VAPV..EAVTISSN... 
+ROF_HUMAN/113-183 VRLRGL......PFGCTKEEIVQFFS.....G.LEIV.PNGITLPVD... +MODU_DROME/342-410 LVVENVG....KHESYSSDALEKIFK.....K.FGDV..EEIDVVC.... +NUCL_CHICK/283-352 LFVKNL......TPTKDYEELRTAIK.....EFFGKK...NLQVSEV... +NONA_DROME/378-448 LRVSNL......TPFVSNELLYKSFE.....I.FGPI..ERASITVD... +PSF_HUMAN/373-443 LSVRNL......SPYVSNELLEEAFS.....Q.FGPI..ERAVVIVD... +NOP3_YEAST/202-270 ITMKNL......PEGCSWQDLKDLAR.....E.NSLE..TTFSSVN.... +SFR1_HUMAN/122-186 VVVSGL......PPSGSWQDLKDHMR.....E.AGDV..CYADVYRD... +CPO_DROME/453-526 LFVSGL......PMDAKPRELYLLFR.....A.YEGY..EGSLLKV.... +WHI3_YEAST/540-614 LYVGNL......PSDATEQELRQLFS.....G.QEGF..RRLSFRNK... +RU1A_HUMAN/210-276 LFLTNL......PEETNELMLSMLFN.....Q.FPGF..KEVRLVPG... +RU2B_HUMAN/153-220 LFLNNL......PEETNEMMLSMLFN.....Q.FPGF..KEVRLVPG... +RU1A_YEAST/229-293 LLIQNL......PSGTTEQLLSQILG.....N.EALV...EIRLVSV... +MODU_DROME/177-246 VFVTNL......PNEYLHKDLVALFA.....K.FGRL..SALQRFTN... +PR24_YEAST/43-111 VLVKNL......PKSYNQNKVYKYFK.....H.CGPI..IHVDVAD.... +MODU_DROME/422-484 ILVTNL......TSDATEADLRKVFN.....D.SGEI..ESIIMLG.... +PR24_YEAST/212-284 IMIRNL.....STELLDENLLRESFE.....G.FGSI..EKINIPAG... +SSB1_YEAST/39-114 IFIGNV......AHECTEDDLKQLFV.....EEFGDE..VSVEIPIK... +PTB_HUMAN/61-128 IHIRKL......PIDVTEGEVISLGL.....P.FGKV..TNLLMLKG... +RN12_YEAST/200-267 IVIKFQ......GPALTEEEIYSLFR.....R.YGTI....IDIFP.... +D111_ARATH/281-360 LLLRNMVG.PGQVDDELEDEVGGECA.....K.YGTV..TRVLIFE.... +U2AG_HUMAN/67-142 CAVSDVEM..QEHYDEFFEEVFTEME.....EKYGEV..EEMNVCDN... +IF39_SCHPO/41-124 VVIEGAP....VVEEAKQQDFFRFLSSKVLAK.IGKVKENGFYMPFE... +LA_DROME/151-225 AYAKGF......PLDSQISELLDFTA.....N.YDKV..VNLTMRNS... +LA_HUMAN/113-182 VYIKGF......PTDATLDDIKEWLE.....D.KGQV..LNIQMRR.... +PR24_YEAST/119-190 LWMTNF......PPSYTQRNIRDLLQ.....D.INVV.ALSIRLPSL... 
+ +ARP2_PLAFA/364-438 ....HQKFKETVLGRNSGFGFVSYDNVISAQHAIQFMNG.Y...FVNNKY +CABA_MOUSE/77-147 ..........PNTGRSRGFGFILFKDSSSVEKVLD.QKE.H...RLDGRV +GR10_BRANA/8-79 ..........RETGRSRGFGFVTFKDEKSMKDAIDEMNG.K...ELDGRT +NSR1_YEAST/170-241 ..........RGTDRSRGYGYVDFENKSYAEKAIQEMQG.K...EIDGRP +RT19_ARATH/33-104 ..........KVTGRSRGYGFVNFISEDSANSAISAMNG.Q...ELNGFN +RO28_NICSY/99-170 ..........RETDRSRGFGFVTMSTVEEADKAVELYSQ.Y...DLNGRL +RO33_NICSY/116-187 ..........RVTDRSRGFAFVTMGSVEEAKEAIRLFDG.S...QVGGRT +RO33_NICSY/219-290 ..........RSSGRSRGFGFITFSSAEAMNSALDTMNE.V...ELEGRP +GBP2_YEAST/221-291 ...........FNGFSRGFGSVIYPTEDEMIRAIDTFNG.M...EVEGRV +HUD_HUMAN/48-119 ..........KITGQSLGYGFVNYIDPKDAEKAINTLNG.L...RLQTKT +SXLF_DROME/127-198 ..........YKTGYSFGYAFVDFTSEMDSQRAIKVLNG.I...TVRNKR +PABP_DROME/4-75 ..........VITRRSLGYAYVNFQQPADAERALDTMNF.D...LVRNKP +NAM8_YEAST/165-237 ..........QVTGMSKGYGFVKFTNSDEQQLALSEMQG.V...FLNGRA +PUB1_YEAST/163-234 ..........MQTGSSRGYGFVSFTSQDDAQNAMDSMQG.Q...DLNGRP +TIA1_HUMAN/108-179 ..........MATGKSKGYGFVSFFNKWDAENAIQQMGG.Q...WLGGRQ +PES4_YEAST/93-164 ..........SVTKKSLGHGYLNFEDKEEAEKAMEELNY.T...KVNGKE +NOP4_YEAST/28-98 ...........TNKRSRGFGFVSFAVEDDTKEALAKARK.T...KFNGHI +CST2_HUMAN/18-89 ..........RETGKPKGYGFCEYQDQETALSAMRNLNG.R...EFSGRA +RN15_YEAST/20-91 ..........PQTGRSKGYAFIEFRDLESSASAVRNLNG.Y...QLGSRF +YIS1_YEAST/66-136 ..........RNTGTPKGYGYIEFESPAYREKALQ.LNG.G...ELKGKK +IF4B_HUMAN/98-168 ........EPSNPERLKGFGYAEFEDLDSLLSALS.LNE.E...SLGNRR +NSR1_YEAST/269-340 ..........PETEQPKGFGYVQFSNMEDAKKALDALQG.E...YIDNRP +GBP2_YEAST/124-193 ...........SKGHHRGMGTVEFTKNESVQDAISKFDG.A...LFMDRK +NOP3_YEAST/127-190 .................GFAFVEFEEAESAAKAIEEVHG.K...SFANQP +U2AF_HUMAN/261-332 ..........SATGLSKGYAFCEYVDINVTDQAIAGLNG.M...QLGDKK +U2AF_SCHPO/312-383 ..........IADGSSKGFCFCEFKNPSDAEVAISGLDG.K...DTYGNK +ELAV_DROME/250-322 .........AGNDTQTKGVGFIRFDKREEATRAIIALNG.T...TPSSCT +SXLF_DROME/213-285 ..........KLTGRPRGVAFVRYNKREEAQEAISALNNVI...PEGGSQ +ELAV_DROME/404-475 
..........PTTNQCKGYGFVSMTNYDEAAMAIRALNG.Y...TMGNRV +MSSP_HUMAN/31-102 ..........KTTNKCKGYGFVDFDSPAAAQKAVSALKA.S...GVQAQK +NONA_DROME/304-369 ................KNFTFLKVDYHPNAEKAKRALDG.S...MRKGRQ +PABP_DROME/92-162 ...........EKGNSKGYGFVHFETEEAANTSIDKVNG.M...LLNGKK +PABP_DROME/183-254 ..........KEDGKSKGFGFVAFETTEAAEAAVQALNGKD...MGEGKS +PABP_SCHPO/263-333 ...........QNDKPRGFGFVNYANHECAQKAVDELND.K...EYKGKK +PUB1_YEAST/342-407 ................KGCCFIKYDTHEQAAVCIVALAN.F...PFQGRN +PUB1_YEAST/76-146 ...........KNNKNVNYAFVEYHQSHDANIALQTLNG.K...QIENNI +TIA1_HUMAN/9-78 ............TAGNDPYCFVEFHEHRHAAAALAAMNG.R...KIMGKE +TIA1_HUMAN/216-281 ................KGYSFVRFNSHESAAHAIVSVNG.T...TIEGHV +EWS_HUMAN/363-442 .QPMIHIYLDKETGKPKGDATVSYEDPPTAKAAVEWFDG.K...DFQGSK +PTB_HUMAN/186-253 ...........FTKNNQFQALLQYADPVSAQHAKLSLDG.Q...NIYNAC +ROC_HUMAN/18-82 .................GFAFVQYVNERNARAAVAGEDG.R...MIAGQV +YIS5_YEAST/33-104 ..........ENTGESQGFAYLKYEDQRSTILAVDNLNG.F...KIGGRA +RU1A_HUMAN/12-84 ............SLKMRGQAFVIFKEVSSATNALRSMQG.F...PFYDKP +RU2B_HUMAN/9-81 ............TMKMRGQAFVIFKELGSSTNALRQLQG.F...PFYGKP +CABA_MOUSE/161-231 ..........PKLNKRRGFVFITFKEEDPVKKVLE.KKF.H...TVSGSK +ROA1_BOVIN/106-176 ..........RGSGKKRGFAFVTFDDHDSVDKIVI.QKY.H...TVNGHN +SQD_DROME/138-208 ..........KQKSQRKGFCFITFDSEQVVTDLLK.TPK.Q...KIAGKE +RB97_DROME/34-104 ..........AATKRSRGFGFITYTKSLMVDRAQE..NRPH...IIDGKT +SQD_DROME/58-128 ..........PQTGRSRGFAFIVFTNTEAIDKVSA.ADE.H...IINSKK +ROG_HUMAN/10-81 ..........RETNKSRGFAFVTFESPADAKDAARDMNG.K...SLDGKA +SFR2_CHICK/16-87 ..........RYTKESRGFAFVRFHDKRDAEDAMDAMDG.A...VLDGRE +SFR1_HUMAN/17-85 .............RGGPPFAFVEFEDPRDAEDAVYGRDG.Y...DYDGYR +SR55_DROME/5-68 .................GYGFVEFEDYRDADDAVYELNG.K...ELLGER +SFR3_HUMAN/12-78 ...............PPGFAFVEFEDPRDAADAVRELDG.R...TLCGCR +TRA2_DROME/99-170 ..........AQTQRSRGFCFIYFEKLSDARAAKDSCSG.I...EVDGRR +RU17_DROME/104-175 ..........QESGKPKGYAFIEYEHERDMHAAYKHADG.K...KIDSKR +GBP2_YEAST/351-421 
..........QENGQPTGVAVVEYENLVDADFCIQKLNN.Y...NYGGCS +RNP1_YEAST/37-109 ..........PLKKPLKRFAFIEFQEGVNLKKVKEKMNG.K...IFMNEK +PES4_YEAST/305-374 ...........ATKVKYLWAFVTYKNSSDSEKAIKRYNN.F...YFRGKK +YHH5_YEAST/315-384 ............QANTPHKAFVTYKNEEESKKAQKCLNK.T...IFKNHT +YHC4_YEAST/348-415 ..............PTNIFAFIKYETEEAAAAALESENH.A...IFLNKT +IF39_YEAST/79-157 ..........EATGKTKGFLFVECGSMNDAKKIIKSFHGKR...LDLKHR +MEI2_SCHPO/197-265 ..............STDGICIVAFFDIRQAIQAAKSLRSQR...FFNDRL +NOP4_YEAST/292-363 ..........KSTGLAKGTAFVAFKDQYTYNECIKNAPA.A...GSTSLL +MODU_DROME/260-326 ..............RLMPRAFVRLASVDDIPKALK.LHS.T...ELFSRF +ROF_HUMAN/113-183 ...........PEGKITGEAFVQFASQELAEKALG.KHK.E...RIGHRY +MODU_DROME/342-410 ..............SKAVLAFVTFKQSDAATKALAQLDG.K...TVNKFE +NUCL_CHICK/283-352 ...........RIGSSKRFGYVDFLSAEDMDKALQ.LNG.K...KLMGLE +NONA_DROME/378-448 ...........DRGKHMGEGIVEFAKKSSASACLRMCNE.K...CFFLTA +PSF_HUMAN/373-443 ...........DRGRSTGKGIVEFASKPAARKAFERCSE.G...VFLLTT +NOP3_YEAST/202-270 ............TRDFDGTGALEFPSEEILVEALERLNN.I...EFRGSV +SFR1_HUMAN/122-186 .................GTGVVEFVRKEDMTYAVRKLDN.T...KFRSHE +CPO_DROME/453-526 ........TSKNGKTASPVGFVTFHTRAGAEAAKQDLQGVR...FDPDMP +WHI3_YEAST/540-614 .......NTTSNGHSHGPMCFVEFDDVSFATRALAELYG.R...QLPRST +RU1A_HUMAN/210-276 ...............RHDIAFVEFDNEVQAGAARDALQG.F...KITQNN +RU2B_HUMAN/153-220 ...............RHDIAFVEFENDGQAGAARDALQGFK...ITPSHA +RU1A_YEAST/229-293 ................RNLAFVEYETVADATKIKNQLGS.T...YKLQNN +MODU_DROME/177-246 .............LNGNKSVLIAFDTSTGAEAVLQAKPKAL...TLGDNV +PR24_YEAST/43-111 ...........SLKKNFRFARIEFARYDGALAAIT.KTH.K...VVGQNE +MODU_DROME/422-484 .................QKAVVKFKDDEGFCKSFL.ANE.S...IVNNAP +PR24_YEAST/212-284 .........QKEHSFNNCCAFMVFENKDSAERALQ.MNR.S...LLGNRE +SSB1_YEAST/39-114 .......EHTDGHIPASKHALVKFPTKIDFDNIKENYDT.K...VVKDRE +PTB_HUMAN/61-128 ................KNQAFIEMNTEEAANTMVN.YYT.SVTPVLRGQP +RN12_YEAST/200-267 ...........PTAANNNVAKVRYRSFRGAISAKNCVSG.I...EIHNTV +D111_ARATH/281-360 
......ITEPNFPVHEAVRIFVQFSRPEETTKALVDLDG.R...YFGGRT +U2AG_HUMAN/67-142 ...........LGDHLVGNVYVKFRREEDAEKAVIDLNN.R...WFNGQP +IF39_SCHPO/41-124 ......EKNGK..KMSLGLVFADFENVDGADLCVQELDGKQ...ILKNHT +LA_DROME/151-225 ......YDKPTKSYKFKGSIFLTFETKDQAKAFLE.QEK.I...VYKERE +LA_HUMAN/113-182 ..........TLHKAFKGSIFVVFDSIESAKKFVE.TPG.Q...KYKETD +PR24_YEAST/119-190 ...........RFNTSRRFAYIDVTSKEDARYCVEKLNG.L...KIEGYT + +ARP2_PLAFA/364-438 LKV +CABA_MOUSE/77-147 IDP +GR10_BRANA/8-79 ITV +NSR1_YEAST/170-241 INC +RT19_ARATH/33-104 ISV +RO28_NICSY/99-170 LTV +RO33_NICSY/116-187 VKV +RO33_NICSY/219-290 LRL +GBP2_YEAST/221-291 LEV +HUD_HUMAN/48-119 IKV +SXLF_DROME/127-198 LKV +PABP_DROME/4-75 IRI +NAM8_YEAST/165-237 IKV +PUB1_YEAST/163-234 LRI +TIA1_HUMAN/108-179 IRT +PES4_YEAST/93-164 IRI +NOP4_YEAST/28-98 LRV +CST2_HUMAN/18-89 LRV +RN15_YEAST/20-91 LKC +YIS1_YEAST/66-136 IAV +IF4B_HUMAN/98-168 IRV +NSR1_YEAST/269-340 VRL +GBP2_YEAST/124-193 LMV +NOP3_YEAST/127-190 LEV +U2AF_HUMAN/261-332 LLV +U2AF_SCHPO/312-383 LHA +ELAV_DROME/250-322 DPI +SXLF_DROME/213-285 PLS +ELAV_DROME/404-475 LQV +MSSP_HUMAN/31-102 AKQ +NONA_DROME/304-369 LRV +PABP_DROME/92-162 VYV +PABP_DROME/183-254 LYV +PABP_SCHPO/263-333 LYV +PUB1_YEAST/342-407 LRT +PUB1_YEAST/76-146 VKI +TIA1_HUMAN/9-78 VKV +TIA1_HUMAN/216-281 VKC +EWS_HUMAN/363-442 LKV +PTB_HUMAN/186-253 CTL +ROC_HUMAN/18-82 LDI +YIS5_YEAST/33-104 LKI +RU1A_HUMAN/12-84 MRI +RU2B_HUMAN/9-81 MRI +CABA_MOUSE/161-231 CEI +ROA1_BOVIN/106-176 CEV +SQD_DROME/138-208 VDV +RB97_DROME/34-104 VEA +SQD_DROME/58-128 VDP +ROG_HUMAN/10-81 IKV +SFR2_CHICK/16-87 LRV +SFR1_HUMAN/17-85 LRV +SR55_DROME/5-68 VVV +SFR3_HUMAN/12-78 VRV +TRA2_DROME/99-170 IRV +RU17_DROME/104-175 VLV +GBP2_YEAST/351-421 LQI +RNP1_YEAST/37-109 IVI +PES4_YEAST/305-374 LLV +YHH5_YEAST/315-384 IWV +YHC4_YEAST/348-415 MHV +IF39_YEAST/79-157 LFL +MEI2_SCHPO/197-265 LYF +NOP4_YEAST/292-363 IGD +MODU_DROME/260-326 ITV +ROF_HUMAN/113-183 IEV +MODU_DROME/342-410 WKL +NUCL_CHICK/283-352 IKL +NONA_DROME/378-448 
SLR +PSF_HUMAN/373-443 TPR +NOP3_YEAST/202-270 ITV +SFR1_HUMAN/122-186 GET +CPO_DROME/453-526 QTI +WHI3_YEAST/540-614 VSS +RU1A_HUMAN/210-276 AMK +RU2B_HUMAN/153-220 MKI +RU1A_YEAST/229-293 DVT +MODU_DROME/177-246 LSV +PR24_YEAST/43-111 IIV +MODU_DROME/422-484 IFI +PR24_YEAST/212-284 ISV +SSB1_YEAST/39-114 IHI +PTB_HUMAN/61-128 IYI +RN12_YEAST/200-267 LHI +D111_ARATH/281-360 VRA +U2AG_HUMAN/67-142 IHA +IF39_SCHPO/41-124 FVV +LA_DROME/151-225 LLR +LA_HUMAN/113-182 LLI +PR24_YEAST/119-190 LVT +// diff --git a/forester/archive/RIO/others/hmmer/squid/Formats/swissprot b/forester/archive/RIO/others/hmmer/squid/Formats/swissprot new file mode 100644 index 0000000..1f1220e --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Formats/swissprot @@ -0,0 +1,317 @@ +ID 100K_RAT STANDARD; PRT; 889 AA. +AC Q62671; +DT 01-NOV-1997 (Rel. 35, Created) +DT 01-NOV-1997 (Rel. 35, Last sequence update) +DT 15-JUL-1999 (Rel. 38, Last annotation update) +DE 100 KD PROTEIN (EC 6.3.2.-). +OS Rattus norvegicus (Rat). +OC Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Mammalia; +OC Eutheria; Rodentia; Sciurognathi; Muridae; Murinae; Rattus. +RN [1] +RP SEQUENCE FROM N.A. +RC STRAIN=WISTAR; TISSUE=TESTIS; +RX MEDLINE; 92253337. +RA MUELLER D., REHBEIN M., BAUMEISTER H., RICHTER D.; +RT "Molecular characterization of a novel rat protein structurally +RT related to poly(A) binding proteins and the 70K protein of the U1 +RT small nuclear ribonucleoprotein particle (snRNP)."; +RL Nucleic Acids Res. 20:1471-1475(1992). +RN [2] +RP ERRATUM. +RA MUELLER D., REHBEIN M., BAUMEISTER H., RICHTER D.; +RL Nucleic Acids Res. 20:2624-2624(1992). +CC -!- FUNCTION: E3 UBIQUITIN-PROTEIN LIGASE WHICH ACCEPTS UBIQUITIN FROM +CC AN E2 UBIQUITIN-CONJUGATING ENZYME IN THE FORM OF A THIOESTER AND +CC THEN DIRECTLY TRANSFERS THE UBIQUITIN TO TARGETED SUBSTRATES (BY +CC SIMILARITY). THIS PROTEIN MAY BE INVOLVED IN MATURATION AND/OR +CC POST-TRANSCRIPTIONAL REGULATION OF MRNA. 
+CC -!- TISSUE SPECIFICITY: HIGHEST LEVELS FOUND IN TESTIS. ALSO PRESENT +CC IN LIVER, KIDNEY, LUNG AND BRAIN. +CC -!- DEVELOPMENTAL STAGE: IN EARLY POST-NATAL LIFE, EXPRESSION IN +CC THE TESTIS INCREASES TO REACH A MAXIMUM AROUND DAY 28. +CC -!- MISCELLANEOUS: A CYSTEINE RESIDUE IS REQUIRED FOR +CC UBIQUITIN-THIOLESTER FORMATION. +CC -!- SIMILARITY: CONTAINS AN HECT-TYPE E3 UBIQUITIN-PROTEIN LIGASE +CC DOMAIN. +CC -!- SIMILARITY: A CENTRAL REGION (AA 485-514) IS SIMILAR TO THE +CC C-TERMINAL DOMAINS OF MAMMALIAN AND YEAST POLY (A) RNA BINDING +CC PROTEINS (PABP). +CC -!- SIMILARITY: THE C-TERMINAL HALF SHOWS HIGH SIMILARITY TO +CC DROSOPHILA HYPERPLASMIC DISC PROTEIN AND SOME, TO HUMAN E6-AP. +CC -!- SIMILARITY: CONTAINS MIXED-CHARGE DOMAINS SIMILAR TO RNA-BINDING +CC PROTEINS. +CC -------------------------------------------------------------------------- +CC This SWISS-PROT entry is copyright. It is produced through a collaboration +CC between the Swiss Institute of Bioinformatics and the EMBL outstation - +CC the European Bioinformatics Institute. There are no restrictions on its +CC use by non-profit institutions as long as its content is in no way +CC modified and this statement is not removed. Usage by and for commercial +CC entities requires a license agreement (See http://www.isb-sib.ch/announce/ +CC or send an email to license@isb-sib.ch). +CC -------------------------------------------------------------------------- +DR EMBL; X64411; CAA45756.1; -. +DR PFAM; PF00632; HECT; 1. +DR PFAM; PF00658; PABP; 1. +KW Ubiquitin conjugation; Ligase. +FT DOMAIN 77 88 ASP/GLU-RICH (ACIDIC). +FT DOMAIN 127 150 PRO-RICH. +FT DOMAIN 420 439 ARG/GLU-RICH (MIXED CHARGE). +FT DOMAIN 448 457 ARG/ASP-RICH (MIXED CHARGE). +FT DOMAIN 485 514 PABP-LIKE. +FT DOMAIN 579 590 ASP/GLU-RICH (ACIDIC). +FT DOMAIN 786 889 HECT DOMAIN. +FT DOMAIN 827 847 PRO-RICH. +FT BINDING 858 858 UBIQUITIN (BY SIMILARITY). 
+SQ SEQUENCE 889 AA; 100368 MW; DD7E6C7A CRC32; + MMSARGDFLN YALSLMRSHN DEHSDVLPVL DVCSLKHVAY VFQALIYWIK AMNQQTTLDT + PQLERKRTRE LLELGIDNED SEHENDDDTS QSATLNDKDD ESLPAETGQN HPFFRRSDSM + TFLGCIPPNP FEVPLAEAIP LADQPHLLQP NARKEDLFGR PSQGLYSSSA GSGKCLVEVT + MDRNCLEVLP TKMSYAANLK NVMNMQNRQK KAGEDQSMLA EEADSSKPGP SAHDVAAQLK + SSLLAEIGLT ESEGPPLTSF RPQCSFMGMV ISHDMLLGRW RLSLELFGRV FMEDVGAEPG + SILTELGGFE VKESKFRREM EKLRNQQSRD LSLEVDRDRD LLIQQTMRQL NNHFGRRCAT + TPMAVHRVKV TFKDEPGEGS GVARSFYTAI AQAFLSNEKL PNLDCIQNAN KGTHTSLMQR + LRNRGERDRE REREREMRRS SGLRAGSRRD RDRDFRRQLS IDTRPFRPAS EGNPSDDPDP + LPAHRQALGE RLYPRVQAMQ PAFASKITGM LLELSPAQLL LLLASEDSLR ARVEEAMELI + VAHGRENGAD SILDLGLLDS SEKVQENRKR HGSSRSVVDM DLDDTDDGDD NAPLFYQPGK + RGFYTPRPGK NTEARLNCFR NIGRILGLCL LQNELCPITL NRHVIKVLLG RKVNWHDFAF + FDPVMYESLR QLILASQSSD ADAVFSAMDL AFAVDLCKEE GGGQVELIPN GVNIPVTPQN + VYEYVRKYAE HRMLVVAEQP LHAMRKGLLD VLPKNSLEDL TAEDFRLLVN GCGEVNVQML + ISFTSFNDES GENAEKLLQF KRWFWSIVER MSMTERQDLV YFWTSSPSLP ASEEGFQPMP + SITIRPPDDQ HLPTANTCIS RLYVPLYSSK QILKQKLLLA IKTKNFGFV +// +ID 104K_THEPA STANDARD; PRT; 924 AA. +AC P15711; +DT 01-APR-1990 (Rel. 14, Created) +DT 01-APR-1990 (Rel. 14, Last sequence update) +DT 01-AUG-1992 (Rel. 23, Last annotation update) +DE 104 KD MICRONEME-RHOPTRY ANTIGEN. +OS Theileria parva. +OC Eukaryota; Alveolata; Apicomplexa; Piroplasmida; Theileriidae; +OC Theileria. +RN [1] +RP SEQUENCE FROM N.A. +RC STRAIN=MUGUGA; +RX MEDLINE; 90158697. +RA IAMS K.P., YOUNG J.R., NENE V., DESAI J., WEBSTER P., +RA OLE-MOIYOI O.K., MUSOKE A.J.; +RT "Characterisation of the gene encoding a 104-kilodalton microneme- +RT rhoptry protein of Theileria parva."; +RL Mol. Biochem. Parasitol. 39:47-60(1990). +CC -!- SUBCELLULAR LOCATION: IN MICRONEME/RHOPTRY COMPLEXES. +CC -!- DEVELOPMENTAL STAGE: SPOROZOITE ANTIGEN. +CC -------------------------------------------------------------------------- +CC This SWISS-PROT entry is copyright. 
It is produced through a collaboration +CC between the Swiss Institute of Bioinformatics and the EMBL outstation - +CC the European Bioinformatics Institute. There are no restrictions on its +CC use by non-profit institutions as long as its content is in no way +CC modified and this statement is not removed. Usage by and for commercial +CC entities requires a license agreement (See http://www.isb-sib.ch/announce/ +CC or send an email to license@isb-sib.ch). +CC -------------------------------------------------------------------------- +DR EMBL; M29954; AAA18217.1; -. +DR PIR; A44945; A44945. +KW Antigen; Sporozoite; Repeat. +FT DOMAIN 1 19 HYDROPHOBIC. +FT DOMAIN 905 924 HYDROPHOBIC. +SQ SEQUENCE 924 AA; 103625 MW; 4563AAA0 CRC32; + MKFLILLFNI LCLFPVLAAD NHGVGPQGAS GVDPITFDIN SNQTGPAFLT AVEMAGVKYL + QVQHGSNVNI HRLVEGNVVI WENASTPLYT GAIVTNNDGP YMAYVEVLGD PNLQFFIKSG + DAWVTLSEHE YLAKLQEIRQ AVHIESVFSL NMAFQLENNK YEVETHAKNG ANMVTFIPRN + GHICKMVYHK NVRIYKATGN DTVTSVVGFF RGLRLLLINV FSIDDNGMMS NRYFQHVDDK + YVPISQKNYE TGIVKLKDYK HAYHPVDLDI KDIDYTMFHL ADATYHEPCF KIIPNTGFCI + TKLFDGDQVL YESFNPLIHC INEVHIYDRN NGSIICLHLN YSPPSYKAYL VLKDTGWEAT + THPLLEEKIE ELQDQRACEL DVNFISDKDL YVAALTNADL NYTMVTPRPH RDVIRVSDGS + EVLWYYEGLD NFLVCAWIYV SDGVASLVHL RIKDRIPANN DIYVLKGDLY WTRITKIQFT + QEIKRLVKKS KKKLAPITEE DSDKHDEPPE GPGASGLPPK APGDKEGSEG HKGPSKGSDS + SKEGKKPGSG KKPGPAREHK PSKIPTLSKK PSGPKDPKHP RDPKEPRKSK SPRTASPTRR + PSPKLPQLSK LPKSTSPRSP PPPTRPSSPE RPEGTKIIKT SKPPSPKPPF DPSFKEKFYD + DYSKAASRSK ETKTTVVLDE SFESILKETL PETPGTPFTT PRPVPPKRPR TPESPFEPPK + DPDSPSTSPS EFFTPPESKR TRFHETPADT PLPDVTAELF KEPDVTAETK SPDEAMKRPR + SPSEYEDTSP GDYPSLPMKR HRLERLRLTT TEMETDPGRM AKDASGKPVK LKRSKSFDDL + TTVELAPEPK ASRIVVDDEG TEADDEETHP PEERQKTEVR RRRPPKKPSK SPRPSKPKKP + KKPDSAYIPS ILAILVVSLI VGIL +// +ID 108_LYCES STANDARD; PRT; 102 AA. +AC Q43495; +DT 15-JUL-1999 (Rel. 38, Created) +DT 15-JUL-1999 (Rel. 38, Last sequence update) +DT 15-JUL-1999 (Rel. 
38, Last annotation update) +DE PROTEIN 108 PRECURSOR. +OS Lycopersicon esculentum (Tomato). +OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; +OC euphyllophytes; Spermatophyta; Magnoliophyta; eudicotyledons; +OC core eudicots; Asteridae; euasterids I; Solanales; Solanaceae; +OC Solanum. +RN [1] +RP SEQUENCE FROM N.A. +RC STRAIN=CV. VF36; TISSUE=ANTHER; +RX MEDLINE; 94143497. +RA CHEN R., SMITH A.G.; +RT "Nucleotide sequence of a stamen- and tapetum-specific gene from +RT Lycopersicon esculentum."; +RL Plant Physiol. 101:1413-1413(1993). +CC -!- TISSUE SPECIFICITY: STAMEN- AND TAPETUM-SPECIFIC. +CC -!- SIMILARITY: BELONGS TO THE A9 / FIL1 FAMILY. +CC -------------------------------------------------------------------------- +CC This SWISS-PROT entry is copyright. It is produced through a collaboration +CC between the Swiss Institute of Bioinformatics and the EMBL outstation - +CC the European Bioinformatics Institute. There are no restrictions on its +CC use by non-profit institutions as long as its content is in no way +CC modified and this statement is not removed. Usage by and for commercial +CC entities requires a license agreement (See http://www.isb-sib.ch/announce/ +CC or send an email to license@isb-sib.ch). +CC -------------------------------------------------------------------------- +DR EMBL; Z14088; CAA78466.1; -. +DR MENDEL; 8853; LYCes;1133;1. +KW Signal. +FT SIGNAL 1 30 POTENTIAL. +FT CHAIN 31 102 PROTEIN 108. +FT DISULFID 41 77 BY SIMILARITY. +FT DISULFID 51 66 BY SIMILARITY. +FT DISULFID 67 92 BY SIMILARITY. +FT DISULFID 79 99 BY SIMILARITY. +SQ SEQUENCE 102 AA; 10576 MW; AFA4875A CRC32; + MASVKSSSSS SSSSFISLLL LILLVIVLQS QVIECQPQQS CTASLTGLNV CAPFLVPGSP + TASTECCNAV QSINHDCMCN TMRIAAQIPA QCNLPPLSCS AN +// +ID 10KD_VIGUN STANDARD; PRT; 75 AA. +AC P18646; +DT 01-NOV-1990 (Rel. 16, Created) +DT 01-NOV-1990 (Rel. 16, Last sequence update) +DT 01-FEB-1995 (Rel. 31, Last annotation update) +DE 10 KD PROTEIN PRECURSOR (CLONE PSAS10). 
+OS Vigna unguiculata (Cowpea). +OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; +OC euphyllophytes; Spermatophyta; Magnoliophyta; eudicotyledons; +OC core eudicots; Rosidae; eurosids I; Fabales; Fabaceae; Papilionoideae; +OC Vigna. +RN [1] +RP SEQUENCE FROM N.A. +RC TISSUE=COTYLEDON; +RX MEDLINE; 91355865. +RA ISHIBASHI N., YAMAUCHI D., MINIAMIKAWA T.; +RT "Stored mRNA in cotyledons of Vigna unguiculata seeds: nucleotide +RT sequence of cloned cDNA for a stored mRNA and induction of its +RT synthesis by precocious germination."; +RL Plant Mol. Biol. 15:59-64(1990). +CC -!- FUNCTION: THIS PROTEIN IS REQUIRED FOR GERMINATION. +CC -!- SIMILARITY: BELONGS TO THE GAMMA-PUROTHIONIN FAMILY. +CC -------------------------------------------------------------------------- +CC This SWISS-PROT entry is copyright. It is produced through a collaboration +CC between the Swiss Institute of Bioinformatics and the EMBL outstation - +CC the European Bioinformatics Institute. There are no restrictions on its +CC use by non-profit institutions as long as its content is in no way +CC modified and this statement is not removed. Usage by and for commercial +CC entities requires a license agreement (See http://www.isb-sib.ch/announce/ +CC or send an email to license@isb-sib.ch). +CC -------------------------------------------------------------------------- +DR EMBL; X16877; CAA34760.1; -. +DR PIR; S11156; S11156. +DR HSSP; P45639; 1CHL. +DR PFAM; PF00304; Gamma-thionin; 1. +DR PROSITE; PS00940; GAMMA_THIONIN; 1. +KW Germination; Signal. +FT SIGNAL 1 ? POTENTIAL. +FT CHAIN ? 75 10 KD PROTEIN. +FT DISULFID 31 75 BY SIMILARITY. +FT DISULFID 42 63 BY SIMILARITY. +FT DISULFID 48 69 BY SIMILARITY. +FT DISULFID 52 71 BY SIMILARITY. +SQ SEQUENCE 75 AA; 8523 MW; AFF911AB CRC32; + MEKKSIAGLC FLFLVLFVAQ EVVVQSEAKT CENLVDTYRG PCFTTGSCDD HCKNKEHLLS + GRCRDDVRCW CTRNC +// +ID 110K_PLAKN STANDARD; PRT; 296 AA. +AC P13813; +DT 01-JAN-1990 (Rel. 13, Created) +DT 01-JAN-1990 (Rel. 
13, Last sequence update) +DT 01-FEB-1994 (Rel. 28, Last annotation update) +DE 110 KD ANTIGEN (PK110) (FRAGMENT). +OS Plasmodium knowlesi. +OC Eukaryota; Alveolata; Apicomplexa; Haemosporida; Plasmodium. +RN [1] +RP SEQUENCE FROM N.A. +RX MEDLINE; 88039002. +RA PERLER F.B., MOON A.M., QIANG B.Q., MEDA M., DALTON M., CARD C., +RA SCHMIDT-ULLRICH R., WALLACH D., LYNCH J., DONELSON J.E.; +RT "Cloning and characterization of an abundant Plasmodium knowlesi +RT antigen which cross reacts with Gambian sera."; +RL Mol. Biochem. Parasitol. 25:185-193(1987). +CC -------------------------------------------------------------------------- +CC This SWISS-PROT entry is copyright. It is produced through a collaboration +CC between the Swiss Institute of Bioinformatics and the EMBL outstation - +CC the European Bioinformatics Institute. There are no restrictions on its +CC use by non-profit institutions as long as its content is in no way +CC modified and this statement is not removed. Usage by and for commercial +CC entities requires a license agreement (See http://www.isb-sib.ch/announce/ +CC or send an email to license@isb-sib.ch). +CC -------------------------------------------------------------------------- +DR EMBL; M19152; AAA29471.1; -. +DR PIR; A54527; A54527. +KW Malaria; Antigen; Repeat. +FT NON_TER 1 1 +FT DOMAIN 131 296 13.5 X 12 AA TANDEM REPEATS OF E-E-T-Q-K- +FT T-V-E-P-E-Q-T. +SQ SEQUENCE 296 AA; 34077 MW; 666F88DF CRC32; + FNSNMLRGSV CEEDVSLMTS IDNMIEEIDF YEKEIYKGSH SGGVIKGMDY DLEDDENDED + EMTEQMVEEV ADHITQDMID EVAHHVLDNI THDMAHMEEI VHGLSGDVTQ IKEIVQKVNV + AVEKVKHIVE TEETQKTVEP EQIEETQNTV EPEQTEETQK TVEPEQTEET QNTVEPEQIE + ETQKTVEPEQ TEEAQKTVEP EQTEETQKTV EPEQTEETQK TVEPEQTEET QKTVEPEQTE + ETQKTVEPEQ TEETQKTVEP EQTEETQKTV EPEQTEETQN TVEPEPTQET QNTVEP +// +ID 11S3_HELAN STANDARD; PRT; 493 AA. +AC P19084; +DT 01-NOV-1990 (Rel. 16, Created) +DT 01-NOV-1990 (Rel. 16, Last sequence update) +DT 01-FEB-1994 (Rel. 
28, Last annotation update) +DE 11S GLOBULIN SEED STORAGE PROTEIN G3 PRECURSOR (HELIANTHININ G3). +GN HAG3. +OS Helianthus annuus (Common sunflower). +OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; +OC euphyllophytes; Spermatophyta; Magnoliophyta; eudicotyledons; +OC core eudicots; Asteridae; euasterids II; Asterales; Asteraceae; +OC Helianthus. +RN [1] +RP SEQUENCE FROM N.A. +RX MEDLINE; 89232734. +RA VONDER HARR R.A., ALLEN R.D., COHEN E.A., NESSLER C.L., THOMAS T.L.; +RT "Organization of the sunflower 11S storage protein gene family."; +RL Gene 74:433-443(1988). +CC -!- FUNCTION: THIS IS A SEED STORAGE PROTEIN. +CC -!- SUBUNIT: HEXAMER; EACH SUBUNIT IS COMPOSED OF AN ACIDIC AND A +CC BASIC CHAIN DERIVED FROM A SINGLE PRECURSOR AND LINKED BY A +CC DISULFIDE BOND. +CC -!- SIMILARITY: BELONGS TO THE 11S SEED STORAGE PROTEINS (GLOBULINS) +CC FAMILY. +CC -------------------------------------------------------------------------- +CC This SWISS-PROT entry is copyright. It is produced through a collaboration +CC between the Swiss Institute of Bioinformatics and the EMBL outstation - +CC the European Bioinformatics Institute. There are no restrictions on its +CC use by non-profit institutions as long as its content is in no way +CC modified and this statement is not removed. Usage by and for commercial +CC entities requires a license agreement (See http://www.isb-sib.ch/announce/ +CC or send an email to license@isb-sib.ch). +CC -------------------------------------------------------------------------- +DR EMBL; M28832; AAA33374.1; -. +DR PIR; JA0089; JA0089. +DR PFAM; PF00190; Seedstore_11s; 1. +DR PROSITE; PS00305; 11S_SEED_STORAGE; 1. +KW Seed storage protein; Multigene family; Signal. +FT SIGNAL 1 20 +FT CHAIN 21 305 ACIDIC CHAIN. +FT CHAIN 306 493 BASIC CHAIN. +FT DISULFID 103 312 INTERCHAIN (ACIDIC-BASIC) (POTENTIAL). +FT DOMAIN 23 35 GLN-RICH. +FT DOMAIN 111 127 GLN/GLY-RICH. +FT DOMAIN 191 297 GLN-RICH. 
+SQ SEQUENCE 493 AA; 55687 MW; E79DEAAE CRC32; + MASKATLLLA FTLLFATCIA RHQQRQQQQN QCQLQNIEAL EPIEVIQAEA GVTEIWDAYD + QQFQCAWSIL FDTGFNLVAF SCLPTSTPLF WPSSREGVIL PGCRRTYEYS QEQQFSGEGG + RRGGGEGTFR TVIRKLENLK EGDVVAIPTG TAHWLHNDGN TELVVVFLDT QNHENQLDEN + QRRFFLAGNP QAQAQSQQQQ QRQPRQQSPQ RQRQRQRQGQ GQNAGNIFNG FTPELIAQSF + NVDQETAQKL QGQNDQRGHI VNVGQDLQIV RPPQDRRSPR QQQEQATSPR QQQEQQQGRR + GGWSNGVEET ICSMKFKVNI DNPSQADFVN PQAGSIANLN SFKFPILEHL RLSVERGELR + PNAIQSPHWT INAHNLLYVT EGALRVQIVD NQGNSVFDNE LREGQVVVIP QNFAVIKRAN + EQGSRWVSFK TNDNAMIANL AGRVSASAAS PLTLWANRYQ LSREEAQQLK FSQRETVLFA + PSFSRGQGIR ASR +// diff --git a/forester/archive/RIO/others/hmmer/squid/INSTALL b/forester/archive/RIO/others/hmmer/squid/INSTALL new file mode 100644 index 0000000..4f96254 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/INSTALL @@ -0,0 +1,31 @@ +Brief installation instructions for squid +SRE, Tue Jul 25 08:52:03 2000 +________________________________________________________________ + +For a source distribution (example: squid-1.7.tar.gz), on a UNIX system: + + gunzip squid-1.7.tar.gz Uncompresses the archive. + tar xf squid-1.7.tar Unpacks the archive. + (makes a new directory, squid-1.7) + cd squid-1.7 Moves into the distribution toplevel directory. + ./configure Configures the software for your system. + make Builds the binaries. + make install Installs the software. (You may need to be root.) + make clean Cleans up. + +The default is to install into /usr/local/bin and other /usr/local +subdirectories. If this isn't what you want, edit the top of the +Makefile; instructions are provided there for changing the +installation paths. + +Any failure to install on a UNIX system is a bug. Please report it. + +Man pages are provided for some programs; see Man/ subdirectory. 
+ + + + + + + + diff --git a/forester/archive/RIO/others/hmmer/squid/Makefile.in b/forester/archive/RIO/others/hmmer/squid/Makefile.in new file mode 100644 index 0000000..340aa82 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Makefile.in @@ -0,0 +1,292 @@ +############################################################### +# Makefile for SQUID library +# CVS $Id: Makefile.in,v 1.1.1.1 2005/03/22 08:34:26 cmzmasek Exp $ +# +# Note: The autoconf variables in this file must be coordinated +# with HMMER, if you change them, because HMMER will +# create a Makefile from this Makefile.in using its own +# configure script, not SQUID's. +# +########### +# HMMER - Biological sequence analysis with profile HMMs +# Copyright (C) 1992-1999 Washington University School of Medicine +# All Rights Reserved +# +# This source code is distributed under the terms of the +# GNU General Public License. See the files COPYING and LICENSE +# for details. +########### + +### Installation points +### +# For simple installations, just make sure ${prefix} is set correctly: +# default is /usr/local. +# +# For heterogenous computing environments, also +# set ${exec_prefix}, which gives you some flexibility +# for installing architecture dependent files (e.g. the programs). +# +# It's less likely that you'll need to set the individual +# variables BINDIR, MANDIR, etc., but they're there if you need 'em. +# +# The (simple) default configuration installs as follows: +# prefix = /usr/local +# executables in /usr/local/bin +# man pages in /usr/local/man/man1 +# header files in /usr/local/include +# libsquid.a in /usr/local/lib +# scripts in /usr/local/bin +# +# The St. 
Louis configuration, an example of a heterogenous +# computing environment, installs by setting: +# prefix = /usr/seshare/ +# exec_prefix = /usr/seshare/`uname` +# +# on a Linux platform, for instance, this results in: +# executables in /usr/seshare/Linux/bin +# man pages in /usr/seshare/man +# header files in /usr/seshare/include +# libsquid.a in /usr/seshare/Linux/lib +# scripts in /usr/seshare/Linux/bin +# +prefix = @prefix@ +exec_prefix = @exec_prefix@ +BINDIR = @bindir@ +MANDIR = @mandir@ +INCLUDEDIR = @includedir@ +LIBDIR = @libdir@ +SCRIPTDIR = @bindir@ + +## your compiler and compiler flags +# +CC = @CC@ +CFLAGS = @CFLAGS@ + +## other defined flags for machine-specific stuff +# +MDEFS = @MDEFS@ @DEFS@ +LIBS = @LIBS@ -lm + +## Archiver command +# +AR = ar rcv +RANLIB = @RANLIB@ + +## instructions for installing man pages +# +INSTMAN = cp +MANSUFFIX = 1 + +# Configuration for compiling in optional PVM support +# +PVMFLAG = @PVMFLAG@ +PVMLIBDIR = @PVMLIBDIR@ +PVMINCDIR = @PVMINCDIR@ +PVMLIBS = @PVMLIBS@ + +####### +## You should not need to modify below this line +####### +SHELL = /bin/sh +BASENAME = "squid" +PACKAGE = "SQUID" +RELEASE = "1.7" +RELCODE = "rel1_7" +RELEASEDATE = "July 2000" +COPYRIGHT = "Copyright \(C\) 1992-2000 HHMI/Washington University School of Medicine" +LICENSE = "Freely distributed under the GNU General Public License \(GPL\)" +LICENSETAG = gnu +COMPRESS = gzip + +PROGS = afetch\ + alistat\ + compalign\ + compstruct\ + sfetch\ + sreformat\ + revcomp\ + seqsplit\ + seqstat\ + shuffle\ + sindex\ + translate\ + weight + +MANS = alistat\ + seqstat\ + sfetch\ + shuffle\ + sreformat\ + +READMES = 00README INSTALL Makefile.in + +SCRIPTS = + +PRECONFHDRS = \ + squid.h.in\ + squidconf.h.in + +POSTCONFHDRS = \ + squid.h\ + squidconf.h\ + version.h + +HDRS = rk.h\ + sqfuncs.h\ + gki.h\ + gsi.h\ + msa.h\ + ssi.h\ + stopwatch.h + +OBJS = a2m.o\ + aligneval.o\ + alignio.o\ + clustal.o\ + cluster.o\ + dayhoff.o\ + eps.o\ + file.o\ + getopt.o\ + gki.o\ 
+ gsi.o\ + hsregex.o\ + iupac.o\ + msa.o\ + msf.o\ + phylip.o\ + revcomp.o\ + rk.o\ + selex.o\ + seqencode.o\ + shuffle.o\ + sqerror.o\ + sqio.o\ + squidcore.o\ + sre_ctype.o\ + sre_math.o\ + sre_string.o\ + ssi.o\ + stack.o\ + stockholm.o\ + stopwatch.o\ + translate.o\ + types.o\ + weight.o + +################################################################ +# Targets that actually build the squid executables +all: version.h $(PROGS) + +$(PROGS): @EXEC_DEPENDENCY@ version.h $(OBJS) + $(CC) $(CFLAGS) $(MDEFS) $(PVMLIBDIR) -o $@ $@_main.o $(OBJS) $(PVMLIBS) $(LIBS) + +.c.o: + $(CC) $(CFLAGS) $(PVMFLAG) $(PVMINCDIR) $(MDEFS) -c $< +################################################################ + + +################################################################ +# Targets expected by packages (e.g. HMMER) that +# include SQUID as a module. +# +module: libsquid.a + +libsquid.a: version.h $(OBJS) + $(AR) libsquid.a $(OBJS) + $(RANLIB) libsquid.a + chmod 644 libsquid.a +################################################################# + + +# version.h: +# create the version.h file that will define stamps used by +# squidcore.c's Banner(), which is called by all executables to +# print a standard package/copyright/license banner; +# then puts copies of version.h in all directories that are +# going to need it. +# +version.h: + @echo "Creating version.h..." + @echo "/* version.h -- automatically generated by a Makefile. DO NOT EDIT. 
*/" > version.h + @echo "#define PACKAGE \"$(PACKAGE)\"" >> version.h + @echo "#define RELEASE \"$(RELEASE)\"" >> version.h + @echo "#define RELEASEDATE \"$(RELEASEDATE)\"" >> version.h + @echo "#define COPYRIGHT \"$(COPYRIGHT)\"" >> version.h + @echo "#define LICENSE \"$(LICENSE)\"" >> version.h + +install: $(PROGS) libsquid.a + test -d $(LIBDIR) || mkdir -p $(LIBDIR) + test -d $(BINDIR) || mkdir -p $(BINDIR) + test -d $(SCRIPTDIR) || mkdir -p $(SCRIPTDIR) + test -d $(INCLUDEDIR)|| mkdir -p $(INCLUDEDIR) + test -d $(MANDIR)/man$(MANSUFFIX) || mkdir -p $(MANDIR)/man$(MANSUFFIX) + cp libsquid.a $(LIBDIR)/ + cp $(HDRS) $(INCLUDEDIR)/ + cp $(PROGS) $(BINDIR)/ + for scriptfile in $(SCRIPTS); do\ + cp Scripts/$$scriptfile $(SCRIPTDIR)/;\ + done + @for manpage in $(MANS); do\ + $(INSTMAN) $$manpage.man $(MANDIR)/man$(MANSUFFIX)/$$manpage.$(MANSUFFIX);\ + done + +distclean: + make clean + -rm -f Makefile libsquid.a version.h config.cache config.log config.status ${POSTCONFHDRS} + +clean: + -rm -f *.o *~ core TAGS llib-lsquid.ln $(PROGS) + +# dist: build a new distribution directory in squid-$RELEASE, and make a tarball. +# Extracts straight from the CVS repository, so you must first do +# a "cvs commit" (it checks to be sure you do, at least for the current +# working directory). +dist: +# Delete old versions of the same release +# + @if test -d ${BASENAME}-$(RELEASE); then rm -rf ${BASENAME}-$(RELEASE); fi + @if test -e ${BASENAME}-$(RELEASE).tar; then rm -f ${BASENAME}-$(RELEASE).tar; fi + @if test -e ${BASENAME}-$(RELEASE).tar.Z; then rm -f ${BASENAME}-$(RELEASE).tar.Z; fi + @if test -e ${BASENAME}-$(RELEASE).tar.gz; then rm -f ${BASENAME}-$(RELEASE).tar.gz; fi +# +# CVS tag and extract. 
-c: make sure we committed; +# -F: allow more than one "make dist" per rel +# prep: must have done "cvs commit", and CVSROOT must be set +# + cvs tag -c -F ${BASENAME}_${RELCODE} + cvs export -r ${BASENAME}_${RELCODE} -d ${BASENAME}-${RELEASE} ${BASENAME} +# +# Make the configure script from configure.in +# + (cd ${BASENAME}-${RELEASE}; autoconf) +# +# Include the appropriate license files +# + cp Licenses/LICENSE.${LICENSETAG} ${BASENAME}-${RELEASE}/LICENSE + cp Licenses/COPYRIGHT.${LICENSETAG} ${BASENAME}-${RELEASE}/COPYRIGHT +# +# Put license tags (short licenses) on files that need 'em (replace LICENSE keyword) +# + for file in $(READMES) *.c ${HDRS} ${PRECONFHDRS}; do\ + licenseadd.pl Licenses/$(LICENSETAG) ${BASENAME}-${RELEASE}/$$file;\ + done; +# +# Remove files/directories that aren't supposed to go out in the distro. +# Do this last, so other steps (license adding, etc.) have simple loops. +# + -rm -rf ${BASENAME}-${RELEASE}/Licenses + -rm -rf ${BASENAME}-${RELEASE}/Docs + -rm ${BASENAME}-${RELEASE}/LOG + -rm ${BASENAME}-${RELEASE}/configure.in + -rm ${BASENAME}-${RELEASE}/test_main.c +# +# pack it up! +# + tar cvf ${BASENAME}-${RELEASE}.tar ${BASENAME}-${RELEASE} + ${COMPRESS} ${BASENAME}-$(RELEASE).tar + +TAGS: + etags -t *.h *.c Makefile.in + + diff --git a/forester/archive/RIO/others/hmmer/squid/Man/afetch.man b/forester/archive/RIO/others/hmmer/squid/Man/afetch.man new file mode 100644 index 0000000..67074ad --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Man/afetch.man @@ -0,0 +1,98 @@ +.TH "afetch" 1 "@RELEASEDATE@" "@PACKAGE@ @RELEASE@" "@PACKAGE@ Manual" + +.SH NAME +.TP +afetch - retrieve an alignment from an alignment database + +.SH SYNOPSIS +.B afetch +.I [options] +.I alignmentdb +.I key + +.PP +.B afetch --index +.I alignmentdb + +.SH DESCRIPTION + +.B afetch +retrieves the alignment named +.I key +from an alignment database in file +.I alignmentdb. 
+ +.PP +.I alignmentdb +is a "multiple multiple alignment" file in Stockholm (e.g. native +Pfam) format. + +.PP +.I key +is either the name (ID) of the alignment, or its accession +number (AC). + +.PP +The +.I alignmentdb +file should first be SSI indexed with +.B afetch --index +for efficient retrieval. An SSI index is +not required, but alignment retrieval without one may +be painfully slow. + +.SH OPTIONS + +.TP +.B -h +Print brief help; includes version number and summary of +all options, including expert options. + +.SH EXPERT OPTIONS + +.TP +.B --index +Instead of retrieving a +.I key, +the special command +.B afetch --index +.I alignmentdb +produces an SSI index of the names and accessions +of the alignments in +the file +.I alignmentdb. +This should be run once on the +.I alignmentdb +file to prepare it for all future afetch's. + +.SH SEE ALSO + +.PP +Master man page, with full list of and guide to the +individual man pages for SQUID's auxiliary programs: see +.B squid(1). + +.SH AUTHOR + +@PACKAGE@ and its documentation is @COPYRIGHT@ +HMMER - Biological sequence analysis with profile HMMs +Copyright (C) 1992-1999 Washington University School of Medicine +All Rights Reserved + + This source code is distributed under the terms of the + GNU General Public License. See the files COPYING and LICENSE + for details. +See COPYING in the source code distribution for more details, or contact me. + +.nf +Sean Eddy +Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave. 
+St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi + + diff --git a/forester/archive/RIO/others/hmmer/squid/Man/alistat.man b/forester/archive/RIO/others/hmmer/squid/Man/alistat.man new file mode 100644 index 0000000..63d3e0d --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Man/alistat.man @@ -0,0 +1,138 @@ +.TH "alistat" 1 "@RELEASEDATE@" "@PACKAGE@ @RELEASE@" "@PACKAGE@ Manual" + +.SH NAME +.TP +alistat - show statistics for a multiple alignment file + +.SH SYNOPSIS +.B alistat +.I [options] +.I alignfile + +.SH DESCRIPTION + +.B alistat +reads a multiple sequence alignment from the file +.I alignfile +in any supported format (including SELEX, GCG MSF, and +CLUSTAL), and shows a number of simple statistics about it. +These statistics include the name of the format, +the number of sequences, the total number of residues, +the average and range of the sequence lengths, the +alignment length (e.g. including gap characters). + +.PP +Also shown are some percent identities. A percent +pairwise alignment identity is defined as +.I (idents / MIN(len1, len2)) +where +.I idents +is the number of exact identities +and +.I len1, len2 +are the unaligned lengths of the two +sequences. The "average percent identity", +"most related pair", and "most unrelated pair" +of the alignment are the average, maximum, and +minimum of all +(N)(N-1)/2 pairs, respectively. +The "most distant seq" is calculated by finding +the maximum pairwise identity (best relative) for all N sequences, +then finding the minimum of these N numbers (hence, +the most outlying sequence). + +.SH OPTIONS + +.TP +.B -a +Show additional verbose information: a table with one line per +sequence showing name, length, and its highest and lowest pairwise +identity. These lines are prefixed with a * character to enable +easily +.BR grep' ing +them out and sorting them. 
For example, +.I alistat -a foo.slx | grep "*" | sort -n +3 +gives a ranked list of the most distant sequences +in the alignment. +Incompatible with the +.B -f +option. + +.TP +.B -f +Fast; use a sampling method to estimate the average %id. +When this option is chosen, +.B alistat +doesn't show the other three pairwise identity numbers. +This option is useful for very large alignments, for +which the full (N)(N-1) calculation of all pairs would +be prohibitive (e.g. Pfam's GP120 alignment, with over +10,000 sequences). Incompatible with the +.B -a +option. + +.TP +.B -h +Print brief help; includes version number and summary of +all options, including expert options. + +.TP +.B -q +Be quiet - suppress the verbose header (program name, release number +and date, the parameters and options in effect). + +.TP +.B -B +(Babelfish). Autodetect and read a sequence file format other than the +default (FASTA). Almost any common sequence file format is recognized +(including Genbank, EMBL, SWISS-PROT, PIR, and GCG unaligned sequence +formats, and Stockholm, GCG MSF, and Clustal alignment formats). See +the printed documentation for a complete list of supported formats. + +.SH EXPERT OPTIONS + +.TP +.BI --informat " <s>" +Specify that the sequence file is in format +.I <s>, +rather than the default FASTA format. +Common examples include Genbank, EMBL, GCG, +PIR, Stockholm, Clustal, MSF, or PHYLIP; +see the printed documentation for a complete list +of accepted format names. +This option overrides the default format (FASTA) +and the +.I -B +Babelfish autodetection option. + + + +.SH SEE ALSO + +.PP +@SEEALSO@ + +.SH AUTHOR + +@PACKAGE@ and its documentation is @COPYRIGHT@ +HMMER - Biological sequence analysis with profile HMMs +Copyright (C) 1992-1999 Washington University School of Medicine +All Rights Reserved + + This source code is distributed under the terms of the + GNU General Public License. See the files COPYING and LICENSE + for details. 
+See COPYING in the source code distribution for more details, or contact me. + +.nf +Sean Eddy +Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave. +St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi + + diff --git a/forester/archive/RIO/others/hmmer/squid/Man/seqstat.man b/forester/archive/RIO/others/hmmer/squid/Man/seqstat.man new file mode 100644 index 0000000..5c0644e --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Man/seqstat.man @@ -0,0 +1,98 @@ +.TH "seqstat" 1 "@RELEASEDATE@" "@PACKAGE@ @RELEASE@" "@PACKAGE@ Manual" + +.SH NAME +.TP +seqstat - show statistics and format for a sequence file + +.SH SYNOPSIS +.B seqstat +.I [options] +.I seqfile + +.SH DESCRIPTION + +.B seqstat +reads a sequence file +.I seqfile +and shows a number of simple statistics about it. + +.PP +The printed statistics include the name of the format, the residue +type of the first sequence (protein, RNA, or DNA), the number of +sequences, the total number of residues, and the average and range of +the sequence lengths. + +.SH OPTIONS + +.TP +.B -a +Show additional verbose information: a table with one line per +sequence showing name, length, and description line. +These lines are prefixed with a * character to enable +easily +.BR grep' ing +them out and sorting them. + +.TP +.B -h +Print brief help; includes version number and summary of +all options, including expert options. + +.TP +.B -B +(Babelfish). Autodetect and read a sequence file format other than the +default (FASTA). Almost any common sequence file format is recognized +(including Genbank, EMBL, SWISS-PROT, PIR, and GCG unaligned sequence +formats, and Stockholm, GCG MSF, and Clustal alignment formats). See +the printed documentation for a complete list of supported formats. + +.SH EXPERT OPTIONS + +.TP +.BI --informat " <s>" +Specify that the sequence file is in format +.I <s>, +rather than the default FASTA format. 
+Common examples include Genbank, EMBL, GCG, +PIR, Stockholm, Clustal, MSF, or PHYLIP; +see the printed documentation for a complete list +of accepted format names. +This option overrides the default expected format (FASTA) +and the +.I -B +Babelfish autodetection option. + +.TP +.B --quiet +Suppress the verbose header (program name, release number +and date, the parameters and options in effect). + +.SH SEE ALSO + +.PP +@SEEALSO@ + +.SH AUTHOR + +@PACKAGE@ and its documentation is @COPYRIGHT@ +HMMER - Biological sequence analysis with profile HMMs +Copyright (C) 1992-1999 Washington University School of Medicine +All Rights Reserved + + This source code is distributed under the terms of the + GNU General Public License. See the files COPYING and LICENSE + for details. +See COPYING in the source code distribution for more details, or contact me. + +.nf +Sean Eddy +Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave. +St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi + + diff --git a/forester/archive/RIO/others/hmmer/squid/Man/sfetch.man b/forester/archive/RIO/others/hmmer/squid/Man/sfetch.man new file mode 100644 index 0000000..25f745c --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Man/sfetch.man @@ -0,0 +1,226 @@ +.TH "sfetch" 1 "@RELEASEDATE@" "@PACKAGE@ @RELEASE@" "@PACKAGE@ Manual" + +.SH NAME +.TP +sfetch - get a sequence from a flatfile database. + +.SH SYNOPSIS +.B sfetch +.I [options] +.I seqname + +.SH DESCRIPTION + +.B sfetch +retrieves the sequence named +.I seqname +from a sequence database. + +.PP +Which database is used is controlled by the +.B -d +and +.B -D +options, or "little databases" and "big +databases". +The directory location of "big databases" can +be specified by environment variables, +such as $SWDIR for Swissprot, and $GBDIR +for Genbank (see +.B -D +for complete list). +A complete file path must be specified +for "little databases". 
+By default, if neither option is specified +and the name looks like a Swissprot identifier +(e.g. it has a _ character), the $SWDIR +environment variable is used to attempt +to retrieve the sequence +.I seqname +from Swissprot. + +.PP +A variety of other options are available which allow +retrieval of subsequences +.RI ( -f,-t ); +retrieval by accession number instead of +by name +.RI ( -a ); +reformatting the extracted sequence into a variety +of other formats +.RI ( -F ); +etc. + +.PP +If the database has been GSI indexed, sequence +retrieval will be extremely efficient; else, +retrieval may be painfully slow (the entire +database may have to be read into memory to +find +.IR seqname ). +GSI indexing +is recommended for all large or permanent +databases. + +.pp +This program was originally named +.B getseq, +and was renamed because it clashed with a GCG +program of the same name. + +.SH OPTIONS + +.TP +.B -a +Interpret +.I seqname +as an accession number, not an identifier. + +.TP +.BI -d " " +Retrieve the sequence from a sequence file named +.I . +If a GSI index +.I .gsi +exists, it is used to speed up the retrieval. + +.TP +.BI -f " " +Extract a subsequence starting from position +.I , +rather than from 1. See +.B -t. +If +.I +is greater than +.I +(as specified by the +.B -t +option), then the sequence is extracted as +its reverse complement (it is assumed to be +nucleic acid sequence). + +.TP +.B -h +Print brief help; includes version number and summary of +all options, including expert options. + +.TP +.BI -o " " +Direct the output to a file named +.I . +By default, output would go to stdout. + +.TP +.BI -r " " +Rename the sequence +.I +in the output after extraction. By default, the original +sequence identifier would be retained. Useful, for instance, +if retrieving a sequence fragment; the coordinates of +the fragment might be added to the name (this is what Pfam +does). 
+ +.TP +.BI -t " " +Extract a subsequence that ends at position +.I , +rather than at the end of the sequence. See +.B -f. +If +.I +is less than +.I +(as specified by the +.B -f +option), then the sequence is extracted as +its reverse complement (it is assumed to be +nucleic acid sequence) + +.TP +.B -B +(Babelfish). Autodetect and read a sequence file format other than the +default (FASTA). Almost any common sequence file format is recognized +(including Genbank, EMBL, SWISS-PROT, PIR, and GCG unaligned sequence +formats, and Stockholm, GCG MSF, and Clustal alignment formats). See +the printed documentation for a complete list of supported formats. + + +.TP +.BI -D " " +Retrieve the sequence from the main sequence database +coded +.I . For each code, there is an environment +variable that specifies the directory path to that +database. +Recognized codes and their corresponding environment +variables are +.I -Dsw +(Swissprot, $SWDIR); +.I -Dpir +(PIR, $PIRDIR); +.I -Dem +(EMBL, $EMBLDIR); +.I -Dgb +(Genbank, $GBDIR); +.I -Dwp +(Wormpep, $WORMDIR); and +.I -Dowl +(OWL, $OWLDIR). +Each database is read in its native flatfile format. + +.TP +.BI -F " " +Reformat the extracted sequence into a different format. +(By default, the sequence is extracted from the database +in the same format as the database.) Available formats +are +.B embl, fasta, genbank, gcg, strider, zuker, ig, pir, squid, +and +.B raw. + +.SH EXPERT OPTIONS + +.TP +.BI --informat " " +Specify that the sequence file is in format +.I , +rather than the default FASTA format. +Common examples include Genbank, EMBL, GCG, +PIR, Stockholm, Clustal, MSF, or PHYLIP; +see the printed documentation for a complete list +of accepted format names. +This option overrides the default format (FASTA) +and the +.I -B +Babelfish autodetection option. 
+ +.SH SEE ALSO + +.PP +@SEEALSO@ + +.SH AUTHOR + +@PACKAGE@ and its documentation is @COPYRIGHT@ +HMMER - Biological sequence analysis with profile HMMs +Copyright (C) 1992-1999 Washington University School of Medicine +All Rights Reserved + + This source code is distributed under the terms of the + GNU General Public License. See the files COPYING and LICENSE + for details. +See COPYING in the source code distribution for more details, or contact me. + +.nf +Sean Eddy +Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave. +St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi + + diff --git a/forester/archive/RIO/others/hmmer/squid/Man/shuffle.man b/forester/archive/RIO/others/hmmer/squid/Man/shuffle.man new file mode 100644 index 0000000..93bbe53 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Man/shuffle.man @@ -0,0 +1,204 @@ +.TH "shuffle" 1 "@RELEASEDATE@" "@PACKAGE@ @RELEASE@" "@PACKAGE@ Manual" + +.SH NAME +.TP +shuffle - randomize the sequences in a sequence file + +.SH SYNOPSIS +.B shuffle +.I [options] +.I seqfile + +.SH DESCRIPTION + +.B shuffle +reads a sequence file +.I seqfile, +randomizes each sequence, and prints the randomized sequences +in FASTA format on standard output. The sequence names +are unchanged; this allows you to track down the source +of each randomized sequence if necessary. + +.pp +The default is to simply shuffle each input sequence, preserving +monosymbol composition exactly. To shuffle +each sequence while preserving both its monosymbol and disymbol +composition exactly, use the +.I -d +option. + +.pp +The +.I -0 +and +.I -1 +options allow you to generate sequences with the same +Markov properties as each input sequence. With +.I -0, +for each input sequence, 0th order Markov statistics +are collected (e.g. symbol composition), and a new +sequence is generated with the same composition. 
+With +.I -1, +the generated sequence has the same 1st order +Markov properties as the input sequence (e.g. +the same disymbol frequencies). + +.pp +Note that the default and +.I -0, +or +.I -d +and +.I -1, +are similar; the shuffling algorithms preserve +composition exactly, while the Markov algorithms +only expect to generate a sequence of similar +composition on average. + +.pp +Other shuffling algorithms are also available, +as documented below in the options. + +.SH OPTIONS + +.TP +.B -0 +Calculate 0th order Markov frequencies of each input sequence +(e.g. residue composition); generate output sequence +using the same 0th order Markov frequencies. + +.TP +.B -1 +Calculate 1st order Markov frequencies for each input +sequence (e.g. diresidue composition); generate output +sequence using the same 1st order Markov frequencies. +The first residue of the output sequence is always +the same as the first residue of the input sequence. + +.TP +.B -d +Shuffle the input sequence while preserving both +monosymbol and disymbol composition exactly. Uses +an algorithm published by S.F. Altschul and B.W. Erickson, +Mol. Biol. Evol. 2:526-538, 1985. + +.TP +.B -h +Print brief help; includes version number and summary of +all options, including expert options. + +.TP +.B -l +Look only at the length of each input sequence; generate +an i.i.d. output protein sequence of that length, +using monoresidue frequencies typical of proteins +(taken from Swissprot 35). + +.TP +.BI -n " " +Make +.I +different randomizations of each input sequence in +.I seqfile, +rather than the default of one. + +.TP +.B -r +Generate the output sequence by reversing the +input sequence. (Therefore only one "randomization" +per input sequence is possible, so it's +not worth using +.I -n +if you use reversal.) + +.TP +.BI -t " " +Truncate each input sequence to a fixed length of exactly +.I +residues. 
If the input sequence is shorter than +.I +it is discarded (therefore the output file may contain +fewer sequences than the input file). +If the input sequence is longer than +.I +a contiguous subsequence is randomly chosen. + +.TP +.BI -w " " +Regionally shuffle each input sequence in window sizes of +.I , +preserving local residue composition in each window. +Probably a better shuffling algorithm for biosequences +with nonstationary residue composition (e.g. composition +that is varying along the sequence, such as between +different isochores in human genome sequence). + +.TP +.B -B +(Babelfish). Autodetect and read a sequence file format other than the +default (FASTA). Almost any common sequence file format is recognized +(including Genbank, EMBL, SWISS-PROT, PIR, and GCG unaligned sequence +formats, and Stockholm, GCG MSF, and Clustal alignment formats). See +the printed documentation for a complete list of supported formats. + +.SH EXPERT OPTIONS + +.TP +.BI --informat " " +Specify that the sequence file is in format +.I , +rather than the default FASTA format. +Common examples include Genbank, EMBL, GCG, +PIR, Stockholm, Clustal, MSF, or PHYLIP; +see the printed documentation for a complete list +of accepted format names. +This option overrides the default expected format (FASTA) +and the +.I -B +Babelfish autodetection option. + +.TP +.B --nodesc +Do not output any sequence description in the output file, +only the sequence names. + +.TP +.BI --seed " " +Set the random number seed to +.I . +If you want reproducible results, use the same seed each time. +By default, +.B shuffle +uses a different seed each time, so does not generate +the same output in subsequent runs with the same input. 
+ +.SH SEE ALSO + +.PP +@SEEALSO@ + +.SH AUTHOR + +@PACKAGE@ and its documentation is @COPYRIGHT@ +HMMER - Biological sequence analysis with profile HMMs +Copyright (C) 1992-1999 Washington University School of Medicine +All Rights Reserved + + This source code is distributed under the terms of the + GNU General Public License. See the files COPYING and LICENSE + for details. +See COPYING in the source code distribution for more details, or contact me. + +.nf +Sean Eddy +Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave. +St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi + + diff --git a/forester/archive/RIO/others/hmmer/squid/Man/sreformat.man b/forester/archive/RIO/others/hmmer/squid/Man/sreformat.man new file mode 100644 index 0000000..c502a39 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/Man/sreformat.man @@ -0,0 +1,249 @@ +.TH "sreformat" 1 "@RELEASEDATE@" "@PACKAGE@ @RELEASE@" "@PACKAGE@ Manual" + +.SH NAME +.TP +sreformat - convert sequence file to different format + +.SH SYNOPSIS +.B sreformat +.I [options] +.I format +.I seqfile + +.SH DESCRIPTION + +.B sreformat +reads the sequence file +.I seqfile +in any supported format, reformats it +into a new format specified by +.I format, +then prints the reformatted text. + +.PP +Supported input formats include (but are not limited to) the unaligned +formats FASTA, Genbank, EMBL, SWISS-PROT, PIR, and GCG, and the +aligned formats Stockholm, Clustal, GCG MSF, and Phylip. + +.PP +Available unaligned output file format codes +include +.I fasta +(FASTA format); +.I embl +(EMBL/SWISSPROT format); +.I genbank +(Genbank format); +.I gcg +(GCG single sequence format); +.I gcgdata +(GCG flatfile database format); +.I strider +(MacStrider format); +.I zuker +(Zuker MFOLD format); +.I ig +(Intelligenetics format); +.I pir +(PIR/CODATA flatfile format); +.I squid +(an undocumented St. 
Louis format); +.I raw +(raw sequence, no other information). + +.PP +The available aligned output file format +codes include +.I stockholm +(PFAM/Stockholm format); +.I msf +(GCG MSF format); +.I a2m +(aligned FASTA format, called A2M by the UC Santa Cruz +HMM group); +.I PHYLIP +(Felsenstein's PHYLIP format); and +.I selex +(old SELEX/HMMER/Pfam annotated alignment format). + +.PP +All these codes are interpreted case-insensitively +(e.g. MSF, Msf, or msf all work). + +.PP +Unaligned format files cannot be reformatted to +aligned formats. +However, aligned formats can be reformatted +to unaligned formats -- gap characters are +simply stripped out. + +.PP +This program was originally named +.B reformat, +but that name clashes with a GCG program of the same name. + +.SH OPTIONS + +.TP +.B -a +Enable alignment reformatting. By default, sreformat expects +that the input file should be handled as an unaligned input +file (even if it is an alignment), and it will not allow you +to convert an unaligned file to an alignment (for obvious +reasons). +.pp +This may seem silly; surely if sreformat can autodetect and parse +alignment file formats as input, it can figure out when it's got an +alignment! There are two reasons. One is just the historical +structure of the code. The other is that FASTA unaligned format and +A2M aligned format (aligned FASTA) are impossible to tell apart with +100% confidence. + +.TP +.B -d +DNA; convert U's to T's, to make sure a nucleic acid +sequence is shown as DNA not RNA. See +.B -r. + +.TP +.B -h +Print brief help; includes version number and summary of +all options, including expert options. + +.TP +.B -l +Lowercase; convert all sequence residues to lower case. +See +.B -u. + +.TP +.B -r +RNA; convert T's to U's, to make sure a nucleic acid +sequence is shown as RNA not DNA. See +.B -d. + +.TP +.B -u +Uppercase; convert all sequence residues to upper case. +See +.B -l.
+ +.TP +.B -x +For DNA sequences, convert non-IUPAC characters (such as X's) to N's. +This is for compatibility with benighted people who insist on using X +instead of the IUPAC ambiguity character N. (X is for ambiguity +in an amino acid residue). +.pp +Warning: the code doesn't +check that you are actually giving it DNA. It simply +literally just converts non-IUPAC DNA symbols to N. So +if you accidentally give it protein sequence, it will +happily convert most every amino acid residue to an N. + +.TP +.B -B +(Babelfish). Autodetect and read a sequence file format other than the +default (FASTA). Almost any common sequence file format is recognized +(including Genbank, EMBL, SWISS-PROT, PIR, and GCG unaligned sequence +formats, and Stockholm, GCG MSF, and Clustal alignment formats). See +the printed documentation for a complete list of supported formats. + + +.SH EXPERT OPTIONS + +.TP +.BI --informat " " +Specify that the sequence file is in format +.I , +rather than the default FASTA format. +Common examples include Genbank, EMBL, GCG, +PIR, Stockholm, Clustal, MSF, or PHYLIP; +see the printed documentation for a complete list +of accepted format names. +This option overrides the default format (FASTA) +and the +.I -B +Babelfish autodetection option. + +.TP +.B --mingap +If +.I seqfile +is an alignment, remove any columns that contain 100% gap +characters, minimizing the overall length of the alignment. +(Often useful if you've extracted a subset of aligned +sequences from a larger alignment.) + +.TP +.B --pfam +For SELEX alignment output format only, put the entire +alignment in one block (don't wrap into multiple blocks). +This is close to the format used internally by Pfam +in Stockholm and Cambridge. + +.TP +.B --sam +Try to convert gap characters to UC Santa Cruz SAM style, where a . +means a gap in an insert column, and a - means a +deletion in a consensus/match column. 
This only +works for converting aligned file formats, and only +if the alignment already adheres to the SAM convention +of upper case for residues in consensus/match columns, +and lower case for residues in insert columns. This is +true, for instance, of all alignments produced by old +versions of HMMER. (HMMER2 produces alignments +that adhere to SAM's conventions even in gap character choice.) +This option was added to allow Pfam alignments to be +reformatted into something more suitable for profile HMM +construction using the UCSC SAM software. + +.TP +.BI --samfrac " " +Try to convert the alignment gap characters and +residue cases to UC Santa Cruz SAM style, where a . +means a gap in an insert column and a - means a +deletion in a consensus/match column, and +upper case means match/consensus residues and +lower case means inserted residues. This will only +work for converting aligned file formats, but unlike the +.B --sam +option, it will work regardless of whether the file adheres +to the upper/lower case residue convention. Instead, any +column containing more than a fraction +.I +of gap characters is interpreted as an insert column, +and all other columns are interpreted as match columns. +This option was added to allow Pfam alignments to be +reformatted into something more suitable for profile HMM +construction using the UCSC SAM software. + +.SH SEE ALSO + +.PP +@SEEALSO@ + +.SH AUTHOR + +@PACKAGE@ and its documentation is @COPYRIGHT@ +HMMER - Biological sequence analysis with profile HMMs +Copyright (C) 1992-1999 Washington University School of Medicine +All Rights Reserved + + This source code is distributed under the terms of the + GNU General Public License. See the files COPYING and LICENSE + for details. +See COPYING in the source code distribution for more details, or contact me. + +.nf +Sean Eddy +Dept. of Genetics +Washington Univ. School of Medicine +4566 Scott Ave.
+St Louis, MO 63110 USA +Phone: 1-314-362-7666 +FAX : 1-314-362-7855 +Email: eddy@genetics.wustl.edu +.fi + + diff --git a/forester/archive/RIO/others/hmmer/squid/a2m.c b/forester/archive/RIO/others/hmmer/squid/a2m.c new file mode 100644 index 0000000..5beff81 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/a2m.c @@ -0,0 +1,113 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* a2m.c + * + * reading/writing A2M (aligned FASTA) files. + * + * RCS $Id: a2m.c,v 1.1.1.1 2005/03/22 08:34:17 cmzmasek Exp $ + */ + +#include +#include +#include +#include "squid.h" +#include "msa.h" + +/* Function: ReadA2M() + * Date: SRE, Sun Jun 6 17:11:29 1999 [bus from Madison 1999 worm mtg] + * + * Purpose: Parse an alignment read from an open A2M format + * alignment file. A2M is a single alignment format. + * Return the alignment, or NULL if we've already + * read the alignment. + * + * Args: afp - open alignment file + * + * Returns: MSA * - an alignment object. 
+ * Caller responsible for an MSAFree() + */ +MSA * +ReadA2M(MSAFILE *afp) +{ + MSA *msa; + char *buf; + char *name; + char *desc; + char *seq; + int idx; + int len1, len2; + + if (feof(afp->f)) return NULL; + + name = NULL; + msa = MSAAlloc(10, 0); + idx = 0; + while ((buf = MSAFileGetLine(afp)) != NULL) + { + if (*buf == '>') + { + buf++; /* skip the '>' */ + if ((name = sre_strtok(&buf, WHITESPACE, &len1)) == NULL) + Die("Blank name in A2M file %s (line %d)\n", afp->fname, afp->linenumber); + desc = sre_strtok(&buf, "\n", &len2); + + idx = GKIStoreKey(msa->index, name); + if (idx >= msa->nseqalloc) MSAExpand(msa); + + msa->sqname[idx] = sre_strdup(name, len1); + if (desc != NULL) MSASetSeqDescription(msa, idx, desc); + msa->nseq++; + } + else if (name != NULL) + { + if ((seq = sre_strtok(&buf, WHITESPACE, &len1)) == NULL) continue; + msa->sqlen[idx] = sre_strcat(&(msa->aseq[idx]), msa->sqlen[idx], seq, len1); + } + } + if (name == NULL) { MSAFree(msa); return NULL; } + + MSAVerifyParse(msa); + return msa; +} + + +/* Function: WriteA2M() + * Date: SRE, Sun Jun 6 17:40:35 1999 [bus from Madison, 1999 worm mtg] + * + * Purpose: Write an "aligned FASTA" (aka a2m, to UCSC) formatted + * alignment. + * + * Args: fp - open FILE to write to. + * msa - alignment to write + * + * Returns: void + */ +void +WriteA2M(FILE *fp, MSA *msa) +{ + int idx; /* sequence index */ + int pos; /* position in sequence */ + char buf[64]; /* buffer for individual lines */ + int cpl = 60; /* char per line; must be < 64 unless buf is bigger */ + + buf[cpl] = '\0'; + for (idx = 0; idx < msa->nseq; idx++) + { + fprintf(fp, ">%s %s\n", + msa->sqname[idx], + (msa->sqdesc != NULL && msa->sqdesc[idx] != NULL) ? 
msa->sqdesc[idx] : ""); + for (pos = 0; pos < msa->alen; pos+=cpl) + { + strncpy(buf, &(msa->aseq[idx][pos]), cpl); + fprintf(fp, "%s\n", buf); + } + } +} diff --git a/forester/archive/RIO/others/hmmer/squid/afetch_main.c b/forester/archive/RIO/others/hmmer/squid/afetch_main.c new file mode 100644 index 0000000..23119a5 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/afetch_main.c @@ -0,0 +1,182 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* afetch_main.c + * SRE, Tue Nov 9 18:47:02 1999 [Saint Louis] + * + * afetch -- a program to extract alignments from the Pfam database + * + * CVS $Id: afetch_main.c,v 1.1.1.1 2005/03/22 08:34:30 cmzmasek Exp $ + */ + +#include +#include +#include "squid.h" +#include "msa.h" +#include "ssi.h" + +static char banner[] = "afetch - retrieve an alignment from Pfam"; + +static char usage[] = "\ +Usage: afetch [-options] \n\ + or: afetch --index \n\ +\n\ + Get an alignment from a database.\n\ + Available options:\n\ + -h : help; print version and usage info\n\ +"; + +static char experts[] = "\ + --index : construct indices for the database\n\ +"; + +struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, + { "--index", FALSE, sqdARG_NONE } +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +int +main(int argc, char **argv) +{ + char *afile; /* name of alignment file to read */ + MSAFILE *afp; /* pointer to open alignment file */ + char *key; /* name/accession of alignment to fetch */ + MSA *msa; /* the fetched alignment */ + int format; /* format of afile */ + int do_index; /* TRUE to index instead of retrieve */ + + char
*optname; + char *optarg; + int optind; + + /*********************************************** + * Parse the command line + ***********************************************/ + + /* initializations and defaults */ + format = MSAFILE_STOCKHOLM; /* period. It's the only multi-MSA file format. */ + do_index = FALSE; + key = NULL; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) + { + if (strcmp(optname, "--index") == 0) { do_index = TRUE; } + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(EXIT_SUCCESS); + } + } + + if ((do_index && argc - optind != 1) || (! do_index && argc - optind != 2)) + Die("Incorrect number of command line arguments.\n%s\n", usage); + + afile = argv[optind++]; + if (! do_index) key = argv[optind++]; + + if ((afp = MSAFileOpen(afile, format, NULL)) == NULL) + Die("Alignment file %s could not be opened for reading", afile); + + /*********************************************** + * Section 1. Alignment database indexing + ***********************************************/ + + if (do_index) { + int mode; + char *ssifile; + SSIINDEX *si; + int fh; + int status; + SSIOFFSET offset; + int n = 0; + + /* Not that we're expecting an alignment file so + * large that it would require a 64-bit index, but...
+ */ + if ((mode = SSIRecommendMode(afile)) == -1) + Die("File %s doesn't exist, or is too large for your OS", afile); + + ssifile = sre_strdup(afile, -1); + sre_strcat(&ssifile, -1, ".ssi", -1); + + if ((si = SSICreateIndex(mode)) == NULL) + Die("Couldn't allocate/initialize the new SSI index"); + if (SSIAddFileToIndex(si, afile, afp->format, &fh) != 0) + Die("SSIAddFileToIndex() failed"); + + status = SSIGetFilePosition(afp->f, mode, &offset); + if (status != 0) Die("SSIGetFilePosition() failed"); + + while ((msa = MSAFileRead(afp)) != NULL) + { + if (msa->name == NULL) + Die("SSI index requires that every MSA has a name"); + + status = SSIAddPrimaryKeyToIndex(si, msa->name, fh, &offset, NULL, 0); + if (status != 0) Die("SSIAddPrimaryKeyToIndex() failed"); + + if (msa->acc != NULL) { + status = SSIAddSecondaryKeyToIndex(si, msa->acc, msa->name); + if (status != 0) Die("SSIAddSecondaryKeyToIndex() failed"); + } + + status = SSIGetFilePosition(afp->f, mode, &offset); + if (status != 0) Die("SSIGetFilePosition() failed"); + + n++; + MSAFree(msa); + } + + status = SSIWriteIndex(ssifile, si); + if (status != 0) Die("SSIWriteIndex() failed"); + + printf ("%d alignments indexed in SSI index %s\n", n, ssifile); + free(ssifile); + MSAFileClose(afp); + SSIFreeIndex(si); + SqdClean(); + exit (0); /* exit indexing program here */ + } + + /*********************************************** + * Section 2. Alignment retrieval + ***********************************************/ + + /* Indexed retrieval: + */ + if (afp->ssi != NULL) { + if (!
MSAFilePositionByKey(afp, key)) + Die("No such alignment %s found in file %s", key, afile); + msa = MSAFileRead(afp); + } + /* Brute force retrieval. An MSA may lack a name or an accession + * (the indexing section above checks both for NULL), so guard each strcmp(). */ + else { + while ((msa = MSAFileRead(afp)) != NULL) + { + if (msa->name != NULL && strcmp(msa->name, key) == 0) break; + if (msa->acc != NULL && strcmp(msa->acc, key) == 0) break; + MSAFree(msa); + } + } + + if (msa == NULL) Die("Failed to retrieve %s from file %s", key, afile); + + /* Output the alignment we retrieved + */ + WriteStockholm(stdout, msa); + + MSAFileClose(afp); + MSAFree(msa); + exit (0); +} diff --git a/forester/archive/RIO/others/hmmer/squid/aligneval.c b/forester/archive/RIO/others/hmmer/squid/aligneval.c new file mode 100644 index 0000000..e9c23a2 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/aligneval.c @@ -0,0 +1,513 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* aligneval.c + * RCS $Id: aligneval.c,v 1.1.1.1 2005/03/22 08:34:31 cmzmasek Exp $ + * + * Comparison of multiple alignments. Three functions are + * provided, using subtly different scoring schemes: + * CompareMultAlignments() - basic scoring scheme + * CompareRefMultAlignments() - only certain "canonical" columns + * are scored + * + * The similarity measure is a fractional alignment identity averaged + * over all sequence pairs.
The score for all pairs is: + * (identically aligned symbols) / (total aligned columns in + * known alignment) + * + * A column c is identically aligned for sequences i, j if: + * 1) both i,j have a symbol aligned in column c, and the + * same pair of symbols is aligned somewhere in the test + * alignment + * 2) S[i][c] is aligned to a gap in sequence j, and that symbol + * is aligned to a gap in the test alignment + * 3) converse of 2) + * + * + * The algorithm is as follows: + * 1) For each known/test aligned pair of sequences (k1,k2 and t1,t2) + * construct a list for each sequence, in which for every + * counted symbol we record the raw index of the symbol in + * the other sequence that it aligns to, or -1 if it aligns + * to a gap or uncounted symbol. + * + * 2) Compare the list for k1 to the list for t1 and count an identity + * for each correct alignment. + * + * 3) Repeat 2) for comparing k2 to t2. Note that this means correct sym/sym + * alignments count for 2; correct sym/gap alignments count for 1. + * + * 4) The score is (identities from 2 + identities from 3) / + * (totals from 2 + totals from 3). + * + * Written originally for koala's ss2 pairwise alignment package. + * + * Sean Eddy, Sun Nov 1 12:45:11 1992 + * SRE, Thu Jul 29 16:47:18 1993: major revision: all functions replaced by new algorithm + * CVS $Id: aligneval.c,v 1.1.1.1 2005/03/22 08:34:31 cmzmasek Exp $ + */ + + +#include +#include +#include +#include "squid.h" + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + +static int make_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen); +static int make_ref_alilist(int *refcoords, char *k1, char *k2, char *s1, char *s2, + int **ret_s1_list, int *ret_listlen); +static int compare_lists(int *k1, int *k2, int *t1, int *t2, int len1, int len2, float *ret_sc); + + +/* Function: ComparePairAlignments + * + * Purpose: Calculate and return a number representing how well two different alignments + * of a pair of sequences compare. 
The number is, roughly speaking, + * the fraction of columns which are identically aligned. + * + * For all columns c in which either known1[c] or known2[c] + * is a non-gap, count an identity if those same symbols are + * aligned somewhere in calc1/calc2. The score is identities/total + * columns examined. (i.e. fully gapped columns don't count) + * + * more explicitly, identities come from: + * both known and test aligned pairs have the same symbol in the first sequence aligned to + * a gap in the second sequence; + * both known and test aligned pairs have the same symbol in the second sequence + * aligned to a gap in the first sequence; + * the known alignment has symbols aligned at this column, and the test + * alignment aligns the same two symbols. + * + * Args: known1, known2: trusted alignment of two sequences + * calc1, calc2: test alignment of two sequences + * + * Return: Returns -1.0 on internal failure. + */ +float +ComparePairAlignments(char *known1, char *known2, char *calc1, char *calc2) +{ + int *klist1; + int *klist2; + int *tlist1; + int *tlist2; + int len1, len2; + float score; + + if (! make_alilist(calc1, calc2, &tlist1, &len1)) return -1.0; + if (! make_alilist(calc2, calc1, &tlist2, &len2)) return -1.0; + if (! make_alilist(known1, known2, &klist1, &len1)) return -1.0; + if (! make_alilist(known2, known1, &klist2, &len2)) return -1.0; + if (! compare_lists(klist1, klist2, tlist1, tlist2, len1, len2, &score)) return -1.0; + + free(klist1); + free(klist2); + free(tlist1); + free(tlist2); + return score; +} + + + +/* Function: CompareRefPairAlignments() + * + * Same as above, but the only columns that count are the ones + * with indices in *refcoord. 
*refcoord and the known1, known2 + * pair must be in sync with each other (come from the same + * multiple sequence alignment) + * + * Args: ref - 0..alen-1 array of 1 or 0 + * known1,known2 - trusted alignment + * calc1, calc2 - test alignment + * + * Return: the fractional alignment identity on success, -1.0 on failure. + */ +float +CompareRefPairAlignments(int *ref, char *known1, char *known2, char *calc1, char *calc2) +{ + int *klist1; + int *klist2; + int *tlist1; + int *tlist2; + int len1, len2; + float score; + + if (! make_ref_alilist(ref, known1, known2, calc1, calc2, &tlist1, &len1)) return -1.0; + if (! make_ref_alilist(ref, known2, known1, calc2, calc1, &tlist2, &len2)) return -1.0; + if (! make_ref_alilist(ref, known1, known2, known1, known2, &klist1, &len1)) return -1.0; + if (! make_ref_alilist(ref, known2, known1, known2, known1, &klist2, &len2)) return -1.0; + if (! compare_lists(klist1, klist2, tlist1, tlist2, len1, len2, &score)) return -1.0; + + free(klist1); + free(klist2); + free(tlist1); + free(tlist2); + return score; +} + +/* Function: make_alilist() + * + * Purpose: Construct a list (array) mapping the raw symbols of s1 + * onto the indexes of the aligned symbols in s2 (or -1 + * for gaps in s2). The list (s1_list) will be of the + * length of s1's raw sequence. + * + * Args: s1 - sequence to construct the list for + * s2 - sequence s1 is aligned to + * ret_s1_list - RETURN: the constructed list (caller must free) + * ret_listlen - RETURN: length of the list + * + * Returns: 1 on success, 0 on failure + */ +static int +make_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen) +{ + int *s1_list; + int col; /* column position in alignment */ + int r1, r2; /* raw symbol index at current col in s1, s2 */ + + /* Malloc for s1_list. 
It can't be longer than s1 itself; we just malloc + * for that (and waste a wee bit of space) + */ + s1_list = (int *) MallocOrDie (sizeof(int) * strlen(s1)); + r1 = r2 = 0; + for (col = 0; s1[col] != '\0'; col++) + { + /* symbol in s1? Record what it's aligned to, and bump + * the r1 counter. + */ + if (! isgap(s1[col])) + { + s1_list[r1] = isgap(s2[col]) ? -1 : r2; + r1++; + } + + /* symbol in s2? bump the r2 counter + */ + if (! isgap(s2[col])) + r2++; + } + + *ret_listlen = r1; + *ret_s1_list = s1_list; + return 1; +} + + + +/* Function: make_ref_alilist() + * + * Purpose: Construct a list (array) mapping the raw symbols of s1 + * which are under canonical columns of the ref alignment + * onto the indexes of the aligned symbols in s2 (or -1 + * for gaps in s2 or noncanonical symbols in s2). + * + * Args: ref: - array of indices of canonical coords (1 canonical, 0 non) + * k1 - s1's known alignment (w/ respect to refcoords) + * k2 - s2's known alignment (w/ respect to refcoords) + * s1 - sequence to construct the list for + * s2 - sequence s1 is aligned to + * ret_s1_list - RETURN: the constructed list (caller must free) + * ret_listlen - RETURN: length of the list + * + * Returns: 1 on success, 0 on failure + */ +/*ARGSUSED*/ +static int +make_ref_alilist(int *ref, char *k1, char *k2, + char *s1, char *s2, int **ret_s1_list, int *ret_listlen) +{ + int *s1_list; + int col; /* column position in alignment */ + int r1, r2; /* raw symbol index at current col in s1, s2 */ + int *canons1; /* flag array, 1 if position i in s1 raw seq is canonical */ + int lpos; /* position in list */ + + /* Allocations. No arrays can exceed the length of their + * appropriate parent (s1 or s2) + */ + s1_list = (int *) MallocOrDie (sizeof(int) * strlen(s1)); + canons1 = (int *) MallocOrDie (sizeof(int) * strlen(s1)); + + /* First we use refcoords and k1,k2 to construct an array of 1's + * and 0's, telling us whether s1's raw symbol number i is countable. 
+ * It's countable simply if it's under a canonical column. + */ + r1 = 0; + for (col = 0; k1[col] != '\0'; col++) + { + if (! isgap(k1[col])) + { + canons1[r1] = ref[col] ? 1 : 0; + r1++; + } + } + + /* Now we can construct the list. We don't count pairs if the sym in s1 + * is non-canonical. + * We have to keep separate track of our position in the list (lpos) + * from our positions in the raw sequences (r1,r2) + */ + r1 = r2 = lpos = 0; + for (col = 0; s1[col] != '\0'; col++) + { + if (! isgap(s1[col]) && canons1[r1]) + { + s1_list[lpos] = isgap(s2[col]) ? -1 : r2; + lpos++; + } + + if (! isgap(s1[col])) + r1++; + if (! isgap(s2[col])) + r2++; + } + + free(canons1); + *ret_listlen = lpos; + *ret_s1_list = s1_list; + return 1; +} + +/* Function: compare_lists() + * + * Purpose: Given four alignment lists (k1,k2, t1,t2), calculate the + * alignment score. + * + * Args: k1 - list of k1's alignment to k2 + * k2 - list of k2's alignment to k1 + * t1 - list of t1's alignment to t2 + * t2 - list of t2's alignment to t2 + * len1 - length of k1, t1 lists (same by definition) + * len2 - length of k2, t2 lists (same by definition) + * ret_sc - RETURN: identity score of alignment + * + * Return: 1 on success, 0 on failure. + */ +static int +compare_lists(int *k1, int *k2, int *t1, int *t2, int len1, int len2, float *ret_sc) +{ + float id; + float tot; + int i; + + id = tot = 0.0; + for (i = 0; i < len1; i++) + { + tot += 1.0; + if (t1[i] == k1[i]) id += 1.0; + } + + for ( i = 0; i < len2; i++) + { + tot += 1.0; + if (k2[i] == t2[i]) id += 1.0; + } + + *ret_sc = id / tot; + return 1; +} + + +/* Function: CompareMultAlignments + * + * Purpose: Invokes pairwise alignment comparison for every possible pair, + * and returns the average score over all N(N-1) of them or -1.0 + * on an internal failure. + * + * Can be slow for large N, since it's quadratic. 
+ * + * Args: kseqs - trusted multiple alignment + * tseqs - test multiple alignment + * N - number of sequences + * + * Return: average identity score, or -1.0 on failure. + */ +float +CompareMultAlignments(char **kseqs, char **tseqs, int N) +{ + int i, j; /* counters for sequences */ + float score; + float tot_score = 0.0; + /* do all pairwise comparisons */ + for (i = 0; i < N; i++) + for (j = i+1; j < N; j++) + { + score = ComparePairAlignments(kseqs[i], kseqs[j], tseqs[i], tseqs[j]); + if (score < 0.0) return -1.0; + tot_score += score; + } + return ((tot_score * 2.0) / ((float) N * ((float) N - 1.0))); +} + + + +/* Function: CompareRefMultAlignments() + * + * Purpose: Same as above, except an array of reference coords for + * the canonical positions of the known alignment is also + * provided. + * + * Args: ref : 0..alen-1 array of 1/0 flags, 1 if canon + * kseqs : trusted alignment + * tseqs : test alignment + * N : number of sequences + * + * Return: average identity score, or -1.0 on failure + */ +float +CompareRefMultAlignments(int *ref, char **kseqs, char **tseqs, int N) +{ + int i, j; /* counters for sequences */ + float score; + float tot_score = 0.0; + + /* do all pairwise comparisons */ + for (i = 0; i < N; i++) + for (j = i+1; j < N; j++) + { + score = CompareRefPairAlignments(ref, kseqs[i], kseqs[j], tseqs[i], tseqs[j]); + if (score < 0.0) return -1.0; + tot_score += score; + } + return ((tot_score * 2.0)/ ((float) N * ((float) N - 1.0))); +} + +/* Function: PairwiseIdentity() + * + * Purpose: Calculate the pairwise fractional identity between + * two aligned sequences s1 and s2. This is simply + * (idents / MIN(len1, len2)). + * + * Note how many ways there are to calculate pairwise identity, + * because of the variety of choices for the denominator: + * idents/(idents+mismat) has the disadvantage that artifactual + * gappy alignments would have high "identities". 
+ * idents/(AVG|MAX)(len1,len2) both have the disadvantage that + * alignments of fragments to longer sequences would have + * artifactually low "identities". + * + * Case sensitive; also, watch out in nucleic acid alignments; + * U/T RNA/DNA alignments will be counted as mismatches! + */ +float +PairwiseIdentity(char *s1, char *s2) +{ + int idents; /* total identical positions */ + int len1, len2; /* lengths of seqs */ + int x; /* position in aligned seqs */ + + idents = len1 = len2 = 0; + for (x = 0; s1[x] != '\0' && s2[x] != '\0'; x++) + { + if (!isgap(s1[x])) { + len1++; + if (s1[x] == s2[x]) idents++; + } + if (!isgap(s2[x])) len2++; + } + if (len2 < len1) len1 = len2; + return (len1 == 0 ? 0.0 : (float) idents / (float) len1); +} + + + +/* Function: AlignmentIdentityBySampling() + * Date: SRE, Mon Oct 19 14:29:01 1998 [St. Louis] + * + * Purpose: Estimate and return the average pairwise + * fractional identity of an alignment, + * using sampling. + * + * For use when there's so many sequences that + * an all vs. all rigorous calculation will + * take too long. + * + * Case sensitive! + * + * Args: aseq - aligned sequences + * L - length of alignment + * N - number of seqs in alignment + * nsample - number of samples + * + * Returns: average fractional identity, 0..1. + */ +float +AlignmentIdentityBySampling(char **aseq, int L, int N, int nsample) +{ + int x, i, j; /* counters */ + float sum; + + if (N < 2) return 1.0; + + sum = 0.; + for (x = 0; x < nsample; x++) + { + i = CHOOSE(N); + do { j = CHOOSE(N); } while (j == i); /* make sure j != i */ + sum += PairwiseIdentity(aseq[i], aseq[j]); + } + return sum / (float) nsample; +} + +/* Function: MajorityRuleConsensus() + * Date: SRE, Tue Mar 7 15:30:30 2000 [St. Louis] + * + * Purpose: Given a set of aligned sequences, produce a + * majority rule consensus sequence. If >50% nonalphabetic + * (usually meaning gaps) in the column, ignore the column. 
+ * + * Args: aseq - aligned sequences, [0..nseq-1][0..alen-1] + * nseq - number of sequences + * alen - length of alignment + * + * Returns: ptr to allocated consensus sequence. + * Caller is responsible for free'ing this. + */ +char * +MajorityRuleConsensus(char **aseq, int nseq, int alen) +{ + char *cs; /* RETURN: consensus sequence */ + int count[27]; /* counts for a..z and gaps in a column */ + int idx,apos; /* counters for seq, column */ + int spos; /* position in cs */ + int x; /* counter for characters */ + int sym; + int max, bestx; + + cs = MallocOrDie(sizeof(char) * (alen+1)); + + for (spos=0,apos=0; apos < alen; apos++) + { + for (x = 0; x < 27; x++) count[x] = 0; + + for (idx = 0; idx < nseq; idx++) + { + if (isalpha(aseq[idx][apos])) { + sym = toupper(aseq[idx][apos]); + count[sym-'A']++; + } else { + count[26]++; + } + } + + if ((float) count[26] / (float) nseq <= 0.5) { + max = bestx = -1; + for (x = 0; x < 26; x++) + if (count[x] > max) { max = count[x]; bestx = x; } + cs[spos++] = (char) ('A' + bestx); + } + } + cs[spos] = '\0'; + return cs; +} diff --git a/forester/archive/RIO/others/hmmer/squid/alignio.c b/forester/archive/RIO/others/hmmer/squid/alignio.c new file mode 100644 index 0000000..f9070a8 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/alignio.c @@ -0,0 +1,643 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* alignio.c + * SRE, Mon Jul 12 11:57:37 1993 + * RCS $Id: alignio.c,v 1.1.1.1 2005/03/22 08:34:27 cmzmasek Exp $ + * + * Input/output of sequence alignments. 
+ */ + +#include +#include +#include +#include +#include "squid.h" + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + +/* Function: AllocAlignment() + * + * Purpose: Allocate space for an alignment, given the number + * of sequences and the alignment length in columns. + * + * Args: nseq - number of sequences + * alen - width of alignment + * ret_aseq - RETURN: alignment itself + * ainfo - RETURN: other info associated with alignment + * + * Return: (void) + * aseq, ainfo free'd by caller: FreeAlignment(aseq, &ainfo). + * note that ainfo itself is alloc'ed in caller, usually + * just by a "AINFO ainfo" definition. + */ +void +AllocAlignment(int nseq, int alen, char ***ret_aseq, AINFO *ainfo) +{ + char **aseq; + int idx; + + InitAinfo(ainfo); + + aseq = (char **) MallocOrDie (sizeof(char *) * nseq); + for (idx = 0; idx < nseq; idx++) + aseq[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1)); + + ainfo->alen = alen; + ainfo->nseq = nseq; + + ainfo->wgt = (float *) MallocOrDie (sizeof(float) * nseq); + FSet(ainfo->wgt, nseq, 1.0); + + ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq); + for (idx = 0; idx < nseq; idx++) + ainfo->sqinfo[idx].flags = 0; + + *ret_aseq = aseq; +} + + +/* Function: InitAinfo() + * Date: SRE, Tue Jan 19 10:16:02 1999 [St. Louis] + * + * Purpose: Initialize the fields in ainfo structure to + * default (null) values. Does nothing with + * fields that are dependent on nseq or alen. + * + * Args: ainfo - optional info structure for an alignment + * + * Returns: (void). ainfo is modified. + */ +void +InitAinfo(AINFO *ainfo) +{ + ainfo->name = NULL; + ainfo->desc = NULL; + ainfo->cs = NULL; + ainfo->rf = NULL; + ainfo->acc = NULL; + ainfo->au = NULL; + ainfo->flags = 0; + + ainfo->tc1 = ainfo->tc2 = 0.0; + ainfo->nc1 = ainfo->nc2 = 0.0; + ainfo->ga1 = ainfo->ga2 = 0.0; +} + + +/* Function: FreeAlignment() + * + * Purpose: Free the space allocated to alignment, names, and optional + * information. 
+ * + * Args: aseqs - sequence alignment + * ainfo - associated alignment data. + */ +void +FreeAlignment(char **aseqs, AINFO *ainfo) +{ + int i; + + for (i = 0; i < ainfo->nseq; i++) + { + if (ainfo->sqinfo[i].flags & SQINFO_SS) free(ainfo->sqinfo[i].ss); + if (ainfo->sqinfo[i].flags & SQINFO_SA) free(ainfo->sqinfo[i].sa); + } + if (ainfo->cs != NULL) free(ainfo->cs); + if (ainfo->rf != NULL) free(ainfo->rf); + if (ainfo->name != NULL) free(ainfo->name); + if (ainfo->desc != NULL) free(ainfo->desc); + if (ainfo->acc != NULL) free(ainfo->acc); + if (ainfo->au != NULL) free(ainfo->au); + + free(ainfo->sqinfo); + free(ainfo->wgt); + Free2DArray((void **) aseqs, ainfo->nseq); +} + + + +/* Function: SAMizeAlignment() + * Date: SRE, Tue Jun 30 09:49:40 1998 [St. Louis] + * + * Purpose: Make a "best effort" attempt to convert an alignment + * to SAM gap format: - in delete col, . in insert col. + * Only works if alignment adheres to SAM's upper/lower + * case convention, which is true for instance of old + * HMMER alignments. + * + * Args: aseq - alignment to convert + * nseq - number of seqs in alignment + * alen - length of alignment + * + * Returns: (void) + */ +void +SAMizeAlignment(char **aseq, int nseq, int alen) +{ + int col; /* counter for aligned columns */ + int i; /* counter for seqs */ + int sawlower, sawupper, sawgap; + char gapchar; + + for (col = 0; col < alen; col++) + { + sawlower = sawupper = sawgap = 0; + /* pass 1: do we see only upper or lower? */ + for (i = 0; i < nseq; i++) + { + if (isgap(aseq[i][col])) { sawgap = 1; continue; } + if (isupper((int) aseq[i][col])) { sawupper = 1; continue; } + if (islower((int) aseq[i][col])) sawlower = 1; + } + /* select gap character for column */ + gapchar = '-'; /* default */ + if (sawlower && ! 
sawupper) gapchar = '.'; + + /* pass 2: set gap char */ + for (i = 0; i < nseq; i++) + if (isgap(aseq[i][col])) aseq[i][col] = gapchar; + } +} + + +/* Function: SAMizeAlignmentByGapFrac() + * Date: SRE, Tue Jun 30 10:58:38 1998 [St. Louis] + * + * Purpose: Convert an alignment to SAM's gap and case + * conventions, using gap fraction in a column + * to choose match versus insert columns. In match columns, + * residues are upper case and gaps are '-'. + * In insert columns, residues are lower case and + * gaps are '.' + * + * Args: aseq - aligned sequences + * nseq - number of sequences + * alen - length of alignment + * maxgap - if more gaps than this fraction, column is insert. + * + * Returns: (void) Characters in aseq may be altered. + */ +void +SAMizeAlignmentByGapFrac(char **aseq, int nseq, int alen, float maxgap) +{ + int apos; /* counter over columns */ + int idx; /* counter over sequences */ + int ngap; /* number of gaps seen */ + + for (apos = 0; apos < alen; apos++) + { + /* count gaps */ + ngap = 0; + for (idx = 0; idx < nseq; idx++) + if (isgap(aseq[idx][apos])) ngap++; + + /* convert to SAM conventions */ + if ((float) ngap / (float) nseq > maxgap) + { /* insert column */ + for (idx = 0; idx < nseq; idx++) + if (isgap(aseq[idx][apos])) aseq[idx][apos] = '.'; + else aseq[idx][apos] = (char) tolower((int) aseq[idx][apos]); + } + else + { /* match column */ + for (idx = 0; idx < nseq; idx++) + if (isgap(aseq[idx][apos])) aseq[idx][apos] = '-'; + else aseq[idx][apos] = (char) toupper((int) aseq[idx][apos]); + } + } +} + + + + +/* Function: MakeAlignedString() + * + * Purpose: Given a raw string of some type (secondary structure, say), + * align it to a given aseq by putting gaps wherever the + * aseq has gaps. + * + * Args: aseq: template for alignment + * alen: length of aseq + * ss: raw string to align to aseq + * ret_s: RETURN: aligned ss + * + * Return: 1 on success, 0 on failure (and squid_errno is set.) 
+ * ret_ss is malloc'ed here and must be free'd by caller. + */ +int +MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s) +{ + char *new; + int apos, rpos; + + new = (char *) MallocOrDie ((alen+1) * sizeof(char)); + for (apos = rpos = 0; apos < alen; apos++) + if (! isgap(aseq[apos])) + { + new[apos] = ss[rpos]; + rpos++; + } + else + new[apos] = '.'; + new[apos] = '\0'; + + if (rpos != strlen(ss)) + { squid_errno = SQERR_PARAMETER; free(new); return 0; } + *ret_s = new; + return 1; +} + + +/* Function: MakeDealignedString() + * + * Purpose: Given an aligned string of some type (either sequence or + * secondary structure, for instance), dealign it relative + * to a given aseq. Return a ptr to the new string. + * + * Args: aseq : template alignment + * alen : length of aseq + * ss: : string to make dealigned copy of; same length as aseq + * ret_s : RETURN: dealigned copy of ss + * + * Return: 1 on success, 0 on failure (and squid_errno is set) + * ret_s is alloc'ed here and must be freed by caller + */ +int +MakeDealignedString(char *aseq, int alen, char *ss, char **ret_s) +{ + char *new; + int apos, rpos; + + new = (char *) MallocOrDie ((alen+1) * sizeof(char)); + for (apos = rpos = 0; apos < alen; apos++) + if (! isgap(aseq[apos])) + { + new[rpos] = ss[apos]; + rpos++; + } + new[rpos] = '\0'; + if (alen != strlen(ss)) + { squid_errno = SQERR_PARAMETER; free(new); return 0; } + *ret_s = new; + return 1; +} + + +/* Function: DealignedLength() + * + * Purpose: Count the number of non-gap symbols in seq. + * (i.e. find the length of the unaligned sequence) + * + * Args: aseq - aligned sequence to count symbols in, \0 terminated + * + * Return: raw length of seq. + */ +int +DealignedLength(char *aseq) +{ + int rlen; + for (rlen = 0; *aseq; aseq++) + if (! 
isgap(*aseq)) rlen++; + return rlen; +} + + +/* Function: WritePairwiseAlignment() + * + * Purpose: Write a nice formatted pairwise alignment out, + * with a BLAST-style middle line showing identities + * as themselves (single letter) and conservative + * changes as '+'. + * + * Args: ofp - open fp to write to (stdout, perhaps) + * aseq1, aseq2 - alignments to write (not necessarily + * flushed right with gaps) + * name1, name2 - names of sequences + * spos1, spos2 - starting position in each (raw) sequence + * pam - PAM matrix; positive values define + * conservative changes + * indent - how many extra spaces to print on left + * + * Return: 1 on success, 0 on failure + */ +int +WritePairwiseAlignment(FILE *ofp, + char *aseq1, char *name1, int spos1, + char *aseq2, char *name2, int spos2, + int **pam, int indent) +{ + char sname1[11]; /* shortened name */ + char sname2[11]; + int still_going; /* True if writing another block */ + char buf1[61]; /* buffer for writing seq1; CPL+1*/ + char bufmid[61]; /* buffer for writing consensus */ + char buf2[61]; + char *s1, *s2; /* ptrs into each sequence */ + int count1, count2; /* number of symbols we're writing */ + int rpos1, rpos2; /* position in raw seqs */ + int rawcount1, rawcount2; /* number of nongap symbols written */ + int apos; + + strncpy(sname1, name1, 10); + sname1[10] = '\0'; + strtok(sname1, WHITESPACE); + + strncpy(sname2, name2, 10); + sname2[10] = '\0'; + strtok(sname2, WHITESPACE); + + s1 = aseq1; + s2 = aseq2; + rpos1 = spos1; + rpos2 = spos2; + + still_going = TRUE; + while (still_going) + { + still_going = FALSE; + + /* get next line's worth from both */ + strncpy(buf1, s1, 60); buf1[60] = '\0'; + strncpy(buf2, s2, 60); buf2[60] = '\0'; + count1 = strlen(buf1); + count2 = strlen(buf2); + + /* is there still more to go? 
*/ + if ((count1 == 60 && s1[60] != '\0') || + (count2 == 60 && s2[60] != '\0')) + still_going = TRUE; + + /* shift seq ptrs by a line */ + s1 += count1; + s2 += count2; + + /* assemble the consensus line */ + for (apos = 0; apos < count1 && apos < count2; apos++) + { + if (!isgap(buf1[apos]) && !isgap(buf2[apos])) + { + if (buf1[apos] == buf2[apos]) + bufmid[apos] = buf1[apos]; + else if (pam[buf1[apos] - 'A'][buf2[apos] - 'A'] > 0) + bufmid[apos] = '+'; + else + bufmid[apos] = ' '; + } + else + bufmid[apos] = ' '; + } + bufmid[apos] = '\0'; + + rawcount1 = 0; + for (apos = 0; apos < count1; apos++) + if (!isgap(buf1[apos])) rawcount1++; + + rawcount2 = 0; + for (apos = 0; apos < count2; apos++) + if (!isgap(buf2[apos])) rawcount2++; + + (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "", + sname1, rpos1, buf1, rpos1 + rawcount1 -1); + (void) fprintf(ofp, "%*s %s\n", indent, "", + bufmid); + (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "", + sname2, rpos2, buf2, rpos2 + rawcount2 -1); + (void) fprintf(ofp, "\n"); + + rpos1 += rawcount1; + rpos2 += rawcount2; + } + + return 1; +} + + +/* Function: MingapAlignment() + * + * Purpose: Remove all-gap columns from a multiple sequence alignment + * and its associated data. The alignment is assumed to be + * flushed (all aseqs the same length). + */ +int +MingapAlignment(char **aseqs, AINFO *ainfo) +{ + int apos; /* position in original alignment */ + int mpos; /* position in new alignment */ + int idx; + + /* We overwrite aseqs, using its allocated memory. + */ + for (apos = 0, mpos = 0; aseqs[0][apos] != '\0'; apos++) + { + /* check for all-gap in column */ + for (idx = 0; idx < ainfo->nseq; idx++) + if (! 
isgap(aseqs[idx][apos])) + break; + if (idx == ainfo->nseq) continue; + + /* shift alignment and ainfo */ + if (mpos != apos) + { + for (idx = 0; idx < ainfo->nseq; idx++) + aseqs[idx][mpos] = aseqs[idx][apos]; + + if (ainfo->cs != NULL) ainfo->cs[mpos] = ainfo->cs[apos]; + if (ainfo->rf != NULL) ainfo->rf[mpos] = ainfo->rf[apos]; + } + mpos++; + } + /* null terminate everything */ + for (idx = 0; idx < ainfo->nseq; idx++) + aseqs[idx][mpos] = '\0'; + ainfo->alen = mpos; /* set new length */ + if (ainfo->cs != NULL) ainfo->cs[mpos] = '\0'; + if (ainfo->rf != NULL) ainfo->rf[mpos] = '\0'; + return 1; +} + + + +/* Function: RandomAlignment() + * + * Purpose: Create a random alignment from raw sequences. + * + * Ideally, we would like to sample an alignment from the + * space of possible alignments according to its probability, + * given a prior probability distribution for alignments. + * I don't see how to describe such a distribution, let alone + * sample it. + * + * This is a rough approximation that tries to capture some + * desired properties. We assume the alignment is generated + * by a simple HMM composed of match and insert states. + * Given parameters (pop, pex) for the probability of opening + * and extending an insertion, we can find the expected number + * of match states, M, in the underlying model for each sequence. + * We use an average M taken over all the sequences (this is + * an approximation. The expectation of M given all the sequence + * lengths is a nasty-looking summation.) + * + * M = len / ( 1 + pop ( 1 + 1/ (1-pex) ) ) + * + * Then, we assign positions in each raw sequence onto the M match + * states and M+1 insert states of this "HMM", by rolling random + * numbers and inserting the (rlen-M) inserted positions randomly + * into the insert slots, taking into account the relative probability + * of open vs. extend. 
+ * + * The resulting alignment has two desired properties: insertions + * tend to follow the HMM-like exponential distribution, and + * the "sparseness" of the alignment is controllable through + * pop and pex. + * + * Args: rseqs - raw sequences to "align", 0..nseq-1 + * sqinfo - array of 0..nseq-1 info structures for the sequences + * nseq - number of sequences + * pop - probability to open insertion (0 minlen) M = minlen; + + /* make arrays that count insertions in M+1 possible insert states + */ + ins = (int **) MallocOrDie (sizeof(int *) * nseq); + master_ins = (int *) MallocOrDie (sizeof(int) * (M+1)); + for (idx = 0; idx < nseq; idx++) + { + ins[idx] = (int *) MallocOrDie (sizeof(int) * (M+1)); + for (rpos = 0; rpos <= M; rpos++) + ins[idx][rpos] = 0; + } + /* normalize */ + pop = pop / (pop+pex); + pex = 1.0 - pop; + /* make insertions for individual sequences */ + for (idx = 0; idx < nseq; idx++) + { + apos = -1; + for (rpos = 0; rpos < rlen[idx]-M; rpos++) + { + if (sre_random() < pop || apos == -1) /* open insertion */ + apos = CHOOSE(M+1); /* choose 0..M */ + ins[idx][apos]++; + } + } + /* calculate master_ins, max inserts */ + alen = M; + for (apos = 0; apos <= M; apos++) + { + master_ins[apos] = 0; + for (idx = 0; idx < nseq; idx++) + if (ins[idx][apos] > master_ins[apos]) + master_ins[apos] = ins[idx][apos]; + alen += master_ins[apos]; + } + + + /* Now, construct alignment + */ + aseqs = (char **) MallocOrDie (sizeof (char *) * nseq); + for (idx = 0; idx < nseq; idx++) + aseqs[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1)); + for (idx = 0; idx < nseq; idx++) + { + apos = rpos = 0; + + for (statepos = 0; statepos <= M; statepos++) + { + for (count = 0; count < ins[idx][statepos]; count++) + aseqs[idx][apos++] = rseqs[idx][rpos++]; + for (; count < master_ins[statepos]; count++) + aseqs[idx][apos++] = ' '; + + if (statepos != M) + aseqs[idx][apos++] = rseqs[idx][rpos++]; + } + aseqs[idx][alen] = '\0'; + } + ainfo->flags = 0; + ainfo->alen = 
alen; + ainfo->nseq = nseq; + ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq); + for (idx = 0; idx < nseq; idx++) + SeqinfoCopy(&(ainfo->sqinfo[idx]), &(sqinfo[idx])); + + free(rlen); + free(master_ins); + Free2DArray((void **) ins, nseq); + *ret_aseqs = aseqs; + return 1; +} + +/* Function: AlignmentHomogenousGapsym() + * Date: SRE, Sun Mar 19 19:37:12 2000 [wren, St. Louis] + * + * Purpose: Sometimes we've got to convert alignments to + * a lowest common denominator, and we need + * a single specific gap character -- for example, + * PSI-BLAST blastpgp -B takes a very simplistic + * alignment input format which appears to only + * allow '-' as a gap symbol. + * + * Anything matching the isgap() macro is + * converted. + * + * Args: aseq - aligned character strings, [0..nseq-1][0..alen-1] + * nseq - number of aligned strings + * alen - length of alignment + * gapsym - character to use for gaps. + * + * Returns: void ("never fails") + */ +void +AlignmentHomogenousGapsym(char **aseq, int nseq, int alen, char gapsym) +{ + int i, apos; + + for (i = 0; i < nseq; i++) + for (apos = 0; apos < alen; apos++) + if (isgap(aseq[i][apos])) aseq[i][apos] = gapsym; +} diff --git a/forester/archive/RIO/others/hmmer/squid/alistat_main.c b/forester/archive/RIO/others/hmmer/squid/alistat_main.c new file mode 100644 index 0000000..b7c2c2f --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/alistat_main.c @@ -0,0 +1,273 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. 
+ *****************************************************************/ + +/* alistat_main.c + * Fri Jan 27 10:41:41 1995 + * CVS $Id: alistat_main.c,v 1.1.1.1 2005/03/22 08:34:31 cmzmasek Exp $ + * + * Look at an alignment file, determine some simple statistics. + */ + +#include +#include +#include +#include "squid.h" +#include "msa.h" + +static char banner[] = "alistat - show some simple statistics on an alignment file"; + +static char usage[] = "\ +Usage: alistat [-options] \n\ + Available options:\n\ + -a : report per-sequence info, not just a summary\n\ + -f : fast: estimate average %id by sampling (not compatible with -a)\n\ + -h : help: display usage and version\n\ + -q : quiet: suppress verbose header\n\ +"; + +static char experts[] = "\ + Expert options:\n\ + --consensus : write majority rule consensus sequence(s) in FASTA\n\ + format to file \n\ + --identmx : save a report on all NxN pairwise identities to file \n\ + --informat : specify alignment file format \n\ + allowed formats: SELEX, MSF, Clustal, a2m, PHYLIP\n\ +"; + +struct opt_s OPTIONS[] = { + { "-a", TRUE, sqdARG_NONE }, + { "-f", TRUE, sqdARG_NONE }, + { "-h", TRUE, sqdARG_NONE }, + { "-q", TRUE, sqdARG_NONE }, + { "--consensus", FALSE, sqdARG_STRING }, + { "--identmx", FALSE, sqdARG_STRING }, + { "--informat", FALSE, sqdARG_STRING }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +int +main(int argc, char **argv) +{ + char *afile; /* name of aligned sequence file */ + MSAFILE *afp; /* pointer to open alignment file*/ + MSA *msa; /* multiple sequence alignment */ + int fmt; /* format of afile */ + int rlen; /* raw sequence length */ + int nres; /* number of residues */ + float **imx; /* identity matrix */ + int i,j; + int small, large; + int bestj, worstj; + float sum, best, worst; + float worst_worst, worst_best, best_best; + float avgid; + int nsample; + + int allreport; + int do_fast; + int be_quiet; + char *consfile; + FILE *consfp = NULL; + char *identmx_report; /* file to 
save identity matrix info to */ + FILE *identmx_fp = NULL; + + char *optname; + char *optarg; + int optind; + + /* These inits are solely to silence gcc warnings about + * uninitialized variables + */ + worst_worst = worst_best = best_best = 0.0; + bestj = worstj = -1; + + /*********************************************** + * Parse command line + ***********************************************/ + + fmt = MSAFILE_UNKNOWN; /* by default, we autodetect file format */ + allreport = FALSE; + do_fast = FALSE; + be_quiet = FALSE; + consfile = NULL; + identmx_report = NULL; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) + { + if (strcmp(optname, "-a") == 0) { allreport = TRUE; } + else if (strcmp(optname, "-f") == 0) { do_fast = TRUE; } + else if (strcmp(optname, "-q") == 0) { be_quiet = TRUE; } + else if (strcmp(optname, "--consensus") == 0) { consfile = optarg; } + else if (strcmp(optname, "--identmx") == 0) { identmx_report = optarg; } + else if (strcmp(optname, "--informat") == 0) { + fmt = String2SeqfileFormat(optarg); + if (fmt == MSAFILE_UNKNOWN) + Die("unrecognized sequence file format \"%s\"", optarg); + if (! IsAlignmentFormat(fmt)) + Die("%s is an unaligned format, can't read as an alignment", optarg); + } + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(EXIT_SUCCESS); + } + } + + if (argc - optind != 1) Die("Incorrect number of arguments.\n%s\n", usage); + afile = argv[optind]; + + if (do_fast && allreport) + Die("Verbose reports (-a, --identmx) are incompatible with fast sampling (-f)"); + if (do_fast && identmx_report != NULL) + Die("Verbose reports (-a, --identmx) are incompatible with fast sampling (-f)"); + + if (! be_quiet) + Banner(stdout, banner); + + /*********************************************** + * Loop over every alignment in the file. 
+ ***********************************************/ + + if ((afp = MSAFileOpen(afile, fmt, NULL)) == NULL) + Die("Alignment file %s could not be opened for reading", afile); + + if (consfile != NULL && (consfp = fopen(consfile, "w")) == NULL) + Die("Failed to open consensus sequence file %s for writing", consfile); + + if (identmx_report != NULL && (identmx_fp = fopen(identmx_report, "w")) == NULL) + Die("Failed to open identity matrix report file %s for writing", identmx_report); + + while ((msa = MSAFileRead(afp)) != NULL) + { + for (i = 0; i < msa->nseq; i++) s2upper(msa->aseq[i]); + + /* Statistics we always collect: + * unaligned sequence lengths; mean and range + */ + nres = 0; + small = large = -1; + for (i = 0; i < msa->nseq; i++) + { + rlen = DealignedLength(msa->aseq[i]); + nres += rlen; + if (small == -1 || rlen < small) small = rlen; + if (large == -1 || rlen > large) large = rlen; + } + + /* Statistics we have to be careful about + * collecting, because of time constraints on NxN operations + */ + if (do_fast) + { + nsample = 1000; + avgid = AlignmentIdentityBySampling(msa->aseq, msa->alen, msa->nseq, + nsample); + } + else + { + /* In a full report, for each sequence, find the best relative, + * and the worst relative. For overall statistics, save the + * worst best (most distant single seq) and the best best + * (most closely related pair) and the worst worst (most + * distantly related pair) and yes, I know it's confusing. + */ + + MakeIdentityMx(msa->aseq, msa->nseq, &imx); + if (allreport) { + printf(" %-15s %5s %7s %-15s %7s %-15s\n", + "NAME", "LEN", "HIGH ID", "(TO)", "LOW ID", "(TO)"); + printf(" --------------- ----- ------- --------------- ------- ---------------\n"); + } + + /* Print the identity matrix report: one line per pair of sequences. 
+ */ + if (identmx_report != NULL) + { + for (i = 0; i < msa->nseq; i++) + for (j = i+1; j < msa->nseq; j++) + fprintf(identmx_fp, "%-4d %-4d %-15s %-15s %.3f\n", + i, j, msa->sqname[i], msa->sqname[j], imx[i][j]); + } + + sum = 0.0; + worst_best = 1.0; + best_best = 0.0; + worst_worst = 1.0; + for (i = 0; i < msa->nseq; i++) + { + worst = 1.0; + best = 0.0; + for (j = 0; j < msa->nseq; j++) + { /* closest seq to this one = best */ + if (i != j && imx[i][j] > best) + { best = imx[i][j]; bestj = j; } + if (imx[i][j] < worst) + { worst = imx[i][j]; worstj = j; } + } + + if (allreport) + printf("* %-15s %5d %7.1f %-15s %7.1f %-15s\n", + msa->sqname[i], DealignedLength(msa->aseq[i]), + best * 100., msa->sqname[bestj], + worst * 100., msa->sqname[worstj]); + + if (best > best_best) best_best = best; + if (best < worst_best) worst_best = best; + if (worst < worst_worst) worst_worst = worst; + for (j = 0; j < i; j++) + sum += imx[i][j]; + + } + avgid = sum / (float) (msa->nseq * (msa->nseq-1)/2.0); + if (allreport) puts(""); + FMX2Free(imx); + } + + /* Print output. + * Some fields aren't available if -f (fast) was chosen. + */ + if (msa->name != NULL) + printf("Alignment name: %s\n", msa->name); + printf("Format: %s\n", SeqfileFormat2String(afp->format)); + printf("Number of sequences: %d\n", msa->nseq); + printf("Total # residues: %d\n", nres); + printf("Smallest: %d\n", small); + printf("Largest: %d\n", large); + printf("Average length: %.1f\n", (float) nres / (float) msa->nseq); + printf("Alignment length: %d\n", msa->alen); + printf("Average identity: %.0f%%\n", 100.*avgid); + if (! 
do_fast) { + printf("Most related pair: %.0f%%\n", 100.*best_best); + printf("Most unrelated pair: %.0f%%\n", 100.*worst_worst); + printf("Most distant seq: %.0f%%\n", 100.*worst_best); + } + + /* Save majority rule consensus sequence if we were asked + */ + if (consfile != NULL) { + char *cs; + cs = MajorityRuleConsensus(msa->aseq, msa->nseq, msa->alen); + WriteSimpleFASTA(consfp, cs, + msa->name != NULL? msa->name : "consensus", + msa->desc); + free(cs); + printf("Consensus: written to %s\n", consfile); + } + + puts("//"); + MSAFree(msa); + } + + MSAFileClose(afp); + if (consfile != NULL) fclose(consfp); + return 0; +} diff --git a/forester/archive/RIO/others/hmmer/squid/clustal.c b/forester/archive/RIO/others/hmmer/squid/clustal.c new file mode 100644 index 0000000..5fbafb0 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/clustal.c @@ -0,0 +1,179 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* clustal.c + * SRE, Sun Jun 6 17:50:45 1999 [bus from Madison, 1999 worm mtg] + * + * Import/export of ClustalV/W multiple sequence alignment + * formatted files. Derivative of msf.c; MSF is a pretty + * generic interleaved format. 
+ * + * RCS $Id: clustal.c,v 1.1.1.1 2005/03/22 08:34:27 cmzmasek Exp $ + */ + +#include +#include +#include +#include +#include "squid.h" +#include "msa.h" + +#ifdef TESTDRIVE_CLUSTAL +/***************************************************************** + * msf.c test driver: + * cc -DTESTDRIVE_CLUSTAL -g -O2 -Wall -o test clustal.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c -lm + * + */ +int +main(int argc, char **argv) +{ + MSAFILE *afp; + MSA *msa; + char *file; + + file = argv[1]; + + if ((afp = MSAFileOpen(file, MSAFILE_CLUSTAL, NULL)) == NULL) + Die("Couldn't open %s\n", file); + + while ((msa = ReadClustal(afp)) != NULL) + { + WriteClustal(stdout, msa); + MSAFree(msa); + } + + MSAFileClose(afp); + exit(0); +} +/******************************************************************/ +#endif /* testdrive_clustal */ + + +/* Function: ReadClustal() + * Date: SRE, Sun Jun 6 17:53:49 1999 [bus from Madison, 1999 worm mtg] + * + * Purpose: Parse an alignment read from an open Clustal format + * alignment file. Clustal is a single-alignment format. + * Return the alignment, or NULL if we have no data. + * + * Args: afp - open alignment file + * + * Returns: MSA * - an alignment object + * caller responsible for an MSAFree() + * NULL if no more alignments + * + * Diagnostics: + * Will Die() here with a (potentially) useful message + * if a parsing error occurs. + */ +MSA * +ReadClustal(MSAFILE *afp) +{ + MSA *msa; + char *s; + int slen; + int sqidx; + char *name; + char *seq; + char *s2; + + if (feof(afp->f)) return NULL; + + /* Skip until we see the CLUSTAL header + */ + while ((s = MSAFileGetLine(afp)) != NULL) + { + if (strncmp(s, "CLUSTAL", 7) == 0 && + strstr(s, "multiple sequence alignment") != NULL) + break; + } + if (s == NULL) return NULL; + + msa = MSAAlloc(10, 0); + + /* Now we're in the sequence section. + * As discussed above, if we haven't seen a sequence name, then we + * don't include the sequence in the alignment. 
+ * Watch out for conservation markup lines that contain *.: chars + */ + while ((s = MSAFileGetLine(afp)) != NULL) + { + if ((name = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue; + if ((seq = sre_strtok(&s, WHITESPACE, &slen)) == NULL) continue; + s2 = sre_strtok(&s, "\n", NULL); + + /* The test for a conservation markup line + */ + if (strpbrk(name, ".*:") != NULL && strpbrk(seq, ".*:") != NULL) + continue; + if (s2 != NULL) + Die("Parse failed at line %d, file %s: possibly using spaces as gaps", + afp->linenumber, afp->fname); + + /* It's not blank, and it's not a coord line: must be sequence + */ + sqidx = MSAGetSeqidx(msa, name, msa->lastidx+1); + msa->lastidx = sqidx; + msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen); + } + + MSAVerifyParse(msa); /* verifies, and also sets alen and wgt. */ + return msa; +} + + +/* Function: WriteClustal() + * Date: SRE, Sun Jun 6 18:12:47 1999 [bus from Madison, worm mtg 1999] + * + * Purpose: Write an alignment in Clustal format to an open file. + * + * Args: fp - file that's open for writing. + * msa - alignment to write. 
+ * + * Returns: (void) + */ +void +WriteClustal(FILE *fp, MSA *msa) +{ + int idx; /* counter for sequences */ + int len; /* tmp variable for name lengths */ + int namelen; /* maximum name length used */ + int pos; /* position counter */ + char buf[64]; /* buffer for writing seq */ + int cpl = 50; /* char per line (< 64) */ + + /* calculate max namelen used */ + namelen = 0; + for (idx = 0; idx < msa->nseq; idx++) + if ((len = strlen(msa->sqname[idx])) > namelen) + namelen = len; + + fprintf(fp, "CLUSTAL W(1.5) multiple sequence alignment\n"); + + /***************************************************** + * Write the sequences + *****************************************************/ + + for (pos = 0; pos < msa->alen; pos += cpl) + { + fprintf(fp, "\n"); /* Blank line between sequence blocks */ + for (idx = 0; idx < msa->nseq; idx++) + { + strncpy(buf, msa->aseq[idx] + pos, cpl); + buf[cpl] = '\0'; + fprintf(fp, "%*s %s\n", namelen, msa->sqname[idx], buf); + } + } + + return; +} + + + diff --git a/forester/archive/RIO/others/hmmer/squid/cluster.c b/forester/archive/RIO/others/hmmer/squid/cluster.c new file mode 100644 index 0000000..538ae76 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/cluster.c @@ -0,0 +1,544 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. 
+ *****************************************************************/ + +/* cluster.c + * SRE, Sun Jul 18 09:49:47 1993 + * moved to squid Thu Mar 3 08:42:57 1994 + * RCS $Id: cluster.c,v 1.1.1.1 2005/03/22 08:34:27 cmzmasek Exp $ + * + * almost identical to bord.c, from fd + * also now contains routines for constructing difference matrices + * from alignments + * + * "branch ordering": Input a symmetric or upper-right-diagonal + * NxN difference matrix (usually constructed by pairwise alignment + * and similarity calculations for N sequences). Use the simple + * cluster analysis part of the Fitch/Margoliash tree-building algorithm + * (as described by Fitch and Margoliash 1967 as well as Feng + * and Doolittle 1987) to calculate the topology of an "evolutionary + * tree" consistent with the difference matrix. Returns an array + * which represents the tree. + * + * The input difference matrix is just an NxN matrix of floats. + * A good match is a small difference score (the algorithm is going + * to search for minima among the difference scores). The original difference + * matrix remains unchanged by the calculations. + * + * The output requires some explanation. A phylogenetic + * tree is a binary tree, with N "leaves" and N-1 "nodes". The + * topology of the tree may be completely described by N-1 structures + * containing two pointers; each pointer points to either a leaf + * or another node. Here, this is implemented with integer indices + * rather than pointers. An array of N-1 pairs of ints is returned. + * If the index is in the range (0..N-1), it is a "leaf" -- the + * number of one of the sequences. If the index is in the range + * (N..2N-2), it is another "node" -- (index-N) is the index + * of the node in the returned array. + * + * If both indices of a member of the returned array point to + * nodes, the tree is "compound": composed of more than one + * cluster of related sequences. 
+ * + * The higher-numbered elements of the returned array were the + * first constructed, and hence represent the distal tips + * of the tree -- the most similar sequences. The root + * is node 0. + ****************************************************************** + * + * Algorithm + * + * INITIALIZATIONS: + * - copy the difference matrix (otherwise the caller's copy would + * get destroyed by the operations of this algorithm). If + * it's asymmetric, make it symmetric. + * - make a (0..N-1) array of ints to keep track of the indices in + * the difference matrix as they get swapped around. Initialize + * this array to 0..N-1. + * - make a (0..N-2) array of int[2] to store the results (the tree + * topology). Doesn't need to be initialized. + * - keep track of an "N'", the current size of the difference + * matrix being operated on. + * + * PROCESSING THE DIFFERENCE MATRIX: + * - for N' = N down to N' = 2 (N-1 steps): + * - in the half-diagonal N'xN' matrix, find the indices i,j at which + * there's the minimum difference score + * + * Store the results: + * - at position N'-2 of the result array, store coords[i] and + * coords[j]. + * + * Move i,j rows, cols to the outside edges of the matrix: + * - swap row i and row N'-2 + * - swap row j and row N'-1 + * - swap column i and column N'-2 + * - swap column j and column N'-1 + * - swap indices i, N'-2 in the index array + * - swap indices j, N'-1 in the index array + * + * Build an average difference score for differences to i,j: + * - for all columns, find avg difference between rows i and j and store in row i: + * row[i][col] = (row[i][col] + row[j][col]) / 2.0 + * - copy the contents of row i to column i (it's a symmetric + * matrix, no need to recalculate) + * - store an index N'+N-2 at position N'-2 of the index array: means + * that this row/column is now a node rather than a leaf, and + * contains minimum values + * + * Continue: + * - go to the next N' + * + * GARBAGE COLLECTION & RETURN. 
+ * + ********************************************************************** + * + * References: + * + * Feng D-F and R.F. Doolittle. "Progressive sequence alignment as a + * prerequisite to correct phylogenetic trees." J. Mol. Evol. + * 25:351-360, 1987. + * + * Fitch W.M. and Margoliash E. "Construction of phylogenetic trees." + * Science 155:279-284, 1967. + * + ********************************************************************** + * + * SRE, 18 March 1992 (bord.c) + * SRE, Sun Jul 18 09:52:14 1993 (cluster.c) + * added to squid Thu Mar 3 09:13:56 1994 + ********************************************************************** + * Mon May 4 09:47:02 1992: keep track of difference scores at each node + */ + + +#include +#include +#include + +#include "squid.h" +#include "sqfuncs.h" + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + +/* Function: Cluster() + * + * Purpose: Cluster analysis on a distance matrix. Constructs a + * phylogenetic tree which contains the topology + * and info for each node: branch lengths, how many + * sequences are included under the node, and which + * sequences are included under the node. + * + * Args: dmx - the NxN distance matrix ( >= 0.0, larger means more diverged) + * N - size of mx (number of sequences) + * mode - CLUSTER_MEAN, CLUSTER_MAX, or CLUSTER_MIN + * ret_tree- RETURN: the tree + * + * Return: 1 on success, 0 on failure. + * The caller is responsible for freeing the tree's memory, + * by calling FreePhylo(tree, N). 
+ */
+int
+Cluster(float **dmx, int N, enum clust_strategy mode, struct phylo_s **ret_tree)
+{
+  struct phylo_s *tree; /* (0..N-2) phylogenetic tree */
+  float **mx; /* copy of difference matrix */
+  int *coord; /* (0..N-1), indices for matrix coords */
+  int i, j; /* coords of minimum difference */
+  int idx; /* counter over seqs */
+  int Np; /* N', a working copy of N */
+  int row, col; /* loop variables */
+  float min; /* best minimum score found */
+  float *trow; /* tmp pointer for swapping rows */
+  float tcol; /* tmp storage for swapping cols */
+  float *diff; /* (0..N-2) difference scores at nodes */
+  int swapfoo; /* for SWAP() macro */
+
+  /**************************
+   * Initializations.
+   **************************/
+  /* We destroy the matrix we work on, so make a copy of dmx.
+   */
+  mx = MallocOrDie (sizeof(float *) * N);
+  for (i = 0; i < N; i++)
+    {
+      mx[i] = MallocOrDie (sizeof(float) * N);
+      for (j = 0; j < N; j++)
+        mx[i][j] = dmx[i][j];
+    }
+  /* coord array alloc, (0..N-1) */
+  coord = MallocOrDie (N * sizeof(int));
+  diff = MallocOrDie ((N-1) * sizeof(float));
+  /* init the coord array to 0..N-1 */
+  for (col = 0; col < N; col++) coord[col] = col;
+  for (i = 0; i < N-1; i++) diff[i] = 0.0;
+
+  /* tree array alloc, (0..N-2) */
+  if ((tree = AllocPhylo(N)) == NULL) Die("AllocPhylo() failed");
+
+  /*********************************
+   * Process the difference matrix
+   *********************************/
+
+  /* N-prime, for an NxN down to a 2x2 diffmx */
+  for (Np = N; Np >= 2; Np--)
+    {
+      /* find a minimum on the N'xN' matrix.
+       * Seed the search with the first upper-triangle cell (0,1), which
+       * always exists because Np >= 2. A fixed sentinel would leave i,j
+       * unset whenever every distance is at or above it (or is NaN).
+       * Ties still resolve to the first cell in row-major order.
+       */
+      i = 0;
+      j = 1;
+      min = mx[0][1];
+      for (row = 0; row < Np; row++)
+        for (col = row+1; col < Np; col++)
+          if (mx[row][col] < min)
+            {
+              min = mx[row][col];
+              i = row;
+              j = col;
+            }
+
+      /* We're clustering row i with col j. write necessary
+       * data into a node on the tree
+       */
+      /* topology info */
+      tree[Np-2].left = coord[i];
+      tree[Np-2].right = coord[j];
+      /* coord values >= N are interior-node codes: record their parent */
+      if (coord[i] >= N) tree[coord[i]-N].parent = N + Np - 2;
+      if (coord[j] >= N) tree[coord[j]-N].parent = N + Np - 2;
+
+      /* keep score info */
+      diff[Np-2] = tree[Np-2].diff = min;
+
+      /* way-simple branch length estimation */
+      tree[Np-2].lblen = tree[Np-2].rblen = min;
+      if (coord[i] >= N) tree[Np-2].lblen -= diff[coord[i]-N];
+      if (coord[j] >= N) tree[Np-2].rblen -= diff[coord[j]-N];
+
+      /* number seqs included at node */
+      if (coord[i] < N)
+        {
+          tree[Np-2].incnum ++;
+          tree[Np-2].is_in[coord[i]] = 1;
+        }
+      else
+        {
+          tree[Np-2].incnum += tree[coord[i]-N].incnum;
+          for (idx = 0; idx < N; idx++)
+            tree[Np-2].is_in[idx] |= tree[coord[i]-N].is_in[idx];
+        }
+
+      if (coord[j] < N)
+        {
+          tree[Np-2].incnum ++;
+          tree[Np-2].is_in[coord[j]] = 1;
+        }
+      else
+        {
+          tree[Np-2].incnum += tree[coord[j]-N].incnum;
+          for (idx = 0; idx < N; idx++)
+            tree[Np-2].is_in[idx] |= tree[coord[j]-N].is_in[idx];
+        }
+
+
+      /* Now build a new matrix, by merging row i with row j and
+       * column i with column j; see Fitch and Margoliash
+       */
+      /* Row and column swapping. */
+      /* watch out for swapping i, j away: */
+      if (i == Np-1 || j == Np-2)
+        SWAP(i,j);
+
+      if (i != Np-2)
+        {
+          /* swap row i, row N'-2 */
+          trow = mx[Np-2]; mx[Np-2] = mx[i]; mx[i] = trow;
+          /* swap col i, col N'-2 */
+          for (row = 0; row < Np; row++)
+            {
+              tcol = mx[row][Np-2];
+              mx[row][Np-2] = mx[row][i];
+              mx[row][i] = tcol;
+            }
+          /* swap coord i, coord N'-2 */
+          SWAP(coord[i], coord[Np-2]);
+        }
+
+      if (j != Np-1)
+        {
+          /* swap row j, row N'-1 */
+          trow = mx[Np-1]; mx[Np-1] = mx[j]; mx[j] = trow;
+          /* swap col j, col N'-1 */
+          for (row = 0; row < Np; row++)
+            {
+              tcol = mx[row][Np-1];
+              mx[row][Np-1] = mx[row][j];
+              mx[row][j] = tcol;
+            }
+          /* swap coord j, coord N'-1 */
+          SWAP(coord[j], coord[Np-1]);
+        }
+
+      /* average i and j together; they're now
+         at Np-2 and Np-1 though */
+      i = Np-2;
+      j = Np-1;
+      /* merge by saving avg of cols of row i and row j */
+      for (col = 0; col < Np; col++)
+        {
+          switch (mode) {
+          case CLUSTER_MEAN: mx[i][col] =(mx[i][col]+ mx[j][col]) / 2.0; break;
+          case CLUSTER_MIN: mx[i][col] = MIN(mx[i][col], mx[j][col]); break;
+          case CLUSTER_MAX: mx[i][col] = MAX(mx[i][col], mx[j][col]); break;
+          default: mx[i][col] =(mx[i][col]+ mx[j][col]) / 2.0; break;
+          }
+        }
+      /* copy those rows to columns */
+      for (col = 0; col < Np; col++)
+        mx[col][i] = mx[i][col];
+      /* store the node index in coords */
+      coord[Np-2] = Np+N-2;
+    }
+
+  /**************************
+   * Garbage collection and return
+   **************************/
+  Free2DArray((void **) mx, N);
+  free(coord);
+  free(diff);
+  *ret_tree = tree;
+  return 1;
+}
+
+/* Function: AllocPhylo()
+ *
+ * Purpose: Allocate space for a phylo_s array. N-1 structures
+ * are allocated, one for each node; in each node, a 0..N-1
+ * is_in flag array is also allocated and initialized to
+ * all zeros.
+ * + * Args: N - size; number of sequences being clustered + * + * Return: pointer to the allocated array + * + */ +struct phylo_s * +AllocPhylo(int N) +{ + struct phylo_s *tree; + int i; + + if ((tree = (struct phylo_s *) malloc ((N-1) * sizeof(struct phylo_s))) == NULL) + return NULL; + + for (i = 0; i < N-1; i++) + { + tree[i].diff = 0.0; + tree[i].lblen = tree[i].rblen = 0.0; + tree[i].left = tree[i].right = tree[i].parent = -1; + tree[i].incnum = 0; + if ((tree[i].is_in = (char *) calloc (N, sizeof(char))) == NULL) + return NULL; + } + return tree; +} + + +/* Function: FreePhylo() + * + * Purpose: Free a clustree array that was built to cluster N sequences. + * + * Args: tree - phylogenetic tree to free + * N - size of clustree; number of sequences it clustered + * + * Return: (void) + */ +void +FreePhylo(struct phylo_s *tree, int N) +{ + int idx; + + for (idx = 0; idx < N-1; idx++) + free(tree[idx].is_in); + free(tree); +} + + +/* Function: MakeDiffMx() + * + * Purpose: Given a set of aligned sequences, construct + * an NxN fractional difference matrix. (i.e. 1.0 is + * completely different, 0.0 is exactly identical). + * + * Args: aseqs - flushed, aligned sequences + * num - number of aseqs + * ret_dmx - RETURN: difference matrix + * + * Return: 1 on success, 0 on failure. + * Caller must free diff matrix with FMX2Free(dmx) + */ +void +MakeDiffMx(char **aseqs, int num, float ***ret_dmx) +{ + float **dmx; /* RETURN: distance matrix */ + int i,j; /* counters over sequences */ + + /* Allocate 2D float matrix + */ + dmx = FMX2Alloc(num, num); + + /* Calculate distances; symmetric matrix + * record difference, not identity (1 - identity) + */ + for (i = 0; i < num; i++) + for (j = i; j < num; j++) + dmx[i][j] = dmx[j][i] = 1.0 - PairwiseIdentity(aseqs[i], aseqs[j]); + + *ret_dmx = dmx; + return; +} + +/* Function: MakeIdentityMx() + * + * Purpose: Given a set of aligned sequences, construct + * an NxN fractional identity matrix. (i.e. 
1.0 is + * completely identical, 0.0 is completely different). + * Virtually identical to MakeDiffMx(). It's + * less confusing to have two distinct functions, I find. + * + * Args: aseqs - flushed, aligned sequences + * num - number of aseqs + * ret_imx - RETURN: identity matrix (caller must free) + * + * Return: 1 on success, 0 on failure. + * Caller must free imx using FMX2Free(imx) + */ +void +MakeIdentityMx(char **aseqs, int num, float ***ret_imx) +{ + float **imx; /* RETURN: identity matrix */ + int i,j; /* counters over sequences */ + + /* Allocate 2D float matrix + */ + imx = FMX2Alloc(num, num); + + /* Calculate distances, symmetric matrix + */ + for (i = 0; i < num; i++) + for (j = i; j < num; j++) + imx[i][j] = imx[j][i] = PairwiseIdentity(aseqs[i], aseqs[j]); + + *ret_imx = imx; + return; +} + + + +/* Function: PrintNewHampshireTree() + * + * Purpose: Print out a tree in the "New Hampshire" standard + * format. See PHYLIP's draw.doc for a definition of + * the New Hampshire format. + * + * Like a CFG, we generate the format string left to + * right by a preorder tree traversal. + * + * Args: fp - file to print to + * ainfo- alignment info, including sequence names + * tree - tree to print + * N - number of leaves + * + */ +void +PrintNewHampshireTree(FILE *fp, AINFO *ainfo, struct phylo_s *tree, int N) +{ + struct intstack_s *stack; + int code; + float *blen; + int docomma; + + blen = (float *) MallocOrDie (sizeof(float) * (2*N-1)); + stack = InitIntStack(); + PushIntStack(stack, N); /* push root on stack */ + docomma = FALSE; + + /* node index code: + * 0..N-1 = leaves; indexes of sequences. + * N..2N-2 = interior nodes; node-N = index of node in tree structure. + * code N is the root. + * 2N..3N-2 = special flags for closing interior nodes; node-2N = index in tree + */ + while (PopIntStack(stack, &code)) + { + if (code < N) /* we're a leaf. 
*/ + { + /* 1) print name:branchlength */ + if (docomma) fputs(",", fp); + fprintf(fp, "%s:%.5f", ainfo->sqinfo[code].name, blen[code]); + docomma = TRUE; + } + + else if (code < 2*N) /* we're an interior node */ + { + /* 1) print a '(' */ + if (docomma) fputs(",\n", fp); + fputs("(", fp); + /* 2) push on stack: ), rchild, lchild */ + PushIntStack(stack, code+N); + PushIntStack(stack, tree[code-N].right); + PushIntStack(stack, tree[code-N].left); + /* 3) record branch lengths */ + blen[tree[code-N].right] = tree[code-N].rblen; + blen[tree[code-N].left] = tree[code-N].lblen; + docomma = FALSE; + } + + else /* we're closing an interior node */ + { + /* print a ):branchlength */ + if (code == 2*N) fprintf(fp, ");\n"); + else fprintf(fp, "):%.5f", blen[code-N]); + docomma = TRUE; + } + } + + FreeIntStack(stack); + free(blen); + return; +} + + +/* Function: PrintPhylo() + * + * Purpose: Debugging output of a phylogenetic tree structure. + */ +void +PrintPhylo(FILE *fp, AINFO *ainfo, struct phylo_s *tree, int N) +{ + int idx; + + for (idx = 0; idx < N-1; idx++) + { + fprintf(fp, "Interior node %d (code %d)\n", idx, idx+N); + fprintf(fp, "\tParent: %d (code %d)\n", tree[idx].parent-N, tree[idx].parent); + fprintf(fp, "\tLeft: %d (%s) %f\n", + tree[idx].left < N ? tree[idx].left-N : tree[idx].left, + tree[idx].left < N ? ainfo->sqinfo[tree[idx].left].name : "interior", + tree[idx].lblen); + fprintf(fp, "\tRight: %d (%s) %f\n", + tree[idx].right < N ? tree[idx].right-N : tree[idx].right, + tree[idx].right < N ? 
ainfo->sqinfo[tree[idx].right].name : "interior", + tree[idx].rblen); + fprintf(fp, "\tHeight: %f\n", tree[idx].diff); + fprintf(fp, "\tIncludes:%d seqs\n", tree[idx].incnum); + } +} + + + diff --git a/forester/archive/RIO/others/hmmer/squid/compalign_main.c b/forester/archive/RIO/others/hmmer/squid/compalign_main.c new file mode 100644 index 0000000..0ac499d --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/compalign_main.c @@ -0,0 +1,221 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* main for compalign + * + * Compalign -- a program to compare two sequence alignments + * SRE, Tue Nov 3 07:38:03 1992 + * RCS $Id: compalign_main.c,v 1.1.1.1 2005/03/22 08:34:31 cmzmasek Exp $ + * + * incorporated into SQUID, Thu Jan 26 16:52:41 1995 + * + * Usage: compalign + * + * Calculate the fractional "identity" between the trusted alignment + * and the test alignment. The two files must contain exactly the same + * sequences, in exactly the same order. + * + * The identity of the multiple sequence alignments is defined as + * the averaged identity over all N(N-1)/2 pairwise alignments. 
+ * + * The fractional identity of two sets of pairwise alignments + * is in turn defined as follows (for aligned known sequences k1 and k2, + * and aligned test sequences t1 and t2): + * + * matched columns / total columns, + * + * where total columns = the total number of columns in + * which there is a valid (nongap) symbol in k1 or k2; + * + * matched columns = the number of columns in which one of the + * following is true: + * + * k1 and k2 both have valid symbols at a given column; t1 and t2 + * have the same symbols aligned in a column of the t1/t2 + * alignment; + * + * k1 has a symbol aligned to a gap in k2; that symbol in t1 + * is also aligned to a gap; + * + * k2 has a symbol aligned to a gap in k1; that symbol in t2 + * is also aligned to a gap. + * + * Because scores for all possible pairs are calculated, the + * algorithm is of order (N^2)L for N sequences of length L; + * large sequence sets will take a while. + * + * Sean Eddy, Tue Nov 3 07:46:59 1992 + * + */ + +#include +#include +#include "squid.h" +#include "msa.h" + +static char banner[] = "compalign - compare two multiple alignments"; + +static char usage[] = "\ +Usage: compalign [-options] \n\ + Available options:\n\ + -c : only compare under marked #=CS consensus structure\n\ + -h : print short help and usage info\n\ +"; + +static char experts[] = "\ + --informat : specify that both alignments are in format (MSF, for instance)\n\ + --quiet : suppress verbose header (used in regression testing)\n\ +"; + +struct opt_s OPTIONS[] = { + { "-c", TRUE, sqdARG_NONE }, + { "-h", TRUE, sqdARG_NONE }, + { "--informat", FALSE, sqdARG_STRING }, + { "--quiet", FALSE, sqdARG_NONE }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + + +int +main(int argc, char **argv) +{ + char *kfile; /* name of file of trusted (known) alignment */ + char *tfile; /* name of file of test alignment */ + MSAFILE *kfp; /* open ptr into trusted (known) alignfile */ + MSAFILE *tfp; /* open ptr into test 
alignment file */ + int format; /* expected format of alignment files */ + MSA *kmsa; /* a trusted (known) alignment */ + MSA *tmsa; /* a test alignment */ + char **kraw; /* dealigned trusted seqs */ + char **traw; /* dealigned test sequences */ + int idx; /* counter for sequences */ + int apos; /* position in alignment */ + float score; /* RESULT: score for the comparison */ + + int cs_only; /* TRUE to compare under #=CS annotation only */ + int *ref = NULL; /* init only to silence gcc warning */ + int be_quiet; /* TRUE to suppress verbose header */ + + char *optname; + char *optarg; + int optind; + + /*********************************************** + * Parse command line + ***********************************************/ + + format = MSAFILE_UNKNOWN; + cs_only = FALSE; + be_quiet = FALSE; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) + { + if (strcmp(optname, "-c") == 0) cs_only = TRUE; + else if (strcmp(optname, "--quiet") == 0) be_quiet = TRUE; + else if (strcmp(optname, "--informat") == 0) { + format = String2SeqfileFormat(optarg); + if (format == MSAFILE_UNKNOWN) + Die("unrecognized sequence file format \"%s\"", optarg); + if (! IsAlignmentFormat(format)) + Die("%s is an unaligned format, can't read as an alignment", optarg); + } + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(EXIT_SUCCESS); + } + } + + if (argc - optind != 2) + Die("Incorrect number of command line arguments.\n%s\n", usage); + + kfile = argv[optind++]; + tfile = argv[optind]; + + if (! 
be_quiet) Banner(stdout, banner); + + /*********************************************** + * Read in the alignments + * Capable of handling full Stockholm: >1 alignment/file + ***********************************************/ + + if ((kfp = MSAFileOpen(kfile, format, NULL)) == NULL) + Die("Trusted alignment file %s could not be opened for reading", kfile); + if ((tfp = MSAFileOpen(tfile, format, NULL)) == NULL) + Die("Test alignment file %s could not be opened for reading", tfile); + + while ((kmsa = MSAFileRead(kfp)) != NULL) + { + if ((tmsa = MSAFileRead(tfp)) == NULL) + Die("Failed to get a test alignment to match with the trusted alignment"); + + /* test that they're the same! */ + if (kmsa->nseq != tmsa->nseq) + Die("files %s and %s do not contain same number of seqs!\n", kfile, tfile); + + for (idx = 0; idx < kmsa->nseq; idx++) + { + s2upper(kmsa->aseq[idx]); + s2upper(tmsa->aseq[idx]); + } + /* another sanity check */ + for (idx = 0; idx < kmsa->nseq; idx++) + if (strcmp(kmsa->sqname[idx], tmsa->sqname[idx]) != 0) + Die("seqs in %s and %s don't seem to be in the same order\n (%s != %s)", + kfile, tfile, kmsa->sqname[idx], tmsa->sqname[idx]); + + /* and *another* sanity check */ + DealignAseqs(kmsa->aseq, kmsa->nseq, &kraw); + DealignAseqs(tmsa->aseq, tmsa->nseq, &traw); + for (idx = 0; idx < kmsa->nseq; idx++) + if (strcmp(kraw[idx], traw[idx]) != 0) + Die("raw seqs in %s and %s are not the same (died at %s, number %d)\n", + kfile, tfile, kmsa->sqname[idx], idx); + Free2DArray((void **) kraw, kmsa->nseq); + Free2DArray((void **) traw, tmsa->nseq); + + if (cs_only) + { + if (kmsa->ss_cons == NULL) + Die("Trusted alignment %s has no consensus structure annotation\n -- can't use -c!\n", + kfile); + ref = (int *) MallocOrDie (sizeof(int) * kmsa->alen); + for (apos = 0; apos < kmsa->alen; apos++) + ref[apos] = (isgap(kmsa->ss_cons[apos])) ? 
FALSE : TRUE; + } + + /*********************************************** + * Compare the alignments, print results + ***********************************************/ + + if (cs_only) + score = CompareRefMultAlignments(ref, kmsa->aseq, tmsa->aseq, kmsa->nseq); + else + score = CompareMultAlignments(kmsa->aseq, tmsa->aseq, kmsa->nseq); + + printf("Trusted alignment: %s\n", kmsa->name != NULL ? kmsa->name : kfile); + printf("Test alignment: %s\n", tmsa->name != NULL ? tmsa->name : tfile); + printf("Total sequences: %d\n", kmsa->nseq); + printf("Alignment identity: %.4f\n", score); + puts("//"); + + if (cs_only) free(ref); + MSAFree(kmsa); + MSAFree(tmsa); + } + + MSAFileClose(kfp); + MSAFileClose(tfp); + return 0; +} + + diff --git a/forester/archive/RIO/others/hmmer/squid/compstruct_main.c b/forester/archive/RIO/others/hmmer/squid/compstruct_main.c new file mode 100644 index 0000000..9701a00 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/compstruct_main.c @@ -0,0 +1,321 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* compstruct_main.c + * SRE, Tue Aug 30 10:35:31 1994 + * + * Compare RNA secondary structures. + * RCS $Id: compstruct_main.c,v 1.1.1.1 2005/03/22 08:34:22 cmzmasek Exp $ + */ + +#include +#include +#include +#include "squid.h" +#include "msa.h" + +static char banner[] = "compalign - compare test RNA secondary structure predictions to trusted set"; + +char usage[] = "\ +Usage: compstruct [-options] \n\ + Both files must contain secondary structure markup (e.g. 
Stockholm, SQUID,\n\ + SELEX formats), and sequences must occur in the same order in the two files.\n\ +\n\ + Available options are:\n\ + -h : print short help and usage info\n\ +"; + +static char experts[] = "\ + --informat : specify that both alignments are in format (SELEX, for instance)\n\ + --quiet : suppress verbose header (used in regression testing)\n\ +"; + +struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, + { "--informat", FALSE, sqdARG_STRING }, + { "--quiet", FALSE, sqdARG_NONE }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + + +static int KHS2ct(char *ss, int **ret_ct); +/* static void WriteCT(FILE *fp, char *seq, int *ct, int len); */ + +int +main(int argc, char **argv) +{ + char *kfile, *tfile; /* known, test structure file */ + int format; /* expected format of kfile, tfile */ + SQFILE *kfp, *tfp; /* open kfile, tfile */ + char *kseq, *tseq; /* known, test sequence */ + SQINFO kinfo, tinfo; /* known, test info */ + int *kct, *tct; /* known, test CT rep of structure */ + int pos; + int nseq; + + int correct; /* count of correct base pair predictions */ + int missedpair; /* count of false negatives */ + int falsepair; /* count of false positives */ + int tot_trusted; /* total base pairs in trusted structure */ + int tot_predicted; /* total base pairs in predicted structure*/ + int tot_correct; /* cumulative total correct pairs */ + + int dscorrect; /* count of correct 2-state paired prediction */ + int sscorrect; /* count of correct 2-state unpaired prediction */ + int tot_dscorrect; + int tot_sscorrect; + int tot_positions; + + int quiet; /* TRUE to silence verbose banner */ + + char *optname; + char *optarg; + int optind; + + /*********************************************** + * Parse command line + ***********************************************/ + + format = MSAFILE_UNKNOWN; + quiet = FALSE; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) + { + if (strcmp(optname, "--quiet") == 0) 
quiet = TRUE; + else if (strcmp(optname, "--informat") == 0) { + format = String2SeqfileFormat(optarg); + if (format == MSAFILE_UNKNOWN) + Die("unrecognized sequence file format \"%s\"", optarg); + if (! IsAlignmentFormat(format)) + Die("%s is an unaligned format, can't read as an alignment", optarg); + } + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(EXIT_SUCCESS); + } + } + + if (argc - optind != 2) + Die("Incorrect number of command line arguments.\n%s\n", usage); + + kfile = argv[optind++]; + tfile = argv[optind]; + + if (! quiet) Banner(stdout, banner); + + /*********************************************** + * Open the files + ***********************************************/ + + if ((kfp = SeqfileOpen(kfile, format, NULL)) == NULL) + Die("Failed to open trusted structure file %s for reading", kfile); + if ((tfp = SeqfileOpen(tfile, format, NULL)) == NULL) + Die("Failed to open test structure file %s for reading", tfile); + + /*********************************************** + * Do structure comparisons, one seq at a time + ***********************************************/ + + tot_trusted = tot_predicted = tot_correct = 0; + tot_dscorrect = tot_sscorrect = tot_positions = 0; + nseq = 0; + while (ReadSeq(kfp, kfp->format, &kseq, &kinfo) && ReadSeq(tfp, tfp->format, &tseq, &tinfo)) + { + if (!quiet && strcmp(tinfo.name, kinfo.name) != 0) + Warn("Trusted sequence %s, test sequence %s -- names not identical\n", + kinfo.name, tinfo.name); + if (!quiet && strcmp(kseq, tseq) != 0) + Warn("Trusted sequence %s, test sequence %s -- sequences not identical\n", + kinfo.name, tinfo.name); + + printf("%s %s\n", kinfo.name, (kinfo.flags & SQINFO_DESC) ? kinfo.desc : ""); + + if (! (tinfo.flags & SQINFO_SS) && ! (kinfo.flags & SQINFO_SS)) + printf("[no test or trusted structure]\n\n"); + else if (! (tinfo.flags & SQINFO_SS)) + printf("[no test structure]\n\n"); + else if (! 
(kinfo.flags & SQINFO_SS)) + printf("[no trusted structure]\n\n"); + else + { + if (! KHS2ct(kinfo.ss, &kct)) + { printf("[bad trusted structure]\n"); goto CLEANUP;} + if (! KHS2ct(tinfo.ss, &tct)) + { printf("[bad test structure]\n"); free(kct); goto CLEANUP; } + +/* WriteCT(stdout, tseq, tct, tinfo.len); */ +/* WriteCT(stdout, tseq, kct, tinfo.len); */ + + correct = falsepair = missedpair = 0; + dscorrect = sscorrect = 0; + for (pos = 0; pos < kinfo.len; pos++) + { + /* check if actual base pair is predicted */ + if (kct[pos] >= 0 && kct[pos] == tct[pos]) + correct++; + else if (kct[pos] >= 0) + missedpair++; + + if (tct[pos] >= 0 && kct[pos] != tct[pos]) + falsepair++; + + /* 2 state prediction */ + if (kct[pos] >= 0 && tct[pos] >= 0) + dscorrect++; + else if (kct[pos] < 0 && tct[pos] < 0) + sscorrect++; + } + nseq++; + tot_trusted += correct + missedpair; + tot_predicted += correct + falsepair; + tot_correct += correct; + + tot_dscorrect += dscorrect; + tot_sscorrect += sscorrect; + tot_positions += kinfo.len; + + /* print out per sequence info */ + printf(" %d/%d trusted pairs predicted (%.2f%% sensitivity)\n", + correct, correct+missedpair, + 100. * (float) correct/ (float) (correct + missedpair)); + printf(" %d/%d predicted pairs correct (%.2f%% specificity)\n", + correct, correct + falsepair, + 100. * (float) correct/ (float) (correct + falsepair)); + + printf(" Two state: %d/%d positions correctly predicted (%.2f%% accuracy)\n", + dscorrect + sscorrect, + kinfo.len, + 100. * (float) (dscorrect + sscorrect) / (float) kinfo.len); + puts(""); + + + free(kct); + free(tct); + } + + CLEANUP: + FreeSequence(kseq, &kinfo); + FreeSequence(tseq, &tinfo); + } + + /* And the final summary: + */ + puts(""); + printf("Overall structure prediction accuracy (%d sequences, %d positions)\n", + nseq, tot_positions); + printf(" %d/%d trusted pairs predicted (%.2f%% sensitivity)\n", + tot_correct, tot_trusted, + 100. 
* (float) tot_correct/ (float) tot_trusted); + printf(" %d/%d predicted pairs correct (%.2f%% specificity)\n", + tot_correct, tot_predicted, + 100. * (float) tot_correct/ (float) tot_predicted); + printf(" Two state: %d/%d positions correctly predicted (%.2f%% accuracy)\n", + tot_dscorrect + tot_sscorrect, tot_positions, + 100. * (float) (tot_dscorrect + tot_sscorrect) / (float) tot_positions); + puts(""); + + SeqfileClose(tfp); + SeqfileClose(kfp); + return 0; +} + + +/* Function: KHS2ct() + * + * Purpose: Convert a secondary structure string to an array of integers + * representing what position each position is base-paired + * to (0..len-1), or -1 if none. This is off-by-one from a + * Zuker .ct file representation. + * + * The .ct representation can accomodate pseudoknots but the + * secondary structure string cannot easily; the string contains + * "Aa", "Bb", etc. pairs as a limited representation of + * pseudoknots. The string contains "><" for base pairs. + * Other symbols are ignored. + * + * Return: ret_ct is allocated here and must be free'd by caller. + * Returns 1 on success, 0 if ss is somehow inconsistent. + */ +static int +KHS2ct(char *ss, int **ret_ct) +{ + struct intstack_s *dolist[27]; + int *ct; + int i; + int pos, pair; + int status = 1; /* success or failure return status */ + int len; + + for (i = 0; i < 27; i++) + dolist[i] = InitIntStack(); + len = strlen(ss); + + if ((ct = (int *) malloc (len * sizeof(int))) == NULL) + Die("malloc failed"); + for (pos = 0; pos < len; pos++) + ct[pos] = -1; + + for (pos = 0; ss[pos] != '\0'; pos++) + { + if (ss[pos] == '>') /* left side of a pair: push onto stack 0 */ + PushIntStack(dolist[0], pos); + else if (ss[pos] == '<') /* right side of a pair; resolve pair */ + { + if (! 
PopIntStack(dolist[0], &pair)) + { status = 0; } + else + { + ct[pos] = pair; + ct[pair] = pos; + } + } + /* same stuff for pseudoknots */ + else if (isupper((int) ss[pos])) + PushIntStack(dolist[ss[pos] - 'A' + 1], pos); + else if (islower((int) ss[pos])) + { + if (! PopIntStack(dolist[ss[pos] - 'a' + 1], &pair)) + { status = 0; } + else + { + ct[pos] = pair; + ct[pair] = pos; + } + } + else if (!isgap(ss[pos])) status = 0; /* bad character */ + } + + for (i = 0; i < 27; i++) + if ( FreeIntStack(dolist[i]) > 0) + status = 0; + + *ret_ct = ct; + return status; +} + + +#ifdef SRE_REMOVED +/* Function: WriteCT() + * + * Purpose: Write a CT representation of a structure. + * Written in 1..len sense, with 0 for unpaired + * positions. + */ +static void +WriteCT(FILE *fp, char *seq, int *ct, int len) +{ + int pos; + for (pos = 0; pos < len; pos++) + fprintf(fp, "%d %c %d\n", pos+1, seq[pos], ct[pos]+1); +} +#endif diff --git a/forester/archive/RIO/others/hmmer/squid/configure b/forester/archive/RIO/others/hmmer/squid/configure new file mode 100755 index 0000000..3bfb5cb --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/configure @@ -0,0 +1,2241 @@ +#! /bin/sh + +# Guess values for system-dependent variables and create Makefiles. +# Generated automatically using autoconf version 2.13 +# Copyright (C) 1992, 93, 94, 95, 96 Free Software Foundation, Inc. +# +# This configure script is free software; the Free Software Foundation +# gives unlimited permission to copy, distribute and modify it. + +# Defaults: +ac_help= +ac_default_prefix=/usr/local +# Any additions from configure.in: +ac_help="$ac_help + --with-pvm enable PVM, Parallel Virtual Machine" + +# Initialize some variables set by options. +# The variables have the same names as the options, with +# dashes changed to underlines. 
+build=NONE +cache_file=./config.cache +exec_prefix=NONE +host=NONE +no_create= +nonopt=NONE +no_recursion= +prefix=NONE +program_prefix=NONE +program_suffix=NONE +program_transform_name=s,x,x, +silent= +site= +srcdir= +target=NONE +verbose= +x_includes=NONE +x_libraries=NONE +bindir='${exec_prefix}/bin' +sbindir='${exec_prefix}/sbin' +libexecdir='${exec_prefix}/libexec' +datadir='${prefix}/share' +sysconfdir='${prefix}/etc' +sharedstatedir='${prefix}/com' +localstatedir='${prefix}/var' +libdir='${exec_prefix}/lib' +includedir='${prefix}/include' +oldincludedir='/usr/include' +infodir='${prefix}/info' +mandir='${prefix}/man' + +# Initialize some other variables. +subdirs= +MFLAGS= MAKEFLAGS= +SHELL=${CONFIG_SHELL-/bin/sh} +# Maximum number of lines to put in a shell here document. +ac_max_here_lines=12 + +ac_prev= +for ac_option +do + + # If the previous option needs an argument, assign it. + if test -n "$ac_prev"; then + eval "$ac_prev=\$ac_option" + ac_prev= + continue + fi + + case "$ac_option" in + -*=*) ac_optarg=`echo "$ac_option" | sed 's/[-_a-zA-Z0-9]*=//'` ;; + *) ac_optarg= ;; + esac + + # Accept the important Cygnus configure options, so we can diagnose typos. 
+ + case "$ac_option" in + + -bindir | --bindir | --bindi | --bind | --bin | --bi) + ac_prev=bindir ;; + -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) + bindir="$ac_optarg" ;; + + -build | --build | --buil | --bui | --bu) + ac_prev=build ;; + -build=* | --build=* | --buil=* | --bui=* | --bu=*) + build="$ac_optarg" ;; + + -cache-file | --cache-file | --cache-fil | --cache-fi \ + | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) + ac_prev=cache_file ;; + -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ + | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) + cache_file="$ac_optarg" ;; + + -datadir | --datadir | --datadi | --datad | --data | --dat | --da) + ac_prev=datadir ;; + -datadir=* | --datadir=* | --datadi=* | --datad=* | --data=* | --dat=* \ + | --da=*) + datadir="$ac_optarg" ;; + + -disable-* | --disable-*) + ac_feature=`echo $ac_option|sed -e 's/-*disable-//'` + # Reject names that are not valid shell variable names. + if test -n "`echo $ac_feature| sed 's/[-a-zA-Z0-9_]//g'`"; then + { echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; } + fi + ac_feature=`echo $ac_feature| sed 's/-/_/g'` + eval "enable_${ac_feature}=no" ;; + + -enable-* | --enable-*) + ac_feature=`echo $ac_option|sed -e 's/-*enable-//' -e 's/=.*//'` + # Reject names that are not valid shell variable names. 
+ if test -n "`echo $ac_feature| sed 's/[-_a-zA-Z0-9]//g'`"; then + { echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; } + fi + ac_feature=`echo $ac_feature| sed 's/-/_/g'` + case "$ac_option" in + *=*) ;; + *) ac_optarg=yes ;; + esac + eval "enable_${ac_feature}='$ac_optarg'" ;; + + -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ + | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ + | --exec | --exe | --ex) + ac_prev=exec_prefix ;; + -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ + | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ + | --exec=* | --exe=* | --ex=*) + exec_prefix="$ac_optarg" ;; + + -gas | --gas | --ga | --g) + # Obsolete; use --with-gas. + with_gas=yes ;; + + -help | --help | --hel | --he) + # Omit some internal or obsolete options to make the list less imposing. + # This message is too long to be a string in the A/UX 3.1 sh. + cat << EOF +Usage: configure [options] [host] +Options: [defaults in brackets after descriptions] +Configuration: + --cache-file=FILE cache test results in FILE + --help print this message + --no-create do not create output files + --quiet, --silent do not print \`checking...' 
messages + --version print the version of autoconf that created configure +Directory and file names: + --prefix=PREFIX install architecture-independent files in PREFIX + [$ac_default_prefix] + --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX + [same as prefix] + --bindir=DIR user executables in DIR [EPREFIX/bin] + --sbindir=DIR system admin executables in DIR [EPREFIX/sbin] + --libexecdir=DIR program executables in DIR [EPREFIX/libexec] + --datadir=DIR read-only architecture-independent data in DIR + [PREFIX/share] + --sysconfdir=DIR read-only single-machine data in DIR [PREFIX/etc] + --sharedstatedir=DIR modifiable architecture-independent data in DIR + [PREFIX/com] + --localstatedir=DIR modifiable single-machine data in DIR [PREFIX/var] + --libdir=DIR object code libraries in DIR [EPREFIX/lib] + --includedir=DIR C header files in DIR [PREFIX/include] + --oldincludedir=DIR C header files for non-gcc in DIR [/usr/include] + --infodir=DIR info documentation in DIR [PREFIX/info] + --mandir=DIR man documentation in DIR [PREFIX/man] + --srcdir=DIR find the sources in DIR [configure dir or ..] 
+ --program-prefix=PREFIX prepend PREFIX to installed program names + --program-suffix=SUFFIX append SUFFIX to installed program names + --program-transform-name=PROGRAM + run sed PROGRAM on installed program names +EOF + cat << EOF +Host type: + --build=BUILD configure for building on BUILD [BUILD=HOST] + --host=HOST configure for HOST [guessed] + --target=TARGET configure for TARGET [TARGET=HOST] +Features and packages: + --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no) + --enable-FEATURE[=ARG] include FEATURE [ARG=yes] + --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] + --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) + --x-includes=DIR X include files are in DIR + --x-libraries=DIR X library files are in DIR +EOF + if test -n "$ac_help"; then + echo "--enable and --with options recognized:$ac_help" + fi + exit 0 ;; + + -host | --host | --hos | --ho) + ac_prev=host ;; + -host=* | --host=* | --hos=* | --ho=*) + host="$ac_optarg" ;; + + -includedir | --includedir | --includedi | --included | --include \ + | --includ | --inclu | --incl | --inc) + ac_prev=includedir ;; + -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ + | --includ=* | --inclu=* | --incl=* | --inc=*) + includedir="$ac_optarg" ;; + + -infodir | --infodir | --infodi | --infod | --info | --inf) + ac_prev=infodir ;; + -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) + infodir="$ac_optarg" ;; + + -libdir | --libdir | --libdi | --libd) + ac_prev=libdir ;; + -libdir=* | --libdir=* | --libdi=* | --libd=*) + libdir="$ac_optarg" ;; + + -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ + | --libexe | --libex | --libe) + ac_prev=libexecdir ;; + -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ + | --libexe=* | --libex=* | --libe=*) + libexecdir="$ac_optarg" ;; + + -localstatedir | --localstatedir | --localstatedi | --localstated \ + | --localstate | --localstat | --localsta | 
--localst \ + | --locals | --local | --loca | --loc | --lo) + ac_prev=localstatedir ;; + -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ + | --localstate=* | --localstat=* | --localsta=* | --localst=* \ + | --locals=* | --local=* | --loca=* | --loc=* | --lo=*) + localstatedir="$ac_optarg" ;; + + -mandir | --mandir | --mandi | --mand | --man | --ma | --m) + ac_prev=mandir ;; + -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) + mandir="$ac_optarg" ;; + + -nfp | --nfp | --nf) + # Obsolete; use --without-fp. + with_fp=no ;; + + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c) + no_create=yes ;; + + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) + no_recursion=yes ;; + + -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ + | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ + | --oldin | --oldi | --old | --ol | --o) + ac_prev=oldincludedir ;; + -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ + | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ + | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) + oldincludedir="$ac_optarg" ;; + + -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) + ac_prev=prefix ;; + -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) + prefix="$ac_optarg" ;; + + -program-prefix | --program-prefix | --program-prefi | --program-pref \ + | --program-pre | --program-pr | --program-p) + ac_prev=program_prefix ;; + -program-prefix=* | --program-prefix=* | --program-prefi=* \ + | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) + program_prefix="$ac_optarg" ;; + + -program-suffix | --program-suffix | --program-suffi | --program-suff \ + | --program-suf | --program-su | --program-s) + ac_prev=program_suffix ;; + -program-suffix=* | 
--program-suffix=* | --program-suffi=* \ + | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) + program_suffix="$ac_optarg" ;; + + -program-transform-name | --program-transform-name \ + | --program-transform-nam | --program-transform-na \ + | --program-transform-n | --program-transform- \ + | --program-transform | --program-transfor \ + | --program-transfo | --program-transf \ + | --program-trans | --program-tran \ + | --progr-tra | --program-tr | --program-t) + ac_prev=program_transform_name ;; + -program-transform-name=* | --program-transform-name=* \ + | --program-transform-nam=* | --program-transform-na=* \ + | --program-transform-n=* | --program-transform-=* \ + | --program-transform=* | --program-transfor=* \ + | --program-transfo=* | --program-transf=* \ + | --program-trans=* | --program-tran=* \ + | --progr-tra=* | --program-tr=* | --program-t=*) + program_transform_name="$ac_optarg" ;; + + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + silent=yes ;; + + -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) + ac_prev=sbindir ;; + -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ + | --sbi=* | --sb=*) + sbindir="$ac_optarg" ;; + + -sharedstatedir | --sharedstatedir | --sharedstatedi \ + | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ + | --sharedst | --shareds | --shared | --share | --shar \ + | --sha | --sh) + ac_prev=sharedstatedir ;; + -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ + | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ + | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ + | --sha=* | --sh=*) + sharedstatedir="$ac_optarg" ;; + + -site | --site | --sit) + ac_prev=site ;; + -site=* | --site=* | --sit=*) + site="$ac_optarg" ;; + + -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) + ac_prev=srcdir ;; + -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) + 
srcdir="$ac_optarg" ;; + + -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ + | --syscon | --sysco | --sysc | --sys | --sy) + ac_prev=sysconfdir ;; + -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ + | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*) + sysconfdir="$ac_optarg" ;; + + -target | --target | --targe | --targ | --tar | --ta | --t) + ac_prev=target ;; + -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) + target="$ac_optarg" ;; + + -v | -verbose | --verbose | --verbos | --verbo | --verb) + verbose=yes ;; + + -version | --version | --versio | --versi | --vers) + echo "configure generated by autoconf version 2.13" + exit 0 ;; + + -with-* | --with-*) + ac_package=`echo $ac_option|sed -e 's/-*with-//' -e 's/=.*//'` + # Reject names that are not valid shell variable names. + if test -n "`echo $ac_package| sed 's/[-_a-zA-Z0-9]//g'`"; then + { echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; } + fi + ac_package=`echo $ac_package| sed 's/-/_/g'` + case "$ac_option" in + *=*) ;; + *) ac_optarg=yes ;; + esac + eval "with_${ac_package}='$ac_optarg'" ;; + + -without-* | --without-*) + ac_package=`echo $ac_option|sed -e 's/-*without-//'` + # Reject names that are not valid shell variable names. + if test -n "`echo $ac_package| sed 's/[-a-zA-Z0-9_]//g'`"; then + { echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; } + fi + ac_package=`echo $ac_package| sed 's/-/_/g'` + eval "with_${ac_package}=no" ;; + + --x) + # Obsolete; use --with-x. 
+ with_x=yes ;; + + -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ + | --x-incl | --x-inc | --x-in | --x-i) + ac_prev=x_includes ;; + -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ + | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) + x_includes="$ac_optarg" ;; + + -x-libraries | --x-libraries | --x-librarie | --x-librari \ + | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) + ac_prev=x_libraries ;; + -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ + | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) + x_libraries="$ac_optarg" ;; + + -*) { echo "configure: error: $ac_option: invalid option; use --help to show usage" 1>&2; exit 1; } + ;; + + *) + if test -n "`echo $ac_option| sed 's/[-a-z0-9.]//g'`"; then + echo "configure: warning: $ac_option: invalid host type" 1>&2 + fi + if test "x$nonopt" != xNONE; then + { echo "configure: error: can only configure for one host and one target at a time" 1>&2; exit 1; } + fi + nonopt="$ac_option" + ;; + + esac +done + +if test -n "$ac_prev"; then + { echo "configure: error: missing argument to --`echo $ac_prev | sed 's/_/-/g'`" 1>&2; exit 1; } +fi + +trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15 + +# File descriptor usage: +# 0 standard input +# 1 file creation +# 2 errors and warnings +# 3 some systems may open it to /dev/tty +# 4 used on the Kubota Titan +# 6 checking for... messages and results +# 5 compiler messages saved in config.log +if test "$silent" = yes; then + exec 6>/dev/null +else + exec 6>&1 +fi +exec 5>./config.log + +echo "\ +This file contains any messages produced by compilers while +running configure, to aid debugging if configure makes a mistake. +" 1>&5 + +# Strip out --no-create and --no-recursion so they do not pile up. +# Also quote any args containing shell metacharacters. 
+ac_configure_args= +for ac_arg +do + case "$ac_arg" in + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c) ;; + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) ;; + *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*) + ac_configure_args="$ac_configure_args '$ac_arg'" ;; + *) ac_configure_args="$ac_configure_args $ac_arg" ;; + esac +done + +# NLS nuisances. +# Only set these to C if already set. These must not be set unconditionally +# because not all systems understand e.g. LANG=C (notably SCO). +# Fixing LC_MESSAGES prevents Solaris sh from translating var values in `set'! +# Non-C LC_CTYPE values break the ctype check. +if test "${LANG+set}" = set; then LANG=C; export LANG; fi +if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi +if test "${LC_MESSAGES+set}" = set; then LC_MESSAGES=C; export LC_MESSAGES; fi +if test "${LC_CTYPE+set}" = set; then LC_CTYPE=C; export LC_CTYPE; fi + +# confdefs.h avoids OS command line length limits that DEFS can exceed. +rm -rf conftest* confdefs.h +# AIX cpp loses on an empty file, so make sure it contains at least a newline. +echo > confdefs.h + +# A filename unique to this package, relative to the directory that +# configure is in, which we can look for to find out if srcdir is correct. +ac_unique_file=squidcore.c + +# Find the source files, if location was not specified. +if test -z "$srcdir"; then + ac_srcdir_defaulted=yes + # Try the directory containing this script, then its parent. + ac_prog=$0 + ac_confdir=`echo $ac_prog|sed 's%/[^/][^/]*$%%'` + test "x$ac_confdir" = "x$ac_prog" && ac_confdir=. + srcdir=$ac_confdir + if test ! -r $srcdir/$ac_unique_file; then + srcdir=.. + fi +else + ac_srcdir_defaulted=no +fi +if test ! -r $srcdir/$ac_unique_file; then + if test "$ac_srcdir_defaulted" = yes; then + { echo "configure: error: can not find sources in $ac_confdir or .." 
1>&2; exit 1; } + else + { echo "configure: error: can not find sources in $srcdir" 1>&2; exit 1; } + fi +fi +srcdir=`echo "${srcdir}" | sed 's%\([^/]\)/*$%\1%'` + +# Prefer explicitly selected file to automatically selected ones. +if test -z "$CONFIG_SITE"; then + if test "x$prefix" != xNONE; then + CONFIG_SITE="$prefix/share/config.site $prefix/etc/config.site" + else + CONFIG_SITE="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site" + fi +fi +for ac_site_file in $CONFIG_SITE; do + if test -r "$ac_site_file"; then + echo "loading site script $ac_site_file" + . "$ac_site_file" + fi +done + +if test -r "$cache_file"; then + echo "loading cache $cache_file" + . $cache_file +else + echo "creating cache $cache_file" + > $cache_file +fi + +ac_ext=c +# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. +ac_cpp='$CPP $CPPFLAGS' +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +cross_compiling=$ac_cv_prog_cc_cross + +ac_exeext= +ac_objext=o +if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then + # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu. + if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then + ac_n= ac_c=' +' ac_t=' ' + else + ac_n=-n ac_c= ac_t= + fi +else + ac_n= ac_c='\c' ac_t= +fi + + + + +echo " Welcome to SQUID... configuring for your system." + + + + + + + +# Extract the first word of "gcc", so it can be a program name with args. +set dummy gcc; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:540: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. 
+else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_prog_CC="gcc" + break + fi + done + IFS="$ac_save_ifs" +fi +fi +CC="$ac_cv_prog_CC" +if test -n "$CC"; then + echo "$ac_t""$CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +if test -z "$CC"; then + # Extract the first word of "cc", so it can be a program name with args. +set dummy cc; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:570: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_prog_rejected=no + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + if test "$ac_dir/$ac_word" = "/usr/ucb/cc"; then + ac_prog_rejected=yes + continue + fi + ac_cv_prog_CC="cc" + break + fi + done + IFS="$ac_save_ifs" +if test $ac_prog_rejected = yes; then + # We found a bogon in the path, so make sure we never use it. + set dummy $ac_cv_prog_CC + shift + if test $# -gt 0; then + # We chose a different compiler from the bogus one. + # However, it has the same basename, so the bogon will be chosen + # first if we set CC to just the basename; use the full file name. + shift + set dummy "$ac_dir/$ac_word" "$@" + shift + ac_cv_prog_CC="$@" + fi +fi +fi +fi +CC="$ac_cv_prog_CC" +if test -n "$CC"; then + echo "$ac_t""$CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + + if test -z "$CC"; then + case "`uname -s`" in + *win32* | *WIN32*) + # Extract the first word of "cl", so it can be a program name with args. +set dummy cl; ac_word=$2 +echo $ac_n "checking for $ac_word""... 
$ac_c" 1>&6 +echo "configure:621: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_prog_CC="cl" + break + fi + done + IFS="$ac_save_ifs" +fi +fi +CC="$ac_cv_prog_CC" +if test -n "$CC"; then + echo "$ac_t""$CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + ;; + esac + fi + test -z "$CC" && { echo "configure: error: no acceptable cc found in \$PATH" 1>&2; exit 1; } +fi + +echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works""... $ac_c" 1>&6 +echo "configure:653: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5 + +ac_ext=c +# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. +ac_cpp='$CPP $CPPFLAGS' +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +cross_compiling=$ac_cv_prog_cc_cross + +cat > conftest.$ac_ext << EOF + +#line 664 "configure" +#include "confdefs.h" + +main(){return(0);} +EOF +if { (eval echo configure:669: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + ac_cv_prog_cc_works=yes + # If we can't run a trivial program, we are probably using a cross compiler. + if (./conftest; exit) 2>/dev/null; then + ac_cv_prog_cc_cross=no + else + ac_cv_prog_cc_cross=yes + fi +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + ac_cv_prog_cc_works=no +fi +rm -fr conftest* +ac_ext=c +# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. 
+ac_cpp='$CPP $CPPFLAGS' +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +cross_compiling=$ac_cv_prog_cc_cross + +echo "$ac_t""$ac_cv_prog_cc_works" 1>&6 +if test $ac_cv_prog_cc_works = no; then + { echo "configure: error: installation or configuration problem: C compiler cannot create executables." 1>&2; exit 1; } +fi +echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler""... $ac_c" 1>&6 +echo "configure:695: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5 +echo "$ac_t""$ac_cv_prog_cc_cross" 1>&6 +cross_compiling=$ac_cv_prog_cc_cross + +echo $ac_n "checking whether we are using GNU C""... $ac_c" 1>&6 +echo "configure:700: checking whether we are using GNU C" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_gcc'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.c <&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then + ac_cv_prog_gcc=yes +else + ac_cv_prog_gcc=no +fi +fi + +echo "$ac_t""$ac_cv_prog_gcc" 1>&6 + +if test $ac_cv_prog_gcc = yes; then + GCC=yes +else + GCC= +fi + +ac_test_CFLAGS="${CFLAGS+set}" +ac_save_CFLAGS="$CFLAGS" +CFLAGS= +echo $ac_n "checking whether ${CC-cc} accepts -g""... 
$ac_c" 1>&6 +echo "configure:728: checking whether ${CC-cc} accepts -g" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_cc_g'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + echo 'void f(){}' > conftest.c +if test -z "`${CC-cc} -g -c conftest.c 2>&1`"; then + ac_cv_prog_cc_g=yes +else + ac_cv_prog_cc_g=no +fi +rm -f conftest* + +fi + +echo "$ac_t""$ac_cv_prog_cc_g" 1>&6 +if test "$ac_test_CFLAGS" = set; then + CFLAGS="$ac_save_CFLAGS" +elif test $ac_cv_prog_cc_g = yes; then + if test "$GCC" = yes; then + CFLAGS="-g -O2" + else + CFLAGS="-g" + fi +else + if test "$GCC" = yes; then + CFLAGS="-O2" + else + CFLAGS= + fi +fi + +echo $ac_n "checking whether ln -s works""... $ac_c" 1>&6 +echo "configure:760: checking whether ln -s works" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_LN_S'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + rm -f conftestdata +if ln -s X conftestdata 2>/dev/null +then + rm -f conftestdata + ac_cv_prog_LN_S="ln -s" +else + ac_cv_prog_LN_S=ln +fi +fi +LN_S="$ac_cv_prog_LN_S" +if test "$ac_cv_prog_LN_S" = "ln -s"; then + echo "$ac_t""yes" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +# Extract the first word of "ranlib", so it can be a program name with args. +set dummy ranlib; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:783: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$RANLIB"; then + ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. 
+ if test -f $ac_dir/$ac_word; then + ac_cv_prog_RANLIB="ranlib" + break + fi + done + IFS="$ac_save_ifs" + test -z "$ac_cv_prog_RANLIB" && ac_cv_prog_RANLIB=":" +fi +fi +RANLIB="$ac_cv_prog_RANLIB" +if test -n "$RANLIB"; then + echo "$ac_t""$RANLIB" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + + + + + echo $ac_n "checking whether your make is GNU make""... $ac_c" 1>&6 +echo "configure:814: checking whether your make is GNU make" >&5 + foundGNUmake='nope, assuming sysv make.' ; + EXEC_DEPENDENCY=\$\$\@_main.o ; + if ( make --version nothing 2> /dev/null | grep GNU > /dev/null ) ; then + foundGNUmake='yes, it is.' ; + EXEC_DEPENDENCY='%: %_main.o' ; + fi + echo "$ac_t""$foundGNUmake" 1>&6 + + + +# Check whether --with-pvm or --without-pvm was given. +if test "${with_pvm+set}" = set; then + withval="$with_pvm" + case $with_pvm in + yes) echo 'Configuring for PVM' + PVMLIBDIR="-L${PVM_ROOT}/lib/${PVM_ARCH}" + PVMINCDIR="-I${PVM_ROOT}/include" + PVMFLAG="-DSRE_ENABLE_PVM" + PVMLIBS="-lpvm3" + ;; + no) ;; + *) echo "Ignoring unknown argument to --with-pvm: $with_pvm" + ;; +esac +fi + + +echo $ac_n "checking whether byte ordering is bigendian""... $ac_c" 1>&6 +echo "configure:843: checking whether byte ordering is bigendian" >&5 +if eval "test \"`echo '$''{'ac_cv_c_bigendian'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_cv_c_bigendian=unknown +# See if sys/param.h defines the BYTE_ORDER macro. +cat > conftest.$ac_ext < +#include +int main() { + +#if !BYTE_ORDER || !BIG_ENDIAN || !LITTLE_ENDIAN + bogus endian macros +#endif +; return 0; } +EOF +if { (eval echo configure:861: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then + rm -rf conftest* + # It does; now see whether it defined to BIG_ENDIAN or not. 
+cat > conftest.$ac_ext < +#include +int main() { + +#if BYTE_ORDER != BIG_ENDIAN + not big endian +#endif +; return 0; } +EOF +if { (eval echo configure:876: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then + rm -rf conftest* + ac_cv_c_bigendian=yes +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ac_cv_c_bigendian=no +fi +rm -f conftest* +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 +fi +rm -f conftest* +if test $ac_cv_c_bigendian = unknown; then +if test "$cross_compiling" = yes; then + { echo "configure: error: can not run test program while cross compiling" 1>&2; exit 1; } +else + cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null +then + ac_cv_c_bigendian=no +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + ac_cv_c_bigendian=yes +fi +rm -fr conftest* +fi + +fi +fi + +echo "$ac_t""$ac_cv_c_bigendian" 1>&6 +if test $ac_cv_c_bigendian = yes; then + cat >> confdefs.h <<\EOF +#define WORDS_BIGENDIAN 1 +EOF + +fi + +for ac_func in ntohs +do +echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 +echo "configure:935: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. */ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. 
*/ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:963: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +echo $ac_n "checking for ntohs in -lsocket""... $ac_c" 1>&6 +echo "configure:985: checking for ntohs in -lsocket" >&5 +ac_lib_var=`echo socket'_'ntohs | sed 'y%./+-%__p_%'` +if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_save_LIBS="$LIBS" +LIBS="-lsocket $LIBS" +cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=no" +fi +rm -f conftest* +LIBS="$ac_save_LIBS" + +fi +if eval "test \"`echo '$ac_cv_lib_'$ac_lib_var`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_lib=HAVE_LIB`echo socket | sed -e 's/^a-zA-Z0-9_/_/g' \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/'` + cat >> confdefs.h <&6 +fi + +fi +done + +for ac_func in ntohl +do +echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 +echo "configure:1037: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. 
*/ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. */ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:1065: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +echo $ac_n "checking for ntohl in -lsocket""... 
$ac_c" 1>&6 +echo "configure:1087: checking for ntohl in -lsocket" >&5 +ac_lib_var=`echo socket'_'ntohl | sed 'y%./+-%__p_%'` +if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_save_LIBS="$LIBS" +LIBS="-lsocket $LIBS" +cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=no" +fi +rm -f conftest* +LIBS="$ac_save_LIBS" + +fi +if eval "test \"`echo '$ac_cv_lib_'$ac_lib_var`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_lib=HAVE_LIB`echo socket | sed -e 's/^a-zA-Z0-9_/_/g' \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/'` + cat >> confdefs.h <&6 +fi + +fi +done + +for ac_func in htons +do +echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 +echo "configure:1139: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. */ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. 
*/ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:1167: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +echo $ac_n "checking for htons in -lsocket""... $ac_c" 1>&6 +echo "configure:1189: checking for htons in -lsocket" >&5 +ac_lib_var=`echo socket'_'htons | sed 'y%./+-%__p_%'` +if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_save_LIBS="$LIBS" +LIBS="-lsocket $LIBS" +cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=no" +fi +rm -f conftest* +LIBS="$ac_save_LIBS" + +fi +if eval "test \"`echo '$ac_cv_lib_'$ac_lib_var`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_lib=HAVE_LIB`echo socket | sed -e 's/^a-zA-Z0-9_/_/g' \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/'` + cat >> confdefs.h <&6 +fi + +fi +done + +for ac_func in htonl +do +echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 +echo "configure:1241: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. 
*/ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. */ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:1269: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +echo $ac_n "checking for htonl in -lsocket""... $ac_c" 1>&6 +echo "configure:1291: checking for htonl in -lsocket" >&5 +ac_lib_var=`echo socket'_'htonl | sed 'y%./+-%__p_%'` +if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_save_LIBS="$LIBS" +LIBS="-lsocket $LIBS" +cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=no" +fi +rm -f conftest* +LIBS="$ac_save_LIBS" + +fi +if eval "test \"`echo '$ac_cv_lib_'$ac_lib_var`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_lib=HAVE_LIB`echo socket | sed -e 's/^a-zA-Z0-9_/_/g' \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/'` + cat >> confdefs.h <&6 +fi + +fi +done + +echo $ac_n "checking size of unsigned short""... 
$ac_c" 1>&6 +echo "configure:1341: checking size of unsigned short" >&5 +if eval "test \"`echo '$''{'ac_cv_sizeof_unsigned_short'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test "$cross_compiling" = yes; then + ac_cv_sizeof_unsigned_short=2 +else + cat > conftest.$ac_ext < +main() +{ + FILE *f=fopen("conftestval", "w"); + if (!f) exit(1); + fprintf(f, "%d\n", sizeof(unsigned short)); + exit(0); +} +EOF +if { (eval echo configure:1360: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null +then + ac_cv_sizeof_unsigned_short=`cat conftestval` +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + ac_cv_sizeof_unsigned_short=0 +fi +rm -fr conftest* +fi + +fi +echo "$ac_t""$ac_cv_sizeof_unsigned_short" 1>&6 +cat >> confdefs.h <&6 +echo "configure:1380: checking size of unsigned int" >&5 +if eval "test \"`echo '$''{'ac_cv_sizeof_unsigned_int'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test "$cross_compiling" = yes; then + ac_cv_sizeof_unsigned_int=4 +else + cat > conftest.$ac_ext < +main() +{ + FILE *f=fopen("conftestval", "w"); + if (!f) exit(1); + fprintf(f, "%d\n", sizeof(unsigned int)); + exit(0); +} +EOF +if { (eval echo configure:1399: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null +then + ac_cv_sizeof_unsigned_int=`cat conftestval` +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + ac_cv_sizeof_unsigned_int=0 +fi +rm -fr conftest* +fi + +fi +echo "$ac_t""$ac_cv_sizeof_unsigned_int" 1>&6 +cat >> confdefs.h <&6 +echo "configure:1419: checking size of unsigned long" >&5 +if eval "test \"`echo '$''{'ac_cv_sizeof_unsigned_long'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test "$cross_compiling" = yes; then + ac_cv_sizeof_unsigned_long=4 +else + cat > conftest.$ac_ext < +main() +{ + FILE 
*f=fopen("conftestval", "w"); + if (!f) exit(1); + fprintf(f, "%d\n", sizeof(unsigned long)); + exit(0); +} +EOF +if { (eval echo configure:1438: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null +then + ac_cv_sizeof_unsigned_long=`cat conftestval` +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + ac_cv_sizeof_unsigned_long=0 +fi +rm -fr conftest* +fi + +fi +echo "$ac_t""$ac_cv_sizeof_unsigned_long" 1>&6 +cat >> confdefs.h <&6 +echo "configure:1458: checking size of unsigned long long" >&5 +if eval "test \"`echo '$''{'ac_cv_sizeof_unsigned_long_long'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test "$cross_compiling" = yes; then + ac_cv_sizeof_unsigned_long_long=8 +else + cat > conftest.$ac_ext < +main() +{ + FILE *f=fopen("conftestval", "w"); + if (!f) exit(1); + fprintf(f, "%d\n", sizeof(unsigned long long)); + exit(0); +} +EOF +if { (eval echo configure:1477: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null +then + ac_cv_sizeof_unsigned_long_long=`cat conftestval` +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + ac_cv_sizeof_unsigned_long_long=0 +fi +rm -fr conftest* +fi + +fi +echo "$ac_t""$ac_cv_sizeof_unsigned_long_long" 1>&6 +cat >> confdefs.h <&2 "No 16-bit int? Manually edit config file to typedef sqd_uint16." +fi +if test "$ac_cv_sizeof_unsigned_int" = "4"; then + SQD_UINT32="unsigned int " +elif test "$ac_cv_sizeof_unsigned_long" = "4"; then + SQD_UINT32="unsigned long " +else + SQD_UINT32="FIXME" + echo "configure: warning: " 1>&2 "No 32-bit int? Manually edit config file to typedef sqd_uint32." 
+fi +if test "$ac_cv_sizeof_unsigned_long" = "8"; then + SQD_UINT64="unsigned long " +elif test "$ac_cv_sizeof_unsigned_long_long" = "8"; then + SQD_UINT64="unsigned long long" +else + SQD_UINT64="FIXME" + echo "configure: warning: " 1>&2 "No 64-bit int? Manually edit config file to typedef sqd_uint64." +fi + + + + + + + + echo $ac_n "checking whether fpos_t is an arithmetic datatype""... $ac_c" 1>&6 +echo "configure:1526: checking whether fpos_t is an arithmetic datatype" >&5 + fpos_arithmetic="no." + cat > conftest.$ac_ext < +int main() { +int main(void) { fpos_t f1, f2; if (f1 == f2) f1 = 0;} +; return 0; } +EOF +if { (eval echo configure:1536: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then + rm -rf conftest* + cat >> confdefs.h <<\EOF +#define ARITHMETIC_FPOS_T 1 +EOF + + fpos_arithmetic="yes." +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 +fi +rm -f conftest* + echo "$ac_t""$fpos_arithmetic" 1>&6 + + +for ac_func in ftello fseeko +do +echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 +echo "configure:1554: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. */ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. 
*/ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:1582: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +fi +done + +for ac_func in ftello64 fseeko64 +do +echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 +echo "configure:1609: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. */ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. 
*/ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:1637: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +fi +done + +for ac_func in ftell64 fseek64 +do +echo $ac_n "checking for $ac_func""... $ac_c" 1>&6 +echo "configure:1664: checking for $ac_func" >&5 +if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. */ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char $ac_func(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. 
*/ +#if defined (__stub_$ac_func) || defined (__stub___$ac_func) +choke me +#else +$ac_func(); +#endif + +; return 0; } +EOF +if { (eval echo configure:1692: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_$ac_func=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_$ac_func=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` + cat >> confdefs.h <&6 +fi +done + +echo $ac_n "checking for stat64""... $ac_c" 1>&6 +echo "configure:1717: checking for stat64" >&5 +if eval "test \"`echo '$''{'ac_cv_func_stat64'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +/* Override any gcc2 internal prototype to avoid an error. */ +/* We use char because int might match the return type of a gcc2 + builtin and then its argument prototype would still apply. */ +char stat64(); + +int main() { + +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. */ +#if defined (__stub_stat64) || defined (__stub___stat64) +choke me +#else +stat64(); +#endif + +; return 0; } +EOF +if { (eval echo configure:1745: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_func_stat64=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_func_stat64=no" +fi +rm -f conftest* +fi + +if eval "test \"`echo '$ac_cv_func_'stat64`\" = yes"; then + echo "$ac_t""yes" 1>&6 + : +else + echo "$ac_t""no" 1>&6 +fi + +echo $ac_n "checking size of off_t""... 
$ac_c" 1>&6 +echo "configure:1765: checking size of off_t" >&5 +if eval "test \"`echo '$''{'ac_cv_sizeof_off_t'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test "$cross_compiling" = yes; then + { echo "configure: error: can not run test program while cross compiling" 1>&2; exit 1; } +else + cat > conftest.$ac_ext < +main() +{ + FILE *f=fopen("conftestval", "w"); + if (!f) exit(1); + fprintf(f, "%d\n", sizeof(off_t)); + exit(0); +} +EOF +if { (eval echo configure:1784: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null +then + ac_cv_sizeof_off_t=`cat conftestval` +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + ac_cv_sizeof_off_t=0 +fi +rm -fr conftest* +fi + +fi +echo "$ac_t""$ac_cv_sizeof_off_t" 1>&6 +cat >> confdefs.h <&6 +echo "configure:1804: checking size of off64_t" >&5 +if eval "test \"`echo '$''{'ac_cv_sizeof_off64_t'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test "$cross_compiling" = yes; then + { echo "configure: error: can not run test program while cross compiling" 1>&2; exit 1; } +else + cat > conftest.$ac_ext < +main() +{ + FILE *f=fopen("conftestval", "w"); + if (!f) exit(1); + fprintf(f, "%d\n", sizeof(off64_t)); + exit(0); +} +EOF +if { (eval echo configure:1823: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null +then + ac_cv_sizeof_off64_t=`cat conftestval` +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + ac_cv_sizeof_off64_t=0 +fi +rm -fr conftest* +fi + +fi +echo "$ac_t""$ac_cv_sizeof_off64_t" 1>&6 +cat >> confdefs.h <&6 +echo "configure:1843: checking size of fpos_t" >&5 +if eval "test \"`echo '$''{'ac_cv_sizeof_fpos_t'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test "$cross_compiling" = yes; then + { echo "configure: error: can not run test program while 
cross compiling" 1>&2; exit 1; } +else + cat > conftest.$ac_ext < +main() +{ + FILE *f=fopen("conftestval", "w"); + if (!f) exit(1); + fprintf(f, "%d\n", sizeof(fpos_t)); + exit(0); +} +EOF +if { (eval echo configure:1862: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null +then + ac_cv_sizeof_fpos_t=`cat conftestval` +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + ac_cv_sizeof_fpos_t=0 +fi +rm -fr conftest* +fi + +fi +echo "$ac_t""$ac_cv_sizeof_fpos_t" 1>&6 +cat >> confdefs.h < confcache <<\EOF +# This file is a shell script that caches the results of configure +# tests run on this system so they can be shared between configure +# scripts and configure runs. It is not useful on other systems. +# If it contains results you don't want to keep, you may remove or edit it. +# +# By default, configure uses ./config.cache as the cache file, +# creating it if it does not exist already. You can give configure +# the --cache-file=FILE option to use a different cache file; that is +# what configure does when it calls configure scripts in +# subdirectories, so they share the cache. +# Giving --cache-file=/dev/null disables caching, for debugging configure. +# config.status only pays attention to the cache file if you give it the +# --recheck option to rerun configure. +# +EOF +# The following way of writing the cache mishandles newlines in values, +# but we know of no workaround that is simple, portable, and efficient. +# So, don't put newlines in cache variables' values. +# Ultrix sh set writes to stderr and can't be redirected directly, +# and sets the high bit in the cache file unless we assign to the vars. +(set) 2>&1 | + case `(ac_space=' '; set | grep ac_space) 2>&1` in + *ac_space=\ *) + # `set' does not quote correctly, so add quotes (double-quote substitution + # turns \\\\ into \\, and sed turns \\ into \). 
+ sed -n \ + -e "s/'/'\\\\''/g" \ + -e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p" + ;; + *) + # `set' quotes correctly as required by POSIX, so do not add quotes. + sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p' + ;; + esac >> confcache +if cmp -s $cache_file confcache; then + : +else + if test -w $cache_file; then + echo "updating cache $cache_file" + cat confcache > $cache_file + else + echo "not updating unwritable cache $cache_file" + fi +fi +rm -f confcache + +trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15 + +test "x$prefix" = xNONE && prefix=$ac_default_prefix +# Let make expand exec_prefix. +test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' + +# Any assignment to VPATH causes Sun make to only execute +# the first set of double-colon rules, so remove it if not needed. +# If there is a colon in the path, we need to keep it. +if test "x$srcdir" = x.; then + ac_vpsub='/^[ ]*VPATH[ ]*=[^:]*$/d' +fi + +trap 'rm -f $CONFIG_STATUS conftest*; exit 1' 1 2 15 + +DEFS=-DHAVE_CONFIG_H + +# Without the "./", some shells look in PATH for config.status. +: ${CONFIG_STATUS=./config.status} + +echo creating $CONFIG_STATUS +rm -f $CONFIG_STATUS +cat > $CONFIG_STATUS </dev/null | sed 1q`: +# +# $0 $ac_configure_args +# +# Compiler output produced by configure, useful for debugging +# configure, is in ./config.log if it exists. 
+ +ac_cs_usage="Usage: $CONFIG_STATUS [--recheck] [--version] [--help]" +for ac_option +do + case "\$ac_option" in + -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + echo "running \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion" + exec \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion ;; + -version | --version | --versio | --versi | --vers | --ver | --ve | --v) + echo "$CONFIG_STATUS generated by autoconf version 2.13" + exit 0 ;; + -help | --help | --hel | --he | --h) + echo "\$ac_cs_usage"; exit 0 ;; + *) echo "\$ac_cs_usage"; exit 1 ;; + esac +done + +ac_given_srcdir=$srcdir + +trap 'rm -fr `echo "Makefile squid.h squidconf.h" | sed "s/:[^ ]*//g"` conftest*; exit 1' 1 2 15 +EOF +cat >> $CONFIG_STATUS < conftest.subs <<\\CEOF +$ac_vpsub +$extrasub +s%@SHELL@%$SHELL%g +s%@CFLAGS@%$CFLAGS%g +s%@CPPFLAGS@%$CPPFLAGS%g +s%@CXXFLAGS@%$CXXFLAGS%g +s%@FFLAGS@%$FFLAGS%g +s%@DEFS@%$DEFS%g +s%@LDFLAGS@%$LDFLAGS%g +s%@LIBS@%$LIBS%g +s%@exec_prefix@%$exec_prefix%g +s%@prefix@%$prefix%g +s%@program_transform_name@%$program_transform_name%g +s%@bindir@%$bindir%g +s%@sbindir@%$sbindir%g +s%@libexecdir@%$libexecdir%g +s%@datadir@%$datadir%g +s%@sysconfdir@%$sysconfdir%g +s%@sharedstatedir@%$sharedstatedir%g +s%@localstatedir@%$localstatedir%g +s%@libdir@%$libdir%g +s%@includedir@%$includedir%g +s%@oldincludedir@%$oldincludedir%g +s%@infodir@%$infodir%g +s%@mandir@%$mandir%g +s%@MDEFS@%$MDEFS%g +s%@PVMLIBDIR@%$PVMLIBDIR%g +s%@PVMINCDIR@%$PVMINCDIR%g +s%@PVMFLAG@%$PVMFLAG%g +s%@PVMLIBS@%$PVMLIBS%g +s%@CC@%$CC%g +s%@LN_S@%$LN_S%g +s%@RANLIB@%$RANLIB%g +s%@EXEC_DEPENDENCY@%$EXEC_DEPENDENCY%g +s%@SQD_UINT16@%$SQD_UINT16%g +s%@SQD_UINT32@%$SQD_UINT32%g +s%@SQD_UINT64@%$SQD_UINT64%g + +CEOF +EOF + +cat >> $CONFIG_STATUS <<\EOF + +# Split the substitutions into bite-sized pieces for seds with +# small command number limits, like on Digital OSF/1 and HP-UX. 
+ac_max_sed_cmds=90 # Maximum number of lines to put in a sed script. +ac_file=1 # Number of current file. +ac_beg=1 # First line for current file. +ac_end=$ac_max_sed_cmds # Line after last line for current file. +ac_more_lines=: +ac_sed_cmds="" +while $ac_more_lines; do + if test $ac_beg -gt 1; then + sed "1,${ac_beg}d; ${ac_end}q" conftest.subs > conftest.s$ac_file + else + sed "${ac_end}q" conftest.subs > conftest.s$ac_file + fi + if test ! -s conftest.s$ac_file; then + ac_more_lines=false + rm -f conftest.s$ac_file + else + if test -z "$ac_sed_cmds"; then + ac_sed_cmds="sed -f conftest.s$ac_file" + else + ac_sed_cmds="$ac_sed_cmds | sed -f conftest.s$ac_file" + fi + ac_file=`expr $ac_file + 1` + ac_beg=$ac_end + ac_end=`expr $ac_end + $ac_max_sed_cmds` + fi +done +if test -z "$ac_sed_cmds"; then + ac_sed_cmds=cat +fi +EOF + +cat >> $CONFIG_STATUS <> $CONFIG_STATUS <<\EOF +for ac_file in .. $CONFIG_FILES; do if test "x$ac_file" != x..; then + # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in". + case "$ac_file" in + *:*) ac_file_in=`echo "$ac_file"|sed 's%[^:]*:%%'` + ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;; + *) ac_file_in="${ac_file}.in" ;; + esac + + # Adjust a relative srcdir, top_srcdir, and INSTALL for subdirectories. + + # Remove last slash and all that follows it. Not all systems have dirname. + ac_dir=`echo $ac_file|sed 's%/[^/][^/]*$%%'` + if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then + # The file is in a subdirectory. + test ! -d "$ac_dir" && mkdir "$ac_dir" + ac_dir_suffix="/`echo $ac_dir|sed 's%^\./%%'`" + # A "../" for each directory in $ac_dir_suffix. + ac_dots=`echo $ac_dir_suffix|sed 's%/[^/]*%../%g'` + else + ac_dir_suffix= ac_dots= + fi + + case "$ac_given_srcdir" in + .) srcdir=. + if test -z "$ac_dots"; then top_srcdir=. + else top_srcdir=`echo $ac_dots|sed 's%/$%%'`; fi ;; + /*) srcdir="$ac_given_srcdir$ac_dir_suffix"; top_srcdir="$ac_given_srcdir" ;; + *) # Relative path. 
+ srcdir="$ac_dots$ac_given_srcdir$ac_dir_suffix" + top_srcdir="$ac_dots$ac_given_srcdir" ;; + esac + + + echo creating "$ac_file" + rm -f "$ac_file" + configure_input="Generated automatically from `echo $ac_file_in|sed 's%.*/%%'` by configure." + case "$ac_file" in + *Makefile*) ac_comsub="1i\\ +# $configure_input" ;; + *) ac_comsub= ;; + esac + + ac_file_inputs=`echo $ac_file_in|sed -e "s%^%$ac_given_srcdir/%" -e "s%:% $ac_given_srcdir/%g"` + sed -e "$ac_comsub +s%@configure_input@%$configure_input%g +s%@srcdir@%$srcdir%g +s%@top_srcdir@%$top_srcdir%g +" $ac_file_inputs | (eval "$ac_sed_cmds") > $ac_file +fi; done +rm -f conftest.s* + +# These sed commands are passed to sed as "A NAME B NAME C VALUE D", where +# NAME is the cpp macro being defined and VALUE is the value it is being given. +# +# ac_d sets the value in "#define NAME VALUE" lines. +ac_dA='s%^\([ ]*\)#\([ ]*define[ ][ ]*\)' +ac_dB='\([ ][ ]*\)[^ ]*%\1#\2' +ac_dC='\3' +ac_dD='%g' +# ac_u turns "#undef NAME" with trailing blanks into "#define NAME VALUE". +ac_uA='s%^\([ ]*\)#\([ ]*\)undef\([ ][ ]*\)' +ac_uB='\([ ]\)%\1#\2define\3' +ac_uC=' ' +ac_uD='\4%g' +# ac_e turns "#undef NAME" without trailing blanks into "#define NAME VALUE". +ac_eA='s%^\([ ]*\)#\([ ]*\)undef\([ ][ ]*\)' +ac_eB='$%\1#\2define\3' +ac_eC=' ' +ac_eD='%g' + +if test "${CONFIG_HEADERS+set}" != set; then +EOF +cat >> $CONFIG_STATUS <> $CONFIG_STATUS <<\EOF +fi +for ac_file in .. $CONFIG_HEADERS; do if test "x$ac_file" != x..; then + # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in". 
+ case "$ac_file" in + *:*) ac_file_in=`echo "$ac_file"|sed 's%[^:]*:%%'` + ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;; + *) ac_file_in="${ac_file}.in" ;; + esac + + echo creating $ac_file + + rm -f conftest.frag conftest.in conftest.out + ac_file_inputs=`echo $ac_file_in|sed -e "s%^%$ac_given_srcdir/%" -e "s%:% $ac_given_srcdir/%g"` + cat $ac_file_inputs > conftest.in + +EOF + +# Transform confdefs.h into a sed script conftest.vals that substitutes +# the proper values into config.h.in to produce config.h. And first: +# Protect against being on the right side of a sed subst in config.status. +# Protect against being in an unquoted here document in config.status. +rm -f conftest.vals +cat > conftest.hdr <<\EOF +s/[\\&%]/\\&/g +s%[\\$`]%\\&%g +s%#define \([A-Za-z_][A-Za-z0-9_]*\) *\(.*\)%${ac_dA}\1${ac_dB}\1${ac_dC}\2${ac_dD}%gp +s%ac_d%ac_u%gp +s%ac_u%ac_e%gp +EOF +sed -n -f conftest.hdr confdefs.h > conftest.vals +rm -f conftest.hdr + +# This sed command replaces #undef with comments. This is necessary, for +# example, in the case of _POSIX_SOURCE, which is predefined and required +# on some systems where configure will not decide to define it. +cat >> conftest.vals <<\EOF +s%^[ ]*#[ ]*undef[ ][ ]*[a-zA-Z_][a-zA-Z_0-9]*%/* & */% +EOF + +# Break up conftest.vals because some shells have a limit on +# the size of here documents, and old seds have small limits too. + +rm -f conftest.tail +while : +do + ac_lines=`grep -c . conftest.vals` + # grep -c gives empty output for an empty file on some AIX systems. + if test -z "$ac_lines" || test "$ac_lines" -eq 0; then break; fi + # Write a limited-size here document to conftest.frag. 
+ echo ' cat > conftest.frag <> $CONFIG_STATUS + sed ${ac_max_here_lines}q conftest.vals >> $CONFIG_STATUS + echo 'CEOF + sed -f conftest.frag conftest.in > conftest.out + rm -f conftest.in + mv conftest.out conftest.in +' >> $CONFIG_STATUS + sed 1,${ac_max_here_lines}d conftest.vals > conftest.tail + rm -f conftest.vals + mv conftest.tail conftest.vals +done +rm -f conftest.vals + +cat >> $CONFIG_STATUS <<\EOF + rm -f conftest.frag conftest.h + echo "/* $ac_file. Generated automatically by configure. */" > conftest.h + cat conftest.in >> conftest.h + rm -f conftest.in + if cmp -s $ac_file conftest.h 2>/dev/null; then + echo "$ac_file is unchanged" + rm -f conftest.h + else + # Remove last slash and all that follows it. Not all systems have dirname. + ac_dir=`echo $ac_file|sed 's%/[^/][^/]*$%%'` + if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then + # The file is in a subdirectory. + test ! -d "$ac_dir" && mkdir "$ac_dir" + fi + rm -f $ac_file + mv conftest.h $ac_file + fi +fi; done + +EOF +cat >> $CONFIG_STATUS <> $CONFIG_STATUS <<\EOF + +exit 0 +EOF +chmod +x $CONFIG_STATUS +rm -fr confdefs* $ac_clean_files +test "$no_create" = yes || ${CONFIG_SHELL-/bin/sh} $CONFIG_STATUS || exit 1 + + + diff --git a/forester/archive/RIO/others/hmmer/squid/dayhoff.c b/forester/archive/RIO/others/hmmer/squid/dayhoff.c new file mode 100644 index 0000000..906fb76 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/dayhoff.c @@ -0,0 +1,171 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* dayhoff.c + * + * Routines for dealing with PAM matrices. 
+ * + * Includes: + * ParsePAMFile() -- read a PAM matrix from disk. + * + * + * SRE - Fri Apr 2 11:23:45 1993 + * RCS $Id: dayhoff.c,v 1.1.1.1 2005/03/22 08:34:17 cmzmasek Exp $ + */ + + +#include +#include +#include +#include +#include +#include "squid.h" + +/* Function: ParsePAMFile() + * + * Purpose: Given a pointer to an open file containing a PAM matrix, + * parse the file and allocate and fill a 2D array of + * floats containing the matrix. The PAM file is + * assumed to be in the format that NCBI distributes + * with BLAST. BLOSUM matrices also work fine, as + * produced by Henikoff's program "MATBLAS". + * + * Parses both old format and new format BLAST matrices. + * Old format just had rows of integers. + * New format includes a leading character on each row. + * + * The PAM matrix is a 27x27 matrix, 0=A..25=Z,26=*. + * Note that it's not a 20x20 matrix as you might expect; + * this is for speed of indexing as well as the ability + * to deal with ambiguous characters. + * + * Args: fp - open PAM file + * ret_pam - RETURN: pam matrix, integers + * ret_scale - RETURN: scale factor for converting + * to real Sij. For instance, PAM120 is + * given in units of ln(2)/2. This may + * be passed as NULL if the caller + * doesn't care. + * + * Returns: 1 on success; 0 on failure and sets squid_errno to + * indicate the cause. ret_pam is allocated here and + * must be freed by the caller (use FreePAM). + */ +int +ParsePAMFile(FILE *fp, int ***ret_pam, float *ret_scale) +{ + int **pam; + char buffer[512]; /* input buffer from fp */ + int order[27]; /* order of fields, obtained from header */ + int nsymbols; /* total number of symbols in matrix */ + char *sptr; + int idx; + int row, col; + float scale; + int gotscale = FALSE; + + if (fp == NULL) { squid_errno = SQERR_NODATA; return 0; } + + /* Look at the first non-blank, non-comment line in the file. + * It gives single-letter codes in the order the PAM matrix + * is arrayed in the file. 
+ */ + do { + if (fgets(buffer, 512, fp) == NULL) + { squid_errno = SQERR_NODATA; return 0; } + + /* Get the scale factor from the header. + * For BLOSUM files, we assume the line looks like: + * BLOSUM Clustered Scoring Matrix in 1/2 Bit Units + * and we assume that the fraction is always 1/x; + * + * For PAM files, we assume the line looks like: + * PAM 120 substitution matrix, scale = ln(2)/2 = 0.346574 + * and we assume that the number following the final '=' is our scale + */ + scale = 0.0; /* just to silence gcc uninit warnings */ + if (strstr(buffer, "BLOSUM Clustered Scoring Matrix") != NULL && + (sptr = strchr(buffer, '/')) != NULL) + { + sptr++; + if (! isdigit((int) (*sptr))) { squid_errno = SQERR_FORMAT; return 0; } + scale = (float) (log(2.0) / atof(sptr)); + gotscale = TRUE; + } + else if (strstr(buffer, "substitution matrix,") != NULL) + { + while ((sptr = strrchr(buffer, '=')) != NULL) { + sptr += 2; + if (IsReal(sptr)) { + scale = atof(sptr); + gotscale = TRUE; + break; + } + } + } + } while ((sptr = strtok(buffer, " \t\n")) == NULL || *sptr == '#'); + + idx = 0; + do { + order[idx] = (int) *sptr - (int) 'A'; + if (order[idx] < 0 || order[idx] > 25) order[idx] = 26; + idx++; + } while ((sptr = strtok(NULL, " \t\n")) != NULL); + nsymbols = idx; + + /* Allocate a pam matrix. For speed of indexing, we use + * a 27x27 matrix so we can do lookups using the ASCII codes + * of amino acid single-letter representations, plus one + * extra field to deal with the "*" (terminators). + */ + if ((pam = (int **) calloc (27, sizeof(int *))) == NULL) + Die("calloc failed"); + for (idx = 0; idx < 27; idx++) + if ((pam[idx] = (int *) calloc (27, sizeof(int))) == NULL) + Die("calloc failed"); + + /* Parse the rest of the file. 
+ */ + for (row = 0; row < nsymbols; row++) + { + if (fgets(buffer, 512, fp) == NULL) + { squid_errno = SQERR_NODATA; return 0; } + + if ((sptr = strtok(buffer, " \t\n")) == NULL) + { squid_errno = SQERR_NODATA; return 0; } + for (col = 0; col < nsymbols; col++) + { + if (sptr == NULL) { squid_errno = SQERR_NODATA; return 0; } + + /* Watch out for new BLAST format, with leading characters + */ + if (*sptr == '*' || isalpha((int) *sptr)) + col--; /* hack hack */ + else + pam [order[row]] [order[col]] = atoi(sptr); + + sptr = strtok(NULL, " \t\n"); + } + } + + /* Return + */ + if (ret_scale != NULL) + { + if (gotscale) *ret_scale = scale; + else + { + Warn("Failed to parse PAM matrix scale factor. Defaulting to ln(2)/2!"); + *ret_scale = log(2.0) / 2.0; + } + } + *ret_pam = pam; + return 1; +} diff --git a/forester/archive/RIO/others/hmmer/squid/eps.c b/forester/archive/RIO/others/hmmer/squid/eps.c new file mode 100644 index 0000000..849c8f1 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/eps.c @@ -0,0 +1,115 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* eps.c + * SRE, Thu Jun 21 18:02:31 2001 [St. Louis] + * + * Some crude support for Encapsulated PostScript (EPS) output, + * DSC compliant. + * + * CVS $Id: eps.c,v 1.1.1.1 2005/03/22 08:34:17 cmzmasek Exp $ + */ + +#include +#include + +#include "squid.h" +#include "msa.h" + +/* Function: EPSWriteSmallMSA() + * Date: SRE, Thu Jun 21 18:15:21 2001 [St. Louis] + * + * Purpose: Write an alignment in singleblock, Stockholm/SELEX like + * format to an open file. Very crude. 
+ * Currently fails if the alignment is >50 columns long, because + * it doesn't think it will fit on a single page. + * + * Args: fp - open file for writing + * msa - alignment to write + * + * Returns: (void) + */ +void +EPSWriteSmallMSA(FILE *fp, MSA *msa) +{ + int namewidth; /* namewidth in PostScript units */ + int fontwidth; /* width of a character in this font */ + int hspace; /* horizontal space between aligned chars */ + int vspace; /* vertical space between sequences */ + char *font; /* font name, e.g. "Courier" */ + int fontsize; /* font size in pts */ + int i,j; /* counter over sequences, columns */ + int len; /* tmp var holding length of something */ + int width, height; /* width and height of bounding box */ + int xpos, ypos; /* x,y position */ + + /* Set some font characteristics; done here, so it'll + * be easy to change. Magic numbers for Courier 12 determined + * by trial and error. + */ + fontwidth = 8; + hspace = 9; + vspace = 15; + font = sre_strdup("Courier", -1); + fontsize = 12; + + /* Find the width of the longest sequence name in characters. + */ + namewidth = 0; + for (i = 0; i < msa->nseq; i++) + if ((len = (int) strlen(msa->sqname[i])) > namewidth) + namewidth = len; + namewidth += 1; /* add a space to separate name & aligned seq */ + namewidth *= fontwidth; + + /* Determine bounding box + */ + if (msa->alen > 50) Die("No EPS fmt if alignment is >50 columns"); + width = namewidth + hspace*msa->alen; + if (width > 612) Die("Alignment too wide to write in EPS"); + height = vspace*msa->nseq; + if (height > 792) Die("Too many seqs to write in EPS"); + + /* Magic EPS header, bare-bones DSC-compliant. 
+ */ + fprintf(fp, "%%!PS-Adobe-3.0 EPSF-3.0\n"); + fprintf(fp, "%%%%BoundingBox: %d %d %d %d\n", 0, 0, width, height); + fprintf(fp, "%%%%Pages: 1\n"); + fprintf(fp, "%%%%EndComments\n"); + + /* More postscript magic before we start the alignment + */ + fprintf(fp, "/%s findfont\n", font); + fprintf(fp, "%d scalefont\n", fontsize); + fprintf(fp, "setfont\n"); + fprintf(fp, "newpath\n"); + + /* Write the alignment in PostScript in a single block + */ + for (i = 0; i < msa->nseq; i++) + { + ypos = (msa->nseq-i-1)*vspace; + /* name first */ + fprintf(fp, "%d %d moveto\n", 0, ypos); + fprintf(fp, "(%s) show\n", msa->sqname[i]); + /* now seq */ + xpos = namewidth; + for (j = 0; j < msa->alen; j++) + { + fprintf(fp, "%d %d moveto\n", xpos, ypos); + fprintf(fp, "(%c) show\n", msa->aseq[i][j]); + xpos+= hspace; + } + } + + free(font); +} + + diff --git a/forester/archive/RIO/others/hmmer/squid/file.c b/forester/archive/RIO/others/hmmer/squid/file.c new file mode 100644 index 0000000..ec3647a --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/file.c @@ -0,0 +1,231 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + + +/* file.c + * SRE, Wed Jun 19 11:19:22 1996 + * + * File operation utilities, dealing with pathnames, directories, + * and environment variables. + * + * The goal is to have these be platform-independent but they + * currently are UNIX-specific: i.e. this file is currently POSIX compliant + * but it is NOT ANSI C compliant. (The sole offender is getenv().) 
/* 
 * VMS:    #define DIRSLASH ']'
 * MacOS:  #define DIRSLASH ':'
 * DOS:    #define DIRSLASH '\\'
 * 
 * The code assumes that '.' is used for file name extensions,
 * such as "foo.bar".
 */
#define DIRSLASH '/'           /* UNIX directory paths have /foo/bar */

/* Function: FileDirname()
 * 
 * Purpose:  Extract the directory part of a file name:
 *             "/foo/bar/baz"  -> "/foo/bar"
 *             "foo/bar"       -> "foo" 
 *             "foo"           -> "." 
 *             "/"             -> "/" 
 *           The result is everything before the last DIRSLASH.
 *           When there is no DIRSLASH the result is "."; when the
 *           only DIRSLASH is the first character the result is "/".
 *           
 * Args:     file - name of file, e.g. "/foo/bar/baz".
 *           
 * Return:   ptr to a newly allocated string; caller frees.
 */
char *
FileDirname(char *file)
{
  char *slash  = strrchr(file, DIRSLASH);
  int   prefix = 0;		/* number of chars before the last slash */
  char *result;

  if (slash != NULL) prefix = (int) (slash - file);

  result = (char *) MallocOrDie (sizeof(char) * (prefix+2));
  if (prefix > 0)
    {
      memcpy(result, file, prefix);
      result[prefix] = '\0';
      return result;
    }

  /* No usable prefix: either a root-anchored name or a bare name. */
  result[0] = (*file == DIRSLASH) ? DIRSLASH : '.';
  result[1] = '\0';
  return result;
}
+ * + * Args: file - name of file "/foo/bar/baz.1" + * noextension - TRUE to also remove extensions + * + * Return: ptr to malloc'ed string "baz.1" + */ +char * +FileTail(char *file, int noextension) +{ + char *tail; + char *lastslash; + char *lastdot; + /* remove directory prefix */ + lastslash = strrchr(file, DIRSLASH); + tail = (char *) MallocOrDie (sizeof(char) * (strlen(file)+1)); + if (lastslash == NULL) strcpy(tail, file); + else strcpy(tail, lastslash+1); + /* remove trailing suffix */ + if (noextension) { + if ((lastdot = strrchr(tail, '.')) != NULL) + *lastdot = '\0'; + } + + return tail; +} + + +/* Function: FileConcat() + * + * Purpose: Concatenate a directory path and a file name, + * returning a pointer to a malloc'ed string with the + * full filename. + */ +char * +FileConcat(char *dir, char *file) +{ + char *full; + + full = (char *) MallocOrDie (sizeof(char) * (strlen(dir)+strlen(file)+2)); + if (*file == DIRSLASH) strcpy(full, file); /* file = "/foo", ignore directory. */ + else sprintf(full, "%s%c%s", dir, DIRSLASH, file); + return full; +} + + +/* Function: FileAddSuffix() + * Date: SRE, Wed Aug 1 11:19:33 2001 [Pasadena] + * + * Purpose: Add a suffix to a filename, return a malloc'ed + * string containing the new filename.sfx name. + * Example: + * FileAddSuffix("genbank", "ssi") + * returns "genbank.ssi". + */ +char * +FileAddSuffix(char *filename, char *sfx) +{ + char *new; + new = MallocOrDie(strlen(filename) + strlen(sfx) + 2); + sprintf(new, "%s.%s", filename, sfx); + return new; +} + +/* Function: EnvFileOpen() + * Date: Sun Feb 12 10:55:29 1995 + * + * Purpose: Open a file, given a file name and an environment + * variable that contains a directory path. Files + * are opened read-only. Does not look at current directory + * unless "." is explicitly in the path specified by env. 
+ * + * For instance: + * fp = EnvFileOpen("BLOSUM45", "BLASTMAT", NULL); + * or: + * fp = EnvFileOpen("swiss", "BLASTDB", NULL); + * + * Environment variables may contain a colon-delimited + * list of more than one path; e.g. + * setenv BLASTDB /nfs/databases/foo:/nfs/databases/bar + * + * Sometimes a group of files may be found in + * one directory; for instance, an index file with a + * database. The caller can EnvFileOpen() the main + * file, and ask to get the name of the + * directory back in ret_dir, so it can construct + * the other auxiliary file names and fopen() them. (If it called + * EnvFileOpen(), it might get confused by + * file name clashes and open files in different + * directories. + * + * Args: fname - name of file to open + * env - name of environment variable containing path + * ret_dir - if non-NULL, RETURN: name of dir that was used. + * + * Return: FILE * to open file, or NULL on failure -- same as fopen() + * Caller must free ret_dir if it passed a non-NULL address. + */ +FILE * +EnvFileOpen(char *fname, char *env, char **ret_dir) +{ + FILE *fp; + char *path; + char *s; /* ptr to indiv element in env list */ + char full[1024]; /* constructed file name */ + + if (env == NULL) return NULL; + if ((path = Strdup(getenv(env))) == NULL) return NULL; + + fp = NULL; + s = strtok(path, ":"); + while (s != NULL) + { + if (((int) strlen(fname) + (int) strlen(s) + 2) > 1024) + { free(path); return NULL; } + sprintf(full, "%s%c%s", s, DIRSLASH, fname); + if ((fp = fopen(full, "r")) != NULL) break; + s = strtok(NULL, ":"); + } + + /* Return the path we used, if caller wants it + */ + if (ret_dir != NULL) *ret_dir = Strdup(s); + free(path); + + return fp; +} + + +/* Function: FileExists() + * + * Purpose: Return TRUE if filename exists. + * Testing fopen() is the only possible platform-independent test + * I'm aware of. 
+ */ +int +FileExists(char *filename) +{ + FILE *fp; + if ((fp = fopen(filename, "r"))) { fclose(fp); return TRUE; } + return FALSE; +} + + diff --git a/forester/archive/RIO/others/hmmer/squid/getopt.c b/forester/archive/RIO/others/hmmer/squid/getopt.c new file mode 100644 index 0000000..75158f7 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/getopt.c @@ -0,0 +1,251 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* RCS $Id: getopt.c,v 1.1.1.1 2005/03/22 08:34:26 cmzmasek Exp $ + */ + +#include +#include +#include + +#include "squid.h" + +/* Function: Getopt() + * + * Purpose: Portable command line option parsing with abbreviated + * option switches. Replaces UNIX getopt(). Using UNIX getopt() + * hinders portability to non-UNIX platforms, and getopt() + * is also limited to single letter options. + * + * Getopt() implements a superset of UNIX getopt(). + * All of getopt()'s single-character switch behavior + * is emulated, and "--" by itself terminates the options. + * Additionally, Getopt() provides extended switches + * like "--youroptionhere", and Getopt() type checks + * arguments. + * + * Extended options must start with "--", as in "--option1". + * Normal options must start with "-", as in "-o". + * Normal options may be concatenated, as in "-a -b" == "-ab". + * + * See bottom of this .c file after #fdef GETOPT_TESTDRIVER + * for an example of calling Getopt(). + * + * Args: argc - from main(). number of elems in argv. + * argv - from main(). argv[0] is the name of the command. 
+ * opt - array of opt_s structures, defining option switches + * nopts - number of switches in opt + * usage - a (possibly long) string to print if usage error. + * ret_optind - RETURN: the index in argv[] of the next + * valid command-line token. + * ret_optname- RETURN: ptr to the name of option switch + * seen, or NULL if no option was seen. + * ret_optarg - RETURN: ptr to the optional argument, if any; + * NULL if option takes no argument. + * + * Return: 1 if a valid option was parsed. + * 0 if no option was found, and command-line parsing is complete. + * Die()'s here if an error is detected. + */ +int +Getopt(int argc, char **argv, struct opt_s *opt, int nopts, char *usage, + int *ret_optind, char **ret_optname, char **ret_optarg) +{ + int i; + int arglen; + int nmatch; + static int optind = 1; /* init to 1 on first call */ + static char *optptr = NULL; /* ptr to next valid switch */ + int opti = 0; /* init only to silence gcc uninit warnings */ + + /* Check to see if we've run out of options. + * A '-' by itself is an argument (e.g. "read from stdin") + * not an option. + */ + if (optind >= argc || argv[optind][0] != '-' || strcmp(argv[optind], "-") == 0) + { + *ret_optind = optind; + *ret_optarg = NULL; + *ret_optname = NULL; + return 0; + } + + /* Check to see if we're being told that this is the end + * of the options with the special "--" flag. + */ + if (strcmp(argv[optind], "--") == 0) + { + optind++; + *ret_optind = optind; + *ret_optname = NULL; + *ret_optarg = NULL; + return 0; + } + + /* We have a real option. Find which one it is. + * We handle single letter switches "-o" separately + * from full switches "--option", based on the "-" vs. "--" + * prefix -- single letter switches can be concatenated + * as long as they don't have arguments. 
+ */ + /* full option */ + if (optptr == NULL && strncmp(argv[optind], "--", 2) == 0) + { + /* Use optptr to parse argument in options of form "--foo=666" + */ + if ((optptr = strchr(argv[optind], '=')) != NULL) + { *optptr = '\0'; optptr++; } + + arglen = strlen(argv[optind]); + nmatch = 0; + for (i = 0; i < nopts; i++) + if (opt[i].single == FALSE && + strncmp(opt[i].name, argv[optind], arglen) == 0) + { + nmatch++; + opti = i; + if (arglen == strlen(opt[i].name)) break; /* exact match, stop now */ + } + if (nmatch > 1 && arglen != strlen(opt[i].name)) + Die("Option \"%s\" is ambiguous; please be more specific.\n%s", + argv[optind], usage); + if (nmatch == 0) + Die("No such option \"%s\".\n%s", argv[optind], usage); + + *ret_optname = opt[opti].name; + + /* Set the argument, if there is one + */ + if (opt[opti].argtype != sqdARG_NONE) + { + if (optptr != NULL) + { /* --foo=666 style */ + *ret_optarg = optptr; + optptr = NULL; + optind++; + } + else if (optind+1 >= argc) + Die("Option %s requires an argument\n%s", opt[opti].name, usage); + else /* "--foo 666" style */ + { + *ret_optarg = argv[optind+1]; + optind+=2; + } + } + else /* sqdARG_NONE */ + { + if (optptr != NULL) + Die("Option %s does not take an argument\n%s", opt[opti].name, usage); + *ret_optarg = NULL; + optind++; + } + } + else /* else, a single letter option "-o" */ + { + /* find the option */ + if (optptr == NULL) + optptr = argv[optind]+1; + for (opti = -1, i = 0; i < nopts; i++) + if (opt[i].single == TRUE && *optptr == opt[i].name[1]) + { opti = i; break; } + if (opti == -1) + Die("No such option \"%c\".\n%s", *optptr, usage); + *ret_optname = opt[opti].name; + + /* set the argument, if there is one */ + if (opt[opti].argtype != sqdARG_NONE) + { + if (*(optptr+1) != '\0') /* attached argument */ + { + *ret_optarg = optptr+1; + optind++; + } + else if (optind+1 < argc) /* unattached argument */ + { + *ret_optarg = argv[optind+1]; + optind+=2; + } + else Die("Option %s requires an argument\n%s", 
opt[opti].name, usage); + + optptr = NULL; /* can't concatenate after an argument */ + } + else /* sqdARG_NONE */ + { + *ret_optarg = NULL; + if (*(optptr+1) != '\0') /* concatenation */ + optptr++; + else + { + optind++; /* move to next field */ + optptr = NULL; + } + } + + } + + /* Type check the argument, if there is one + */ + if (opt[opti].argtype != sqdARG_NONE) + { + if (opt[opti].argtype == sqdARG_INT && ! IsInt(*ret_optarg)) + Die("Option %s requires an integer argument\n%s", + opt[opti].name, usage); + else if (opt[opti].argtype == sqdARG_FLOAT && ! IsReal(*ret_optarg)) + Die("Option %s requires a numerical argument\n%s", + opt[opti].name, usage); + else if (opt[opti].argtype == sqdARG_CHAR && strlen(*ret_optarg) != 1) + Die("Option %s requires a single-character argument\n%s", + opt[opti].name, usage); + /* sqdARG_STRING is always ok, no type check necessary */ + } + + *ret_optind = optind; + return 1; +} + + + +#ifdef GETOPT_TESTDRIVER +/* cc -DGETOPT_TESTDRIVER -L ~/lib/squid.linux/ getopt.c -lsquid + */ +struct opt_s OPTIONS[] = { + { "--test1", FALSE, sqdARG_INT }, + { "--test2", FALSE, sqdARG_FLOAT }, + { "--test3", FALSE, sqdARG_STRING }, + { "--test4", FALSE, sqdARG_CHAR }, + { "-a", TRUE, sqdARG_NONE }, + { "-b", TRUE, sqdARG_INT }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +int +main(int argc, char **argv) +{ + int optind; + char *optarg; + char *optname; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, "Usage/help here", + &optind, &optname, &optarg)) + { + printf("Option: index: %d name: %s argument: %s\n", + optind, optname, optarg); + } + while (optind < argc) + { + printf("Argument: index: %d name: %s\n", optind, argv[optind]); + optind++; + } + + +} + + +#endif /*GETOPT_TESTDRIVER*/ diff --git a/forester/archive/RIO/others/hmmer/squid/gki.c b/forester/archive/RIO/others/hmmer/squid/gki.c new file mode 100644 index 0000000..3ce8390 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/gki.c @@ -0,0 +1,390 @@ 
+/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* gki.c + * SRE, Sat May 1 14:49:08 1999 + * + * "generic key index" module: emulation of Perl hashes. + * Maps keys (ASCII char strings) to array index. Dynamically + * resizes the hash table. + * + * Limitations: + * - hash table can only grow; no provision for deleting keys + * or downsizing the hash table. + * - Maximum hash table size set at 100003. Performance + * will degrade for key sets much larger than this. + * - Assumes that integers are 32 bits (or greater). + * + * Defines a typedef'd structure: + * gki - a key index hash table. + * Provides functions: + * GKIInit() - start a hash table. + * GKIStoreKey() - store a new key, get a unique index. + * GKIKeyIndex() - retrieve an existing key's index. + * GKIFree() - free a hash table. + * GKIStatus() - Debugging: prints internal status of a hash struct + * + * + * Note that there are no dependencies on squid; the gki.c/gki.h + * pair are base ANSI C and can be reused anywhere. 
+ ***************************************************************** + * + * API for storing/reading stuff: + * moral equivalent of Perl's $foo{$key} = whatever, $bar{$key} = whatever: + * #include "gki.h" + * + * gki *hash; + * int idx; + * char *key; + * + * hash = GKIInit(); + * (Storing:) + * (foreach key) { + * idx = GKIStoreKey(hash, key); + * (reallocate foo, bar as needed) + * foo[idx] = whatever; + * bar[idx] = whatever; + * } + * (Reading:) + * (foreach key) { + * idx = GKIKeyIndex(hash, key); + * if (idx == -1) {no_such_key; } + * (do something with) foo[idx]; + * (do something with) bar[idx]; + * } + * GKIFree(); + * + ***************************************************************** + * + * Timings on wrasse for 45402 keys in /usr/dict/words using + * Tests/test_gki: + * 250 msec store (6 usec/store) + * 140 msec retrieve (3 usec/retrieve) + * and using the 13408 names of Pfam's GP120.full alignment: + * 70 msec store (5 usec/store) + * 50 msec retrieve (4 usec/retrieve) + * + * RCS $Id: gki.c,v 1.1.1.1 2005/03/22 08:34:18 cmzmasek Exp $ + */ + + + +#include +#include +#include +#include +#include "squid.h" +#include "gki.h" + +/* + * Best hash table sizes are prime numbers (see Knuth vol 3, Sorting + * and Searching). + * gki_primes[] defines the ascending order of hash table sizes + * that we use in upsizing the hash table dynamically. + * useful site for testing primes: + * http://www.idbsu.edu/people/jbrennan/algebra/numbers/sieve.html + * Because of the way gki_hashvalue works, the largest number + * must be < INT_MAX / 128 / 128 : 131072 on a 32 bit machine. + */ +static int gki_primes[] = { 101, 1009, 10007, 100003 }; +#define GKI_NPRIMES 4 +#define GKI_ALPHABETSIZE 128 + +static GKI *gki_alloc(int primelevel); +static int gki_hashvalue(GKI *hash, char *key); +static int gki_upsize(GKI *old); + + +/* Function: GKIInit() + * Date: SRE, Sat May 1 11:12:24 1999 [May Day geek-out] + * + * Purpose: Initialize a hash table for key indexing. 
+ * Simply a wrapper around a level 0 gki_alloc(). + * + * Args: (void) + * + * Returns: An allocated hash table structure. + * Caller frees with GKIFree(). + */ +GKI * +GKIInit(void) +{ + GKI *hash; + hash = gki_alloc(0); + return hash; +} + +/* Function: GKIFree() + * Date: SRE, Sat May 1 11:13:26 1999 [May Day geek-out] + * + * Purpose: Free a key index hash table. + * + * Args: hash - the gki structure + * + * Returns: (void). + * hash table is destroyed. + */ +void +GKIFree(GKI *hash) +{ + struct gki_elem *ptr; + int i; + + if (hash == NULL) return; /* tolerate a NULL */ + + for (i = 0; i < hash->nhash; i++) + while (hash->table[i] != NULL) + { + ptr = hash->table[i]->nxt; + /* NULL keys can occur after we've gki_upsize'd */ + if (hash->table[i]->key != NULL) free(hash->table[i]->key); + free(hash->table[i]); + hash->table[i] = ptr; + } + free(hash->table); + free(hash); +} + +/* Function: GKIStoreKey() + * Date: SRE, Sat May 1 11:16:48 1999 [May Day geek-out] + * + * Purpose: Store a key in the key index hash table. + * Associate it with a unique "key index", counting + * from 0. (It's this index that lets us map + * the hashed keys to indexed C arrays, (clumsily) + * emulating Perl's hashes.) + * + * Does *not* check to see if the key's already + * in the table, so it's possible to store multiple + * copies of a key with different indices; probably + * not what you want, so if you're not sure the + * key is unique, check the table first with + * GKIKeyIndex(). + * + * Args: hash - GKI structure to store the key in + * key - string to store + * + * Returns: the new key's index. Since it's always the + * last one in the current array, this index is + * just hash->nkeys-1. + * On a malloc failure, returns -1. + * hash table is modified. 
+ */ +int +GKIStoreKey(GKI *hash, char *key) +{ + int val; + struct gki_elem *ptr; + + val = gki_hashvalue(hash, key); + + ptr = hash->table[val]; + hash->table[val] = MallocOrDie(sizeof(struct gki_elem)); + hash->table[val]->key = MallocOrDie(sizeof(char) * (strlen(key)+1)); + strcpy(hash->table[val]->key, key); + + hash->table[val]->idx = hash->nkeys; + hash->table[val]->nxt = ptr; + + hash->nkeys++; + /* time to upsize? */ + if (hash->nkeys > 3*hash->nhash && hash->primelevel < GKI_NPRIMES-1) + gki_upsize(hash); + + return hash->nkeys-1; +} + +/* Function: GKIKeyIndex() + * Date: SRE, Sat May 1 11:20:42 1999 [May Day geek-out] + * + * Purpose: Look up a key in the hash table. Return + * its index (0..nkeys-1), else -1 if the key + * isn't in the hash (yet). + * + * Args: hash - the GKI hash table to search in + * key - the key to look up + * + * Returns: -1 if key is not found; + * index of key if it is found (range 0..nkeys-1). + * hash table is unchanged. + */ +int +GKIKeyIndex(GKI *hash, char *key) +{ + struct gki_elem *ptr; + int val; + + val = gki_hashvalue(hash, key); + for (ptr = hash->table[val]; ptr != NULL; ptr = ptr->nxt) + if (strcmp(key, ptr->key) == 0) return ptr->idx; + return -1; +} + +/* Function: GKIStatus() + * Date: SRE, Sat May 1 11:11:13 1999 [St. Louis] + * + * Purpose: (DEBUGGING) How are we doing? Calculate some + * simple statistics for the hash table. + * + * Args: hash - the GKI hash table to look at + * + * Returns: (void) + * Prints diagnostics on stdout. + * hash table is unchanged. 
+ */
+void
+GKIStatus(GKI *hash)
+{
+  struct gki_elem *ptr;
+  int i;
+  int nkeys;			/* chain length of the current slot */
+  int nempty  = 0;
+  int maxkeys = -1;
+  int minkeys = INT_MAX;
+
+  for (i = 0; i < hash->nhash; i++)
+    {
+      nkeys = 0;
+      for (ptr = hash->table[i]; ptr != NULL; ptr = ptr->nxt)
+        nkeys++;
+
+      if (nkeys == 0)      nempty++;
+      if (nkeys > maxkeys) maxkeys = nkeys;
+      if (nkeys < minkeys) minkeys = nkeys;
+    }
+
+  printf("Total keys: %d\n", hash->nkeys);
+  printf("Hash table size: %d\n", hash->nhash);
+  printf("Average occupancy: %.1f\n", (float) hash->nkeys / (float) hash->nhash);
+  printf("Unoccupied slots: %d\n", nempty);
+  printf("Most in one slot: %d\n", maxkeys);
+  printf("Least in one slot: %d\n", minkeys);
+
+}
+
+
+/* Function: gki_alloc()
+ * Date:     SRE, Sat May  1 11:55:47 1999 [May Day geek-out]
+ *
+ * Purpose:  Allocate a hash table structure with the
+ *           size given by primelevel. Every slot starts out
+ *           empty (NULL) and nkeys starts at 0.
+ *
+ * Args:     primelevel - level 0..GKI_NPRIMES-1, specifying
+ *                        the size of the table; see gki_primes[]
+ *                        array. An out-of-range level is fatal
+ *                        (reported through Die()).
+ *
+ * Returns:  An allocated hash table structure.
+ *           Caller frees with GKIFree().
+ */
+static GKI *
+gki_alloc(int primelevel)
+{
+  GKI *hash;
+  int  i;
+
+  if (primelevel < 0 || primelevel >= GKI_NPRIMES)
+    Die("bad primelevel in gki_alloc()");
+  hash = MallocOrDie(sizeof(GKI));
+
+  hash->primelevel = primelevel;
+  hash->nhash      = gki_primes[hash->primelevel];
+  /* table is an array of chain-head *pointers*, so size it by the
+   * pointer type rather than by the full element struct.
+   */
+  hash->table      = MallocOrDie(sizeof(struct gki_elem *) * hash->nhash);
+  for (i = 0; i < hash->nhash; i++)
+    hash->table[i] = NULL;
+  hash->nkeys = 0;
+  return hash;
+}
+
+
+/* Function: gki_hashvalue()
+ * Date:     SRE, Sat May  1 11:14:10 1999 [May Day geek-out]
+ *
+ * Purpose:  Calculate the hash value for a key. Usually
+ *           we expect a one-word key, but the function will
+ *           hash any ASCII string effectively. The hash function
+ *           is a simple one (see p. 233 of Sedgewick,
+ *           Algorithms in C).
+ *           Slightly optimized: does two characters at a time
+ *           before doing the modulo; this gives us a significant
+ *           speedup.
+ *
+ * Args:     hash - the gki structure (we need to know the hash table size)
+ *           key  - a string to calculate the hash value for
+ *
+ * Returns:  a hash value, in the range 0..hash->nhash-1.
+ *           hash table is unmodified.
+ */
+static int
+gki_hashvalue(GKI *hash, char *key)
+{
+  /* Read the key as unsigned bytes. Where plain char is signed, a
+   * byte >= 0x80 would otherwise contribute a negative term, the
+   * modulo could come out negative, and the caller would index the
+   * table out of bounds. ASCII keys hash exactly as before.
+   */
+  const unsigned char *p = (const unsigned char *) key;
+  int val = 0;
+
+  for (; *p != '\0'; p++)
+    {
+      val = GKI_ALPHABETSIZE*val + *p;
+      /* odd-length tail: reduce now and stop before stepping past the NUL */
+      if (*(++p) == '\0') { val = val % hash->nhash; break; }
+      val = (GKI_ALPHABETSIZE*val + *p) % hash->nhash;
+    }
+  return val;
+}
+
+/* Function: gki_upsize()
+ * Date:     SRE, Sat May  1 11:46:07 1999 [May Day geek-out]
+ *
+ * Purpose:  Grow the hash table to the next available size.
+ *           Existing elements are relinked into the larger table
+ *           (not copied), so keys and key indices are preserved.
+ *
+ * Args:     old - the GKI hash table to reallocate.
+ *
+ * Returns:  1 on success (the hash table is changed);
+ *           0 on failure; the table is already at its maximum size,
+ *           and the hash table is returned unchanged.
+ */
+static int
+gki_upsize(GKI *old)
+{
+  GKI *new;
+  int  i;
+  struct gki_elem *optr;
+  struct gki_elem *nptr;
+  int  val;
+
+  if (old->primelevel >= GKI_NPRIMES-1) return 0;
+  new = gki_alloc(old->primelevel+1);
+
+  /* Read the old, store in the new, while *not changing*
+   * any key indices. Because of the way the lists are
+   * treated as LIFO stacks, all the lists are reversed
+   * in the new structure.
+   */
+  for (i = 0; i < old->nhash; i++)
+    {
+      optr = old->table[i];
+      while (optr != NULL)
+        {
+          val = gki_hashvalue(new, optr->key);
+
+          nptr            = new->table[val];
+          new->table[val] = optr;
+          optr            = optr->nxt;	/* advance before the link is rewritten */
+          new->table[val]->nxt = nptr;
+        }
+    }
+  free(old->table);
+
+  /* Now swap within the interior of the structures, so the old
+   * structure is updated to the new structure.
+   * (nkeys is identical, so we don't need to swap that element.)
+   */
+  old->primelevel = new->primelevel;
+  old->nhash      = new->nhash;
+  old->table      = new->table;
+  free(new);			/* only the shell; its table now belongs to old */
+  return 1;
+}
diff --git a/forester/archive/RIO/others/hmmer/squid/gki.h b/forester/archive/RIO/others/hmmer/squid/gki.h
new file mode 100644
index 0000000..1346045
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/squid/gki.h
@@ -0,0 +1,51 @@
+/*****************************************************************
+ * HMMER - Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ * This source code is distributed under the terms of the
+ * GNU General Public License. See the files COPYING and LICENSE
+ * for details.
+ *****************************************************************/
+
+#ifndef SQUID_GKI_INCLUDED
+#define SQUID_GKI_INCLUDED
+
+/* gki.h
+ * SRE, Sat May 1 15:07:22 1999
+ *
+ * Declarations of structures, functions for generic key index
+ * module: emulation of Perl hashes. See gki.c.
+ *
+ * RCS $Id: gki.h,v 1.1.1.1 2005/03/22 08:34:18 cmzmasek Exp $
+ */
+
+/* gki_elem:
+ * key, array index pairs are kept in linked list structures.
+ */
+struct gki_elem {
+  char            *key;		/* privately owned copy of the key string */
+  int              idx;		/* key index, assigned in insertion order */
+  struct gki_elem *nxt;		/* next element on this slot's chain, or NULL */
+};
+
+/* gki:
+ * a dynamically resized hash structure;
+ * contains a hash table and associated data
+ */
+typedef struct {
+  struct gki_elem **table;	/* nhash chain heads */
+
+  int primelevel;		/* index into gki_primes[] giving the table size */
+  int nhash;			/* number of slots in table */
+  int nkeys;			/* number of keys stored so far */
+} GKI;
+
+GKI *GKIInit(void);
+void GKIFree(GKI *hash);
+/* NOTE(review): the hashing routine visible in gki.c is the static
+ * gki_hashvalue(); no GKIHashValue() definition was seen there, so this
+ * prototype may be stale - confirm before relying on it.
+ */
+int GKIHashValue(GKI *hash, char *key);
+int GKIStoreKey(GKI *hash, char *key);
+int GKIKeyIndex(GKI *hash, char *key);
+void GKIStatus(GKI *hash);
+
+#endif /* SQUID_GKI_INCLUDED */
diff --git a/forester/archive/RIO/others/hmmer/squid/gsi.c b/forester/archive/RIO/others/hmmer/squid/gsi.c
new file mode 100644
index 0000000..f5cbee1
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/squid/gsi.c
@@ -0,0 +1,385 @@
+/*****************************************************************
+ * HMMER - Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ * This source code is distributed under the terms of the
+ * GNU General Public License. See the files COPYING and LICENSE
+ * for details.
+ *****************************************************************/
+
+/* gsi.c
+ * Interfaces for GSI "generic sequence index" files.
+ * broken away from sqio.c and extended: SRE, Wed Aug 5 10:32:53 1998
+ *
+ *
+ * GSI definition:
+ *    1 + <nfiles> + <nkeys> total records.
+ *    Each record = 38 bytes.
+ *
+ *  one header record     :  <"GSI"    (32)> <nfiles (2)> <nkeys  (4)>
+ *  <nfiles> file records :  <filename (32)> <fileno (2)> <fmt    (4)>
+ *  <nkeys>  key records  :  <key      (32)> <fileno (2)> <offset (4)>
+ *
+ * Matches up with my Perl scripts that create GSI files.
+ *
+ * RCS $Id: gsi.c,v 1.1.1.1 2005/03/22 08:34:18 cmzmasek Exp $
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifndef SEEK_SET
+#include <unistd.h> /* needed for poor crippled SunOS */
+#endif
+
+#include "squid.h"
+#include "gsi.h"
+
+
+/*****************************************************************
+ * GSI index file access routines
+ *****************************************************************/
+
+/* Function: GSIOpen()
+ *
+ * Purpose:  Open a GSI file.
+ *           Returns a GSIFILE handle holding the open stream and the
+ *           file/record counts read from the header record.
+ *           Returns NULL on failure, with squid_errno set.
+ *           Release the handle with GSIClose().
+ */
+GSIFILE *
+GSIOpen(char *gsifile)
+{
+  GSIFILE *gsi;
+  char     magic[GSI_KEYSIZE];
+
+  gsi = (GSIFILE *) MallocOrDie (sizeof(GSIFILE));
+  /* GSI files are binary; "rb" keeps text-mode translation out of the way */
+  if ((gsi->gsifp = fopen(gsifile, "rb")) == NULL)
+    { free(gsi); squid_errno = SQERR_NOFILE; return NULL; }
+
+  /* A short read is as useless as no read: demand the whole field. */
+  if (fread(magic, sizeof(char), GSI_KEYSIZE, gsi->gsifp) != GSI_KEYSIZE)
+    { squid_errno = SQERR_NODATA; goto FAILURE; }
+  magic[GSI_KEYSIZE-1] = '\0';	/* bound strcmp() even on a garbage file */
+  if (strcmp(magic, "GSI") != 0)
+    { squid_errno = SQERR_FORMAT; goto FAILURE; }
+
+  if (fread(&(gsi->nfiles), sizeof(sqd_uint16), 1, gsi->gsifp) != 1)
+    { squid_errno = SQERR_NODATA; goto FAILURE; }
+  if (fread(&(gsi->recnum), sizeof(sqd_uint32), 1, gsi->gsifp) != 1)
+    { squid_errno = SQERR_NODATA; goto FAILURE; }
+
+  gsi->nfiles = sre_ntoh16(gsi->nfiles); /* convert from network short */
+  gsi->recnum = sre_ntoh32(gsi->recnum); /* convert from network long */
+
+  return gsi;
+
+ FAILURE:
+  /* close the stream too, not just the handle, so no FILE is leaked */
+  fclose(gsi->gsifp);
+  free(gsi);
+  return NULL;
+}
+
+/* Function: GSIGetRecord()
+ *
+ * Purpose:  Each non-header record of a GSI index files consists
+ *           of 38 bytes: 32 bytes of character string, a 2 byte
+ *           short, and a 4 byte long. This function returns the
+ *           three values. Fields the caller does not want are
+ *           skipped with fseek(). The integers are converted from
+ *           the on-disk byte order with sre_ntoh16()/sre_ntoh32().
+ *
+ * Args:     gsi - open GSI index file, correctly positioned at a record
+ *           f1  - char[32], allocated by caller (or NULL if unwanted)
+ *           f2  - pointer to short (or NULL if unwanted)
+ *           f3  - pointer to long (or NULL if unwanted)
+ *
+ * Return:   1 on success.
+ *           0 on failure and sets squid_errno.
+ */
+int
+GSIGetRecord(GSIFILE *gsi, char *f1, sqd_uint16 *f2, sqd_uint32 *f3)
+{
+  if (f1 == NULL) fseek(gsi->gsifp, GSI_KEYSIZE, SEEK_CUR);
+  else if (! fread(f1, GSI_KEYSIZE, 1, gsi->gsifp))
+    { squid_errno = SQERR_NODATA; return 0; }
+
+  if (f2 == NULL) fseek(gsi->gsifp, sizeof(sqd_uint16), SEEK_CUR);
+  else if (! fread(f2, sizeof(sqd_uint16), 1, gsi->gsifp))
+    { squid_errno = SQERR_NODATA; return 0; }
+
+  if (f3 == NULL) fseek(gsi->gsifp, sizeof(sqd_uint32), SEEK_CUR);
+  else if (! fread(f3, sizeof(sqd_uint32), 1, gsi->gsifp))
+    { squid_errno = SQERR_NODATA; return 0; }
+
+  if (f2 != NULL) *f2 = sre_ntoh16(*f2);
+  if (f3 != NULL) *f3 = sre_ntoh32(*f3);
+
+  return 1;
+}
+
+
+/* Function: GSIGetOffset()
+ *
+ * Purpose:  From a key (sequence name), find a disk offset
+ *           in an open general sequence index file by binary
+ *           search. Presumably GSI indexing could be even faster
+ *           if we used hashing.
+ *
+ * Args:     gsi         - GSI index file, opened by GSIOpen()
+ *           key         - name of key to retrieve indices for
+ *           ret_seqfile - pre-alloced char[32] array for seqfile name
+ *           ret_format  - return: format of seqfile
+ *           ret_offset  - return: disk offset in seqfile.
+ *
+ * Returns:  1 if the key was found and all three results were filled in.
+ *           0 if the key is not in the index, or a seek/read failed
+ *           (squid_errno is set for the I/O failures); the ret_*
+ *           arguments are then left untouched.
+ */
+int
+GSIGetOffset(GSIFILE *gsi, char *key, char *ret_seqfile,
+             int *ret_format, long *ret_offset)
+{
+  sqd_uint32 left, right, mid;
+  int        cmp;
+  int        found = 0;
+  char       name[GSI_KEYSIZE + 1];
+  sqd_uint32 offset  = 0;
+  sqd_uint16 filenum = 0;
+  sqd_uint32 fmt     = 0;
+
+  name[GSI_KEYSIZE] = '\0';	/* on-disk keys may fill all 32 bytes */
+
+  /* Key records occupy record numbers nfiles+1 .. nfiles+recnum.
+   * The window is never probed once it is empty, so an index with no
+   * keys, or a miss at the window edge, cannot read a record outside
+   * the key section. left >= 1, so mid-1 cannot wrap.
+   */
+  left  = gsi->nfiles + 1;
+  right = gsi->nfiles + gsi->recnum;
+  while (left <= right)
+    {
+      mid = left + (right - left) / 2;
+      if (fseek(gsi->gsifp, (long) mid * GSI_RECSIZE, SEEK_SET) != 0)
+        { squid_errno = SQERR_NODATA; return 0; }
+      /* a failed read must not fall through with stale filenum/offset */
+      if (! GSIGetRecord(gsi, name, &filenum, &offset)) return 0;
+
+      cmp = strcmp(name, key);
+      if      (cmp == 0) { found = 1; break; }	/* found it! */
+      else if (cmp < 0)  left  = mid + 1;	/* it's right of mid */
+      else               right = mid - 1;	/* it's left of mid */
+    }
+  if (! found) return 0;	/* oops, missed it; fail. */
+
+  /* Using file number, look up the sequence file and format.
+   */
+  if (fseek(gsi->gsifp, (long) filenum * GSI_RECSIZE, SEEK_SET) != 0)
+    { squid_errno = SQERR_NODATA; return 0; }
+  if (! GSIGetRecord(gsi, ret_seqfile, NULL, &fmt)) return 0;
+  *ret_format = (int) fmt;
+  *ret_offset = (long) offset;
+
+  return 1;
+}
+
+/* Function: GSIClose()
+ *
+ * Purpose:  Close an open GSI sequence index file.
+ */
+void
+GSIClose(GSIFILE *gsi)
+{
+  fclose(gsi->gsifp);
+  free(gsi);
+}
+
+
+/*****************************************************************
+ * GSI index construction routines
+ * SRE, Wed Nov 10 11:49:14 1999 [St. Louis]
+ *
+ * API:
+ *       g = GSIAllocIndex();
+ *
+ *       [foreach filename, <32 char, no directory path]
+ *          GSIAddFileToIndex(g, filename, fmt);
+ *          filenum++;
+ *          [foreach key, <32 char, w/ filenum 1..nfiles, w/ 32bit offset]
+ *             GSIAddKeyToIndex(g, key, filenum, offset);
+ *
+ *       GSISortIndex(g);
+ *       GSIWriteIndex(fp, g);
+ *       GSIFreeIndex(g);
+ *****************************************************************/
+
+/* GSIAllocIndex(): create an empty in-memory index with room for
+ * 10 files and 100 keys; the Add routines grow it in the same steps.
+ * Caller frees with GSIFreeIndex().
+ */
+struct gsiindex_s *
+GSIAllocIndex(void)
+{
+  struct gsiindex_s *g;
+
+  g = MallocOrDie(sizeof(struct gsiindex_s));
+  g->filenames = MallocOrDie(sizeof(char *) * 10);
+  g->fmt       = MallocOrDie(sizeof(int) * 10);
+  g->elems     = MallocOrDie(sizeof(struct gsikey_s) * 100);
+  g->nfiles    = 0;
+  g->nkeys     = 0;
+  return g;
+}
+
+/* GSIFreeIndex(): release an index and the file names it owns.
+ */
+void
+GSIFreeIndex(struct gsiindex_s *g)
+{
+  int i;
+  for (i = 0; i < g->nfiles; i++) free(g->filenames[i]);
+  free(g->filenames);
+  free(g->fmt);
+  free(g->elems);
+  free(g);
+}
+
+/* GSIAddFileToIndex(): append a sequence file name (copied) and its
+ * format code. Files are numbered 1..nfiles in the order added.
+ * A name of GSI_KEYSIZE or more characters, or more files than a
+ * 16-bit file number can address, is fatal.
+ */
+void
+GSIAddFileToIndex(struct gsiindex_s *g, char *filename, int fmt)
+{
+  int len;
+
+  len = strlen(filename);
+  if (len >= GSI_KEYSIZE) Die("File name too long to be indexed.");
+  /* nfiles is a 16-bit counter: refuse the add rather than let the
+   * increment below wrap to 0 and overwrite slot 0 on the next call.
+   */
+  if (g->nfiles >= SQD_UINT16_MAX) Die("too many files in GSI index");
+  g->filenames[g->nfiles] = sre_strdup(filename, len);
+  g->fmt[g->nfiles]       = fmt;
+  g->nfiles++;
+  if (g->nfiles % 10 == 0) {	/* arrays are full: grow by another 10 */
+    g->filenames = ReallocOrDie(g->filenames, sizeof(char *) * (g->nfiles + 10));
+    g->fmt       = ReallocOrDie(g->fmt, sizeof(int) * (g->nfiles + 10));
+  }
+}
+
+/* GSIAddKeyToIndex(): append one key with the number of the file it
+ * lives in and its disk offset. Values that do not fit the on-disk
+ * 16/32-bit fields (including negative ones) are fatal.
+ */
+void
+GSIAddKeyToIndex(struct gsiindex_s *g, char *key, int filenum, long offset)
+{
+  if (strlen(key) >= GSI_KEYSIZE) Die("key too long in GSI index");
+  /* Negative values would silently wrap in the unsigned casts below. */
+  if (filenum < 0)              Die("negative file number in GSI index");
+  if (offset < 0)               Die("negative offset in GSI index");
+  if (filenum > SQD_UINT16_MAX) Die("too many files in GSI index");
+  if (offset > SQD_UINT32_MAX)  Die("offset too big in GSI index");
+
+  strncpy(g->elems[g->nkeys].key, key, GSI_KEYSIZE-1);
+  g->elems[g->nkeys].key[GSI_KEYSIZE-1] = '\0';
+  g->elems[g->nkeys].filenum = (sqd_uint16) filenum;
+  g->elems[g->nkeys].offset  = (sqd_uint32) offset;
+  g->nkeys++;
+
+  if (g->nkeys % 100 == 0)	/* array is full: grow by another 100 */
+    g->elems = ReallocOrDie(g->elems, sizeof(struct gsikey_s) * (g->nkeys + 100));
+}
+
+/* qsort() comparator: orders key records by strcmp() of their keys,
+ * the same order GSIGetOffset() binary-searches by.
+ */
+static int
+gsi_keysorter(const void *k1, const void *k2)
+{
+  const struct gsikey_s *key1 = k1;
+  const struct gsikey_s *key2 = k2;
+  return strcmp(key1->key, key2->key);
+}
+
+/* GSISortIndex(): sort the keys; must precede GSIWriteIndex().
+ */
+void
+GSISortIndex(struct gsiindex_s *g)
+{
+  qsort((void *) g->elems, g->nkeys, sizeof(struct gsikey_s), gsi_keysorter);
+}
+
+/* GSIWriteIndex(): write header, file records, then key records to fp.
+ */
+void
+GSIWriteIndex(FILE *fp, struct gsiindex_s *g)
+{
+  sqd_uint32 i;
+
+  /* Range checking.
+   */
+  if (g->nfiles > SQD_UINT16_MAX) Die("Too many files in GSI index.");
+  if (g->nkeys > SQD_UINT32_MAX) Die("Too many keys in GSI index.");
+
+  GSIWriteHeader(fp, g->nfiles, g->nkeys);
+  for (i = 0; i < g->nfiles; i++)
+    GSIWriteFileRecord(fp, g->filenames[i], i+1, g->fmt[i]);
+  for (i = 0; i < g->nkeys; i++)
+    GSIWriteKeyRecord(fp, g->elems[i].key, g->elems[i].filenum, g->elems[i].offset);
+}
+
+
+
+
+
+/* Function: GSIWriteHeader()
+ * Date:     SRE, Wed Aug 5 10:36:02 1998 [St. Louis]
+ *
+ * Purpose:  Write the first record to an open GSI file:
+ *           "GSI" <nfiles> <nkeys>
+ *
+ * Args:     fp     - open file to write to.
+ *           nfiles - number of files indexed
+ *           nkeys  - number of keys indexed
+ *
+ * Returns:  void
+ */
+void
+GSIWriteHeader(FILE *fp, int nfiles, long nkeys)
+{
+  char       key[GSI_KEYSIZE];
+  sqd_uint16 f1;
+  sqd_uint32 f2;
+
+  /* beware potential range errors!
+   */
+  if (nfiles > SQD_UINT16_MAX) Die("GSI: nfiles out of range");
+  if (nkeys > SQD_UINT32_MAX) Die("GSI: nkeys out of range");
+
+  f1 = (sqd_uint16) nfiles;
+  f2 = (sqd_uint32) nkeys;
+  f1 = sre_hton16(f1);
+  f2 = sre_hton32(f2);
+  /* strncpy() NUL-pads to the full field width, so the 32 bytes written
+   * below are deterministic instead of carrying stack garbage after "GSI".
+   */
+  strncpy(key, "GSI", GSI_KEYSIZE);
+
+  if (fwrite(key, 1, GSI_KEYSIZE, fp) < GSI_KEYSIZE) PANIC;
+  if (fwrite(&f1, 2, 1, fp) < 1) PANIC;
+  if (fwrite(&f2, 4, 1, fp) < 1) PANIC;
+}
+
+
+/* Function: GSIWriteFileRecord()
+ * Date:     SRE, Wed Aug 5 10:45:51 1998 [St. Louis]
+ *
+ * Purpose:  Write a file record to an open GSI file.
+ *
+ * Args:     fp    - open GSI file
+ *           fname - file name (max 31 characters)
+ *           idx   - file number
+ *           fmt   - file format (e.g. kPearson, etc.)
+ *
+ * Returns:  0 on failure. 1 on success.
+ */
+int
+GSIWriteFileRecord(FILE *fp, char *fname, int idx, int fmt)
+{
+  char       key[GSI_KEYSIZE];
+  sqd_uint16 f1;
+  sqd_uint32 f2;
+
+  if (strlen(fname) >= GSI_KEYSIZE) return 0;
+  if (idx > SQD_UINT16_MAX) Die("GSI: file index out of range");
+  if (fmt > SQD_UINT32_MAX) Die("GSI: format index out of range");
+
+  f1 = (sqd_uint16) idx;
+  f2 = (sqd_uint32) fmt;
+  f1 = sre_hton16(f1);
+  f2 = sre_hton32(f2);
+
+  /* fname may be shorter than the 32-byte field; stage it in a NUL-padded
+   * buffer so fwrite() never reads past the end of the caller's string.
+   */
+  strncpy(key, fname, GSI_KEYSIZE);
+
+  if (fwrite(key, 1, GSI_KEYSIZE, fp) < GSI_KEYSIZE) PANIC;
+  if (fwrite(&f1, 2, 1, fp) < 1) PANIC;
+  if (fwrite(&f2, 4, 1, fp) < 1) PANIC;
+  return 1;
+}
+
+
+/* Function: GSIWriteKeyRecord()
+ * Date:     SRE, Wed Aug 5 10:52:30 1998 [St. Louis]
+ *
+ * Purpose:  Write a key record to a GSI file.
+ *
+ * Args:     fp      - open GSI file for writing
+ *           key     - key (max 31 char + \0)
+ *           fileidx - which file number to find this key in
+ *           offset  - offset for this key
+ *
+ * Returns:  1 on success, else 0.
+ *           will fail if key >= 32 chars, for instance.
+ */
+int
+GSIWriteKeyRecord(FILE *fp, char *key, int fileidx, long offset)
+{
+  char       field[GSI_KEYSIZE];
+  sqd_uint16 f1;
+  sqd_uint32 f2;
+
+  if (strlen(key) >= GSI_KEYSIZE) return 0;
+  if (fileidx > SQD_UINT16_MAX) Die("GSI: file index out of range");
+  if (offset > SQD_UINT32_MAX) Die("GSI: offset out of range");
+
+  f1 = (sqd_uint16) fileidx;
+  f2 = (sqd_uint32) offset;
+  f1 = sre_hton16(f1);
+  f2 = sre_hton32(f2);
+
+  /* The caller's key may be shorter than the 32-byte field; stage it in a
+   * NUL-padded buffer (strncpy pads to the full width) so fwrite() never
+   * reads past the end of the string.
+   */
+  strncpy(field, key, GSI_KEYSIZE);
+
+  if (fwrite(field, 1, GSI_KEYSIZE, fp) < GSI_KEYSIZE) PANIC;
+  if (fwrite(&f1, 2, 1, fp) < 1) PANIC;
+  if (fwrite(&f2, 4, 1, fp) < 1) PANIC;
+  return 1;
+}
+
diff --git a/forester/archive/RIO/others/hmmer/squid/gsi.h b/forester/archive/RIO/others/hmmer/squid/gsi.h
new file mode 100644
index 0000000..1c385ff
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/squid/gsi.h
@@ -0,0 +1,85 @@
+/*****************************************************************
+ * HMMER - Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ * This source code is distributed under the terms of the
+ * GNU General Public License. See the files COPYING and LICENSE
+ * for details.
+ *****************************************************************/
+
+#ifndef GSIH_INCLUDED
+#define GSIH_INCLUDED
+
+/* gsi.h
+ * Database indexing (GSI format support)
+ * RCS $Id: gsi.h,v 1.1.1.1 2005/03/22 08:34:18 cmzmasek Exp $
+ *
+ * A GSI (generic sequence index) file is composed of
+ * recnum + nfiles + 1 records. Each record contains
+ * three fields; key, file number, and disk offset.
+ * Record 0 contains:
+ *   [ "GSI" ] [ nfiles ] [ recnum ]
+ * Records 1..nfiles map file names to file numbers, and contain:
+ *   [ filename ] [ file number, 1..nfiles ] [ file format code ]
+ * Records nfiles+1 to recnum+nfiles provide disk offset
+ * and file number indices for every key:
+ *   [ key ] [ file number ] [ offset]
+ *
+ * Because the file is binary, we take some (but not
+ * complete) care to improve portability amongst platforms.
+ * This means using network order integers (see ntohl())
+ * and defining types for 16 and 32 bit integers.
+ *
+ * Because we use 32-bit offsets, ftell(), and fseek(),
+ * there is an implicit 2 Gb file size maximum.
+ * AFAIK neither ANSI C nor POSIX provide a portable solution
+ * to this problem. fsetpos(), fgetpos() use an
+ * opaque fpos_t datatype that we can't write portably
+ * to a disk file. Suggestions welcomed.
+ */
+#define GSI_KEYSIZE    32           /* keys are 32 bytes long */
+#define GSI_RECSIZE    38           /* 32 + 2 + 4 bytes */
+#define SQD_UINT16_MAX 65535        /* 2^16-1 */
+#define SQD_UINT32_MAX 4294967295U  /* 2^32-1 */
+
+/* Handle for an index file opened for reading (see GSIOpen()). */
+struct gsi_s {
+  FILE       *gsifp;    /* open GSI index file */
+  sqd_uint16  nfiles;   /* number of files = 16 bit int */
+  sqd_uint32  recnum;   /* number of records = 32 bit int */
+};
+typedef struct gsi_s GSIFILE;
+
+/* One key record of an index under construction. */
+struct gsikey_s {
+  char       key[GSI_KEYSIZE];
+  sqd_uint16 filenum;
+  sqd_uint32 offset;
+};
+/* In-memory index under construction (see GSIAllocIndex()). */
+struct gsiindex_s {
+  char      **filenames;  /* file names, in file-number order */
+  int        *fmt;        /* format code of each file */
+  sqd_uint16  nfiles;
+
+  struct gsikey_s *elems; /* key records */
+  int         nkeys;
+};
+
+
+/* from gsi.c
+ */
+extern GSIFILE *GSIOpen(char *gsifile);
+extern int GSIGetRecord(GSIFILE *gsi, char *f1, sqd_uint16 *f2, sqd_uint32 *f3);
+extern int GSIGetOffset(GSIFILE *gsi, char *key, char *sqfile,
+                        int *fmt, long *ret_offset);
+extern void GSIClose(GSIFILE *gsi);
+extern struct gsiindex_s *GSIAllocIndex(void);
+extern void GSIFreeIndex(struct gsiindex_s *g);
+extern void GSIAddFileToIndex(struct gsiindex_s *g, char *filename, int fmt);
+extern void GSIAddKeyToIndex(struct gsiindex_s *g, char *key, int filenum, long offset);
+extern void GSISortIndex(struct gsiindex_s *g);
+extern void GSIWriteIndex(FILE *fp, struct gsiindex_s *g);
+extern void GSIWriteHeader(FILE *fp, int nfiles, long nkeys);
+extern int GSIWriteFileRecord(FILE *fp, char *fname, int idx, int fmt);
+extern int GSIWriteKeyRecord(FILE *fp, char *key, int fileidx, long offset);
+
+#endif /*GSIH_INCLUDED*/
diff --git a/forester/archive/RIO/others/hmmer/squid/gsi64.c b/forester/archive/RIO/others/hmmer/squid/gsi64.c
new file mode 100644
index 0000000..0aeb82c
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/squid/gsi64.c
@@ -0,0 +1,395 @@
+/*****************************************************************
+ * HMMER - Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ * This source code is distributed under the terms of the
+ * GNU General Public License. See the files COPYING and LICENSE
+ * for details.
+ *****************************************************************/
+#ifdef USE_GSI64
+
+/* gsi64.c
+ * Updated interfaces for GSI64 64-bit "generic sequence index" files.
+ * See gsi.c for old interfaces.
+ * This is a temporary hack! Needed for human genome project.
+ */
+
+/*    1 + <nfiles> + <nkeys> total records.
+ *    Each record = 42 bytes.
+ *
+ *  one header record     :  <"GSI64"  (32)> <nfiles (2)> <nkeys  (8)>
+ *  <nfiles> file records :  <filename (32)> <fileno (2)> <fmt    (8)>
+ *  <nkeys>  key records  :  <key      (32)> <fileno (2)> <offset (8)>
+ *
+ * CVS $Id: gsi64.c,v 1.1.1.1 2005/03/22 08:34:29 cmzmasek Exp $
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifndef SEEK_SET
+#include <unistd.h> /* needed for poor crippled SunOS */
+#endif
+
+#include "squid.h"
+#include "gsi64.h"
+
+/*****************************************************************
+ * GSI64 index file access routines
+ *****************************************************************/
+
+/* Function: GSI64Open()
+ *
+ * Purpose:  Open a GSI64 file. Returns the number of records in
+ *           the file and a file pointer. Returns NULL on failure.
+ *           The returned handle owns the open stream; release it
+ *           with GSI64Close(). On failure squid_errno is set.
+ */
+GSI64FILE *
+GSI64Open(char *gsifile)
+{
+  GSI64FILE *gsi;
+  char       magic[GSI64_KEYSIZE];
+
+  gsi = (GSI64FILE *) MallocOrDie (sizeof(GSI64FILE));
+  /* GSI64 files are binary; "rb" keeps text-mode translation out of the way */
+  if ((gsi->gsifp = fopen(gsifile, "rb")) == NULL)
+    { free(gsi); squid_errno = SQERR_NOFILE; return NULL; }
+
+  /* A short read is as useless as no read: demand the whole field. */
+  if (fread(magic, sizeof(char), GSI64_KEYSIZE, gsi->gsifp) != GSI64_KEYSIZE)
+    { squid_errno = SQERR_NODATA; goto FAILURE; }
+  magic[GSI64_KEYSIZE-1] = '\0';	/* bound strcmp() even on a garbage file */
+  if (strcmp(magic, "GSI64") != 0)
+    { squid_errno = SQERR_FORMAT; goto FAILURE; }
+
+  if (fread(&(gsi->nfiles), sizeof(sqd_uint16), 1, gsi->gsifp) != 1)
+    { squid_errno = SQERR_NODATA; goto FAILURE; }
+  if (fread(&(gsi->recnum), sizeof(sqd_uint64), 1, gsi->gsifp) != 1)
+    { squid_errno = SQERR_NODATA; goto FAILURE; }
+
+#if 0 /* HACK! we don't byteswap */
+  gsi->nfiles = sre_ntohs(gsi->nfiles); /* convert from network short */
+  gsi->recnum = sre_ntohl(gsi->recnum); /* convert from network long */
+#endif
+
+  return gsi;
+
+ FAILURE:
+  /* close the stream too, not just the handle, so no FILE is leaked */
+  fclose(gsi->gsifp);
+  free(gsi);
+  return NULL;
+}
+
+/* Function: GSI64GetRecord()
+ *
+ * Purpose:  Each non-header record of a GSI64 index file consists
+ *           of 42 bytes: 32 bytes of character string, a 2 byte
+ *           short, and an 8 byte long long. This function returns the
+ *           three values. Fields the caller does not want are
+ *           skipped with fseek64(). No byte swapping is done.
+ *
+ * Args:     gsi - open GSI64 index file, correctly positioned at a record
+ *           f1  - char[32], allocated by caller (or NULL if unwanted)
+ *           f2  - pointer to short (or NULL if unwanted)
+ *           f3  - pointer to long long (or NULL if unwanted)
+ *
+ * Return:   1 on success.
+ *           0 on failure and sets squid_errno.
+ */
+int
+GSI64GetRecord(GSI64FILE *gsi, char *f1, sqd_uint16 *f2, sqd_uint64 *f3)
+{
+  if (f1 == NULL) fseek64(gsi->gsifp, GSI64_KEYSIZE, SEEK_CUR);
+  else if (! fread(f1, GSI64_KEYSIZE, 1, gsi->gsifp))
+    { squid_errno = SQERR_NODATA; return 0; }
+
+  if (f2 == NULL) fseek64(gsi->gsifp, sizeof(sqd_uint16), SEEK_CUR);
+  else if (! fread(f2, sizeof(sqd_uint16), 1, gsi->gsifp))
+    { squid_errno = SQERR_NODATA; return 0; }
+
+  if (f3 == NULL) fseek64(gsi->gsifp, sizeof(sqd_uint64), SEEK_CUR);
+  else if (! fread(f3, sizeof(sqd_uint64), 1, gsi->gsifp))
+    { squid_errno = SQERR_NODATA; return 0; }
+
+#if 0 /* no byteswap yet! HACK! */
+  if (f2 != NULL) *f2 = sre_ntohs(*f2);
+  if (f3 != NULL) *f3 = sre_ntohl(*f3);
+#endif
+
+  return 1;
+}
+
+
+/* Function: GSI64GetOffset()
+ *
+ * Purpose:  From a key (sequence name), find a disk offset
+ *           in an open general sequence index file by binary
+ *           search. Presumably GSI64 indexing could be even faster
+ *           if we used hashing.
+ *
+ * Args:     gsi         - GSI64 index file, opened by GSI64Open()
+ *           key         - name of key to retrieve indices for
+ *           ret_seqfile - pre-alloced char[32] array for seqfile name
+ *           ret_format  - return: format of seqfile
+ *           ret_offset  - return: disk offset in seqfile.
+ *
+ * Returns:  1 if the key was found and all three results were filled in.
+ *           0 if the key is not in the index or a record could not be
+ *           read; the ret_* arguments are then left untouched.
+ */
+int
+GSI64GetOffset(GSI64FILE *gsi, char *key, char *ret_seqfile,
+               int *ret_format, long long *ret_offset)
+{
+  sqd_uint64 left, right, mid;
+  int        cmp;
+  int        found = 0;
+  char       name[GSI64_KEYSIZE + 1];
+  sqd_uint64 offset  = 0;
+  sqd_uint16 filenum = 0;
+  sqd_uint64 fmt     = 0;
+
+  name[GSI64_KEYSIZE] = '\0';	/* on-disk keys may fill all 32 bytes */
+
+  /* Key records occupy record numbers nfiles+1 .. nfiles+recnum.
+   * The window is never probed once it is empty, so an index with no
+   * keys, or a miss at the window edge, cannot read a record outside
+   * the key section. left >= 1, so mid-1 cannot wrap.
+   */
+  left  = gsi->nfiles + 1;
+  right = gsi->nfiles + gsi->recnum;
+  while (left <= right)
+    {
+      mid = left + (right - left) / 2;
+      fseek64(gsi->gsifp, mid * GSI64_RECSIZE, SEEK_SET);
+      /* a failed read must not fall through with stale filenum/offset */
+      if (! GSI64GetRecord(gsi, name, &filenum, &offset)) return 0;
+
+      cmp = strcmp(name, key);
+      if      (cmp == 0) { found = 1; break; }	/* found it! */
+      else if (cmp < 0)  left  = mid + 1;	/* it's right of mid */
+      else               right = mid - 1;	/* it's left of mid */
+    }
+  if (! found) return 0;	/* oops, missed it; fail. */
+
+  /* Using file number, look up the sequence file and format.
+   */
+  fseek64(gsi->gsifp, filenum * GSI64_RECSIZE, SEEK_SET);
+  if (! GSI64GetRecord(gsi, ret_seqfile, NULL, &fmt)) return 0;
+  *ret_format = (int) fmt;
+  *ret_offset = (long long) offset;
+
+  return 1;
+}
+
+/* Function: GSI64Close()
+ *
+ * Purpose:  Close an open GSI64 sequence index file.
+ */
+void
+GSI64Close(GSI64FILE *gsi)
+{
+  fclose(gsi->gsifp);
+  free(gsi);
+}
+
+
+/*****************************************************************
+ * GSI64 index construction routines
+ * SRE, Wed Nov 10 11:49:14 1999 [St. Louis]
+ *
+ * API:
+ *       g = GSI64AllocIndex();
+ *
+ *       [foreach filename, <32 char, no directory path]
+ *          GSI64AddFileToIndex(g, filename, fmt);
+ *          filenum++;
+ *          [foreach key, <32 char, w/ filenum 1..nfiles, w/ 64bit offset]
+ *             GSI64AddKeyToIndex(g, key, filenum, offset);
+ *
+ *       GSI64SortIndex(g);
+ *       GSI64WriteIndex(fp, g);
+ *       GSI64FreeIndex(g);
+ *****************************************************************/
+
+/* GSI64AllocIndex(): create an empty in-memory index with room for
+ * 10 files and 100 keys; the Add routines grow it in the same steps.
+ * Caller frees with GSI64FreeIndex().
+ */
+struct gsi64index_s *
+GSI64AllocIndex(void)
+{
+  struct gsi64index_s *g;
+
+  g = MallocOrDie(sizeof(struct gsi64index_s));
+  g->filenames = MallocOrDie(sizeof(char *) * 10);
+  g->fmt       = MallocOrDie(sizeof(int) * 10);
+  g->elems     = MallocOrDie(sizeof(struct gsi64key_s) * 100);
+  g->nfiles    = 0;
+  g->nkeys     = 0;
+  return g;
+}
+
+/* GSI64FreeIndex(): release an index and the file names it owns.
+ */
+void
+GSI64FreeIndex(struct gsi64index_s *g)
+{
+  int i;
+  for (i = 0; i < g->nfiles; i++) free(g->filenames[i]);
+  free(g->filenames);
+  free(g->fmt);
+  free(g->elems);
+  free(g);
+}
+
+/* GSI64AddFileToIndex(): append a sequence file name (copied) and its
+ * format code. Files are numbered 1..nfiles in the order added.
+ * A name of GSI64_KEYSIZE or more characters, or more files than a
+ * 16-bit file number can address, is fatal.
+ */
+void
+GSI64AddFileToIndex(struct gsi64index_s *g, char *filename, int fmt)
+{
+  int len;
+
+  len = strlen(filename);
+  if (len >= GSI64_KEYSIZE) Die("File name too long to be indexed.");
+  /* nfiles is a 16-bit counter: refuse the add rather than let the
+   * increment below wrap to 0 and overwrite slot 0 on the next call.
+   */
+  if (g->nfiles >= SQD_UINT16_MAX) Die("too many files in GSI64 index");
+  g->filenames[g->nfiles] = sre_strdup(filename, len);
+  g->fmt[g->nfiles]       = fmt;
+  g->nfiles++;
+  if (g->nfiles % 10 == 0) {	/* arrays are full: grow by another 10 */
+    g->filenames = ReallocOrDie(g->filenames, sizeof(char *) * (g->nfiles + 10));
+    g->fmt       = ReallocOrDie(g->fmt, sizeof(int) * (g->nfiles + 10));
+  }
+}
+
+/* GSI64AddKeyToIndex(): append one key with the number of the file it
+ * lives in and its disk offset. Values that do not fit the on-disk
+ * fields (including negative ones) are fatal.
+ */
+void
+GSI64AddKeyToIndex(struct gsi64index_s *g, char *key, int filenum, long long offset)
+{
+  if (strlen(key) >= GSI64_KEYSIZE) Die("key too long in GSI64 index");
+  /* Negative values would silently wrap in the unsigned casts below. */
+  if (filenum < 0)              Die("negative file number in GSI64 index");
+  if (offset < 0)               Die("negative offset in GSI64 index");
+  if (filenum > SQD_UINT16_MAX) Die("too many files in GSI64 index");
+  if (offset > SQD_UINT64_MAX)  Die("offset too big in GSI64 index");
+
+  strncpy(g->elems[g->nkeys].key, key, GSI64_KEYSIZE-1);
+  g->elems[g->nkeys].key[GSI64_KEYSIZE-1] = '\0';
+  g->elems[g->nkeys].filenum = (sqd_uint16) filenum;
+  g->elems[g->nkeys].offset  = (sqd_uint64) offset;
+  g->nkeys++;
+
+  if (g->nkeys % 100 == 0)	/* array is full: grow by another 100 */
+    g->elems = ReallocOrDie(g->elems, sizeof(struct gsi64key_s) * (g->nkeys + 100));
+}
+
+/* qsort() comparator: orders key records by strcmp() of their keys,
+ * the same order GSI64GetOffset() binary-searches by.
+ */
+static int
+gsi_keysorter(const void *k1, const void *k2)
+{
+  const struct gsi64key_s *key1 = k1;
+  const struct gsi64key_s *key2 = k2;
+  return strcmp(key1->key, key2->key);
+}
+
+/* GSI64SortIndex(): sort the keys; must precede GSI64WriteIndex().
+ */
+void
+GSI64SortIndex(struct gsi64index_s *g)
+{
+  qsort((void *) g->elems, g->nkeys, sizeof(struct gsi64key_s), gsi_keysorter);
+}
+
+/* GSI64WriteIndex(): write header, file records, then key records to fp.
+ */
+void
+GSI64WriteIndex(FILE *fp, struct gsi64index_s *g)
+{
+  sqd_uint16 i;
+  sqd_uint64 j;
+
+  /* Range checking.
+   */
+  if (g->nfiles > SQD_UINT16_MAX) Die("Too many files in GSI64 index.");
+  if (g->nkeys > SQD_UINT64_MAX) Die("Too many keys in GSI64 index.");
+
+  GSI64WriteHeader(fp, g->nfiles, g->nkeys);
+  for (i = 0; i < g->nfiles; i++)
+    GSI64WriteFileRecord(fp, g->filenames[i], i+1, g->fmt[i]);
+  for (j = 0; j < g->nkeys; j++)
+    GSI64WriteKeyRecord(fp, g->elems[j].key, g->elems[j].filenum, g->elems[j].offset);
+}
+
+
+
+
+
+/* Function: GSI64WriteHeader()
+ * Date:     SRE, Wed Aug 5 10:36:02 1998 [St. Louis]
+ *
+ * Purpose:  Write the first record to an open GSI64 file:
+ *           "GSI64" <nfiles> <nkeys>
+ *
+ * Args:     fp     - open file to write to.
+ *           nfiles - number of files indexed
+ *           nkeys  - number of keys indexed
+ *
+ * Returns:  void
+ */
+void
+GSI64WriteHeader(FILE *fp, int nfiles, long long nkeys)
+{
+  char       key[GSI64_KEYSIZE];
+  sqd_uint16 f1;
+  sqd_uint64 f2;
+
+  /* beware potential range errors!
+   */
+  if (nfiles > SQD_UINT16_MAX) Die("GSI64: nfiles out of range");
+  if (nkeys > SQD_UINT64_MAX) Die("GSI64: nkeys out of range");
+
+  f1 = (sqd_uint16) nfiles;
+  f2 = (sqd_uint64) nkeys;
+#if 0 /* HACK no byteswap */
+  f1 = sre_htons(f1);
+  f2 = sre_htonl(f2);
+#endif
+  /* strncpy() NUL-pads to the full field width, so the 32 bytes written
+   * below are deterministic instead of carrying stack garbage after "GSI64".
+   */
+  strncpy(key, "GSI64", GSI64_KEYSIZE);
+
+  if (fwrite(key, 1, GSI64_KEYSIZE, fp) < GSI64_KEYSIZE) PANIC;
+  if (fwrite(&f1, 2, 1, fp) < 1) PANIC;
+  if (fwrite(&f2, 8, 1, fp) < 1) PANIC;
+}
+
+
+/* Function: GSI64WriteFileRecord()
+ * Date:     SRE, Wed Aug 5 10:45:51 1998 [St. Louis]
+ *
+ * Purpose:  Write a file record to an open GSI64 file.
+ *
+ * Args:     fp    - open GSI64 file
+ *           fname - file name (max 31 characters)
+ *           idx   - file number
+ *           fmt   - file format (e.g. kPearson, etc.)
+ *
+ * Returns:  0 on failure. 1 on success.
+ */
+int
+GSI64WriteFileRecord(FILE *fp, char *fname, int idx, int fmt)
+{
+  char       key[GSI64_KEYSIZE];
+  sqd_uint16 f1;
+  sqd_uint64 f2;
+
+  if (strlen(fname) >= GSI64_KEYSIZE) return 0;
+  if (idx > SQD_UINT16_MAX) Die("GSI64: file index out of range");
+  if (fmt > SQD_UINT64_MAX) Die("GSI64: format index out of range");
+
+  f1 = (sqd_uint16) idx;
+  f2 = (sqd_uint64) fmt;
+#if 0 /* hack : no byteswap */
+  f1 = sre_htons(f1);
+  f2 = sre_htonl(f2);
+#endif
+
+  /* fname may be shorter than the 32-byte field; stage it in a NUL-padded
+   * buffer so fwrite() never reads past the end of the caller's string.
+   */
+  strncpy(key, fname, GSI64_KEYSIZE);
+
+  if (fwrite(key, 1, GSI64_KEYSIZE, fp) < GSI64_KEYSIZE) PANIC;
+  if (fwrite(&f1, 2, 1, fp) < 1) PANIC;
+  if (fwrite(&f2, 8, 1, fp) < 1) PANIC;
+  return 1;
+}
+
+
+/* Function: GSI64WriteKeyRecord()
+ * Date:     SRE, Wed Aug 5 10:52:30 1998 [St. Louis]
+ *
+ * Purpose:  Write a key record to a GSI64 file.
+ *
+ * Args:     fp      - open GSI64 file for writing
+ *           key     - key (max 31 char + \0)
+ *           fileidx - which file number to find this key in
+ *           offset  - offset for this key
+ *
+ * Returns:  1 on success, else 0.
+ *           will fail if key >= 32 chars, for instance.
+ */
+int
+GSI64WriteKeyRecord(FILE *fp, char *key, int fileidx, long long offset)
+{
+  char       field[GSI64_KEYSIZE];
+  sqd_uint16 f1;
+  sqd_uint64 f2;
+
+  if (strlen(key) >= GSI64_KEYSIZE) return 0;
+  if (fileidx > SQD_UINT16_MAX) Die("GSI64: file index out of range");
+  if (offset > SQD_UINT64_MAX) Die("GSI64: offset out of range");
+
+  f1 = (sqd_uint16) fileidx;
+  f2 = (sqd_uint64) offset;
+#if 0 /* HACK! */
+  f1 = sre_htons(f1);
+  f2 = sre_htonl(f2);
+#endif
+
+  /* The caller's key may be shorter than the 32-byte field; stage it in a
+   * NUL-padded buffer (strncpy pads to the full width) so fwrite() never
+   * reads past the end of the string.
+   */
+  strncpy(field, key, GSI64_KEYSIZE);
+
+  if (fwrite(field, 1, GSI64_KEYSIZE, fp) < GSI64_KEYSIZE) PANIC;
+  if (fwrite(&f1, 2, 1, fp) < 1) PANIC;
+  if (fwrite(&f2, 8, 1, fp) < 1) PANIC;
+  return 1;
+}
+
+#endif /*USE_GSI64 */
diff --git a/forester/archive/RIO/others/hmmer/squid/gsi64.h b/forester/archive/RIO/others/hmmer/squid/gsi64.h
new file mode 100644
index 0000000..99f7296
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/squid/gsi64.h
@@ -0,0 +1,101 @@
+/*****************************************************************
+ * HMMER - Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ * This source code is distributed under the terms of the
+ * GNU General Public License. See the files COPYING and LICENSE
+ * for details.
+ *****************************************************************/
+
+#ifndef GSI64H_INCLUDED
+#define GSI64H_INCLUDED
+#ifdef USE_GSI64
+
+/* gsi64.h
+ * Database indexing (GSI64 format support)
+ * CVS $Id: gsi64.h,v 1.1.1.1 2005/03/22 08:34:29 cmzmasek Exp $
+ *
+ * A GSI64 (generic sequence index, 64 bit hack) file is composed of
+ * recnum + nfiles + 1 records. Each record contains
+ * three fields; key, file number, and disk offset.
+ * Record 0 contains: + * [ "GSI64" ] [ nfiles ] [ recnum ] + * Records 1..nfiles map file names to file numbers, and contain: + * [ filename ] [ file number, 1..nfiles ] [ 0 (unused) ] + * Records nfiles+1 to recnum+nfiles+1 provide disk offset + * and file number indices for every key: + * [ key ] [ file number ] [ offset] + * + * Because the file is binary, we take some (but not + * complete) care to improve portability amongst platforms. + * This means using network order integers (see ntohl()) + * and defining types for 16 and 64 bit integers. + * + * A short test program that verifies the sizes of these + * data types would be a good idea... + * + * Because we use 64-bit offsets, ftell64(), and fseek64(), + * we rely on the OS actually providing these. This is + * a temporary hack for human genome analysis. + */ +typedef unsigned long long sqd_uint64; /* 64 bit integer. */ + +#define GSI64_KEYSIZE 32 /* keys are 32 bytes long */ +#define GSI64_RECSIZE 42 /* 32 + 2 + 8 bytes */ +#define SQD_UINT16_MAX 65535 /* 2^16-1 */ +#define SQD_UINT64_MAX 18446744073709551615LU /* 2^64-1 */ + +struct gsi64_s { + FILE *gsifp; /* open GSI index file */ + sqd_uint16 nfiles; /* number of files = 16 bit int */ + sqd_uint64 recnum; /* number of records = 64 bit int */ +}; +typedef struct gsi64_s GSI64FILE; + +struct gsi64key_s { + char key[GSI64_KEYSIZE]; + sqd_uint16 filenum; + sqd_uint64 offset; +}; +struct gsi64index_s { + char **filenames; + int *fmt; + sqd_uint16 nfiles; + + struct gsi64key_s *elems; + sqd_uint64 nkeys; +}; + + + +/* if ntohl() and friends are not available, you + * can slip replacements in by providing sre_ntohl() + * functions. (i.e., there is a possible portability problem here.) 
+ */ +#if 0 +#define sre_ntohl(x) ntohl(x); +#define sre_ntohs(x) ntohs(x); +#define sre_htonl(x) htonl(x); +#define sre_htons(x) htons(x); +#endif + +/* from gsi64.c + */ +extern GSI64FILE *GSI64Open(char *gsifile); +extern int GSI64GetRecord(GSI64FILE *gsi, char *f1, sqd_uint16 *f2, sqd_uint64 *f3); +extern int GSI64GetOffset(GSI64FILE *gsi, char *key, char *sqfile, + int *fmt, long long *ret_offset); +extern void GSI64Close(GSI64FILE *gsi); +extern struct gsi64index_s *GSI64AllocIndex(void); +extern void GSI64FreeIndex(struct gsi64index_s *g); +extern void GSI64AddFileToIndex(struct gsi64index_s *g, char *filename, int fmt); +extern void GSI64AddKeyToIndex(struct gsi64index_s *g, char *key, int filenum, long long offset); +extern void GSI64SortIndex(struct gsi64index_s *g); +extern void GSI64WriteIndex(FILE *fp, struct gsi64index_s *g); +extern void GSI64WriteHeader(FILE *fp, int nfiles, long long nkeys); +extern int GSI64WriteFileRecord(FILE *fp, char *fname, int idx, int fmt); +extern int GSI64WriteKeyRecord(FILE *fp, char *key, int fileidx, long long offset); + +#endif /* USE_GSI64 */ +#endif /*GSIH_INCLUDED*/ diff --git a/forester/archive/RIO/others/hmmer/squid/hsregex.c b/forester/archive/RIO/others/hmmer/squid/hsregex.c new file mode 100644 index 0000000..6113900 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/hsregex.c @@ -0,0 +1,1314 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/***************************************************************** + * This code is an altered version of Henry Spencer's + * regex library. 
Alterations are limited to minor streamlining, + * and some name changes to protect the SQUID namespace. + * Henry's copyright notice appears below. + * You can obtain the original from + * ftp://ftp.zoo.toronto.edu/pub/bookregex.tar.Z + * Thanks, Henry! + * + * SRE, Fri Aug 28 11:10:17 1998 + * RCS $Id: hsregex.c,v 1.1.1.1 2005/03/22 08:34:17 cmzmasek Exp $ + *****************************************************************/ + +#include +#include +#include +#include +#include "squid.h" + +/* global sqd_parse[] are managed by Strparse(). + * WARNING: TODO: this code is not threadsafe, and needs to be revised. + */ +char *sqd_parse[10]; + +/* Function: Strparse() + * + * Purpose: Match a regexp to a string. Returns 1 if pattern matches, + * else 0. + * + * Much like Perl, Strparse() makes copies of the matching + * substrings available via globals, sqd_parse[]. + * sqd_parse[0] contains a copy of the complete matched + * text. sqd_parse[1-9] contain copies of up to nine + * different substrings matched within parentheses. + * The memory for these strings is internally managed and + * volatile; the next call to Strparse() may destroy them. + * If the caller needs the matched substrings to persist + * beyond a new Strparse() call, it must make its own + * copies. + * + * A minor drawback of the memory management is that + * there will be a small amount of unfree'd memory being + * managed by Strparse() when a program exits; this may + * confuse memory debugging (Purify, dbmalloc). The + * general cleanup function SqdClean() is provided; + * you can call this before exiting. + * + * Uses an extended POSIX regular expression interface. + * A copylefted GNU implementation is included in the squid + * implementation (gnuregex.c) for use on non-POSIX compliant + * systems. POSIX 1003.2-compliant systems (all UNIX, + * some WinNT, I believe) can omit the GNU code if necessary. + * + * I built this for ease of use, not speed nor efficiency. 
+ * + * Example: Strparse("foo-...-baz", "foo-bar-baz") returns 0 + * Strparse("foo-(...)-baz", "foo-bar-baz") + * returns 0; sqd_parse[0] is "foo-bar-baz"; + * sqd_parse[1] is "bar". + * + * Args: rexp - regular expression, extended POSIX form + * s - string to match against + * ntok - number of () substrings we will save (maximum NSUBEXP-1) + * + * Return: 1 on match, 0 if no match + */ +int +Strparse(char *rexp, char *s, int ntok) +{ + sqd_regexp *pat; + int code; + int len; + int i; + /* sanity check */ + if (ntok >= NSUBEXP ) Die("Strparse(): ntok must be <= %d", NSUBEXP-1); + + /* Free previous global substring buffers + */ + for (i = 0; i <= ntok; i++) + if (sqd_parse[i] != NULL) + { + free(sqd_parse[i]); + sqd_parse[i] = NULL; + } + + /* Compile and match the pattern, using our modified + * copy of Henry Spencer's regexp library + */ + if ((pat = sqd_regcomp(rexp)) == NULL) + Die("regexp compilation failed."); + code = sqd_regexec(pat, s); + + /* Fill the global substring buffers + */ + if (code == 1) + for (i = 0; i <= ntok; i++) + if (pat->startp[i] != NULL && pat->endp[i] != NULL) + { + len = pat->endp[i] - pat->startp[i]; + sqd_parse[i] = (char *) MallocOrDie(sizeof(char) * (len+1)); + strncpy(sqd_parse[i], pat->startp[i], len); + sqd_parse[i][len] = '\0'; + } + + free(pat); + return code; +} + +/* Function: SqdClean() + * Date: SRE, Wed Oct 29 12:52:08 1997 [TWA 721] + * + * Purpose: Clean up any squid library allocations before exiting + * a program, so we don't leave unfree'd memory around + * and confuse a malloc debugger like Purify or dbmalloc. + */ +void +SqdClean(void) +{ + int i; + + /* Free global substring buffers that Strparse() uses + */ + for (i = 0; i <= 9; i++) + if (sqd_parse[i] != NULL) { + free(sqd_parse[i]); + sqd_parse[i] = NULL; + } +} + + + +/* all code below is: + * Copyright (c) 1986, 1993, 1995 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. 
+ * + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it in any way, + * subject to the following restrictions: + * + * 1. The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. + * + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. + * + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented (by explicit claim or omission) as being + * the original software. + * + * 4. This notice must not be removed or altered. + */ + +/* + * sqd_regcomp and sqd_regexec -- sqd_regsub and sqd_regerror are elsewhere + */ + +/* + * The first byte of the regexp internal "program" is actually this magic + * number; the start node begins in the second byte. + */ +#define SQD_REGMAGIC 0234 + +/* + * The "internal use only" fields in regexp.h are present to pass info from + * compile to execute that permits the execute phase to run lots faster on + * simple cases. They are: + * + * regstart char that must begin a match; '\0' if none obvious + * reganch is the match anchored (at beginning-of-line only)? + * regmust string (pointer into program) that match must include, or NULL + * regmlen length of regmust string + * + * Regstart and reganch permit very fast decisions on suitable starting points + * for a match, cutting down the work a lot. Regmust permits fast rejection + * of lines that cannot possibly match. The regmust tests are costly enough + * that sqd_regcomp() supplies a regmust only if the r.e. contains something + * potentially expensive (at present, the only such thing detected is * or + + * at the start of the r.e., which can involve a lot of backup). Regmlen is + * supplied because the test in sqd_regexec() needs it and sqd_regcomp() is computing + * it anyway. + */ + +/* + * Structure for regexp "program". 
This is essentially a linear encoding + * of a nondeterministic finite-state machine (aka syntax charts or + * "railroad normal form" in parsing technology). Each node is an opcode + * plus a "next" pointer, possibly plus an operand. "Next" pointers of + * all nodes except BRANCH implement concatenation; a "next" pointer with + * a BRANCH on both ends of it is connecting two alternatives. (Here we + * have one of the subtle syntax dependencies: an individual BRANCH (as + * opposed to a collection of them) is never concatenated with anything + * because of operator precedence.) The operand of some types of node is + * a literal string; for others, it is a node leading into a sub-FSM. In + * particular, the operand of a BRANCH node is the first node of the branch. + * (NB this is *not* a tree structure: the tail of the branch connects + * to the thing following the set of BRANCHes.) The opcodes are: + */ + +/* definition number opnd? meaning */ +#define END 0 /* no End of program. */ +#define BOL 1 /* no Match beginning of line. */ +#define EOL 2 /* no Match end of line. */ +#define ANY 3 /* no Match any character. */ +#define ANYOF 4 /* str Match any of these. */ +#define ANYBUT 5 /* str Match any but one of these. */ +#define BRANCH 6 /* node Match this, or the next..\&. */ +#define BACK 7 /* no "next" ptr points backward. */ +#define EXACTLY 8 /* str Match this string. */ +#define NOTHING 9 /* no Match empty string. */ +#define STAR 10 /* node Match this 0 or more times. */ +#define PLUS 11 /* node Match this 1 or more times. */ +#define OPEN 20 /* no Sub-RE starts here. */ + /* OPEN+1 is number 1, etc. */ +#define CLOSE 30 /* no Analogous to OPEN. */ + +/* + * Opcode notes: + * + * BRANCH The set of branches constituting a single choice are hooked + * together with their "next" pointers, since precedence prevents + * anything being concatenated to any individual branch. 
The + * "next" pointer of the last BRANCH in a choice points to the + * thing following the whole choice. This is also where the + * final "next" pointer of each individual branch points; each + * branch starts with the operand node of a BRANCH node. + * + * BACK Normal "next" pointers all implicitly point forward; BACK + * exists to make loop structures possible. + * + * STAR,PLUS '?', and complex '*' and '+', are implemented as circular + * BRANCH structures using BACK. Simple cases (one character + * per match) are implemented with STAR and PLUS for speed + * and to minimize recursive plunges. + * + * OPEN,CLOSE ...are numbered at compile time. + */ + +/* + * A node is one char of opcode followed by two chars of "next" pointer. + * "Next" pointers are stored as two 8-bit pieces, high order first. The + * value is a positive offset from the opcode of the node containing it. + * An operand, if any, simply follows the node. (Note that much of the + * code generation knows about this implicit relationship.) + * + * Using two bytes for the "next" pointer is vast overkill for most things, + * but allows patterns to get big without disasters. + */ +#define OP(p) (*(p)) +#define NEXT(p) (((*((p)+1)&0177)<<8) + (*((p)+2)&0377)) +#define OPERAND(p) ((p) + 3) + +/* + * Utility definitions. + */ +#define FAIL(m) { sqd_regerror(m); return(NULL); } +#define ISREPN(c) ((c) == '*' || (c) == '+' || (c) == '?') +#define META "^$.[()|?+*\\" + +/* + * Flags to be passed up and down. + */ +#define HASWIDTH 01 /* Known never to match null string. */ +#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ +#define SPSTART 04 /* Starts with * or +. */ +#define WORST 0 /* Worst case. */ + +/* + * Work-variable struct for sqd_regcomp(). + */ +struct comp { + char *regparse; /* Input-scan pointer. */ + int regnpar; /* () count. */ + char *regcode; /* Code-emit pointer; &regdummy = don't. */ + char regdummy[3]; /* NOTHING, 0 next ptr */ + long regsize; /* Code size.
*/ +}; +#define EMITTING(cp) ((cp)->regcode != (cp)->regdummy) + +/* + * Forward declarations for sqd_regcomp()'s friends. + */ +static char *reg(struct comp *cp, int paren, int *flagp); +static char *regbranch(struct comp *cp, int *flagp); +static char *regpiece(struct comp *cp, int *flagp); +static char *regatom(struct comp *cp, int *flagp); +static char *regnode(struct comp *cp, int op); +static char *regnext(char *node); +static void regc(struct comp *cp, int c); +static void reginsert(struct comp *cp, int op, char *opnd); +static void regtail(struct comp *cp, char *p, char *val); +static void regoptail(struct comp *cp, char *p, char *val); + +/* + - sqd_regcomp - compile a regular expression into internal code + * + * We can't allocate space until we know how big the compiled form will be, + * but we can't compile it (and thus know how big it is) until we've got a + * place to put the code. So we cheat: we compile it twice, once with code + * generation turned off and size counting turned on, and once "for real". + * This also means that we don't allocate space until we are sure that the + * thing really will compile successfully, and we never have to move the + * code and thus invalidate pointers into it. (Note that it has to be in + * one piece because free() must be able to free it all.) + * + * Beware that the optimization-preparation code in here knows about some + * of the structure of the compiled regexp. + */ +sqd_regexp * +sqd_regcomp(exp) +const char *exp; +{ + register sqd_regexp *r; + register char *scan; + int flags; + struct comp co; + + if (exp == NULL) + FAIL("NULL argument to sqd_regcomp"); + + /* First pass: determine size, legality. */ + co.regparse = (char *)exp; + co.regnpar = 1; + co.regsize = 0L; + co.regdummy[0] = NOTHING; + co.regdummy[1] = co.regdummy[2] = 0; + co.regcode = co.regdummy; + regc(&co, SQD_REGMAGIC); + if (reg(&co, 0, &flags) == NULL) + return(NULL); + + /* Small enough for pointer-storage convention? 
*/ + if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */ + FAIL("regexp too big"); + + /* Allocate space. */ + r = (sqd_regexp *)malloc(sizeof(sqd_regexp) + (size_t)co.regsize); + if (r == NULL) + FAIL("out of space"); + + /* Second pass: emit code. */ + co.regparse = (char *)exp; + co.regnpar = 1; + co.regcode = r->program; + regc(&co, SQD_REGMAGIC); + if (reg(&co, 0, &flags) == NULL) + return(NULL); + + /* Dig out information for optimizations. */ + r->regstart = '\0'; /* Worst-case defaults. */ + r->reganch = 0; + r->regmust = NULL; + r->regmlen = 0; + scan = r->program+1; /* First BRANCH. */ + if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ + scan = OPERAND(scan); + + /* Starting-point info. */ + if (OP(scan) == EXACTLY) + r->regstart = *OPERAND(scan); + else if (OP(scan) == BOL) + r->reganch = 1; + + /* + * If there's something expensive in the r.e., find the + * longest literal string that must appear and make it the + * regmust. Resolve ties in favor of later strings, since + * the regstart check works with the beginning of the r.e. + * and avoiding duplication strengthens checking. Not a + * strong reason, but sufficient in the absence of others. + */ + if (flags&SPSTART) { + register char *longest = NULL; + register size_t len = 0; + + for (; scan != NULL; scan = regnext(scan)) + if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { + longest = OPERAND(scan); + len = strlen(OPERAND(scan)); + } + r->regmust = longest; + r->regmlen = (int)len; + } + } + + return(r); +} + +/* + - reg - regular expression, i.e. main body or parenthesized thing + * + * Caller must absorb opening parenthesis. + * + * Combining parenthesis handling with the base level of regular expression + * is a trifle forced, but the need to tie the tails of the branches to what + * follows makes it hard to avoid. + */ +static char * +reg(cp, paren, flagp) +register struct comp *cp; +int paren; /* Parenthesized? 
*/ +int *flagp; +{ + register char *ret = NULL; /* SRE: NULL init added to silence gcc */ + register char *br; + register char *ender; + register int parno = 0; /* SRE: init added to silence gcc */ + int flags; + + *flagp = HASWIDTH; /* Tentatively. */ + + if (paren) { + /* Make an OPEN node. */ + if (cp->regnpar >= NSUBEXP) + FAIL("too many ()"); + parno = cp->regnpar; + cp->regnpar++; + ret = regnode(cp, OPEN+parno); + } + + /* Pick up the branches, linking them together. */ + br = regbranch(cp, &flags); + if (br == NULL) + return(NULL); + if (paren) + regtail(cp, ret, br); /* OPEN -> first. */ + else + ret = br; + *flagp &= ~(~flags&HASWIDTH); /* Clear bit if bit 0. */ + *flagp |= flags&SPSTART; + while (*cp->regparse == '|') { + cp->regparse++; + br = regbranch(cp, &flags); + if (br == NULL) + return(NULL); + regtail(cp, ret, br); /* BRANCH -> BRANCH. */ + *flagp &= ~(~flags&HASWIDTH); + *flagp |= flags&SPSTART; + } + + /* Make a closing node, and hook it on the end. */ + ender = regnode(cp, (paren) ? CLOSE+parno : END); + regtail(cp, ret, ender); + + /* Hook the tails of the branches to the closing node. */ + for (br = ret; br != NULL; br = regnext(br)) + regoptail(cp, br, ender); + + /* Check for proper termination. */ + if (paren && *cp->regparse++ != ')') { + FAIL("unterminated ()"); + } else if (!paren && *cp->regparse != '\0') { + if (*cp->regparse == ')') { + FAIL("unmatched ()"); + } else + FAIL("internal error: junk on end"); + /* NOTREACHED */ + } + + return(ret); +} + +/* + - regbranch - one alternative of an | operator + * + * Implements the concatenation operator. + */ +static char * +regbranch(cp, flagp) +register struct comp *cp; +int *flagp; +{ + register char *ret; + register char *chain; + register char *latest; + int flags; + register int c; + + *flagp = WORST; /* Tentatively. 
*/ + + ret = regnode(cp, BRANCH); + chain = NULL; + while ((c = *cp->regparse) != '\0' && c != '|' && c != ')') { + latest = regpiece(cp, &flags); + if (latest == NULL) + return(NULL); + *flagp |= flags&HASWIDTH; + if (chain == NULL) /* First piece. */ + *flagp |= flags&SPSTART; + else + regtail(cp, chain, latest); + chain = latest; + } + if (chain == NULL) /* Loop ran zero times. */ + (void) regnode(cp, NOTHING); + + return(ret); +} + +/* + - regpiece - something followed by possible [*+?] + * + * Note that the branching code sequences used for ? and the general cases + * of * and + are somewhat optimized: they use the same NOTHING node as + * both the endmarker for their branch list and the body of the last branch. + * It might seem that this node could be dispensed with entirely, but the + * endmarker role is not redundant. + */ +static char * +regpiece(cp, flagp) +register struct comp *cp; +int *flagp; +{ + register char *ret; + register char op; + register char *next; + int flags; + + ret = regatom(cp, &flags); + if (ret == NULL) + return(NULL); + + op = *cp->regparse; + if (!ISREPN(op)) { + *flagp = flags; + return(ret); + } + + if (!(flags&HASWIDTH) && op != '?') + FAIL("*+ operand could be empty"); + switch (op) { + case '*': *flagp = WORST|SPSTART; break; + case '+': *flagp = WORST|SPSTART|HASWIDTH; break; + case '?': *flagp = WORST; break; + } + + if (op == '*' && (flags&SIMPLE)) + reginsert(cp, STAR, ret); + else if (op == '*') { + /* Emit x* as (x&|), where & means "self". */ + reginsert(cp, BRANCH, ret); /* Either x */ + regoptail(cp, ret, regnode(cp, BACK)); /* and loop */ + regoptail(cp, ret, ret); /* back */ + regtail(cp, ret, regnode(cp, BRANCH)); /* or */ + regtail(cp, ret, regnode(cp, NOTHING)); /* null. */ + } else if (op == '+' && (flags&SIMPLE)) + reginsert(cp, PLUS, ret); + else if (op == '+') { + /* Emit x+ as x(&|), where & means "self". 
*/ + next = regnode(cp, BRANCH); /* Either */ + regtail(cp, ret, next); + regtail(cp, regnode(cp, BACK), ret); /* loop back */ + regtail(cp, next, regnode(cp, BRANCH)); /* or */ + regtail(cp, ret, regnode(cp, NOTHING)); /* null. */ + } else if (op == '?') { + /* Emit x? as (x|) */ + reginsert(cp, BRANCH, ret); /* Either x */ + regtail(cp, ret, regnode(cp, BRANCH)); /* or */ + next = regnode(cp, NOTHING); /* null. */ + regtail(cp, ret, next); + regoptail(cp, ret, next); + } + cp->regparse++; + if (ISREPN(*cp->regparse)) + FAIL("nested *?+"); + + return(ret); +} + +/* + - regatom - the lowest level + * + * Optimization: gobbles an entire sequence of ordinary characters so that + * it can turn them into a single node, which is smaller to store and + * faster to run. Backslashed characters are exceptions, each becoming a + * separate node; the code is simpler that way and it's not worth fixing. + */ +static char * +regatom(cp, flagp) +register struct comp *cp; +int *flagp; +{ + register char *ret; + int flags; + + *flagp = WORST; /* Tentatively. */ + + switch (*cp->regparse++) { + case '^': + ret = regnode(cp, BOL); + break; + case '$': + ret = regnode(cp, EOL); + break; + case '.': + ret = regnode(cp, ANY); + *flagp |= HASWIDTH|SIMPLE; + break; + case '[': { + register int range; + register int rangeend; + register int c; + + if (*cp->regparse == '^') { /* Complement of range. 
*/ + ret = regnode(cp, ANYBUT); + cp->regparse++; + } else + ret = regnode(cp, ANYOF); + if ((c = *cp->regparse) == ']' || c == '-') { + regc(cp, c); + cp->regparse++; + } + while ((c = *cp->regparse++) != '\0' && c != ']') { + if (c != '-') + regc(cp, c); + else if ((c = *cp->regparse) == ']' || c == '\0') + regc(cp, '-'); + else { + range = (unsigned char)*(cp->regparse-2); + rangeend = (unsigned char)c; + if (range > rangeend) + FAIL("invalid [] range"); + for (range++; range <= rangeend; range++) + regc(cp, range); + cp->regparse++; + } + } + regc(cp, '\0'); + if (c != ']') + FAIL("unmatched []"); + *flagp |= HASWIDTH|SIMPLE; + break; + } + case '(': + ret = reg(cp, 1, &flags); + if (ret == NULL) + return(NULL); + *flagp |= flags&(HASWIDTH|SPSTART); + break; + case '\0': + case '|': + case ')': + /* supposed to be caught earlier */ + FAIL("internal error: \\0|) unexpected"); + break; + case '?': + case '+': + case '*': + FAIL("?+* follows nothing"); + break; + case '\\': + if (*cp->regparse == '\0') + FAIL("trailing \\"); + ret = regnode(cp, EXACTLY); + regc(cp, *cp->regparse++); + regc(cp, '\0'); + *flagp |= HASWIDTH|SIMPLE; + break; + default: { + register size_t len; + register char ender; + + cp->regparse--; + len = strcspn(cp->regparse, META); + if (len == 0) + FAIL("internal error: strcspn 0"); + ender = *(cp->regparse+len); + if (len > 1 && ISREPN(ender)) + len--; /* Back off clear of ?+* operand. */ + *flagp |= HASWIDTH; + if (len == 1) + *flagp |= SIMPLE; + ret = regnode(cp, EXACTLY); + for (; len > 0; len--) + regc(cp, *cp->regparse++); + regc(cp, '\0'); + break; + } + } + + return(ret); +} + +/* + - regnode - emit a node + */ +static char * /* Location. */ +regnode(cp, op) +register struct comp *cp; +char op; +{ + register char *const ret = cp->regcode; + register char *ptr; + + if (!EMITTING(cp)) { + cp->regsize += 3; + return(ret); + } + + ptr = ret; + *ptr++ = op; + *ptr++ = '\0'; /* Null next pointer. 
*/ + *ptr++ = '\0'; + cp->regcode = ptr; + + return(ret); +} + +/* + - regc - emit (if appropriate) a byte of code + */ +static void +regc(cp, b) +register struct comp *cp; +char b; +{ + if (EMITTING(cp)) + *cp->regcode++ = b; + else + cp->regsize++; +} + +/* + - reginsert - insert an operator in front of already-emitted operand + * + * Means relocating the operand. + */ +static void +reginsert(cp, op, opnd) +register struct comp *cp; +char op; +char *opnd; +{ + register char *place; + + if (!EMITTING(cp)) { + cp->regsize += 3; + return; + } + + (void) memmove(opnd+3, opnd, (size_t)(cp->regcode - opnd)); + cp->regcode += 3; + + place = opnd; /* Op node, where operand used to be. */ + *place++ = op; + *place++ = '\0'; + *place++ = '\0'; +} + +/* + - regtail - set the next-pointer at the end of a node chain + */ +static void +regtail(cp, p, val) +register struct comp *cp; +char *p; +char *val; +{ + register char *scan; + register char *temp; + register int offset; + + if (!EMITTING(cp)) + return; + + /* Find last node. */ + for (scan = p; (temp = regnext(scan)) != NULL; scan = temp) + continue; + + offset = (OP(scan) == BACK) ? scan - val : val - scan; + *(scan+1) = (offset>>8)&0177; + *(scan+2) = offset&0377; +} + +/* + - regoptail - regtail on operand of first argument; nop if operandless + */ +static void +regoptail(cp, p, val) +register struct comp *cp; +char *p; +char *val; +{ + /* "Operandless" and "op != BRANCH" are synonymous in practice. */ + if (!EMITTING(cp) || OP(p) != BRANCH) + return; + regtail(cp, OPERAND(p), val); +} + +/* + * sqd_regexec and friends + */ + +/* + * Work-variable struct for sqd_regexec(). + */ +struct exec { + char *reginput; /* String-input pointer. */ + char *regbol; /* Beginning of input, for ^ check. */ + char **regstartp; /* Pointer to startp array. */ + char **regendp; /* Ditto for endp. */ +}; + +/* + * Forwards. 
+ */ +static int regtry(struct exec *ep, sqd_regexp *rp, char *string); +static int regmatch(struct exec *ep, char *prog); +static size_t regrepeat(struct exec *ep, char *node); + +#ifdef DEBUG +int regnarrate = 0; +void regdump(); +static char *regprop(); +#endif + +/* + - sqd_regexec - match a regexp against a string + */ +int +sqd_regexec(prog, str) +register sqd_regexp *prog; +const char *str; +{ + register char *string = (char *)str; /* avert const poisoning */ + register char *s; + struct exec ex; + + /* Be paranoid. */ + if (prog == NULL || string == NULL) { + sqd_regerror("NULL argument to sqd_regexec"); + return(0); + } + + /* Check validity of program. */ + if ((unsigned char)*prog->program != SQD_REGMAGIC) { + sqd_regerror("corrupted regexp"); + return(0); + } + + /* If there is a "must appear" string, look for it. */ + if (prog->regmust != NULL && strstr(string, prog->regmust) == NULL) + return(0); + + /* Mark beginning of line for ^ . */ + ex.regbol = string; + ex.regstartp = prog->startp; + ex.regendp = prog->endp; + + /* Simplest case: anchored match need be tried only once. */ + if (prog->reganch) + return(regtry(&ex, prog, string)); + + /* Messy cases: unanchored match. */ + if (prog->regstart != '\0') { + /* We know what char it must start with. */ + for (s = string; s != NULL; s = strchr(s+1, prog->regstart)) + if (regtry(&ex, prog, s)) + return(1); + return(0); + } else { + /* We don't -- general case. 
*/ + for (s = string; !regtry(&ex, prog, s); s++) + if (*s == '\0') + return(0); + return(1); + } + /* NOTREACHED */ +} + +/* + - regtry - try match at specific point + */ +static int /* 0 failure, 1 success */ +regtry(ep, prog, string) +register struct exec *ep; +sqd_regexp *prog; +char *string; +{ + register int i; + register char **stp; + register char **enp; + + ep->reginput = string; + + stp = prog->startp; + enp = prog->endp; + for (i = NSUBEXP; i > 0; i--) { + *stp++ = NULL; + *enp++ = NULL; + } + if (regmatch(ep, prog->program + 1)) { + prog->startp[0] = string; + prog->endp[0] = ep->reginput; + return(1); + } else + return(0); +} + +/* + - regmatch - main matching routine + * + * Conceptually the strategy is simple: check to see whether the current + * node matches, call self recursively to see whether the rest matches, + * and then act accordingly. In practice we make some effort to avoid + * recursion, in particular by going through "ordinary" nodes (that don't + * need to know whether the rest of the match failed) by a loop instead of + * by recursion. + */ +static int /* 0 failure, 1 success */ +regmatch(ep, prog) +register struct exec *ep; +char *prog; +{ + register char *scan; /* Current node. */ + char *next; /* Next node. */ + +#ifdef DEBUG + if (prog != NULL && regnarrate) + fprintf(stderr, "%s(\n", regprop(prog)); +#endif + for (scan = prog; scan != NULL; scan = next) { +#ifdef DEBUG + if (regnarrate) + fprintf(stderr, "%s...\n", regprop(scan)); +#endif + next = regnext(scan); + + switch (OP(scan)) { + case BOL: + if (ep->reginput != ep->regbol) + return(0); + break; + case EOL: + if (*ep->reginput != '\0') + return(0); + break; + case ANY: + if (*ep->reginput == '\0') + return(0); + ep->reginput++; + break; + case EXACTLY: { + register size_t len; + register char *const opnd = OPERAND(scan); + + /* Inline the first character, for speed. 
*/ + if (*opnd != *ep->reginput) + return(0); + len = strlen(opnd); + if (len > 1 && strncmp(opnd, ep->reginput, len) != 0) + return(0); + ep->reginput += len; + break; + } + case ANYOF: + if (*ep->reginput == '\0' || + strchr(OPERAND(scan), *ep->reginput) == NULL) + return(0); + ep->reginput++; + break; + case ANYBUT: + if (*ep->reginput == '\0' || + strchr(OPERAND(scan), *ep->reginput) != NULL) + return(0); + ep->reginput++; + break; + case NOTHING: + break; + case BACK: + break; + case OPEN+1: case OPEN+2: case OPEN+3: + case OPEN+4: case OPEN+5: case OPEN+6: + case OPEN+7: case OPEN+8: case OPEN+9: { + register const int no = OP(scan) - OPEN; + register char *const input = ep->reginput; + + if (regmatch(ep, next)) { + /* + * Don't set startp if some later + * invocation of the same parentheses + * already has. + */ + if (ep->regstartp[no] == NULL) + ep->regstartp[no] = input; + return(1); + } else + return(0); + break; + } + case CLOSE+1: case CLOSE+2: case CLOSE+3: + case CLOSE+4: case CLOSE+5: case CLOSE+6: + case CLOSE+7: case CLOSE+8: case CLOSE+9: { + register const int no = OP(scan) - CLOSE; + register char *const input = ep->reginput; + + if (regmatch(ep, next)) { + /* + * Don't set endp if some later + * invocation of the same parentheses + * already has. + */ + if (ep->regendp[no] == NULL) + ep->regendp[no] = input; + return(1); + } else + return(0); + break; + } + case BRANCH: { + register char *const save = ep->reginput; + + if (OP(next) != BRANCH) /* No choice. */ + next = OPERAND(scan); /* Avoid recursion. */ + else { + while (OP(scan) == BRANCH) { + if (regmatch(ep, OPERAND(scan))) + return(1); + ep->reginput = save; + scan = regnext(scan); + } + return(0); + /* NOTREACHED */ + } + break; + } + case STAR: case PLUS: { + register const char nextch = + (OP(next) == EXACTLY) ? *OPERAND(next) : '\0'; + register size_t no; + register char *const save = ep->reginput; + register const size_t min = (OP(scan) == STAR) ? 
0 : 1; + + for (no = regrepeat(ep, OPERAND(scan)) + 1; no > min; no--) { + ep->reginput = save + no - 1; + /* If it could work, try it. */ + if (nextch == '\0' || *ep->reginput == nextch) + if (regmatch(ep, next)) + return(1); + } + return(0); + break; + } + case END: + return(1); /* Success! */ + break; + default: + sqd_regerror("regexp corruption"); + return(0); + break; + } + } + + /* + * We get here only if there's trouble -- normally "case END" is + * the terminating point. + */ + sqd_regerror("corrupted pointers"); + return(0); +} + +/* + - regrepeat - report how many times something simple would match + */ +static size_t +regrepeat(ep, node) +register struct exec *ep; +char *node; +{ + register size_t count; + register char *scan; + register char ch; + + switch (OP(node)) { + case ANY: + return(strlen(ep->reginput)); + break; + case EXACTLY: + ch = *OPERAND(node); + count = 0; + for (scan = ep->reginput; *scan == ch; scan++) + count++; + return(count); + break; + case ANYOF: + return(strspn(ep->reginput, OPERAND(node))); + break; + case ANYBUT: + return(strcspn(ep->reginput, OPERAND(node))); + break; + default: /* Oh dear. Called inappropriately. */ + sqd_regerror("internal error: bad call of regrepeat"); + return(0); /* Best compromise. */ + break; + } + /* NOTREACHED */ +} + +/* + - regnext - dig the "next" pointer out of a node + */ +static char * +regnext(p) +register char *p; +{ + register const int offset = NEXT(p); + + if (offset == 0) + return(NULL); + + return((OP(p) == BACK) ? p-offset : p+offset); +} + +#ifdef DEBUG + +static char *regprop(); + +/* + - regdump - dump a regexp onto stdout in vaguely comprehensible form + */ +void +regdump(r) +sqd_regexp *r; +{ + register char *s; + register char op = EXACTLY; /* Arbitrary non-END op. */ + register char *next; + + + s = r->program + 1; + while (op != END) { /* While that wasn't END last time... */ + op = OP(s); + printf("%2d%s", s-r->program, regprop(s)); /* Where, what. 
*/ + next = regnext(s); + if (next == NULL) /* Next ptr. */ + printf("(0)"); + else + printf("(%d)", (s-r->program)+(next-s)); + s += 3; + if (op == ANYOF || op == ANYBUT || op == EXACTLY) { + /* Literal string, where present. */ + while (*s != '\0') { + putchar(*s); + s++; + } + s++; + } + putchar('\n'); + } + + /* Header fields of interest. */ + if (r->regstart != '\0') + printf("start `%c' ", r->regstart); + if (r->reganch) + printf("anchored "); + if (r->regmust != NULL) + printf("must have \"%s\"", r->regmust); + printf("\n"); +} + +/* + - regprop - printable representation of opcode + */ +static char * +regprop(op) +char *op; +{ + register char *p; + static char buf[50]; + + (void) strcpy(buf, ":"); + + switch (OP(op)) { + case BOL: + p = "BOL"; + break; + case EOL: + p = "EOL"; + break; + case ANY: + p = "ANY"; + break; + case ANYOF: + p = "ANYOF"; + break; + case ANYBUT: + p = "ANYBUT"; + break; + case BRANCH: + p = "BRANCH"; + break; + case EXACTLY: + p = "EXACTLY"; + break; + case NOTHING: + p = "NOTHING"; + break; + case BACK: + p = "BACK"; + break; + case END: + p = "END"; + break; + case OPEN+1: + case OPEN+2: + case OPEN+3: + case OPEN+4: + case OPEN+5: + case OPEN+6: + case OPEN+7: + case OPEN+8: + case OPEN+9: + sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); + p = NULL; + break; + case CLOSE+1: + case CLOSE+2: + case CLOSE+3: + case CLOSE+4: + case CLOSE+5: + case CLOSE+6: + case CLOSE+7: + case CLOSE+8: + case CLOSE+9: + sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); + p = NULL; + break; + case STAR: + p = "STAR"; + break; + case PLUS: + p = "PLUS"; + break; + default: + sqd_regerror("corrupted opcode"); + break; + } + if (p != NULL) + (void) strcat(buf, p); + return(buf); +} +#endif + + +/* + - sqd_regsub - perform substitutions after a regexp match + */ +void +sqd_regsub(rp, source, dest) +const sqd_regexp *rp; +const char *source; +char *dest; +{ + register sqd_regexp * const prog = (sqd_regexp *)rp; + register char *src = (char *)source; 
+ register char *dst = dest; + register char c; + register int no; + register size_t len; + + if (prog == NULL || source == NULL || dest == NULL) { + sqd_regerror("NULL parameter to sqd_regsub"); + return; + } + if ((unsigned char)*(prog->program) != SQD_REGMAGIC) { + sqd_regerror("damaged regexp"); + return; + } + + while ((c = *src++) != '\0') { + if (c == '&') + no = 0; + else if (c == '\\' && isdigit((unsigned char) *src)) /* ctype functions need an unsigned char value; plain char may be negative */ + no = *src++ - '0'; + else + no = -1; + + if (no < 0) { /* Ordinary character. */ + if (c == '\\' && (*src == '\\' || *src == '&')) + c = *src++; + *dst++ = c; + } else if (prog->startp[no] != NULL && prog->endp[no] != NULL && + prog->endp[no] > prog->startp[no]) { + len = prog->endp[no] - prog->startp[no]; + (void) strncpy(dst, prog->startp[no], len); + dst += len; + if (*(dst-1) == '\0') { /* strncpy hit NUL. */ + sqd_regerror("damaged match string"); + return; + } + } + } + *dst++ = '\0'; +} + + +void +sqd_regerror(s) +char *s; +{ + fprintf(stderr, "regexp(3): %s\n", s); + exit(EXIT_FAILURE); + /* NOTREACHED */ +} diff --git a/forester/archive/RIO/others/hmmer/squid/install-sh b/forester/archive/RIO/others/hmmer/squid/install-sh new file mode 100755 index 0000000..e9de238 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/install-sh @@ -0,0 +1,251 @@ +#!/bin/sh +# +# install - install a program, script, or datafile +# This comes from X11R5 (mit/util/scripts/install.sh). +# +# Copyright 1991 by the Massachusetts Institute of Technology +# +# Permission to use, copy, modify, distribute, and sell this software and its +# documentation for any purpose is hereby granted without fee, provided that +# the above copyright notice appear in all copies and that both that +# copyright notice and this permission notice appear in supporting +# documentation, and that the name of M.I.T. not be used in advertising or +# publicity pertaining to distribution of the software without specific, +# written prior permission. M.I.T.
makes no representations about the +# suitability of this software for any purpose. It is provided "as is" +# without express or implied warranty. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. It can only install one file at a time, a restriction +# shared with many OS's install programs. + + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit="${DOITPROG-}" + + +# put in absolute paths if you don't have them in your path; or use env. vars. + +mvprog="${MVPROG-mv}" +cpprog="${CPPROG-cp}" +chmodprog="${CHMODPROG-chmod}" +chownprog="${CHOWNPROG-chown}" +chgrpprog="${CHGRPPROG-chgrp}" +stripprog="${STRIPPROG-strip}" +rmprog="${RMPROG-rm}" +mkdirprog="${MKDIRPROG-mkdir}" + +transformbasename="" +# same name that -t= sets and the rename step tests, so an inherited value cannot leak in +transformarg="" +instcmd="$mvprog" +chmodcmd="$chmodprog 0755" +chowncmd="" +chgrpcmd="" +stripcmd="" +rmcmd="$rmprog -f" +mvcmd="$mvprog" +src="" +dst="" +dir_arg="" + +while [ x"$1" != x ]; do + case $1 in + -c) instcmd="$cpprog" + shift + continue;; + + -d) dir_arg=true + shift + continue;; + + -m) chmodcmd="$chmodprog $2" + shift + shift + continue;; + + -o) chowncmd="$chownprog $2" + shift + shift + continue;; + + -g) chgrpcmd="$chgrpprog $2" + shift + shift + continue;; + + -s) stripcmd="$stripprog" + shift + continue;; + + -t=*) transformarg=`echo $1 | sed 's/-t=//'` + shift + continue;; + + -b=*) transformbasename=`echo $1 | sed 's/-b=//'` + shift + continue;; + + *) if [ x"$src" = x ] + then + src=$1 + else + # this colon is to work around a 386BSD /bin/sh bug + : + dst=$1 + fi + shift + continue;; + esac +done + +if [ x"$src" = x ] +then + echo "install: no input file specified" + exit 1 +else + true +fi + +if [ x"$dir_arg" != x ]; then + dst=$src + src="" + + if [ -d $dst ]; then +
instcmd=: + chmodcmd="" + else + instcmd=mkdir + fi +else + +# Waiting for this to be detected by the "$instcmd $src $dsttmp" command +# might cause directories to be created, which would be especially bad +# if $src (and thus $dsttmp) contains '*'. + + if [ -f $src -o -d $src ] + then + true + else + echo "install: $src does not exist" + exit 1 + fi + + if [ x"$dst" = x ] + then + echo "install: no destination specified" + exit 1 + else + true + fi + +# If destination is a directory, append the input filename; if your system +# does not like double slashes in filenames, you may need to add some logic + + if [ -d $dst ] + then + dst="$dst"/`basename $src` + else + true + fi +fi + +## this sed command emulates the dirname command +dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` + +# Make sure that the destination directory exists. +# this part is taken from Noah Friedman's mkinstalldirs script + +# Skip lots of stat calls in the usual case. +if [ ! -d "$dstdir" ]; then +defaultIFS=' +' +IFS="${IFS-${defaultIFS}}" + +oIFS="${IFS}" +# Some sh's can't handle IFS=/ for some reason. +IFS='%' +set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'` +IFS="${oIFS}" + +pathcomp='' + +while [ $# -ne 0 ] ; do + pathcomp="${pathcomp}${1}" + shift + + if [ ! -d "${pathcomp}" ] ; + then + $mkdirprog "${pathcomp}" + else + true + fi + + pathcomp="${pathcomp}/" +done +fi + +if [ x"$dir_arg" != x ] +then + $doit $instcmd $dst && + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi +else + +# If we're going to rename the final executable, determine the name now. 
+ + if [ x"$transformarg" = x ] + then + dstfile=`basename $dst` + else + dstfile=`basename $dst $transformbasename | + sed $transformarg`$transformbasename + fi + +# don't allow the sed command to completely eliminate the filename + + if [ x"$dstfile" = x ] + then + dstfile=`basename $dst` + else + true + fi + +# Make a temp file name in the proper directory. + + dsttmp=$dstdir/#inst.$$# + +# Move or copy the file name to the temp name + + $doit $instcmd $src $dsttmp && + + trap "rm -f ${dsttmp}" 0 && + +# and set any options; do chmod last to preserve setuid bits + +# If any of these fail, we abort the whole thing. If we want to +# ignore errors from any of these, just make sure not to ignore +# errors from the above "$doit $instcmd $src $dsttmp" command. + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi && + +# Now rename the file to the real destination. + + $doit $rmcmd -f $dstdir/$dstfile && + $doit $mvcmd $dsttmp $dstdir/$dstfile + +fi && + + +exit 0 diff --git a/forester/archive/RIO/others/hmmer/squid/iupac.c b/forester/archive/RIO/others/hmmer/squid/iupac.c new file mode 100644 index 0000000..9f2a577 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/iupac.c @@ -0,0 +1,220 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. 
+ *****************************************************************/ + +/* iupac.c + * + * Globally defines the IUPAC symbols for nucleic acid sequence + * Slowly evolving into a repository of globals. Tue Apr 20 1993 + * + * RCS $Id: iupac.c,v 1.1.1.1 2005/03/22 08:34:32 cmzmasek Exp $ + */ +#include "squid.h" + +/* Default expected nucleotide occurrence frequencies, A/C/G/T. + * Used (for instance) as the default distribution for + * i.i.d. random nucleotide sequences. + */ +float dnafq[4] = { 0.25, 0.25, 0.25, 0.25 }; + +/* Dayhoff f(i) amino acid occurrence frequencies. + * From SwissProt 34: 21,210,388 residues + * In alphabetic order by single-letter code. + * Used (for instance) as the default distribution for + * i.i.d. random protein sequences. + */ +float aafq[20] = { + 0.075520, /* A */ + 0.016973, /* C */ + 0.053029, /* D */ + 0.063204, /* E */ + 0.040762, /* F */ + 0.068448, /* G */ + 0.022406, /* H */ + 0.057284, /* I */ + 0.059398, /* K */ + 0.093399, /* L */ + 0.023569, /* M */ + 0.045293, /* N */ + 0.049262, /* P */ + 0.040231, /* Q */ + 0.051573, /* R */ + 0.072214, /* S */ + 0.057454, /* T */ + 0.065252, /* V */ + 0.012513, /* W */ + 0.031985 /* Y */ +}; + +char aa_alphabet[] = AMINO_ALPHABET; + /* aa_index converts to pam's 27x27 scheme */ +int aa_index[20] = { 0, 2, 3, 4, 5, 6, 7, 8, 10, 11, + 12, 13, 15, 16, 17, 18, 19, 21, 22, 24 }; + + /* IUPAC code translations */ + /* note: sequence chars are UPPER CASE */ +struct iupactype iupac[] = { + { 'A', 'T', NTA, NTT, }, + { 'C', 'G', NTC, NTG, }, + { 'G', 'C', NTG, NTC, }, + { 'T', 'A', NTT, NTA, }, + { 'U', 'A', NTU, NTA, }, + { 'N', 'N', NTN, NTN, }, + { ' ', ' ', NTGAP, NTGAP, }, + { 'R', 'Y', NTR, NTY, }, + { 'Y', 'R', NTY, NTR, }, + { 'M', 'K', NTM, NTK, }, + { 'K', 'M', NTK, NTM, }, + { 'S', 'S', NTS, NTS, }, + { 'W', 'W', NTW, NTW, }, + { 'H', 'D', NTH, NTD, }, + { 'B', 'V', NTB, NTV, }, + { 'V', 'B', NTV, NTB, }, + { 'D', 'H', NTD, NTH, }, + }; + + +char *stdcode1[65] = { + "K", /* AAA */ 
+ "N", /* AAC */ + "K", /* AAG */ + "N", /* AAU */ + "T", /* ACA */ + "T", /* ACC */ + "T", /* ACG */ + "T", /* ACU */ + "R", /* AGA */ + "S", /* AGC */ + "R", /* AGG */ + "S", /* AGU */ + "I", /* AUA */ + "I", /* AUC */ + "M", /* AUG */ + "I", /* AUU */ + "Q", /* CAA */ + "H", /* CAC */ + "Q", /* CAG */ + "H", /* CAU */ + "P", /* CCA */ + "P", /* CCC */ + "P", /* CCG */ + "P", /* CCU */ + "R", /* CGA */ + "R", /* CGC */ + "R", /* CGG */ + "R", /* CGU */ + "L", /* CUA */ + "L", /* CUC */ + "L", /* CUG */ + "L", /* CUU */ + "E", /* GAA */ + "D", /* GAC */ + "E", /* GAG */ + "D", /* GAU */ + "A", /* GCA */ + "A", /* GCC */ + "A", /* GCG */ + "A", /* GCU */ + "G", /* GGA */ + "G", /* GGC */ + "G", /* GGG */ + "G", /* GGU */ + "V", /* GUA */ + "V", /* GUC */ + "V", /* GUG */ + "V", /* GUU */ + "*", /* UAA */ + "Y", /* UAC */ + "*", /* UAG */ + "Y", /* UAU */ + "S", /* UCA */ + "S", /* UCC */ + "S", /* UCG */ + "S", /* UCU */ + "*", /* UGA */ + "C", /* UGC */ + "W", /* UGG */ + "C", /* UGU */ + "L", /* UUA */ + "F", /* UUC */ + "L", /* UUG */ + "F", /* UUU */ + "X", /* unknown */ +}; + + + + +char *stdcode3[65] = { + "Lys", /* AAA */ + "Asn", /* AAC */ + "Lys", /* AAG */ + "Asn", /* AAU */ + "Thr", /* ACA */ + "Thr", /* ACC */ + "Thr", /* ACG */ + "Thr", /* ACU */ + "Arg", /* AGA */ + "Ser", /* AGC */ + "Arg", /* AGG */ + "Ser", /* AGU */ + "Ile", /* AUA */ + "Ile", /* AUC */ + "Met", /* AUG */ + "Ile", /* AUU */ + "Gln", /* CAA */ + "His", /* CAC */ + "Gln", /* CAG */ + "His", /* CAU */ + "Pro", /* CCA */ + "Pro", /* CCC */ + "Pro", /* CCG */ + "Pro", /* CCU */ + "Arg", /* CGA */ + "Arg", /* CGC */ + "Arg", /* CGG */ + "Arg", /* CGU */ + "Leu", /* CUA */ + "Leu", /* CUC */ + "Leu", /* CUG */ + "Leu", /* CUU */ + "Glu", /* GAA */ + "Asp", /* GAC */ + "Glu", /* GAG */ + "Asp", /* GAU */ + "Ala", /* GCA */ + "Ala", /* GCC */ + "Ala", /* GCG */ + "Ala", /* GCU */ + "Gly", /* GGA */ + "Gly", /* GGC */ + "Gly", /* GGG */ + "Gly", /* GGU */ + "Val", /* GUA */ + "Val", /* GUC 
*/ + "Val", /* GUG */ + "Val", /* GUU */ + "***", /* UAA */ + "Tyr", /* UAC */ + "***", /* UAG */ + "Tyr", /* UAU */ + "Ser", /* UCA */ + "Ser", /* UCC */ + "Ser", /* UCG */ + "Ser", /* UCU */ + "***", /* UGA */ + "Cys", /* UGC */ + "Trp", /* UGG */ + "Cys", /* UGU */ + "Leu", /* UUA */ + "Phe", /* UUC */ + "Leu", /* UUG */ + "Phe", /* UUU */ + "XXX", /* unknown */ +}; diff --git a/forester/archive/RIO/others/hmmer/squid/msa.c b/forester/archive/RIO/others/hmmer/squid/msa.c new file mode 100644 index 0000000..03bd57a --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/msa.c @@ -0,0 +1,1394 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* msa.c + * SRE, Mon May 17 10:48:47 1999 + * + * SQUID's interface for multiple sequence alignment + * manipulation: access to the MSA object. + * + * RCS $Id: msa.c,v 1.1.1.1 2005/03/22 08:34:19 cmzmasek Exp $ + */ + +#include +#include +#include +#include "squid.h" +#include "msa.h" /* multiple sequence alignment object support */ +#include "gki.h" /* string indexing hashtable code */ +#include "ssi.h" /* SSI sequence file indexing code */ + +/* Function: MSAAlloc() + * Date: SRE, Tue May 18 10:45:47 1999 [St. Louis] + * + * Purpose: Allocate an MSA structure, return a pointer + * to it. + * + * Designed to be used in three ways: + * 1) We know exactly the dimensions of the alignment: + * both nseq and alen. + * msa = MSAAlloc(nseq, alen); + * + * 2) We know the number of sequences but not alen. + * (We add sequences later.)
+ * msa = MSAAlloc(nseq, 0); + * + * 3) We even don't know the number of sequences, so + * we'll have to dynamically expand allocations. + * We provide a blocksize for the allocation expansion, + * and expand when needed. + * msa = MSAAlloc(10, 0); + * if (msa->nseq == msa->nseqalloc) MSAExpand(msa); + * + * Args: nseq - number of sequences, or nseq allocation blocksize + * alen - length of alignment in columns, or 0 + * + * Returns: pointer to new MSA object, w/ all values initialized. + * Note that msa->nseq is initialized to 0, though space + * is allocated. + * + * Diagnostics: "always works". Die()'s on memory allocation failure. + * + */ +MSA * +MSAAlloc(int nseq, int alen) +{ + MSA *msa; + int i; + + msa = MallocOrDie(sizeof(MSA)); + msa->aseq = MallocOrDie(sizeof(char *) * nseq); + msa->sqname = MallocOrDie(sizeof(char *) * nseq); + msa->sqlen = MallocOrDie(sizeof(int) * nseq); + msa->wgt = MallocOrDie(sizeof(float) * nseq); + + for (i = 0; i < nseq; i++) + { + msa->sqname[i] = NULL; + msa->sqlen[i] = 0; + msa->wgt[i] = -1.0; + + if (alen != 0) msa->aseq[i] = MallocOrDie(sizeof(char) * (alen+1)); + else msa->aseq[i] = NULL; + } + + msa->alen = alen; + msa->nseq = 0; + msa->nseqalloc = nseq; + msa->nseqlump = nseq; + + msa->flags = 0; + msa->type = kOtherSeq; + msa->name = NULL; + msa->desc = NULL; + msa->acc = NULL; + msa->au = NULL; + msa->ss_cons = NULL; + msa->sa_cons = NULL; + msa->rf = NULL; + msa->sqacc = NULL; + msa->sqdesc = NULL; + msa->ss = NULL; + msa->sslen = NULL; + msa->sa = NULL; + msa->salen = NULL; + msa->index = GKIInit(); + msa->lastidx = 0; + + /* Initialize unparsed optional markup + */ + msa->comment = NULL; + msa->ncomment = 0; + msa->alloc_ncomment = 0; + + msa->gf_tag = NULL; + msa->gf = NULL; + msa->ngf = 0; + + msa->gs_tag = NULL; + msa->gs = NULL; + msa->gs_idx = NULL; + msa->ngs = 0; + + msa->gc_tag = NULL; + msa->gc = NULL; + msa->gc_idx = NULL; + msa->ngc = 0; + + msa->gr_tag = NULL; + msa->gr = NULL; + msa->gr_idx = NULL; + 
msa->ngr = 0; + + /* Done. Return the alloced, initialized structure + */ + return msa; +} + +/* Function: MSAExpand() + * Date: SRE, Tue May 18 11:06:53 1999 [St. Louis] + * + * Purpose: Increase the sequence allocation in an MSA + * by msa->nseqlump. (Typically used when we're reading + * in an alignment sequentially from a file, + * so we don't know nseq until we're done.) + * + * The slots added by the expansion are initialized here: + * pointers are NULL or newly allocated, lengths 0, weights -1.0. + * + * Args: msa - the MSA object + * + * Returns: (void) + * + */ +void +MSAExpand(MSA *msa) +{ + int i,j; + + msa->nseqalloc += msa->nseqlump; + + msa->aseq = ReallocOrDie(msa->aseq, sizeof(char *) * msa->nseqalloc); + msa->sqname = ReallocOrDie(msa->sqname, sizeof(char *) * msa->nseqalloc); + msa->sqlen = ReallocOrDie(msa->sqlen, sizeof(int) * msa->nseqalloc); + msa->wgt = ReallocOrDie(msa->wgt, sizeof(float) * msa->nseqalloc); + + if (msa->ss != NULL) { + msa->ss = ReallocOrDie(msa->ss, sizeof(char *) * msa->nseqalloc); + msa->sslen = ReallocOrDie(msa->sslen, sizeof(int) * msa->nseqalloc); + } + if (msa->sa != NULL) { + msa->sa = ReallocOrDie(msa->sa, sizeof(char *) * msa->nseqalloc); + msa->salen = ReallocOrDie(msa->salen, sizeof(int) * msa->nseqalloc); + } + if (msa->sqacc != NULL) + msa->sqacc = ReallocOrDie(msa->sqacc, sizeof(char *) * msa->nseqalloc); + if (msa->sqdesc != NULL) + msa->sqdesc =ReallocOrDie(msa->sqdesc,sizeof(char *) * msa->nseqalloc); + + /* Only the slots just added are touched below. They hold indeterminate + * values after the realloc above, so per-sequence buffers must be + * malloc'ed here, never realloc'ed. + */ + for (i = msa->nseqalloc-msa->nseqlump; i < msa->nseqalloc; i++) + { + msa->sqname[i] = NULL; + msa->wgt[i] = -1.0; + + if (msa->sqacc != NULL) msa->sqacc[i] = NULL; + if (msa->sqdesc != NULL) msa->sqdesc[i] = NULL; + + if (msa->alen != 0) + msa->aseq[i] = MallocOrDie(sizeof(char) * (msa->alen+1)); + else msa->aseq[i] = NULL; + msa->sqlen[i] = 0; + + if (msa->ss != NULL) { + if (msa->alen != 0) + msa->ss[i] = MallocOrDie(sizeof(char) * (msa->alen+1)); + else msa->ss[i] = NULL; + msa->sslen[i] = 0; + } + if (msa->sa != NULL) { + if (msa->alen != 0) + msa->sa[i] = MallocOrDie(sizeof(char) *
(msa->alen+1)); + else + msa->sa[i] = NULL; + msa->salen[i] = 0; + } + } + + /* Reallocate and re-init for unparsed #=GS tags, if we have some. + * gs is [0..ngs-1][0..nseq-1][], so we're reallocing the middle + * set of pointers. + */ + if (msa->gs != NULL) + for (i = 0; i < msa->ngs; i++) + { + if (msa->gs[i] != NULL) + { + msa->gs[i] = ReallocOrDie(msa->gs[i], sizeof(char *) * msa->nseqalloc); + for (j = msa->nseqalloc-msa->nseqlump; j < msa->nseqalloc; j++) + msa->gs[i][j] = NULL; + } + } + + /* Reallocate and re-init for unparsed #=GR tags, if we have some. + * gr is [0..ngr-1][0..nseq-1][], so we're reallocing the middle + * set of pointers. + */ + if (msa->gr != NULL) + for (i = 0; i < msa->ngr; i++) + { + if (msa->gr[i] != NULL) + { + msa->gr[i] = ReallocOrDie(msa->gr[i], sizeof(char *) * msa->nseqalloc); + for (j = msa->nseqalloc-msa->nseqlump; j < msa->nseqalloc; j++) + msa->gr[i][j] = NULL; + } + } + + return; +} + +/* Function: MSAFree() + * Date: SRE, Tue May 18 11:20:16 1999 [St. Louis] + * + * Purpose: Free a multiple sequence alignment structure.
+ * + * Args: msa - the alignment + * + * Returns: (void) + */ +void +MSAFree(MSA *msa) +{ + Free2DArray((void **) msa->aseq, msa->nseq); + Free2DArray((void **) msa->sqname, msa->nseq); + Free2DArray((void **) msa->sqacc, msa->nseq); + Free2DArray((void **) msa->sqdesc, msa->nseq); + Free2DArray((void **) msa->ss, msa->nseq); + Free2DArray((void **) msa->sa, msa->nseq); + + if (msa->sqlen != NULL) free(msa->sqlen); + if (msa->wgt != NULL) free(msa->wgt); + + if (msa->name != NULL) free(msa->name); + if (msa->desc != NULL) free(msa->desc); + if (msa->acc != NULL) free(msa->acc); + if (msa->au != NULL) free(msa->au); + if (msa->ss_cons != NULL) free(msa->ss_cons); + if (msa->sa_cons != NULL) free(msa->sa_cons); + if (msa->rf != NULL) free(msa->rf); + if (msa->sslen != NULL) free(msa->sslen); + if (msa->salen != NULL) free(msa->salen); + + Free2DArray((void **) msa->comment, msa->ncomment); + Free2DArray((void **) msa->gf_tag, msa->ngf); + Free2DArray((void **) msa->gf, msa->ngf); + Free2DArray((void **) msa->gs_tag, msa->ngs); + Free3DArray((void ***)msa->gs, msa->ngs, msa->nseq); + Free2DArray((void **) msa->gc_tag, msa->ngc); + Free2DArray((void **) msa->gc, msa->ngc); + Free2DArray((void **) msa->gr_tag, msa->ngr); + Free3DArray((void ***)msa->gr, msa->ngr, msa->nseq); + + GKIFree(msa->index); + GKIFree(msa->gs_idx); + GKIFree(msa->gc_idx); + GKIFree(msa->gr_idx); + + free(msa); +} + + +/* Function: MSASetSeqAccession() + * Date: SRE, Mon Jun 21 04:13:33 1999 [Sanger Centre] + * + * Purpose: Set a sequence accession in an MSA structure. + * Handles some necessary allocation/initialization. 
+ * + * Args: msa - multiple alignment to add accession to + * seqidx - index of sequence to attach accession to + * acc - accession + * + * Returns: void + */ +void +MSASetSeqAccession(MSA *msa, int seqidx, char *acc) +{ + int x; + + if (msa->sqacc == NULL) { + msa->sqacc = MallocOrDie(sizeof(char *) * msa->nseqalloc); + for (x = 0; x < msa->nseqalloc; x++) + msa->sqacc[x] = NULL; + } + msa->sqacc[seqidx] = sre_strdup(acc, -1); +} + +/* Function: MSASetSeqDescription() + * Date: SRE, Mon Jun 21 04:21:09 1999 [Sanger Centre] + * + * Purpose: Set a sequence description in an MSA structure. + * Handles some necessary allocation/initialization: + * the sqdesc array is created (all NULL) on first use. + * The slot is overwritten with the sre_strdup() result; + * a previous entry at seqidx is not freed here. + * + * Args: msa - multiple alignment to add description to + * seqidx - index of sequence to attach description to + * desc - description + * + * Returns: void + */ +void +MSASetSeqDescription(MSA *msa, int seqidx, char *desc) +{ + int x; + + if (msa->sqdesc == NULL) { + msa->sqdesc = MallocOrDie(sizeof(char *) * msa->nseqalloc); + for (x = 0; x < msa->nseqalloc; x++) + msa->sqdesc[x] = NULL; + } + msa->sqdesc[seqidx] = sre_strdup(desc, -1); +} + + +/* Function: MSAAddComment() + * Date: SRE, Tue Jun 1 17:37:21 1999 [St. Louis] + * + * Purpose: Add an (unparsed) comment line to the MSA structure, + * allocating as necessary. + * + * Args: msa - a multiple alignment + * s - comment line to add + * + * Returns: (void) + */ +void +MSAAddComment(MSA *msa, char *s) +{ + /* If this is our first recorded comment, we need to malloc(); + * and if we've filled available space, we need to realloc(). + * Note the arbitrary lumpsize of 10 lines per allocation...
+ */ + if (msa->comment == NULL) { + msa->comment = MallocOrDie (sizeof(char *) * 10); + msa->alloc_ncomment = 10; + } + if (msa->ncomment == msa->alloc_ncomment) { + msa->alloc_ncomment += 10; + msa->comment = ReallocOrDie(msa->comment, sizeof(char *) * msa->alloc_ncomment); + } + + msa->comment[msa->ncomment] = sre_strdup(s, -1); + msa->ncomment++; + return; +} + +/* Function: MSAAddGF() + * Date: SRE, Wed Jun 2 06:53:54 1999 [bus to Madison] + * + * Purpose: Add an unparsed #=GF markup line to the MSA + * structure, allocating as necessary. + * + * Args: msa - a multiple alignment + * tag - markup tag (e.g. "AU") + * value - free text markup (e.g. "Alex Bateman") + * + * Returns: (void) + */ +void +MSAAddGF(MSA *msa, char *tag, char *value) +{ + /* If this is our first recorded unparsed #=GF line, we need to malloc(); + * and if we've filled available space, we need to realloc(). + * Note the arbitrary lumpsize of 10 lines per allocation... + */ + if (msa->gf_tag == NULL) { + msa->gf_tag = MallocOrDie (sizeof(char *) * 10); + msa->gf = MallocOrDie (sizeof(char *) * 10); + msa->alloc_ngf = 10; + } + if (msa->ngf == msa->alloc_ngf) { + msa->alloc_ngf += 10; + msa->gf_tag = ReallocOrDie(msa->gf_tag, sizeof(char *) * msa->alloc_ngf); + msa->gf = ReallocOrDie(msa->gf, sizeof(char *) * msa->alloc_ngf); + } + + msa->gf_tag[msa->ngf] = sre_strdup(tag, -1); + msa->gf[msa->ngf] = sre_strdup(value, -1); + msa->ngf++; + + return; +} + + +/* Function: MSAAddGS() + * Date: SRE, Wed Jun 2 06:57:03 1999 [St. Louis] + * + * Purpose: Add an unparsed #=GS markup line to the MSA + * structure, allocating as necessary. + * + * It's possible that we could get more than one + * of the same type of GS tag per sequence; for + * example, "DR PDB;" structure links in Pfam. + * Hack: handle these by appending to the string, + * in a \n separated fashion. + * + * Args: msa - multiple alignment structure + * tag - markup tag (e.g.
"AC") + * sqidx - index of sequence to assoc markup with (0..nseq-1) + * value - markup (e.g. "P00666") + * + * Returns: (void) + */ +void +MSAAddGS(MSA *msa, char *tag, int sqidx, char *value) +{ + int tagidx; + int i; + + /* Is this an unparsed tag name that we recognize? + * If not, handle adding it to index, and reallocating + * as needed. + */ + if (msa->gs_tag == NULL) /* first tag? init w/ malloc */ + { + msa->gs_idx = GKIInit(); + tagidx = GKIStoreKey(msa->gs_idx, tag); + SQD_DASSERT1((tagidx == 0)); + msa->gs_tag = MallocOrDie(sizeof(char *)); + msa->gs = MallocOrDie(sizeof(char **)); + msa->gs[0] = MallocOrDie(sizeof(char *) * msa->nseqalloc); + for (i = 0; i < msa->nseqalloc; i++) + msa->gs[0][i] = NULL; + } + else + { + /* new tag? */ + tagidx = GKIKeyIndex(msa->gs_idx, tag); + if (tagidx < 0) { /* it's a new tag name; realloc */ + tagidx = GKIStoreKey(msa->gs_idx, tag); + /* since we alloc in blocks of 1, + we always realloc upon seeing + a new tag. The new size is + (element count) * (element size). */ + SQD_DASSERT1((tagidx == msa->ngs)); + msa->gs_tag = ReallocOrDie(msa->gs_tag, (msa->ngs+1) * sizeof(char *)); + msa->gs = ReallocOrDie(msa->gs, (msa->ngs+1) * sizeof(char **)); + msa->gs[msa->ngs] = MallocOrDie(sizeof(char *) * msa->nseqalloc); + for (i = 0; i < msa->nseqalloc; i++) + msa->gs[msa->ngs][i] = NULL; + } + } + + if (tagidx == msa->ngs) { + msa->gs_tag[tagidx] = sre_strdup(tag, -1); + msa->ngs++; + } + + if (msa->gs[tagidx][sqidx] == NULL) /* first annotation of this seq with this tag?
*/ + msa->gs[tagidx][sqidx] = sre_strdup(value, -1); + else { + /* >1 annotation of this seq with this tag; append */ + int len; + if ((len = sre_strcat(&(msa->gs[tagidx][sqidx]), -1, "\n", 1)) < 0) + Die("failed to sre_strcat()"); + if (sre_strcat(&(msa->gs[tagidx][sqidx]), len, value, -1) < 0) + Die("failed to sre_strcat()"); + } + return; +} + +/* Function: MSAAppendGC() + * Date: SRE, Thu Jun 3 06:25:14 1999 [Madison] + * + * Purpose: Add an unparsed #=GC markup line to the MSA + * structure, allocating as necessary. + * + * When called multiple times for the same tag, + * appends value strings together -- used when + * parsing multiblock alignment files, for + * example. + * + * Args: msa - multiple alignment structure + * tag - markup tag (e.g. "CS") + * value - markup, one char per aligned column + * + * Returns: (void) + */ +void +MSAAppendGC(MSA *msa, char *tag, char *value) +{ + int tagidx; + + /* Is this an unparsed tag name that we recognize? + * If not, handle adding it to index, and reallocating + * as needed. + */ + if (msa->gc_tag == NULL) /* first tag? init w/ malloc */ + { + msa->gc_tag = MallocOrDie(sizeof(char *)); + msa->gc = MallocOrDie(sizeof(char **)); + msa->gc_idx = GKIInit(); + tagidx = GKIStoreKey(msa->gc_idx, tag); + SQD_DASSERT1((tagidx == 0)); + msa->gc[0] = NULL; + } + else + { /* new tag? */ + tagidx = GKIKeyIndex(msa->gc_idx, tag); + if (tagidx < 0) { /* it's a new tag name; realloc */ + tagidx = GKIStoreKey(msa->gc_idx, tag); + /* since we alloc in blocks of 1, + we always realloc upon seeing + a new tag. 
*/ + SQD_DASSERT1((tagidx == msa->ngc)); + /* new size is (element count) * (element size) */ + msa->gc_tag = ReallocOrDie(msa->gc_tag, (msa->ngc+1) * sizeof(char *)); + msa->gc = ReallocOrDie(msa->gc, (msa->ngc+1) * sizeof(char *)); + msa->gc[tagidx] = NULL; + } + } + + if (tagidx == msa->ngc) { + msa->gc_tag[tagidx] = sre_strdup(tag, -1); + msa->ngc++; + } + sre_strcat(&(msa->gc[tagidx]), -1, value, -1); + return; +} + +/* Function: MSAGetGC() + * Date: SRE, Fri Aug 13 13:25:57 1999 [St. Louis] + * + * Purpose: Given a tagname for a miscellaneous #=GC column + * annotation, return a pointer to the annotation + * string. + * + * Args: msa - alignment and its annotation + * tag - name of the annotation + * + * Returns: ptr to the annotation string. Caller does *not* + * free; is managed by msa object still. + */ +char * +MSAGetGC(MSA *msa, char *tag) +{ + int tagidx; + + if (msa->gc_idx == NULL) return NULL; + if ((tagidx = GKIKeyIndex(msa->gc_idx, tag)) < 0) return NULL; + return msa->gc[tagidx]; +} + + +/* Function: MSAAppendGR() + * Date: SRE, Thu Jun 3 06:34:38 1999 [Madison] + * + * Purpose: Add an unparsed #=GR markup line to the + * MSA structure, allocating as necessary. + * + * When called multiple times for the same tag, + * appends value strings together -- used when + * parsing multiblock alignment files, for + * example. + * + * Args: msa - multiple alignment structure + * tag - markup tag (e.g. "SS") + * sqidx - index of seq to assoc markup with (0..nseq-1) + * value - markup, one char per aligned column + * + * Returns: (void) + */ +void +MSAAppendGR(MSA *msa, char *tag, int sqidx, char *value) +{ + int tagidx; + int i; + + /* Is this an unparsed tag name that we recognize? + * If not, handle adding it to index, and reallocating + * as needed. + */ + if (msa->gr_tag == NULL) /* first tag?
init w/ malloc */ + { + msa->gr_tag = MallocOrDie(sizeof(char *)); + msa->gr = MallocOrDie(sizeof(char **)); + msa->gr[0] = MallocOrDie(sizeof(char *) * msa->nseqalloc); + msa->gr_idx = GKIInit(); + tagidx = GKIStoreKey(msa->gr_idx, tag); + SQD_DASSERT1((tagidx == 0)); + } + else + { + /* new tag? */ + tagidx = GKIKeyIndex(msa->gr_idx, tag); + if (tagidx < 0) { /* it's a new tag name; realloc */ + tagidx = GKIStoreKey(msa->gr_idx, tag); + /* since we alloc in blocks of 1, + we always realloc upon seeing + a new tag. */ + SQD_DASSERT1((tagidx == msa->ngr)); + msa->gr_tag = ReallocOrDie(msa->gr_tag, (msa->ngr+1) + sizeof(char *)); + msa->gr = ReallocOrDie(msa->gr, (msa->ngr+1) + sizeof(char **)); + msa->gr[msa->ngr] = MallocOrDie(sizeof(char *) * msa->nseqalloc); + for (i = 0; i < msa->nseqalloc; i++) + msa->gr[msa->ngr][i] = NULL; + } + } + + if (tagidx == msa->ngr) { + msa->gr_tag[tagidx] = sre_strdup(tag, -1); + msa->ngr++; + } + sre_strcat(&(msa->gr[tagidx][sqidx]), -1, value, -1); + return; +} + + +/* Function: MSAVerifyParse() + * Date: SRE, Sat Jun 5 14:24:24 1999 [Madison, 1999 worm mtg] + * + * Purpose: Last function called after a multiple alignment is + * parsed. Checks that parse was successful; makes sure + * required information is present; makes sure required + * information is consistent. Some fields that are + * only use during parsing may be freed (sqlen, for + * example). + * + * Some fields in msa may be modified (msa->alen is set, + * for example). + * + * Args: msa - the multiple alignment + * sqname, aseq must be set + * nseq must be correct + * alen need not be set; will be set here. + * wgt will be set here if not already set + * + * Returns: (void) + * Will Die() here with diagnostics on error. + * + * Example: + */ +void +MSAVerifyParse(MSA *msa) +{ + int idx; + + if (msa->nseq == 0) Die("Parse error: no sequences were found for alignment %s", + msa->name != NULL ? 
msa->name : ""); + + msa->alen = msa->sqlen[0]; + + /* We can rely on msa->sqname[] being valid for any index, + * because of the way the line parsers always store any name + * they add to the index. + */ + for (idx = 0; idx < msa->nseq; idx++) + { + /* aseq is required. */ + if (msa->aseq[idx] == NULL) + Die("Parse error: No sequence for %s in alignment %s", msa->sqname[idx], + msa->name != NULL ? msa->name : ""); + /* either all weights must be set, or none of them */ + if ((msa->flags & MSA_SET_WGT) && msa->wgt[idx] == -1.0) + Die("Parse error: some weights are set, but %s doesn't have one in alignment %s", + msa->sqname[idx], + msa->name != NULL ? msa->name : ""); + /* all aseq must be same length. */ + if (msa->sqlen[idx] != msa->alen) + Die("Parse error: sequence %s: length %d, expected %d in alignment %s", + msa->sqname[idx], msa->sqlen[idx], msa->alen, + msa->name != NULL ? msa->name : ""); + /* if SS is present, must have length right */ + if (msa->ss != NULL && msa->ss[idx] != NULL && msa->sslen[idx] != msa->alen) + Die("Parse error: #=GR SS annotation for %s: length %d, expected %d in alignment %s", + msa->sqname[idx], msa->sslen[idx], msa->alen, + msa->name != NULL ? msa->name : ""); + /* if SA is present, must have length right */ + if (msa->sa != NULL && msa->sa[idx] != NULL && msa->salen[idx] != msa->alen) + Die("Parse error: #=GR SA annotation for %s: length %d, expected %d in alignment %s", + msa->sqname[idx], msa->salen[idx], msa->alen, + msa->name != NULL ? msa->name : ""); + } + + /* if cons SS is present, must have length right */ + if (msa->ss_cons != NULL && strlen(msa->ss_cons) != msa->alen) + Die("Parse error: #=GC SS_cons annotation: length %d, expected %d in alignment %s", + strlen(msa->ss_cons), msa->alen, + msa->name != NULL ? 
msa->name : ""); + + /* if cons SA is present, must have length right */ + if (msa->sa_cons != NULL && strlen(msa->sa_cons) != msa->alen) + Die("Parse error: #=GC SA_cons annotation: length %d, expected %d in alignment %s", + strlen(msa->sa_cons), msa->alen, + msa->name != NULL ? msa->name : ""); + + /* if RF is present, must have length right */ + if (msa->rf != NULL && strlen(msa->rf) != msa->alen) + Die("Parse error: #=GC RF annotation: length %d, expected %d in alignment %s", + strlen(msa->rf), msa->alen, + msa->name != NULL ? msa->name : ""); + + /* Check that all or no weights are set */ + if (!(msa->flags & MSA_SET_WGT)) + FSet(msa->wgt, msa->nseq, 1.0); /* default weights */ + + /* Clean up a little from the parser */ + if (msa->sqlen != NULL) { free(msa->sqlen); msa->sqlen = NULL; } + if (msa->sslen != NULL) { free(msa->sslen); msa->sslen = NULL; } + if (msa->salen != NULL) { free(msa->salen); msa->salen = NULL; } + + return; +} + + + + +/* Function: MSAFileOpen() + * Date: SRE, Tue May 18 13:22:01 1999 [St. Louis] + * + * Purpose: Open an alignment database file and prepare + * for reading one alignment, or sequentially + * in the (rare) case of multiple MSA databases + * (e.g. Stockholm format). + * + * Args: filename - name of file to open + * if "-", read stdin + * if it ends in ".gz", read from pipe to gunzip -dc + * format - format of file (e.g. MSAFILE_STOCKHOLM) + * env - environment variable for path (e.g. BLASTDB) + * + * Returns: opened MSAFILE * on success. + * NULL on failure: + * usually, because the file doesn't exist; + * for gzip'ed files, may also mean that gzip isn't in the path. 
+ */ +MSAFILE * +MSAFileOpen(char *filename, int format, char *env) +{ + MSAFILE *afp; + + afp = MallocOrDie(sizeof(MSAFILE)); + if (strcmp(filename, "-") == 0) + { + afp->f = stdin; + afp->do_stdin = TRUE; + afp->do_gzip = FALSE; + afp->fname = sre_strdup("[STDIN]", -1); + afp->ssi = NULL; /* can't index stdin because we can't seek*/ + } +#ifndef SRE_STRICT_ANSI + /* popen(), pclose() aren't portable to non-POSIX systems; disable */ + else if (Strparse("^.*\\.gz$", filename, 0)) + { + char cmd[256]; + + /* Note that popen() will return "successfully" + * if file doesn't exist, because gzip works fine + * and prints an error! So we have to check for + * existence of file ourself. + */ + if (! FileExists(filename)) + Die("%s: file does not exist", filename); + if (strlen(filename) + strlen("gzip -dc ") >= 256) + Die("filename > 255 char in MSAFileOpen()"); + sprintf(cmd, "gzip -dc %s", filename); + if ((afp->f = popen(cmd, "r")) == NULL) + return NULL; + + afp->do_stdin = FALSE; + afp->do_gzip = TRUE; + afp->fname = sre_strdup(filename, -1); + /* we can't index a .gz file, because we can't seek in a pipe afaik */ + afp->ssi = NULL; + } +#endif /*SRE_STRICT_ANSI*/ + else + { + char *ssifile; + char *dir; + + /* When we open a file, it may be either in the current + * directory, or in the directory indicated by the env + * argument - and we have to construct the SSI filename accordingly. + */ + if ((afp->f = fopen(filename, "r")) != NULL) + { + ssifile = MallocOrDie(sizeof(char) * (strlen(filename) + 5)); + sprintf(ssifile, "%s.ssi", filename); + } + else if ((afp->f = EnvFileOpen(filename, env, &dir)) != NULL) + { + char *full; + full = FileConcat(dir, filename); + ssifile = MallocOrDie(sizeof(char) * (strlen(full) + strlen(filename) + 5)); + sprintf(ssifile, "%s.ssi", full); + free(dir); + } + else return NULL; + + afp->do_stdin = FALSE; + afp->do_gzip = FALSE; + afp->fname = sre_strdup(filename, -1); + afp->ssi = NULL; + + /* Open the SSI index file. 
If it doesn't exist, or + * it's corrupt, or some error happens, afp->ssi stays NULL. + */ + SSIOpen(ssifile, &(afp->ssi)); + free(ssifile); + } + + /* Invoke autodetection if we haven't already been told what + * to expect. + */ + if (format == MSAFILE_UNKNOWN) + { + if (afp->do_stdin == TRUE || afp->do_gzip) + Die("Can't autodetect alignment file format from a stdin or gzip pipe"); + format = MSAFileFormat(afp); + if (format == MSAFILE_UNKNOWN) + Die("Can't determine format of multiple alignment file %s", afp->fname); + } + + afp->format = format; + afp->linenumber = 0; + afp->buf = NULL; + afp->buflen = 0; + + return afp; +} + + +/* Function: MSAFilePositionByKey() + * MSAFilePositionByIndex() + * MSAFileRewind() + * + * Date: SRE, Tue Nov 9 19:02:54 1999 [St. Louis] + * + * Purpose: Family of functions for repositioning in + * open MSA files; analogous to a similarly + * named function series in HMMER's hmmio.c. + * + * Args: afp - open alignment file + * offset - disk offset in bytes + * key - key to look up in SSI indices + * idx - index of alignment. + * + * Returns: 0 on failure. + * 1 on success. + * If called on a non-fseek()'able file (e.g. a gzip'ed + * or pipe'd alignment), returns 0 as a failure flag. 
+ */ +int +MSAFileRewind(MSAFILE *afp) +{ + if (afp->do_gzip || afp->do_stdin) return 0; + rewind(afp->f); + return 1; +} +int +MSAFilePositionByKey(MSAFILE *afp, char *key) +{ + int fh; /* filehandle is ignored */ + SSIOFFSET offset; /* offset of the key alignment */ + + if (afp->ssi == NULL) return 0; + if (SSIGetOffsetByName(afp->ssi, key, &fh, &offset) != 0) return 0; + if (SSISetFilePosition(afp->f, &offset) != 0) return 0; + return 1; +} +int +MSAFilePositionByIndex(MSAFILE *afp, int idx) +{ + int fh; /* filehandled is passed but ignored */ + SSIOFFSET offset; /* disk offset of desired alignment */ + + if (afp->ssi == NULL) return 0; + if (SSIGetOffsetByNumber(afp->ssi, idx, &fh, &offset) != 0) return 0; + if (SSISetFilePosition(afp->f, &offset) != 0) return 0; + return 1; +} + + +/* Function: MSAFileRead() + * Date: SRE, Fri May 28 16:01:43 1999 [St. Louis] + * + * Purpose: Read the next msa from an open alignment file. + * This is a wrapper around format-specific calls. + * + * Args: afp - open alignment file + * + * Returns: next alignment, or NULL if out of alignments + */ +MSA * +MSAFileRead(MSAFILE *afp) +{ + MSA *msa = NULL; + + switch (afp->format) { + case MSAFILE_STOCKHOLM: msa = ReadStockholm(afp); break; + case MSAFILE_MSF: msa = ReadMSF(afp); break; + case MSAFILE_A2M: msa = ReadA2M(afp); break; + case MSAFILE_CLUSTAL: msa = ReadClustal(afp); break; + case MSAFILE_SELEX: msa = ReadSELEX(afp); break; + case MSAFILE_PHYLIP: msa = ReadPhylip(afp); break; + default: + Die("MSAFILE corrupted: bad format index"); + } + return msa; +} + +/* Function: MSAFileClose() + * Date: SRE, Tue May 18 14:05:28 1999 [St. Louis] + * + * Purpose: Close an open MSAFILE. + * + * Args: afp - ptr to an open MSAFILE. + * + * Returns: void + */ +void +MSAFileClose(MSAFILE *afp) +{ +#ifndef SRE_STRICT_ANSI /* gzip functionality only on POSIX systems */ + if (afp->do_gzip) pclose(afp->f); +#endif + if (! 
afp->do_stdin) fclose(afp->f); + if (afp->buf != NULL) free(afp->buf); + if (afp->ssi != NULL) SSIClose(afp->ssi); + if (afp->fname != NULL) free(afp->fname); + free(afp); +} + +char * +MSAFileGetLine(MSAFILE *afp) +{ + char *s; + if ((s = sre_fgets(&(afp->buf), &(afp->buflen), afp->f)) == NULL) + return NULL; + afp->linenumber++; + return afp->buf; +} + +void +MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline) +{ + switch (outfmt) { + case MSAFILE_A2M: WriteA2M(stdout, msa); break; + case MSAFILE_CLUSTAL: WriteClustal(stdout, msa); break; + case MSAFILE_MSF: WriteMSF(stdout, msa); break; + case MSAFILE_PHYLIP: WritePhylip(stdout, msa); break; + case MSAFILE_SELEX: WriteSELEX(stdout, msa); break; + case MSAFILE_STOCKHOLM: + if (do_oneline) WriteStockholmOneBlock(stdout, msa); + else WriteStockholm(stdout, msa); + break; + default: + Die("can't write. no such alignment format %d\n", outfmt); + } +} + +/* Function: MSAGetSeqidx() + * Date: SRE, Wed May 19 15:08:25 1999 [St. Louis] + * + * Purpose: From a sequence name, return seqidx appropriate + * for an MSA structure. + * + * 1) try to guess the index. (pass -1 if you can't guess) + * 2) Look up name in msa's hashtable. + * 3) If it's a new name, store in msa's hashtable; + * expand allocs as needed; + * save sqname. + * + * Args: msa - alignment object + * name - a sequence name + * guess - a guess at the right index, or -1 if no guess. + * + * Returns: seqidx + */ +int +MSAGetSeqidx(MSA *msa, char *name, int guess) +{ + int seqidx; + /* can we guess? 
*/ + if (guess >= 0 && guess < msa->nseq && strcmp(name, msa->sqname[guess]) == 0) + return guess; + /* else, a lookup in the index */ + if ((seqidx = GKIKeyIndex(msa->index, name)) >= 0) + return seqidx; + /* else, it's a new name */ + seqidx = GKIStoreKey(msa->index, name); + if (seqidx >= msa->nseqalloc) MSAExpand(msa); + + msa->sqname[seqidx] = sre_strdup(name, -1); + msa->nseq++; + return seqidx; +} + + +/* Function: MSAFromAINFO() + * Date: SRE, Mon Jun 14 11:22:24 1999 [St. Louis] + * + * Purpose: Convert the old aseq/ainfo alignment structure + * to new MSA structure. Enables more rapid conversion + * of codebase to the new world order. + * + * Args: aseq - [0..nseq-1][0..alen-1] alignment + * ainfo - old-style optional info + * + * Returns: MSA * + */ +MSA * +MSAFromAINFO(char **aseq, AINFO *ainfo) +{ + MSA *msa; + int i, j; + + msa = MSAAlloc(ainfo->nseq, ainfo->alen); + for (i = 0; i < ainfo->nseq; i++) + { + strcpy(msa->aseq[i], aseq[i]); + msa->wgt[i] = ainfo->wgt[i]; + msa->sqname[i] = sre_strdup(ainfo->sqinfo[i].name, -1); + msa->sqlen[i] = msa->alen; + GKIStoreKey(msa->index, msa->sqname[i]); + + if (ainfo->sqinfo[i].flags & SQINFO_ACC) + MSASetSeqAccession(msa, i, ainfo->sqinfo[i].acc); + + if (ainfo->sqinfo[i].flags & SQINFO_DESC) + MSASetSeqDescription(msa, i, ainfo->sqinfo[i].desc); + + if (ainfo->sqinfo[i].flags & SQINFO_SS) { + if (msa->ss == NULL) { + msa->ss = MallocOrDie(sizeof(char *) * msa->nseqalloc); + msa->sslen = MallocOrDie(sizeof(int) * msa->nseqalloc); + for (j = 0; j < msa->nseqalloc; j++) { + msa->ss[j] = NULL; + msa->sslen[j] = 0; + } + } + MakeAlignedString(msa->aseq[i], msa->alen, ainfo->sqinfo[i].ss, &(msa->ss[i])); + msa->sslen[i] = msa->alen; + } + + if (ainfo->sqinfo[i].flags & SQINFO_SA) { + if (msa->sa == NULL) { + msa->sa = MallocOrDie(sizeof(char *) * msa->nseqalloc); + msa->salen = MallocOrDie(sizeof(int) * msa->nseqalloc); + for (j = 0; j < msa->nseqalloc; j++) { + msa->sa[j] = NULL; + msa->salen[j] = 0; + } + } + 
MakeAlignedString(msa->aseq[i], msa->alen, ainfo->sqinfo[i].sa, &(msa->sa[i])); + msa->salen[i] = msa->alen; + } + } + /* note that sre_strdup() returns NULL when passed NULL */ + msa->name = sre_strdup(ainfo->name, -1); + msa->desc = sre_strdup(ainfo->desc, -1); + msa->acc = sre_strdup(ainfo->acc, -1); + msa->au = sre_strdup(ainfo->au, -1); + msa->ss_cons = sre_strdup(ainfo->cs, -1); + msa->rf = sre_strdup(ainfo->rf, -1); + if (ainfo->flags & AINFO_TC) + { msa->tc1 = ainfo->tc1; msa->tc2 = ainfo->tc2; msa->flags |= MSA_SET_TC; } + if (ainfo->flags & AINFO_NC) + { msa->nc1 = ainfo->nc1; msa->nc2 = ainfo->nc2; msa->flags |= MSA_SET_NC; } + if (ainfo->flags & AINFO_GA) + { msa->ga1 = ainfo->ga1; msa->ga2 = ainfo->ga2; msa->flags |= MSA_SET_GA; } + + msa->nseq = ainfo->nseq; + msa->alen = ainfo->alen; + return msa; +} + + + + +/* Function: MSAFileFormat() + * Date: SRE, Fri Jun 18 14:26:49 1999 [Sanger Centre] + * + * Purpose: (Attempt to) determine the format of an alignment file. + * Since it rewinds the file pointer when it's done, + * cannot be used on a pipe or gzip'ed file. Works by + * calling SeqfileFormat() from sqio.c, then making sure + * that the format is indeed an alignment. If the format + * comes back as FASTA, it assumes that the format as A2M + * (e.g. aligned FASTA). + * + * Args: fname - file to evaluate + * + * Returns: format code; e.g. MSAFILE_STOCKHOLM + */ +int +MSAFileFormat(MSAFILE *afp) +{ + int fmt; + + fmt = SeqfileFormat(afp->f); + + if (fmt == SQFILE_FASTA) fmt = MSAFILE_A2M; + + if (fmt != MSAFILE_UNKNOWN && ! IsAlignmentFormat(fmt)) + Die("File %s does not appear to be an alignment file;\n\ +rather, it appears to be an unaligned file in %s format.\n\ +I'm expecting an alignment file in this context.\n", + afp->fname, + SeqfileFormat2String(fmt)); + return fmt; +} + + +/* Function: MSAMingap() + * Date: SRE, Mon Jun 28 18:57:54 1999 [on jury duty, St. 
Louis Civil Court] + * + * Purpose: Remove all-gap columns from a multiple sequence alignment + * and its associated per-residue data. + * + * Args: msa - the alignment + * + * Returns: (void) + */ +void +MSAMingap(MSA *msa) +{ + int *useme; /* array of TRUE/FALSE flags for which columns to keep */ + int apos; /* position in original alignment */ + int idx; /* sequence index */ + + useme = MallocOrDie(sizeof(int) * msa->alen); + for (apos = 0; apos < msa->alen; apos++) + { + for (idx = 0; idx < msa->nseq; idx++) + if (! isgap(msa->aseq[idx][apos])) + break; + if (idx == msa->nseq) useme[apos] = FALSE; else useme[apos] = TRUE; + } + MSAShorterAlignment(msa, useme); + free(useme); + return; +} + +/* Function: MSANogap() + * Date: SRE, Wed Nov 17 09:59:51 1999 [St. Louis] + * + * Purpose: Remove all columns from a multiple sequence alignment that + * contain any gaps -- used for filtering before phylogenetic + * analysis. + * + * Args: msa - the alignment + * + * Returns: (void). The alignment is modified, so if you want to keep + * the original for something, make a copy. + */ +void +MSANogap(MSA *msa) +{ + int *useme; /* array of TRUE/FALSE flags for which columns to keep */ + int apos; /* position in original alignment */ + int idx; /* sequence index */ + + useme = MallocOrDie(sizeof(int) * msa->alen); + for (apos = 0; apos < msa->alen; apos++) + { + for (idx = 0; idx < msa->nseq; idx++) + if (isgap(msa->aseq[idx][apos])) + break; + if (idx == msa->nseq) useme[apos] = TRUE; else useme[apos] = FALSE; + } + MSAShorterAlignment(msa, useme); + free(useme); + return; +} + + +/* Function: MSAShorterAlignment() + * Date: SRE, Wed Nov 17 09:49:32 1999 [St. Louis] + * + * Purpose: Given an array "useme" (0..alen-1) of TRUE/FALSE flags, + * where TRUE means "keep this column in the new alignment": + * Remove all columns annotated as "FALSE" in the useme + * array. + * + * Args: msa - the alignment. 
The alignment is changed, so + * if you don't want the original screwed up, make + * a copy of it first. + * useme - TRUE/FALSE flags for columns to keep: 0..alen-1 + * + * Returns: (void) + */ +void +MSAShorterAlignment(MSA *msa, int *useme) +{ + int apos; /* position in original alignment */ + int mpos; /* position in new alignment */ + int idx; /* sequence index */ + int i; /* markup index */ + + /* Since we're minimizing, we can overwrite, using already allocated + * memory. + */ + for (apos = 0, mpos = 0; apos < msa->alen; apos++) + { + if (useme[apos] == FALSE) continue; + + /* shift alignment and associated per-column+per-residue markup */ + if (mpos != apos) + { + for (idx = 0; idx < msa->nseq; idx++) + { + msa->aseq[idx][mpos] = msa->aseq[idx][apos]; + if (msa->ss != NULL && msa->ss[idx] != NULL) msa->ss[idx][mpos] = msa->ss[idx][apos]; + if (msa->sa != NULL && msa->sa[idx] != NULL) msa->sa[idx][mpos] = msa->sa[idx][apos]; + + for (i = 0; i < msa->ngr; i++) + if (msa->gr[i][idx] != NULL) msa->gr[i][idx][mpos] = msa->gr[i][idx][apos]; + } + + if (msa->ss_cons != NULL) msa->ss_cons[mpos] = msa->ss_cons[apos]; + if (msa->sa_cons != NULL) msa->sa_cons[mpos] = msa->sa_cons[apos]; + if (msa->rf != NULL) msa->rf[mpos] = msa->rf[apos]; + + for (i = 0; i < msa->ngc; i++) + msa->gc[i][mpos] = msa->gc[i][apos]; + } + mpos++; + } + + msa->alen = mpos; /* set new length */ + /* null terminate everything */ + for (idx = 0; idx < msa->nseq; idx++) + { + msa->aseq[idx][mpos] = '\0'; + if (msa->ss != NULL && msa->ss[idx] != NULL) msa->ss[idx][mpos] = '\0'; + if (msa->sa != NULL && msa->sa[idx] != NULL) msa->sa[idx][mpos] = '\0'; + + for (i = 0; i < msa->ngr; i++) + if (msa->gr[i][idx] != NULL) msa->gr[i][idx][mpos] = '\0'; + } + + if (msa->ss_cons != NULL) msa->ss_cons[mpos] = '\0'; + if (msa->sa_cons != NULL) msa->sa_cons[mpos] = '\0'; + if (msa->rf != NULL) msa->rf[mpos] = '\0'; + + for (i = 0; i < msa->ngc; i++) + msa->gc[i][mpos] = '\0'; + + return; +} + + +/* 
Function: MSASmallerAlignment() + * Date: SRE, Wed Jun 30 09:56:08 1999 [St. Louis] + * + * Purpose: Given an array "useme" of TRUE/FALSE flags for + * each sequence in an alignment, construct + * and return a new alignment containing only + * those sequences that are flagged useme=TRUE. + * + * Used by routines such as MSAFilterAlignment() + * and MSASampleAlignment(). + * + * Limitations: + * Does not copy unparsed Stockholm markup. + * + * Does not make assumptions about meaning of wgt; + * if you want the new wgt vector renormalized, do + * it yourself with FNorm(new->wgt, new->nseq). + * + * Args: msa -- the original (larger) alignment + * useme -- [0..nseq-1] array of TRUE/FALSE flags; TRUE means include + * this seq in new alignment + * ret_new -- RETURN: new alignment + * + * Returns: void + * ret_new is allocated here; free with MSAFree() + */ +void +MSASmallerAlignment(MSA *msa, int *useme, MSA **ret_new) +{ + MSA *new; /* RETURN: new alignment */ + int nnew; /* number of seqs in new msa (e.g. 
# of TRUEs) */ + int oidx, nidx; /* old, new indices */ + + nnew = 0; + for (oidx = 0; oidx < msa->nseq; oidx++) + if (useme[oidx]) nnew++; + if (nnew == 0) { *ret_new = NULL; return; } + + new = MSAAlloc(nnew, 0); + nidx = 0; + for (oidx = 0; oidx < msa->nseq; oidx++) + if (useme[oidx]) + { + new->aseq[nidx] = sre_strdup(msa->aseq[oidx], msa->alen); + new->sqname[nidx] = sre_strdup(msa->sqname[oidx], msa->alen); + GKIStoreKey(new->index, msa->sqname[oidx]); + new->wgt[nidx] = msa->wgt[oidx]; + if (msa->sqacc != NULL) + MSASetSeqAccession(new, nidx, msa->sqacc[oidx]); + if (msa->sqdesc != NULL) + MSASetSeqDescription(new, nidx, msa->sqdesc[oidx]); + if (msa->ss != NULL && msa->ss[oidx] != NULL) + { + if (new->ss == NULL) new->ss = MallocOrDie(sizeof(char *) * new->nseq); + new->ss[nidx] = sre_strdup(msa->ss[oidx], -1); + } + if (msa->sa != NULL && msa->sa[oidx] != NULL) + { + if (new->sa == NULL) new->sa = MallocOrDie(sizeof(char *) * new->nseq); + new->sa[nidx] = sre_strdup(msa->sa[oidx], -1); + } + nidx++; + } + + new->nseq = nnew; + new->alen = msa->alen; + new->flags = msa->flags; + new->type = msa->type; + new->name = sre_strdup(msa->name, -1); + new->desc = sre_strdup(msa->desc, -1); + new->acc = sre_strdup(msa->acc, -1); + new->au = sre_strdup(msa->au, -1); + new->ss_cons = sre_strdup(msa->ss_cons, -1); + new->sa_cons = sre_strdup(msa->sa_cons, -1); + new->rf = sre_strdup(msa->rf, -1); + new->tc1 = msa->tc1; + new->tc2 = msa->tc2; + new->nc1 = msa->nc1; + new->nc2 = msa->nc2; + new->ga1 = msa->ga1; + new->ga2 = msa->ga2; + free(new->sqlen); + + MSAMingap(new); + *ret_new = new; + return; +} + + +/***************************************************************** + * Retrieval routines + * + * Access to MSA structure data is possible through these routines. + * I'm not doing this because of object oriented design, though + * it might work in my favor someday. 
+ * I'm doing this because lots of MSA data is optional, and + * checking through the chain of possible NULLs is a pain. + *****************************************************************/ + +char * +MSAGetSeqAccession(MSA *msa, int idx) +{ + if (msa->sqacc != NULL && msa->sqacc[idx] != NULL) + return msa->sqacc[idx]; + else + return NULL; +} +char * +MSAGetSeqDescription(MSA *msa, int idx) +{ + if (msa->sqdesc != NULL && msa->sqdesc[idx] != NULL) + return msa->sqdesc[idx]; + else + return NULL; +} +char * +MSAGetSeqSS(MSA *msa, int idx) +{ + if (msa->ss != NULL && msa->ss[idx] != NULL) + return msa->ss[idx]; + else + return NULL; +} +char * +MSAGetSeqSA(MSA *msa, int idx) +{ + if (msa->sa != NULL && msa->sa[idx] != NULL) + return msa->sa[idx]; + else + return NULL; +} diff --git a/forester/archive/RIO/others/hmmer/squid/msa.h b/forester/archive/RIO/others/hmmer/squid/msa.h new file mode 100644 index 0000000..ff52f60 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/msa.h @@ -0,0 +1,286 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +#ifndef SQUID_MSA_INCLUDED +#define SQUID_MSA_INCLUDED + +/* msa.h + * SRE, Mon May 17 10:24:30 1999 + * + * Header file for SQUID's multiple sequence alignment + * manipulation code. 
+ * + * RCS $Id: msa.h,v 1.1.1.1 2005/03/22 08:34:19 cmzmasek Exp $ + */ + +#include /* FILE support */ +#include "gki.h" /* hash table support */ +#include "ssi.h" /* sequence file index support */ +#include "squid.h" /* need SQINFO */ + +/**************************************************** + * Obsolete alignment information, AINFO + * Superceded by MSA structure further below; but we + * need AINFO for the near future for backwards + * compatibility. + ****************************************************/ +/* Structure: aliinfo_s + * + * Purpose: Optional information returned from an alignment file. + * + * flags: always used. Flags for which info is valid/alloced. + * + * alen: mandatory. Alignments are always flushed right + * with gaps so that all aseqs are the same length, alen. + * Available for all alignment formats. + * + * nseq: mandatory. Aligned seqs are indexed 0..nseq-1. + * + * wgt: 0..nseq-1 vector of sequence weights. Mandatory. + * If not explicitly set, weights are initialized to 1.0. + * + * cs: 0..alen-1, just like the alignment. Contains single-letter + * secondary structure codes for consensus structure; "<>^+" + * for RNA, "EHL." for protein. May be NULL if unavailable + * from seqfile. Only available for SELEX format files. + * + * rf: 0..alen-1, just like the alignment. rf is an arbitrary string + * of characters, used for annotating columns. Blanks are + * interpreted as non-canonical columns and anything else is + * considered canonical. Only available from SELEX files. + * + * sqinfo: mandatory. Array of 0..nseq-1 + * per-sequence information structures, carrying + * name, id, accession, coords. 
+ * + */ +struct aliinfo_s { + int flags; /* flags for what info is valid */ + int alen; /* length of alignment (columns) */ + int nseq; /* number of seqs in alignment */ + float *wgt; /* sequence weights [0..nseq-1] */ + char *cs; /* consensus secondary structure string */ + char *rf; /* reference coordinate system */ + struct seqinfo_s *sqinfo; /* name, id, coord info for each sequence */ + + /* Pfam/HMMER pick-ups */ + char *name; /* name of alignment */ + char *desc; /* description of alignment */ + char *acc; /* accession of alignment */ + char *au; /* "author" information */ + float tc1, tc2; /* trusted score cutoffs (per-seq, per-domain) */ + float nc1, nc2; /* noise score cutoffs (per-seq, per-domain) */ + float ga1, ga2; /* gathering cutoffs */ +}; +typedef struct aliinfo_s AINFO; +#define AINFO_TC (1 << 0) +#define AINFO_NC (1 << 1) +#define AINFO_GA (1 << 2) + +/***************************************************************** + * MSA + * SRE, Sun Jun 27 15:03:35 1999 [TW 723 over Greenland] + * + * Defines the new data structure and API for multiple + * sequence alignment i/o. + *****************************************************************/ + +/* Structure: MSA + * SRE, Tue May 18 11:33:08 1999 + * + * Our object for a multiple sequence alignment. + */ +typedef struct msa_struct { + /* Mandatory information associated with the alignment. + */ + char **aseq; /* the alignment itself, [0..nseq-1][0..alen-1] */ + char **sqname; /* names of sequences, [0..nseq-1][0..alen-1] */ + float *wgt; /* sequence weights [0..nseq-1] */ + int alen; /* length of alignment (columns) */ + int nseq; /* number of seqs in alignment */ + + /* Optional information that we understand, and might have. 
+ */ + int flags; /* flags for what optional info is valid */ + int type; /* kOtherSeq, kRNA/hmmNUCLEIC, or kAmino/hmmAMINO */ + char *name; /* name of alignment, or NULL */ + char *desc; /* description of alignment, or NULL */ + char *acc; /* accession of alignment, or NULL */ + char *au; /* "author" information, or NULL */ + char *ss_cons; /* consensus secondary structure string, or NULL */ + char *sa_cons; /* consensus surface accessibility string, or NULL */ + char *rf; /* reference coordinate system, or NULL */ + char **sqacc; /* accession numbers for individual sequences */ + char **sqdesc; /* description lines for individual sequences */ + char **ss; /* per-seq secondary structure annotation, or NULL */ + char **sa; /* per-seq surface accessibility annotation, or NULL */ + float tc1, tc2; /* trusted score cutoffs (per-seq, per-domain) */ + float nc1, nc2; /* noise score cutoffs (per-seq, per-domain) */ + float ga1, ga2; /* gathering cutoffs (per-seq, per-domain) */ + + /* Optional information that we don't understand. + * That is, we know what type of information it is, but it's + * either (interpreted as) free-text comment, or it's Stockholm + * markup with unfamiliar tags. 
+ */ + char **comment; /* free text comments, or NULL */ + int ncomment; /* number of comment lines */ + int alloc_ncomment; /* number of comment lines alloc'ed */ + + char **gf_tag; /* markup tags for unparsed #=GF lines */ + char **gf; /* annotations for unparsed #=GF lines */ + int ngf; /* number of unparsed #=GF lines */ + int alloc_ngf; /* number of gf lines alloc'ed */ + + char **gs_tag; /* markup tags for unparsed #=GS lines */ + char ***gs; /* [0..ngs-1][0..nseq-1][free text] markup */ + GKI *gs_idx; /* hash of #=GS tag types */ + int ngs; /* number of #=GS tag types */ + + char **gc_tag; /* markup tags for unparsed #=GC lines */ + char **gc; /* [0..ngc-1][0..alen-1] markup */ + GKI *gc_idx; /* hash of #=GC tag types */ + int ngc; /* number of #=GC tag types */ + + char **gr_tag; /* markup tags for unparsed #=GR lines */ + char ***gr; /* [0..ngr][0..nseq-1][0..alen-1] markup */ + GKI *gr_idx; /* hash of #=GR tag types */ + int ngr; /* number of #=GR tag types */ + + /* Stuff we need for our own maintenance of the data structure + */ + GKI *index; /* name ->seqidx hash table */ + int nseqalloc; /* number of seqs currently allocated for */ + int nseqlump; /* lump size for dynamic expansions of nseq */ + int *sqlen; /* individual sequence lengths during parsing */ + int *sslen; /* individual ss lengths during parsing */ + int *salen; /* individual sa lengths during parsing */ + int lastidx; /* last index we saw; use for guessing next */ +} MSA; +#define MSA_SET_TC (1 << 0) +#define MSA_SET_NC (1 << 1) +#define MSA_SET_GA (1 << 2) +#define MSA_SET_WGT (1 << 3) + +/* Structure: MSAFILE + * SRE, Tue May 18 11:36:54 1999 + * + * Defines an alignment file that's open for reading. + */ +typedef struct msafile_struct { + FILE *f; /* open file pointer */ + char *fname; /* name of file. 
used for diagnostic output */ + int linenumber; /* what line are we on in the file */ + + char *buf; /* buffer for line input w/ sre_fgets() */ + int buflen; /* current allocated length for buf */ + + SSIFILE *ssi; /* open SSI index file; or NULL, if none. */ + + int do_gzip; /* TRUE if f is a pipe from gzip -dc (need pclose(f)) */ + int do_stdin; /* TRUE if f is stdin (don't close f, not our problem) */ + int format; /* format of alignment file we're reading */ +} MSAFILE; + + +/* Alignment file formats. + * Must coexist with sqio.c/squid.h unaligned file format codes. + * Rules: + * - 0 is an unknown/unassigned format + * - <100 reserved for unaligned formats + * - >100 reserved for aligned formats + */ +#define MSAFILE_UNKNOWN 0 /* unknown format */ +#define MSAFILE_STOCKHOLM 101 /* Pfam/HMMER's Stockholm format */ +#define MSAFILE_SELEX 102 /* Obsolete(!): old HMMER/SELEX format */ +#define MSAFILE_MSF 103 /* GCG MSF format */ +#define MSAFILE_CLUSTAL 104 /* Clustal V/W format */ +#define MSAFILE_A2M 105 /* aligned FASTA (A2M is UCSC terminology) */ +#define MSAFILE_PHYLIP 106 /* Felsenstein's PHYLIP format */ +#define MSAFILE_EPS 107 /* Encapsulated PostScript (output only) */ + +#define IsAlignmentFormat(fmt) ((fmt) > 100) + + +/* from msa.c + */ +extern MSAFILE *MSAFileOpen(char *filename, int format, char *env); +extern MSA *MSAFileRead(MSAFILE *afp); +extern void MSAFileClose(MSAFILE *afp); +extern void MSAFree(MSA *msa); +extern void MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline); + +extern int MSAFileRewind(MSAFILE *afp); +extern int MSAFilePositionByKey(MSAFILE *afp, char *key); +extern int MSAFilePositionByIndex(MSAFILE *afp, int idx); + +extern int MSAFileFormat(MSAFILE *afp); +extern MSA *MSAAlloc(int nseq, int alen); +extern void MSAExpand(MSA *msa); +extern char *MSAFileGetLine(MSAFILE *afp); +extern void MSASetSeqAccession(MSA *msa, int seqidx, char *acc); +extern void MSASetSeqDescription(MSA *msa, int seqidx, char *desc); +extern 
void MSAAddComment(MSA *msa, char *s); +extern void MSAAddGF(MSA *msa, char *tag, char *value); +extern void MSAAddGS(MSA *msa, char *tag, int seqidx, char *value); +extern void MSAAppendGC(MSA *msa, char *tag, char *value); +extern char *MSAGetGC(MSA *msa, char *tag); +extern void MSAAppendGR(MSA *msa, char *tag, int seqidx, char *value); +extern void MSAVerifyParse(MSA *msa); +extern int MSAGetSeqidx(MSA *msa, char *name, int guess); + +extern MSA *MSAFromAINFO(char **aseq, AINFO *ainfo); + +extern void MSAMingap(MSA *msa); +extern void MSANogap(MSA *msa); +extern void MSAShorterAlignment(MSA *msa, int *useme); +extern void MSASmallerAlignment(MSA *msa, int *useme, MSA **ret_new); + +extern char *MSAGetSeqAccession(MSA *msa, int idx); +extern char *MSAGetSeqDescription(MSA *msa, int idx); +extern char *MSAGetSeqSS(MSA *msa, int idx); +extern char *MSAGetSeqSA(MSA *msa, int idx); + +/* from a2m.c + */ +extern MSA *ReadA2M(MSAFILE *afp); +extern void WriteA2M(FILE *fp, MSA *msa); + +/* from clustal.c + */ +extern MSA *ReadClustal(MSAFILE *afp); +extern void WriteClustal(FILE *fp, MSA *msa); + +/* from eps.c + */ +extern void EPSWriteSmallMSA(FILE *fp, MSA *msa); + +/* from msf.c + */ +extern MSA *ReadMSF(MSAFILE *afp); +extern void WriteMSF(FILE *fp, MSA *msa); + +/* from phylip.c + */ +extern MSA *ReadPhylip(MSAFILE *afp); +extern void WritePhylip(FILE *fp, MSA *msa); + +/* from selex.c + */ +extern MSA *ReadSELEX(MSAFILE *afp); +extern void WriteSELEX(FILE *fp, MSA *msa); + +/* from stockholm.c + */ +extern MSA *ReadStockholm(MSAFILE *afp); +extern void WriteStockholm(FILE *fp, MSA *msa); +extern void WriteStockholmOneBlock(FILE *fp, MSA *msa); + +#endif /*SQUID_MSA_INCLUDED*/ diff --git a/forester/archive/RIO/others/hmmer/squid/msf.c b/forester/archive/RIO/others/hmmer/squid/msf.c new file mode 100644 index 0000000..ffbfa14 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/msf.c @@ -0,0 +1,389 @@ 
+/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* msf.c + * SRE, Sun Jul 11 16:17:32 1993 + * + * Import/export of GCG MSF multiple sequence alignment + * formatted files. Designed using format specifications + * kindly provided by Steve Smith of Genetics Computer Group. + * + * RCS $Id: msf.c,v 1.1.1.1 2005/03/22 08:34:20 cmzmasek Exp $ + */ + +#include +#include +#include +#include +#include +#include "squid.h" +#include "msa.h" + +#ifdef TESTDRIVE_MSF +/***************************************************************** + * msf.c test driver: + * cc -DTESTDRIVE_MSF -g -O2 -Wall -o test msf.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c sqio.c alignio.c selex.c interleaved.c types.c -lm + * + */ +int +main(int argc, char **argv) +{ + MSAFILE *afp; + MSA *msa; + char *file; + + file = argv[1]; + + if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL) + Die("Couldn't open %s\n", file); + + while ((msa = ReadMSF(afp)) != NULL) + { + WriteMSF(stdout, msa); + MSAFree(msa); + } + + MSAFileClose(afp); + exit(0); +} +/******************************************************************/ +#endif /* testdrive_msf */ + + + +/* Function: ReadMSF() + * Date: SRE, Tue Jun 1 08:07:22 1999 [St. Louis] + * + * Purpose: Parse an alignment read from an open MSF format + * alignment file. (MSF is a single-alignment format.) + * Return the alignment, or NULL if we've already + * read the alignment. 
+ * + * Args: afp - open alignment file + * + * Returns: MSA * - an alignment object + * caller responsible for an MSAFree() + * NULL if no more alignments + * + * Diagnostics: + * Will Die() here with a (potentially) useful message + * if a parsing error occurs. + */ +MSA * +ReadMSF(MSAFILE *afp) +{ + MSA *msa; + char *s; + int alleged_alen; + int alleged_type; + int alleged_checksum; + char *tok; + char *sp; + int slen; + int sqidx; + char *name; + char *seq; + + if (feof(afp->f)) return NULL; + if ((s = MSAFileGetLine(afp)) == NULL) return NULL; + + /* The first line is the header. + * This is a new-ish GCG feature. Don't count on it, so + * we can be a bit more tolerant towards non-GCG software + * generating "MSF" files. + */ + msa = MSAAlloc(10, 0); + if (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0) { + msa->type = kAmino; + if ((s = MSAFileGetLine(afp)) == NULL) return NULL; + } else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) { + msa->type = kRNA; + if ((s = MSAFileGetLine(afp)) == NULL) return NULL; + } + + /* Now we're in the free text comment section of the MSF file. + * It ends when we see the "MSF: Type: Check: .." line. + * This line must be present. + */ + do + { + if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) && + Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3)) + { + alleged_alen = atoi(sqd_parse[0]); + switch (*(sqd_parse[1])) { + case 'N' : alleged_type = kRNA; break; + case 'P' : alleged_type = kAmino; break; + case 'X' : alleged_type = kOtherSeq; break; + default : alleged_type = kOtherSeq; + } + alleged_checksum = atoi(sqd_parse[3]); + if (msa->type == kOtherSeq) msa->type = alleged_type; + break; /* we're done with comment section. */ + } + if (! IsBlankline(s)) + MSAAddComment(msa, s); + } while ((s = MSAFileGetLine(afp)) != NULL); + + /* Now we're in the name section. 
+ * GCG has a relatively poorly documented feature: only sequences that + * appear in this list will be read from the alignment section. Commenting + * out sequences in the name list (by preceding them with "!") is + * allowed as a means of manually defining subsets of sequences in + * the alignment section. We can support this feature reasonably + * easily because of the hash table for names in the MSA: we + * only add names to the hash table when we see 'em in the name section. + */ + while ((s = MSAFileGetLine(afp)) != NULL) + { + while ((*s == ' ' || *s == '\t') && *s) s++; /* skip leading whitespace */ + + if (*s == '\n') continue; /* skip blank lines */ + else if (*s == '!') MSAAddComment(msa, s); + else if ((sp = strstr(s, "Name:")) != NULL) + { + /* We take the name and the weigh, and that's it */ + sp += 5; + tok = sre_strtok(&sp, " \t", &slen); /* */ + sqidx = GKIStoreKey(msa->index, tok); + if (sqidx >= msa->nseqalloc) MSAExpand(msa); + msa->sqname[sqidx] = sre_strdup(tok, slen); + msa->nseq++; + + if ((sp = strstr(sp, "Weight:")) == NULL) + Die("No Weight: on line %d for %s in name section of MSF file %s\n", + afp->linenumber, msa->sqname[sqidx], afp->fname); + sp += 7; + tok = sre_strtok(&sp, " \t", &slen); + msa->wgt[sqidx] = atof(tok); + msa->flags |= MSA_SET_WGT; + } + else if (strncmp(s, "//", 2) == 0) + break; + else + { + Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n", + afp->linenumber, afp->fname, s); + squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */ + return NULL; + } + + } + + /* And now we're in the sequence section. + * As discussed above, if we haven't seen a sequence name, then we + * don't include the sequence in the alignment. + * Also, watch out for coordinate-only lines. 
+ */ + while ((s = MSAFileGetLine(afp)) != NULL) + { + sp = s; + if ((name = sre_strtok(&sp, " \t", NULL)) == NULL) continue; + if ((seq = sre_strtok(&sp, "\n", &slen)) == NULL) continue; + + /* The test for a coord line: digits starting both fields + */ + if (isdigit(*name) && isdigit(*seq)) + continue; + + /* It's not blank, and it's not a coord line: must be sequence + */ + sqidx = GKIKeyIndex(msa->index, name); + if (sqidx < 0) continue; /* not a sequence we recognize */ + + msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen); + } + + /* We've left blanks in the aseqs; take them back out. + */ + for (sqidx = 0; sqidx < msa->nseq; sqidx++) + { + if (msa->aseq[sqidx] == NULL) + Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname); + + for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++) + { + if (*s == ' ' || *s == '\t') { + msa->sqlen[sqidx]--; + } else { + *sp = *s; + sp++; + } + } + *sp = '\0'; + } + + MSAVerifyParse(msa); /* verifies, and also sets alen and wgt. */ + return msa; +} + + +/* Function: WriteMSF() + * Date: SRE, Mon May 31 11:25:18 1999 [St. Louis] + * + * Purpose: Write an alignment in MSF format to an open file. + * + * Args: fp - file that's open for writing. + * msa - alignment to write. + * + * Note that msa->type, usually optional, must be + * set for WriteMSF to work. If it isn't, a fatal + * error is generated. 
+ * + * Returns: (void) + */ +void +WriteMSF(FILE *fp, MSA *msa) +{ + time_t now; /* current time as a time_t */ + char date[64]; /* today's date in GCG's format "October 3, 1996 15:57" */ + char **gcg_aseq; /* aligned sequences with gaps converted to GCG format */ + char **gcg_sqname; /* sequence names with GCG-valid character sets */ + int idx; /* counter for sequences */ + char *s; /* pointer into sqname or seq */ + int len; /* tmp variable for name lengths */ + int namelen; /* maximum name length used */ + int pos; /* position counter */ + char buffer[51]; /* buffer for writing seq */ + int i; /* another position counter */ + + /***************************************************************** + * Make copies of sequence names and sequences. + * GCG recommends that name characters should only contain + * alphanumeric characters, -, or _ + * Some GCG and GCG-compatible software is sensitive to this. + * We silently convert all other characters to '_'. + * + * For sequences, GCG allows only ~ and . for gaps. + * Otherwise, everthing is interpreted as a residue; + * so squid's IUPAC-restricted chars are fine. ~ means + * an external gap. . means an internal gap. + *****************************************************************/ + + /* make copies that we can edit */ + gcg_aseq = MallocOrDie(sizeof(char *) * msa->nseq); + gcg_sqname = MallocOrDie(sizeof(char *) * msa->nseq); + for (idx = 0; idx < msa->nseq; idx++) + { + gcg_aseq[idx] = sre_strdup(msa->aseq[idx], msa->alen); + gcg_sqname[idx] = sre_strdup(msa->sqname[idx], -1); + } + /* alter names as needed */ + for (idx = 0; idx < msa->nseq; idx++) + for (s = gcg_sqname[idx]; *s != '\0'; s++) + if (! 
isalnum((int) *s) && *s != '-' && *s != '_') + *s = '_'; + /* alter gap chars in seq */ + for (idx = 0; idx < msa->nseq; idx++) + { + for (s = gcg_aseq[idx]; *s != '\0' && isgap(*s); s++) + *s = '~'; + for (; *s != '\0'; s++) + if (isgap(*s)) *s = '.'; + for (pos = msa->alen-1; pos > 0 && isgap(gcg_aseq[idx][pos]); pos--) + gcg_aseq[idx][pos] = '~'; + } + /* calculate max namelen used */ + namelen = 0; + for (idx = 0; idx < msa->nseq; idx++) + if ((len = strlen(msa->sqname[idx])) > namelen) + namelen = len; + + /***************************************************** + * Write the MSF header + *****************************************************/ + /* required file type line */ + if (msa->type == kOtherSeq) + msa->type = GuessAlignmentSeqtype(msa->aseq, msa->nseq); + + if (msa->type == kRNA) fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n"); + else if (msa->type == kDNA) fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n"); + else if (msa->type == kAmino) fprintf(fp, "!!AA_MULTIPLE_ALIGNMENT 1.0\n"); + else if (msa->type == kOtherSeq) + Die("WriteMSF(): couldn't guess whether that alignment is RNA or protein.\n"); + else + Die("Invalid sequence type %d in WriteMSF()\n", msa->type); + + /* free text comments */ + if (msa->ncomment > 0) + { + for (idx = 0; idx < msa->ncomment; idx++) + fprintf(fp, "%s\n", msa->comment[idx]); + fprintf(fp, "\n"); + } + /* required checksum line */ + now = time(NULL); + if (strftime(date, 64, "%B %d, %Y %H:%M", localtime(&now)) == 0) + Die("What time is it on earth? strftime() failed in WriteMSF().\n"); + fprintf(fp, " %s MSF: %d Type: %c %s Check: %d ..\n", + msa->name != NULL ? msa->name : "squid.msf", + msa->alen, + msa->type == kRNA ? 
'N' : 'P', + date, + GCGMultchecksum(gcg_aseq, msa->nseq)); + fprintf(fp, "\n"); + + /***************************************************** + * Names/weights section + *****************************************************/ + + for (idx = 0; idx < msa->nseq; idx++) + { + fprintf(fp, " Name: %-*.*s Len: %5d Check: %4d Weight: %.2f\n", + namelen, namelen, + gcg_sqname[idx], + msa->alen, + GCGchecksum(gcg_aseq[idx], msa->alen), + msa->wgt[idx]); + } + fprintf(fp, "\n"); + fprintf(fp, "//\n"); + + /***************************************************** + * Write the sequences + *****************************************************/ + + for (pos = 0; pos < msa->alen; pos += 50) + { + fprintf(fp, "\n"); /* Blank line between sequence blocks */ + + /* Coordinate line */ + len = (pos + 50) > msa->alen ? msa->alen - pos : 50; + if (len > 10) + fprintf(fp, "%*s %-6d%*s%6d\n", namelen, "", + pos+1, + len + ((len-1)/10) - 12, "", + pos + len); + else + fprintf(fp, "%*s %-6d\n", namelen, "", pos+1); + + for (idx = 0; idx < msa->nseq; idx++) + { + fprintf(fp, "%-*s ", namelen, gcg_sqname[idx]); + /* get next line's worth of 50 from seq */ + strncpy(buffer, gcg_aseq[idx] + pos, 50); + buffer[50] = '\0'; + /* draw the sequence line */ + for (i = 0; i < len; i++) + { + if (! 
(i % 10)) fputc(' ', fp); + fputc(buffer[i], fp); + } + fputc('\n', fp); + } + } + + Free2DArray((void **) gcg_aseq, msa->nseq); + Free2DArray((void **) gcg_sqname, msa->nseq); + return; +} + + + diff --git a/forester/archive/RIO/others/hmmer/squid/phylip.c b/forester/archive/RIO/others/hmmer/squid/phylip.c new file mode 100644 index 0000000..e2980f1 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/phylip.c @@ -0,0 +1,174 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* phylip.c + * SRE, Mon Jun 14 14:08:33 1999 [St. Louis] + * + * Import/export of PHYLIP interleaved multiple sequence alignment + * format files. + * + * RCS $Id: phylip.c,v 1.1.1.1 2005/03/22 08:34:25 cmzmasek Exp $ + */ + +#include +#include +#include +#include +#include "squid.h" +#include "msa.h" + +#ifdef TESTDRIVE_PHYLIP +/***************************************************************** + * phylip.c test driver: + * + */ +int +main(int argc, char **argv) +{ + MSAFILE *afp; + MSA *msa; + char *file; + + file = argv[1]; + + if ((afp = MSAFileOpen(file, MSAFILE_UNKNOWN, NULL)) == NULL) + Die("Couldn't open %s\n", file); + + printf("format %d\n", afp->format); + + while ((msa = ReadPhylip(afp)) != NULL) + { + WritePhylip(stdout, msa); + MSAFree(msa); + } + + MSAFileClose(afp); + exit(0); +} +/******************************************************************/ +#endif /* testdrive_phylip */ + + + +/* Function: ReadPhylip() + * Date: SRE, Fri Jun 18 12:59:37 1999 [Sanger Centre] + * + * Purpose: Parse an alignment from an open Phylip format + * alignment file. Phylip is a single-alignment format. 
+ * Return the alignment, or NULL if we have no data. + * + * Args: afp - open alignment file + * + * Returns: MSA * - an alignment object + * Caller responsible for an MSAFree() + * NULL if no more alignments + */ +MSA * +ReadPhylip(MSAFILE *afp) +{ + MSA *msa; + char *s, *s1, *s2; + char name[11]; /* seq name max len = 10 char */ + int nseq, alen; + int idx; /* index of current sequence */ + int slen; + int nblock; + + if (feof(afp->f)) return NULL; + + /* Skip until we see a nonblank line; it's the header, + * containing nseq/alen + */ + nseq = 0; alen = 0; + while ((s = MSAFileGetLine(afp)) != NULL) + { + if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue; + if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) + Die("Failed to parse nseq/alen from first line of PHYLIP file %s\n", afp->fname); + if (! IsInt(s1) || ! IsInt(s2)) + Die("nseq and/or alen not an integer in first line of PHYLIP file %s\n", afp->fname); + nseq = atoi(s1); + alen = atoi(s2); + break; + } + + msa = MSAAlloc(nseq, 0); + idx = 0; + nblock = 0; + while ((s = MSAFileGetLine(afp)) != NULL) + { + /* ignore blank lines. nonblank lines start w/ nonblank char */ + if (isspace(*s)) continue; + /* First block has seq names */ + if (nblock == 0) { + strncpy(name, s, 10); + name[10] = '\0'; + GKIStoreKey(msa->index, name); + msa->sqname[idx] = sre_strdup(name, -1); + s += 10; + } + /* be careful of trailing whitespace on lines */ + if ((s1 = sre_strtok(&s, WHITESPACE, &slen)) == NULL) + Die("Failed to parse sequence at line %d of PHYLIP file %s\n", + afp->linenumber, afp->fname); + msa->sqlen[idx] = sre_strcat(&(msa->aseq[idx]), msa->sqlen[idx], s1, slen); + + idx++; + if (idx == nseq) { idx = 0; nblock++; } + } + msa->nseq = nseq; + MSAVerifyParse(msa); /* verifies; sets alen, wgt; frees sqlen[] */ + return msa; +} + + + +/* Function: WritePhylip() + * Date: SRE, Fri Jun 18 12:07:41 1999 [Sanger Centre] + * + * Purpose: Write an alignment in Phylip format to an open file. 
+ * + * Args: fp - file that's open for writing. + * msa - alignment to write. + * + * Returns: (void) + */ +void +WritePhylip(FILE *fp, MSA *msa) +{ + int idx; /* counter for sequences */ + int cpl = 50; /* 50 seq char per line */ + char buf[51]; /* buffer for writing seq */ + int pos; + + /* First line has nseq, alen + */ + fprintf(fp, " %d %d\n", msa->nseq, msa->alen); + + /* Alignment section. + * PHYLIP is a multiblock format, blocks (optionally) separated + * by blanks; names only attached to first block. Names are + * restricted to ten char; we achieve this by simple truncation (!). + * (Do we need to convert gap characters from our ./- convention?) + */ + for (pos = 0; pos < msa->alen; pos += cpl) + { + if (pos > 0) fprintf(fp, "\n"); + + for (idx = 0; idx < msa->nseq; idx++) + { + strncpy(buf, msa->aseq[idx] + pos, cpl); + buf[cpl] = '\0'; + if (pos > 0) fprintf(fp, "%s\n", buf); + else fprintf(fp, "%-10.10s%s\n", msa->sqname[idx], buf); + } + } + return; +} diff --git a/forester/archive/RIO/others/hmmer/squid/revcomp.c b/forester/archive/RIO/others/hmmer/squid/revcomp.c new file mode 100644 index 0000000..a245e2b --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/revcomp.c @@ -0,0 +1,62 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. 
+ *****************************************************************/ + +/* revcomp.c + * + * Reverse complement of a IUPAC character string + * RCS $Id: revcomp.c,v 1.1.1.1 2005/03/22 08:34:16 cmzmasek Exp $ + */ + +#include +#include +#include +#include "squid.h" + + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + + +char * +revcomp(char *comp, char *seq) +{ + long bases; + char *bckp, *fwdp; + int idx; + long pos; + int c; + + if (comp == NULL) return NULL; + if (seq == NULL) return NULL; + bases = strlen(seq); + + fwdp = comp; + bckp = seq + bases -1; + for (pos = 0; pos < bases; pos++) + { + c = *bckp; + c = sre_toupper(c); + for (idx = 0; c != iupac[idx].sym && idx < IUPACSYMNUM; idx++); + if (idx == IUPACSYMNUM) + { + Warn("Can't reverse complement an %c, pal. Using N.", c); + *fwdp = 'N'; + } + else + *fwdp = iupac[idx].symcomp; + if (islower((int) *bckp)) *fwdp = (char) sre_tolower((int) *fwdp); + fwdp++; + bckp--; + } + *fwdp = '\0'; + return comp; +} + diff --git a/forester/archive/RIO/others/hmmer/squid/revcomp_main.c b/forester/archive/RIO/others/hmmer/squid/revcomp_main.c new file mode 100644 index 0000000..130bff1 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/revcomp_main.c @@ -0,0 +1,93 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. 
+ *****************************************************************/ + +/* main for revcomp + * + * revcomp - generate reverse complement of sequences + * SRE, Thu Aug 5 17:36:57 1993 + * RCS $Id: revcomp_main.c,v 1.1.1.1 2005/03/22 08:34:31 cmzmasek Exp $ + */ + +#include +#include +#include "squid.h" +#include "version.h" + +#define OPTIONS "h" + +char usage[] = "Usage: revcomp [-options] \n\ + Reverse complement a nucleic acid sequence.\n\ + Available options:\n\ + -h : help; print version and usage info\n"; + +int +main(int argc, char **argv) +{ + char *seqfile; /* name of sequence file */ + SQFILE *dbfp; /* open sequence file */ + int fmt; /* format of seqfile */ + char *seq; /* sequence */ + SQINFO sqinfo; /* additional sequence info */ + char *rev; /* reverse complement */ + int swap; + + int optchar; /* option character, command line */ + extern int optind; + + /*********************************************** + * Parse command line + ***********************************************/ + + fmt = SQFILE_UNKNOWN; + + while ((optchar = getopt(argc, argv, OPTIONS)) != -1) + switch (optchar) { + case 'h': + printf("revcomp %s, %s\n%s\n", RELEASE, RELEASEDATE, usage); + exit(EXIT_SUCCESS); + default: + Die("%s\n", usage); + } + + if (argc - optind != 1) Die("%s\n", usage); + seqfile = argv[optind]; + + if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL) + Die("Failed to open sequence file %s for reading", seqfile); + + while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo)) + { + if ((rev = (char *) malloc ((sqinfo.len + 1) * sizeof(char))) == NULL) + Die("malloc failed"); + + revcomp(rev, seq); + if (sqinfo.flags & (SQINFO_START | SQINFO_STOP)) + { + swap = sqinfo.start; + sqinfo.start = sqinfo.stop; + sqinfo.stop = swap; + } + /* secondary structure of reverse strand is nonsense + */ + if (sqinfo.flags & SQINFO_SS) + { + sqinfo.flags = sqinfo.flags & ~SQINFO_SS; + free(sqinfo.ss); + } + + WriteSeq(stdout, SQFILE_FASTA, rev, &sqinfo); + + free(rev); + FreeSequence(seq, 
&sqinfo); + } + + SeqfileClose(dbfp); + return 0; +} diff --git a/forester/archive/RIO/others/hmmer/squid/rk.c b/forester/archive/RIO/others/hmmer/squid/rk.c new file mode 100644 index 0000000..9ae0c68 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/rk.c @@ -0,0 +1,134 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* rk.c (originally from rnabob's patsearch.c) + * + * Contains a compiler and a search engine for Rabin-Karp + * based primary sequence pattern searching on encoded + * sequences. + * + * See Sedgewick, _Algorithms_, for a general discussion of + * the Rabin-Karp algorithm. See the rkcomp or rkexec man + * pages for specific details. 
+ * + * RCS $Id: rk.c,v 1.1.1.1 2005/03/22 08:34:16 cmzmasek Exp $ + */ + +#include +#include +#include +#include "squid.h" /* seq encoding utilities and typedefs */ +#include "rk.h" + + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + +Hashseq +rkcomp(char *probe) /* A,C,G,T/U, N probe string, 0-8 nt long */ +{ + Hashseq hashprobe = 0; + char coded[RK_HASHSIZE + 1]; + int len; + int i; + /* check bounds violation on probe */ + if ((len = strlen(probe)) > RK_HASHSIZE) return 0; + /* encode the probe */ + if (seqencode(coded, probe) == 0) return 0; + /* pack the probe into a Hashseq */ + for (i = 0; i < len; i++) + { + hashprobe <<= 4; + hashprobe |= (Hashseq) coded[i]; + } + /* left adjust as needed */ + for (; i < RK_HASHSIZE; i++) + { + hashprobe <<= 4; + hashprobe |= (Hashseq) NTN; + } + /* return the compiled probe */ + return hashprobe; +} + +int +rkseq(Hashseq hashprobe, /* up to 8 nt packed into the probe */ + char *sequence) /* encoded sequence */ +{ + long i; + long pos = 0; + Hashseq target = 0; + + /* initialize the target hashseq */ + for (i = 0; i < RK_HASHSIZE; i++) + { + if (*(sequence + i) == NTEND) + break; + target <<= 4; + target |= (Hashseq) (*(sequence + i)); + } + + while (*(sequence + pos + RK_HASHSIZE -1) != NTEND) + { +#ifdef DEBUG + printf("hashprobe: "); + writehash(hashprobe); + printf("\ttarget: "); + writehash(target); + printf("\nhashprobe & target: "); + writehash(hashprobe & target); + printf("\n"); +#endif + if ((hashprobe & target) == target) + return ((int) pos); + target <<= 4; + target |= (Hashseq) (*(sequence + pos + RK_HASHSIZE)); + pos++; + } + /* now we deal with an end effect */ + for (i = 0; i < RK_HASHSIZE; i++) + { + target |= (Hashseq) NTN; + if ((hashprobe & target) == target) + return ((int) pos); + target <<=4; + pos++; + } + + return(-1); +} + + +#ifdef DEBUG /* Debugging aids */ + +static void +writehash(Hashseq hashseq) +{ + int idx; + int sym; + + if (hashseq/16) + writehash(hashseq/16); + + sym = (int) 
(hashseq % 16); + if (sym == 0) + putchar('-'); + else + { + for (idx = 0; sym != iupac[idx].code && idx < IUPACSYMNUM; idx++); + if (idx > IUPACSYMNUM) + printf("(%d)", sym); + else + putchar(iupac[idx].sym); + } +} + +#endif diff --git a/forester/archive/RIO/others/hmmer/squid/rk.h b/forester/archive/RIO/others/hmmer/squid/rk.h new file mode 100644 index 0000000..e9ff0b2 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/rk.h @@ -0,0 +1,40 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +#ifndef SQRKH_INCLUDED +#define SQRKH_INCLUDED + +/* rk.h + * + * Header file for Rabin-Karp pattern searching on encoded + * sequence strings. + * + * Sean Eddy, Thu Oct 1 11:45:42 1992 + * RCS $Id: rk.h,v 1.1.1.1 2005/03/22 08:34:16 cmzmasek Exp $ + */ + + + /* expect 32 bits for 8 nt */ +typedef unsigned long Hashseq; + /* but we count to be sure... 
+ RK_HASHSIZE is the number of nt that fit + in one probe */ +#define RK_HASHSIZE (sizeof(Hashseq)*2) + /* empirically, how many nt minimum we require + in a pattern before we abandon rk and + go with something else */ +#define RK_REQUIRE 4 + +extern int rkseq(Hashseq hashprobe, char *sequence); +extern Hashseq rkcomp(char *probe); /* compile a Hashseq from a pattern */ + + + +#endif /* SQRKH_INCLUDED */ diff --git a/forester/archive/RIO/others/hmmer/squid/selex.c b/forester/archive/RIO/others/hmmer/squid/selex.c new file mode 100644 index 0000000..25f63d3 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/selex.c @@ -0,0 +1,814 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* selex.c + * + * SRE, Mon Jun 14 11:08:38 1999 + * SELEX obsolete as the preferred HMMER/SQUID format + * replaced by Stockholm format + * selex support retained for backwards compatibility + * kludged to use the MSA interface + * + * SRE, Mon Jan 30 14:41:49 1995: + * #=SA side chain % surface accessibility annotation supported + * + * SRE, Tue Nov 9 17:40:50 1993: + * major revision. #= special comments and aliinfo_s optional + * alignment info support added. Support for #=CS (consensus + * secondary structure), #=SS (individual secondary structure), + * #=RF (reference coordinate system), #=SQ (per-sequence header info), + * and #=AU ("author") added. + * + * Fri Dec 4 17:43:24 1992, SRE: + * Reading and writing aligned sequences to/from disk files. + * Implements a new, broader specification of SELEX format + * and supercedes alignio.c. + * + * SELEX format is documented in Docs/formats.tex. 
+ **************************************************************************** + * RCS $Id: selex.c,v 1.1.1.1 2005/03/22 08:34:24 cmzmasek Exp $ + */ + +#include +#include +#include +#include +#include +#include "squid.h" +#include "msa.h" + +static int copy_alignment_line(char *aseq, int apos, int name_rcol, + char *buffer, int lcol, int rcol, char gapsym); +static void actually_write_selex(FILE *fp, MSA *msa, int cpl); + +static char commentsyms[] = "%#"; + +/* Function: ReadSELEX() + * Date: SRE, Sun Jun 6 18:24:09 1999 [St. Louis] + * + * Purpose: Parse an alignment read from an open SELEX format + * alignment file. (SELEX is a single alignment format). + * Return the alignment, or NULL if we've already read the + * alignment or there's no alignment data in the file. + * + * Limitations: SELEX is the only remaining multipass parser for + * alignment files. It cannot read from gzip or from stdin. + * It Die()'s here if you try. The reason for this + * that SELEX allows space characters as gaps, so we don't + * know the borders of an alignment block until we've seen + * the whole block. I could rewrite to allow single-pass + * parsing (by storing the whole block in memory) but + * since SELEX is now legacy, why bother. + * + * Note that the interface is totally kludged: fastest + * possible adaptation of old ReadSELEX() to the new + * MSA interface. + * + * Args: afp - open alignment file + * + * Returns: MSA * - an alignment object + * caller responsible for an MSAFree() + * NULL if no alignment data. 
+ */ +MSA * +ReadSELEX(MSAFILE *afp) +{ + MSA *msa; /* RETURN: mult seq alignment */ + FILE *fp; /* ptr to opened seqfile */ + char **aseqs; /* aligned seqs */ + int num = 0; /* number of seqs read */ + char buffer[LINEBUFLEN]; /* input buffer for lines */ + char bufcpy[LINEBUFLEN]; /* strtok'able copy of buffer */ + struct block_struc { /** alignment data for a block: */ + int lcol; /* furthest left aligned sym */ + int rcol; /* furthest right aligned sym */ + } *blocks = NULL; + int blocknum; /* number of blocks in file */ + char *nptr; /* ptr to start of name on line */ + char *sptr; /* ptr into sequence on line */ + int currnum; /* num. seqs in given block */ + int currblock; /* index for blocks */ + int i; /* loop counter */ + int seqidx; /* counter for seqs */ + int alen; /* length of alignment */ + int warn_names; /* becomes TRUE if names don't match between blocks */ + int headnum; /* seqidx in per-sequence header info */ + int currlen; + int count; + int have_cs = 0; + int have_rf = 0; + AINFO base_ainfo, *ainfo; /* hack: used to be passed ptr to AINFO */ + + + /* Convert from MSA interface to what old ReadSELEX() did: + * - copy our open fp, rather than opening file + * - verify that we're not reading a gzip or stdin + */ + if (feof(afp->f)) return NULL; + if (afp->do_gzip || afp->do_stdin) + Die("Can't read a SELEX format alignment from a pipe, stdin, or gzip'ed file"); + fp = afp->f; + ainfo = &base_ainfo; + + /*************************************************** + * First pass across file. + * Count seqs, get names, determine column info + * Determine what sorts of info are active in this file. 
+ ***************************************************/ + + InitAinfo(ainfo); + /* get first line of the block + * (non-comment, non-blank) */ + do + { + if (fgets(buffer, LINEBUFLEN, fp) == NULL) + { squid_errno = SQERR_NODATA; return 0; } + strcpy(bufcpy, buffer); + if (*buffer == '#') + { + if (strncmp(buffer, "#=CS", 4) == 0) have_cs = 1; + else if (strncmp(buffer, "#=RF", 4) == 0) have_rf = 1; + } + } + while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || + (strchr(commentsyms, *nptr) != NULL)); + + blocknum = 0; + warn_names = FALSE; + while (!feof(fp)) + { + /* allocate for info about this block. */ + if (blocknum == 0) + blocks = (struct block_struc *) MallocOrDie (sizeof(struct block_struc)); + else + blocks = (struct block_struc *) ReallocOrDie (blocks, (blocknum+1) * sizeof(struct block_struc)); + blocks[blocknum].lcol = LINEBUFLEN+1; + blocks[blocknum].rcol = -1; + + currnum = 0; + while (nptr != NULL) /* becomes NULL when this block ends. */ + { + /* First block only: save names */ + if (blocknum == 0) + { + if (currnum == 0) + ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO)); + else + ainfo->sqinfo = (SQINFO *) ReallocOrDie (ainfo->sqinfo, (currnum + 1) * sizeof(SQINFO)); + + ainfo->sqinfo[currnum].flags = 0; + SetSeqinfoString(&(ainfo->sqinfo[currnum]), nptr, SQINFO_NAME); + } + else /* in each additional block: check names */ + { + if (strcmp(ainfo->sqinfo[currnum].name, nptr) != 0) + warn_names = TRUE; + } + currnum++; + + /* check rcol, lcol */ + if ((sptr = strtok(NULL, WHITESPACE)) != NULL) + { + /* is this the furthest left we've + seen word 2 in this block? 
*/ + if (sptr - bufcpy < blocks[blocknum].lcol) + blocks[blocknum].lcol = sptr - bufcpy; + /* look for right side in buffer */ + for (sptr = buffer + strlen(buffer) - 1; + strchr(WHITESPACE, *sptr) != NULL; + sptr --) + /* do nothing */ ; + if (sptr - buffer > blocks[blocknum].rcol) + blocks[blocknum].rcol = sptr - buffer; + } + + /* get the next line; blank line means end of block */ + do + { + if (fgets(buffer, LINEBUFLEN, fp) == NULL) + { nptr = NULL; break; } + strcpy(bufcpy, buffer); + + if (strncmp(buffer, "#=SS", 4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SS; + else if (strncmp(buffer, "#=SA", 4) == 0) ainfo->sqinfo[currnum-1].flags |= SQINFO_SA; + else if (strncmp(buffer, "#=CS", 4) == 0) have_cs = 1; + else if (strncmp(buffer, "#=RF", 4) == 0) have_rf = 1; + + if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) + break; + } while (strchr(commentsyms, *nptr) != NULL); + } + + + /* check that number of sequences matches expected */ + if (blocknum == 0) + num = currnum; + else if (currnum != num) + Die("Parse error in ReadSELEX()"); + blocknum++; + + /* get first line of next block + * (non-comment, non-blank) */ + do + { + if (fgets(buffer, LINEBUFLEN, fp) == NULL) { nptr = NULL; break; } + strcpy(bufcpy, buffer); + } + while ((nptr = strtok(bufcpy, WHITESPACE)) == NULL || + (strchr(commentsyms, *nptr) != NULL)); + } + + + /*************************************************** + * Get ready for second pass: + * figure out the length of the alignment + * malloc space + * rewind the file + ***************************************************/ + + alen = 0; + for (currblock = 0; currblock < blocknum; currblock++) + alen += blocks[currblock].rcol - blocks[currblock].lcol + 1; + + rewind(fp); + + /* allocations. we can't use AllocateAlignment because of + * the way we already used ainfo->sqinfo. 
+ */ + aseqs = (char **) MallocOrDie (num * sizeof(char *)); + if (have_cs) + ainfo->cs = (char *) MallocOrDie ((alen+1) * sizeof(char)); + if (have_rf) + ainfo->rf = (char *) MallocOrDie ((alen+1) * sizeof(char)); + + + + for (i = 0; i < num; i++) + { + aseqs[i] = (char *) MallocOrDie ((alen+1) * sizeof(char)); + if (ainfo->sqinfo[i].flags & SQINFO_SS) + ainfo->sqinfo[i].ss = (char *) MallocOrDie ((alen+1) * sizeof(char)); + if (ainfo->sqinfo[i].flags & SQINFO_SA) + ainfo->sqinfo[i].sa = (char *) MallocOrDie ((alen+1) * sizeof(char)); + } + + ainfo->alen = alen; + ainfo->nseq = num; + ainfo->wgt = (float *) MallocOrDie (sizeof(float) * num); + FSet(ainfo->wgt, num, 1.0); + + /*************************************************** + * Second pass across file. Parse header; assemble sequences + ***************************************************/ + /* We've now made a complete first pass over the file. We know how + * many blocks it contains, we know the number of seqs in the first + * block, and we know every block has the same number of blocks; + * so we can be a bit more cavalier about error-checking as we + * make the second pass. 
+ */ + + /* Look for header + */ + headnum = 0; + for (;;) + { + if (fgets(buffer, LINEBUFLEN, fp) == NULL) + Die("Parse error in ReadSELEX()"); + strcpy(bufcpy, buffer); + if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* skip blank lines */ + + if (strcmp(nptr, "#=AU") == 0 && (sptr = strtok(NULL, "\n")) != NULL) + ainfo->au = Strdup(sptr); + else if (strcmp(nptr, "#=ID") == 0 && (sptr = strtok(NULL, "\n")) != NULL) + ainfo->name = Strdup(sptr); + else if (strcmp(nptr, "#=AC") == 0 && (sptr = strtok(NULL, "\n")) != NULL) + ainfo->acc = Strdup(sptr); + else if (strcmp(nptr, "#=DE") == 0 && (sptr = strtok(NULL, "\n")) != NULL) + ainfo->desc = Strdup(sptr); + else if (strcmp(nptr, "#=GA") == 0) + { + if ((sptr = strtok(NULL, WHITESPACE)) == NULL) + Die("Parse error in #=GA line in ReadSELEX()"); + ainfo->ga1 = atof(sptr); + + if ((sptr = strtok(NULL, WHITESPACE)) == NULL) + Die("Parse error in #=GA line in ReadSELEX()"); + ainfo->ga2 = atof(sptr); + + ainfo->flags |= AINFO_GA; + } + else if (strcmp(nptr, "#=TC") == 0) + { + if ((sptr = strtok(NULL, WHITESPACE)) == NULL) + Die("Parse error in #=TC line in ReadSELEX()"); + ainfo->tc1 = atof(sptr); + + if ((sptr = strtok(NULL, WHITESPACE)) == NULL) + Die("Parse error in #=TC line in ReadSELEX()"); + ainfo->tc2 = atof(sptr); + + ainfo->flags |= AINFO_TC; + } + else if (strcmp(nptr, "#=NC") == 0) + { + if ((sptr = strtok(NULL, WHITESPACE)) == NULL) + Die("Parse error in #=NC line in ReadSELEX()"); + ainfo->nc1 = atof(sptr); + + if ((sptr = strtok(NULL, WHITESPACE)) == NULL) + Die("Parse error in #=NC line in ReadSELEX()"); + ainfo->nc2 = atof(sptr); + + ainfo->flags |= AINFO_NC; + } + else if (strcmp(nptr, "#=SQ") == 0) /* per-sequence header info */ + { + /* first field is the name */ + if ((sptr = strtok(NULL, WHITESPACE)) == NULL) + Die("Parse error in #=SQ line in ReadSELEX()"); + if (strcmp(sptr, ainfo->sqinfo[headnum].name) != 0) warn_names = TRUE; + + /* second field is the weight */ + if ((sptr = 
strtok(NULL, WHITESPACE)) == NULL) + Die("Parse error in #=SQ line in ReadSELEX()"); + if (!IsReal(sptr)) + Die("Parse error in #=SQ line in ReadSELEX(): weight is not a number"); + ainfo->wgt[headnum] = atof(sptr); + + /* third field is database source id */ + if ((sptr = strtok(NULL, WHITESPACE)) == NULL) + Die("Parse error in #=SQ line in ReadSELEX(): incomplete line"); + SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ID); + + /* fourth field is database accession number */ + if ((sptr = strtok(NULL, WHITESPACE)) == NULL) + Die("Parse error in #=SQ line in ReadSELEX(): incomplete line"); + SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_ACC); + + /* fifth field is start..stop::olen */ + if ((sptr = strtok(NULL, ".:")) == NULL) + Die("Parse error in #=SQ line in ReadSELEX(): incomplete line"); + SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_START); + + if ((sptr = strtok(NULL, ".:")) == NULL) + Die("Parse error in #=SQ line in ReadSELEX(): incomplete line"); + SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_STOP); + + if ((sptr = strtok(NULL, ":\t ")) == NULL) + Die("Parse error in #=SQ line in ReadSELEX(): incomplete line"); + SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_OLEN); + + /* rest of line is optional description */ + if ((sptr = strtok(NULL, "\n")) != NULL) + SetSeqinfoString(&(ainfo->sqinfo[headnum]), sptr, SQINFO_DESC); + + headnum++; + } + else if (strcmp(nptr, "#=CS") == 0) break; + else if (strcmp(nptr, "#=RF") == 0) break; + else if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment, non-header */ + } + + + currlen = 0; + for (currblock = 0 ; currblock < blocknum; currblock++) + { + /* parse the block */ + seqidx = 0; + while (nptr != NULL) + { + /* Consensus structure */ + if (strcmp(nptr, "#=CS") == 0) + { + if (! 
copy_alignment_line(ainfo->cs, currlen, strlen(nptr)-1, + buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.')) + Die("Parse error in #=CS line in ReadSELEX()"); + } + + /* Reference coordinates */ + else if (strcmp(nptr, "#=RF") == 0) + { + if (! copy_alignment_line(ainfo->rf, currlen, strlen(nptr)-1, + buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.')) + Die("Parse error in #=RF line in ReadSELEX()"); + } + /* Individual secondary structure */ + else if (strcmp(nptr, "#=SS") == 0) + { + if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].ss, currlen, strlen(nptr)-1, + buffer, blocks[currblock].lcol, + blocks[currblock].rcol, (char) '.')) + Die("Parse error in #=SS line in ReadSELEX()"); + } + + /* Side chain % surface accessibility code */ + else if (strcmp(nptr, "#=SA") == 0) + { + if (! copy_alignment_line(ainfo->sqinfo[seqidx-1].sa, currlen, strlen(nptr)-1, + buffer, blocks[currblock].lcol, + blocks[currblock].rcol, (char) '.')) + Die("Parse error in #=SA line in ReadSELEX()"); + } + /* Aligned sequence; avoid unparsed machine comments */ + else if (strncmp(nptr, "#=", 2) != 0) + { + if (! 
copy_alignment_line(aseqs[seqidx], currlen, strlen(nptr)-1, + buffer, blocks[currblock].lcol, blocks[currblock].rcol, (char) '.')) + Die("Parse error in alignment line in ReadSELEX()"); + seqidx++; + } + + /* get next line */ + for (;;) + { + nptr = NULL; + if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* EOF */ + strcpy(bufcpy, buffer); + if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) break; /* blank */ + if (strncmp(buffer, "#=", 2) == 0) break; /* machine comment */ + if (strchr(commentsyms, *nptr) == NULL) break; /* data */ + } + } /* end of a block */ + + currlen += blocks[currblock].rcol - blocks[currblock].lcol + 1; + + /* get line 1 of next block */ + for (;;) + { + if (fgets(buffer, LINEBUFLEN, fp) == NULL) break; /* no data */ + strcpy(bufcpy, buffer); + if ((nptr = strtok(bufcpy, WHITESPACE)) == NULL) continue; /* blank */ + if (strncmp(buffer, "#=", 2) == 0) break; /* machine comment */ + if (strchr(commentsyms, *nptr) == NULL) break; /* non-comment */ + } + } /* end of the file */ + + /* Lengths in sqinfo are for raw sequence (ungapped), + * and SS, SA are 0..rlen-1 not 0..alen-1. + * Only the seqs with structures come out of here with lengths set. + */ + for (seqidx = 0; seqidx < num; seqidx++) + { + int apos, rpos; + /* secondary structures */ + if (ainfo->sqinfo[seqidx].flags & SQINFO_SS) + { + for (apos = rpos = 0; apos < alen; apos++) + if (! isgap(aseqs[seqidx][apos])) + { + ainfo->sqinfo[seqidx].ss[rpos] = ainfo->sqinfo[seqidx].ss[apos]; + rpos++; + } + ainfo->sqinfo[seqidx].ss[rpos] = '\0'; + } + /* Surface accessibility */ + if (ainfo->sqinfo[seqidx].flags & SQINFO_SA) + { + for (apos = rpos = 0; apos < alen; apos++) + if (! 
isgap(aseqs[seqidx][apos])) + { + ainfo->sqinfo[seqidx].sa[rpos] = ainfo->sqinfo[seqidx].sa[apos]; + rpos++; + } + ainfo->sqinfo[seqidx].sa[rpos] = '\0'; + } + } + + /* NULL-terminate all the strings */ + if (ainfo->rf != NULL) ainfo->rf[alen] = '\0'; + if (ainfo->cs != NULL) ainfo->cs[alen] = '\0'; + for (seqidx = 0; seqidx < num; seqidx++) + aseqs[seqidx][alen] = '\0'; + + /* find raw sequence lengths for sqinfo */ + for (seqidx = 0; seqidx < num; seqidx++) + { + count = 0; + for (sptr = aseqs[seqidx]; *sptr != '\0'; sptr++) + if (!isgap(*sptr)) count++; + ainfo->sqinfo[seqidx].len = count; + ainfo->sqinfo[seqidx].flags |= SQINFO_LEN; + } + + + /*************************************************** + * Garbage collection and return + ***************************************************/ + free(blocks); + if (warn_names) + Warn("sequences may be in different orders in blocks of %s?", afp->fname); + + /* Convert back to MSA structure. (Wasteful kludge.) + */ + msa = MSAFromAINFO(aseqs, ainfo); + MSAVerifyParse(msa); + FreeAlignment(aseqs, ainfo); + return msa; +} + + +/* Function: WriteSELEX() + * Date: SRE, Mon Jun 14 13:13:14 1999 [St. Louis] + * + * Purpose: Write a SELEX file in multiblock format. + * + * Args: fp - file that's open for writing + * msa - multiple sequence alignment object + * + * Returns: (void) + */ +void +WriteSELEX(FILE *fp, MSA *msa) +{ + actually_write_selex(fp, msa, 50); /* 50 char per block */ +} + +/* Function: WriteSELEXOneBlock() + * Date: SRE, Mon Jun 14 13:14:56 1999 [St. Louis] + * + * Purpose: Write a SELEX alignment file in Pfam's single-block + * format style. A wrapper for actually_write_selex(). + * + * Args: fp - file that's open for writing + * msa- alignment to write + * + * Returns: (void) + */ +void +WriteSELEXOneBlock(FILE *fp, MSA *msa) +{ + actually_write_selex(fp, msa, msa->alen); /* one big block */ +} + + +/* Function: actually_write_selex() + * Date: SRE, Mon Jun 14 12:54:46 1999 [St. 
Louis] + * + * Purpose: Write an alignment in SELEX format to an open + * file. This is the function that actually does + * the work. The API's WriteSELEX() and + * WriteSELEXOneBlock() are wrappers. + * + * Args: fp - file that's open for writing + * msa - alignment to write + * cpl - characters to write per line in alignment block + * + * Returns: (void) + */ +static void +actually_write_selex(FILE *fp, MSA *msa, int cpl) +{ + int i; + int len = 0; + int namewidth; + char *buf; + int currpos; + + buf = malloc(sizeof(char) * (cpl+101)); /* 100 chars allowed for name, etc. */ + + /* Figure out how much space we need for name + markup + * to keep the alignment in register, for easier human viewing -- + * even though Stockholm format doesn't care about visual + * alignment. + */ + namewidth = 0; + for (i = 0; i < msa->nseq; i++) + if ((len = strlen(msa->sqname[i])) > namewidth) + namewidth = len; + if (namewidth < 6) namewidth = 6; /* minimum space for markup tags */ + + /* Free text comments + */ + for (i = 0; i < msa->ncomment; i++) + fprintf(fp, "# %s\n", msa->comment[i]); + if (msa->ncomment > 0) fprintf(fp, "\n"); + + /* Per-file annotation + */ + if (msa->name != NULL) fprintf(fp, "#=ID %s\n", msa->name); + if (msa->acc != NULL) fprintf(fp, "#=AC %s\n", msa->acc); + if (msa->desc != NULL) fprintf(fp, "#=DE %s\n", msa->desc); + if (msa->au != NULL) fprintf(fp, "#=AU %s\n", msa->au); + if (msa->flags & MSA_SET_GA) fprintf(fp, "#=GA %.1f %.1f\n", msa->ga1, msa->ga2); + if (msa->flags & MSA_SET_NC) fprintf(fp, "#=NC %.1f %.1f\n", msa->nc1, msa->nc2); + if (msa->flags & MSA_SET_TC) fprintf(fp, "#=TC %.1f %.1f\n", msa->tc1, msa->tc2); + + /* Per-sequence annotation + */ + for (i = 0; i < msa->nseq; i++) + fprintf(fp, "#=SQ %-*.*s %6.4f %s %s %d..%d::%d %s\n", + namewidth, namewidth, msa->sqname[i], + msa->wgt[i], + "-", /* MSA has no ID field */ + (msa->sqacc != NULL && msa->sqacc[i] != NULL) ? 
msa->sqacc[i] : "-", + 0, 0, 0, /* MSA has no start, stop, olen field */ + (msa->sqdesc != NULL && msa->sqdesc[i] != NULL) ? msa->sqdesc[i] : "-"); + fprintf(fp, "\n"); + + /* Alignment section: + */ + for (currpos = 0; currpos < msa->alen; currpos += cpl) + { + if (currpos > 0) fprintf(fp, "\n"); + + if (msa->ss_cons != NULL) { + strncpy(buf, msa->ss_cons + currpos, cpl); + buf[cpl] = '\0'; + fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=CS", buf); + } + if (msa->rf != NULL) { + strncpy(buf, msa->rf + currpos, cpl); + buf[cpl] = '\0'; + fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=RF", buf); + } + for (i = 0; i < msa->nseq; i++) + { + strncpy(buf, msa->aseq[i] + currpos, cpl); + buf[cpl] = '\0'; + fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, msa->sqname[i], buf); + + if (msa->ss != NULL && msa->ss[i] != NULL) { + strncpy(buf, msa->ss[i] + currpos, cpl); + buf[cpl] = '\0'; + fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=SS", buf); + } + if (msa->sa != NULL && msa->sa[i] != NULL) { + strncpy(buf, msa->sa[i] + currpos, cpl); + buf[cpl] = '\0'; + fprintf(fp, "%-*.*s %s\n", namewidth, namewidth, "#=SA", buf); + } + } + } + free(buf); +} + + +/* Function: copy_alignment_line() + * + * Purpose: Given a line from an alignment file, and bounds lcol,rcol + * on what part of it may be sequence, save the alignment into + * aseq starting at position apos. + * + * name_rcol is set to the rightmost column this aseqs's name + * occupies; if name_rcol >= lcol, we have a special case in + * which the name intrudes into the sequence zone. + */ +static int +copy_alignment_line(char *aseq, int apos, int name_rcol, + char *buffer, int lcol, int rcol, char gapsym) +{ + char *s1, *s2; + int i; + + s1 = aseq + apos; + s2 = buffer; /* be careful that buffer doesn't end before lcol! */ + for (i = 0; i < lcol; i++) + if (*s2) s2++; + + for (i = lcol; i <= rcol; i++) + { + if (*s2 == '\t') { + Warn("TAB characters will corrupt a SELEX alignment! 
Please remove them first."); + return 0; + } + if (name_rcol >= i) /* name intrusion special case: pad left w/ gaps */ + *s1 = gapsym; + /* short buffer special case: pad right w/ gaps */ + else if (*s2 == '\0' || *s2 == '\n') + *s1 = gapsym; + + else if (*s2 == ' ') /* new: disallow spaces as gap symbols */ + *s1 = gapsym; + + else /* normal case: copy buffer into aseq */ + *s1 = *s2; + + s1++; + if (*s2) s2++; + } + return 1; +} + + + + + +/* Function: DealignAseqs() + * + * Given an array of (num) aligned sequences aseqs, + * strip the gaps. Store the raw sequences in a new allocated array. + * + * Caller is responsible for free'ing the memory allocated to + * rseqs. + * + * Returns 1 on success. Returns 0 and sets squid_errno on + * failure. + */ +int +DealignAseqs(char **aseqs, int num, char ***ret_rseqs) +{ + char **rseqs; /* de-aligned sequence array */ + int idx; /* counter for sequences */ + int depos; /* position counter for dealigned seq*/ + int apos; /* position counter for aligned seq */ + int seqlen; /* length of aligned seq */ + + /* alloc space */ + rseqs = (char **) MallocOrDie (num * sizeof(char *)); + /* main loop */ + for (idx = 0; idx < num; idx++) + { + seqlen = strlen(aseqs[idx]); + /* alloc space */ + rseqs[idx] = (char *) MallocOrDie ((seqlen + 1) * sizeof(char)); + + /* strip gaps */ + depos = 0; + for (apos = 0; aseqs[idx][apos] != '\0'; apos++) + if (!isgap(aseqs[idx][apos])) + { + rseqs[idx][depos] = aseqs[idx][apos]; + depos++; + } + rseqs[idx][depos] = '\0'; + } + *ret_rseqs = rseqs; + return 1; +} + + +/* Function: IsSELEXFormat() + * + * Return TRUE if filename may be in SELEX format. + * + * Accuracy is sacrificed for speed; a TRUE return does + * *not* guarantee that the file will pass the stricter + * error-checking of ReadSELEX(). All it checks is that + * the first 500 non-comment lines of a file are + * blank, or if there's a second "word" on the line + * it looks like sequence (i.e., it's not kOtherSeq). 
+ * + * Returns TRUE or FALSE. + */ +int +IsSELEXFormat(char *filename) +{ + FILE *fp; /* ptr to open sequence file */ + char buffer[LINEBUFLEN]; + char *sptr; /* ptr to first word */ + int linenum; + + + if ((fp = fopen(filename, "r")) == NULL) + { squid_errno = SQERR_NOFILE; return 0; } + + linenum = 0; + while (linenum < 500 && + fgets(buffer, LINEBUFLEN, fp) != NULL) + { + linenum++; + /* dead giveaways for extended SELEX */ + if (strncmp(buffer, "#=AU", 4) == 0) goto DONE; + else if (strncmp(buffer, "#=ID", 4) == 0) goto DONE; + else if (strncmp(buffer, "#=AC", 4) == 0) goto DONE; + else if (strncmp(buffer, "#=DE", 4) == 0) goto DONE; + else if (strncmp(buffer, "#=GA", 4) == 0) goto DONE; + else if (strncmp(buffer, "#=TC", 4) == 0) goto DONE; + else if (strncmp(buffer, "#=NC", 4) == 0) goto DONE; + else if (strncmp(buffer, "#=SQ", 4) == 0) goto DONE; + else if (strncmp(buffer, "#=SS", 4) == 0) goto DONE; + else if (strncmp(buffer, "#=CS", 4) == 0) goto DONE; + else if (strncmp(buffer, "#=RF", 4) == 0) goto DONE; + + /* a comment? */ + if (strchr(commentsyms, *buffer) != NULL) continue; + + /* a blank line? */ + if ((sptr = strtok(buffer, WHITESPACE)) == NULL) continue; + + /* a one-word line (name only) + is possible, though rare */ + if ((sptr = strtok(NULL, "\n")) == NULL) continue; + + if (Seqtype(sptr) == kOtherSeq) {fclose(fp); return 0;} + } + + DONE: + fclose(fp); + return 1; +} + + + + + + + + diff --git a/forester/archive/RIO/others/hmmer/squid/seqencode.c b/forester/archive/RIO/others/hmmer/squid/seqencode.c new file mode 100644 index 0000000..6cdc265 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/seqencode.c @@ -0,0 +1,177 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. 
See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* seqencode.c + * + * Routines for creating and manipulating encoded sequence strings. + * RCS $Id: seqencode.c,v 1.1.1.1 2005/03/22 08:34:29 cmzmasek Exp $ + */ +#include +#include +#include +#include "squid.h" + + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + /* seqcmp() + returns 0 if s1 == s2 + mismatch number otherwise */ +int +seqcmp(char *s1, char *s2, int allow) +{ + int mmat = 0; + + while ((*s1 != NTEND) && (*s2 != NTEND) && (mmat <= allow)) + { + if (!(ntmatch(*s1, *s2))) + mmat++;; + s1++; + s2++; + } + while ((*s1++ != NTEND) && (mmat <= allow)) + mmat++; + return(mmat); +} + /* seqncmp() + same as seqcmp but it looks at, + at most, n positions */ +int +seqncmp(char *s1, char *s2, int n, int allow) +{ + int mmat = 0; + + while ((*s2 != NTEND) && + (n-- != 0)) + { + if ((!(ntmatch(*s1, *s2))) && + (++mmat > allow)) + return(mmat); + s1++; + s2++; + } + while ((n-- != 0) && (*s1++ != NTEND) && (mmat <= allow)) + mmat++; + return (mmat); +} + + /* seqencode() + given a character text string str (A,C,G,T), + convert to an encoded seq string; + return 1 for success, 0 if fail */ +int +seqencode(char *codeseq, /* pre-allocated space for answer */ + char *str) /* character string to convert */ +{ + char *ptr; + int idx; + + ptr = codeseq; + while (*str != '\0') + { + if (islower((int) (*str))) *str = (char) toupper((int) (*str)); + for (idx = 0; *str != iupac[idx].sym && idx <= IUPACSYMNUM; idx++) + ; + if (idx > IUPACSYMNUM) + { + *ptr = (char) NTEND; + return 0; + } + else + *ptr = iupac[idx].code; + ptr++; + str++; + } + *ptr = NTEND; + return 1; +} + + +int +coded_revcomp(char *comp, char *seq) +{ + long bases; + char *bckp, *fwdp; + int idx; + long pos; + + bases = strlen(seq); + + fwdp = comp; + bckp = seq + bases -1; + for (pos = 0; pos < bases; pos++) + { + for (idx = 0; *bckp != iupac[idx].code && idx < IUPACSYMNUM; idx++); 
+ if (idx > IUPACSYMNUM) + { + *fwdp = NTEND; + return 0; + } + else + *fwdp = iupac[idx].comp; + fwdp++; + bckp--; + } + *fwdp = NTEND; + return(1); +} + +int +seqdecode(char *str, char *codeseq) +{ + int idx; + int pos; + + pos = 0; + while (*codeseq != NTEND) + { + for (idx = 0; *codeseq != iupac[idx].code && idx < IUPACSYMNUM; idx++) + ; + if (idx > IUPACSYMNUM) + { + str[pos] = 'X'; + return 0; + } + else + str[pos] = iupac[idx].sym; + codeseq++; + pos++; + } + str[pos] = '\0'; + return 1; +} + +int +seqndecode( + char *str, /* pre-allocated string to write into */ + char *codeseq, /* sequence to decode */ + int n) /* how many bases to decode */ +{ + int idx; + int pos = 0; + + while (--n >= 0) + { + for (idx = 0; *codeseq != iupac[idx].code && idx < IUPACSYMNUM; idx++); + if (idx > IUPACSYMNUM) + { + str[pos] = 'X'; + return 0; + } + else + str[pos] = iupac[idx].sym; + codeseq++; + pos++; + } + str[pos] = '\0'; + return 1; +} + diff --git a/forester/archive/RIO/others/hmmer/squid/seqsplit_main.c b/forester/archive/RIO/others/hmmer/squid/seqsplit_main.c new file mode 100644 index 0000000..a38f6fd --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/seqsplit_main.c @@ -0,0 +1,163 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + + +/* seqsplit_main.c + * SRE, Mon Sep 25 11:43:58 2000 + * + * Split sequences into smaller chunks of defined size and overlap; + * output a FASTA file. + * + * Limitations: + * still working in 32 bits -- no sequence can be more than 2 GB + * in size. 
+ * CVS $Id: seqsplit_main.c,v 1.1.1.1 2005/03/22 08:34:26 cmzmasek Exp $ + */ + +#include +#include +#include "squid.h" +#include "msa.h" + +static char banner[] = "seqsplit - split seqs into chunks of defined size and overlap"; + +static char usage[] = "\ +Usage: seqsplit [-options] \n\ + Available options:\n\ + -h : help; display usage and version\n\ + -o : output the new FASTA file to \n\ +"; + +static char experts[] = "\ + --informat : specify sequence file format \n\ + --length : set max length of each unique seq frag to \n\ + --overlap : set overlap length to (total frag size = length+overlap)\n\ +"; + +struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, + { "-o", TRUE, sqdARG_STRING }, + { "--informat", FALSE, sqdARG_STRING }, + { "--length", FALSE, sqdARG_INT }, + { "--overlap", FALSE, sqdARG_INT }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + + +int +main(int argc, char **argv) +{ + char *seqfile; /* name of sequence file */ + char *outfile; /* name of output file */ + SQFILE *dbfp; /* open sequence file */ + FILE *ofp; /* open output file */ + int fmt; /* format of seqfile */ + char *seq; /* sequence */ + SQINFO sqinfo; /* extra info about sequence */ + char *seqfrag; /* space for a seq fragment */ + int fraglength; /* length of unique seq per frag */ + int overlap; /* length of overlap. 
frags are fraglength+overlap*/ + char seqname[256]; /* renamed fragment, w/ coord info */ + int num; /* number of this fragment */ + int pos; /* position in a sequence */ + int len; /* length of a fragment */ + char *desc; + + int nseqs; /* total number of sequences */ + int nsplit; /* number of seqs that get split */ + int nnewfrags; /* total number of new fragments */ + + char *optname; + char *optarg; + int optind; + + /*********************************************** + * Parse command line + ***********************************************/ + + fmt = SQFILE_UNKNOWN; /* default: autodetect */ + fraglength = 100000; + overlap = 1000; + outfile = NULL; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) + { + if (strcmp(optname, "-o") == 0) outfile = optarg; + else if (strcmp(optname, "--length") == 0) fraglength = atoi(optarg); + else if (strcmp(optname, "--overlap") == 0) overlap = atoi(optarg); + else if (strcmp(optname, "--informat") == 0) { + fmt = String2SeqfileFormat(optarg); + if (fmt == SQFILE_UNKNOWN) + Die("unrecognized sequence file format \"%s\"", optarg); + } + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(EXIT_SUCCESS); + } + } + + if (argc - optind != 1) Die("%s\n", usage); + seqfile = argv[argc-1]; + + seqfrag = MallocOrDie(sizeof(char) * (fraglength+overlap)); + seqfrag[fraglength+overlap] = '\0'; + + /*********************************************** + * Read the file. 
+ ***********************************************/ + + if (outfile == NULL) ofp = stdout; + else { + if ((ofp = fopen(outfile, "w")) == NULL) + Die("Failed to open output sequence file %s for writing", outfile); + } + + if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL) + Die("Failed to open sequence file %s for reading", seqfile); + + nseqs = nsplit = nnewfrags = 0; + while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo)) + { + nseqs++; + if (sqinfo.flags & SQINFO_DESC) desc = sqinfo.desc; + else desc = NULL; + + if (sqinfo.len <= fraglength+overlap) { + WriteSimpleFASTA(ofp, seq, sqinfo.name, desc); + continue; + } + + num = 1; + nsplit++; + for (pos = 0; pos < sqinfo.len; pos += fraglength) + { + if (sqinfo.len - pos <= overlap) continue; + strncpy(seqfrag, seq+pos, fraglength+overlap); + len = strlen(seqfrag); + sprintf(seqname, "%s/frag%d/%d-%d", + sqinfo.name, num, pos+1, pos+len); + WriteSimpleFASTA(ofp, seqfrag, seqname, desc); + nnewfrags++; + num ++; + } + FreeSequence(seq, &sqinfo); + } + SeqfileClose(dbfp); + if (outfile != NULL) fclose(ofp); + + printf("Total # of seqs: %d\n", nseqs); + printf("Affected by splitting: %d\n", nsplit); + printf("New # of seqs: %d\n", nseqs-nsplit + nnewfrags); + + return 0; +} diff --git a/forester/archive/RIO/others/hmmer/squid/seqstat_main.c b/forester/archive/RIO/others/hmmer/squid/seqstat_main.c new file mode 100644 index 0000000..01fe620 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/seqstat_main.c @@ -0,0 +1,229 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. 
+ *****************************************************************/ + +/* seqstat_main.c + * Wed Aug 10 15:47:14 1994 + * + * Look at a sequence file, determine some simple statistics. + * CVS $Id: seqstat_main.c,v 1.1.1.1 2005/03/22 08:34:29 cmzmasek Exp $ + */ + +#include +#include +#include +#include +#include "squid.h" +#include "msa.h" + +static char banner[] = "seqstat - show some simple statistics on a sequence file"; + +static char usage[] = "\ +Usage: seqstat [-options] \n\ + Available options:\n\ + -a : report per-sequence info, not just a summary\n\ + -h : help; display usage and version\n\ +"; + +static char experts[] = "\ + --gccomp : with -a, include GC composition in report (DNA/RNA only)\n\ + --informat : specify sequence file format \n\ + --quiet : suppress verbose header (used in regression testing)\n\ +"; + +struct opt_s OPTIONS[] = { + { "-a", TRUE, sqdARG_NONE }, + { "-h", TRUE, sqdARG_NONE }, + { "--gccomp", FALSE, sqdARG_NONE }, + { "--informat", FALSE, sqdARG_STRING }, + { "--quiet", FALSE, sqdARG_NONE }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +static float gc_composition(char *seq); + +int +main(int argc, char **argv) +{ + char *seqfile; /* name of sequence file */ + SQFILE *dbfp; /* open sequence file */ + int fmt; /* format of seqfile */ + char *seq; /* sequence */ + SQINFO sqinfo; /* extra info about sequence */ + int nseqs; + long long small; /* smallest length */ + long long large; /* largest length */ + long long total; /* total length */ + int type; /* kAmino, kDNA, kRNA, or kOtherSeq */ + + int allreport; /* TRUE to do a short table for each sequence */ + int be_quiet; /* TRUE to suppress header */ + int do_gccomp; /* TRUE to include GC composition in per-seq report */ + float gc; /* fractional gc composition, 0..1 */ + + char *optname; + char *optarg; + int optind; + + /*********************************************** + * Parse command line + ***********************************************/ + + fmt = 
SQFILE_UNKNOWN; /* default: autodetect format */ + allreport = FALSE; /* default: file summary only */ + be_quiet = FALSE; /* show header info by default */ + type = kOtherSeq; /* just to silence gcc uninit warning */ + do_gccomp = FALSE; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) + { + if (strcmp(optname, "-a") == 0) allreport = TRUE; + else if (strcmp(optname, "--quiet") == 0) be_quiet = TRUE; + else if (strcmp(optname, "--gccomp") == 0) do_gccomp = TRUE; + + else if (strcmp(optname, "--informat") == 0) { + fmt = String2SeqfileFormat(optarg); + if (fmt == SQFILE_UNKNOWN) + Die("unrecognized sequence file format \"%s\"", optarg); + } + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(EXIT_SUCCESS); + } + } + + if (argc - optind != 1) Die("%s\n", usage); + seqfile = argv[argc-1]; + + if (! be_quiet) Banner(stdout, banner); + + /*********************************************** + * Read the file. + ***********************************************/ + + if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL) + Die("Failed to open sequence file %s for reading", seqfile); + + if (allreport) { + printf(" %-15s %-5s %s%s\n", " NAME", "LEN", + do_gccomp? " f_GC " : "", + "DESCRIPTION"); + printf(" --------------- ----- %s-----------\n", + do_gccomp ? "----- " : ""); + } + + nseqs = 0; + small = -1; + large = -1; + total = 0L; + while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo)) + { + if (nseqs == 0) type = Seqtype(seq); + if (do_gccomp) gc = gc_composition(seq); + + if (allreport) { + if (do_gccomp) { + printf("* %-15s %5d %.3f %-50.50s\n", sqinfo.name, sqinfo.len, + gc, + sqinfo.flags & SQINFO_DESC ? sqinfo.desc : ""); + } else { + printf("* %-15s %5d %-50.50s\n", sqinfo.name, sqinfo.len, + sqinfo.flags & SQINFO_DESC ? 
sqinfo.desc : ""); + } + } + + if (small == -1 || sqinfo.len < small) small = (long long) sqinfo.len; + if (large == -1 || sqinfo.len > large) large = (long long) sqinfo.len; + total += (long long) sqinfo.len; + nseqs++; + FreeSequence(seq, &sqinfo); + } + if (allreport) puts(""); + + printf("Format: %s\n", SeqfileFormat2String(dbfp->format)); + printf("Type (of 1st seq): "); + switch (type) + { + case kDNA: puts("DNA"); break; + case kRNA: puts("RNA"); break; + case kAmino: puts("Protein"); break; + case kOtherSeq: puts("Unknown"); break; + default: Die("oops."); + } + printf("Number of sequences: %d\n", nseqs); + printf("Total # residues: %lld\n", total); + printf("Smallest: %lld\n", small); + printf("Largest: %lld\n", large); + printf("Average length: %.1f\n", (float) total / (float) nseqs); + + SeqfileClose(dbfp); + + return 0; +} + + +/* Function: gc_composition() + * Date: SRE, Mon Apr 23 10:01:48 2001 [St. Louis] + * + * Purpose: Calculate the fractional GC composition of + * an input RNA or DNA sequence. Deals appropriately + * with IUPAC degeneracy. Case-insensitive. + * Ignores gap symbols. Other unexpected characters + * make it die with an error (protein, for instance). 
+ * + * Args: seq - the DNA or RNA sequence + * + * Returns: fractional GC composition, 0-1 + */ +static float +gc_composition(char *seq) +{ + int c; + float total; + float gc; + + gc = total = 0.; + for (; *seq != '\0'; seq++) + { + if (isgap(c)) continue; + + c = toupper((int) *seq); + total += 1.0; + + switch (c) { + case 'C': + case 'G': + case 'S': gc += 1.0; break; + + case 'A': + case 'T': + case 'U': + case 'W': gc += 0.0; break; + + case 'N': + case 'R': + case 'Y': + case 'M': + case 'K': gc += 0.5; break; + + case 'H': + case 'D': gc += 0.3333; break; + + case 'B': + case 'V': gc += 0.6667; break; + + default: + Die("unrecognized nucleic acid character %c in sequence", c); + } + } + return (gc/total); +} diff --git a/forester/archive/RIO/others/hmmer/squid/sfetch_main.c b/forester/archive/RIO/others/hmmer/squid/sfetch_main.c new file mode 100644 index 0000000..8cb6aa4 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/sfetch_main.c @@ -0,0 +1,444 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. 
+ *****************************************************************/
+
+/* sfetch_main.c, Fri Dec 25 14:22:17 1992, SRE
+ *
+ * sfetch -- a program to extract subsequences from a sequence database
+ * Renamed from "getseq" SRE, Tue Jan 19 10:47:42 1999 (GCG clash)
+ *
+ * CVS $Id: sfetch_main.c,v 1.1.1.1 2005/03/22 08:34:20 cmzmasek Exp $
+ */
+
+#include /* NOTE(review): the header names of these two includes are missing in this copy -- restore from upstream */
+#include
+#include "squid.h"
+#include "msa.h"
+#include "ssi.h"
+
+static char banner[] = "sfetch - retrieve a specified sequence from a file";
+
+static char usage[] = "\
+Usage: sfetch [-options] \n\
+ or: sfetch [-options] .\n\
+ (The second version fetches the first seq in the file.)\n\
+ Get a sequence from a database.\n\
+ Available options:\n\
+ -a : name is an accession number, not a key\n\
+ -d : get sequence from \n\
+ -D : instead, get sequence from main database\n\
+ -h : help; print version and usage info\n\
+ -r : rename the fragment \n\
+ -f : from which residue (1..N)\n\
+ -t : to which residue (1..N)\n\
+ -o : direct output to \n\
+ -F : use output format of ; see below for\n\
+ list. Default is original format of database.\n\
+\n\
+ Available output formats include:\n\
+ fasta\n\
+ genbank\n\
+ embl\n\
+ gcg\n\
+ pir\n\
+ raw\n\n\
+ Available databases are: (if $env variables are set correctly)\n\
+ -Dsw $SWDIR SwissProt\n\
+ -Dpir $PIRDIR PIR\n\
+ -Dem $EMBLDIR EMBL\n\
+ -Dgb $GBDIR GenBank\n\
+ -Dwp $WORMDIR WormPep\n\
+ -Dowl $OWLDIR OWL\n";
+
+static char experts[] = "\
+ --informat : specify input sequence file format \n\
+";
+
+struct opt_s OPTIONS[] = {
+  { "-a", TRUE, sqdARG_NONE },
+  { "-d", TRUE, sqdARG_STRING },
+  { "-f", TRUE, sqdARG_INT },
+  { "-h", TRUE, sqdARG_NONE },
+  { "-o", TRUE, sqdARG_STRING },
+  { "-r", TRUE, sqdARG_STRING },
+  { "-t", TRUE, sqdARG_INT },
+  { "-D", TRUE, sqdARG_STRING },
+  { "-F", TRUE, sqdARG_STRING },
+  { "--informat", FALSE, sqdARG_STRING },
+};
+#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
+
+/* dbenv maps command line database selection to an environment
+ * variable, from which the database directory is obtained.
+ */
+struct dbenv_s {
+  char *dbname; /* name of database, as used on command line */
+  char *ssiname; /* name of GSI index file to look for */
+  char *envname; /* environment var to get directory path from*/
+  char *entryend; /* string signifying end of entry */
+  int addend; /* TRUE if entryend line is part of entry */
+} dbenv[] =
+{
+  { "sw", "swiss.ssi", "SWDIR", "//", TRUE},
+  { "pir", "pir.ssi", "PIRDIR", "///", TRUE},
+  { "em", "embl.ssi", "EMBLDIR", "//", TRUE},
+  { "gb", "genbank.ssi","GBDIR", "//", TRUE},
+  { "wp", "wormpep.ssi","WORMDIR", ">", FALSE},
+  { "owl", "owl.ssi", "OWLDIR", ">", FALSE}, /* use FASTA OWL version */
+};
+#define NUMDBS (sizeof(dbenv) / sizeof(struct dbenv_s))
+/* main(): locate one sequence (by SSI index or by scanning), optionally cut/reverse-complement a fragment, write it out. */
+int
+main(int argc, char **argv)
+{
+  char *dbname; /* master database to search */
+  char *seqfile; /* name of sequence file to read */
+  char *ssifile; /* name of SSI index file (if one exists) */
+  SQFILE *seqfp; /* pointer to open sequence file */
+  char *getname; /* name of sequence to get from */
+  int from; /* starting residue, 1..N */
+  int to; /* ending residue, 1..N */
+  char *outfile; /* name of file to put output to */
+  FILE *outfp; /* file pointer to put output to */
+  int format; /* format of seqfile */
+  int outfmt; /* output format */
+  char *seq; /* current working sequence */
+  SQINFO sqinfo;
+  char *frag; /* extracted subsequence */
+  int source_start; /* start of seq on original source 1..N */
+  int source_stop; /* end of seq on original source 1..N */
+  int source_orient; /* sign of parent: -1 revcomp, +1 normal*/
+  char *ss; /* secondary structure representation */
+
+  SSIFILE *ssi; /* open SSI index file */
+  SSIOFFSET ssi_offset; /* disk offset for locating sequence */
+  int used_ssi; /* TRUE if SSI file was used (don't scan) */
+  int status; /* status returned by an SSI call */
+
+  char *rename; /* new name to give fragment */
+  int reverse_complement; /* do we have to reverse complement? */
+  int getall; /* TRUE until -f or -t is given: fetch the whole sequence */
+  int getfirst; /* TRUE to extract from the first seq, w/o looking at name */
+  char *outformat; /* output format string */
+  int by_accession; /* TRUE if name is accession number not key */
+
+  int dbidx; /* index into dbenv[], or -1 when no main database is used */
+
+  char *optname;
+  char *optarg;
+  int optind;
+
+  /***********************************************
+   * Parse the command line
+   ***********************************************/
+
+  /* initializations and defaults */
+  format = SQFILE_UNKNOWN; /* autodetect default, overridden by --informat or SSI files */
+  reverse_complement = 0;
+  getall = TRUE;
+  getfirst= FALSE;
+  dbname = NULL;
+  dbidx = -1;
+  seqfile = NULL;
+  from = -1;
+  to = -1; /* flag that says do the whole thing */
+  outfile = NULL;
+  getname = NULL;
+  rename = NULL;
+  outformat = NULL;
+  by_accession = FALSE;
+  used_ssi = FALSE;
+
+  while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
+                &optind, &optname, &optarg))
+    {
+      if (strcmp(optname, "-a") == 0) { by_accession = TRUE; }
+      else if (strcmp(optname, "-d") == 0) { seqfile = optarg; }
+      else if (strcmp(optname, "-f") == 0) {
+        from = atoi(optarg); getall = FALSE;
+      }
+      else if (strcmp(optname, "-t") == 0) {
+        to = atoi(optarg); getall = FALSE;
+      }
+      else if (strcmp(optname, "-r") == 0) { rename = optarg; }
+      else if (strcmp(optname, "-o") == 0) { outfile = optarg; }
+      else if (strcmp(optname, "-D") == 0) { dbname = optarg; }
+      else if (strcmp(optname, "-F") == 0) { outformat = optarg; }
+      else if (strcmp(optname, "--informat") == 0) {
+        format = String2SeqfileFormat(optarg);
+        if (format == SQFILE_UNKNOWN)
+          Die("unrecognized input sequence file format \"%s\"", optarg);
+      }
+      else if (strcmp(optname, "-h") == 0) {
+        Banner(stdout, banner);
+        puts(usage);
+        puts(experts);
+        exit(EXIT_SUCCESS);
+      }
+    }
+
+  if (argc - optind != 1)
+    Die("Incorrect number of command line arguments.\n%s\n", usage);
+
+  getname = argv[optind];
+  if (strcmp(getname, ".") == 0) getfirst = TRUE; /* "." means: take the first sequence in the file */
+
+  if (getfirst && seqfile == NULL)
+    Die("You need to specify -d to retrieve a first sequence.\n%s",
+        usage);
+
+  /***********************************************
+   * Get name of file to look through, and disk offset,
+   * using SSI file if one exists. Three possibilities:
+   * 1) Look in main DB, which has SSI index in the directory
+   * 2) Look in a file, which has associated SSI index
+   * 3) Look in an unindexed file
+   ***********************************************/
+
+  if (dbname != NULL && seqfile != NULL)
+    Die("Can't fetch from *both* a database %s and a file %s\n%s",
+        dbname, seqfile, usage);
+  if (dbname == NULL && seqfile == NULL)
+    { /* try to guess SwissProt, stupidly, but usually works */
+      if (strchr(getname, '_') != NULL) dbname = Strdup("sw");
+      else Die("You have to specify either a database or a seqfile\n%s", usage);
+    }
+
+  if (dbname != NULL) /* Main database. GSI index mandatory. */
+    {
+      char *dbdir;
+      char *dbfile;
+      int fh;
+      /* find which db this is */
+      for (dbidx = 0; dbidx < NUMDBS; dbidx++)
+        if (strcmp(dbenv[dbidx].dbname, dbname) == 0)
+          break;
+      if (dbidx == NUMDBS)
+        Die("No such main database %s\n%s", dbname, usage);
+
+      /* get directory name */
+      if ((dbdir = getenv(dbenv[dbidx].envname)) == NULL)
+        Die("Environment variable %s is not set.\n%s",
+            dbenv[dbidx].envname, usage);
+      /* open ssi file; the +2 leaves room for the '/' separator and the terminating NUL */
+      ssifile = (char *) MallocOrDie
+        ((strlen(dbdir) + strlen(dbenv[dbidx].ssiname) + 2) * sizeof(char));
+      sprintf(ssifile, "%s/%s", dbdir, dbenv[dbidx].ssiname);
+      if ((status = SSIOpen(ssifile, &ssi)) != 0)
+        Die("Failed to open SSI index file %s in directory %s\n%s",
+            dbenv[dbidx].ssiname, dbdir, usage);
+      /* get seqfile name, file format, and offset */
+      if ((status = SSIGetOffsetByName(ssi, getname, &fh, &ssi_offset)) != 0)
+        Die("Failed to find key %s in SSI file %s", getname, ssifile);
+      if ((status = SSIFileInfo(ssi, fh, &dbfile, &format)) != 0)
+        Die("SSI error: %s", SSIErrorString(status));
+      free(ssifile);
+      /* set up proper seqfile, with path */
+      seqfile = (char *) MallocOrDie
+        ((strlen(dbdir) + strlen(dbfile) + 2) * sizeof(char));
+      sprintf(seqfile, "%s/%s", dbdir, dbfile);
+      used_ssi = TRUE;
+      SSIClose(ssi);
+    }
+  else if (! getfirst) /* Sequence file. SSI index optional. */
+    {
+      char *dbfile;
+      int fh;
+
+      ssifile = (char *) MallocOrDie ((strlen(seqfile) + 5) * sizeof(char)); /* +5: ".ssi" plus NUL */
+      sprintf(ssifile, "%s.ssi", seqfile);
+      if ((status = SSIOpen(ssifile, &ssi)) == 0)
+        {
+          SQD_DPRINTF1(("Opened SSI index %s...\n", ssifile));
+          if ((status = SSIGetOffsetByName(ssi, getname, &fh, &ssi_offset)) != 0)
+            Die("Failed to find key %s in SSI file %s", getname, ssifile);
+          if ((status = SSIFileInfo(ssi, fh, &dbfile, &format)) != 0)
+            Die("SSI error: %s", SSIErrorString(status));
+          SSIClose(ssi);
+          used_ssi = TRUE;
+        }
+      free(ssifile);
+    }
+
+  /***********************************************
+   * Open database file
+   ***********************************************/
+
+  if ((seqfp = SeqfileOpen(seqfile, format, NULL)) == NULL)
+    Die("Failed to open sequence database file %s\n%s\n", seqfile, usage);
+  if (used_ssi)
+    SeqfilePosition(seqfp, &ssi_offset);
+
+  /***********************************************
+   * Open output file
+   ***********************************************/
+
+  /* Determine output format. Default: use same as input. Override: -F option.
+   */
+  outfmt = seqfp->format;
+  if (outformat != NULL)
+    {
+      outfmt = String2SeqfileFormat(outformat);
+      if (outfmt == SQFILE_UNKNOWN)
+        Die("Unknown output format %s\n%s", outformat, usage);
+      if (IsAlignmentFormat(outfmt))
+        Die("Can't output a single sequence in an alignment format (%s)\n", outformat);
+    }
+  /* open output file for writing;
+     use stdout by default */
+  if (outfile == NULL) outfp = stdout;
+  else if ((outfp = fopen(outfile, "w")) == NULL)
+    Die("cannot open %s for output\n", outfile);
+
+
+  /***********************************************
+   * Main loop
+   ***********************************************/
+
+  /* If this is a simple fetch of the complete sequence
+   * in native format, and we've been positioned in the file
+   * by an SSI index file, we can just read right from the file,
+   * partially bypassing the ReadSeq() API, and probably
+   * putting our fingers a little too deep into the seqfp object.
+   */
+  if (getall && used_ssi && outfmt == format && dbname != NULL)
+    {
+      char *buf = NULL;
+      int buflen = 0;
+      int endlen;
+
+      if (dbidx == -1) Die("That's weird. No database index available.");
+      endlen = strlen(dbenv[dbidx].entryend);
+      fputs(seqfp->buf, outfp); /* always do first line */
+      /* fputs("\n", outfp); */ /* buf has its /n */
+      while (sre_fgets(&buf, &buflen, seqfp->f) != NULL)
+        {
+          if (strncmp(buf, dbenv[dbidx].entryend, endlen) == 0)
+            {
+              if (dbenv[dbidx].addend) fputs(buf, outfp);
+              break;
+            }
+          fputs(buf, outfp);
+        }
+      if (buf != NULL) free(buf);
+    }
+  else /* else, the hard way with ReadSeq */
+    {
+      seq = NULL;
+      frag = NULL;
+
+      while (ReadSeq(seqfp, format, &seq, &sqinfo))
+        {
+          if (used_ssi) /* GSI file puts us right on our seq. */
+            break;
+          else if (getfirst) /* Use the first seq in the file. */
+            break;
+          else if (by_accession &&
+                   (sqinfo.flags & SQINFO_ACC) &&
+                   strcmp(sqinfo.acc, getname) == 0)
+            break;
+          else if (strcmp(sqinfo.name, getname) == 0)
+            break;
+
+          FreeSequence(seq, &sqinfo);
+          seq = NULL; /* stays NULL if the loop ends without a match */
+        }
+
+      if (seq == NULL)
+        Die("failed to extract the subsequence %s\n%s", getname, usage);
+
+      if (getall)
+        {
+          from = 1;
+          to = sqinfo.len;
+        }
+      else if (from == -1) from = 1;
+      else if (to == -1) to = sqinfo.len;
+
+      if (to > sqinfo.len || from > sqinfo.len)
+        Warn("Extracting beyond the length of the sequence");
+      if (to < 1 || from < 1)
+        Warn("Extracting beyond the beginning of the sequence");
+
+      /* check for reverse complement */
+      if (to != -1 && from > to)
+        {
+          int swapfoo; /* temp variable for swapping coords */
+
+          reverse_complement = TRUE;
+          swapfoo = from; from = to; to = swapfoo;
+        }
+      if (to > sqinfo.len) to = sqinfo.len;
+      if (from < 1) from = 1;
+      /* NOTE(review): from is not clamped to sqinfo.len; if from > to here, to-from+2 below goes non-positive -- confirm */
+      if ((frag = (char *) calloc (to-from+2, sizeof(char))) == NULL)
+        Die("memory error\n");
+
+      if (strncpy(frag, seq+from-1, to-from+1) == NULL) /* calloc above zero-filled frag, so it stays NUL-terminated */
+        Die("strncpy() failed\n");
+
+      if (sqinfo.flags & SQINFO_SS)
+        {
+          if ((ss = (char *) calloc (to-from+2, sizeof(char))) == NULL)
+            Die("memory error\n");
+          if (strncpy(ss, sqinfo.ss+from-1, to-from+1) == NULL)
+            Die("strncpy() failed\n");
+          free(sqinfo.ss);
+          sqinfo.ss = ss;
+        }
+
+      if (reverse_complement)
+        {
+          char *revfrag; /* temp variable for reverse complement */
+          int swapfoo; /* temp variable for swapping coords back */
+
+          if ((revfrag = calloc ( to-from+2, sizeof(char))) == NULL)
+            Die("memory failure\n");
+          revcomp(revfrag, frag);
+          free(frag);
+          frag = revfrag;
+          swapfoo = from; from = to; to = swapfoo;
+
+          /* reverse complement nullifies secondary structure */
+          if (sqinfo.flags & SQINFO_SS)
+            { free(sqinfo.ss); sqinfo.flags &= ~SQINFO_SS; }
+        }
+
+      if (! (sqinfo.flags & SQINFO_ID))
+        SetSeqinfoString(&sqinfo, sqinfo.name, SQINFO_ID);
+
+      if (! (sqinfo.flags & SQINFO_OLEN))
+        { sqinfo.olen = sqinfo.len; sqinfo.flags |= SQINFO_OLEN; }
+
+      sqinfo.len = (to > from) ? to-from+1 : from-to+1;
+      sqinfo.flags |= SQINFO_LEN;
+
+      if (rename != NULL)
+        SetSeqinfoString(&sqinfo, rename, SQINFO_NAME);
+
+      source_start = (sqinfo.flags & SQINFO_START) ? sqinfo.start : 1;
+      source_stop = (sqinfo.flags & SQINFO_STOP) ? sqinfo.stop : sqinfo.len; /* NOTE(review): sqinfo.len is already the fragment length here; a 1-residue fragment yields orient -1 -- confirm */
+      source_orient= (source_stop > source_start) ? 1 : -1;
+
+      sqinfo.start = source_start + (from- 1) * source_orient;
+      sqinfo.stop = source_start + (to - 1) * source_orient;
+      sqinfo.flags |= SQINFO_START | SQINFO_STOP;
+
+      WriteSeq(outfp, outfmt, frag, &sqinfo);
+      free(frag);
+      FreeSequence(seq, &sqinfo);
+    }
+
+  if (outfile != NULL)
+    printf("Fragment written to file %s\n", outfile);
+
+  SeqfileClose(seqfp);
+  fclose(outfp);
+  return(0);
+}
diff --git a/forester/archive/RIO/others/hmmer/squid/shuffle.c b/forester/archive/RIO/others/hmmer/squid/shuffle.c
new file mode 100644
index 0000000..d923a2a
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/squid/shuffle.c
@@ -0,0 +1,550 @@
+/*****************************************************************
+ * HMMER - Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ * This source code is distributed under the terms of the
+ * GNU General Public License. See the files COPYING and LICENSE
+ * for details.
+ *****************************************************************/
+
+/* shuffle.c
+ *
+ * Routines for randomizing sequences.
+ *
+ * All routines are alphabet-independent (DNA, protein, RNA, whatever);
+ * they assume that input strings are purely alphabetical [a-zA-Z], and
+ * will return strings in all upper case [A-Z].
+ *
+ * All return 1 on success, and 0 on failure; 0 status invariably
+ * means the input string was not alphabetical.
+ *
+ * StrShuffle() - shuffled string, preserve mono-symbol composition.
+ * StrDPShuffle() - shuffled string, preserve mono- and di-symbol composition.
+ *
+ * StrMarkov0() - random string, same zeroth order Markov properties.
+ * StrMarkov1() - random string, same first order Markov properties.
+ *
+ * StrReverse() - simple reversal of string
+ * StrRegionalShuffle() - mono-symbol shuffled string in regional windows
+ *
+ * There are also similar routines for shuffling alignments:
+ *
+ * AlignmentShuffle() - alignment version of StrShuffle().
+ * AlignmentBootstrap() - sample with replacement; a bootstrap dataset.
+ *
+ * CVS $Id: shuffle.c,v 1.1.1.1 2005/03/22 08:34:24 cmzmasek Exp $
+ */
+
+#include /* NOTE(review): the header names of these two includes are missing in this copy -- restore from upstream */
+#include
+
+#include "squid.h"
+
+/* Function: StrShuffle()
+ *
+ * Purpose: Returns a shuffled version of s2, in s1.
+ * (s1 and s2 can be identical, to shuffle in place.)
+ * Characters are only swapped, never converted, so
+ * the case of each character is kept as given.
+ * Args: s1 - allocated space for shuffled string.
+ * s2 - string to shuffle.
+ *
+ * Return: 1 on success.
+ */
+int
+StrShuffle(char *s1, char *s2)
+{
+  int len; /* length of the not-yet-fixed prefix of s1 */
+  int pos;
+  char c;
+
+  if (s1 != s2) strcpy(s1, s2);
+  for (len = strlen(s1); len > 1; len--)
+    {
+      pos = CHOOSE(len); /* pick a position in the prefix, which may be its last slot */
+      c = s1[pos];
+      s1[pos] = s1[len-1]; /* swap the pick into the last slot of the prefix */
+      s1[len-1] = c;
+    }
+  return 1;
+}
+
+/* Function: StrDPShuffle()
+ * Date: SRE, Fri Oct 29 09:15:17 1999 [St. Louis]
+ *
+ * Purpose: Returns a shuffled version of s2, in s1.
+ * (s1 and s2 may be identical; i.e. a string
+ * may be shuffled in place.) The shuffle is a
+ * "doublet-preserving" (DP) shuffle. Both
+ * mono- and di-symbol composition are preserved.
+ *
+ * Done by searching for a random Eulerian
+ * walk on a directed multigraph.
+ * Reference: S.F. Altschul and B.W. Erickson, Mol. Biol.
+ * Evol. 2:526-538, 1985. Quoted bits in my comments
+ * are from Altschul's outline of the algorithm.
+ *
+ * Args: s1 - RETURN: the string after it's been shuffled
+ * (space for s1 allocated by caller)
+ * s2 - the string to be shuffled
+ *
+ * Returns: 0 if string can't be shuffled (it's not all [a-zA-Z]
+ * alphabetic).
+ * 1 on success.
+ */ +int +StrDPShuffle(char *s1, char *s2) +{ + int len; + int pos; /* a position in s1 or s2 */ + int x,y; /* indices of two characters */ + char **E; /* edge lists: E[0] is the edge list from vertex A */ + int *nE; /* lengths of edge lists */ + int *iE; /* positions in edge lists */ + int n; /* tmp: remaining length of an edge list to be shuffled */ + char sf; /* last character in s2 */ + char Z[26]; /* connectivity in last edge graph Z */ + int keep_connecting; /* flag used in Z connectivity algorithm */ + int is_eulerian; /* flag used for when we've got a good Z */ + + /* First, verify that the string is entirely alphabetic. + */ + len = strlen(s2); + for (pos = 0; pos < len; pos++) + if (! isalpha(s2[pos])) return 0; + + /* "(1) Construct the doublet graph G and edge ordering E + * corresponding to S." + * + * Note that these also imply the graph G; and note, + * for any list x with nE[x] = 0, vertex x is not part + * of G. + */ + E = MallocOrDie(sizeof(char *) * 26); + nE = MallocOrDie(sizeof(int) * 26); + for (x = 0; x < 26; x++) + { + E[x] = MallocOrDie(sizeof(char) * (len-1)); + nE[x] = 0; + } + + x = toupper(s2[0]) - 'A'; + for (pos = 1; pos < len; pos++) + { + y = toupper(s2[pos]) - 'A'; + E[x][nE[x]] = y; + nE[x]++; + x = y; + } + + /* Now we have to find a random Eulerian edge ordering. + */ + sf = toupper(s2[len-1]) - 'A'; + is_eulerian = 0; + while (! is_eulerian) + { + /* "(2) For each vertex s in G except s_f, randomly select + * one edge from the s edge list of E(S) to be the + * last edge of the s list in a new edge ordering." + * + * select random edges and move them to the end of each + * edge list. + */ + for (x = 0; x < 26; x++) + { + if (nE[x] == 0 || x == sf) continue; + + pos = CHOOSE(nE[x]); + y = E[x][pos]; + E[x][pos] = E[x][nE[x]-1]; + E[x][nE[x]-1] = y; + } + + /* "(3) From this last set of edges, construct the last-edge + * graph Z and determine whether or not all of its + * vertices are connected to s_f." 
+ * + * a probably stupid algorithm for looking at the + * connectivity in Z: iteratively sweep through the + * edges in Z, and build up an array (confusing called Z[x]) + * whose elements are 1 if x is connected to sf, else 0. + */ + for (x = 0; x < 26; x++) Z[x] = 0; + Z[(int) sf] = keep_connecting = 1; + + while (keep_connecting) { + keep_connecting = 0; + for (x = 0; x < 26; x++) + { + y = E[x][nE[x]-1]; /* xy is an edge in Z */ + if (Z[x] == 0 && Z[y] == 1) /* x is connected to sf in Z */ + { + Z[x] = 1; + keep_connecting = 1; + } + } + } + + /* if any vertex in Z is tagged with a 0, it's + * not connected to sf, and we won't have a Eulerian + * walk. + */ + is_eulerian = 1; + for (x = 0; x < 26; x++) + { + if (nE[x] == 0 || x == sf) continue; + if (Z[x] == 0) { + is_eulerian = 0; + break; + } + } + + /* "(4) If any vertex is not connected in Z to s_f, the + * new edge ordering will not be Eulerian, so return to + * (2). If all vertices are connected in Z to s_f, + * the new edge ordering will be Eulerian, so + * continue to (5)." + * + * e.g. note infinite loop while is_eulerian is FALSE. + */ + } + + /* "(5) For each vertex s in G, randomly permute the remaining + * edges of the s edge list of E(S) to generate the s + * edge list of the new edge ordering E(S')." + * + * Essentially a StrShuffle() on the remaining nE[x]-1 elements + * of each edge list; unfortunately our edge lists are arrays, + * not strings, so we can't just call out to StrShuffle(). + */ + for (x = 0; x < 26; x++) + for (n = nE[x] - 1; n > 1; n--) + { + pos = CHOOSE(n); + y = E[x][pos]; + E[x][pos] = E[x][n-1]; + E[x][n-1] = y; + } + + /* "(6) Construct sequence S', a random DP permutation of + * S, from E(S') as follows. Start at the s_1 edge list. + * At each s_i edge list, add s_i to S', delete the + * first edge s_i,s_j of the edge list, and move to + * the s_j edge list. Continue this process until + * all edge lists are exhausted." 
+ */ + iE = MallocOrDie(sizeof(int) * 26); + for (x = 0; x < 26; x++) iE[x] = 0; + + pos = 0; + x = toupper(s2[0]) - 'A'; + while (1) + { + s1[pos++] = 'A' + x; /* add s_i to S' */ + + y = E[x][iE[x]]; + iE[x]++; /* "delete" s_i,s_j from edge list */ + + x = y; /* move to s_j edge list. */ + + if (iE[x] == nE[x]) + break; /* the edge list is exhausted. */ + } + s1[pos++] = 'A' + sf; + s1[pos] = '\0'; + + /* Reality checks. + */ + if (x != sf) Die("hey, you didn't end on s_f."); + if (pos != len) Die("hey, pos (%d) != len (%d).", pos, len); + + /* Free and return. + */ + Free2DArray((void **) E, 26); + free(nE); + free(iE); + return 1; +} + + +/* Function: StrMarkov0() + * Date: SRE, Fri Oct 29 11:08:31 1999 [St. Louis] + * + * Purpose: Returns a random string s1 with the same + * length and zero-th order Markov properties + * as s2. + * + * s1 and s2 may be identical, to randomize s2 + * in place. + * + * Args: s1 - allocated space for random string + * s2 - string to base s1's properties on. + * + * Returns: 1 on success; 0 if s2 doesn't look alphabetical. + */ +int +StrMarkov0(char *s1, char *s2) +{ + int len; + int pos; + float p[26]; /* symbol probabilities */ + + /* First, verify that the string is entirely alphabetic. + */ + len = strlen(s2); + for (pos = 0; pos < len; pos++) + if (! isalpha(s2[pos])) return 0; + + /* Collect zeroth order counts and convert to frequencies. + */ + FSet(p, 26, 0.); + for (pos = 0; pos < len; pos++) + p[(int)(toupper(s2[pos]) - 'A')] += 1.0; + FNorm(p, 26); + + /* Generate a random string using those p's. + */ + for (pos = 0; pos < len; pos++) + s1[pos] = FChoose(p, 26) + 'A'; + s1[pos] = '\0'; + + return 1; +} + + +/* Function: StrMarkov1() + * Date: SRE, Fri Oct 29 11:22:20 1999 [St. Louis] + * + * Purpose: Returns a random string s1 with the same + * length and first order Markov properties + * as s2. + * + * s1 and s2 may be identical, to randomize s2 + * in place. 
+ * + * Args: s1 - allocated space for random string + * s2 - string to base s1's properties on. + * + * Returns: 1 on success; 0 if s2 doesn't look alphabetical. + */ +int +StrMarkov1(char *s1, char *s2) +{ + int len; + int pos; + int x,y; + int i; /* initial symbol */ + float p[26][26]; /* symbol probabilities */ + + /* First, verify that the string is entirely alphabetic. + */ + len = strlen(s2); + for (pos = 0; pos < len; pos++) + if (! isalpha(s2[pos])) return 0; + + /* Collect first order counts and convert to frequencies. + */ + for (x = 0; x < 26; x++) FSet(p[x], 26, 0.); + + i = x = toupper(s2[0]) - 'A'; + for (pos = 1; pos < len; pos++) + { + y = toupper(s2[pos]) - 'A'; + p[x][y] += 1.0; + x = y; + } + for (x = 0; x < 26; x++) + FNorm(p[x], 26); + + /* Generate a random string using those p's. + */ + x = i; + s1[0] = x + 'A'; + for (pos = 1; pos < len; pos++) + { + y = FChoose(p[x], 26); + s1[pos] = y + 'A'; + x = y; + } + s1[pos] = '\0'; + + return 1; +} + + + +/* Function: StrReverse() + * Date: SRE, Thu Nov 20 10:54:52 1997 [St. Louis] + * + * Purpose: Returns a reversed version of s2, in s1. + * (s1 and s2 can be identical, to reverse in place) + * + * Args: s1 - allocated space for reversed string. + * s2 - string to reverse. + * + * Return: 1. + */ +int +StrReverse(char *s1, char *s2) +{ + int len; + int pos; + char c; + + if (s1 != s2) strcpy(s1, s2); + len = strlen(s1); + for (pos = 0; pos < len/2; pos++) + { /* swap ends */ + c = s1[len-pos-1]; + s1[len-pos-1] = s1[pos]; + s1[pos] = c; + } + return 1; +} + +/* Function: StrRegionalShuffle() + * Date: SRE, Thu Nov 20 11:02:34 1997 [St. Louis] + * + * Purpose: Returns a regionally shuffled version of s2, in s1. + * (s1 and s2 can be identical to regionally + * shuffle in place.) See [Pearson88]. + * + * Args: s1 - allocated space for regionally shuffled string. + * s2 - string to regionally shuffle + * w - window size (typically 10 or 20) + * + * Return: 1. 
+ */
+int
+StrRegionalShuffle(char *s1, char *s2, int w)
+{
+  int len;
+  char c;
+  int pos;
+  int i, j; /* i: first index of the current window; j: slot being filled, from the window's end down */
+
+  if (s1 != s2) strcpy(s1, s2);
+  len = strlen(s1);
+
+  for (i = 0; i < len; i += w) /* NOTE(review): w <= 0 never advances i -- callers presumably pass w > 0; confirm */
+    for (j = MIN(len-1, i+w-1); j > i; j--) /* the last window is cut short at the end of the string */
+      {
+        pos = i + CHOOSE(j-i); /* NOTE(review): StrShuffle() draws CHOOSE(len) for last slot len-1; here slot j itself is never drawn -- confirm intent */
+        c = s1[pos];
+        s1[pos] = s1[j];
+        s1[j] = c;
+      }
+  return 1;
+}
+
+
+/* Function: AlignmentShuffle()
+ * Date: SRE, Sun Apr 22 18:37:15 2001 [St. Louis]
+ *
+ * Purpose: Returns a shuffled version of ali2, in ali1.
+ * (ali1 and ali2 can be identical, to shuffle
+ * in place.) The alignment columns are shuffled,
+ * preserving % identity within the columns.
+ *
+ * Args: ali1 - allocated space for shuffled alignment
+ * [0..nseq-1][0..alen-1]
+ * ali2 - alignment to be shuffled
+ * nseq - number of sequences in the alignment
+ * alen - length of alignment, in columns.
+ *
+ * Returns: 1 (always).
+ */
+int
+AlignmentShuffle(char **ali1, char **ali2, int nseq, int alen)
+{
+  int i;
+  int pos;
+  char c;
+
+  if (ali1 != ali2)
+    {
+      for (i = 0; i < nseq; i++) strcpy(ali1[i], ali2[i]);
+    }
+
+  for (i = 0; i < nseq; i++)
+    ali1[i][alen] = '\0'; /* each row therefore needs alen+1 bytes */
+
+  for (; alen > 1; alen--) /* alen is reused as the count of columns not yet fixed */
+    {
+      pos = CHOOSE(alen); /* one draw per column, applied to every row, so columns stay intact */
+      for (i = 0; i < nseq; i++)
+        {
+          c = ali1[i][pos];
+          ali1[i][pos] = ali1[i][alen-1];
+          ali1[i][alen-1] = c;
+        }
+    }
+
+  return 1;
+}
+
+/* Function: AlignmentBootstrap()
+ * Date: SRE, Sun Apr 22 18:49:14 2001 [St. Louis]
+ *
+ * Purpose: Returns a bootstrapped alignment sample in ali1,
+ * constructed from ali2 by sampling columns with
+ * replacement.
+ *
+ * Unlike the other shuffling routines, ali1 and
+ * ali2 cannot be the same. ali2 is left unchanged.
+ * ali1 must be a properly allocated space for an
+ * alignment the same size as ali2.
+ *
+ * Args: ali1 - allocated space for bootstrapped alignment
+ * [0..nseq-1][0..alen-1]
+ * ali2 - alignment to be bootstrapped
+ * nseq - number of sequences in the alignment
+ * alen - length of alignment, in columns.
+ *
+ * Returns: 1 on success.
+ */
+int
+AlignmentBootstrap(char **ali1, char **ali2, int nseq, int alen)
+{
+  int pos; /* destination column in ali1 */
+  int col; /* source column drawn from ali2 */
+  int i;
+
+  for (pos = 0; pos < alen; pos++)
+    {
+      col = CHOOSE(alen); /* drawn afresh for every destination column, so a source column can repeat */
+      for (i = 0; i < nseq; i++)
+        ali1[i][pos] = ali2[i][col]; /* copy the whole column, one row at a time */
+    }
+  for (i = 0; i < nseq; i++)
+    ali1[i][alen] = '\0'; /* each row of ali1 therefore needs alen+1 bytes */
+
+  return 1;
+}
+
+
+
+
+#ifdef TESTDRIVER
+/* Smoke test: prints one DP shuffle and one Markov-0 and Markov-1 sample of a fixed string, seeded with 42.
+ * cc -g -o testdriver -DTESTDRIVER -L. shuffle.c -lsquid -lm
+ */
+int
+main(int argc, char **argv)
+{
+  char s1[100];
+  char s2[100];
+
+  sre_srandom(42);
+  strcpy(s2, "GGGGGGGGGGCCCCCCCCCC");
+  /* strcpy(s2, "AGACATAAAGTTCCGTACTGCCGGGAT");
+   */
+  StrDPShuffle(s1, s2);
+  printf("DPshuffle: %s\n", s1);
+  StrMarkov0(s1,s2);
+  printf("Markov 0 : %s\n", s1);
+  StrMarkov1(s1,s2);
+  printf("Markov 1 : %s\n", s1);
+  return 0;
+}
+#endif
diff --git a/forester/archive/RIO/others/hmmer/squid/shuffle_main.c b/forester/archive/RIO/others/hmmer/squid/shuffle_main.c
new file mode 100644
index 0000000..34be923
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/squid/shuffle_main.c
@@ -0,0 +1,281 @@
+/*****************************************************************
+ * HMMER - Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ * This source code is distributed under the terms of the
+ * GNU General Public License. See the files COPYING and LICENSE
+ * for details.
+ *****************************************************************/
+
+/* main for shuffle
+ *
+ * shuffle - generate shuffled sequences
+ * Mon Feb 26 16:56:08 1996
+ *
+ * CVS $Id: shuffle_main.c,v 1.1.1.1 2005/03/22 08:34:16 cmzmasek Exp $
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include "squid.h"
+
+char banner[] = "shuffle - generated shuffled (or otherwise randomized) sequence";
+
+char usage[] = "\
+Usage: shuffle [-options] <seqfile>\n\
+  Available options:\n\
+  -h         : help; print version and usage info\n\
+  -n <n>     : make <n> samples per input seq (default 1)\n\
+  -t <x>     : truncate/delete inputs to fixed length <x>\n\
+\n\
+  Default: shuffle each input randomly, preserving mono-symbol composition.\n\
+  Other choices (exclusive; can't use more than one) :\n\
+  -d         : shuffle but preserve both mono- and di-symbol composition\n\
+  -0         : generate with same 0th order Markov properties as each input\n\
+  -1         : generate with same 1st order Markov properties as each input\n\
+  -l         : make iid sequences of same number and length as inputs\n\
+  -r         : reverse inputs\n\
+  -w <n>     : regionally shuffle inputs in window size <n>\n\
+  -i         : make [-n] iid seqs of length [-t] of type [--dna|--amino];\n\
+               when -i is set, no <seqfile> argument is used\n\
+";
+
+char experts[] = "\
+  --alignment    : <seqfile> is an alignment; shuffle the columns\n\
+  --amino        : synthesize protein sequences [default] (see -i, -l)\n\
+  --dna          : synthesize DNA sequences (see -i, -l)\n\
+  --informat <s> : specify sequence file format <s>\n\
+  --nodesc       : remove sequence description lines\n\
+  --seed <s>     : set random number seed to <s>\n\
+";
+
+struct opt_s OPTIONS[] = {
+  { "-0",          TRUE,  sqdARG_NONE },   /* 0th order Markov                  */
+  { "-1",          TRUE,  sqdARG_NONE },   /* 1st order Markov                  */
+  { "-d",          TRUE,  sqdARG_NONE },   /* digram shuffle                    */
+  { "-h",          TRUE,  sqdARG_NONE },   /* help                              */
+  { "-i",          TRUE,  sqdARG_NONE },   /* make iid seq of set length        */
+  { "-l",          TRUE,  sqdARG_NONE },   /* make iid seq of same length       */
+  { "-n",          TRUE,  sqdARG_INT  },   /* number of shuffles per input seq  */
+  { "-r",          TRUE,  sqdARG_NONE },   /* reverse seq rather than shuffle   */
+  { "-t",          TRUE,  sqdARG_INT  },   /* truncation of inputs to fixed len */
+  { "-w",          TRUE,  sqdARG_INT  },   /* do regional shuffling             */
+  { "--alignment", FALSE, sqdARG_NONE },   /* input is alignment; shuff cols    */
+  { "--amino",     FALSE, sqdARG_NONE },   /* make iid protein seqs [default]   */
+  { "--dna",       FALSE, sqdARG_NONE },   /* make iid DNA seqs                 */
+  { "--informat",  FALSE, sqdARG_STRING }, /* force the input file format       */
+  { "--nodesc",    FALSE, sqdARG_NONE },   /* remove desc lines                 */
+  { "--seed",      FALSE, sqdARG_INT  },   /* set the random number seed        */
+};
+#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
+
+static void shuffle_alignment_file(char *afile, int fmt);
+
+int
+main(int argc, char **argv)  /* parse options; then -i synthesis, or --alignment column shuffle, or per-sequence shuffling to stdout */
+{
+  char   *seqfile;        /* name of sequence file */
+  SQFILE *dbfp;           /* open sequence file */
+  int     fmt;            /* format of seqfile  */
+  char   *seq;            /* sequence */
+  char    sqname[32];     /* name of an iid sequence */
+  SQINFO  sqinfo;         /* additional sequence info */
+  char   *shuff;          /* shuffled sequence */
+  int     num;            /* number to generate */
+  int     seed;           /* random number generator seed */
+  int     i;
+  int     w;              /* window size for regional shuffle (or 0) */
+  int     truncation;     /* fixed length for truncation option (or 0) */
+  int     no_desc;        /* TRUE to remove description lines */
+  enum {                  /* shuffling strategy */
+    DO_SHUFFLE, DO_DPSHUFFLE, DO_MARKOV0, DO_MARKOV1, DO_REVERSE, DO_REGIONAL,
+    DO_IID_SAMELEN, DO_IID_FIXEDLEN} strategy;
+  int     do_dna;         /* TRUE to make DNA iid seqs, not protein */
+  int     do_alignment;   /* TRUE to shuffle alignment columns */
+
+  char   *optname;        /* option name */
+  char   *optarg;         /* option argument (or NULL) */
+  int     optind;         /* index of next argv[] */
+
+
+  /***********************************************
+   * Parse command line
+   ***********************************************/
+
+  fmt          = SQFILE_UNKNOWN;  /* autodetect file format by default */
+  num          = 0;               /* 0 = "not set"; defaulted to 1 below */
+  seed         = (int) time ((time_t *) NULL);
+  w            = 0;
+  truncation   = 0;
+  strategy     = DO_SHUFFLE;
+  no_desc      = FALSE;
+  do_dna       = FALSE;
+  do_alignment = FALSE;
+
+  while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
+                &optind, &optname, &optarg))
+    {
+      if      (strcmp(optname, "-0") == 0) strategy = DO_MARKOV0;
+      else if (strcmp(optname, "-1") == 0) strategy = DO_MARKOV1;
+      else if (strcmp(optname, "-d") == 0) strategy = DO_DPSHUFFLE;
+      else if (strcmp(optname, "-n") == 0) num = atoi(optarg);
+      else if (strcmp(optname, "-w") == 0) {strategy = DO_REGIONAL; w = atoi(optarg); if (w <= 0) Die("-w window size must be a positive integer"); }
+      else if (strcmp(optname, "-i") == 0) strategy = DO_IID_FIXEDLEN;
+      else if (strcmp(optname, "-l") == 0) strategy = DO_IID_SAMELEN;
+      else if (strcmp(optname, "-r") == 0) strategy = DO_REVERSE;
+      else if (strcmp(optname, "-t") == 0) truncation = atoi(optarg);
+
+      else if (strcmp(optname, "--alignment")== 0) do_alignment = TRUE;
+      else if (strcmp(optname, "--amino")    == 0) do_dna = FALSE;
+      else if (strcmp(optname, "--dna")      == 0) do_dna = TRUE;
+      else if (strcmp(optname, "--nodesc")   == 0) no_desc = TRUE;
+      else if (strcmp(optname, "--seed")     == 0) seed = atoi(optarg);
+      else if (strcmp(optname, "--informat") == 0) {
+        fmt = String2SeqfileFormat(optarg);
+        if (fmt == SQFILE_UNKNOWN)
+          Die("unrecognized sequence file format \"%s\"", optarg);
+      }
+      else if (strcmp(optname, "-h") == 0) {
+        Banner(stdout, banner);
+        puts(usage);
+        puts(experts);
+        exit(EXIT_SUCCESS);
+      }
+    }
+
+  /*****************************************************************
+   * Special case, 1: IID sequence generation.
+   * -i option is special, because it synthesizes, rather than
+   * shuffles. Doesn't take a seqfile argument;
+   * requires -n, -t; and doesn't use the same code logic as the
+   * other shuffling strategies. Note that we misuse/overload the
+   * -t "truncation length" option to set our fixed length for
+   * generating iid sequence. Negative -n/-t are rejected as unset.
+   *****************************************************************/
+
+  if (strategy == DO_IID_FIXEDLEN) {
+    if (num <= 0 || truncation <= 0)
+      Die("-i (i.i.d. sequence generation) requires -n,-t to be set\n%s\n",
+          usage);
+    if (argc-optind != 0)
+      Die("-i (i.i.d. sequence generation) takes no seqfile argument\n%s\n",
+          usage);
+    sre_srandom(seed);
+    for (i = 0; i < num; i++)
+      {
+        if (do_dna)
+          shuff = RandomSequence(DNA_ALPHABET, dnafq, 4, truncation);
+        else
+          shuff = RandomSequence(AMINO_ALPHABET, aafq, 20, truncation);
+
+        /* pedantic note: sqname has room for 31 char + \0, so
+         * there's room for 24 digits - a 32-bit integer can only run up
+         * to 10 digits, and a 64-bit integer to 20, so we don't worry
+         * about the following sprintf() overrunning its bounds.
+         */
+        sprintf(sqname, "randseq%d", i);
+        WriteSimpleFASTA(stdout, shuff, sqname, NULL);
+        free(shuff);
+      }
+    return 0;
+  }
+
+  /*****************************************************************
+   * Check command line
+   *****************************************************************/
+
+  if (argc - optind != 1)
+    Die("Incorrect number of command line arguments\n%s\n", usage);
+  seqfile = argv[optind];
+  if (num == 0) num = 1;        /* set default shuffle number per sequence */
+  sre_srandom(seed);
+
+  /*****************************************************************
+   * Special case, 2: Alignment shuffling
+   *****************************************************************/
+  if (do_alignment)
+    {
+      shuffle_alignment_file(seqfile, fmt);  /* only seqfile and fmt are passed: -n, -t and the strategy flags are not used on this path */
+      return 0;
+    }
+
+  /*****************************************************************
+   * Main logic of the shuffling program:
+   * expect one seqfile argument
+   *****************************************************************/
+
+  if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL)
+    Die("Failed to open sequence file %s for reading", seqfile);
+
+  while (ReadSeq(dbfp, dbfp->format, &seq, &sqinfo))
+    {
+      shuff = (char *) MallocOrDie ((sqinfo.len + 1) * sizeof(char));
+
+      if (no_desc) strcpy(sqinfo.desc, "");
+
+      /* If we're truncating seq, do it now. Inputs shorter than -t are skipped.
+       */
+      if (truncation > 0)
+        {
+          int start;
+          if (sqinfo.len < truncation) {
+            free(shuff);
+            FreeSequence(seq, &sqinfo);
+            continue;
+          }
+
+          start = CHOOSE(sqinfo.len - truncation + 1);  /* start of the window that is kept */
+          strncpy(shuff, seq+start, truncation);
+          shuff[truncation] = '\0';                     /* strncpy() does not terminate here */
+          strcpy(seq, shuff);
+          sqinfo.len = truncation;
+        }
+
+      for (i = 0; i < num; i++)
+        {
+          switch (strategy) {
+          case DO_SHUFFLE:     StrShuffle(shuff, seq);            break;
+          case DO_DPSHUFFLE:   StrDPShuffle(shuff, seq);          break;
+          case DO_MARKOV0:     StrMarkov0(shuff, seq);            break;
+          case DO_MARKOV1:     StrMarkov1(shuff, seq);            break;
+          case DO_REVERSE:     StrReverse(shuff, seq);            break;
+          case DO_REGIONAL:    StrRegionalShuffle(shuff, seq, w); break;
+          case DO_IID_SAMELEN:  /* -l: synthesize instead of shuffling; honors --dna/--amino as -i does */
+            free(shuff);
+            shuff = do_dna ? RandomSequence(DNA_ALPHABET, dnafq, 4, sqinfo.len) : RandomSequence(AMINO_ALPHABET, aafq, 20, sqinfo.len);
+            break;
+          default: Die("choked on a bad enum; tragic.");
+          }
+
+          WriteSeq(stdout, SQFILE_FASTA, shuff, &sqinfo);
+        }
+
+      if (shuff != NULL) free(shuff);
+      FreeSequence(seq, &sqinfo);
+    }
+
+  SeqfileClose(dbfp);
+  return 0;
+}
+
+
+static void
+shuffle_alignment_file(char *afile, int fmt)  /* shuffle each alignment in afile and write it to stdout */
+{
+  MSAFILE *afp;
+  MSA     *msa;
+
+  if ((afp = MSAFileOpen(afile, fmt, NULL)) == NULL)
+    Die("Alignment file %s could not be opened for reading", afile);
+  while ((msa = MSAFileRead(afp)) != NULL)
+    {
+                                /* shuffle in place: source and destination are the same rows */
+      AlignmentShuffle(msa->aseq, msa->aseq, msa->nseq, msa->alen);
+                                /* write in same format we read in */
+      MSAFileWrite(stdout, msa, afp->format, FALSE);
+      MSAFree(msa);
+    }
+  MSAFileClose(afp);
+}
diff --git a/forester/archive/RIO/others/hmmer/squid/sindex_main.c b/forester/archive/RIO/others/hmmer/squid/sindex_main.c
new file mode 100644
index 0000000..ad2faca
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/squid/sindex_main.c
@@ -0,0 +1,185 @@
+/*****************************************************************
+ * HMMER - Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ * This source code
is distributed under the terms of the
+ * GNU General Public License. See the files COPYING and LICENSE
+ * for details.
+ *****************************************************************/
+
+/* sindex_main.c, SRE, Fri Feb 16 08:38:39 2001 [St. Louis]
+ *
+ * sindex -- create SSI index of sequence file(s) for sfetch
+ *
+ * CVS $Id: sindex_main.c,v 1.1.1.1 2005/03/22 08:34:31 cmzmasek Exp $
+ */
+
+#include <stdio.h>
+#include "squid.h"
+#include "msa.h"
+#include "ssi.h"
+
+static char banner[] = "sindex - create SSI index of sequence file(s) for sfetch";
+
+static char usage[] = "\
+Usage: sindex [-options] <seqfile>...\n\
+  Available options:\n\
+  -h        : help; print version and usage info.\n\
+  -o <file> : output the SSI index to file named <file>\n\
+";
+
+static char experts[] = "\
+  --informat <s> : specify input sequence file format <s>\n\
+";
+
+struct opt_s OPTIONS[] = {
+  { "-h",         TRUE,  sqdARG_NONE   },
+  { "-o",         TRUE,  sqdARG_STRING },
+  { "--64",       FALSE, sqdARG_NONE   },  /* was "FALSE< sqdARG_NONE": a comparison, leaving the arg-type field implicit */
+  { "--informat", FALSE, sqdARG_STRING },
+};
+#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
+
+int
+main(int argc, char **argv)  /* index every seqfile named on the command line into one SSI file */
+{
+  char     *file;       /* name of a sequence file */
+  SQFILE   *sfp;        /* open sequence file */
+  int       format;     /* forced sequence file format, if any */
+  int       mode;       /* SSI_OFFSET_I32 or SSI_OFFSET_I64 */
+  int       idx;        /* counter over files */
+  int       status;     /* return status from an SSI call */
+  SSIINDEX *ssi;        /* the index we're creating */
+  char     *ssifile;    /* file name for the SSI index */
+  int       fh;         /* handle on current file */
+  char     *seq;        /* a sequence read from the file */
+  SQINFO    sqinfo;     /* info on the sequence */
+
+  char *optname;
+  char *optarg;
+  int   optind;
+
+  /***********************************************
+   * Parse the command line
+   ***********************************************/
+
+                                /* initializations and defaults */
+  format  = SQFILE_UNKNOWN;     /* autodetecting format is the default */
+  mode    = SSI_OFFSET_I32;     /* default = 32 bit mode */
+  ssifile = NULL;               /* default: set SSI file name as
+ <file>.ssi, where <file> is the last seqfile argument */
+
+  while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
+                &optind, &optname, &optarg))
+    {
+      if      (strcmp(optname, "--64") == 0) mode = SSI_OFFSET_I64;
+      else if (strcmp(optname, "-o") == 0)   ssifile = sre_strdup(optarg, -1);
+      else if (strcmp(optname, "--informat") == 0) {
+        format = String2SeqfileFormat(optarg);
+        if (format == SQFILE_UNKNOWN)
+          Die("unrecognized input sequence file format \"%s\"", optarg);
+      }
+      else if (strcmp(optname, "-h") == 0) {
+        Banner(stdout, banner);
+        puts(usage);
+        puts(experts);
+        exit(EXIT_SUCCESS);
+      }
+    }
+
+  if (argc - optind < 1)
+    Die("Incorrect number of command line arguments.\n%s\n", usage);
+
+
+  /*****************************************************************
+   * Get set up...
+   *****************************************************************/
+
+  /* Determine whether we'll index in 32-bit or 64-bit mode.
+   * 32-bit is default, but 64-bit trumps; if any file needs 64-bit,
+   * we index them all that way.
+   */
+  for (idx = optind; idx < argc; idx++)
+    {
+      file = argv[idx];
+      if ((status = SSIRecommendMode(file)) == -1)
+        Die("Couldn't stat %s - file doesn't exist, or is too big", file);
+      if (status == SSI_OFFSET_I64) mode = SSI_OFFSET_I64;
+    }
+
+  if (ssifile == NULL) {        /* no -o: "file" still holds the last argument from the loop above */
+    ssifile = sre_strdup(file, -1);
+    sre_strcat(&ssifile, -1, ".ssi", -1);
+  }
+
+  if ((ssi = SSICreateIndex(mode)) == NULL)
+    Die("Couldn't allocate/initialize the new SSI index\n");
+
+  /*****************************************************************
+   * Go through the files one at a time and compile index.
+   *****************************************************************/
+
+  for (idx = optind; idx < argc; idx++)
+    {
+      file = argv[idx];
+      printf("Working on file %s... \t", file);
+      fflush(stdout);
+
+      if ((sfp = SeqfileOpenForIndexing(file, format, NULL, mode)) == NULL)
+        Die("Failed to open sequence file %s for reading", file);
+
+      if ((status = SSIAddFileToIndex(ssi, file, sfp->format, &fh)) != 0)
+        Die("SSI error: %s\n", SSIErrorString(status));
+
+      while (ReadSeq(sfp, sfp->format, &seq, &sqinfo)) {
+        if ((status = SSIAddPrimaryKeyToIndex(ssi, sqinfo.name, fh,
+                                              &(sfp->r_off), &(sfp->d_off),
+                                              sqinfo.len)) != 0)
+          Die("SSI error: %s\n", SSIErrorString(status));
+
+#if DEBUGLEVEL >= 2
+        if (mode == SSI_OFFSET_I32)
+          SQD_DPRINTF2(("Added primary key %s: r_off=%d, d_off=%d len=%d\n",
+                        sqinfo.name, sfp->r_off.off.i32,
+                        sfp->d_off.off.i32, sqinfo.len));
+        else
+          SQD_DPRINTF2(("Added primary key %s: r_off=%lld, d_off=%lld len=%d\n",
+                        sqinfo.name, sfp->r_off.off.i64, sfp->d_off.off.i64,
+                        sqinfo.len));
+#endif
+
+        if (sqinfo.flags & SQINFO_ACC) {  /* accession present: index it as a secondary key */
+          if ((status = SSIAddSecondaryKeyToIndex(ssi, sqinfo.acc, sqinfo.name)) != 0)
+            Die("SSI error: %s\n", SSIErrorString(status));
+        }
+
+        FreeSequence(seq, &sqinfo);
+      }
+      if (sfp->bpl > 0 && sfp->rpl > 0) {  /* NOTE(review): SeqfileLineParameters() in sqio.c also requires maxbpl/maxrpl to match - confirm the raw fields suffice */
+        if ((status = SSISetFileForSubseq(ssi, fh, sfp->bpl, sfp->rpl)) != 0)
+          Die("SSI error: %s\n", SSIErrorString(status));
+        printf("FAST_SUBSEQ set...\t");
+      }
+
+      SeqfileClose(sfp);
+      printf("[done]\n");
+    }
+
+  printf("Sorting and writing index to SSI file %s...\t", ssifile);
+  fflush(stdout);
+  if ((status = SSIWriteIndex(ssifile, ssi)) != 0)
+    Die("SSIWriteIndex() failed: %s", SSIErrorString(status));
+  printf("[done]\n");
+
+  printf("%s:\n", ssifile);
+  printf("Mode: %s\n",
+         mode == SSI_OFFSET_I32 ? "32-bit" : "64-bit");
+  printf("Files: %d\n", ssi->nfiles);
+  printf("Primary keys: %d\n", ssi->nprimary);
+  printf("Secondary keys: %d\n", ssi->nsecondary);
+
+  SSIFreeIndex(ssi);
+
+  free(ssifile);
+  return 0;
+}
diff --git a/forester/archive/RIO/others/hmmer/squid/sqerror.c b/forester/archive/RIO/others/hmmer/squid/sqerror.c
new file mode 100644
index 0000000..ee6a32e
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/squid/sqerror.c
@@ -0,0 +1,95 @@
+/*****************************************************************
+ * HMMER - Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ * This source code is distributed under the terms of the
+ * GNU General Public License. See the files COPYING and LICENSE
+ * for details.
+ *****************************************************************/
+
+/* sqerror.c
+ *
+ * error handling for the squid library
+ * RCS $Id: sqerror.c,v 1.1.1.1 2005/03/22 08:34:26 cmzmasek Exp $
+ */
+
+                                /* a global errno equivalent */
+int squid_errno;
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#ifdef MEMDEBUG
+#include "dbmalloc.h"
+#endif
+
+/* Function: Die()
+ *
+ * Purpose:  Print an error message and die. The arguments
+ *           are formatted exactly like arguments to printf().
+ *           Output goes to stderr as "\nFATAL: <message>\n".
+ * Return:   None. Exits the program with status 1.
+ */
+/* VARARGS0 */
+void
+Die(char *format, ...)
+{
+  va_list  argp;
+                                /* format the error mesg */
+  fprintf(stderr, "\nFATAL: ");
+  va_start(argp, format);
+  vfprintf(stderr, format, argp);
+  va_end(argp);
+  fprintf(stderr, "\n");
+  fflush(stderr);
+                                /* exit  */
+  exit(1);
+}
+
+
+
+/* Function: Warn()
+ *
+ * Purpose:  Print an error message and return. The arguments
+ *           are formatted exactly like arguments to printf().
+ *
+ * Return:   (void)
+ */
+/* VARARGS0 */
+void
+Warn(char *format, ...)
+{
+  va_list  argp;
+                                /* write "WARNING: " + the printf-style message + newline to stderr */
+  fprintf(stderr, "WARNING: ");
+  va_start(argp, format);
+  vfprintf(stderr, format, argp);
+  va_end(argp);
+  fprintf(stderr, "\n");
+  fflush(stderr);               /* then return to the caller; unlike Die(), no exit */
+}
+
+/* Function: Panic()
+ *
+ * Purpose:  Die from a lethal error that's not my problem,
+ *           but instead a failure of a StdC/POSIX call that
+ *           shouldn't fail. Prints "PANIC [<file> line <line>] " to
+ *           stderr, then perror()'s text for the current errno.
+ *
+ *           Usually called by the PANIC macro which adds
+ *           the __FILE__ and __LINE__ information; see
+ *           structs.h.
+ *           Inspired by code in Donald Lewine's book, _POSIX Programmer's Guide_.
+ *
+ * Return:   None. Exits the program with EXIT_FAILURE.
+ */
+void
+Panic(char *file, int line)
+{
+  (void) fprintf(stderr, "\nPANIC [%s line %d] ", file, line);
+  (void) perror("Unusual error");
+  exit(EXIT_FAILURE);
+}
+
diff --git a/forester/archive/RIO/others/hmmer/squid/sqfuncs.h b/forester/archive/RIO/others/hmmer/squid/sqfuncs.h
new file mode 100644
index 0000000..4b5ef3c
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/squid/sqfuncs.h
@@ -0,0 +1,293 @@
+/*****************************************************************
+ * HMMER - Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ * This source code is distributed under the terms of the
+ * GNU General Public License. See the files COPYING and LICENSE
+ * for details.
+ *****************************************************************/
+
+#ifndef SQFUNCSH_INCLUDED
+#define SQFUNCSH_INCLUDED
+/* sqfuncs.h
+ *
+ * Prototypes for squid library functions;
+ * also makes a good reference list for what the package contains.
+ *
+ * Warning: squid is a slowly evolving beast. Some functions are
+ * obsolete. Some functions are probably just wrong, dating to
+ * a primordial era before I knew anything about what I was doing.
+ * Some functions are both obsolete and wrong but still necessary
+ * to get legacy code to compile.
+ * + * RCS $Id: sqfuncs.h,v 1.1.1.1 2005/03/22 08:34:30 cmzmasek Exp $ + */ + +/* + * from aligneval.c + */ +extern float ComparePairAlignments(char *known1, char *known2, char *calc1, char *calc2); +extern float CompareRefPairAlignments(int *ref, char *known1, char *known2, char *calc1, char *calc2); +extern float CompareMultAlignments(char **kseqs, char **tseqs, int N); +extern float CompareRefMultAlignments(int *ref, char **kseqs, char **tseqs, int N); +extern float PairwiseIdentity(char *s1, char *s2); +extern float AlignmentIdentityBySampling(char **aseq, int L, int N, int nsample); +extern char *MajorityRuleConsensus(char **aseq, int nseq, int alen); + +/* + * from alignio.c + */ +extern void AllocAlignment(int nseq, int alen, char ***ret_aseq, AINFO *ainfo); +extern void InitAinfo(AINFO *ainfo); +extern void FreeAlignment(char **aseqs, AINFO *ainfo); +extern void SAMizeAlignment(char **aseq, int nseq, int alen); +extern void SAMizeAlignmentByGapFrac(char **aseq, int nseq, int alen, float maxgap); +extern int MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s); +extern int MakeDealignedString(char *aseq, int alen, char *ss, char **ret_s); +extern int DealignedLength(char *aseq); +extern int WritePairwiseAlignment(FILE *ofp, char *aseq1, char *name1, int spos1, + char *aseq2, char *name2, int spos2, + int **pam, int indent); +extern int MingapAlignment(char **aseqs, AINFO *ainfo); +extern int RandomAlignment(char **rseqs, SQINFO *sqinfo, int nseq, float pop, float pex, + char ***ret_aseqs, AINFO *ainfo); +extern void AlignmentHomogenousGapsym(char **aseq, int nseq, int alen, char gapsym); + +/* from cluster.c + */ +extern int Cluster(float **mx, int N, enum clust_strategy mode, struct phylo_s **ret_tree); +extern struct phylo_s *AllocPhylo(int N); +extern void FreePhylo(struct phylo_s *tree, int N); +extern void MakeDiffMx(char **aseqs, int num, float ***ret_dmx); +extern void MakeIdentityMx(char **aseqs, int num, float ***ret_imx); +extern void 
PrintNewHampshireTree(FILE *fp, AINFO *ainfo, struct phylo_s *tree, int N); +extern void PrintPhylo(FILE *fp, AINFO *ainfo, struct phylo_s *tree, int N); + +/* + * from dayhoff.c + */ +extern int ParsePAMFile(FILE *fp, int ***ret_pam, float *ret_scale); +extern void ScalePAM(int **pam, int scale); + + +/* from file.c + */ +extern char *FileDirname(char *filename); +extern char *FileTail(char *file, int noextension); +extern char *FileConcat(char *dir, char *file); +extern char *FileAddSuffix(char *filename, char *sfx); +extern FILE *EnvFileOpen(char *fname, char *env, char **ret_dir); +extern int FileExists(char *filename); + + +/* from getopt.c + */ +extern int Getopt(int argc, char **argv, + struct opt_s *opt, int nopts, char *usage, + int *ret_optind, char **ret_optname, char **ret_optarg); + + +/* from hsregex.c + * Henry Spencer's regex() code + */ +extern int Strparse(char *rexp, char *s, int ntok); +extern void SqdClean(void); +extern sqd_regexp *sqd_regcomp(const char *re); +extern int sqd_regexec(sqd_regexp *rp, const char *s); +extern void sqd_regsub(const sqd_regexp *rp, const char *src, char *dst); +extern void sqd_regerror(char *message); + +/* from interleaved.c + */ +extern int IsInterleavedFormat(int format); +extern int ReadInterleaved(char *seqfile, + int (*skip_header)(FILE *), + int (*parse_header)(FILE *, AINFO *), + int (*is_dataline)(char *, char *), + char ***ret_aseqs, AINFO *ainfo); +extern int ReadAlignment(char *seqfile, int format, char ***ret_aseqs, AINFO *ainfo); + + +/* from revcomp.c + */ +extern char *revcomp(char *comp, char *seq); + +/* + * from selex.c + */ +extern int DealignAseqs(char **aseqs, int num, char ***ret_rseqs); +extern int IsSELEXFormat(char *filename); +extern int TruncateNames(char **names, int N); /* OBSOLETE? 
*/ + +/* + * from seqencode.c + */ +extern int seqcmp(char *s1, char *s2, int allow); +extern int seqncmp(char *s1, char *s2, int n, int allow); +extern int seqencode(char *codeseq,char *str); +extern int coded_revcomp(char *comp, char *seq); +extern int seqdecode(char *str, char *codeseq); +extern int seqndecode(char *str, char *codeseq, int n); + +/* + * from shuffle.c + */ +extern int StrShuffle(char *s1, char *s2); +extern int StrDPShuffle(char *s1, char *s2); +extern int StrMarkov0(char *s1, char *s2); +extern int StrMarkov1(char *s1, char *s2); +extern int StrReverse(char *s1, char *s2); +extern int StrRegionalShuffle(char *s1, char *s2, int w); +extern int AlignmentShuffle(char **ali1, char **ali2, int nseq, int alen); +extern int AlignmentBootstrap(char **ali1, char **ali2, int nseq, int alen); + +/* + * from sqerror.c + */ +extern void Die(char *format, ...); +extern void Warn(char *format, ...); +extern void Panic(char *file, int line); + + +/* + * from sqio.c + */ +extern void FreeSequence(char *seq, SQINFO *sqinfo); +extern int SetSeqinfoString(SQINFO *sqinfo, char *sptr, int flag); +extern void SeqinfoCopy(SQINFO *sq1, SQINFO *sq2); +extern void ToDNA(char *seq); +extern void ToRNA(char *seq); +extern void ToIUPAC(char *seq); +extern int ReadMultipleRseqs(char *seqfile, int fformat, char ***ret_rseqs, + SQINFO **ret_sqinfo, int *ret_num); +extern SQFILE *SeqfileOpen(char *filename, int format, char *env); +extern SQFILE *SeqfileOpenForIndexing(char *filename, int format, char *env, int ssimode); +extern int SeqfileFormat(FILE *fp); +extern void SeqfilePosition(SQFILE *sfp, SSIOFFSET *offset); +extern void SeqfileRewind(SQFILE *sfp); +extern void SeqfileClose(SQFILE *sfp); + +extern int ReadSeq(SQFILE *fp, int format, char **ret_seq, SQINFO *sqinfo); +extern int GCGBinaryToSequence(char *seq, int len); +extern int GCGchecksum(char *seq, int seqlen); +extern int GCGMultchecksum(char **seqs, int nseq); +extern void WriteSimpleFASTA(FILE *fp, char *seq, 
char *name, char *desc); +extern int WriteSeq(FILE *outf, int outfmt, char *seq, SQINFO *sqinfo); +extern int Seqtype(char *seq); +extern int GuessAlignmentSeqtype(char **aseq, int nseq); +extern int String2SeqfileFormat(char *s); +extern char *SeqfileFormat2String(int code); +extern SQINFO *MSAToSqinfo(MSA *msa); + +/* from squidcore.c + */ +extern void Banner(FILE *fp, char *banner); + + +/* from sre_ctype.c + */ +extern int sre_tolower(int c); +extern int sre_toupper(int c); + +/* from sre_math.c + */ +extern float ExponentialRandom(void); +extern float Gaussrandom(float mean, float stddev); +extern int Linefit(float *x, float *y, int N, + float *ret_a, float *ret_b, float *ret_r); +extern void WeightedLinefit(float *x, float *y, float *var, int N, + float *ret_m, float *ret_b); +extern double Gammln(double xx); +extern int DNorm(double *vec, int n); +extern int FNorm(float *vec, int n); +extern void DScale(double *vec, int n, double scale); +extern void FScale(float *vec, int n, float scale); +extern void DSet(double *vec, int n, double value); +extern void FSet(float *vec, int n, float value); +extern double DSum(double *vec, int n); +extern float FSum(float *vec, int n); +extern void DAdd(double *vec1, double *vec2, int n); +extern void FAdd(float *vec1, float *vec2, int n); +extern void DCopy(double *vec1, double *vec2, int n); +extern void FCopy(float *vec1, float *vec2, int n); +extern int DMax(double *vec, int n); +extern int FMax(float *vec, int n); +extern double DDot(double *vec1, double *vec2, int n); +extern float FDot(float *vec1, float *vec2, int n); +extern float **FMX2Alloc(int rows, int cols); +extern void FMX2Free(float **mx); +extern double **DMX2Alloc(int rows, int cols); +extern void DMX2Free(double **mx); +extern void FMX2Multiply(float **A, float **B, float **C, int m, int p, int n); +extern float sre_random(void); +extern void sre_srandom(int seed); +extern int DChoose(double *p, int n); +extern int FChoose(float *p, int n); +extern 
double DLogSum(double *logp, int n); +extern float FLogSum(float *logp, int n); +extern double IncompleteGamma(double a, double x); + +/* from sre_string.c + */ +#ifdef NOSTR +extern char *strstr(char *s, char *subs); +#endif +extern char *Strdup(char *s); +extern void StringChop(char *s); +extern int Strinsert(char *s1, char c, int pos); +extern int Strdelete(char *s1, int pos); +extern void s2lower(char *s); +extern void s2upper(char *s); +extern void *sre_malloc(char *file, int line, size_t size); +extern void *sre_realloc(char *file, int line, void *p, size_t size); +extern void Free2DArray(void **p, int dim1); +extern void Free3DArray(void ***p, int dim1, int dim2); +extern char *RandomSequence(char *alphabet, float *p, int n, int len); +extern char *sre_fgets(char **buf, int *n, FILE *fp); +extern int sre_strcat(char **dest, int ldest, char *src, int lsrc); +extern char *sre_strtok(char **s, char *delim, int *len); +extern char *sre_strdup(char *s, int n); +extern char *sre_strncat(char *s1, char *s2, int n); +extern int IsBlankline(char *s); + +/* from stack.c + */ +extern struct intstack_s *InitIntStack(void); +extern void PushIntStack(struct intstack_s *stack, int data); +extern int PopIntStack(struct intstack_s *stack, int *ret_data); +extern void ReverseIntStack(struct intstack_s *stack); +extern int FreeIntStack( struct intstack_s *stack ); + +/* + * from translate.c + */ +extern char *Translate(char *seq, char **code); + +/* + * from types.c + */ +extern int IsInt(char *s); +extern int IsReal(char *s); +extern void Byteswap(char *swap, int nbytes); +#ifndef USE_HOST_BYTESWAP_FUNCTIONS +extern sqd_uint16 sre_ntoh16(sqd_uint16 netshort); +extern sqd_uint32 sre_ntoh32(sqd_uint32 netlong); +extern sqd_uint16 sre_hton16(sqd_uint16 hostshort); +extern sqd_uint32 sre_hton32(sqd_uint32 hostlong); +#endif /*!USE_HOST_BYTESWAP_FUNCTIONS*/ +extern sqd_uint64 sre_ntoh64(sqd_uint64 net_int64); +extern sqd_uint64 sre_hton64(sqd_uint64 host_int64); + +/* + * from 
weight.c + */ +extern void GSCWeights(char **aseq, int nseq, int alen, float *wgt); +extern void VoronoiWeights(char **aseq, int nseq, int alen, float *wgt); +extern void BlosumWeights(char **aseq, int nseq, int alen, float blosumlevel, float *wgt); +extern void PositionBasedWeights(char **aseq, int nseq, int alen, float *wgt); +extern void FilterAlignment(MSA *msa, float cutoff, MSA **ret_new); +extern void SampleAlignment(MSA *msa, int sample, MSA **ret_new); +extern void SingleLinkCluster(char **aseq, int nseq, int alen, float maxid, + int **ret_c, int *ret_nc); +#endif /* SQFUNCSH_INCLUDED */ diff --git a/forester/archive/RIO/others/hmmer/squid/sqio.c b/forester/archive/RIO/others/hmmer/squid/sqio.c new file mode 100644 index 0000000..4192f59 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/sqio.c @@ -0,0 +1,1901 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* File: sqio.c + * From: ureadseq.c in Don Gilbert's sequence i/o package + * + * Reads and writes nucleic/protein sequence in various + * formats. Data files may have multiple sequences. + * + * Heavily modified from READSEQ package + * Copyright (C) 1990 by D.G. Gilbert + * Biology Dept., Indiana University, Bloomington, IN 47405 + * email: gilbertd@bio.indiana.edu + * Thanks Don! + * + * SRE: Modifications as noted. 
Fri Jul 3 09:44:54 1992
+ * Packaged for squid, Thu Oct 1 10:07:11 1992
+ * ANSI conversion in full swing, Mon Jul 12 12:22:21 1993
+ *
+ * CVS $Id: sqio.c,v 1.1.1.1 2005/03/22 08:34:29 cmzmasek Exp $
+ *
+ *****************************************************************
+ * Basic API for single sequence reading:
+ *
+ * SQFILE *sqfp;
+ * char   *seqfile;
+ * int     format;        - see squid.h for formats; example: SQFILE_FASTA
+ * char   *seq;
+ * SQINFO  sqinfo;        - a struct, not a pointer: its address is passed below
+ *
+ * if ((sqfp = SeqfileOpen(seqfile, format, "BLASTDB")) == NULL)
+ *   Die("Failed to open sequence database file %s\n%s\n", seqfile, usage);
+ * while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo)) {
+ *   do_stuff;
+ *   FreeSequence(seq, &sqinfo);
+ * }
+ * SeqfileClose(sqfp);
+ *
+ *****************************************************************
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#ifndef SEEK_SET
+#include <unistd.h>
+#endif
+
+#include "squid.h"
+#include "msa.h"
+#include "ssi.h"
+
+static void SeqfileGetLine(SQFILE *V);
+
+#define kStartLength 500
+
+static char *aminos   = "ABCDEFGHIKLMNPQRSTVWXYZ*";
+static char *primenuc = "ACGTUN";
+static char *protonly = "EFIPQZ";
+
+static SQFILE *seqfile_open(char *filename, int format, char *env, int ssimode);
+
+/* Function: SeqfileOpen(), SeqfileOpenForIndexing()
+ *
+ * Purpose : Open a sequence database file and prepare for reading
+ *           sequentially. Both are thin wrappers around seqfile_open().
+ *
+ * Args:     filename - name of file to open; "-" means stdin, "*.gz" is read through gzip
+ *           format   - format of file, or SQFILE_UNKNOWN to autodetect
+ *           env      - environment variable for path (e.g. BLASTDB)
+ *           ssimode  - SeqfileOpenForIndexing() only: SSI_OFFSET_I32 or SSI_OFFSET_I64 (SeqfileOpen() passes -1)
+ *
+ * Returns opened SQFILE ptr, or NULL if the file can't be opened. Calls Die() on an undeterminable format or a missing .gz file.
+ */
+SQFILE *
+SeqfileOpen(char *filename, int format, char *env)
+{
+  return seqfile_open(filename, format, env, -1);
+}
+SQFILE *
+SeqfileOpenForIndexing(char *filename, int format, char *env, int ssimode)
+{
+  return seqfile_open(filename, format, env, ssimode);
+}
+static SQFILE *
+seqfile_open(char *filename, int format, char *env, int ssimode)
+{
+  SQFILE *dbfp;
+
+  dbfp = (SQFILE *) MallocOrDie (sizeof(SQFILE));
+
+  dbfp->ssimode  = ssimode;
+  dbfp->rpl      = -1;          /* flag meaning "unset" */
+  dbfp->lastrpl  = 0;
+  dbfp->maxrpl   = 0;
+  dbfp->bpl      = -1;          /* flag meaning "unset" */
+  dbfp->lastbpl  = 0;
+  dbfp->maxbpl   = 0;
+
+  /* Open our file handle.
+   * Three possibilities:
+   *    1. normal file open
+   *    2. filename = "-";    read from stdin
+   *    3. filename = "*.gz"; read thru pipe from gzip
+   * If we're reading from stdin or a pipe, we can't reliably
+   * back up, so we can't do two-pass parsers like the interleaved alignment
+   * formats.
+   */
+  if (strcmp(filename, "-") == 0)
+    {
+      dbfp->f         = stdin;
+      dbfp->do_stdin  = TRUE;
+      dbfp->do_gzip   = FALSE;
+      dbfp->fname     = sre_strdup("[STDIN]", -1);
+    }
+#ifndef SRE_STRICT_ANSI
+  /* popen(), pclose() aren't portable to non-POSIX systems; disable */
+  else if (Strparse("^.*\\.gz$", filename, 0))
+    {
+      char cmd[256];
+
+      /* Note that popen() will return "successfully"
+       * if file doesn't exist, because gzip works fine
+       * and prints an error! So we have to check for
+       * existence of file ourself.
+       */
+      if (! FileExists(filename))
+        Die("%s: file does not exist", filename);
+
+      if (strlen(filename) + strlen("gzip -dc ") >= 256)
+        Die("filename > 255 char in SeqfileOpen()");
+      sprintf(cmd, "gzip -dc %s", filename);  /* NOTE(review): filename reaches the shell unquoted (spaces, metacharacters) */
+      if ((dbfp->f = popen(cmd, "r")) == NULL)
+        { free(dbfp); return NULL; }          /* don't leak the half-built SQFILE */
+
+      dbfp->do_stdin = FALSE;
+      dbfp->do_gzip  = TRUE;
+      dbfp->fname    = sre_strdup(filename, -1);
+    }
+#endif /*SRE_STRICT_ANSI*/
+  else
+    {
+      if ((dbfp->f = fopen(filename, "r")) == NULL &&
+          (dbfp->f = EnvFileOpen(filename, env, NULL)) == NULL)
+        { free(dbfp); return NULL; }          /* don't leak the half-built SQFILE */
+
+      dbfp->do_stdin = FALSE;
+      dbfp->do_gzip  = FALSE;
+      dbfp->fname    = sre_strdup(filename, -1);
+    }
+
+
+  /* Invoke autodetection if we haven't already been told what
+   * to expect.
+   */
+  if (format == SQFILE_UNKNOWN)
+    {
+      if (dbfp->do_stdin == TRUE || dbfp->do_gzip)
+        Die("Can't autodetect sequence file format from a stdin or gzip pipe");
+      format = SeqfileFormat(dbfp->f);
+      if (format == SQFILE_UNKNOWN)
+        Die("Can't determine format of sequence file %s", dbfp->fname);
+    }
+
+  /* The hack for sequential access of an interleaved alignment file:
+   * read the alignment in, we'll copy sequences out one at a time.
+   */
+  dbfp->msa        = NULL;
+  dbfp->afp        = NULL;
+  dbfp->format     = format;
+  dbfp->linenumber = 0;
+  dbfp->buf        = NULL;
+  dbfp->buflen     = 0;
+  if (IsAlignmentFormat(format))
+    {
+      /* We'll be reading from the MSA interface. Copy our data
+       * to the MSA afp's structure.
+       */
+      dbfp->afp           = MallocOrDie(sizeof(MSAFILE));
+      dbfp->afp->f          = dbfp->f;            /* just a ptr, don't close */
+      dbfp->afp->do_stdin   = dbfp->do_stdin;
+      dbfp->afp->do_gzip    = dbfp->do_gzip;
+      dbfp->afp->fname      = dbfp->fname;        /* just a ptr, don't free */
+      dbfp->afp->format     = dbfp->format;       /* e.g. format */
+      dbfp->afp->linenumber = dbfp->linenumber;   /* e.g. 0 */
+      dbfp->afp->buf      = NULL;
+      dbfp->afp->buflen   = 0;
+
+      if ((dbfp->msa = MSAFileRead(dbfp->afp)) == NULL)
+        Die("Failed to read any alignment data from file %s", dbfp->fname);
+                                /* hack: overload/reuse msa->lastidx; indicates
+                                   next seq to return upon a ReadSeq() call */
+      dbfp->msa->lastidx = 0;
+
+      return dbfp;
+    }
+
+  /* Load the first line.
+   */
+  SeqfileGetLine(dbfp);
+  return dbfp;
+}
+
+/* Function: SeqfilePosition()
+ *
+ * Purpose:  Move to a particular offset in a seqfile.
+ *           Will not work on alignment files; dies on those and on stdin/gzip input.
+ */
+void
+SeqfilePosition(SQFILE *sqfp, SSIOFFSET *offset)
+{
+  if (sqfp->do_stdin || sqfp->do_gzip || IsAlignmentFormat(sqfp->format))
+    Die("SeqfilePosition() failed: in a nonrewindable data file or stream");
+
+  if (SSISetFilePosition(sqfp->f, offset) != 0)
+    Die("SSISetFilePosition failed, but that shouldn't happen.");
+  SeqfileGetLine(sqfp);         /* reload the line buffer from the new position */
+}
+
+
+/* Function: SeqfileRewind()
+ *
+ * Purpose:  Set a sequence file back to the first sequence.
+ *
+ *           Won't work on alignment files. Although it would
+ *           seem that it could (just set msa->lastidx back to 0),
+ *           that'll fail on "multiple multiple" alignment file formats
+ *           (e.g. Stockholm). Note: unlike SeqfilePosition(), this is not checked here.
+ */
+void
+SeqfileRewind(SQFILE *sqfp)
+{
+  if (sqfp->do_stdin || sqfp->do_gzip)
+    Die("SeqfileRewind() failed: in a nonrewindable data file or stream");
+
+  rewind(sqfp->f);
+  SeqfileGetLine(sqfp);         /* reload the first line */
+}
+
+/* Function: SeqfileLineParameters()
+ * Date:     SRE, Thu Feb 15 17:00:41 2001 [St. Louis]
+ *
+ * Purpose:  After all the sequences have been read from the file,
+ *           but before closing it, retrieve overall bytes-per-line and
+ *           residues-per-line info. If non-zero, these mean that
+ *           the file contains homogeneous sequence line lengths (except
+ *           the last line in each record).
+ *
+ *           If either of bpl or rpl is determined to be inhomogeneous,
+ *           both are returned as 0.
+ * + * Args: *sqfp - an open but fully read sequence file + * ret_bpl - RETURN: bytes per line, or 0 if inhomogeneous + * ret_rpl - RETURN: residues per line, or 0 if inhomogenous. + * + * Returns: void + */ +void +SeqfileLineParameters(SQFILE *V, int *ret_bpl, int *ret_rpl) +{ + if (V->rpl > 0 && V->maxrpl == V->rpl && + V->bpl > 0 && V->maxbpl == V->bpl) { + *ret_bpl = V->bpl; + *ret_rpl = V->rpl; + } else { + *ret_bpl = 0; + *ret_rpl = 0; + } +} + + +void +SeqfileClose(SQFILE *sqfp) +{ + /* note: don't test for sqfp->msa being NULL. Now that + * we're holding afp open and allowing access to multi-MSA + * databases (e.g. Stockholm format, Pfam), msa ends + * up being NULL when we run out of alignments. + */ + if (sqfp->afp != NULL) { + if (sqfp->msa != NULL) MSAFree(sqfp->msa); + if (sqfp->afp->buf != NULL) free(sqfp->afp->buf); + free(sqfp->afp); + } +#ifndef SRE_STRICT_ANSI /* gunzip functionality only on POSIX systems */ + if (sqfp->do_gzip) pclose(sqfp->f); +#endif + else if (! sqfp->do_stdin) fclose(sqfp->f); + if (sqfp->buf != NULL) free(sqfp->buf); + if (sqfp->fname != NULL) free(sqfp->fname); + free(sqfp); +} + + +/* Function: SeqfileGetLine() + * Date: SRE, Tue Jun 22 09:15:49 1999 [Sanger Centre] + * + * Purpose: read a line from a sequence file into V->buf + * If the fgets() is NULL, sets V->buf[0] to '\0'. + * + * Args: V + * + * Returns: void + */ +static void +SeqfileGetLine(SQFILE *V) +{ + if (V->ssimode >= 0) + if (0 != SSIGetFilePosition(V->f, V->ssimode, &(V->ssioffset))) + Die("SSIGetFilePosition() failed"); + if (sre_fgets(&(V->buf), &(V->buflen), V->f) == NULL) + *(V->buf) = '\0'; + V->linenumber++; +} + + +void +FreeSequence(char *seq, SQINFO *sqinfo) +{ + if (seq != NULL) free(seq); + if (sqinfo->flags & SQINFO_SS) free(sqinfo->ss); + if (sqinfo->flags & SQINFO_SA) free(sqinfo->sa); +} + +int +SetSeqinfoString(SQINFO *sqinfo, char *sptr, int flag) +{ + int len; + int pos; + + /* silently ignore NULL. 
*/ + if (sptr == NULL) return 1; + + while (*sptr == ' ') sptr++; /* ignore leading whitespace */ + for (pos = strlen(sptr)-1; pos >= 0; pos--) + if (! isspace((int) sptr[pos])) break; + sptr[pos+1] = '\0'; /* ignore trailing whitespace */ + + switch (flag) { + case SQINFO_NAME: + if (*sptr != '-') + { + strncpy(sqinfo->name, sptr, SQINFO_NAMELEN-1); + sqinfo->name[SQINFO_NAMELEN-1] = '\0'; + sqinfo->flags |= SQINFO_NAME; + } + break; + + case SQINFO_ID: + if (*sptr != '-') + { + strncpy(sqinfo->id, sptr, SQINFO_NAMELEN-1); + sqinfo->id[SQINFO_NAMELEN-1] = '\0'; + sqinfo->flags |= SQINFO_ID; + } + break; + + case SQINFO_ACC: + if (*sptr != '-') + { + strncpy(sqinfo->acc, sptr, SQINFO_NAMELEN-1); + sqinfo->acc[SQINFO_NAMELEN-1] = '\0'; + sqinfo->flags |= SQINFO_ACC; + } + break; + + case SQINFO_DESC: + if (*sptr != '-') + { + if (sqinfo->flags & SQINFO_DESC) /* append? */ + { + len = strlen(sqinfo->desc); + if (len < SQINFO_DESCLEN-2) /* is there room? */ + { + strncat(sqinfo->desc, " ", SQINFO_DESCLEN-1-len); len++; + strncat(sqinfo->desc, sptr, SQINFO_DESCLEN-1-len); + } + } + else /* else copy */ + strncpy(sqinfo->desc, sptr, SQINFO_DESCLEN-1); + sqinfo->desc[SQINFO_DESCLEN-1] = '\0'; + sqinfo->flags |= SQINFO_DESC; + } + break; + + case SQINFO_START: + if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; } + sqinfo->start = atoi(sptr); + if (sqinfo->start != 0) sqinfo->flags |= SQINFO_START; + break; + + case SQINFO_STOP: + if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; } + sqinfo->stop = atoi(sptr); + if (sqinfo->stop != 0) sqinfo->flags |= SQINFO_STOP; + break; + + case SQINFO_OLEN: + if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; } + sqinfo->olen = atoi(sptr); + if (sqinfo->olen != 0) sqinfo->flags |= SQINFO_OLEN; + break; + + default: + Die("Invalid flag %d to SetSeqinfoString()", flag); + } + return 1; +} + +void +SeqinfoCopy(SQINFO *sq1, SQINFO *sq2) +{ + sq1->flags = sq2->flags; + if (sq2->flags & SQINFO_NAME) strcpy(sq1->name, 
sq2->name); + if (sq2->flags & SQINFO_ID) strcpy(sq1->id, sq2->id); + if (sq2->flags & SQINFO_ACC) strcpy(sq1->acc, sq2->acc); + if (sq2->flags & SQINFO_DESC) strcpy(sq1->desc, sq2->desc); + if (sq2->flags & SQINFO_LEN) sq1->len = sq2->len; + if (sq2->flags & SQINFO_START) sq1->start = sq2->start; + if (sq2->flags & SQINFO_STOP) sq1->stop = sq2->stop; + if (sq2->flags & SQINFO_OLEN) sq1->olen = sq2->olen; + if (sq2->flags & SQINFO_TYPE) sq1->type = sq2->type; + if (sq2->flags & SQINFO_SS) sq1->ss = Strdup(sq2->ss); + if (sq2->flags & SQINFO_SA) sq1->sa = Strdup(sq2->sa); +} + +/* Function: ToDNA() + * + * Purpose: Convert a sequence to DNA. + * U --> T + */ +void +ToDNA(char *seq) +{ + for (; *seq != '\0'; seq++) + { + if (*seq == 'U') *seq = 'T'; + else if (*seq == 'u') *seq = 't'; + } +} + +/* Function: ToRNA() + * + * Purpose: Convert a sequence to RNA. + * T --> U + */ +void +ToRNA(char *seq) +{ + for (; *seq != '\0'; seq++) + { + if (*seq == 'T') *seq = 'U'; + else if (*seq == 't') *seq = 'u'; + } +} + + +/* Function: ToIUPAC() + * + * Purpose: Convert X's, o's, other junk in a nucleic acid sequence to N's, + * to comply with IUPAC code. Does allow gap characters + * though, so we can call ToIUPAC() on aligned seqs. + * + * WU-BLAST's pressdb will + * choke on X's, for instance, necessitating conversion + * of certain genome centers' data. + */ +void +ToIUPAC(char *seq) +{ + for (; *seq != '\0'; seq++) + if (strchr(NUCLEOTIDES, *seq) == NULL && ! isgap(*seq)) *seq = 'N'; +} + + +/* Function: addseq() + * + * Purpose: Add a line of sequence to the growing string in V. + * Skip all nonalphabetic characters in the input string: + * in particular, spaces and digits (coordinates). This + * allows us to generically read sequence data from most + * any format. 
+ */ +static void +addseq(char *s, struct ReadSeqVars *V) +{ + char *s0; + char *sq; + int rpl; /* valid residues per line */ + int bpl; /* characters per line */ + + if (V->ssimode == -1) + { /* Normal mode: keeping the seq */ + /* Make sure we have enough room. We know that s is <= buflen, + * so just make sure we've got room for a whole new buflen worth + * of sequence. + */ + if (V->seqlen + V->buflen > V->maxseq) { + V->maxseq += MAX(V->buflen, kStartLength); + V->seq = ReallocOrDie (V->seq, V->maxseq+1); + } + + s0 = s; + sq = V->seq + V->seqlen; + while (*s != 0) { + if (isalpha((int) *s)) { + *sq = *s; + sq++; + } + s++; + } + V->seqlen = sq - V->seq; + } + else /* else: indexing mode, discard the seq */ + { + s0 = s; + rpl = 0; + while (*s != 0) { + if (isalpha((int) *s)) rpl++; + s++; + } + V->seqlen += rpl; + bpl = s - s0; + + /* Keep track of the global rpl, bpl for the file. + * This is overly complicated because we have to + * allow the last line of each record (e.g. the last addseq() call + * on each sequence) to have a different length - and sometimes + * we'll have one-line sequence records, too. Thus we only + * do something with the global V->rpl when we have *passed over* + * a line - we keep the last line's rpl in last_rpl. And because + * a file might consist entirely of single-line records, we keep + * a third guy, maxrpl, that tells us the maximum rpl of any line + * in the file. If we reach the end of file and rpl is still unset, + * we'll set it to maxrpl. If we reach eof and rpl is set, but is + * less than maxrpl, that's a weird case where a last line in some + * record is longer than every other line. 
+ */ + if (V->rpl != 0) { /* 0 means we already know rpl is invalid */ + if (V->lastrpl > 0) { /* we're on something that's not the first line */ + if (V->rpl > 0 && V->lastrpl != V->rpl) V->rpl = 0; + else if (V->rpl == -1) V->rpl = V->lastrpl; + } + V->lastrpl = rpl; + if (rpl > V->maxrpl) V->maxrpl = rpl; /* make sure we check max length of final lines */ + } + if (V->bpl != 0) { /* 0 means we already know bpl is invalid */ + if (V->lastbpl > 0) { /* we're on something that's not the first line */ + if (V->bpl > 0 && V->lastbpl != V->bpl) V->bpl = 0; + else if (V->bpl == -1) V->bpl = V->lastbpl; + } + V->lastbpl = bpl; + if (bpl > V->maxbpl) V->maxbpl = bpl; /* make sure we check max length of final lines */ + } + } /* end of indexing mode of addseq(). */ + +} + +static void +readLoop(int addfirst, int (*endTest)(char *,int *), struct ReadSeqVars *V) +{ + int addend = 0; + int done = 0; + + V->seqlen = 0; + V->lastrpl = V->lastbpl = 0; + if (addfirst) { + if (V->ssimode >= 0) V->d_off = V->ssioffset; + addseq(V->buf, V); + } else if (V->ssimode >= 0) + if (0 != SSIGetFilePosition(V->f, V->ssimode, &(V->d_off))) + Die("SSIGetFilePosition() failed"); + + do { + SeqfileGetLine(V); + /* feof() alone is a bug; files not necessarily \n terminated */ + if (*(V->buf) == '\0' && feof(V->f)) + done = TRUE; + done |= (*endTest)(V->buf, &addend); + if (addend || !done) + addseq(V->buf, V); + } while (!done); +} + + +static int +endPIR(char *s, int *addend) +{ + *addend = 0; + if ((strncmp(s, "///", 3) == 0) || + (strncmp(s, "ENTRY", 5) == 0)) + return 1; + else + return 0; +} + +static void +readPIR(struct ReadSeqVars *V) +{ + char *sptr; + /* load first line of entry */ + while (!feof(V->f) && strncmp(V->buf, "ENTRY", 5) != 0) { + SeqfileGetLine(V); + } + if (feof(V->f)) return; + if (V->ssimode >= 0) V->r_off = V->ssioffset; + + if ((sptr = strtok(V->buf + 15, "\n\t ")) != NULL) + { + SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); + SetSeqinfoString(V->sqinfo, sptr, 
SQINFO_ID); + } + do { + SeqfileGetLine(V); + if (!feof(V->f) && strncmp(V->buf, "TITLE", 5) == 0) + SetSeqinfoString(V->sqinfo, V->buf+15, SQINFO_DESC); + else if (!feof(V->f) && strncmp(V->buf, "ACCESSION", 9) == 0) + { + if ((sptr = strtok(V->buf+15, " \t\n")) != NULL) + SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); + } + } while (! feof(V->f) && (strncmp(V->buf,"SEQUENCE", 8) != 0)); + SeqfileGetLine(V); /* skip next line, coords */ + + readLoop(0, endPIR, V); + + /* reading a real PIR-CODATA database file, we keep the source coords + */ + V->sqinfo->start = 1; + V->sqinfo->stop = V->seqlen; + V->sqinfo->olen = V->seqlen; + V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; + + /* get next line + */ + while (!feof(V->f) && strncmp(V->buf, "ENTRY", 5) != 0) { + SeqfileGetLine(V); + } +} + + + +static int +endIG(char *s, int *addend) +{ + *addend = 1; /* 1 or 2 occur in line w/ bases */ + return((strchr(s,'1')!=NULL) || (strchr(s,'2')!=NULL)); +} + +static void +readIG(struct ReadSeqVars *V) +{ + char *nm; + /* position past ';' comments */ + do { + SeqfileGetLine(V); + } while (! (feof(V->f) || ((*V->buf != 0) && (*V->buf != ';')) )); + + if (!feof(V->f)) + { + if ((nm = strtok(V->buf, "\n\t ")) != NULL) + SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME); + + readLoop(0, endIG, V); + } + + while (!(feof(V->f) || ((*V->buf != '\0') && (*V->buf == ';')))) + SeqfileGetLine(V); +} + +static int +endStrider(char *s, int *addend) +{ + *addend = 0; + return (strstr( s, "//") != NULL); +} + +static void +readStrider(struct ReadSeqVars *V) +{ + char *nm; + + while ((!feof(V->f)) && (*V->buf == ';')) + { + if (strncmp(V->buf,"; DNA sequence", 14) == 0) + { + if ((nm = strtok(V->buf+16, ",\n\t ")) != NULL) + SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME); + } + SeqfileGetLine(V); + } + + if (! 
feof(V->f)) + readLoop(1, endStrider, V); + + /* load next line + */ + while ((!feof(V->f)) && (*V->buf != ';')) + SeqfileGetLine(V); +} + + +static int +endGB(char *s, int *addend) +{ + *addend = 0; + return ((strstr(s,"//") != NULL) || (strstr(s,"LOCUS") == s)); +} + +static void +readGenBank(struct ReadSeqVars *V) +{ + char *sptr; + int in_definition; + + while (strncmp(V->buf, "LOCUS", 5) != 0) { + SeqfileGetLine(V); + } + if (V->ssimode >= 0) V->r_off = V->ssioffset; + + if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL) + { + SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); + SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); + } + + in_definition = FALSE; + while (! feof(V->f)) + { + SeqfileGetLine(V); + if (! feof(V->f) && strstr(V->buf, "DEFINITION") == V->buf) + { + if ((sptr = strtok(V->buf+12, "\n")) != NULL) + SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); + in_definition = TRUE; + } + else if (! feof(V->f) && strstr(V->buf, "ACCESSION") == V->buf) + { + if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL) + SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); + in_definition = FALSE; + } + else if (strncmp(V->buf,"ORIGIN", 6) != 0) + { + if (in_definition) + SetSeqinfoString(V->sqinfo, V->buf, SQINFO_DESC); + } + else + break; + } + + readLoop(0, endGB, V); + + /* reading a real GenBank database file, we keep the source coords + */ + V->sqinfo->start = 1; + V->sqinfo->stop = V->seqlen; + V->sqinfo->olen = V->seqlen; + V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; + + + while (!(feof(V->f) || ((*V->buf!=0) && (strstr(V->buf,"LOCUS") == V->buf)))) + SeqfileGetLine(V); + /* SRE: V->s now holds "//", so sequential + reads are wedged: fixed Tue Jul 13 1993 */ + while (!feof(V->f) && strstr(V->buf, "LOCUS ") != V->buf) + SeqfileGetLine(V); +} + +static int +endGCGdata(char *s, int *addend) +{ + *addend = 0; + return (*s == '>'); +} + +static void +readGCGdata(struct ReadSeqVars *V) +{ + int binary = FALSE; /* whether data are binary or not */ + int blen = 0; 
/* length of binary sequence */ + + /* first line contains ">>>>" followed by name */ + if (Strparse(">>>>([^ ]+) .+2BIT +Len: ([0-9]+)", V->buf, 2)) + { + binary = TRUE; + SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME); + blen = atoi(sqd_parse[2]); + } + else if (Strparse(">>>>([^ ]+) .+ASCII +Len: [0-9]+", V->buf, 1)) + SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME); + else + Die("bogus GCGdata format? %s", V->buf); + + /* second line contains free text description */ + SeqfileGetLine(V); + SetSeqinfoString(V->sqinfo, V->buf, SQINFO_DESC); + + if (binary) { + /* allocate for blen characters +3... (allow for 3 bytes of slop) */ + if (blen >= V->maxseq) { + V->maxseq = blen; + if ((V->seq = (char *) realloc (V->seq, sizeof(char)*(V->maxseq+4)))==NULL) + Die("malloc failed"); + } + /* read (blen+3)/4 bytes from file */ + if (fread(V->seq, sizeof(char), (blen+3)/4, V->f) < (size_t) ((blen+3)/4)) + Die("fread failed"); + V->seqlen = blen; + /* convert binary code to seq */ + GCGBinaryToSequence(V->seq, blen); + } + else readLoop(0, endGCGdata, V); + + while (!(feof(V->f) || ((*V->buf != 0) && (*V->buf == '>')))) + SeqfileGetLine(V); +} + +static int +endPearson(char *s, int *addend) +{ + *addend = 0; + return(*s == '>'); +} + +static void +readPearson(struct ReadSeqVars *V) +{ + char *sptr; + + if (V->ssimode >= 0) V->r_off = V->ssioffset; + + if (*V->buf != '>') + Die("\ +File %s does not appear to be in FASTA format at line %d.\n\ +You may want to invoke the Babelfish to autodetect your file's format.\n\ +Usually this is done with a -B option.\n", + V->fname, V->linenumber); + + if ((sptr = strtok(V->buf+1, "\n\t ")) != NULL) + SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); + if ((sptr = strtok(NULL, "\n")) != NULL) + SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); + + readLoop(0, endPearson, V); + + while (!(feof(V->f) || ((*V->buf != 0) && (*V->buf == '>')))) { + SeqfileGetLine(V); + } +} + + +static int +endEMBL(char *s, int *addend) +{ + *addend 
= 0; + /* Some people (Berlin 5S rRNA database, f'r instance) use + * an extended EMBL format that attaches extra data after + * the sequence -- watch out for that. We use the fact that + * real EMBL sequence lines begin with five spaces. + * + * We can use this as the sole end test because readEMBL() will + * advance to the next ID line before starting to read again. + */ + return (strncmp(s," ",5) != 0); +/* return ((strstr(s,"//") != NULL) || (strstr(s,"ID ") == s)); */ +} + +static void +readEMBL(struct ReadSeqVars *V) +{ + char *sptr; + + /* make sure we have first line */ + while (!feof(V->f) && strncmp(V->buf, "ID ", 4) != 0) { + SeqfileGetLine(V); + } + if (V->ssimode >= 0) V->r_off = V->ssioffset; + + if ((sptr = strtok(V->buf+5, "\n\t ")) != NULL) + { + SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); + SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID); + } + + do { + SeqfileGetLine(V); + if (!feof(V->f) && strstr(V->buf, "AC ") == V->buf) + { + if ((sptr = strtok(V->buf+5, "; \t\n")) != NULL) + SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC); + } + else if (!feof(V->f) && strstr(V->buf, "DE ") == V->buf) + { + if ((sptr = strtok(V->buf+5, "\n")) != NULL) + SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); + } + } while (! 
feof(V->f) && strncmp(V->buf,"SQ",2) != 0); + + readLoop(0, endEMBL, V); + + /* Hack for Staden experiment files: convert - to N + */ + if (V->ssimode == -1) /* if we're in ssi mode, we're not keeping the seq */ + for (sptr = V->seq; *sptr != '\0'; sptr++) + if (*sptr == '-') *sptr = 'N'; + + /* reading a real EMBL database file, we keep the source coords + */ + V->sqinfo->start = 1; + V->sqinfo->stop = V->seqlen; + V->sqinfo->olen = V->seqlen; + V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN; + + /* load next record's ID line */ + while (!feof(V->f) && strncmp(V->buf, "ID ", 4) != 0) { + SeqfileGetLine(V); + } + +} + + +static int +endZuker(char *s, int *addend) +{ + *addend = 0; + return( *s == '(' ); +} + +static void +readZuker(struct ReadSeqVars *V) +{ + char *sptr; + + SeqfileGetLine(V); /*s == "seqLen seqid string..."*/ + + if ((sptr = strtok(V->buf+6, " \t\n")) != NULL) + SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); + + if ((sptr = strtok(NULL, "\n")) != NULL) + SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC); + + readLoop(0, endZuker, V); + + while (!(feof(V->f) | ((*V->buf != '\0') & (*V->buf == '(')))) + SeqfileGetLine(V); +} + +static void +readUWGCG(struct ReadSeqVars *V) +{ + char *si; + char *sptr; + int done; + + V->seqlen = 0; + + /*writeseq: " %s Length: %d (today) Check: %d ..\n" */ + /*drop above or ".." from id*/ + if ((si = strstr(V->buf," Length: ")) != NULL) *si = 0; + else if ((si = strstr(V->buf,"..")) != NULL) *si = 0; + + if ((sptr = strtok(V->buf, "\n\t ")) != NULL) + SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME); + + do { + done = feof(V->f); + SeqfileGetLine(V); + if (! done) addseq(V->buf, V); + } while (!done); +} + + +/* Function: ReadSeq() + * + * Purpose: Read next sequence from an open database file. + * Return the sequence and associated info. + * + * Args: fp - open sequence database file pointer + * format - format of the file (previously determined + * by call to SeqfileFormat()). 
+ * Currently unused, since we carry it in V. + * ret_seq - RETURN: sequence + * sqinfo - RETURN: filled in w/ other information + * + * Limitations: uses squid_errno, so it's not threadsafe. + * + * Return: 1 on success, 0 on failure. + * ret_seq and some field of sqinfo are allocated here, + * The preferred call mechanism to properly free the memory is: + * + * SQINFO sqinfo; + * char *seq; + * + * ReadSeq(fp, format, &seq, &sqinfo); + * ... do something... + * FreeSequence(seq, &sqinfo); + */ +int +ReadSeq(SQFILE *V, int format, char **ret_seq, SQINFO *sqinfo) +{ + int gotuw; + + squid_errno = SQERR_OK; + + /* Here's the hack for sequential access of sequences from + * the multiple sequence alignment formats + */ + if (IsAlignmentFormat(V->format)) + { + if (V->msa->lastidx >= V->msa->nseq) + { /* out of data. try to read another alignment */ + MSAFree(V->msa); + if ((V->msa = MSAFileRead(V->afp)) == NULL) + return 0; + V->msa->lastidx = 0; + } + /* copy and dealign the appropriate aligned seq */ + MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen, + V->msa->aseq[V->msa->lastidx], &(V->seq)); + V->seqlen = strlen(V->seq); + + /* Extract sqinfo stuff for this sequence from the msa. + * Tedious; code that should be cleaned. 
+ */ + sqinfo->flags = 0; + if (V->msa->sqname[V->msa->lastidx] != NULL) + SetSeqinfoString(sqinfo, V->msa->sqname[V->msa->lastidx], SQINFO_NAME); + if (V->msa->sqacc != NULL && V->msa->sqacc[V->msa->lastidx] != NULL) + SetSeqinfoString(sqinfo, V->msa->sqacc[V->msa->lastidx], SQINFO_ACC); + if (V->msa->sqdesc != NULL && V->msa->sqdesc[V->msa->lastidx] != NULL) + SetSeqinfoString(sqinfo, V->msa->sqdesc[V->msa->lastidx], SQINFO_DESC); + if (V->msa->ss != NULL && V->msa->ss[V->msa->lastidx] != NULL) { + MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen, + V->msa->ss[V->msa->lastidx], &(sqinfo->ss)); + sqinfo->flags |= SQINFO_SS; + } + if (V->msa->sa != NULL && V->msa->sa[V->msa->lastidx] != NULL) { + MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen, + V->msa->sa[V->msa->lastidx], &(sqinfo->sa)); + sqinfo->flags |= SQINFO_SA; + } + V->msa->lastidx++; + } + else { + if (feof(V->f)) return 0; + + if (V->ssimode == -1) { /* normal mode */ + V->seq = (char*) calloc (kStartLength+1, sizeof(char)); + V->maxseq = kStartLength; + } else { /* index mode: discarding seq */ + V->seq = NULL; + V->maxseq = 0; + } + V->seqlen = 0; + V->sqinfo = sqinfo; + V->sqinfo->flags = 0; + + switch (V->format) { + case SQFILE_IG : readIG(V); break; + case SQFILE_STRIDER : readStrider(V); break; + case SQFILE_GENBANK : readGenBank(V); break; + case SQFILE_FASTA : readPearson(V); break; + case SQFILE_EMBL : readEMBL(V); break; + case SQFILE_ZUKER : readZuker(V); break; + case SQFILE_PIR : readPIR(V); break; + case SQFILE_GCGDATA : readGCGdata(V); break; + + case SQFILE_GCG : + do { /* skip leading comments on GCG file */ + gotuw = (strstr(V->buf,"..") != NULL); + if (gotuw) readUWGCG(V); + SeqfileGetLine(V); + } while (! 
feof(V->f)); + break; + + case SQFILE_IDRAW: /* SRE: no attempt to read idraw postscript */ + default: + squid_errno = SQERR_FORMAT; + free(V->seq); + return 0; + } + if (V->seq != NULL) /* (it can be NULL in indexing mode) */ + V->seq[V->seqlen] = 0; /* stick a string terminator on it */ + } + + /* Cleanup + */ + sqinfo->len = V->seqlen; + sqinfo->flags |= SQINFO_LEN; + *ret_seq = V->seq; + if (squid_errno == SQERR_OK) return 1; else return 0; +} + +/* Function: SeqfileFormat() + * Date: SRE, Tue Jun 22 10:58:58 1999 [Sanger Centre] + * + * Purpose: Determine format of an open file. + * Returns format code. + * Rewinds the file. + * + * Autodetects the following unaligned formats: + * SQFILE_FASTA + * SQFILE_GENBANK + * SQFILE_EMBL + * SQFILE_GCG + * SQFILE_GCGDATA + * SQFILE_PIR + * Also autodetects the following alignment formats: + * MSAFILE_STOCKHOLM + * MSAFILE_MSF + * MSAFILE_CLUSTAL + * MSAFILE_SELEX + * MSAFILE_PHYLIP + * + * Can't autodetect MSAFILE_A2M, calls it SQFILE_FASTA. + * MSAFileFormat() does the opposite. + * + * Args: sfp - open SQFILE + * + * Return: format code, or SQFILE_UNKNOWN if unrecognized + */ +int +SeqfileFormat(FILE *fp) +{ + char *buf; + int len; + int fmt = SQFILE_UNKNOWN; + int ndataline; + char *bufcpy, *s, *s1, *s2; + int has_junk; + + buf = NULL; + len = 0; + ndataline = 0; + has_junk = FALSE; + while (sre_fgets(&buf, &len, fp) != NULL) + { + if (IsBlankline(buf)) continue; + + /* Well-behaved formats identify themselves in first nonblank line. 
+ */ + if (ndataline == 0) + { + if (strncmp(buf, ">>>>", 4) == 0 && strstr(buf, "Len: ")) + { fmt = SQFILE_GCGDATA; goto DONE; } + + if (buf[0] == '>') + { fmt = SQFILE_FASTA; goto DONE; } + + if (strncmp(buf, "!!AA_SEQUENCE", 13) == 0 || + strncmp(buf, "!!NA_SEQUENCE", 13) == 0) + { fmt = SQFILE_GCG; goto DONE; } + + if (strncmp(buf, "# STOCKHOLM 1.", 14) == 0) + { fmt = MSAFILE_STOCKHOLM; goto DONE; } + + if (strncmp(buf, "CLUSTAL", 7) == 0 && + strstr(buf, "multiple sequence alignment") != NULL) + { fmt = MSAFILE_CLUSTAL; goto DONE; } + + if (strncmp(buf, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0 || + strncmp(buf, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) + { fmt = MSAFILE_MSF; goto DONE; } + + /* PHYLIP id: also just a good bet */ + bufcpy = sre_strdup(buf, -1); + s = bufcpy; + if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) != NULL && + (s2 = sre_strtok(&s, WHITESPACE, NULL)) != NULL && + IsInt(s1) && + IsInt(s2)) + { free(bufcpy); fmt = MSAFILE_PHYLIP; goto DONE; } + free(bufcpy); + } + + /* We trust that other formats identify themselves soon. + */ + /* dead giveaways for extended SELEX */ + if (strncmp(buf, "#=AU", 4) == 0 || + strncmp(buf, "#=ID", 4) == 0 || + strncmp(buf, "#=AC", 4) == 0 || + strncmp(buf, "#=DE", 4) == 0 || + strncmp(buf, "#=GA", 4) == 0 || + strncmp(buf, "#=TC", 4) == 0 || + strncmp(buf, "#=NC", 4) == 0 || + strncmp(buf, "#=SQ", 4) == 0 || + strncmp(buf, "#=SS", 4) == 0 || + strncmp(buf, "#=CS", 4) == 0 || + strncmp(buf, "#=RF", 4) == 0) + { fmt = MSAFILE_SELEX; goto DONE; } + + if (strncmp(buf, "///", 3) == 0 || strncmp(buf, "ENTRY ", 6) == 0) + { fmt = SQFILE_PIR; goto DONE; } + + /* a ha, diagnostic of an (old) MSF file */ + if ((strstr(buf, "..") != NULL) && + (strstr(buf, "MSF:") != NULL) && + (strstr(buf, "Check:")!= NULL)) + { fmt = MSAFILE_MSF; goto DONE; } + + /* unaligned GCG (must follow MSF test!) 
*/ + if (strstr(buf, " Check: ") != NULL && strstr(buf, "..") != NULL) + { fmt = SQFILE_GCG; goto DONE; } + + if (strncmp(buf,"LOCUS ",6) == 0 || strncmp(buf,"ORIGIN ",6) == 0) + { fmt = SQFILE_GENBANK; goto DONE; } + + if (strncmp(buf,"ID ",5) == 0 || strncmp(buf,"SQ ",5) == 0) + { fmt = SQFILE_EMBL; goto DONE; } + + /* But past here, we're being desperate. A simple SELEX file is + * very difficult to detect; we can only try to disprove it. + */ + s = buf; + if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue; /* skip blank lines */ + if (strchr("#%", *s1) != NULL) continue; /* skip comment lines */ + + /* Disproof 1. Noncomment, nonblank lines in a SELEX file + * must have at least two space-delimited fields (name/seq) + */ + if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) + has_junk = TRUE; + + /* Disproof 2. + * The sequence field should look like a sequence. + */ + if (s2 != NULL && Seqtype(s2) == kOtherSeq) + has_junk = TRUE; + + ndataline++; + if (ndataline == 300) break; /* only look at first 300 lines */ + } + + if (ndataline == 0) + Die("Sequence file contains no data"); + + /* If we've made it this far, we've run out of data, but there + * was at least one line of it; check if we've + * disproven SELEX. If not, cross our fingers, pray, and guess SELEX. + */ + if (has_junk == TRUE) fmt = SQFILE_UNKNOWN; + else fmt = MSAFILE_SELEX; + + DONE: + if (buf != NULL) free(buf); + rewind(fp); + return fmt; +} + +/* Function: GCGBinaryToSequence() + * + * Purpose: Convert a GCG 2BIT binary string to DNA sequence. + * 0 = C 1 = T 2 = A 3 = G + * 4 nts/byte + * + * Args: seq - binary sequence. Converted in place to DNA. + * len - length of DNA. 
binary is (len+3)/4 bytes + */ +int +GCGBinaryToSequence(char *seq, int len) +{ + int bpos; /* position in binary */ + int spos; /* position in sequence */ + char twobit; + int i; + + for (bpos = (len-1)/4; bpos >= 0; bpos--) + { + twobit = seq[bpos]; + spos = bpos*4; + + for (i = 3; i >= 0; i--) + { + switch (twobit & 0x3) { + case 0: seq[spos+i] = 'C'; break; + case 1: seq[spos+i] = 'T'; break; + case 2: seq[spos+i] = 'A'; break; + case 3: seq[spos+i] = 'G'; break; + } + twobit = twobit >> 2; + } + } + seq[len] = '\0'; + return 1; +} + + +/* Function: GCGchecksum() + * Date: SRE, Mon May 31 11:13:21 1999 [St. Louis] + * + * Purpose: Calculate a GCG checksum for a sequence. + * Code provided by Steve Smith of Genetics + * Computer Group. + * + * Args: seq - sequence to calculate checksum for. + * may contain gap symbols. + * len - length of sequence (usually known, + * so save a strlen() call) + * + * Returns: GCG checksum. + */ +int +GCGchecksum(char *seq, int len) +{ + int i; /* position in sequence */ + int chk = 0; /* calculated checksum */ + + for (i = 0; i < len; i++) + chk = (chk + (i % 57 + 1) * (sre_toupper((int) seq[i]))) % 10000; + return chk; +} + + +/* Function: GCGMultchecksum() + * + * Purpose: GCG checksum for a multiple alignment: sum of + * individual sequence checksums (including their + * gap characters) modulo 10000. + * + * Implemented using spec provided by Steve Smith of + * Genetics Computer Group. + * + * Args: seqs - sequences to be checksummed; aligned or not + * nseq - number of sequences + * + * Return: the checksum, a number between 0 and 9999 + */ +int +GCGMultchecksum(char **seqs, int nseq) +{ + int chk = 0; + int idx; + + for (idx = 0; idx < nseq; idx++) + chk = (chk + GCGchecksum(seqs[idx], strlen(seqs[idx]))) % 10000; + return chk; +} + + + + +/* Function: Seqtype() + * + * Purpose: Returns a (very good) guess about type of sequence: + * kDNA, kRNA, kAmino, or kOtherSeq. + * + * Modified from, and replaces, Gilbert getseqtype(). 
+ */ +int +Seqtype(char *seq) +{ + int saw; /* how many non-gap characters I saw */ + char c; + int po = 0; /* count of protein-only */ + int nt = 0; /* count of t's */ + int nu = 0; /* count of u's */ + int na = 0; /* count of nucleotides */ + int aa = 0; /* count of amino acids */ + int no = 0; /* count of others */ + + /* Look at the first 300 non-gap characters + */ + for (saw = 0; *seq != '\0' && saw < 300; seq++) + { + c = sre_toupper((int) *seq); + if (! isgap(c)) + { + if (strchr(protonly, c)) po++; + else if (strchr(primenuc,c)) { + na++; + if (c == 'T') nt++; + else if (c == 'U') nu++; + } + else if (strchr(aminos,c)) aa++; + else if (isalpha((int) c)) no++; + saw++; + } + } + + if (no > 0) return kOtherSeq; + else if (po > 0) return kAmino; + else if (na > aa) { + if (nu > nt) return kRNA; + else return kDNA; + } + else return kAmino; /* ooooh. risky. */ +} + + +/* Function: GuessAlignmentSeqtype() + * Date: SRE, Wed Jul 7 09:42:34 1999 [St. Louis] + * + * Purpose: Try to guess whether an alignment is protein + * or nucleic acid; return a code for the + * type (kRNA, kDNA, or kAmino). + * + * Args: aseq - array of aligned sequences. (Could also + * be an rseq unaligned sequence array) + * nseq - number of aseqs + * + * Returns: kRNA, kDNA, kAmino; + * kOtherSeq if inconsistency is detected. 
+ */ +int +GuessAlignmentSeqtype(char **aseq, int nseq) +{ + int idx; + int nrna = 0; + int ndna = 0; + int namino = 0; + int nother = 0; + + for (idx = 0; idx < nseq; idx++) + switch (Seqtype(aseq[idx])) { + case kRNA: nrna++; break; + case kDNA: ndna++; break; + case kAmino: namino++; break; + default: nother++; + } + + /* Unambiguous decisions: + */ + if (nother) return kOtherSeq; + if (namino == nseq) return kAmino; + if (ndna == nseq) return kDNA; + if (nrna == nseq) return kRNA; + + /* Ambiguous decisions: + */ + if (namino == 0) return kRNA; /* it's nucleic acid, but seems mixed RNA/DNA */ + return kAmino; /* some amino acid seen; others probably short seqs, some + of which may be entirely ACGT (ala,cys,gly,thr). We + could be a little more sophisticated: U would be a giveaway + that we're not in protein seqs */ +} + +/* Function: WriteSimpleFASTA() + * Date: SRE, Tue Nov 16 18:06:00 1999 [St. Louis] + * + * Purpose: Just write a FASTA format sequence to a file; + * minimal interface, mostly for quick and dirty programs. + * + * Args: fp - open file handle (stdout, possibly) + * seq - sequence to output + * name - name for the sequence + * desc - optional description line, or NULL. + * + * Returns: void + */ +void +WriteSimpleFASTA(FILE *fp, char *seq, char *name, char *desc) +{ + char buf[61]; + int len; + int pos; + + len = strlen(seq); + buf[60] = '\0'; + fprintf(fp, ">%s %s\n", name, desc != NULL ? desc : ""); + for (pos = 0; pos < len; pos += 60) + { + strncpy(buf, seq+pos, 60); + fprintf(fp, "%s\n", buf); + } +} + +int +WriteSeq(FILE *outf, int outform, char *seq, SQINFO *sqinfo) +{ + int numline = 0; + int lines = 0, spacer = 0, width = 50, tab = 0; + int i, j, l, l1, ibase; + char endstr[10]; + char s[100]; /* buffer for sequence */ + char ss[100]; /* buffer for structure */ + int checksum = 0; + int seqlen; + int which_case; /* 0 = do nothing. 1 = upper case. 
2 = lower case */ + int dostruc; /* TRUE to print structure lines*/ + + which_case = 0; + dostruc = FALSE; + seqlen = (sqinfo->flags & SQINFO_LEN) ? sqinfo->len : strlen(seq); + + if (IsAlignmentFormat(outform)) + Die("Tried to write an aligned format with WriteSeq() -- bad, bad."); + + + strcpy( endstr,""); + l1 = 0; + checksum = GCGchecksum(seq, seqlen); + + switch (outform) { + case SQFILE_UNKNOWN: /* no header, just sequence */ + strcpy(endstr,"\n"); /* end w/ extra blank line */ + break; + + case SQFILE_GENBANK: + fprintf(outf,"LOCUS %s %d bp\n", + (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name, + seqlen); + fprintf(outf,"DEFINITION %s\n", + (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); + fprintf(outf,"ACCESSION %s\n", + (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-"); + fprintf(outf,"ORIGIN \n"); + spacer = 11; + numline = 1; + strcpy(endstr, "\n//"); + break; + + case SQFILE_GCGDATA: + fprintf(outf, ">>>>%s 9/95 ASCII Len: %d\n", sqinfo->name, seqlen); + fprintf(outf, "%s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); + break; + + case SQFILE_PIR: + fprintf(outf, "ENTRY %s\n", + (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name); + fprintf(outf, "TITLE %s\n", + (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); + fprintf(outf, "ACCESSION %s\n", + (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-"); + fprintf(outf, "SUMMARY #Length %d #Checksum %d\n", + sqinfo->len, checksum); + fprintf(outf, "SEQUENCE\n"); + fprintf(outf, " 5 10 15 20 25 30\n"); + spacer = 2; /* spaces after every residue */ + numline = 1; /* number lines w/ coords */ + width = 30; /* 30 aa per line */ + strcpy(endstr, "\n///"); + break; + + case SQFILE_SQUID: + fprintf(outf, "NAM %s\n", sqinfo->name); + if (sqinfo->flags & (SQINFO_ID | SQINFO_ACC | SQINFO_START | SQINFO_STOP | SQINFO_OLEN)) + fprintf(outf, "SRC %s %s %d..%d::%d\n", + (sqinfo->flags & SQINFO_ID) ? sqinfo->id : "-", + (sqinfo->flags & SQINFO_ACC) ? 
sqinfo->acc : "-", + (sqinfo->flags & SQINFO_START) ? sqinfo->start : 0, + (sqinfo->flags & SQINFO_STOP) ? sqinfo->stop : 0, + (sqinfo->flags & SQINFO_OLEN) ? sqinfo->olen : 0); + if (sqinfo->flags & SQINFO_DESC) + fprintf(outf, "DES %s\n", sqinfo->desc); + if (sqinfo->flags & SQINFO_SS) + { + fprintf(outf, "SEQ +SS\n"); + dostruc = TRUE; /* print structure lines too */ + } + else + fprintf(outf, "SEQ\n"); + numline = 1; /* number seq lines w/ coords */ + strcpy(endstr, "\n++"); + break; + + case SQFILE_EMBL: + fprintf(outf,"ID %s\n", + (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name); + fprintf(outf,"AC %s\n", + (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-"); + fprintf(outf,"DE %s\n", + (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-"); + fprintf(outf,"SQ %d BP\n", seqlen); + strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/ + tab = 5; /** added 31jan91 */ + spacer = 11; /** added 31jan91 */ + break; + + case SQFILE_GCG: + fprintf(outf,"%s\n", sqinfo->name); + if (sqinfo->flags & SQINFO_ACC) + fprintf(outf,"ACCESSION %s\n", sqinfo->acc); + if (sqinfo->flags & SQINFO_DESC) + fprintf(outf,"DEFINITION %s\n", sqinfo->desc); + fprintf(outf," %s Length: %d (today) Check: %d ..\n", + sqinfo->name, seqlen, checksum); + spacer = 11; + numline = 1; + strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */ + break; + + case SQFILE_STRIDER: /* ?? map ?*/ + fprintf(outf,"; ### from DNA Strider ;-)\n"); + fprintf(outf,"; DNA sequence %s, %d bases, %d checksum.\n;\n", + sqinfo->name, seqlen, checksum); + strcpy(endstr, "\n//"); + break; + + /* SRE: Don had Zuker default to Pearson, which is not + intuitive or helpful, since Zuker's MFOLD can't read + Pearson format. More useful to use kIG */ + case SQFILE_ZUKER: + which_case = 1; /* MFOLD requires upper case. */ + /*FALLTHRU*/ + case SQFILE_IG: + fprintf(outf,";%s %s\n", + sqinfo->name, + (sqinfo->flags & SQINFO_DESC) ? 
sqinfo->desc : ""); + fprintf(outf,"%s\n", sqinfo->name); + strcpy(endstr,"1"); /* == linear dna */ + break; + + case SQFILE_RAW: /* Raw: no header at all. */ + break; + + default : + case SQFILE_FASTA: + fprintf(outf,">%s %s\n", sqinfo->name, + (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : ""); + break; + } + + if (which_case == 1) s2upper(seq); + if (which_case == 2) s2lower(seq); + + + width = MIN(width,100); + for (i=0, l=0, ibase = 1, lines = 0; i < seqlen; ) { + if (l1 < 0) l1 = 0; + else if (l1 == 0) { + if (numline) fprintf(outf,"%8d ",ibase); + for (j=0; jflags & SQINFO_SS) ? sqinfo->ss[i] : '.'; + l++; i++; + l1++; /* don't count spaces for width*/ + if (l1 == width || i == seqlen) { + s[l] = ss[l] = '\0'; + l = 0; l1 = 0; + if (dostruc) + { + fprintf(outf, "%s\n", s); + if (numline) fprintf(outf," "); + for (j=0; jformat, &rseqs[num], &(sqinfo[num]))) + { + num++; + if (num == numalloced) /* more seqs coming, alloc more room */ + { + numalloced += 16; + rseqs = (char **) ReallocOrDie (rseqs, numalloced*sizeof(char *)); + sqinfo = (SQINFO *) ReallocOrDie (sqinfo, numalloced * sizeof(SQINFO)); + } + } + SeqfileClose(dbfp); + + *ret_rseqs = rseqs; + *ret_sqinfo = sqinfo; + *ret_num = num; + return 1; +} + + +/* Function: String2SeqfileFormat() + * Date: SRE, Sun Jun 27 15:25:54 1999 [TW 723 over Canadian Shield] + * + * Purpose: Convert a string (e.g. from command line option arg) + * to a format code. Case insensitive. Return + * MSAFILE_UNKNOWN/SQFILE_UNKNOWN if string is bad. + * Uses codes defined in squid.h (unaligned formats) and + * msa.h (aligned formats). + * + * Args: s - string to convert; e.g. "stockholm" + * + * Returns: format code; e.g. 
MSAFILE_STOCKHOLM + */ +int +String2SeqfileFormat(char *s) +{ + char *s2; + int code = SQFILE_UNKNOWN; + + if (s == NULL) return SQFILE_UNKNOWN; + s2 = sre_strdup(s, -1); + s2upper(s2); + + if (strcmp(s2, "FASTA") == 0) code = SQFILE_FASTA; + else if (strcmp(s2, "GENBANK") == 0) code = SQFILE_GENBANK; + else if (strcmp(s2, "EMBL") == 0) code = SQFILE_EMBL; + else if (strcmp(s2, "GCG") == 0) code = SQFILE_GCG; + else if (strcmp(s2, "GCGDATA") == 0) code = SQFILE_GCGDATA; + else if (strcmp(s2, "RAW") == 0) code = SQFILE_RAW; + else if (strcmp(s2, "IG") == 0) code = SQFILE_IG; + else if (strcmp(s2, "STRIDER") == 0) code = SQFILE_STRIDER; + else if (strcmp(s2, "IDRAW") == 0) code = SQFILE_IDRAW; + else if (strcmp(s2, "ZUKER") == 0) code = SQFILE_ZUKER; + else if (strcmp(s2, "PIR") == 0) code = SQFILE_PIR; + else if (strcmp(s2, "SQUID") == 0) code = SQFILE_SQUID; + else if (strcmp(s2, "STOCKHOLM") == 0) code = MSAFILE_STOCKHOLM; + else if (strcmp(s2, "SELEX") == 0) code = MSAFILE_SELEX; + else if (strcmp(s2, "MSF") == 0) code = MSAFILE_MSF; + else if (strcmp(s2, "CLUSTAL") == 0) code = MSAFILE_CLUSTAL; + else if (strcmp(s2, "A2M") == 0) code = MSAFILE_A2M; + else if (strcmp(s2, "PHYLIP") == 0) code = MSAFILE_PHYLIP; + else if (strcmp(s2, "EPS") == 0) code = MSAFILE_EPS; + + free(s2); + return code; +} +char * +SeqfileFormat2String(int code) +{ + switch (code) { + case SQFILE_UNKNOWN: return "unknown"; + case SQFILE_FASTA: return "FASTA"; + case SQFILE_GENBANK: return "Genbank"; + case SQFILE_EMBL: return "EMBL"; + case SQFILE_GCG: return "GCG"; + case SQFILE_GCGDATA: return "GCG data library"; + case SQFILE_RAW: return "raw"; + case SQFILE_IG: return "Intelligenetics"; + case SQFILE_STRIDER: return "MacStrider"; + case SQFILE_IDRAW: return "Idraw Postscript"; + case SQFILE_ZUKER: return "Zuker"; + case SQFILE_PIR: return "PIR"; + case SQFILE_SQUID: return "SQUID"; + case MSAFILE_STOCKHOLM: return "Stockholm"; + case MSAFILE_SELEX: return "SELEX"; + case MSAFILE_MSF: 
return "MSF"; + case MSAFILE_CLUSTAL: return "Clustal"; + case MSAFILE_A2M: return "a2m"; + case MSAFILE_PHYLIP: return "Phylip"; + case MSAFILE_EPS: return "EPS"; + default: + Die("Bad code passed to MSAFormat2String()"); + } + /*NOTREACHED*/ + return NULL; +} + + +/* Function: MSAToSqinfo() + * Date: SRE, Tue Jul 20 14:36:56 1999 [St. Louis] + * + * Purpose: Take an MSA and generate a SQINFO array suitable + * for use in annotating the unaligned sequences. + * Return the array. + * + * Permanent temporary code. sqinfo was poorly designed. + * it must eventually be replaced, but the odds + * of this happening soon are nil, so I have to deal. + * + * Args: msa - the alignment + * + * Returns: ptr to allocated sqinfo array. + * Freeing is ghastly: free in each individual sqinfo[i] + * with FreeSequence(NULL, &(sqinfo[i])), then + * free(sqinfo). + */ +SQINFO * +MSAToSqinfo(MSA *msa) +{ + int idx; + SQINFO *sqinfo; + + sqinfo = MallocOrDie(sizeof(SQINFO) * msa->nseq); + + for (idx = 0; idx < msa->nseq; idx++) + { + sqinfo[idx].flags = 0; + SetSeqinfoString(&(sqinfo[idx]), + msa->sqname[idx], SQINFO_NAME); + SetSeqinfoString(&(sqinfo[idx]), + MSAGetSeqAccession(msa, idx), SQINFO_ACC); + SetSeqinfoString(&(sqinfo[idx]), + MSAGetSeqDescription(msa, idx), SQINFO_DESC); + + if (msa->ss != NULL && msa->ss[idx] != NULL) { + MakeDealignedString(msa->aseq[idx], msa->alen, + msa->ss[idx], &(sqinfo[idx].ss)); + sqinfo[idx].flags |= SQINFO_SS; + } + + if (msa->sa != NULL && msa->sa[idx] != NULL) { + MakeDealignedString(msa->aseq[idx], msa->alen, + msa->sa[idx], &(sqinfo[idx].sa)); + sqinfo[idx].flags |= SQINFO_SA; + } + + sqinfo[idx].len = DealignedLength(msa->aseq[idx]); + sqinfo[idx].flags |= SQINFO_LEN; + } + return sqinfo; +} + + + +/* cc -o sqio_test -DA_QUIET_DAY -L. 
sqio.c -lsquid */ +#ifdef A_QUIET_DAY +#include "ssi.h" +int +main(int argc, char **argv) +{ + FILE *fp; + char *filename; + char *buf; + int len; + int mode = 3; + SSIOFFSET off; + + filename = argv[1]; + + if (mode == 1) { + buf = malloc(sizeof(char) * 256); + if ((fp = fopen(filename, "r")) == NULL) + Die("open of %s failed", filename); + while (fgets(buf, 255, fp) != NULL) + ; + fclose(fp); + free(buf); + } else if (mode == 2) { + if ((fp = fopen(filename, "r")) == NULL) + Die("open of %s failed", filename); + buf = NULL; len = 0; + while (sre_fgets(&buf, &len, fp) != NULL) + SSIGetFilePosition(fp, SSI_OFFSET_I32, &off); + fclose(fp); + free(buf); + } else if (mode == 3) { + SQFILE *dbfp; + SQINFO info; + + if ((dbfp = SeqfileOpen(filename, SQFILE_FASTA, NULL)) == NULL) + Die("open of %s failed", filename); + while (ReadSeq(dbfp, dbfp->format, &buf, &info)) { + SSIGetFilePosition(dbfp->f, SSI_OFFSET_I32, &off); + FreeSequence(buf, &info); + } + SeqfileClose(dbfp); + } + +} + + +#endif diff --git a/forester/archive/RIO/others/hmmer/squid/squid.h.in b/forester/archive/RIO/others/hmmer/squid/squid.h.in new file mode 100644 index 0000000..2cf9a73 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/squid.h.in @@ -0,0 +1,473 @@ +/* @configure_input@ */ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +#ifndef SQUIDH_INCLUDED +#define SQUIDH_INCLUDED + +/* squid.h + * Header file for my library of sequence functions. 
+ * + * CVS $Id: squid.h.in,v 1.1.1.1 2005/03/22 08:34:25 cmzmasek Exp $ + */ + +#include +#include +#include +#include /* for sysconf() #define's */ + + +#if DEBUGLEVEL > 0 +#include /* for SQD_DASSERT1(), etc. */ +#endif + +#include "squidconf.h" /* #define's generated by ./configure script */ + +/***************************************************************** + * Integers of guaranteed size. (used for instance in gsi.c, gsi2.c) + * These are set by the ./configure script; if they show up as FIXME, + * they must be manually edited to appropriate type definitions. You + * do need 64-bit integers in the current code; email me if this + * prevents you from compiling SQUID and tell me your system (I don't + * know of any systems that don't have 64-bit integers these days). + *****************************************************************/ +typedef @SQD_UINT16@ sqd_uint16; +typedef @SQD_UINT32@ sqd_uint32; +typedef @SQD_UINT64@ sqd_uint64; + +#ifdef USE_HOST_BYTESWAP_FUNCTIONS +#include /* only for ntohl() and friends. */ +#include /* only for ntohl() and friends. */ +#define sre_ntoh16(x) ntohs(x); +#define sre_ntoh32(x) ntohl(x); +#define sre_hton16(x) htons(x); +#define sre_hton32(x) htonl(x); +#endif /* USE_HOST_BYTESWAP_FUNCTIONS */ + +/* Library version info is made available as a global to + * any interested program. These are defined in iupac.c + * with the other globals. 
+ */ +extern char squid_version[]; /* version number */ +extern char squid_date[]; /* date of release */ +extern int squid_errno; /* error codes */ + + + +/**************************************************** + * Error codes returned by squid library functions (squid_errno) + ****************************************************/ + +#define SQERR_OK 0 /* no error */ +#define SQERR_UNKNOWN 1 /* generic error, unidentified */ +#define SQERR_NODATA 2 /* unexpectedly NULL stream */ +#define SQERR_MEM 3 /* malloc or realloc failed */ +#define SQERR_NOFILE 4 /* file not found */ +#define SQERR_FORMAT 5 /* file format not recognized */ +#define SQERR_PARAMETER 6 /* bad parameter passed to func */ +#define SQERR_DIVZERO 7 /* error in sre_math.c */ +#define SQERR_INCOMPAT 8 /* incompatible parameters */ +#define SQERR_EOD 9 /* end-of-data (often normal) */ + +/**************************************************** + * Single sequence information + ****************************************************/ +#define SQINFO_NAMELEN 64 +#define SQINFO_DESCLEN 128 + +struct seqinfo_s { + int flags; /* what extra data are available */ + char name[SQINFO_NAMELEN];/* up to 63 characters of name */ + char id[SQINFO_NAMELEN]; /* up to 63 char of database identifier */ + char acc[SQINFO_NAMELEN]; /* up to 63 char of database accession # */ + char desc[SQINFO_DESCLEN];/* up to 127 char of description */ + int len; /* length of this seq */ + int start; /* (1..len) start position on source seq */ + int stop; /* (1..len) end position on source seq */ + int olen; /* original length of source seq */ + int type; /* kRNA, kDNA, kAmino, or kOther */ + char *ss; /* 0..len-1 secondary structure string */ + char *sa; /* 0..len-1 % side chain surface access. 
*/ +}; +typedef struct seqinfo_s SQINFO; + +#define SQINFO_NAME (1 << 0) +#define SQINFO_ID (1 << 1) +#define SQINFO_ACC (1 << 2) +#define SQINFO_DESC (1 << 3) +#define SQINFO_START (1 << 4) +#define SQINFO_STOP (1 << 5) +#define SQINFO_LEN (1 << 6) +#define SQINFO_TYPE (1 << 7) +#define SQINFO_OLEN (1 << 8) +#define SQINFO_SS (1 << 9) +#define SQINFO_SA (1 << 10) + + +/**************************************************** + * Sequence alphabet: see also iupac.c + ****************************************************/ + /* IUPAC symbols defined globally in iupac.c */ +struct iupactype { + char sym; /* character representation */ + char symcomp; /* complement (regular char */ + char code; /* my binary rep */ + char comp; /* binary encoded complement */ +}; +extern struct iupactype iupac[]; +#define IUPACSYMNUM 17 + +extern char *stdcode1[]; /* 1-letter amino acid translation code */ +extern char *stdcode3[]; /* 3-letter amino acid translation code */ +extern float dnafq[]; /* nucleotide occurrence frequencies */ +extern float aafq[]; /* amino acid occurrence frequencies */ +extern char aa_alphabet[]; /* amino acid alphabet */ +extern int aa_index[]; /* convert 0..19 indices to 0..26 */ + + /* valid symbols in IUPAC code */ +#define NUCLEOTIDES "ACGTUNRYMKSWHBVDacgtunrymkswhbvd" +#define AMINO_ALPHABET "ACDEFGHIKLMNPQRSTVWY" +#define DNA_ALPHABET "ACGT" +#define RNA_ALPHABET "ACGU" +#define WHITESPACE " \t\n" + +#define isgap(c) ((c) == ' ' || (c) == '.' 
|| (c) == '_' || (c) == '-' || (c) == '~') + + +/**************************************************** + * Sequence i/o: originally from Don Gilbert's readseq + ****************************************************/ +#include "msa.h" /* for multiple sequence alignment support */ + + /* buffer size for reading in lines from sequence files*/ +#define LINEBUFLEN 4096 + +/* sequence types parsed by Seqtype() */ +/* note that these must match hmmAMINO and hmmNUCLEIC in HMMER */ +#define kOtherSeq 0 /* hmmNOTSETYET */ +#define kDNA 1 +#define kRNA 2 /* hmmNUCLEIC */ +#define kAmino 3 /* hmmAMINO */ + +/* Unaligned sequence file formats recognized + * Coexists with definitions of multiple alignment formats in msa.h: + * >100 reserved for alignment formats + * <100 reserved for unaligned formats + * 0 reserved for unknown + * + * Some "legacy" formats are supported only when explicitly + * requested; not autodetected by SeqfileFormat(). + * + * DON'T REASSIGN THESE CODES. They're written into + * GSI index files. You can use new ones, but reassigning + * the sense of old ones will break GSI indices. + * Alignment format codes were reassigned with the creation + * of msa.c, but before Stockholm format, there were no + * indexed alignment databases. + */ +#define SQFILE_UNKNOWN 0 /* unknown format */ +#define SQFILE_IG 1 /* Intelligenetics (!) */ +#define SQFILE_GENBANK 2 /* GenBank flatfile */ + /* 3 was A2M. Now an alignment format */ +#define SQFILE_EMBL 4 /* EMBL or Swissprot flatfile */ +#define SQFILE_GCG 5 /* GCG single sequence files */ +#define SQFILE_STRIDER 6 /* MacStrider (!!) */ +#define SQFILE_FASTA 7 /* FASTA format: default */ +#define SQFILE_ZUKER 8 /* Zuker MFOLD format (legacy) */ +#define SQFILE_IDRAW 9 /* Idraw-style PostScript (legacy) */ + /* 10 was SELEX. Now alignment format */ + /* 11 was MSF. 
Now alignment format */ +#define SQFILE_PIR 12 /* PIR format */ +#define SQFILE_RAW 13 /* raw sequence */ +#define SQFILE_SQUID 14 /* my obsolete squid format */ + /* 15 was kXPearson, extended FASTA; withdrawn */ +#define SQFILE_GCGDATA 16 /* GCG data library file */ + /* 17 was Clustal. Now alignment format*/ + +#define IsUnalignedFormat(fmt) ((fmt) && (fmt) < 100) + +#include "ssi.h" + +struct ReadSeqVars { + FILE *f; /* open file pointer */ + char *fname; /* name of file; used for diagnostics */ + int linenumber; /* what line are we on in the file */ + + char *buf; /* dynamically allocated sre_fgets() buffer */ + int buflen; /* allocation length for buf */ + + int ssimode; /* SSI_OFFSET_I32 or SSI_OFFSET_I64 */ + SSIOFFSET ssioffset; /* disk offset to last line read into buf */ + SSIOFFSET r_off; /* offset to start of record */ + SSIOFFSET d_off; /* offset to start of sequence data */ + + int rpl; /* residues per data line for this file; -1 if unset, 0 if invalid */ + int lastrpl; /* rpl on last line seen */ + int maxrpl; /* max rpl on any line of the file */ + int bpl; /* bytes per data line; -1 if unset, 0 if invalid */ + int lastbpl; /* bpl on last line seen */ + int maxbpl; /* max bpl on any line of the file */ + + char *seq; /* growing sequence during parse */ + SQINFO *sqinfo; /* name, id, etc, gathered during parse */ + char *sp; + int seqlen; /* current sequence length */ + int maxseq; /* current allocation length for seq */ + + int format; /* format of seqfile we're reading. */ + int do_gzip; /* TRUE if f is a pipe from gzip -dc */ + int do_stdin; /* TRUE if f is stdin */ + + /* An (important) hack for sequential access of multiple alignment files: + * we read the whole alignment in, + * and then copy it one sequence at a time into seq and sqinfo. + * It is active if msa is non NULL. + * msa->lastidx is reused/overloaded: used to keep track of what + * seq we'll return next. + * afp->format is the real format, while SQFILE->format is kMSA. 
+ * Because we keep it in the SQFILE structure, + * ReadSeq() and friends are always reentrant for multiple seqfiles. + */ + MSA *msa; + MSAFILE *afp; +}; +typedef struct ReadSeqVars SQFILE; + + +/**************************************************** + * Cluster analysis and phylogenetic tree support + ****************************************************/ + +/* struct phylo_s - a phylogenetic tree + * + * For N sequences, there will generally be an array of 0..N-2 + * phylo_s structures representing the nodes of a tree. + * [0] is the root. The indexes of left and + * right children are somewhat confusing so be careful. The + * indexes can have values of 0..2N-2. If they are 0..N-1, they + * represent pointers to individual sequences. If they are + * >= N, they represent pointers to a phylo_s structure + * at (index - N). + */ +struct phylo_s { + int parent; /* index of parent, N..2N-2, or -1 for root */ + int left; /* index of one of the branches, 0..2N-2 */ + int right; /* index of other branch, 0..2N-2 */ + float diff; /* difference score between seqs */ + float lblen; /* left branch length */ + float rblen; /* right branch length */ + char *is_in; /* 0..N-1 flag array, 1 if seq included */ + int incnum; /* number of seqs included at this node */ +}; + + +/* Strategies for cluster analysis; cluster by mean distance, + * minimum distance, or maximum distance. + */ +enum clust_strategy { CLUSTER_MEAN, CLUSTER_MAX, CLUSTER_MIN }; + +/**************************************************** + * Generic data structure support + ****************************************************/ + +/* a struct intstack_s implements a pushdown stack for storing + * single integers. 
+ */ +struct intstack_s { + int data; + struct intstack_s *nxt; +}; + +/**************************************************** + * Binary nucleotide alphabet support + ****************************************************/ + +/* Binary encoding of the IUPAC code for nucleotides + * + * four-bit "word", permitting rapid degenerate matching + * A C G T/U + * 0 0 1 0 + */ +#define NTA 8 +#define NTC 4 +#define NTG 2 +#define NTT 1 +#define NTU 1 +#define NTN 15 /* A|C|G|T */ +#define NTR 10 /* A|G */ +#define NTY 5 /* C|T */ +#define NTM 12 /* A|C */ +#define NTK 3 /* G|T */ +#define NTS 6 /* C|G */ +#define NTW 9 /* A|T */ +#define NTH 13 /* A|C|T */ +#define NTB 7 /* C|G|T */ +#define NTV 14 /* A|C|G */ +#define NTD 11 /* A|G|T */ +#define NTGAP 16 /* GAP */ +#define NTEND 0 /* null string terminator */ + +/* ntmatch(): bitwise comparison of two nuc's + * note that it's sensitive to the order; + * probe may be degenerate but target should not be + */ +#define ntmatch(probe, target) ((probe & target) == target) + +/**************************************************** + * Support for a portable, flexible Getopt() + ****************************************************/ + +/* Structure: opt_s + * + * Structure for declaring options to a main(). + */ +struct opt_s { + char *name; /* name of option, e.g. "--option1" or "-o" */ + int single; /* TRUE if a single letter option */ + int argtype; /* for typechecking, e.g. sqdARG_INT */ +}; + /* acceptable argtype's... */ +#define sqdARG_NONE 0 /* no argument */ +#define sqdARG_INT 1 /* something that atoi() can grok */ +#define sqdARG_FLOAT 2 /* something that atof() can grok */ +#define sqdARG_CHAR 3 /* require single character or digit */ +#define sqdARG_STRING 4 /* anything goes */ + +/**************************************************** + * Support for convenient Perl-y regexp matching + * See hsregexp.c for copyright notice: this code is derived + * from Henry Spencer's freely distributed regexp library. 
+ ****************************************************/ + +#define NSUBEXP 10 +typedef struct sqd_regexp { + char *startp[NSUBEXP]; + char *endp[NSUBEXP]; + char regstart; /* Internal use only. */ + char reganch; /* Internal use only. */ + char *regmust; /* Internal use only. */ + int regmlen; /* Internal use only. */ + char program[1]; /* Unwarranted chumminess with compiler. */ +} sqd_regexp; + +/* Strparse() defines and manages these. + * sqd_parse[0] contains the substring that matched the pattern. + * sqd_parse[1-9] contain substrings matched with ()'s. + */ +extern char *sqd_parse[10]; + +/**************************************************** + * Portable detection of multiprocessor # of CPUs. + * #include + * long foo = SQD_NPROC; + * returns the number of available processors. + * if foo == -1, we failed. + ****************************************************/ + +/* Our problem here is that POSIX apparently doesn't specify + * a standard for how to get sysconf() to report the number of + * processors on-line. _SC_NPROCESSORS_ONLN is specified + * by SVR4.0MP. Thanks to W. Gish for help here. + */ +#undef SQD_NPROC +#ifdef _SC_NPROCESSORS_ONLN /* Sun Solaris, Digital UNIX */ +#define SQD_NPROC sysconf(_SC_NPROCESSORS_ONLN) +#else +#ifdef _SC_NPROC_ONLN /* Silicon Graphics IRIX */ +#define SQD_NPROC sysconf(_SC_NPROC_ONLN) +#else /* FreeBSD, Linux don't support getting ncpu via sysconf() */ +#define SQD_NPROC -1 +#endif +#endif + +/**************************************************** + * Three levels of debugging printf's and assert's + * level 1: little impact on verbosity or performance + * level 2: moderate impact + * level 3: high impact + * Example: + * SQD_DPRINTF3(("Matrix row %d col %d = %f\n", i, j, val)); + * Note the double parentheses; these are important. 
+ ****************************************************/ + +#ifndef DEBUGLEVEL +#define DEBUGLEVEL 0 +#endif + +#if (DEBUGLEVEL >= 1) +#define SQD_DPRINTF1(x) printf x +#define SQD_DASSERT1(x) assert x +#else +#define SQD_DPRINTF1(x) +#define SQD_DASSERT1(x) +#endif +#if (DEBUGLEVEL >= 2) +#define SQD_DPRINTF2(x) printf x +#define SQD_DASSERT2(x) assert x +#else +#define SQD_DPRINTF2(x) +#define SQD_DASSERT2(x) +#endif +#if (DEBUGLEVEL >= 3) +#define SQD_DPRINTF3(x) printf x +#define SQD_DASSERT3(x) assert x +#else +#define SQD_DPRINTF3(x) +#define SQD_DASSERT3(x) +#endif + +/* PANIC is called for failures of Std C/POSIX functions, + * instead of my own functions. Panic() calls perror() and exits + * abnormally. + */ +#define PANIC Panic(__FILE__, __LINE__) + +/* Malloc/realloc calls are wrapped + */ +#define MallocOrDie(x) sre_malloc(__FILE__, __LINE__, (x)) +#define ReallocOrDie(x,y) sre_realloc(__FILE__, __LINE__, (x), (y)) + +/**************************************************** + * Miscellaneous macros and defines + ****************************************************/ + +#define CHOOSE(a) ((int) (sre_random() * (a))) + /* must declare swapfoo to use SWAP() */ +#define SWAP(a,b) {swapfoo = b; b = a; a = swapfoo;} +#define ScalarsEqual(a,b) (fabs((a)-(b)) < 1e-7) + +#ifndef MIN +#define MIN(a,b) (((a)<(b))?(a):(b)) +#endif +#ifndef MAX +#define MAX(a,b) (((a)>(b))?(a):(b)) +#endif + +/* For convenience and (one hopes) clarity in boolean tests: + */ +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +/* Somewhere, there is a universe in which Unix vendors comply + * with the ANSI C standard. 
Unfortunately, it is not ours: + */ +#ifndef EXIT_SUCCESS +#define EXIT_SUCCESS 0 +#endif +#ifndef EXIT_FAILURE +#define EXIT_FAILURE 1 +#endif + +#include "sqfuncs.h" /* squid function declarations */ +#endif /* SQUIDH_INCLUDED */ diff --git a/forester/archive/RIO/others/hmmer/squid/squidconf.h.in b/forester/archive/RIO/others/hmmer/squid/squidconf.h.in new file mode 100644 index 0000000..354c912 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/squidconf.h.in @@ -0,0 +1,76 @@ +/* @configure_input@ */ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +#ifndef SQUIDCONFH_INCLUDED +#define SQUIDCONFH_INCLUDED + +/* squidconf.h + * Captures #define's generated by the ./configure script; + * this configuration information is #included at the start of + * squid.h + */ + +/***************************************************************** + * Sizes of integer types. + * various things are set by ./configure; the code + * uses WORDS_BIGENDIAN and USE_HOST_BYTESWAP_FUNCTIONS. 
+ *****************************************************************/ +#undef WORDS_BIGENDIAN +#define SIZEOF_UNSIGNED_SHORT 0 +#define SIZEOF_UNSIGNED_INT 0 +#define SIZEOF_UNSIGNED_LONG 0 +#define SIZEOF_UNSIGNED_LONG_LONG 0 +#undef HAVE_NTOHS /* if defined, system provides ntohs() */ +#undef HAVE_NTOHL /* if defined, system provides ntohl() */ +#undef HAVE_HTONS /* if defined, system provides htons() */ +#undef HAVE_HTONL /* if defined, system provides htonl() */ +#if defined HAVE_NTOHL && defined HAVE_NTOHS && defined HAVE_HTONS && defined HAVE_HTONL +#define USE_HOST_BYTESWAP_FUNCTIONS 1 +#endif + +/***************************************************************** + * Can we support arithmetic 64-bit file offsets? + * four possible models checked for: + * 1. ftello(), fseeko() with 64-bit off_t + * 2. ftello64(), fseeko64() with 64-bit off64_t + * 3. ftell64(), fseek64() with 64-bit integer + * 4. fgetpos(), fsetpos() with an fpos_t that happens to be a + * 64-bit integer, even though ANSI says we're not supposed to know + * anything about fpos_t's internals. + * Based on what ./configure tells us about these, we set + * HAS_64BIT_FILE_OFFSETS or not. 
+ *****************************************************************/ +#undef HAVE_FTELLO +#undef HAVE_FSEEKO +#undef HAVE_FTELLO64 +#undef HAVE_FSEEKO64 +#undef HAVE_FTELL64 +#undef HAVE_FSEEK64 +#undef ARITHMETIC_FPOS_T +#undef HAVE_STAT64 +#define SIZEOF_FPOS_T -1 +#define SIZEOF_OFF_T -1 +#define SIZEOF_OFF64_T -1 + +#if defined HAVE_FTELLO && defined HAVE_FSEEKO && SIZEOF_OFF_T == 8 +#define HAS_64BIT_FILE_OFFSETS 1 +#elif defined HAVE_FTELLO64 && defined HAVE_FSEEKO64 && SIZEOF_OFF64_T == 8 +#define HAS_64BIT_FILE_OFFSETS 1 +#elif defined HAVE_FTELL64 && defined HAVE_FSEEK64 +#define HAS_64BIT_FILE_OFFSETS 1 +#elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8 +#define HAS_64BIT_FILE_OFFSETS 1 +#else +#undef HAS_64BIT_FILE_OFFSETS +#endif + + +#endif /* SQUIDCONFH_INCLUDED */ diff --git a/forester/archive/RIO/others/hmmer/squid/squidcore.c b/forester/archive/RIO/others/hmmer/squid/squidcore.c new file mode 100644 index 0000000..9970f0d --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/squidcore.c @@ -0,0 +1,53 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* squidcore.c + * SRE, Sun Jun 20 17:19:04 1999 [Graeme's kitchen] + * + * Core functions for SQUID library. + * RCS $Id: squidcore.c,v 1.1.1.1 2005/03/22 08:34:32 cmzmasek Exp $ + */ + +#include +#include "version.h" + +/* Function: Banner() + * Date: SRE, Sun Jun 20 17:19:41 1999 [Graeme's kitchen] + * + * Purpose: Print a package version and copyright banner. + * Used by all the main()'s. 
+ * + * Expects to be able to pick up defined macros: + * macro example + * ------ -------------- + * PACKAGE "HMMER" + * RELEASE "2.0.42" + * RELEASEDATE "April 1 1999" + * COPYRIGHT "Copyright (C) 1992-1999 Washington University School of Medicine" + * LICENSE "HMMER is freely distributed under the GNU General Public License (GPL)." + * + * This gives us a general mechanism to update release information + * without changing multiple points in the code; we can also override + * SQUID release data with another package's release data (e.g. + * HMMER) just by redefining macros. + * + * Args: fp - where to print it + * banner - one-line program description, e.g.: + * "foobar - make bars from foo with elan" + * Returns: (void) + */ +void +Banner(FILE *fp, char *banner) +{ + fprintf(fp, "%s\n%s %s (%s)\n%s\n%s\n", banner, PACKAGE, RELEASE, RELEASEDATE, COPYRIGHT, LICENSE); + fprintf(fp, "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n"); +} + + diff --git a/forester/archive/RIO/others/hmmer/squid/sre_ctype.c b/forester/archive/RIO/others/hmmer/squid/sre_ctype.c new file mode 100644 index 0000000..6be7b82 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/sre_ctype.c @@ -0,0 +1,39 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* sre_ctype.c + * + * For portability. Some systems have functions tolower, toupper + * as macros (for instance, MIPS M-2000 RISC/os!) 
+ * + * RCS $Id: sre_ctype.c,v 1.1.1.1 2005/03/22 08:34:16 cmzmasek Exp $ + */ + +#include +#include "squid.h" + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + +int +sre_tolower(int c) +{ + if (isupper(c)) return tolower(c); + else return c; +} + +int +sre_toupper(int c) +{ + if (islower(c)) return toupper(c); + else return c; +} + diff --git a/forester/archive/RIO/others/hmmer/squid/sre_math.c b/forester/archive/RIO/others/hmmer/squid/sre_math.c new file mode 100644 index 0000000..f5ecda2 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/sre_math.c @@ -0,0 +1,787 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* sre_math.c + * + * Portability for and extensions to C math library. + * RCS $Id: sre_math.c,v 1.1.1.1 2005/03/22 08:34:32 cmzmasek Exp $ + */ + +#include +#include +#include +#include "squid.h" + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + +static int sre_reseed = 0; /* TRUE to reinit sre_random() */ +static int sre_randseed = 666; /* default seed for sre_random() */ + +/* Function: ExponentialRandom() + * Date: SRE, Mon Sep 6 21:24:29 1999 [St. Louis] + * + * Purpose: Pick an exponentially distributed random variable + * 0 > x >= infinity + * + * Args: (void) + * + * Returns: x + */ +float +ExponentialRandom(void) +{ + float x; + + do x = sre_random(); while (x == 0.0); + return -log(x); +} + +/* Function: Gaussrandom() + * + * Pick a Gaussian-distributed random variable + * with some mean and standard deviation, and + * return it. + * + * Based on RANLIB.c public domain implementation. + * Thanks to the authors, Barry W. 
/* Gaussrandom() -- header comment, continued.
 *
 * (Thanks to the authors, Barry W.) Brown and James Lovato,
 * University of Texas, M.D. Anderson Cancer Center, Houston TX.
 * Their implementation is from Ahrens and Dieter, "Extensions
 * of Forsythe's method for random sampling from the normal
 * distribution", Math. Comput. 27:927-937 (1973).
 *
 * Impenetrability of the code is to be blamed on its FORTRAN/f2c lineage.
 *
 * What the code below visibly does:
 *   - draws uniforms from sre_random();
 *   - uses the first draw to pick a sign flag (s) and one of 32 table
 *     slots (i), then runs an accept/reject loop over tables a, d, t, h;
 *   - returns stddev * snorm + mean.
 *
 * Every scratch variable is `static`, so the state is shared between calls
 * (concurrent callers would trample each other).
 *
 * NOTE(review): if sre_random() ever returns exactly 0.0 on the first draw,
 * u stays 0 at S120 (u += u), the S110/S120 loop never exits, and i keeps
 * growing past the end of d[]. Confirm whether sre_random() can yield 0.0
 * for the seeds in use.
 */
float
Gaussrandom(float mean, float stddev)
{
  /* Constant tables of the Ahrens-Dieter method; individual meanings are
   * not documented in this file. */
  static float a[32] = {
    0.0,3.917609E-2,7.841241E-2,0.11777,0.1573107,0.1970991,0.2372021,0.2776904,
    0.3186394,0.36013,0.4022501,0.4450965,0.4887764,0.5334097,0.5791322,
    0.626099,0.6744898,0.7245144,0.7764218,0.8305109,0.8871466,0.9467818,
    1.00999,1.077516,1.150349,1.229859,1.318011,1.417797,1.534121,1.67594,
    1.862732,2.153875
  };
  static float d[31] = {
    0.0,0.0,0.0,0.0,0.0,0.2636843,0.2425085,0.2255674,0.2116342,0.1999243,
    0.1899108,0.1812252,0.1736014,0.1668419,0.1607967,0.1553497,0.1504094,
    0.1459026,0.14177,0.1379632,0.1344418,0.1311722,0.128126,0.1252791,
    0.1226109,0.1201036,0.1177417,0.1155119,0.1134023,0.1114027,0.1095039
  };
  static float t[31] = {
    7.673828E-4,2.30687E-3,3.860618E-3,5.438454E-3,7.0507E-3,8.708396E-3,
    1.042357E-2,1.220953E-2,1.408125E-2,1.605579E-2,1.81529E-2,2.039573E-2,
    2.281177E-2,2.543407E-2,2.830296E-2,3.146822E-2,3.499233E-2,3.895483E-2,
    4.345878E-2,4.864035E-2,5.468334E-2,6.184222E-2,7.047983E-2,8.113195E-2,
    9.462444E-2,0.1123001,0.136498,0.1716886,0.2276241,0.330498,0.5847031
  };
  static float h[31] = {
    3.920617E-2,3.932705E-2,3.951E-2,3.975703E-2,4.007093E-2,4.045533E-2,
    4.091481E-2,4.145507E-2,4.208311E-2,4.280748E-2,4.363863E-2,4.458932E-2,
    4.567523E-2,4.691571E-2,4.833487E-2,4.996298E-2,5.183859E-2,5.401138E-2,
    5.654656E-2,5.95313E-2,6.308489E-2,6.737503E-2,7.264544E-2,7.926471E-2,
    8.781922E-2,9.930398E-2,0.11556,0.1404344,0.1836142,0.2790016,0.7010474
  };
  static long i;
  static float snorm,u,s,ustar,aa,w,y,tt;

  u = sre_random();
  s = 0.0;
  if(u > 0.5) s = 1.0;          /* s == 1.0 later negates the result */
  u += (u-s);                   /* fold u into [0,1] on either side of 0.5 */
  u = 32.0*u;
  i = (long) (u);               /* table slot 0..32 */
  if(i == 32) i = 31;
  if(i == 0) goto S100;         /* slot 0 is handled by the "tail" path */
  /*
   * START CENTER
   */
  ustar = u-(float)i;           /* fractional part of the scaled draw */
  aa = *(a+i-1);
S40:
  if(ustar <= *(t+i-1)) goto S60;
  w = (ustar-*(t+i-1))**(h+i-1);
S50:
  /*
   * EXIT (BOTH CASES)
   */
  y = aa+w;
  snorm = y;
  if(s == 1.0) snorm = -y;
  return (stddev*snorm + mean);
S60:
  /*
   * CENTER CONTINUED
   */
  u = sre_random();
  w = u*(*(a+i)-aa);
  tt = (0.5*w+aa)*w;
  goto S80;
S70:
  tt = u;
  ustar = sre_random();
S80:
  if(ustar > tt) goto S50;
  u = sre_random();
  if(ustar >= u) goto S70;
  ustar = sre_random();
  goto S40;
S100:
  /*
   * START TAIL
   */
  i = 6;
  aa = *(a+31);
  goto S120;
S110:
  aa += *(d+i-1);
  i += 1;
S120:
  u += u;                       /* keep doubling until u reaches 1.0 */
  if(u < 1.0) goto S110;
  u -= 1.0;
S140:
  w = u**(d+i-1);
  tt = (0.5*w+aa)*w;
  goto S160;
S150:
  tt = u;
S160:
  ustar = sre_random();
  if(ustar > tt) goto S50;
  u = sre_random();
  if(ustar >= u) goto S150;
  u = sre_random();
  goto S140;
}


/* Function: Linefit()
 *
 * Purpose:  Given points x[0..N-1] and y[0..N-1], fit to
 *           a straight line y = a + bx.
 *           a, b, and the linear correlation coefficient r
 *           are filled in for return.
 *
 * Args:     x     - x values of data
 *           y     - y values of data
 *           N     - number of data points
 *           ret_a - RETURN: intercept
 *           ret_b - RETURN: slope
 *           ret_r - RETURN: correlation coefficient
 *
 * Return:   1 on success, 0 on failure.
 */
+ */ +int +Linefit(float *x, float *y, int N, float *ret_a, float *ret_b, float *ret_r) +{ + float xavg, yavg; + float sxx, syy, sxy; + int i; + + /* Calculate averages, xavg and yavg + */ + xavg = yavg = 0.0; + for (i = 0; i < N; i++) + { + xavg += x[i]; + yavg += y[i]; + } + xavg /= (float) N; + yavg /= (float) N; + + sxx = syy = sxy = 0.0; + for (i = 0; i < N; i++) + { + sxx += (x[i] - xavg) * (x[i] - xavg); + syy += (y[i] - yavg) * (y[i] - xavg); + sxy += (x[i] - xavg) * (y[i] - yavg); + } + *ret_b = sxy / sxx; + *ret_a = yavg - xavg*(*ret_b); + *ret_r = sxy / (sqrt(sxx) * sqrt(syy)); + return 1; +} + + +/* Function: WeightedLinefit() + * + * Purpose: Given points x[0..N-1] and y[0..N-1] with + * variances (measurement errors) var[0..N-1], + * fit to a straight line y = mx + b. + * + * Method: Algorithm from Numerical Recipes in C, [Press88]. + * + * Return: (void) + * ret_m contains slope; ret_b contains intercept + */ +void +WeightedLinefit(float *x, float *y, float *var, int N, float *ret_m, float *ret_b) +{ + int i; + double s; + double sx, sy; + double sxx, sxy; + double delta; + double m, b; + + s = sx = sy = sxx = sxy = 0.; + for (i = 0; i < N; i++) + { + s += 1./var[i]; + sx += x[i] / var[i]; + sy += y[i] / var[i]; + sxx += x[i] * x[i] / var[i]; + sxy += x[i] * y[i] / var[i]; + } + + delta = s * sxx - (sx * sx); + b = (sxx * sy - sx * sxy) / delta; + m = (s * sxy - sx * sy) / delta; + + *ret_m = m; + *ret_b = b; +} + + +/* Function: Gammln() + * + * Returns the natural log of the gamma function of x. + * x is > 0.0. + * + * Adapted from a public domain implementation in the + * NCBI core math library. Thanks to John Spouge and + * the NCBI. (According to the NCBI, that's Dr. John + * "Gammas Galore" Spouge to you, pal.) 
+ */ +double +Gammln(double x) +{ + int i; + double xx, tx; + double tmp, value; + static double cof[11] = { + 4.694580336184385e+04, + -1.560605207784446e+05, + 2.065049568014106e+05, + -1.388934775095388e+05, + 5.031796415085709e+04, + -9.601592329182778e+03, + 8.785855930895250e+02, + -3.155153906098611e+01, + 2.908143421162229e-01, + -2.319827630494973e-04, + 1.251639670050933e-10 + }; + + /* Protect against x=0. We see this in Dirichlet code, + * for terms alpha = 0. This is a severe hack but it is effective + * and (we think?) safe. (due to GJM) + */ + if (x <= 0.0) return 999999.; + + xx = x - 1.0; + tx = tmp = xx + 11.0; + value = 1.0; + for (i = 10; i >= 0; i--) /* sum least significant terms first */ + { + value += cof[i] / tmp; + tmp -= 1.0; + } + value = log(value); + tx += 0.5; + value += 0.918938533 + (xx+0.5)*log(tx) - tx; + return value; +} + + +/* Vector operations for doubles and floats. + * DNorm(), FNorm() -- normalize a probability vector of length n. + * return 0 if all values were zero. + * DScale(), FScale() -- multiply all items in vector by scale + * DSet(), FSet() -- set all items in vector to value. + * DAdd(), FAdd() -- add vec2 to vec1. + * DDot(), FDot() -- calculate dot product of two vectors. + * DCopy(), FCopy() -- set vec1 to be same as vec2. 
+ * DMax(), FMax() -- return index of maximum element in vec + */ +int +DNorm(double *vec, int n) +{ + int x; + double sum; + + sum = 0.0; + for (x = 0; x < n; x++) sum += vec[x]; + if (sum != 0.0) + for (x = 0; x < n; x++) vec[x] /= sum; + else + { squid_errno = SQERR_DIVZERO; return 0; } + return 1; +} +int +FNorm(float *vec, int n) +{ + int x; + float sum; + + sum = 0.0; + for (x = 0; x < n; x++) sum += vec[x]; + if (sum != 0.0) + for (x = 0; x < n; x++) vec[x] /= sum; + else + { squid_errno = SQERR_DIVZERO; return 0; } + return 1; +} + +void +DScale(double *vec, int n, double scale) +{ + int x; + for (x = 0; x < n; x++) + vec[x] *= scale; +} +void +FScale(float *vec, int n, float scale) +{ + int x; + for (x = 0; x < n; x++) + vec[x] *= scale; +} + +void +DSet(double *vec, int n, double value) +{ + int x; + for (x = 0; x < n; x++) + vec[x] = value; +} +void +FSet(float *vec, int n, float value) +{ + int x; + for (x = 0; x < n; x++) + vec[x] = value; +} + +double +DSum(double *vec, int n) +{ + double sum = 0.; + int x; + for (x = 0; x < n; x++) + sum += vec[x]; + return sum; +} +float +FSum(float *vec, int n) +{ + float sum = 0.; + int x; + for (x = 0; x < n; x++) + sum += vec[x]; + return sum; +} + +void +DAdd(double *vec1, double *vec2, int n) +{ + int x; + for (x = 0; x < n; x++) + vec1[x] += vec2[x]; +} +void +FAdd(float *vec1, float *vec2, int n) +{ + int x; + for (x = 0; x < n; x++) + vec1[x] += vec2[x]; +} + +void +DCopy(double *vec1, double *vec2, int n) +{ + int x; + for (x = 0; x < n; x++) + vec1[x] = vec2[x]; +} +void +FCopy(float *vec1, float *vec2, int n) +{ + int x; + for (x = 0; x < n; x++) + vec1[x] = vec2[x]; +} + +double +DDot(double *vec1, double *vec2, int n) +{ + double result = 0.; + int x; + + for (x = 0; x < n; x++) + result += vec1[x] * vec2[x]; + return result; +} +float +FDot(float *vec1, float *vec2, int n) +{ + float result = 0.; + int x; + + for (x = 0; x < n; x++) + result += vec1[x] * vec2[x]; + return result; +} + +/* Functions: 
/* Functions: DMax(), FMax()
 * Date:      SRE, Fri Aug 29 11:14:08 1997 (Denver CO)
 *
 * Purpose:   return index of maximum element in vec.
 *            The first maximum wins on ties (strict > comparison);
 *            0 is returned when n <= 1.
 */
int
DMax(double *vec, int n)
{
  int i;
  int best = 0;

  for (i = 1; i < n; i++)
    if (vec[i] > vec[best]) best = i;
  return best;
}
int
FMax(float *vec, int n)
{
  int i;
  int best = 0;

  for (i = 1; i < n; i++)
    if (vec[i] > vec[best]) best = i;
  return best;
}


/* 2D matrix operations
 *
 * FMX2Alloc()/DMX2Alloc() make two allocations: a table of row pointers,
 * and one contiguous rows*cols slab that mx[0] points at; mx[r] is set to
 * mx[0] + r*cols. FMX2Free()/DMX2Free() release exactly those two
 * allocations, so they must only be handed matrices built by the
 * matching Alloc routine (and never NULL: mx[0] is dereferenced).
 */
float **
FMX2Alloc(int rows, int cols)
{
  float **mx;
  int     r;

  mx    = (float **) MallocOrDie(sizeof(float *) * rows);
  mx[0] = (float *)  MallocOrDie(sizeof(float) * rows * cols);
  for (r = 1; r < rows; r++)
    mx[r] = mx[0] + r*cols;
  return mx;
}
void
FMX2Free(float **mx)
{
  free(mx[0]);
  free(mx);
}
double **
DMX2Alloc(int rows, int cols)
{
  double **mx;
  int      r;

  mx    = (double **) MallocOrDie(sizeof(double *) * rows);
  mx[0] = (double *)  MallocOrDie(sizeof(double) * rows * cols);
  for (r = 1; r < rows; r++)
    mx[r] = mx[0] + r*cols;
  return mx;
}
void
DMX2Free(double **mx)
{
  free(mx[0]);
  free(mx);
}
/* Function: FMX2Multiply()
 *
 * Purpose:  Matrix multiplication.
 *           Multiply an m x p matrix A by a p x n matrix B,
 *           giving an m x n matrix C.
 *           Matrix C must be a preallocated matrix of the right
 *           size. C is written while A and B are still being
 *           read, so it must not share storage with either.
 *
 * The inner product is indexed by the loop variable k. It previously
 * used the bound p itself (A[i][p] * B[p][j]), which read one column
 * past A and one row past B and summed the same product p times.
 */
void
FMX2Multiply(float **A, float **B, float **C, int m, int p, int n)
{
  int i, j, k;

  for (i = 0; i < m; i++)
    for (j = 0; j < n; j++)
      {
	C[i][j] = 0.;
	for (k = 0; k < p; k++)
	  C[i][j] += A[i][k] * B[k][j];
      }
}

/* Function: sre_random()
 *
 * Purpose:  Return a uniform deviate from 0.0 to 1.0.
 *           sre_randseed is a static variable, set
 *           by sre_srandom(). sre_reseed is a static flag
 *           raised by sre_srandom(), saying that we need
 *           to re-initialize.
 *           [0.0 <= x < 1.0]
 *
 *           Uses a simple linear congruential generator with
 *           period 2^28. Based on discussion in Robert Sedgewick's
 *           _Algorithms in C_, Addison-Wesley, 1990.
 */
+ * + * Requires that long int's have at least 32 bits. + * + * Reliable and portable, but slow. Benchmarks on wol, + * using IRIX cc and IRIX C library rand() and random(): + * sre_random(): 0.8 usec/call + * random(): 0.3 usec/call + * rand(): 0.3 usec/call + */ +#define RANGE 268435456 /* 2^28 */ +#define DIV 16384 /* sqrt(RANGE) */ +#define MULT 72530821 /* my/Cathy's birthdays, x21, x even (Knuth)*/ +float +sre_random(void) +{ + static long rnd; + static int firsttime = 1; + long high1, low1; + long high2, low2; + + if (sre_reseed || firsttime) + { + sre_reseed = firsttime = 0; + if (sre_randseed <= 0) sre_randseed = 666; /* seeds of zero break me */ + high1 = sre_randseed / DIV; low1 = sre_randseed % DIV; + high2 = MULT / DIV; low2 = MULT % DIV; + rnd = (((high2*low1 + high1*low2) % DIV)*DIV + low1*low2) % RANGE; + } + high1 = rnd / DIV; low1 = rnd % DIV; + high2 = MULT / DIV; low2 = MULT % DIV; + rnd = (((high2*low1 + high1*low2) % DIV)*DIV + low1*low2) % RANGE; + + return ((float) rnd / (float) RANGE); +} +#undef RANGE +#undef DIV +#undef MULT + + +/* Function: sre_srandom() + * + * Purpose: Initialize with a random seed. Seed can be + * any integer. + */ +void +sre_srandom(int seed) +{ + if (seed < 0) seed = -1 * seed; + sre_reseed = 1; + sre_randseed = seed; +} + + +/* Functions: DChoose(), FChoose() + * + * Purpose: Make a random choice from a normalized distribution. + * DChoose() is for double-precision vectors; + * FChoose() is for single-precision float vectors. + * Returns the number of the choice. 
/* DChoose(), FChoose()
 *
 * Draw one uniform deviate from sre_random() and walk the cumulative
 * sum of p[0..N-1]; the first index whose running total exceeds the
 * draw is returned. If the walk falls off the end (the probabilities
 * summed to less than the draw), a level-2 debug assertion checks that
 * the total was close to 1 and a uniformly chosen index is returned
 * instead.
 */
int
DChoose(double *p, int N)
{
  double roll;                  /* random fraction */
  double sum;                   /* integrated prob */
  int    i;                     /* counter over the probs */

  roll = sre_random();
  sum  = 0.0;
  i    = 0;
  while (i < N)
    {
      sum += p[i];
      if (roll < sum) return i;
      i++;
    }
  SQD_DASSERT2((fabs(1.0 - sum) < 1e-14)); /* a verification at level 2 */
  return (int) (sre_random() * N);         /* bulletproof */
}
int
FChoose(float *p, int N)
{
  float roll;                   /* random fraction */
  float sum;                    /* integrated prob */
  int   i;                      /* counter over the probs */

  roll = sre_random();
  sum  = 0.0;
  i    = 0;
  while (i < N)
    {
      sum += p[i];
      if (roll < sum) return i;
      i++;
    }
  SQD_DASSERT2((fabs(1.0f - sum) < 1e-6f)); /* a verification at level 2 */
  return (int) (sre_random() * N);          /* bulletproof */
}

/* Functions: DLogSum(), FLogSum()
 *
 * Calculate the sum of a log vector
 * *in normal space*, and return the log of the sum.
 *
 * The largest element is factored out before exponentiating, and
 * elements more than 50 log units below it are skipped. logp[0] is
 * read unconditionally, so n must be at least 1.
 */
double
DLogSum(double *logp, int n)
{
  double peak;                  /* largest element of logp */
  double total;
  int    i;

  peak = logp[0];
  for (i = 1; i < n; i++)
    {
      if (logp[i] > peak) peak = logp[i];
    }

  total = 0.0;
  for (i = 0; i < n; i++)
    {
      if (logp[i] > peak - 50.)
	total += exp(logp[i] - peak);
    }
  return log(total) + peak;
}
float
FLogSum(float *logp, int n)
{
  float peak;                   /* largest element of logp */
  float total;
  int   i;

  peak = logp[0];
  for (i = 1; i < n; i++)
    {
      if (logp[i] > peak) peak = logp[i];
    }

  total = 0.0;
  for (i = 0; i < n; i++)
    {
      if (logp[i] > peak - 50.)
	total += exp(logp[i] - peak);
    }
  return log(total) + peak;
}


/* Function: IncompleteGamma()
 *
 * Purpose:  Returns 1 - P(a,x) where:
 *           P(a,x) = \frac{1}{\Gamma(a)} \int_{0}^{x} t^{a-1} e^{-t} dt
 *                  = \frac{\gamma(a,x)}{\Gamma(a)}
 *                  = 1 - \frac{\Gamma(a,x)}{\Gamma(a)}
 *
 *           Used in a chi-squared test: for a X^2 statistic x
 *           with v degrees of freedom, call:
 *                  p = IncompleteGamma(v/2., x/2.)
 *           to get the probability p that a chi-squared value
 *           greater than x could be obtained by chance even for
 *           a correct model. (i.e.
 */
p should be large, say + * 0.95 or more). + * + * Method: Based on ideas from Numerical Recipes in C, Press et al., + * Cambridge University Press, 1988. + * + * Args: a - for instance, degrees of freedom / 2 [a > 0] + * x - for instance, chi-squared statistic / 2 [x >= 0] + * + * Return: 1 - P(a,x). + */ +double +IncompleteGamma(double a, double x) +{ + int iter; /* iteration counter */ + + if (a <= 0.) Die("IncompleteGamma(): a must be > 0"); + if (x < 0.) Die("IncompleteGamma(): x must be >= 0"); + + /* For x > a + 1 the following gives rapid convergence; + * calculate 1 - P(a,x) = \frac{\Gamma(a,x)}{\Gamma(a)}: + * use a continued fraction development for \Gamma(a,x). + */ + if (x > a+1) + { + double oldp; /* previous value of p */ + double nu0, nu1; /* numerators for continued fraction calc */ + double de0, de1; /* denominators for continued fraction calc */ + + nu0 = 0.; /* A_0 = 0 */ + de0 = 1.; /* B_0 = 1 */ + nu1 = 1.; /* A_1 = 1 */ + de1 = x; /* B_1 = x */ + + oldp = nu1; + for (iter = 1; iter < 100; iter++) + { + /* Continued fraction development: + * set A_j = b_j A_j-1 + a_j A_j-2 + * B_j = b_j B_j-1 + a_j B_j-2 + * We start with A_2, B_2. 
+ */ + /* j = even: a_j = iter-a, b_j = 1 */ + /* A,B_j-2 are in nu0, de0; A,B_j-1 are in nu1,de1 */ + nu0 = nu1 + ((double)iter - a) * nu0; + de0 = de1 + ((double)iter - a) * de0; + + /* j = odd: a_j = iter, b_j = x */ + /* A,B_j-2 are in nu1, de1; A,B_j-1 in nu0,de0 */ + nu1 = x * nu0 + (double) iter * nu1; + de1 = x * de0 + (double) iter * de1; + + /* rescale */ + if (de1) + { + nu0 /= de1; + de0 /= de1; + nu1 /= de1; + de1 = 1.; + } + /* check for convergence */ + if (fabs((nu1-oldp)/nu1) < 1.e-7) + return nu1 * exp(a * log(x) - x - Gammln(a)); + + oldp = nu1; + } + Die("IncompleteGamma(): failed to converge using continued fraction approx"); + } + else /* x <= a+1 */ + { + double p; /* current sum */ + double val; /* current value used in sum */ + + /* For x <= a+1 we use a convergent series instead: + * P(a,x) = \frac{\gamma(a,x)}{\Gamma(a)}, + * where + * \gamma(a,x) = e^{-x}x^a \sum_{n=0}{\infty} \frac{\Gamma{a}}{\Gamma{a+1+n}} x^n + * which looks appalling but the sum is in fact rearrangeable to + * a simple series without the \Gamma functions: + * = \frac{1}{a} + \frac{x}{a(a+1)} + \frac{x^2}{a(a+1)(a+2)} ... + * and it's obvious that this should converge nicely for x <= a+1. + */ + + p = val = 1. / a; + for (iter = 1; iter < 10000; iter++) + { + val *= x / (a+(double)iter); + p += val; + + if (fabs(val/p) < 1.e-7) + return 1. 
- p * exp(a * log(x) - x - Gammln(a)); + } + Die("IncompleteGamma(): failed to converge using series approx"); + } + /*NOTREACHED*/ + return 0.; +} + diff --git a/forester/archive/RIO/others/hmmer/squid/sre_string.c b/forester/archive/RIO/others/hmmer/squid/sre_string.c new file mode 100644 index 0000000..15255ba --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/sre_string.c @@ -0,0 +1,524 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* sre_string.c + * + * my library of extra string functions. Some for portability + * across UNIXes + * + * RCS $Id: sre_string.c,v 1.1.1.1 2005/03/22 08:34:25 cmzmasek Exp $ + */ + +#include +#include +#include +#include +#include +#include "squid.h" + +/* Function: Strdup() + * + * Purpose: Implementation of the common (but non-ANSI) function + * strdup(). Robust against being passed a NULL pointer. + * + */ +char * +Strdup(char *s) +{ + char *new; + if (s == NULL) return NULL; + if ((new = (char *) malloc (strlen(s) +1)) == NULL) return NULL; + strcpy(new, s); + return new; +} + +/* Function: StringChop() + * Date: SRE, Wed Oct 29 12:10:02 1997 [TWA 721] + * + * Purpose: Chop trailing whitespace off of a string. 
/* StringChop()
 *
 * Overwrite the first character of the trailing run of whitespace with
 * '\0', in place. An all-whitespace or empty string becomes "".
 * The character is converted to unsigned char before isspace() so that
 * bytes above 0x7F are not passed as negative values.
 */
void
StringChop(char *s)
{
  int i;

  i = strlen(s) - 1;		                 /* set i at last char in string     */
  while (i >= 0 && isspace((unsigned char) s[i])) i--;   /* i now at last non-whitespace char, or -1 */
  s[i+1] = '\0';
}

/* Strinsert()
 *
 * Insert character c into s1 at index pos, shifting the remainder one
 * place to the right. s1 must have room for one more character; this
 * is not checked. Inserting '\0' simply truncates s1 at pos.
 * Always returns 1.
 */
int
Strinsert(char  *s1,            /* string to insert a char into  */
	  char   c,		/* char to insert                */
	  int    pos)		/* position in s1 to insert c at */
{
  char    oldc;
  char   *s;

  /* Ripple the carried character forward until the carried character is
   * the old terminator. */
  for (s = s1 + pos; c; s++)
    {
				/* swap current char for inserted one */
      oldc = *s;		/* pick up current */
      *s   = c;   		/* put down inserted one    */
      c    = oldc;		/* old becomes next to insert */
    }
  *s = '\0';

  return 1;
}


/* Strdelete()
 *
 * Remove the character at index pos by shifting the rest of the string
 * (terminator included) one place to the left. Always returns 1.
 */
int
Strdelete(char *s1,             /* string to delete a char from       */
	  int   pos)		/* position of char to delete 0..n-1  */
{
  char *s;

  for (s = s1 + pos; *s; s++)
    *s = *(s + 1);

  return 1;
}

/* s2lower(), s2upper()
 *
 * Convert a string in place with sre_tolower()/sre_toupper(). Each
 * character goes through unsigned char first, so bytes above 0x7F
 * reach the ctype routines as valid (non-negative) values.
 */
void
s2lower(char *s)
{
  for (; *s != '\0'; s++)
    *s = sre_tolower((unsigned char) *s);
}

void
s2upper(char *s)
{
  for (; *s != '\0'; s++)
    *s = sre_toupper((unsigned char) *s);
}


/* sre_malloc()
 *
 * malloc() wrapper that never returns NULL: on failure it calls Die()
 * with the requesting file and line. The size_t is cast to match the
 * existing %d / %ld conversions in the message formats.
 */
void *
sre_malloc(char *file, int line, size_t size)
{
  void *ptr;

  SQD_DPRINTF3(("MALLOC: %d bytes (file %s line %d)\n", (int) size, file, line));
  if ((ptr = malloc (size)) == NULL)
    Die("malloc of %ld bytes failed: file %s line %d", (long) size, file, line);
  return ptr;
}

/* sre_realloc()
 *
 * realloc() wrapper with the same die-on-failure policy as sre_malloc().
 */
void *
sre_realloc(char *file, int line, void *p, size_t size)
{
  void *ptr;

  if ((ptr = realloc(p, size)) == NULL)
    Die("realloc of %ld bytes failed: file %s line %d", (long) size, file, line);
  return ptr;
}



/* Function: Free2DArray(), Free3DArray()
 * Date:     SRE, Tue Jun  1 14:47:14 1999 [St. Louis]
 *
 * Purpose:  Convenience functions for free'ing 2D
 *           and 3D pointer arrays. Tolerates any of the
 *           pointers being NULL, to allow "sparse"
 *           arrays.
 *
 * Args:     p     - array to be freed
 *           dim1  - n for first dimension
 *           dim2  - n for second dimension
 *
 *           e.g.
 */
/* Free2DArray(), Free3DArray() -- header comment, continued.
 *
 *           (e.g.) a 2d array is indexed p[0..dim1-1][]
 *                  a 3D array is indexed p[0..dim1-1][0..dim2-1][]
 *
 * Returns:  void
 *
 * Diagnostics: (void)
 *              "never fails"
 */
void
Free2DArray(void **p, int dim1)
{
  int row;

  if (p == NULL) return;

  /* free(NULL) is a no-op, so sparse rows need no special casing */
  for (row = 0; row < dim1; row++)
    free(p[row]);
  free(p);
}
void
Free3DArray(void ***p, int dim1, int dim2)
{
  int plane, row;

  if (p == NULL) return;

  for (plane = 0; plane < dim1; plane++)
    {
      if (p[plane] == NULL) continue;   /* sparse: nothing below this slot */

      for (row = 0; row < dim2; row++)
	free(p[plane][row]);
      free(p[plane]);
    }
  free(p);
}


/* Function: RandomSequence()
 *
 * Purpose:  Generate an iid symbol sequence according
 *           to some alphabet, alphabet_size, probability
 *           distribution, and length. Return the
 *           sequence.
 *
 * Args:     alphabet  - e.g. "ACGT"
 *           p         - probability distribution [0..n-1]
 *           n         - number of symbols in alphabet
 *           len       - length of generated sequence
 *
 * Return:   ptr to random sequence (allocated with MallocOrDie();
 *           the caller frees it).
 */
char *
RandomSequence(char *alphabet, float *p, int n, int len)
{
  char *seq;
  char *cursor;
  int   remaining;

  seq    = (char *) MallocOrDie (sizeof(char) * (len+1));
  cursor = seq;
  for (remaining = len; remaining > 0; remaining--)
    *cursor++ = alphabet[FChoose(p,n)];
  *cursor = '\0';
  return seq;
}

/* Function: sre_fgets()
 * Date:     SRE, Thu May 13 10:56:28 1999 [St. Louis]
 *
 * Purpose:  Dynamic allocation version of fgets(),
 *           capable of reading unlimited line lengths.
 *
 * Args:     buf - ptr to a string (may be reallocated)
 *           n   - ptr to current allocated length of buf,
 *                 (may be changed)
 *           fp  - open file ptr for reading
 *
 *           Before the first call to sre_fgets(),
 *           buf should be initialized to NULL and n to 0.
 *           They're a linked pair, so don't muck with the
 *           allocation of buf or the value of n while
 *           you're still doing sre_fgets() calls with them.
 *
 * Returns:  ptr to the buffer on success.
 *           NULL on EOF (buf isn't to be used in this case)
 *           sre_fgets() *always* results in an allocation
 *           in buf.
 */
/* sre_fgets() -- header comment, continued.
 *
 *           The reason to have it return a ptr to buf
 *           is that it makes wrapper macros easy; see
 *           MSAFileGetLine() for an example.
 *
 * Example:  char *buf;
 *           int   n;
 *           FILE *fp;
 *
 *           fp  = fopen("my_file", "r");
 *           buf = NULL;
 *           n   = 0;
 *           while (sre_fgets(&buf, &n, fp) != NULL)
 *           {
 *             do stuff with buf;
 *           }
 *
 * The buffer starts at 128 bytes and grows in 128-byte steps until a
 * newline or end of input is reached. The two "ends with newline" tests
 * are guarded against a zero-length chunk (input that begins with a NUL
 * byte), which previously indexed one element before the chunk.
 */
char *
sre_fgets(char **buf, int *n, FILE *fp)
{
  char *s;
  int   len;
  int   pos;

  if (*n == 0)
    {
      *buf = MallocOrDie(sizeof(char) * 128);
      *n   = 128;
    }

  /* Simple case 1. We're sitting at EOF, or there's an error.
   *                fgets() returns NULL, so we return NULL.
   */
  if (fgets(*buf, *n, fp) == NULL) return NULL;

  /* Simple case 2. fgets() got a string, and it reached EOF.
   *                return success status, so caller can use
   *                the last line; on the next call we'll
   *                return the 0 for the EOF.
   */
  if (feof(fp)) return *buf;

  /* Simple case 3. We got a complete string, with \n,
   *                and don't need to extend the buffer.
   */
  len = strlen(*buf);
  if (len > 0 && (*buf)[len-1] == '\n') return *buf;

  /* The case we're waiting for. We have an incomplete string,
   * and we have to extend the buffer one or more times. Make
   * sure we overwrite the previous fgets's \0 (hence +(n-1)
   * in first step, rather than 128, and reads of 129, not 128).
   */
  pos = (*n)-1;
  while (1) {
    *n  += 128;
    *buf = ReallocOrDie(*buf, sizeof(char) * (*n));
    s = *buf + pos;
    if (fgets(s, 129, fp) == NULL) return *buf;
    len = strlen(s);
    if (len > 0 && s[len-1] == '\n') return *buf;
    pos += 128;
  }
  /*NOTREACHED*/
}

/* Function: sre_strcat()
 * Date:     SRE, Thu May 13 09:36:32 1999 [St. Louis]
 *
 * Purpose:  Dynamic memory version of strcat().
 *           appends src to the string that dest points to,
 *           extending allocation for dest if necessary.
 *
 *           One timing experiment (100 successive appends of
 *           1-255 char) shows sre_strcat() has about a 20%
 *           overhead relative to strcat().
 */
/* sre_strcat() -- header comment, continued.
 *
 *           However, if optional
 *           length info is passed, sre_strcat() is about 30%
 *           faster than strcat().
 *
 * Args:     dest  - ptr to string (char **), '\0' terminated
 *           ldest - length of dest, if known; or -1 if length unknown.
 *           src   - string to append to dest, '\0' terminated
 *           lsrc  - length of src, if known; or -1 if length unknown.
 *
 *           dest may be NULL, in which case this is
 *           the equivalent of dest = Strdup(src).
 *
 *           src may also be NULL, in which case
 *           dest is unmodified (but why would you want to pass
 *           a NULL src?)
 *
 *           if both dest and src are NULL, dest is
 *           unmodified; it stays NULL.
 *
 *           the length parameters are optional. If a -1
 *           is passed, sre_strcat() will call strlen() to
 *           determine the length itself. Passing length
 *           info saves the strlen() calls and can speed things
 *           up if lots of successive appends need to be done.
 *
 * Returns:  new length of dest (>=0 on success);
 *           dest is (probably) reallocated, and modified
 *           to a longer string, '\0' terminated.
 *
 * Hardening relative to the original:
 *   - a NULL *dest is treated as length 0 whatever ldest says, so the
 *     fresh allocation can no longer be overrun;
 *   - a NULL src is treated as length 0 whatever lsrc says;
 *   - the terminator is written explicitly instead of being copied
 *     from src[lsrc], so a caller-supplied lsrc shorter than the
 *     real string still yields a terminated result.
 */
int
sre_strcat(char **dest, int ldest, char *src, int lsrc)
{
  int   len1, len2;

  if      (*dest == NULL) len1 = 0;
  else if (ldest < 0)     len1 = strlen(*dest);
  else                    len1 = ldest;

  if      (src == NULL)   len2 = 0;
  else if (lsrc < 0)      len2 = strlen(src);
  else                    len2 = lsrc;

  if (len2 == 0) return len1;

  if (*dest == NULL) *dest = MallocOrDie(sizeof(char) * (len2+1));
  else               *dest = ReallocOrDie(*dest, sizeof(char) * (len1+len2+1));

  memcpy((*dest)+len1, src, len2);
  (*dest)[len1+len2] = '\0';
  return len1+len2;
}

/* Function: sre_strtok()
 * Date:     SRE, Wed May 19 16:30:20 1999 [St. Louis]
 *
 * Purpose:  Thread-safe version of strtok().
 *
 *           Returns ptr to next token in a string: skips
 *            until it reaches a character that is not in the delim
 *            string, and sets beginning of token. Skips to
 *            next delim character (or '\0') to set the end; replaces that
 *            character with '\0'.
 */
/* sre_strtok() -- header comment, continued.
 *
 *           If there's still more string left, sets s to point to next
 *            character after the '\0' that was written, so successive
 *            calls extract tokens in succession. If there was no string
 *            left, s points at the terminal '\0'.
 *
 *            If no token is found, returns NULL (and leaves *s alone).
 *
 *            Also returns the length of the token, which
 *            may save us a strlen() call in some applications.
 *
 * Limitations:
 *            *s can't be a constant string, since we write to it.
 *
 * Example:
 *            char *tok;
 *            int   len;
 *            char *s;
 *            char  buf[50] = "This is  a sentence.";
 *
 *            s = buf;
 *            tok = sre_strtok(&s, " ", &len);
 *                tok is "This"; s is "is  a sentence."; len is 4.
 *            tok = sre_strtok(&s, " ", &len);
 *                tok is "is"; s is " a sentence."; len is 2.
 *            tok = sre_strtok(&s, " ", &len);
 *                tok is "a"; s is "sentence."; len is 1.
 *            tok = sre_strtok(&s, " ", &len);
 *                tok is "sentence."; s is "\0"; len is 9.
 *            tok = sre_strtok(&s, " ", &len);
 *                tok is NULL; s is "\0", len is undefined.
 *
 *            (The doubled space in buf is what leaves the leading
 *            blank in s after the second call; with single spaces
 *            s would be "a sentence." at that point.)
 *
 * Args:     s     - a tmp, modifiable ptr to string
 *           delim - characters that delimits tokens
 *           len   - RETURN: length of token; pass NULL if not wanted
 *
 * Returns:  ptr to next token, or NULL if there aren't any.
 */
char *
sre_strtok(char **s, char *delim, int *len)
{
  char *tok;
  int   toklen;

  /* skip any leading delimiters */
  tok = *s + strspn(*s, delim);
  if (*tok == '\0') return NULL;

  toklen = strcspn(tok, delim);
  if (tok[toklen] != '\0')
    {				/* more input follows: cut here, resume after it */
      tok[toklen] = '\0';
      *s = tok + toklen + 1;
    }
  else
    {				/* token ran to the end of the string */
      *s = tok + toklen;
    }

  if (len != NULL) *len = toklen;
  return tok;
}



/* Function: sre_strdup()
 * Date:     SRE, Wed May 19 17:57:28 1999 [St. Louis]
 *
 * Purpose:  A version of the common but non-ANSI strdup()
 *           function. Can pass len, if known, to save a
 *           strlen() call.
 *
 * Args:     s  - string to duplicate
 *           n  - length of string, if known; -1 if unknown.
 *
 * Returns:  allocated copy of string.
 *           NULL on failure.
 */
+ */ +char * +sre_strdup(char *s, int n) +{ + char *new; + + if (s == NULL) return NULL; + if (n < 0) n = strlen(s); + new = MallocOrDie (sizeof(char) * (n+1)); + strcpy(new, s); + return new; +} + + +/* Function: sre_strncpy() + * Date: SRE, Tue Jun 22 10:10:46 1999 [Sanger Centre] + * + * Purpose: a strncpy() that makes sure it adds a trailing \0. + * + * Args: s1 - string to copy to (allocated n+1 or larger) + * s2 - string to copy from + * n - number of chars to copy + * + * Returns: s1. + * Done only for consistency with strncpy(). Not clear + * why it's useful for a strncpy() to return s1. + */ +char * +sre_strncpy(char *s1, char *s2, int n) +{ + strncpy(s1,s2,n); + s1[n] = '\0'; + return s1; +} + +/* Function: IsBlankline() + * Date: SRE, Fri Jun 18 14:36:08 1999 [St. Louis] + * + * Purpose: Returns TRUE if string consists solely of whitespace. + * + * Args: s - string to check + */ +int +IsBlankline(char *s) +{ + for (; *s != '\0'; s++) + if (! isspace(*s)) return FALSE; + return TRUE; +} + + + +#ifdef CUBS_WIN +/* A timing test for sre_strcat() + * cc -O2 -g sre_string.c sre_ctype.c sqerror.c sre_math.c hsregex.c -lm + * 15.200u - 5.360u = 9.84u if sre_strcat() with no length info passed + * 13.660u - 5.360u = 8.30u if strcat(), with a single malloc(). + * 11.370u - 5.360u = 6.01u if sre_strcat() with length info passed. 
/* Timing harness for sre_strcat(); compiled only when CUBS_WIN is
 * defined (the #ifdef opens just before this function's header comment).
 *
 * Builds 1000 strings, each by appending 100 random "ACGT" chunks with
 * sre_strcat(), passing the running length so no strlen() is needed.
 */
int main(void)
{
  float p[4] = {0.25, 0.25, 0.25, 0.25};   /* uniform symbol distribution */
  int   buflen;                            /* length of the chunk to append */
  int   len;                               /* running length of s1 */
  int   nappends;
  int   nstrings;
  char *s1 = NULL;
  char *s2;
  int   i;

  nappends = 100;
  nstrings = 1000;
  while (nstrings--)
    {
      /* s1 = malloc(sizeof(char) * (255*nappends+1));
	 s1[0] = '\0';
      */

      /* start each string from scratch: NULL dest, length 0 */
      s1 = NULL;
      len = 0;
      for (i = 0; i < nappends; i++)
	{
	  buflen = CHOOSE(255) + 1;
	  s2 = RandomSequence("ACGT", p, 4, buflen);

	  /* strcat(s1,s2); */
	  if ((len = sre_strcat(&s1, len, s2, buflen)) < 0) exit(1);
	  free(s2);
	}
      free(s1);
    }
  exit(0);
}
#endif /*CUBS_WIN*/


/* ---------------------------------------------------------------------
 * Patch boundary in the original dump:
 *   diff --git a/forester/archive/RIO/others/hmmer/squid/sreformat_main.c
 *              b/forester/archive/RIO/others/hmmer/squid/sreformat_main.c
 *   new file mode 100644
 *   index 0000000..709f5ba
 *   --- /dev/null
 *   +++ b/forester/archive/RIO/others/hmmer/squid/sreformat_main.c
 *   @@ -0,0 +1,251 @@
 * --------------------------------------------------------------------- */

/*****************************************************************
 * HMMER - Biological sequence analysis with profile HMMs
 * Copyright (C) 1992-1999 Washington University School of Medicine
 * All Rights Reserved
 *
 *     This source code is distributed under the terms of the
 *     GNU General Public License. See the files COPYING and LICENSE
 *     for details.
 *****************************************************************/

/* sreformat_main.c
 * Mon Sep 13 13:06:51 1993
 *
 * sreformat - reformat sequence files.
 */
+ * renamed sreformat from reformat, Tue Jun 30 10:53:38 1998 + * + * CVS $Id: sreformat_main.c,v 1.1.1.1 2005/03/22 08:34:25 cmzmasek Exp $ + */ + + +#include +#include +#include +#include "squid.h" +#include "msa.h" + +static char banner[] = "sreformat - convert between sequence formats"; + +static char usage[] = "\ +Usage: sreformat [-options] \n\ + Output format choices: Unaligned Aligned\n\ + ----------- -------\n\ + fasta stockholm\n\ + embl msf\n\ + genbank a2m\n\ + gcg phylip\n\ + gcgdata clustal\n\ + pir selex\n\ + raw eps\n\n\ + Available options are:\n\ + -h : help; print brief help on version and usage\n\ + -d : force DNA alphabet for nucleic acid sequence\n\ + -r : force RNA alphabet for nucleic acid sequence\n\ + -l : force lower case\n\ + -u : force upper case\n\ + -x : convert non-IUPAC chars in DNA to N's for IUPAC/BLAST compatibility\n\ +"; + +static char experts[] = "\ + Expert options:\n\ + --informat : input sequence file is in format \n\ + --mingap : remove columns containing all gaps (seqfile=alignment)\n\ + --nogap : remove columns containing any gaps (seqfile=alignment)\n\ + --pfam : modify Stockholm format output to be in PFAM style (1 line/seq)\n\ + --sam : try to convert gaps to SAM style (seqfile=alignment)\n\ + --samfrac : convert to SAM convention; cols w/ gapfrac > x are inserts\n\ + --gapsym : convert all gaps to character ''\n\ +"; + +static struct opt_s OPTIONS[] = { + { "-d", TRUE, sqdARG_NONE }, + { "-h", TRUE, sqdARG_NONE }, + { "-l", TRUE, sqdARG_NONE }, + { "-r", TRUE, sqdARG_NONE }, + { "-u", TRUE, sqdARG_NONE }, + { "-x", TRUE, sqdARG_NONE }, + { "--gapsym", FALSE, sqdARG_CHAR }, + { "--informat",FALSE, sqdARG_STRING }, + { "--mingap", FALSE, sqdARG_NONE }, + { "--nogap", FALSE, sqdARG_NONE }, + { "--pfam", FALSE, sqdARG_NONE }, + { "--sam", FALSE, sqdARG_NONE }, + { "--samfrac", FALSE, sqdARG_FLOAT }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +int +main(int argc, char **argv) +{ + char *seqfile; /* 
name of sequence file */ + char *format; + SQFILE *dbfp; /* open sequence file */ + int fmt; /* format of seqfile */ + int outfmt; /* output format */ + char *seq; /* sequence */ + SQINFO sqinfo; + int i; + + int force_rna; /* TRUE to force RNA alphabet */ + int force_dna; /* TRUE to force DNA alphabet */ + int force_lower; /* TRUE to force lower case */ + int force_upper; /* TRUE to force upper case */ + int x_is_bad; /* TRUE to convert X to N */ + int do_mingap; /* TRUE to remove columns containing all gaps */ + int do_nogap; /* TRUE to remove columns containing any gaps */ + int do_pfam; /* TRUE to make SELEX -> PFAM */ + int samize; /* TRUE to SAMize an A2M conversion */ + float samfrac; /* -1, or gap fraction for a SAM conversion */ + int expect_alignment; /* TRUE to expect an input alignment to convert */ + char gapsym; /* 0 if unset; else = character to use for gaps */ + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + + /*********************************************** + * Parse command line + ***********************************************/ + + force_rna = FALSE; + force_dna = FALSE; + force_upper = FALSE; + force_lower = FALSE; + x_is_bad = FALSE; + do_mingap = FALSE; + do_nogap = FALSE; + do_pfam = FALSE; + samize = FALSE; + samfrac = -1.0; + fmt = SQFILE_UNKNOWN; + expect_alignment = FALSE; + gapsym = 0; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) { + if (strcmp(optname, "-a") == 0) expect_alignment= TRUE; + else if (strcmp(optname, "-d") == 0) force_dna = TRUE; + else if (strcmp(optname, "-l") == 0) force_lower = TRUE; + else if (strcmp(optname, "-r") == 0) force_rna = TRUE; + else if (strcmp(optname, "-u") == 0) force_upper = TRUE; + else if (strcmp(optname, "-x") == 0) x_is_bad = TRUE; + else if (strcmp(optname, "--gapsym") == 0) gapsym = *optarg; + else if (strcmp(optname, "--mingap") == 0) do_mingap = TRUE; + else 
if (strcmp(optname, "--nogap") == 0) do_nogap = TRUE; + else if (strcmp(optname, "--pfam") == 0) do_pfam = TRUE; + else if (strcmp(optname, "--sam") == 0) samize = TRUE; + else if (strcmp(optname, "--samfrac") == 0) samfrac = atof(optarg); + else if (strcmp(optname, "--informat") == 0) { + fmt = String2SeqfileFormat(optarg); + if (fmt == SQFILE_UNKNOWN) + Die("unrecognized sequence file format \"%s\"", optarg); + } + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(EXIT_SUCCESS); + } + } + + if (argc - optind != 2) + Die("%s\n", usage); + if (force_lower && force_upper) + Die("Can't force both upper case and lower case. Stop trying to confuse me.\n%s", + usage); + if (force_rna && force_dna) + Die("Can't force both RNA and DNA. Stop trying to find bugs. You'll be sorry.\n%s", + usage); + + format = argv[optind]; optind++; + seqfile = argv[optind]; optind++; + + /*********************************************** + * Figure out what format we're supposed to write + ***********************************************/ + + if ((outfmt = String2SeqfileFormat(format)) == SQFILE_UNKNOWN) + Die("Unknown output format %s\n%s", format, usage); + + /*********************************************** + * Reformat the file, printing to stdout. + ***********************************************/ + + /* If the output format is an alignment, then the input format + * has to be an alignment. 
+ */ + if (IsAlignmentFormat(outfmt)) + { + MSAFILE *afp; + MSA *msa; + + if ((afp = MSAFileOpen(seqfile, fmt, NULL)) == NULL) + Die("Alignment file %s could not be opened for reading", seqfile); + + while ((msa = MSAFileRead(afp)) != NULL) + { + /* If asked, convert upper/lower convention and + * gap character conventions now + */ + if (do_mingap) MSAMingap(msa); + if (do_nogap) MSANogap(msa); + if (gapsym) AlignmentHomogenousGapsym(msa->aseq, msa->nseq, msa->alen, gapsym); + if (samize) SAMizeAlignment(msa->aseq, msa->nseq, msa->alen); + if (samfrac >= 0) SAMizeAlignmentByGapFrac(msa->aseq, msa->nseq, msa->alen, samfrac); + + for (i = 0; i < msa->nseq; i++) + { + if (force_dna) ToDNA(msa->aseq[i]); + if (force_rna) ToRNA(msa->aseq[i]); + if (x_is_bad) ToIUPAC(msa->aseq[i]); + if (force_lower) s2lower(msa->aseq[i]); + if (force_upper) s2upper(msa->aseq[i]); + } + + /* This code block can be replaced with a + * MSAFileWrite() call someday... SRE Sun Apr 22 19:17:19 2001 + */ + switch (outfmt) { + case MSAFILE_A2M: WriteA2M(stdout, msa); break; + case MSAFILE_CLUSTAL: WriteClustal(stdout, msa); break; + case MSAFILE_MSF: WriteMSF(stdout, msa); break; + case MSAFILE_PHYLIP: WritePhylip(stdout, msa); break; + case MSAFILE_SELEX: + if (do_pfam) WriteSELEXOneBlock(stdout, msa); + else WriteSELEX(stdout, msa); + break; + case MSAFILE_EPS: EPSWriteSmallMSA(stdout, msa); break; + case MSAFILE_STOCKHOLM: + if (do_pfam) WriteStockholmOneBlock(stdout, msa); + else WriteStockholm(stdout, msa); + break; + default: + Die("can't write. 
no such alignment format %d\n", outfmt); + } + + MSAFree(msa); + } + MSAFileClose(afp); + } + else + { + if ((dbfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL) + Die("Failed to open sequence file %s for reading", seqfile); + + while (ReadSeq(dbfp, fmt, &seq, &sqinfo)) + { + if (force_dna) ToDNA(seq); + if (force_rna) ToRNA(seq); + if (x_is_bad) ToIUPAC(seq); + if (force_lower) s2lower(seq); + if (force_upper) s2upper(seq); + + WriteSeq(stdout, outfmt, seq, &sqinfo); + FreeSequence(seq, &sqinfo); + } + SeqfileClose(dbfp); + } + + return 0; +} + diff --git a/forester/archive/RIO/others/hmmer/squid/ssi.c b/forester/archive/RIO/others/hmmer/squid/ssi.c new file mode 100644 index 0000000..04bb4a5 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/ssi.c @@ -0,0 +1,1504 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. 
+ *****************************************************************/ + +#include +#include +#include +#include +#include +#include +#include "squid.h" +#include "ssi.h" + +static sqd_uint32 v20magic = 0xf3f3e9b1; /* SSI 1.0: "ssi1" + 0x80808080 */ +static sqd_uint32 v20swap = 0xb1e9f3f3; /* byteswapped */ + +static int read_i16(FILE *fp, sqd_uint16 *ret_result); +static int read_i32(FILE *fp, sqd_uint32 *ret_result); +static int read_i64(FILE *fp, sqd_uint64 *ret_result); +static int read_offset(FILE *fp, char mode, SSIOFFSET *ret_offset); +static int write_i16(FILE *fp, sqd_uint16 n); +static int write_i32(FILE *fp, sqd_uint32 n); +static int write_i64(FILE *fp, sqd_uint64 n); +static int write_offset(FILE *fp, SSIOFFSET *offset); +static int binary_search(SSIFILE *sfp, char *key, int klen, SSIOFFSET *base, + sqd_uint32 recsize, sqd_uint32 maxidx); +static int indexfile_position(SSIFILE *sfp, SSIOFFSET *base, sqd_uint32 len, + sqd_uint32 n); +static void clear_ssifile(SSIFILE *sfp); +static int write_index(FILE *fp, SSIINDEX *g); +static int write_index_chunk(SSIINDEX *g); +static sqd_uint64 current_chunk_size(SSIINDEX *g); +static int load_indexfile(SSIFILE *sfp); + +/* Function: SSIOpen() + * Date: SRE, Sun Dec 31 12:40:03 2000 [St. Louis] + * + * Purpose: Opens the SSI index file {filename} and returns + * a SSIFILE * stream thru {ret_sfp}. + * The caller must eventually close this stream using + * SSIClose(). More than one index file can be open + * at once. + * + * Args: filename - full path to a SSI index file + * + * Returns: Returns 0 on success, nonzero on failure. 
+ */ +int +SSIOpen(char *filename, SSIFILE **ret_sfp) +{ + SSIFILE *sfp = NULL; + int status; + if ((sfp = malloc(sizeof(SSIFILE))) == NULL) return SSI_ERR_MALLOC; + if ((sfp->fp = fopen(filename, "rb")) == NULL) return SSI_ERR_NOFILE; + status = load_indexfile(sfp); + *ret_sfp = sfp; + return status; +} +/* load_indexfile(): given a SSIFILE structure with an open and positioned + * stream (fp) -- but no other data loaded -- read the next SSIFILE + * in from disk. We use this routine without its SSIOpen() wrapper + * as part of the external mergesort when creating large indices. + */ +static int +load_indexfile(SSIFILE *sfp) +{ + sqd_uint32 magic; + sqd_uint16 i; /* counter over files */ + int status; /* overall return status if an error is thrown */ + + status = SSI_ERR_BADFORMAT; /* default: almost every kind of error is a bad format error */ + + sfp->filename = NULL; + sfp->fileformat = NULL; + sfp->fileflags = NULL; + sfp->bpl = NULL; + sfp->rpl = NULL; + sfp->nfiles = 0; + if (! read_i32(sfp->fp, &magic)) {status = SSI_ERR_BADMAGIC; goto FAILURE; } + if (magic != v20magic && magic != v20swap) {status = SSI_ERR_BADMAGIC; goto FAILURE; } + if (! read_i32(sfp->fp, &(sfp->flags))) goto FAILURE; + + /* If we have 64-bit offsets, make sure we can deal with them. + */ +#ifndef HAS_64BIT_FILE_OFFSETS + if ((sfp->flags & SSI_USE64_INDEX) || + (sfp->flags & SSI_USE64)) + { status = SSI_ERR_NO64BIT; goto FAILURE; } +#endif + + sfp->imode = (sfp->flags & SSI_USE64_INDEX) ? SSI_OFFSET_I64 : SSI_OFFSET_I32; + sfp->smode = (sfp->flags & SSI_USE64) ? SSI_OFFSET_I64 : SSI_OFFSET_I32; + + if (! read_i16(sfp->fp, &(sfp->nfiles))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->nprimary))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->nsecondary))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->flen))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->plen))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->slen))) goto FAILURE; + if (! 
read_i32(sfp->fp, &(sfp->frecsize))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->precsize))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->srecsize))) goto FAILURE; + + if (! read_offset(sfp->fp, sfp->imode, &(sfp->foffset))) goto FAILURE; + if (! read_offset(sfp->fp, sfp->imode, &(sfp->poffset))) goto FAILURE; + if (! read_offset(sfp->fp, sfp->imode, &(sfp->soffset))) goto FAILURE; + + /* Read the file information and keep it. + * We expect the number of files to be small, so reading it + * once should be advantageous overall. If SSI ever had to + * deal with large numbers of files, you'd probably want to + * read file information on demand. + */ + if (sfp->nfiles == 0) goto FAILURE; + if ((sfp->filename=malloc(sizeof(char *) *sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } + for (i = 0; i < sfp->nfiles; i++) sfp->filename[i] = NULL; + if ((sfp->fileformat=malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } + if ((sfp->fileflags =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } + if ((sfp->bpl =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } + if ((sfp->rpl =malloc(sizeof(sqd_uint32)*sfp->nfiles)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } + + for (i = 0; i < sfp->nfiles; i++) + { + /* We have to explicitly position, because header and file + * records may expand in the future; frecsize and foffset + * give us forwards compatibility. + */ + if (indexfile_position(sfp, &(sfp->foffset), sfp->frecsize, i) !=0) goto FAILURE; + if ((sfp->filename[i] =malloc(sizeof(char)*sfp->flen)) == NULL) {status = SSI_ERR_MALLOC; goto FAILURE; } + if (fread(sfp->filename[i],sizeof(char),sfp->flen, sfp->fp)!=sfp->flen) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->fileformat[i]))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->fileflags[i]))) goto FAILURE; + if (! read_i32(sfp->fp, &(sfp->bpl[i]))) goto FAILURE; + if (! 
read_i32(sfp->fp, &(sfp->rpl[i]))) goto FAILURE; + } + + /* Success. Return 0. + */ + return 0; + + FAILURE: + /* Failure: free the damaged structure, return status code. + */ + SSIClose(sfp); + return status; +} + + + +/* Function: SSIGetOffsetByName() + * Date: SRE, Sun Dec 31 13:55:31 2000 [St. Louis] + * + * Purpose: Looks up the string {key} in the open index {sfp}. + * {key} can be either a primary or secondary key. If {key} + * is found, {*ret_fh} contains a unique handle on + * the file that contains {key} (suitable for an SSIFileInfo() + * call, or for comparison to the handle of the last file + * that was opened for retrieval), and {offset} is filled + * in with the offset in that file. + * + * Args: sfp - open index file + * key - string to search for + * ret_fh - RETURN: handle on file that key is in + * ret_offset - RETURN: offset of the start of that key's record + * + * Returns: 0 on success. + * non-zero on error. + */ +int +SSIGetOffsetByName(SSIFILE *sfp, char *key, int *ret_fh, + SSIOFFSET *ret_offset) +{ + int status; + sqd_uint16 fnum; + + /* Look in the primary keys. + */ + status = binary_search(sfp, key, sfp->plen, &(sfp->poffset), sfp->precsize, + sfp->nprimary); + if (status == 0) { + /* We found it as a primary key; get our data & return. + */ + if (! read_i16(sfp->fp, &fnum)) return SSI_ERR_NODATA; + *ret_fh = (int) fnum; + if (! read_offset(sfp->fp, sfp->smode, ret_offset)) return SSI_ERR_NODATA; + + return 0; /* success! (we don't need the other key data) */ + } else if (status == SSI_ERR_NO_SUCH_KEY) { + /* Not in the primary keys? OK, try the secondary keys. 
+ */ + if (sfp->nsecondary > 0) { + char *pkey; + status = binary_search(sfp, key, sfp->slen, &(sfp->soffset), sfp->srecsize, + sfp->nsecondary); + if (status != 0) return status; + if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC; + if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA; + + status = SSIGetOffsetByName(sfp, pkey, ret_fh, ret_offset); + free(pkey); + } + return status; + + } else return status; + /*NOTREACHED*/ +} + +/* Function: SSIGetOffsetByNumber() + * Date: SRE, Mon Jan 1 19:42:42 2001 [St. Louis] + * + * Purpose: Looks up primary key #{n} in the open index {sfp}. + * {n} ranges from 0..nprimary-1. When key #{n} + * is found, {*ret_fh} contains a unique + * handle on the file that contains {key} (suitable + * for an SSIFileInfo() call, or for comparison to + * the handle of the last file that was opened for retrieval), + * and {offset} is filled in with the offset in that file. + * + * Args: sfp - open index file + * n - primary key number to retrieve. + * ret_fh - RETURN: handle on file that key is in + * ret_offset - RETURN: offset of the start of that key's record + * + * Returns: 0 on success. + * non-zero on error. + */ +int +SSIGetOffsetByNumber(SSIFILE *sfp, int n, int *ret_fh, SSIOFFSET *ret_offset) +{ + sqd_uint16 fnum; + char *pkey; + + if (n >= sfp->nprimary) return SSI_ERR_NO_SUCH_KEY; + if (indexfile_position(sfp, &(sfp->poffset), sfp->precsize, n) != 0) + return SSI_ERR_SEEK_FAILED; + + if ((pkey = malloc(sizeof(char) * sfp->plen)) == NULL) return SSI_ERR_MALLOC; + if (fread(pkey, sizeof(char), sfp->plen, sfp->fp) != sfp->plen) return SSI_ERR_NODATA; + if (! read_i16(sfp->fp, &fnum)) return SSI_ERR_NODATA; + if (! read_offset(sfp->fp, sfp->smode, ret_offset)) return SSI_ERR_NODATA; + *ret_fh = fnum; + free(pkey); + return 0; +} + +/* Function: SSIGetSubseqOffset() + * Date: SRE, Mon Jan 1 19:49:31 2001 [St. Louis] + * + * Purpose: Implements SSI_FAST_SUBSEQ. 
+ * + * Looks up a primary or secondary {key} in the open + * index {sfp}. Asks for the nearest offset to a + * subsequence starting at position {requested_start} + * in the sequence (numbering the sequence 1..L). + * If {key} is found, on return, {ret_fh} + * contains a unique handle on the file that contains + * {key} (suitable for an SSIFileInfo() call, or for + * comparison to the handle of the last file that was + * opened for retrieval); {record_offset} contains the + * disk offset to the start of the record; {data_offset} + * contains the disk offset either exactly at the requested + * residue, or at the start of the line containing the + * requested residue; {ret_actual_start} contains the + * coordinate (1..L) of the first valid residue at or + * after {data_offset}. {ret_actual_start} is <= + * {requested_start}. + * + * Args: sfp - open index file + * key - primary or secondary key to find + * requested_start - residue we'd like to start at (1..L) + * ret_fh - RETURN: handle for file the key is in + * record_offset - RETURN: offset of entire record + * data_offset - RETURN: offset of subseq (see above) + * ret_actual_start- RETURN: coord (1..L) of residue at data_offset + * + * Returns: 0 on success, non-zero on failure. + */ +int +SSIGetSubseqOffset(SSIFILE *sfp, char *key, int requested_start, + int *ret_fh, SSIOFFSET *record_offset, + SSIOFFSET *data_offset, int *ret_actual_start) +{ + int status; + sqd_uint32 len; + int r, b, i, l; /* tmp variables for "clarity", to match docs */ + + /* Look up the key. Rely on the fact that SSIGetOffsetByName() + * leaves the index file positioned at the rest of the data for this key. + */ + status = SSIGetOffsetByName(sfp, key, ret_fh, record_offset); + if (status != 0) return status; + + /* Check that we're allowed to do subseq lookup on that file. + */ + if (! (sfp->fileflags[*ret_fh] & SSI_FAST_SUBSEQ)) + return SSI_ERR_NO_SUBSEQS; + + /* Read the data we need for subseq lookup + */ + if (! 
read_offset(sfp->fp, sfp->smode, data_offset)) return SSI_ERR_NODATA; + if (! read_i32(sfp->fp, &len)) return SSI_ERR_NODATA; + + /* Set up tmp variables for clarity of equations below, + * and to make them match documentation (ssi-format.tex). + */ + r = sfp->rpl[*ret_fh]; /* residues per line */ + b = sfp->bpl[*ret_fh]; /* bytes per line */ + i = requested_start; /* start position 1..L */ + l = (i-1)/r; /* data line # (0..) that the residue is on */ + if (r == 0 || b == 0) return SSI_ERR_NO_SUBSEQS; + if (i < 0 || i > len) return SSI_ERR_RANGE; + + /* When b = r+1, there's nothing but sequence on each data line (and the \0), + * and we can find each residue precisely. + */ + if (b == r+1) { + if (sfp->smode == SSI_OFFSET_I32) { + data_offset->mode = SSI_OFFSET_I32; + data_offset->off.i32 = data_offset->off.i32 + l*b + (i-1)%r; + } else if (sfp->smode == SSI_OFFSET_I64) { + data_offset->mode = SSI_OFFSET_I64; + data_offset->off.i64 = data_offset->off.i64 + l*b + (i-1)%r; + } + *ret_actual_start = requested_start; + } else { + /* else, there's other stuff on seq lines, so the best + * we can do easily is to position at start of relevant line. + */ + if (sfp->smode == SSI_OFFSET_I32) { + data_offset->mode = SSI_OFFSET_I32; + data_offset->off.i32 = data_offset->off.i32 + l*b; + } else if (sfp->smode == SSI_OFFSET_I64) { + data_offset->mode = SSI_OFFSET_I64; + data_offset->off.i64 = data_offset->off.i64 + l*b; + } + /* yes, the eq below is = 1 + (i-1)/r*r but it's not = i. that's an integer /. */ + *ret_actual_start = 1 + l*r; + } + return 0; +} + +/* Function: SSISetFilePosition() + * Date: SRE, Tue Jan 2 09:13:46 2001 [St. Louis] + * + * Purpose: Uses {offset} to sets the file position for {fp}, usually an + * open sequence file, relative to the start of the file. + * Hides the details of system-dependent shenanigans necessary for + * file positioning in large (>2 GB) files. 
+ * + * Behaves just like fseek(fp, offset, SEEK_SET) for 32 bit + * offsets and <2 GB files. + * + * Warning: if all else fails, in desperation, it will try to + * use fsetpos(). This requires making assumptions about fpos_t + * that may be unwarranted... assumptions that ANSI C prohibits + * me from making... though I believe the ./configure + * script robustly tests whether I can play with fpos_t like this. + * + * Args: fp - file to position. + * offset - SSI offset relative to file start. + * + * Returns: 0 on success, nonzero on error. + */ +int +SSISetFilePosition(FILE *fp, SSIOFFSET *offset) +{ + if (offset->mode == SSI_OFFSET_I32) { + if (fseek(fp, offset->off.i32, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; + } +#ifndef HAS_64BIT_FILE_OFFSETS + else return SSI_ERR_NO64BIT; +#elif defined HAVE_FSEEKO && SIZEOF_OFF_T == 8 + else if (fseeko(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; +#elif defined HAVE_FSEEKO64 && SIZEOF_OFF64_T == 8 + else if (fseeko64(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; +#elif defined HAVE_FSEEK64 + else if (fseek64(fp, offset->off.i64, SEEK_SET) != 0) return SSI_ERR_SEEK_FAILED; +#elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8 + else if (fsetpos(fp, &(offset->off.i64)) != 0) return SSI_ERR_SEEK_FAILED; +#endif + return 0; +} + + +/* Function: SSIFileInfo() + * Date: SRE, Tue Jan 2 10:31:01 2001 [St. Louis] + * + * Purpose: Given a file number {fh} in an open index file + * {sfp}, retrieve file name {ret_filename} and + * the file format {ret_format}. + * + * {ret_filename} is a pointer to a string maintained + * internally by {sfp}. It should not be free'd; + * SSIClose(sfp) takes care of it. + * + * Args: sfp - open index file + * fh - handle on file to look up + * ret_filename - RETURN: name of file n + * ret_format - RETURN: format of file n + * + * Returns: 0 on success, nonzero on failure. 
+ */ +int +SSIFileInfo(SSIFILE *sfp, int fh, char **ret_filename, int *ret_format) +{ + if (fh < 0 || fh >= sfp->nfiles) return SSI_ERR_BADARG; + *ret_filename = sfp->filename[fh]; + *ret_format = sfp->fileformat[fh]; + return 0; +} + +/* Function: SSIClose() + * Date: SRE, Sun Dec 31 14:56:37 2000 [St. Louis] + * + * Purpose: Close an open {SSIFILE *}. + * + * Args: sfp - index file to close. + * + * Returns: (void) + */ +void +SSIClose(SSIFILE *sfp) +{ + if (sfp != NULL) { + clear_ssifile(sfp); + if (sfp->fp != NULL) fclose(sfp->fp); + free(sfp); + } +} +/* clear_ssifile(): free the innards of SSIFILE, without + * destroying the structure or closing the stream. + */ +static void +clear_ssifile(SSIFILE *sfp) +{ + int i; + + if (sfp->filename != NULL) { + for (i = 0; i < sfp->nfiles; i++) + if (sfp->filename[i] != NULL) free(sfp->filename[i]); + free(sfp->filename); + } + if (sfp->fileformat != NULL) free(sfp->fileformat); + if (sfp->fileflags != NULL) free(sfp->fileflags); + if (sfp->bpl != NULL) free(sfp->bpl); + if (sfp->rpl != NULL) free(sfp->rpl); +} + + +/* Function: SSIRecommendMode() + * Date: SRE, Fri Feb 16 08:23:47 2001 [St. Louis] + * + * Purpose: Examines the file and determines whether it should be + * indexed with large file support or not; returns + * SSI_OFFSET_I32 for most files, SSI_OFFSET_I64 for large + * files, or -1 on failure. 
+ * + * Args: file - name of file to check for size + * + * Returns: -1 on failure (including case where file is too big) + * SSI_OFFSET_I32 for most files (<= 2^31-1 bytes) + * SSI_OFFSET_I64 for large files (> 2^31-1 bytes) + */ +int +SSIRecommendMode(char *file) +{ +#if HAVE_STAT64 + struct stat64 s1; + if (stat64(file, &s1) == 0) { + if (s1.st_size <= 2146483647L) return SSI_OFFSET_I32; + else return SSI_OFFSET_I64; + } +#else + struct stat s2; + if (stat(file, &s2) == 0) { + if (s2.st_size <= 2146483647L) return SSI_OFFSET_I32; + else return SSI_OFFSET_I64; + } +#endif + return -1; +} + + +/* Function: SSICreateIndex() + * Date: SRE, Tue Jan 2 11:23:25 2001 [St. Louis] + * + * Purpose: Creates and initializes a SSI index structure. + * Sequence file offset type is specified by {mode}. + * + * Args: mode - SSI_OFFSET_I32 or SSI_OFFSET_I64, sequence file index mode. + * + * Returns: ptr to new index structure, or NULL on failure. + * Caller is responsible for free'ing the returned + * structure with SSIFreeIndex(). + */ +SSIINDEX * +SSICreateIndex(int mode) +{ + SSIINDEX *g; + + g = NULL; + if ((g = malloc(sizeof(SSIINDEX))) == NULL) goto FAILURE; + g->smode = mode; + g->imode = SSI_OFFSET_I32; /* index always starts as 32-bit; may get upgraded later */ + +#ifndef HAS_64BIT_FILE_OFFSETS + if (mode == SSI_OFFSET_I64) + Die("\ +Can't create a 64-bit SSI index on this system, sorry;\n\ +I don't have 64-bit file offset functions available.\n"); +#endif + + g->filenames = NULL; + g->fileformat = NULL; + g->bpl = NULL; + g->rpl = NULL; + g->flen = 0; + g->nfiles = 0; + + g->pkeys = NULL; + g->plen = 0; + g->nprimary = 0; + g->tot_primary = 0; + + g->skeys = NULL; + g->slen = 0; + g->nsecondary = 0; + g->tot_secondary = 0; + + g->tmpbase = NULL; + g->t1 = NULL; + g->chunkoffset = NULL; + g->nchunks = 0; + + /* temporarily disabled: sort-on-disk needs more thought! 
*/ + /* g->max_chunk_size= maxchunk; */ + g->max_chunk_size = 999999; + + /* All mallocs must go after NULL initializations, because of the cleanup strategy; + * we'll try to free anything non-NULL if a malloc fails. + */ + /* This is temporarily disabled. Sort-on-disk needs more thought! + if ((g->tmpbase = sre_strdup(tmpfile, -1)) == NULL) goto FAILURE; + */ + + if ((g->filenames = malloc(sizeof(char *) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; + if ((g->fileformat= malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; + if ((g->bpl = malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; + if ((g->rpl = malloc(sizeof(sqd_uint32) * SSI_FILE_BLOCK)) == NULL) goto FAILURE; + + if ((g->pkeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE; + if ((g->skeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE; + + return g; + + FAILURE: + SSIFreeIndex(g); /* free the damaged structure */ + return NULL; +} + +/* Function: SSIGetFilePosition() + * Date: SRE, Tue Jan 2 09:59:26 2001 [St. Louis] + * + * Purpose: Fills {ret_offset} with the current disk + * offset of {fp}, relative to the start of the file. + * {mode} is set to either SSI_OFFSET_I32 or + * SSI_OFFSET_I64. If {mode} is _I32 (32 bit), just wraps + * a call to ftell(); otherwise deals with system-dependent + * details of 64-bit file offsets. + * + * Args: fp - open stream + * mode - SSI_OFFSET_I32 or SSI_OFFSET_I64 + * ret_offset - RETURN: file position + * + * Returns: 0 on success. nonzero on error. 
+ */ +int +SSIGetFilePosition(FILE *fp, int mode, SSIOFFSET *ret_offset) +{ + if (mode == SSI_OFFSET_I32) + { + ret_offset->mode = SSI_OFFSET_I32; + ret_offset->off.i32 = ftell(fp); + if (ret_offset->off.i32 == -1) return SSI_ERR_TELL_FAILED; + } + else if (mode != SSI_OFFSET_I64) abort(); /* only happens on a coding error */ + else { + ret_offset->mode = SSI_OFFSET_I64; +#ifndef HAS_64BIT_FILE_OFFSETS + return SSI_ERR_NO64BIT; +#elif defined HAVE_FTELLO && SIZEOF_OFF_T == 8 + if ((ret_offset->off.i64 = ftello(fp)) == -1) return SSI_ERR_TELL_FAILED; +#elif defined HAVE_FTELLO64 && SIZEOF_OFF64_T == 8 + if ((ret_offset->off.i64 = ftello64(fp)) == -1) return SSI_ERR_TELL_FAILED; +#elif defined HAVE_FTELL64 + if ((ret_offset->off.i64 = ftell64(fp)) == -1) return SSI_ERR_TELL_FAILED; +#elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8 + if (fgetpos(fp, &(ret_offset->off.i64)) != 0) return SSI_ERR_TELL_FAILED; +#endif + } + return 0; +} + +/* Function: SSIAddFileToIndex() + * Date: SRE, Tue Jan 2 12:54:36 2001 [St. Louis] + * + * Purpose: Adds the sequence file {filename}, which is known to + * be in format {fmt}, to the index {g}. Creates and returns + * a unique filehandle {fh} for then associating primary keys + * with this file using SSIAddPrimaryKeyToIndex(). + * + * Args: g - active index + * filename - file to add + * fmt - format code for this file (e.g. SQFILE_FASTA) + * ret_fh - RETURN: unique handle for this file + * + * Returns: 0 on success; nonzero on error. 
+ */ +int +SSIAddFileToIndex(SSIINDEX *g, char *filename, int fmt, int *ret_fh) +{ + int n; + + if (g->nfiles >= SSI_MAXFILES) return SSI_ERR_TOOMANY_FILES; + + n = strlen(filename); + if ((n+1) > g->flen) g->flen = n+1; + + if ((g->filenames[g->nfiles] = sre_strdup(filename, n)) == NULL) return SSI_ERR_MALLOC; + g->fileformat[g->nfiles] = fmt; + g->bpl[g->nfiles] = 0; + g->rpl[g->nfiles] = 0; + *ret_fh = g->nfiles; /* handle is simply = file number */ + g->nfiles++; + + if (g->nfiles % SSI_FILE_BLOCK == 0) { + g->filenames = realloc(g->filenames, sizeof(char *) * (g->nfiles+SSI_FILE_BLOCK)); + if (g->filenames == NULL) return SSI_ERR_MALLOC; + g->fileformat= realloc(g->fileformat,sizeof(int) * (g->nfiles+SSI_FILE_BLOCK)); + if (g->fileformat == NULL) return SSI_ERR_MALLOC; + g->bpl = realloc(g->fileformat,sizeof(int) * (g->nfiles+SSI_FILE_BLOCK)); + if (g->bpl == NULL) return SSI_ERR_MALLOC; + g->rpl = realloc(g->fileformat,sizeof(int) * (g->nfiles+SSI_FILE_BLOCK)); + if (g->rpl == NULL) return SSI_ERR_MALLOC; + } + return 0; +} + + +/* Function: SSISetFileForSubseq() + * Date: SRE, Tue Jan 9 10:02:05 2001 [St. Louis] + * + * Purpose: Set SSI_FAST_SUBSEQ for the file indicated by + * filehandle {fh} in the index {g}, setting + * parameters {bpl} and {rpl} to the values given. + * {bpl} is the number of bytes per sequence data line. + * {rpl} is the number of residues per sequence data line. + * Caller must be sure that {bpl} and {rpl} do not change + * on any line of any sequence record in the file + * (except for the last data line of each record). If + * this is not the case in this file, SSI_FAST_SUBSEQ + * will not work, and this routine should not be + * called. + * + * Args: g - the active index + * fh - handle for file to set SSI_FAST_SUBSEQ on + * bpl - bytes per data line + * rpl - residues per data line + * + * Returns: 0 on success; 1 on error. 
+ */ +int +SSISetFileForSubseq(SSIINDEX *g, int fh, int bpl, int rpl) +{ + if (fh < 0 || fh >= g->nfiles) return SSI_ERR_BADARG; + if (bpl <= 0 || rpl <= 0) return SSI_ERR_BADARG; + g->bpl[fh] = bpl; + g->rpl[fh] = rpl; + return 0; +} + + +/* Function: SSIAddPrimaryKeyToIndex() + * Date: SRE, Tue Jan 2 11:50:54 2001 [St. Louis] + * + * Purpose: Put primary key {key} in the index {g}, while telling + * the index this primary key is in the file associated + * with filehandle {fh} (returned by a previous call + * to SSIAddFileToIndex()), and its record starts at + * position {r_off} in the file. + * + * {d_off} and {L} are optional; they may be left unset + * by passing NULL and 0, respectively. (If one is + * provided, both must be provided.) If they are provided, + * {d_off} gives the position of the first line of sequence + * data in the record, and {L} gives the length of + * the sequence in residues. They are used when + * SSI_FAST_SUBSEQ is set for this file. If SSI_FAST_SUBSEQ + * is not set for the file, {d_off} and {L} will be + * ignored by the index reading API even if they are stored + * by the index writing API, so it doesn't hurt for the + * indexing program to provide them; typically they + * won't know whether it's safe to set SSI_FAST_SUBSEQ + * for the whole file until the whole file has been + * read and every key has already been added to the index. + * + * Args: g - active index + * key - primary key to add + * fh - handle on file that this key's in + * r_off - offset to start of record + * d_off - offset to start of sequence data + * L - length of sequence, or 0 + * + * Returns: 0 on success, nonzero on error. + */ +int +SSIAddPrimaryKeyToIndex(SSIINDEX *g, char *key, int fh, + SSIOFFSET *r_off, SSIOFFSET *d_off, int L) +{ + int n; /* a string length */ + + if (fh >= SSI_MAXFILES) return SSI_ERR_TOOMANY_FILES; + if (g->nprimary >= SSI_MAXKEYS) return SSI_ERR_TOOMANY_KEYS; + if (L > 0 && d_off == NULL) abort(); /* need both. 
*/ + + /* Before adding the key: check how big our chunk of + * index is. If it's getting too large, flush a chunk to disk tmpfile. + */ + if (current_chunk_size(g) >= g->max_chunk_size) write_index_chunk(g); + + n = strlen(key); + if ((n+1) > g->plen) g->plen = n+1; + + if ((g->pkeys[g->nprimary].key = sre_strdup(key, n)) == NULL) return SSI_ERR_MALLOC; + g->pkeys[g->nprimary].fnum = (sqd_uint16) fh; + g->pkeys[g->nprimary].r_off = *r_off; + if (d_off != NULL && L > 0) { + g->pkeys[g->nprimary].d_off = *d_off; + g->pkeys[g->nprimary].len = L; + } else { + /* yeah, this looks stupid, but look: we have to give a valid + looking, non-NULL d_off of some sort, or writes will fail. + It's going to be unused anyway. */ + g->pkeys[g->nprimary].d_off = *r_off; + g->pkeys[g->nprimary].len = 0; + } + g->pkeys[g->nprimary].handle = g->nprimary; + g->nprimary++; + + if (g->nprimary % SSI_KEY_BLOCK == 0) { + g->pkeys = realloc(g->pkeys, sizeof(struct ssipkey_s) * (g->nprimary+SSI_KEY_BLOCK)); + if (g->pkeys == NULL) return SSI_ERR_MALLOC; + } + + return 0; +} + + +/* Function: SSIAddSecondaryKeyToIndex() + * Date: SRE, Tue Jan 2 12:44:40 2001 [St. Louis] + * + * Purpose: Puts secondary key {key} in the index {g}, associating + * it with primary key {pkey} that was previously + * registered by SSIAddPrimaryKeyToIndex(). + * + * Args: g - active index + * key - secondary key to add + * pkey - primary key to associate this key with + * + * Returns: 0 on success, 1 on failure. 
+ */ +int +SSIAddSecondaryKeyToIndex(SSIINDEX *g, char *key, char *pkey) +{ + int n; /* a string length */ + + if (g->nsecondary >= SSI_MAXKEYS) return SSI_ERR_TOOMANY_KEYS; + + n = strlen(key); + if ((n+1) > g->slen) g->slen = n+1; + + if ((g->skeys[g->nsecondary].key = sre_strdup(key, n)) == NULL) return SSI_ERR_MALLOC; + if ((g->skeys[g->nsecondary].pkey = sre_strdup(pkey, -1)) == NULL) return SSI_ERR_MALLOC; + g->nsecondary++; + + if (g->nsecondary % SSI_KEY_BLOCK == 0) { + g->skeys = realloc(g->skeys, sizeof(struct ssiskey_s) * (g->nsecondary+SSI_KEY_BLOCK)); + if (g->skeys == NULL) return SSI_ERR_MALLOC; + } + return 0; +} + + + + +/* Function: SSIWriteIndex() + * Date: SRE, Tue Jan 2 13:55:56 2001 [St. Louis] + * + * Purpose: Writes complete index {g} in SSI format to a + * binary file {file}. Does all + * the overhead of sorting the primary and secondary keys, + * and maintaining the association of secondary keys + * with primary keys during and after the sort. + * + * Args: file - file to write to + * g - index to sort & write out. + * + * Returns: 0 on success, nonzero on error. + */ +/* needed for qsort() */ +static int +pkeysort(const void *k1, const void *k2) +{ + struct ssipkey_s *key1; + struct ssipkey_s *key2; + key1 = (struct ssipkey_s *) k1; + key2 = (struct ssipkey_s *) k2; + return strcmp(key1->key, key2->key); +} +static int +skeysort(const void *k1, const void *k2) +{ + struct ssiskey_s *key1; + struct ssiskey_s *key2; + key1 = (struct ssiskey_s *) k1; + key2 = (struct ssiskey_s *) k2; + return strcmp(key1->key, key2->key); +} +int +SSIWriteIndex(char *file, SSIINDEX *g) +{ + FILE *fp; + int status; + + /* Case 1. Simple: the whole index fit in memory; write it to disk, + * we're done. + */ + if (g->t1 == NULL) { + if ((fp = fopen(file,"wb")) == NULL) return SSI_ERR_NOFILE; + status = write_index(fp, g); + fclose(fp); + g->tot_primary = g->nprimary; + g->tot_secondary = g->nsecondary; + return status; + } + + /* Case 2. 
Ugly: the index is big (and possibly *really* big, necessitating + * 64-bit offsets in the index itself!); we had to write the index to a tmp + * file on disk. Flush the last chunk to disk; then mergesort the chunks + * until we have one chunk to rule them all, one chunk to bind them. + */ + write_index_chunk(g); /* flush the last chunk. */ + fclose(g->t1); + + Die("oi, you haven't IMPLEMENTED the mergesort yet, dumbass."); + return 0; +} +static int +write_index(FILE *fp, SSIINDEX *g) +{ + int i; + sqd_uint32 header_flags, file_flags; + sqd_uint32 frecsize, precsize, srecsize; + sqd_uint64 foffset, poffset, soffset; + char *s, *s2; + + /* Magic-looking numbers come from adding up sizes + * of things in bytes + */ + frecsize = 16 + g->flen; + precsize = (g->smode == SSI_OFFSET_I64) ? 22+g->plen : 14+g->plen; + srecsize = g->slen + g->plen; + + header_flags = 0; + if (g->smode == SSI_OFFSET_I64) header_flags |= SSI_USE64; + if (g->imode == SSI_OFFSET_I64) header_flags |= SSI_USE64_INDEX; + + /* Magic-looking numbers again come from adding up sizes + * of things in bytes + */ + foffset = (header_flags & SSI_USE64_INDEX) ? 66 : 54; + poffset = foffset + frecsize*g->nfiles; + soffset = poffset + precsize*g->nprimary; + + /* Sort the keys + */ + qsort((void *) g->pkeys, g->nprimary, sizeof(struct ssipkey_s), pkeysort); + qsort((void *) g->skeys, g->nsecondary, sizeof(struct ssiskey_s), skeysort); + + /* Write the header + */ + if (! write_i32(fp, v20magic)) return SSI_ERR_FWRITE; + if (! write_i32(fp, header_flags)) return SSI_ERR_FWRITE; + if (! write_i16(fp, g->nfiles)) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->nprimary)) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->nsecondary)) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->flen)) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->plen)) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->slen)) return SSI_ERR_FWRITE; + if (! write_i32(fp, frecsize)) return SSI_ERR_FWRITE; + if (! 
write_i32(fp, precsize)) return SSI_ERR_FWRITE; + if (! write_i32(fp, srecsize)) return SSI_ERR_FWRITE; + if (g->imode == SSI_OFFSET_I32) { + if (! write_i32(fp, foffset)) return SSI_ERR_FWRITE; + if (! write_i32(fp, poffset)) return SSI_ERR_FWRITE; + if (! write_i32(fp, soffset)) return SSI_ERR_FWRITE; + } else { + if (! write_i64(fp, foffset)) return SSI_ERR_FWRITE; + if (! write_i64(fp, poffset)) return SSI_ERR_FWRITE; + if (! write_i64(fp, soffset)) return SSI_ERR_FWRITE; + } + + /* The file section + */ + if ((s = malloc(sizeof(char) * g->flen)) == NULL) return SSI_ERR_MALLOC; + for (i = 0; i < g->nfiles; i++) + { + file_flags = 0; + if (g->bpl[i] > 0 && g->rpl[i] > 0) file_flags |= SSI_FAST_SUBSEQ; + + strcpy(s, g->filenames[i]); + if (fwrite(s, sizeof(char), g->flen, fp) != g->flen) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->fileformat[i])) return SSI_ERR_FWRITE; + if (! write_i32(fp, file_flags)) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->bpl[i])) return SSI_ERR_FWRITE; + if (! write_i32(fp, g->rpl[i])) return SSI_ERR_FWRITE; + } + free(s); + + /* The primary key section + */ + if ((s = malloc(sizeof(char) * g->plen)) == NULL) return SSI_ERR_MALLOC; + for (i = 0; i < g->nprimary; i++) + { + strcpy(s, g->pkeys[i].key); + if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE; + if (! write_i16( fp, g->pkeys[i].fnum)) return SSI_ERR_FWRITE; + if (! write_offset(fp, &(g->pkeys[i].r_off))) return SSI_ERR_FWRITE; + if (! write_offset(fp, &(g->pkeys[i].d_off))) return SSI_ERR_FWRITE; + if (! 
write_i32( fp, g->pkeys[i].len)) return SSI_ERR_FWRITE; + } + + /* The secondary key section + */ + if (g->nsecondary > 0) { + if ((s2 = malloc(sizeof(char) * g->slen)) == NULL) return SSI_ERR_MALLOC; + for (i = 0; i < g->nsecondary; i++) + { + strcpy(s2, g->skeys[i].key); + strcpy(s, g->skeys[i].pkey); + if (fwrite(s2, sizeof(char), g->slen, fp) != g->slen) return SSI_ERR_FWRITE; + if (fwrite(s, sizeof(char), g->plen, fp) != g->plen) return SSI_ERR_FWRITE; + } + free(s2); + } + + free(s); + return 0; +} +static int +write_index_chunk(SSIINDEX *g) +{ + int status; + int i; + + SQD_DPRINTF1(("Writing index chunk %d to disk... \n", g->nchunks)); + + /* Save the offset for each chunk in an array; remember how many + * chunks we put into the tmp file t1. + */ + if (g->t1 == NULL) { + char *t1file = NULL; + if ((t1file = sre_strdup(g->tmpbase, -1)) == NULL) goto FAILURE; + if (sre_strcat(&t1file, -1, ".t1", 3) < 0) goto FAILURE; + if ((g->t1 = fopen(t1file, "wb")) == NULL) return SSI_ERR_NOFILE; + free(t1file); + + if ((g->chunkoffset = malloc(sizeof(fpos_t))) == NULL) goto FAILURE; + } else { + if ((g->chunkoffset = realloc(g->chunkoffset, sizeof(fpos_t) * (g->nchunks+1))) == NULL) goto FAILURE; + } + if (fgetpos(g->t1, &(g->chunkoffset[g->nchunks])) != 0) + Die("Index file size has apparently exceeded system limitations, sorry."); + g->nchunks++; + + /* Sort and append this chunk of the index to the open tmp file t1 + */ + if ((status = write_index(g->t1, g)) != 0) return status; + g->tot_primary += g->nprimary; + g->tot_secondary += g->nsecondary; + + /* Now, a partial free'ing of the index - clear the keys, but leave the files + */ + for (i = 0; i < g->nprimary; i++) free(g->pkeys[i].key); + for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].key); + for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].pkey); + free(g->pkeys); + free(g->skeys); + + /* Reset the primary and secondary keys sections, in preparation + * for accumulating more + */ + g->pkeys = NULL; + 
g->plen = 0; + g->nprimary = 0; + + g->skeys = NULL; + g->slen = 0; + g->nsecondary = 0; + + if ((g->pkeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE; + if ((g->skeys = malloc(sizeof(struct ssipkey_s)* SSI_KEY_BLOCK))== NULL) goto FAILURE; + return 0; + + FAILURE: + SSIFreeIndex(g); + return SSI_ERR_MALLOC; +} + + + +/* Function: SSIFreeIndex() + * Date: SRE, Tue Jan 2 11:44:08 2001 [St. Louis] + * + * Purpose: Free an index structure {g}. + * + * Args: g - ptr to an open index. + * + * Returns: (void) + */ +void +SSIFreeIndex(SSIINDEX *g) +{ + int i; + if (g != NULL) + { + for (i = 0; i < g->nfiles; i++) free(g->filenames[i]); + for (i = 0; i < g->nprimary; i++) free(g->pkeys[i].key); + for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].key); + for (i = 0; i < g->nsecondary; i++) free(g->skeys[i].pkey); + if (g->filenames != NULL) free(g->filenames); + if (g->fileformat != NULL) free(g->fileformat); + if (g->bpl != NULL) free(g->bpl); + if (g->rpl != NULL) free(g->rpl); + if (g->pkeys != NULL) free(g->pkeys); + if (g->skeys != NULL) free(g->skeys); + if (g->tmpbase != NULL) free(g->tmpbase); + if (g->chunkoffset != NULL) free(g->chunkoffset); + if (g->t1 != NULL) fclose(g->t1); + free(g); + } +} + + +/* Function: SSIErrorString() + * Date: SRE, Tue Jan 2 10:38:10 2001 [St. Louis] + * + * Purpose: Returns a ptr to an internal string corresponding + * to error {n}, a code returned from any of the + * functions in the API that return non-zero on error. + * + * Args: n - error code + * + * Returns: ptr to an internal string. + */ +char * +SSIErrorString(int n) +{ + switch (n) { + case SSI_ERR_OK: return "ok (no error)"; + case SSI_ERR_NODATA: return "no data, fread() failed"; + case SSI_ERR_NO_SUCH_KEY: return "no such key"; + case SSI_ERR_MALLOC: return "out of memory, malloc() failed"; + case SSI_ERR_NOFILE: return "file not found, fopen() failed"; + case SSI_ERR_BADMAGIC: return "not a SSI file? 
(bad magic)"; + case SSI_ERR_BADFORMAT: return "corrupt format? unexpected data"; + case SSI_ERR_NO64BIT: return "no large file support for this system"; + case SSI_ERR_SEEK_FAILED: return "failed to reposition on disk"; + case SSI_ERR_TELL_FAILED: return "failed to get file position on disk"; + case SSI_ERR_NO_SUBSEQS: return "no fast subseq support for this seqfile"; + case SSI_ERR_RANGE: return "subseq start is out of range"; + case SSI_ERR_BADARG: return "an argument is out of range"; + default: return "unrecognized code"; + } + /*NOTREACHED*/ +} + +static int +read_i16(FILE *fp, sqd_uint16 *ret_result) +{ + sqd_uint16 result; + if (fread(&result, sizeof(sqd_uint16), 1, fp) != 1) return 0; + *ret_result = sre_ntoh16(result); + return 1; +} +static int +write_i16(FILE *fp, sqd_uint16 n) +{ + n = sre_hton16(n); + if (fwrite(&n, sizeof(sqd_uint16), 1, fp) != 1) return 0; + return 1; +} +static int +read_i32(FILE *fp, sqd_uint32 *ret_result) +{ + sqd_uint32 result; + if (fread(&result, sizeof(sqd_uint32), 1, fp) != 1) return 0; + *ret_result = sre_ntoh32(result); + return 1; +} +static int +write_i32(FILE *fp, sqd_uint32 n) +{ + n = sre_hton32(n); + if (fwrite(&n, sizeof(sqd_uint32), 1, fp) != 1) return 0; + return 1; +} +static int +read_i64(FILE *fp, sqd_uint64 *ret_result) +{ + sqd_uint64 result; + if (fread(&result, sizeof(sqd_uint64), 1, fp) != 1) return 0; + *ret_result = sre_ntoh64(result); + return 1; +} +static int +write_i64(FILE *fp, sqd_uint64 n) +{ + n = sre_hton64(n); + if (fwrite(&n, sizeof(sqd_uint64), 1, fp) != 1) return 0; + return 1; +} +static int +read_offset(FILE *fp, char mode, SSIOFFSET *ret_offset) +{ + if (mode == SSI_OFFSET_I32) { + ret_offset->mode = SSI_OFFSET_I32; + if (! read_i32(fp, &(ret_offset->off.i32))) return 0; + } else if (mode == SSI_OFFSET_I64) { + ret_offset->mode = SSI_OFFSET_I64; + if (! 
read_i64(fp, &(ret_offset->off.i64))) return 0; + } else return 0; + + return 1; +} +static int +write_offset(FILE *fp, SSIOFFSET *offset) +{ + if (offset->mode == SSI_OFFSET_I32) return write_i32(fp, offset->off.i32); + else if (offset->mode == SSI_OFFSET_I64) return write_i64(fp, offset->off.i64); + else abort(); + /*UNREACHED*/ + return 1; /* silence bitchy compilers */ +} + + +/* Function: binary_search() + * Date: SRE, Sun Dec 31 16:05:03 2000 [St. Louis] + * + * Purpose: Find a key in a SSI index, by a binary search + * in an alphabetically sorted list of keys. If successful, + * return 0, and the index file is positioned to read + * the rest of the data for that key. Else returns nonzero. + * + * Args: sfp - an open SSIFILE + * key - key to find + * klen - key length to allocate (plen or slen from sfp) + * base - base offset (poffset or soffset) + * recsize - size of each key record in bytes (precsize or srecsize) + * maxidx - # of keys (nprimary or nsecondary) + * + * Returns: 0 on success, and leaves file positioned for reading remaining + * data for the key. + * Nonzero on failure: + * SSI_ERR_NO_SUCH_KEY - that key's not in the index + * SSI_ERR_MALLOC - a memory allocation failure + * SSI_ERR_NODATA - an fread() failed + */ +static int +binary_search(SSIFILE *sfp, char *key, int klen, SSIOFFSET *base, + sqd_uint32 recsize, sqd_uint32 maxidx) +{ + char *name; + sqd_uint32 left, right, mid; + int cmp; + int status; + + if ((name = malloc (sizeof(char)*klen)) == NULL) return SSI_ERR_MALLOC; + left = 0; + right = maxidx; + while (1) { /* A binary search: */ + mid = (left+right) / 2; /* careful here. only works because + we limit unsigned vars to signed ranges. */ + if ((status = indexfile_position(sfp, base, recsize, mid)) != 0) + { free(name); return status; } + if (fread(name, sizeof(char), klen, sfp->fp) != klen) + { free(name); return SSI_ERR_NODATA; } + cmp = strcmp(name, key); + if (cmp == 0) break; /* found it! 
*/ + else if (left >= right) /* oops, missed it; fail */ + { free(name); return SSI_ERR_NO_SUCH_KEY; } + else if (cmp < 0) left = mid+1; /* it's right of mid */ + else if (cmp > 0) right = mid-1; /* it's left of mid */ + } + free(name); + return 0; /* and sfp->fp is positioned... */ +} + +/* Function: indexfile_position() + * Date: SRE, Mon Jan 1 19:32:49 2001 [St. Louis] + * + * Purpose: Position the open index file {sfp} at the start + * of record {n} in a list of records that starts at + * base offset {base}, where each record takes up {l} + * bytes. (e.g. the position is byte (base + n*l)). + * + * Args: sfp - open SSIFILE + * base - offset of record 0 (e.g. sfp->foffset) + * len - size of each record in bytes (e.g. sfp->frecsize) + * n - which record to get (e.g. 0..sfp->nfiles) + * + * Returns: 0 on success, non-zero on failure. + */ +static int +indexfile_position(SSIFILE *sfp, SSIOFFSET *base, sqd_uint32 len, sqd_uint32 n) +{ + SSIOFFSET pos; + int status; + + if (base->mode == SSI_OFFSET_I32) { + pos.mode = SSI_OFFSET_I32; + pos.off.i32 = base->off.i32 + n*len; + } else if (base->mode == SSI_OFFSET_I64) { + pos.mode = SSI_OFFSET_I64; + pos.off.i64 = base->off.i64 + n*len; + } else return 0; + if ((status = SSISetFilePosition(sfp->fp, &pos)) != 0) return status; + return 0; +} + +/* Function: current_chunk_size() + * Date: SRE, Tue Feb 20 18:23:30 2001 [St. Louis] + * + * Purpose: Calculates the size of the current indexfile chunk, + * in megabytes. + */ +static sqd_uint64 +current_chunk_size(SSIINDEX *g) +{ + sqd_uint64 frecsize, precsize, srecsize; + sqd_uint64 total; + + /* Magic-looking numbers come from adding up sizes + * of things in bytes + */ + frecsize = 16 + g->flen; + precsize = (g->smode == SSI_OFFSET_I64) ? 
22+g->plen : 14+g->plen; + srecsize = g->plen+g->slen; + total = (66L + /* header size, if 64bit index offsets */ + frecsize * g->nfiles + /* file section size */ + precsize * g->nprimary + /* primary key section size */ + srecsize * g->nsecondary) / /* secondary key section size */ + 1048576L; + return total; +} + + +#if 0 +static int +mergesort(SSIINDEX *g) +{ + char *infile; /* reading "tape" 1: source. */ + char *outfile; /* writing "tape" 2: destination. */ + SSIFILE *in1; /* on read, a chunk of the SSI file goes in an SSIFILE. */ + SSIFILE *in2; /* and chunk 2 goes in here. */ + FILE *outfp; /* where we're writing the merged data */ + int b; /* b, b+1 are current chunks we're merging from infile */ + char *k1, *k2; /* buffers full of keys to be merged from ch1, ch2 */ + sqd_uint32 base1, pos1, buflen1; /* buffered key input for ch1 */ + sqd_uint32 base2, pos2, buflen2; /* buffered key input for ch2 */ + sqd_uint32 maxbuf; + int status; + + /* Initializations. + */ + /* create the tmp file names */ + if ((infile = sre_strdup(g->tmpbase, -1)) == NULL) return SSI_ERR_MALLOC; + if (sre_strcat(&infile, -1, ".t1", 3) < 0) return SSI_ERR_MALLOC; + if ((outfile = sre_strdup(g->tmpbase, -1)) == NULL) return SSI_ERR_MALLOC; + if (sre_strcat(&outfile, -1, ".t2", 3) < 0) return SSI_ERR_MALLOC; + /* allocate the SSIFILEs for reading chunks */ + if ((in1 = malloc(sizeof(SSIFILE))) == NULL) return SSI_ERR_MALLOC; + if ((in2 = malloc(sizeof(SSIFILE))) == NULL) return SSI_ERR_MALLOC; + + /* Open infile for read; both chunks (in1 and in2) are read from this file, + * from different file offsets kept in g->chunkoffset[] + */ + if ((in1->fp = fopen(infile, "rb")) == NULL) return SSI_ERR_NOFILE; + in2->fp = in1->fp; + if ((outfp = fopen(outfile, "wb")) == NULL) return SSI_ERR_NOFILE; + + for (b = 0; b+1 < g->nchunks; b+=2) + { + if (fsetpos(in1->fp, &(g->chunkoffset[b])) > 0) return SSI_ERR_SEEK_FAILED; + if (fsetpos(in2->fp, &(g->chunkoffset[b+1])) > 0) return 
SSI_ERR_SEEK_FAILED; + + if (status = load_indexfile(in1) > 0) return status; + if (status = load_indexfile(in2) > 0) return status; + + merge_headers(g, in1, in2); + write_index_header(outfp, g); + + /* Merge the primary key section; + * do a buffered read of the pkeys from ch1 and ch2. + */ + maxbuf = 100000; + if ((k1 = malloc(sizeof(char) * (maxbuf*in1->precsize))) == NULL) return SSI_ERR_MALLOC; + if ((k2 = malloc(sizeof(char) * (maxbuf*in2->precsize))) == NULL) return SSI_ERR_MALLOC; + base1 = pos1 = buflen1 = 0; + base2 = pos2 = buflen2 = 0; + while (base1+pos1 < ch1->nprimary || base2+pos2 < ch2->nprimary) { + /* refill buffer for ch1? */ + if (pos1 == buflen1) { + base1 += buflen1; + pos1 = 0; + buflen1 = MIN(in1->nprimary - base1, maxbuf); + if (buflen1 > 0) { + if (fread(k1, sizeof(char), (buflen1*in1->precsize), in1->fp) + < buflen1*in1->precsize) + return SSI_ERR_NODATA; + } + } + /* refill buffer for ch2? */ + if (pos2 == buflen2) { + base2 += buflen2; + pos2 = 0; + buflen2 = MIN(in2->nprimary - base2, maxbuf); + if (buflen2 > 0) { + if (fread(k2, sizeof(char), (buflen1*in2->precsize), in2->fp) + < buflen2*in2->precsize) + return SSI_ERR_NODATA; + } + } + /* mergesort on keys; be careful of case where we're + out of keys in either ch1 or ch2 */ + if (base2+pos2 == ch2->nprimary || + strcmp(k1+(pos1*in1->precsize), k2+(pos2*in2->precsize))) + write_pkey(t3, &(pk1[pos1]), s); + pos1++; + } else { + write_pkey(t3, &(pk2[pos2]), s); + pos2++; + } + } + free(s); + free(pk1); + free(pk2); + + /* Merge the secondary keys; much like the primary key code above. 
+ */ + maxbuf = 100000; + if ((sk1 = malloc(sizeof(struct ssiskey_s) * maxbuf)) == NULL) return SSI_ERR_MALLOC; + if ((sk2 = malloc(sizeof(struct ssiskey_s) * maxbuf)) == NULL) return SSI_ERR_MALLOC; + if ((s = malloc(sizeof(char) * newch->slen)) == NULL) return SSI_ERR_MALLOC; + base1 = pos1 = buflen1 = 0; + base2 = pos2 = buflen2 = 0; + while (base1+pos1 < ch1->nsecondary || base2+pos2 < ch2->nsecondary) { + /* refill buffer for ch1? */ + if (pos1 == buflen1) { + base1 += buflen1; + pos1 = 0; + buflen1 = MIN(ch1->nsecondary - base1, maxbuf); + if (buflen1 > 0) read_skeys(ch1->fp, sk1, buflen1); + } + /* refill buffer for ch2? */ + if (pos2 == buflen2) { + base2 += buflen2; + pos2 = 0; + buflen2 = MIN(ch2->nsecondary - base2, maxbuf); + if (buflen2 > 0) read_skeys(ch2->fp, sk2, buflen2); + } + /* mergesort on keys; be careful of case where we're + out of keys in either ch1 or ch2 */ + if (base2+pos2 == ch2->nsecondary || pkeysort(&(sk1[pos1]), &(sk2[pos2])) < 0) { + write_skey(t3, &(pk1[pos1]), s); + pos1++; + } else { + write_skey(t3, &(pk2[pos2]), s); + pos2++; + } + } + free(s); + free(pk1); + free(pk2); + + + + + /* clear ch1, ch2, in prep for loading new chunks */ + clear_ssifile(ch1); + clear_ssifile(ch2); + } /* end loop over chunks */ + +} +#endif + + +#ifdef MUGGINS_LETS_ME_SLEEP /* test driving code. */ +/* Minimally: + cc -g -Wall -o shiva -D MUGGINS_LETS_ME_SLEEP ssi.c sqerror.c sre_string.c types.c sre_ctype.c sre_math.c -lm +*/ + +int +main(int argc, char **argv) +{ + char name[32], accession[32]; + SSIINDEX *ssi; + int mode; + SSIOFFSET r_off, d_off; + FILE *ofp; + int i; + int fh; /* a file handle */ + int status; /* return status from a SSI call */ + + mode = SSI_OFFSET_I32; + if ((ssi = SSICreateIndex(mode)) == NULL) + Die("Failed to allocate SSI index"); + + /* Generate two FASTA files, tmp.0 and tmp.1, and index them. 
+ */ + if ((ofp = fopen("tmp.0", "w")) == NULL) + Die("failed to open tmp.0"); + if ((status = SSIAddFileToIndex(ssi, "tmp.0", SQFILE_FASTA, &fh)) != 0) + Die("SSIAddFileToIndex() failed: %s", SSIErrorString(status)); + for (i = 0; i < 10; i++) { + if ((status = SSIGetFilePosition(ofp, mode, &r_off)) != 0) + Die("SSIGetFilePosition() failed: %s", SSIErrorString(status)); + sprintf(name, "seq%d", i); + sprintf(accession, "ac%d", i); + fprintf(ofp, ">%s [%s] Description? we don't need no steenking description.\n", + name, accession); + if ((status = SSIGetFilePosition(ofp, mode, &d_off)) != 0) + Die("SSIGetFilePosition() failed: %s", SSIErrorString(status)); + fprintf(ofp, "AAAAAAAAAA\n"); + fprintf(ofp, "CCCCCCCCCC\n"); + fprintf(ofp, "GGGGGGGGGG\n"); + fprintf(ofp, "TTTTTTTTTT\n"); + + if ((status = SSIAddPrimaryKeyToIndex(ssi, name, fh, &r_off, &d_off, 40)) != 0) + Die("SSIAddPrimaryKeyToIndex() failed: %s", SSIErrorString(status)); + if ((status = SSIAddSecondaryKeyToIndex(ssi, accession, name)) != 0) + Die("SSIAddSecondaryKeyToIndex() failed: %s", SSIErrorString(status)); + } + SSISetFileForSubseq(ssi, fh, 11, 10); + fclose(ofp); + + if ((ofp = fopen("tmp.1", "w")) == NULL) + Die("failed to open tmp.1"); + if ((status = SSIAddFileToIndex(ssi, "tmp.1", SQFILE_FASTA, &fh)) != 0) + Die("SSIAddFileToIndex() failed: %s", SSIErrorString(status)); + for (i = 10; i < 20; i++) { + if ((status = SSIGetFilePosition(ofp, mode, &r_off)) != 0) + Die("SSIGetFilePosition() failed: %s", SSIErrorString(status)); + sprintf(name, "seq%d", i); + sprintf(accession, "ac%d", i); + fprintf(ofp, ">%s [%s] i/o, i/o, it's off to disk we go.\n", + name, accession); + if ((status = SSIGetFilePosition(ofp, mode, &d_off)) != 0) + Die("SSIGetFilePosition() failed: %s", SSIErrorString(status)); + fprintf(ofp, "AAAAAAAAAA 10\n"); + fprintf(ofp, "CCCCCCCCCC 20\n"); + fprintf(ofp, "GGGGGGGGGG 30\n"); + fprintf(ofp, "TTTTTTTTTT 40\n"); + + if ((status = SSIAddPrimaryKeyToIndex(ssi, name, fh, &r_off, 
&d_off, 40)) != 0) + Die("SSIAddPrimaryKeyToIndex() failed: %s", SSIErrorString(status)); + if ((status = SSIAddSecondaryKeyToIndex(ssi, accession, name)) != 0) + Die("SSIAddSecondaryKeyToIndex() failed: %s", SSIErrorString(status)); + } + SSISetFileForSubseq(ssi, fh, 14, 10); + fclose(ofp); + + /* Write the index to tmp.ssi + */ + if ((status = SSIWriteIndex("tmp.ssi", ssi)) != 0) + Die("SSIWriteIndex() failed: %s", SSIErrorString(status)); + SSIFreeIndex(ssi); + + /* Now reopen the index and run some tests. + */ + exit(0); +} + + +#endif /* test driving code */ + + + diff --git a/forester/archive/RIO/others/hmmer/squid/ssi.h b/forester/archive/RIO/others/hmmer/squid/ssi.h new file mode 100644 index 0000000..1ecde6c --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/ssi.h @@ -0,0 +1,193 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +#ifndef SSIH_INCLUDED +#define SSIH_INCLUDED + +/* ssi.h + * Database indexing (SSI format support) + * CVS $Id: ssi.h,v 1.1.1.1 2005/03/22 08:34:21 cmzmasek Exp $ + * + * See: ssi_format.tex in Docs/ + */ + +#include +#include "squid.h" + +/* Limits + */ +#define SSI_MAXFILES 32767 /* 2^15-1 */ +#define SSI_MAXKEYS 2147483647L /* 2^31-1 */ + +/* typedef: SSIOFFSET + * Use the union to save space, since the two offset types are + * mutually exclusive, controlled by "mode" + */ +struct ssioffset_s { + char mode; /* GSI_OFFSET_I32, for example */ + union { + sqd_uint32 i32; /* an offset that fseek() can use */ + sqd_uint64 i64; /* an offset that e.g. 
fseeko64() can use */ + } off; +}; +typedef struct ssioffset_s SSIOFFSET; +#define SSI_OFFSET_I32 0 +#define SSI_OFFSET_I64 1 + +/* Structure: SSIFILE + * xref: SSI API documentation in ssi-format.tex + */ +struct ssifile_s { + FILE *fp; /* open SSI index file */ + sqd_uint32 flags; /* optional behavior flags */ + sqd_uint16 nfiles; /* number of files = 16 bit int */ + sqd_uint32 nprimary; /* number of primary keys */ + sqd_uint32 nsecondary; /* number of secondary keys */ + sqd_uint32 flen; /* length of filenames (inc '\0') */ + sqd_uint32 plen; /* length of primary keys (inc '\0') */ + sqd_uint32 slen; /* length of secondary keys (inc '\0') */ + sqd_uint32 frecsize; /* # bytes in a file record */ + sqd_uint32 precsize; /* # bytes in a primary key record */ + sqd_uint32 srecsize; /* # bytes in a secondary key record */ + SSIOFFSET foffset; /* disk offset, start of file records */ + SSIOFFSET poffset; /* disk offset, start of pri key recs */ + SSIOFFSET soffset; /* disk offset, start of sec key recs */ + + char imode; /* mode for index file offsets, 32 v. 64 bit */ + char smode; /* mode for sequence file offsets, 32 v. 
64 bit */ + + /* File information: + */ + char **filename; /* list of file names [0..nfiles-1] */ + sqd_uint32 *fileformat; /* file formats */ + sqd_uint32 *fileflags; /* optional per-file behavior flags */ + sqd_uint32 *bpl; /* bytes per line in file */ + sqd_uint32 *rpl; /* residues per line in file */ +}; +typedef struct ssifile_s SSIFILE; + +/* optional per-index behavior flags in SSIFILE structure's flags: + */ +#define SSI_USE64 1<<0 /* seq offsets are 64-bit */ +#define SSI_USE64_INDEX 1<<1 /* index file offsets are 64-bit */ + +/* optional per-file behavior flags in fileflags + */ +#define SSI_FAST_SUBSEQ 1<<0 /* can do subseq lookup in this file */ + +/* Structure: SSIINDEX + * + * Used when building up an index and writing it to disk + */ +struct ssipkey_s { /* Primary key data: */ + char *key; /* key name */ + sqd_uint16 fnum; /* file number */ + SSIOFFSET r_off; /* record offset */ + SSIOFFSET d_off; /* data offset */ + sqd_uint32 len; /* sequence length */ + sqd_uint32 handle; /* handle on this key*/ +}; +struct ssiskey_s { /* Secondary key data: */ + char *key; /* secondary key name */ + char *pkey; /* primary key name */ +}; +struct ssiindex_s { + int smode; /* sequence mode: SSI_OFFSET_I32 or _I64 */ + int imode; /* index mode: SSI_OFFSET_I32 or _I64 */ + + char **filenames; + sqd_uint32 *fileformat; + sqd_uint32 *bpl; + sqd_uint32 *rpl; + sqd_uint32 flen; /* length of longest filename, inc '\0' */ + sqd_uint16 nfiles; + + struct ssipkey_s *pkeys; + sqd_uint32 plen; /* length of longest pkey, including '\0' */ + sqd_uint32 nprimary; + sqd_uint32 tot_primary; + + struct ssiskey_s *skeys; + sqd_uint32 slen; /* length of longest skey, including '\0' */ + sqd_uint32 nsecondary; + sqd_uint32 tot_secondary; + + /* The following stuff is for creating really big indexes, where + * we have to write a tmp file to disk with multiple chunks, then + * mergesort the chunks. 
+ */ + char *tmpbase; /* root name of tmp files: .t1 and .t2 */ + FILE *t1; /* open tmp file for collecting chunks */ + fpos_t *chunkoffset; /* array of offsets to individual chunks; 0..nchunks-1 */ + int nchunks; /* total # of chunks in t1 */ + int max_chunk_size; /* maximum size of chunk to hold in memory at one time, in MB */ +}; +typedef struct ssiindex_s SSIINDEX; + +/* These control malloc and realloc chunk sizes in the index + * construction code. + */ +#define SSI_FILE_BLOCK 10 +#define SSI_KEY_BLOCK 100 + +/* Error codes set by the API + */ +#define SSI_ERR_OK 0 +#define SSI_ERR_NODATA 1 /* no data? an fread() failed */ +#define SSI_ERR_NO_SUCH_KEY 2 /* that key's not in the index */ +#define SSI_ERR_MALLOC 3 +#define SSI_ERR_NOFILE 4 /* no such file? an fopen() failed */ +#define SSI_ERR_BADMAGIC 5 /* magic number mismatch in GSIOpen() */ +#define SSI_ERR_BADFORMAT 6 /* didn't read what I expected to fread() */ +#define SSI_ERR_NO64BIT 7 /* needed 64-bit support and didn't have it */ +#define SSI_ERR_SEEK_FAILED 8 /* an fseek() (or similar) failed */ +#define SSI_ERR_TELL_FAILED 9 /* an ftell() (or similar) failed */ +#define SSI_ERR_NO_SUBSEQS 10 /* fast subseq is disallowed */ +#define SSI_ERR_RANGE 11 /* subseq requested is out of range */ +#define SSI_ERR_BADARG 12 /* something wrong with a function argument */ + +#define SSI_ERR_TOOMANY_FILES 13 /* ran out of range for files in an index */ +#define SSI_ERR_TOOMANY_KEYS 14 /* ran out of range for keys in an index */ +#define SSI_ERR_FWRITE 15 + +/* The SSI file reading API: + */ +extern int SSIOpen(char *filename, SSIFILE **ret_sfp); +extern int SSIGetOffsetByName(SSIFILE *sfp, char *key, int *ret_fh, + SSIOFFSET *ret_offset); +extern int SSIGetOffsetByNumber(SSIFILE *sfp, int n, int *ret_fh, + SSIOFFSET *ret_offset); +extern int SSIGetSubseqOffset(SSIFILE *sfp, char *key, int requested_start, + int *ret_fh, SSIOFFSET *record_offset, + SSIOFFSET *data_offset, int *ret_actual_start); +extern int 
SSISetFilePosition(FILE *fp, SSIOFFSET *offset); +extern int SSIFileInfo(SSIFILE *sfp, int fh, char **ret_filename, int *ret_format); +extern void SSIClose(SSIFILE *sfp); + +/* The SSI index file writing API: + */ +extern int SSIRecommendMode(char *file); +extern SSIINDEX *SSICreateIndex(int mode); +extern int SSIGetFilePosition(FILE *fp, int mode, SSIOFFSET *ret_offset); +extern int SSIAddFileToIndex(SSIINDEX *g, char *filename, int fmt, int *ret_fh); +extern int SSISetFileForSubseq(SSIINDEX *g, int fh, int bpl, int rpl); +extern int SSIAddPrimaryKeyToIndex(SSIINDEX *g, char *key, int fh, + SSIOFFSET *r_off, SSIOFFSET *d_off, + int L); +extern int SSIAddSecondaryKeyToIndex(SSIINDEX *g, char *key, char *pkey); +extern int SSIWriteIndex(char *file, SSIINDEX *g); +extern void SSIFreeIndex(SSIINDEX *g); + +/* The SSI misc. functions API: + */ +extern char *SSIErrorString(int n); + + +#endif /*SSIH_INCLUDED*/ diff --git a/forester/archive/RIO/others/hmmer/squid/stack.c b/forester/archive/RIO/others/hmmer/squid/stack.c new file mode 100644 index 0000000..51b8664 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/stack.c @@ -0,0 +1,103 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* stack.c + * SRE, Thu Mar 3 10:08:48 1994 + * + * Implementation of generic stack structures. + * RCS $Id: stack.c,v 1.1.1.1 2005/03/22 08:34:25 cmzmasek Exp $ + */ + +#include +#include "squid.h" + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + + +/************************************************************ + * intstack_s implementation. 
+ * + * Functions: InitIntStack() - returns ptr to new stack + * PushIntStack() - (void) + * PopIntStack() - returns 1 on success, 0 if stack empty + * FreeIntStack() - returns number of elements free'd, or 0 if + * stack was empty. + * + * Implementation of the pushdown stack for storing single + * integers. + *************************************************************/ +struct intstack_s * +InitIntStack(void) +{ + struct intstack_s *stack; + + if ((stack = (struct intstack_s *) malloc (sizeof(struct intstack_s))) == NULL) + Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); + stack->nxt = NULL; + return stack; +} +void +PushIntStack(struct intstack_s *stack, int data) +{ + struct intstack_s *new; + + if ((new = (struct intstack_s *) malloc (sizeof(struct intstack_s))) == NULL) + Die("Memory allocation failure at %s line %d", __FILE__, __LINE__); + new->data = data; + + new->nxt = stack->nxt; + stack->nxt = new; +} + +int +PopIntStack(struct intstack_s *stack, int *ret_data) +{ + struct intstack_s *old; + + if (stack->nxt == NULL) return 0; + + old = stack->nxt; + stack->nxt = old->nxt; + + *ret_data = old->data; + free(old); + return 1; +} + +void +ReverseIntStack(struct intstack_s *stack) +{ + struct intstack_s *old; + struct intstack_s *new; + + old = stack->nxt; + stack->nxt = NULL; + while (old != NULL) + { + new = old; /* remove one from top of old stack */ + old = old->nxt; + new->nxt = stack->nxt; /* push it onto new stack */ + stack->nxt = new; + } +} + +int +FreeIntStack( struct intstack_s *stack ) +{ + int data; + int count = 0; + + while (PopIntStack(stack, &data)) + count++; + free(stack); + return count; +} diff --git a/forester/archive/RIO/others/hmmer/squid/stockholm.c b/forester/archive/RIO/others/hmmer/squid/stockholm.c new file mode 100644 index 0000000..f70250e --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/stockholm.c @@ -0,0 +1,607 @@ +/***************************************************************** + * HMMER 
- Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* stockholm.c + * SRE, Fri May 28 15:46:41 1999 + * + * Reading/writing of Stockholm format multiple sequence alignments. + * + * example of API: + * + * MSA *msa; + * FILE *fp; -- opened for write with fopen() + * MSAFILE *afp; -- opened for read with MSAFileOpen() + * + * while ((msa = ReadStockholm(afp)) != NULL) + * { + * WriteStockholm(fp, msa); + * MSAFree(msa); + * } + * + * RCS $Id: stockholm.c,v 1.1.1.1 2005/03/22 08:34:30 cmzmasek Exp $ + */ +#include +#include +#include "squid.h" +#include "msa.h" + +static int parse_gf(MSA *msa, char *buf); +static int parse_gs(MSA *msa, char *buf); +static int parse_gc(MSA *msa, char *buf); +static int parse_gr(MSA *msa, char *buf); +static int parse_comment(MSA *msa, char *buf); +static int parse_sequence(MSA *msa, char *buf); +static void actually_write_stockholm(FILE *fp, MSA *msa, int cpl); + +#ifdef TESTDRIVE_STOCKHOLM +/***************************************************************** + * stockholm.c test driver: + * cc -DTESTDRIVE_STOCKHOLM -g -O2 -Wall -o test stockholm.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c -lm + * + */ +int +main(int argc, char **argv) +{ + MSAFILE *afp; + MSA *msa; + char *file; + + file = argv[1]; + + if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL) + Die("Couldn't open %s\n", file); + + while ((msa = ReadStockholm(afp)) != NULL) + { + WriteStockholm(stdout, msa); + MSAFree(msa); + } + + MSAFileClose(afp); + exit(0); +} +/******************************************************************/ +#endif /* testdriver */ + + +/* Function: ReadStockholm() + * Date: SRE, Fri May 21 17:33:10 
1999 [St. Louis]
+ *
+ * Purpose:  Parse the next alignment from an open Stockholm
+ *           format alignment file. Return the alignment, or
+ *           NULL if there are no more alignments in the file.
+ *
+ * Args:     afp - open alignment file
+ *
+ * Returns:  MSA * - an alignment object.
+ *                   caller responsible for an MSAFree()
+ *           NULL if no more alignments
+ *
+ * Diagnostics:
+ *           Will Die() here with a (potentially) useful message
+ *           if a parsing error occurs
+ */
+MSA *
+ReadStockholm(MSAFILE *afp)
+{
+  MSA  *msa;
+  char *s;
+  int   status;
+
+  if (feof(afp->f)) return NULL;
+
+  /* Initialize allocation of the MSA.
+   */
+  msa = MSAAlloc(10, 0);
+
+  /* Check the magic Stockholm header line.
+   * We have to skip blank lines here, else we perceive
+   * trailing blank lines in a file as a format error when
+   * reading in multi-record mode.
+   */
+  do {
+    if ((s = MSAFileGetLine(afp)) == NULL) {
+      MSAFree(msa);
+      return NULL;
+    }
+  } while (IsBlankline(s));
+
+  /* The message offers three distinct remedies, labelled a), b), c).
+   */
+  if (strncmp(s, "# STOCKHOLM 1.", 14) != 0)
+    Die("\
+File %s doesn't appear to be in Stockholm format.\n\
+Assuming there isn't some other problem with your file (it is an\n\
+alignment file, right?), please either:\n\
+ a) use the Babelfish format autotranslator option (-B, usually);\n\
+ b) specify the file's format with the --informat option; or\n\
+ c) reformat the alignment to Stockholm format.\n",
+        afp->fname);
+
+  /* Read the alignment file one line at a time.
+   */
+  while ((s = MSAFileGetLine(afp)) != NULL)
+    {
+      while (*s == ' ' || *s == '\t') s++; /* skip leading whitespace */
+
+      /* Every branch below either sets status or leaves the
+       * iteration (break/continue), so status is always assigned
+       * before it is tested.
+       */
+      if (*s == '#') {
+        if      (strncmp(s, "#=GF", 4) == 0) status = parse_gf(msa, s);
+        else if (strncmp(s, "#=GS", 4) == 0) status = parse_gs(msa, s);
+        else if (strncmp(s, "#=GC", 4) == 0) status = parse_gc(msa, s);
+        else if (strncmp(s, "#=GR", 4) == 0) status = parse_gr(msa, s);
+        else                                 status = parse_comment(msa, s);
+      }
+      else if (strncmp(s, "//", 2) == 0) break;
+      else if (*s == '\n') continue;
+      else status = parse_sequence(msa, s);
+
+      if (status == 0)
+        Die("Stockholm format parse error: line %d of file %s while reading alignment %s",
+            afp->linenumber, afp->fname, msa->name == NULL? "" : msa->name);
+    }
+
+  /* s == NULL here means the file ended before a "//" terminator. */
+  if (s == NULL && msa->nseq != 0)
+    Die ("Didn't find // at end of alignment %s", msa->name == NULL ? "" : msa->name);
+
+  if (s == NULL && msa->nseq == 0) {
+    /* probably just some junk at end of file */
+    MSAFree(msa);
+    return NULL;
+  }
+
+  MSAVerifyParse(msa);
+  return msa;
+}
+
+
+/* Function: WriteStockholm()
+ * Date:     SRE, Mon May 31 19:15:22 1999 [St. Louis]
+ *
+ * Purpose:  Write an alignment in standard multi-block
+ *           Stockholm format to an open file. A wrapper
+ *           for actually_write_stockholm().
+ *
+ * Args:     fp  - file that's open for writing
+ *           msa - alignment to write
+ *
+ * Returns:  (void)
+ */
+void
+WriteStockholm(FILE *fp, MSA *msa)
+{
+  actually_write_stockholm(fp, msa, 50); /* 50 char per block */
+}
+
+/* Function: WriteStockholmOneBlock()
+ * Date:     SRE, Mon May 31 19:15:22 1999 [St. Louis]
+ *
+ * Purpose:  Write an alignment in Pfam's single-block
+ *           Stockholm format to an open file. A wrapper
+ *           for actually_write_stockholm().
+ *
+ * Args:     fp  - file that's open for writing
+ *           msa - alignment to write
+ *
+ * Returns:  (void)
+ */
+void
+WriteStockholmOneBlock(FILE *fp, MSA *msa)
+{
+  actually_write_stockholm(fp, msa, msa->alen); /* one big block */
+}
+
+
+/* Function: actually_write_stockholm()
+ * Date:     SRE, Fri May 21 17:39:22 1999 [St. Louis]
+ *
+ * Purpose:  Write an alignment in Stockholm format to
+ *           an open file. This is the function that actually
+ *           does the work. The API's WriteStockholm()
+ *           and WriteStockholmOneBlock() are wrappers.
+ *
+ *           Each chunk of sequence or markup is printed directly
+ *           from the MSA with a "%.*s" precision of cpl, so there
+ *           is no fixed-size staging buffer and cpl may be as
+ *           large as msa->alen (as WriteStockholmOneBlock() passes).
+ *
+ * Args:     fp  - file that's open for writing
+ *           msa - alignment to write
+ *           cpl - characters to write per line in alignment block
+ *
+ * Returns:  (void)
+ */
+static void
+actually_write_stockholm(FILE *fp, MSA *msa, int cpl)
+{
+  int   i, j;
+  int   len = 0;
+  int   namewidth;
+  int   typewidth = 0;    /* markup tags are up to 5 chars long */
+  int   markupwidth = 0;  /* #=GR, #=GC are four char wide + 1 space */
+  int   currpos;
+  char *s, *tok;
+
+  /* Figure out how much space we need for name + markup
+   * to keep the alignment in register. Required by Stockholm
+   * spec, even though our Stockholm parser doesn't care (Erik's does).
+   */
+  namewidth = 0;
+  for (i = 0; i < msa->nseq; i++)
+    if ((len = strlen(msa->sqname[i])) > namewidth)
+      namewidth = len;
+
+  /* Figure out how much space we need for markup tags
+   *   markupwidth = always 4 if we're doing markup: strlen("#=GR")
+   *   typewidth   = longest markup tag
+   */
+  if (msa->ss != NULL) { markupwidth = 4; typewidth = 2; }
+  if (msa->sa != NULL) { markupwidth = 4; typewidth = 2; }
+  for (i = 0; i < msa->ngr; i++)
+    if ((len = strlen(msa->gr_tag[i])) > typewidth) typewidth = len;
+
+  if (msa->rf != NULL)      { markupwidth = 4; if (typewidth < 2) typewidth = 2; }
+  if (msa->ss_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
+  if (msa->sa_cons != NULL) { markupwidth = 4; if (typewidth < 7) typewidth = 7; }
+  for (i = 0; i < msa->ngc; i++)
+    if ((len = strlen(msa->gc_tag[i])) > typewidth) typewidth = len;
+
+
+  /* Magic Stockholm header
+   */
+  fprintf(fp, "# STOCKHOLM 1.0\n");
+
+  /* Free text comments
+   */
+  for (i = 0; i < msa->ncomment; i++)
+    fprintf(fp, "# %s\n", msa->comment[i]);
+  if (msa->ncomment > 0) fprintf(fp, "\n");
+
+  /* GF section: per-file annotation
+   * Each cutoff pair goes out under its own tag (GA, NC, TC), the
+   * same tags parse_gf() reads them back from.
+   */
+  if (msa->name != NULL) fprintf(fp, "#=GF ID %s\n", msa->name);
+  if (msa->acc != NULL) fprintf(fp, "#=GF AC %s\n", msa->acc);
+  if (msa->desc != NULL) fprintf(fp, "#=GF DE %s\n", msa->desc);
+  if (msa->au != NULL) fprintf(fp, "#=GF AU %s\n", msa->au);
+  if (msa->flags & MSA_SET_GA) fprintf(fp, "#=GF GA %.1f %.1f\n", msa->ga1, msa->ga2);
+  if (msa->flags & MSA_SET_NC) fprintf(fp, "#=GF NC %.1f %.1f\n", msa->nc1, msa->nc2);
+  if (msa->flags & MSA_SET_TC) fprintf(fp, "#=GF TC %.1f %.1f\n", msa->tc1, msa->tc2);
+  for (i = 0; i < msa->ngf; i++)
+    fprintf(fp, "#=GF %-5s %s\n", msa->gf_tag[i], msa->gf[i]);
+  fprintf(fp, "\n");
+
+
+  /* GS section: per-sequence annotation
+   */
+  if (msa->flags & MSA_SET_WGT)
+    {
+      for (i = 0; i < msa->nseq; i++)
+        fprintf(fp, "#=GS %-*.*s WT %.2f\n", namewidth, namewidth, msa->sqname[i], msa->wgt[i]);
+      fprintf(fp, "\n");
+    }
+  if (msa->sqacc != NULL)
+    {
+      for (i = 0; i < msa->nseq; i++)
+        if (msa->sqacc[i] != NULL)
+          fprintf(fp, "#=GS %-*.*s AC %s\n", namewidth, namewidth, msa->sqname[i], msa->sqacc[i]);
+      fprintf(fp, "\n");
+    }
+  if (msa->sqdesc != NULL)
+    {
+      for (i = 0; i < msa->nseq; i++)
+        if (msa->sqdesc[i] != NULL)
+          fprintf(fp, "#=GS %*.*s DE %s\n", namewidth, namewidth, msa->sqname[i], msa->sqdesc[i]);
+      fprintf(fp, "\n");
+    }
+  for (i = 0; i < msa->ngs; i++)
+    {
+      /* Multiannotated GS tags are possible; for example,
+       *     #=GS foo DR PDB; 1xxx;
+       *     #=GS foo DR PDB; 2yyy;
+       * These are stored, for example, as:
+       *     msa->gs[0][0] = "PDB; 1xxx;\nPDB; 2yyy;"
+       * and must be decomposed.
+       */
+      for (j = 0; j < msa->nseq; j++)
+        if (msa->gs[i][j] != NULL)
+          {
+            s = msa->gs[i][j];
+            while ((tok = sre_strtok(&s, "\n", NULL)) != NULL)
+              fprintf(fp, "#=GS %*.*s %5s %s\n", namewidth, namewidth,
+                      msa->sqname[j], msa->gs_tag[i], tok);
+          }
+      fprintf(fp, "\n");
+    }
+
+  /* Alignment section:
+   * contains aligned sequence, #=GR annotation, and #=GC annotation
+   *
+   * "%.*s" with precision cpl prints at most cpl characters starting
+   * at column currpos and stops early at the terminating NUL.
+   */
+  for (currpos = 0; currpos < msa->alen; currpos += cpl)
+    {
+      if (currpos > 0) fprintf(fp, "\n");
+      for (i = 0; i < msa->nseq; i++)
+        {
+          fprintf(fp, "%-*.*s %.*s\n", namewidth+typewidth+markupwidth, namewidth+typewidth+markupwidth,
+                  msa->sqname[i], cpl, msa->aseq[i] + currpos);
+
+          if (msa->ss != NULL && msa->ss[i] != NULL) {
+            fprintf(fp, "#=GR %-*.*s SS %.*s\n", namewidth, namewidth, msa->sqname[i],
+                    cpl, msa->ss[i] + currpos);
+          }
+          if (msa->sa != NULL && msa->sa[i] != NULL) {
+            fprintf(fp, "#=GR %-*.*s SA %.*s\n", namewidth, namewidth, msa->sqname[i],
+                    cpl, msa->sa[i] + currpos);
+          }
+          for (j = 0; j < msa->ngr; j++)
+            if (msa->gr[j][i] != NULL) {
+              fprintf(fp, "#=GR %-*.*s %5s %.*s\n",
+                      namewidth, namewidth, msa->sqname[i], msa->gr_tag[j],
+                      cpl, msa->gr[j][i] + currpos);
+            }
+        }
+      if (msa->ss_cons != NULL) {
+        fprintf(fp, "#=GC %-*.*s %.*s\n", namewidth+typewidth, namewidth+typewidth, "SS_cons",
+                cpl, msa->ss_cons + currpos);
+      }
+
+      if (msa->sa_cons != NULL) {
+        fprintf(fp, "#=GC %-*.*s %.*s\n", namewidth+typewidth, namewidth+typewidth, "SA_cons",
+                cpl, msa->sa_cons + currpos);
+      }
+
+      if (msa->rf != NULL) {
+        fprintf(fp, "#=GC %-*.*s %.*s\n", namewidth+typewidth, namewidth+typewidth, "RF",
+                cpl, msa->rf + currpos);
+      }
+      for (j = 0; j < msa->ngc; j++) {
+        fprintf(fp, "#=GC %-*.*s %.*s\n", namewidth+typewidth, namewidth+typewidth,
+                msa->gc_tag[j], cpl, msa->gc[j] + currpos);
+      }
+    }
+  fprintf(fp, "//\n");
+}
+
+
+
+
+
+/* Format of a GF line:
+ *    #=GF <featurename> <text>
+ */
+static int
+parse_gf(MSA *msa, char *buf)
+{
+  char *gf;
+  char *featurename;
+  char *text;
+  char *s;
+
+  s = buf;
+  if ((gf = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
+  if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
+  if ((text = sre_strtok(&s, "\n", NULL)) == NULL) return 0;
+  while (*text && (*text == ' ' || *text == '\t')) text++;
+
+  if (strcmp(featurename, "ID") == 0)
+    msa->name = sre_strdup(text, -1);
+  else if (strcmp(featurename, "AC") == 0)
+    msa->acc = sre_strdup(text, -1);
+  else if (strcmp(featurename, "DE") == 0)
+    msa->desc = sre_strdup(text, -1);
+  else if (strcmp(featurename, "AU") == 0)
+    msa->au = sre_strdup(text, -1);
+  else if (strcmp(featurename, "GA") == 0)
+    {
+      s = text;
+      if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
+      msa->ga1 = atof(text);
+      if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
+      msa->ga2 = atof(text);
+      msa->flags |= MSA_SET_GA;
+    }
+  else if (strcmp(featurename, "NC") == 0)
+    {
+      s = text;
+      if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0;
+      msa->nc1 = atof(text);
+      if ((text = sre_strtok(&s, WHITESPACE, NULL)) ==
NULL) return 0; + msa->nc2 = atof(text); + msa->flags |= MSA_SET_NC; + } + else if (strcmp(featurename, "TC") == 0) + { + s = text; + if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; + msa->tc1 = atof(text); + if ((text = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; + msa->tc2 = atof(text); + msa->flags |= MSA_SET_TC; + } + else + MSAAddGF(msa, featurename, text); + + return 1; +} + + +/* Format of a GS line: + * #=GS + */ +static int +parse_gs(MSA *msa, char *buf) +{ + char *gs; + char *seqname; + char *featurename; + char *text; + int seqidx; + char *s; + + s = buf; + if ((gs = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; + if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; + if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; + if ((text = sre_strtok(&s, "\n", NULL)) == NULL) return 0; + while (*text && (*text == ' ' || *text == '\t')) text++; + + /* GS usually follows another GS; guess lastidx+1 + */ + seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1); + msa->lastidx = seqidx; + + if (strcmp(featurename, "WT") == 0) + { + msa->wgt[seqidx] = atof(text); + msa->flags |= MSA_SET_WGT; + } + + else if (strcmp(featurename, "AC") == 0) + MSASetSeqAccession(msa, seqidx, text); + + else if (strcmp(featurename, "DE") == 0) + MSASetSeqDescription(msa, seqidx, text); + + else + MSAAddGS(msa, featurename, seqidx, text); + + return 1; +} + +/* Format of a GC line: + * #=GC + */ +static int +parse_gc(MSA *msa, char *buf) +{ + char *gc; + char *featurename; + char *text; + char *s; + int len; + + s = buf; + if ((gc = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; + if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; + if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0; + + if (strcmp(featurename, "SS_cons") == 0) + sre_strcat(&(msa->ss_cons), -1, text, len); + else if (strcmp(featurename, "SA_cons") == 0) + sre_strcat(&(msa->sa_cons), -1, text, len); + else if 
(strcmp(featurename, "RF") == 0) + sre_strcat(&(msa->rf), -1, text, len); + else + MSAAppendGC(msa, featurename, text); + + return 1; +} + +/* Format of a GR line: + * #=GR + */ +static int +parse_gr(MSA *msa, char *buf) +{ + char *gr; + char *seqname; + char *featurename; + char *text; + int seqidx; + int len; + int j; + char *s; + + s = buf; + if ((gr = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; + if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; + if ((featurename = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; + if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0; + + /* GR usually follows sequence it refers to; guess msa->lastidx */ + seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx); + msa->lastidx = seqidx; + + if (strcmp(featurename, "SS") == 0) + { + if (msa->ss == NULL) + { + msa->ss = MallocOrDie(sizeof(char *) * msa->nseqalloc); + msa->sslen = MallocOrDie(sizeof(int) * msa->nseqalloc); + for (j = 0; j < msa->nseqalloc; j++) + { + msa->ss[j] = NULL; + msa->sslen[j] = 0; + } + } + msa->sslen[seqidx] = sre_strcat(&(msa->ss[seqidx]), msa->sslen[seqidx], text, len); + } + else if (strcmp(featurename, "SA") == 0) + { + if (msa->sa == NULL) + { + msa->sa = MallocOrDie(sizeof(char *) * msa->nseqalloc); + msa->salen = MallocOrDie(sizeof(int) * msa->nseqalloc); + for (j = 0; j < msa->nseqalloc; j++) + { + msa->sa[j] = NULL; + msa->salen[j] = 0; + } + } + msa->salen[seqidx] = sre_strcat(&(msa->sa[seqidx]), msa->salen[seqidx], text, len); + } + else + MSAAppendGR(msa, featurename, seqidx, text); + + return 1; +} + + +/* comments are simply stored verbatim, not parsed + */ +static int +parse_comment(MSA *msa, char *buf) +{ + char *s; + char *comment; + + s = buf + 1; /* skip leading '#' */ + if (*s == '\n') { *s = '\0'; comment = s; } /* deal with blank comment */ + else if ((comment = sre_strtok(&s, "\n", NULL)) == NULL) return 0; + + MSAAddComment(msa, comment); + return 1; +} + +static int +parse_sequence(MSA *msa, 
char *buf) +{ + char *s; + char *seqname; + char *text; + int seqidx; + int len; + + s = buf; + if ((seqname = sre_strtok(&s, WHITESPACE, NULL)) == NULL) return 0; + if ((text = sre_strtok(&s, WHITESPACE, &len)) == NULL) return 0; + + /* seq usually follows another seq; guess msa->lastidx +1 */ + seqidx = MSAGetSeqidx(msa, seqname, msa->lastidx+1); + msa->lastidx = seqidx; + + msa->sqlen[seqidx] = sre_strcat(&(msa->aseq[seqidx]), msa->sqlen[seqidx], text, len); + return 1; +} + + + diff --git a/forester/archive/RIO/others/hmmer/squid/stockholm.h b/forester/archive/RIO/others/hmmer/squid/stockholm.h new file mode 100644 index 0000000..a9cae55 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/stockholm.h @@ -0,0 +1,51 @@ +#ifndef STOCKHOLM_H_INCLUDED +#define STOCKHOLM_H_INCLUDED + +#include "gki.h" + +typedef struct { + int *linetype; /* e.g. STOCKHOLM_GF_LINE; always valid */ + int *featurecode; /* all markup codes: e.g. STOCKHOLM_GF_ID; + nonmarkup: always set to STOCKHOLM_UNPARSED */ + char **featurename; /* all unparsed markup codes: string, e.g. 
"ID"; + all other lines: NULL */ + int *seqidx; /* all GS, GR, GC, sequence lines: which sequence; + other lines: 0 */ + int *len; /* all GR, GC, sequence lines: length of text field; + other lines: 0 */ + char **text; /* all unparsed nonblank lines: rest of data + other lines: NULL */ + int nseqalloc; /* current nseqs allocated for in aseqs and ainfo */ + int nlines; /* number of lines in this skel */ + int nlinealloc; /* current # of lines allocated for in this skel */ + int overall_line; /* line # in file (important in files w/ >1 ali)*/ +} alifile_skeleton; + +#define STOCKHOLM_GF_LINE 0 +#define STOCKHOLM_GS_LINE 1 +#define STOCKHOLM_GC_LINE 2 +#define STOCKHOLM_GR_LINE 3 +#define STOCKHOLM_SEQ_LINE 4 +#define STOCKHOLM_BLANK_LINE 5 +#define STOCKHOLM_COMMENT_LINE 6 + +#define STOCKHOLM_UNPARSED 0 +#define STOCKHOLM_GF_ID 1 +#define STOCKHOLM_GF_AC 2 +#define STOCKHOLM_GF_DE 3 +#define STOCKHOLM_GF_AU 4 +#define STOCKHOLM_GF_GA 5 +#define STOCKHOLM_GF_NC 6 +#define STOCKHOLM_GF_TC 7 +#define STOCKHOLM_GS_WT 100 +#define STOCKHOLM_GS_AC 101 +#define STOCKHOLM_GS_DE 102 +#define STOCKHOLM_GC_CS 200 +#define STOCKHOLM_GC_RF 201 +#define STOCKHOLM_GR_SS 300 +#define STOCKHOLM_GR_SA 301 + +#define SKEL_NSEQLUMP 10 /* allocate for new seqs in blocks of this size */ +#define SKEL_LUMPSIZE 100 /* allocate for new lines in skel in blocks of this size */ + +#endif /*STOCKHOLM_H_INCLUDED*/ diff --git a/forester/archive/RIO/others/hmmer/squid/stopwatch.c b/forester/archive/RIO/others/hmmer/squid/stopwatch.c new file mode 100644 index 0000000..5f2c4bd --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/stopwatch.c @@ -0,0 +1,307 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. 
See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* stopwatch.c + * SRE, Fri Nov 26 14:54:21 1999 [St. Louis] [HMMER] + * SRE, Thu Aug 3 08:11:52 2000 [St. Louis] [moved to SQUID] + * + * Reporting of cpu/system/elapsed time used by a process. + * thanks to Warren Gish for assistance. + * + * Basic API: + * + * Stopwatch_t *w; + * w = StopwatchCreate(); + * + * StopwatchStart(w); + * do_lots_of_stuff; + * StopwatchStop(w); + * StopwatchDisplay(stdout, "CPU time: ", w); + * + * StopwatchFree(w); + * + * Some behavior can be controlled at compile time by #define's: + * + * SRE_STRICT_ANSI: By default, stopwatch module assumes that a + * machine is POSIX-compliant (e.g. has struct tms, sys/times.h, + * and times()). If compiled with -DSRE_STRICT_ANSI, reverts to + * pure ANSI C conformant implementation. This simpler system + * won't report system times, only user and elapsed times. + * + * SRE_ENABLE_PVM: If compiled with -DSRE_ENABLE_PVM, the + * functions StopwatchPVMPack() and StopwatchPVMUnpack() + * are compiled, providing PVM communications ability. + * + * One additional compile-time configuration note: + * PTHREAD_TIMES_HACK: Linux pthreads, as of RH6.0/glibc-devel-2.1.1-6, + * appears to interact poorly with times() -- usage times in all + * but the master thread are lost. A workaround for this bug is + * to run stopwatches in each worker thread, and accumulate those + * times back into the master stopwatch using StopwatchInclude(). + * (Just like a PVM implementation has to do.) In HMMER, this + * behavior is compiled in with -DPTHREAD_TIMES_HACK. No + * changes are made in stopwatch functions themselves, though; + * all the extra code is HMMER code. See hmmcalibrate.c for + * an example. + * + * See hmmcalibrate.c for examples of more complex usage + * in dealing with pthreads and PVM. 
+ */ + +#include +#include +#include +#ifdef SRE_ENABLE_PVM +#include +#endif + +#include "stopwatch.h" + +/* Function: format_time_string() + * Date: SRE, Fri Nov 26 15:06:28 1999 [St. Louis] + * + * Purpose: Given a number of seconds, format into + * hh:mm:ss.xx in a provided buffer. + * + * Args: buf - allocated space (128 is plenty!) + * sec - number of seconds + * do_frac - TRUE (1) to include hundredths of a sec + */ +static void +format_time_string(char *buf, double sec, int do_frac) +{ + int h, m, s, hs; + + h = (int) (sec / 3600.); + m = (int) (sec / 60.) - h * 60; + s = (int) (sec) - h * 3600 - m * 60; + if (do_frac) { + hs = (int) (sec * 100.) - h * 360000 - m * 6000 - s * 100; + sprintf(buf, "%02d:%02d:%02d.%02d", h,m,s,hs); + } else { + sprintf(buf, "%02d:%02d:%02d", h,m,s); + } +} + +/* Function: StopwatchStart() + * Date: SRE, Fri Nov 26 15:07:48 1999 [St. Louis] + * + * Purpose: Start a stopwatch. + * + * Args: w - the watch + */ +void +StopwatchStart(Stopwatch_t *w) +{ + w->t0 = time(NULL); +#ifdef SRE_STRICT_ANSI + w->cpu0 = clock(); +#else + (void) times(&(w->cpu0)); +#endif + + w->elapsed = 0.; + w->user = 0.; + w->sys = 0.; +} + +/* Function: StopwatchStop() + * Date: SRE, Fri Nov 26 15:08:16 1999 [St. Louis] + * + * Purpose: Stop a stopwatch. + * + * The implementation allows "split times": + * you can stop a watch multiple times, reporting + * times at multiple points during program + * execution. 
+ * + * Args: w - the watch + */ +void +StopwatchStop(Stopwatch_t *w) +{ + time_t t1; +#ifdef SRE_STRICT_ANSI + clock_t cpu1; +#else + struct tms cpu1; + long clk_tck; +#endif + + t1 = time(NULL); + w->elapsed = difftime(t1, w->t0); + +#ifdef SRE_STRICT_ANSI + cpu1 = clock(); + w->user = (double) (cpu1- w->cpu0) / (double) CLOCKS_PER_SEC; + w->sys = 0.; /* no way to portably get system time in ANSI C */ + +#else /* assume we're on a POSIX system by default */ + (void) times(&cpu1); + + clk_tck = sysconf(_SC_CLK_TCK); + w->user = (double) (cpu1.tms_utime + cpu1.tms_cutime - + w->cpu0.tms_utime - w->cpu0.tms_cutime) / + (double) clk_tck; + + w->sys = (double) (cpu1.tms_stime + cpu1.tms_cstime - + w->cpu0.tms_stime - w->cpu0.tms_cstime) / + (double) clk_tck; +#endif +} + +/* Function: StopwatchInclude() + * Date: SRE, Fri Nov 26 15:09:34 1999 [St. Louis] + * + * Purpose: Merge the cpu and system times from a slave into + * a master stopwatch. Both watches must be + * stopped, and should not be stopped again unless + * You Know What You're Doing. + * + * Elapsed time is *not* merged; master is assumed + * to be keeping track of the wall clock time, + * and the slave/worker watch is ignored. + * + * Used in two cases: + * 1) PVM; merge in the stopwatch(es) from separate + * process(es) in a cluster. + * 2) Threads, for broken pthreads/times() implementations + * that lose track of cpu times used by spawned + * threads. + * + * Args: w1 - the master stopwatch + * w2 - the slave/worker watch + * + */ +void +StopwatchInclude(Stopwatch_t *w1, Stopwatch_t *w2) +{ + w1->user += w2->user; + w1->sys += w2->sys; +} + +/* Function: StopwatchAlloc(), StopwatchZero(), StopwatchCopy(), + * StopwatchFree() + * Date: SRE, Fri Nov 26 15:13:14 1999 [St. Louis] + * + * Purpose: The usual creation/manipulation/destruction routines + * for a stopwatch object. 
+ */ +Stopwatch_t * +StopwatchCreate(void) +{ + Stopwatch_t *w; + w = malloc(sizeof(Stopwatch_t)); + return w; +} +void +StopwatchZero(Stopwatch_t *w) +{ + w->elapsed = 0.; + w->user = 0.; + w->sys = 0.; +} +void +StopwatchCopy(Stopwatch_t *w1, Stopwatch_t *w2) +{ + w1->t0 = w2->t0; +#ifdef SRE_STRICT_ANSI + w1->cpu0 = w2->cpu0; +#else + w1->cpu0.tms_utime = w2->cpu0.tms_utime; + w1->cpu0.tms_stime = w2->cpu0.tms_stime; + w1->cpu0.tms_cutime = w2->cpu0.tms_cutime; + w1->cpu0.tms_cstime = w2->cpu0.tms_cstime; +#endif + w1->elapsed = w2->elapsed; + w1->user = w2->user; + w1->sys = w2->sys; +} +void +StopwatchFree(Stopwatch_t *w) +{ + free(w); +} + + +/* Function: StopwatchDisplay() + * Date: SRE, Fri Nov 26 15:14:12 1999 [St. Louis] + * + * Purpose: Output a usage summary line from a *stopped* + * stopwatch (the times will reflect the last + * time StopwatchStop() was called.) + * + * For s = "CPU Time: " an example output line is: + * CPU Time: 142.55u 7.17s 149.72 Elapsed: 00:02:35.00 + * + * Args: fp - open file for writing (stdout, possibly) + * s - prefix for the report line + * w - a (recently stopped) stopwatch + * + */ +void +StopwatchDisplay(FILE *fp, char *s, Stopwatch_t *w) +{ + char buf[128]; /* (safely holds up to 10^14 years) */ + + if (s == NULL) + fputs("CPU Time: ", fp); + else + fputs(s, fp); + + format_time_string(buf, w->user+w->sys, 1); +#ifdef SRE_STRICT_ANSI + fprintf(fp, "%.2fu %s ", w->user, buf); +#else + fprintf(fp, "%.2fu %.2fs %s ", w->user, w->sys, buf); +#endif + + format_time_string(buf, w->elapsed, 0); + fprintf(fp, "Elapsed: %s\n", buf); +} + +#ifdef SRE_ENABLE_PVM +/* Function: StopwatchPVMPack(), StopwatchPVMUnpack() + * Date: SRE, Fri Nov 26 15:22:04 1999 [St. Louis] + * + * Purpose: Transmission of stopwatch data in a PVM + * cluster. 
+ */ +void +StopwatchPVMPack(Stopwatch_t *w) +{ + pvm_pkdouble(&(w->elapsed), 1, 1); + pvm_pkdouble(&(w->user), 1, 1); + pvm_pkdouble(&(w->sys), 1, 1); +} +void +StopwatchPVMUnpack(Stopwatch_t *w) +{ + pvm_upkdouble(&(w->elapsed), 1, 1); + pvm_upkdouble(&(w->user), 1, 1); + pvm_upkdouble(&(w->sys), 1, 1); +} +#endif /*SRE_ENABLE_PVM*/ + + +#ifdef TESTDRIVER +int +main(int argc, char **argv) +{ + Stopwatch_t stopwatch; + + StopwatchStart(&stopwatch); + + sleep(5); + + StopwatchStop(&stopwatch); + StopwatchDisplay(stdout, "CPU Time: ", &stopwatch); +} +#endif diff --git a/forester/archive/RIO/others/hmmer/squid/stopwatch.h b/forester/archive/RIO/others/hmmer/squid/stopwatch.h new file mode 100644 index 0000000..4794a2e --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/stopwatch.h @@ -0,0 +1,59 @@ +/* stopwatch.h + * SRE, Fri Nov 26 14:54:21 1999 [St. Louis] [HMMER] + * SRE, Thu Aug 3 08:00:35 2000 [St. Louis] [moved to SQUID] + * CVS $Id: stopwatch.h,v 1.1.1.1 2005/03/22 08:34:24 cmzmasek Exp $ + * + * Header file for stopwatch.c module: + * reporting of cpu/system/elapsed time used by a process. + * See stopwatch.c comments for documentation of compile-time + * configuration options and API. + * + ***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. 
+ ***************************************************************** + */ +#include +#include +#ifndef SRE_STRICT_ANSI +#include +#endif + +#ifndef STOPWATCH_H_INCLUDED +#define STOPWATCH_H_INCLUDED + +struct stopwatch_s { + time_t t0; /* Wall clock time, ANSI time() */ +#ifdef SRE_STRICT_ANSI + clock_t cpu0; /* CPU time, ANSI clock() */ +#else + struct tms cpu0; /* CPU/system time, POSIX times()*/ +#endif + + double elapsed; /* elapsed time, seconds */ + double user; /* CPU time, seconds */ + double sys; /* system time, seconds */ +}; +typedef struct stopwatch_s Stopwatch_t; + +extern void StopwatchStart(Stopwatch_t *w); +extern void StopwatchStop(Stopwatch_t *w); +extern void StopwatchInclude(Stopwatch_t *w1, Stopwatch_t *w2); +extern Stopwatch_t *StopwatchCreate(void); +extern void StopwatchZero(Stopwatch_t *w); +extern void StopwatchCopy(Stopwatch_t *w1, Stopwatch_t *w2); +extern void StopwatchFree(Stopwatch_t *w); +extern void StopwatchDisplay(FILE *fp, char *s, Stopwatch_t *w); + +#ifdef HMMER_PVM +extern void StopwatchPVMPack(Stopwatch_t *w); +extern void StopwatchPVMUnpack(Stopwatch_t *w); +#endif + +#endif /*STOPWATCH_H_INCLUDED*/ + diff --git a/forester/archive/RIO/others/hmmer/squid/test_main.c b/forester/archive/RIO/others/hmmer/squid/test_main.c new file mode 100644 index 0000000..1e80d54 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/test_main.c @@ -0,0 +1,25 @@ +/* Test of the file.c functions + * cp to ../test_main.c and "make test". 
+ * Usage: ./test + */ + +#include +#include +#include "squid.h" + +int +main(int argc, char **argv) +{ + char *env; + char *file; + FILE *fp; + + env = argv[1]; + file = argv[2]; + + fp = EnvFileOpen(file, env); + if (fp != NULL) printf("File open succeeded\n"); + else printf("File open FAILED\n"); + + return 0; +} diff --git a/forester/archive/RIO/others/hmmer/squid/translate.c b/forester/archive/RIO/others/hmmer/squid/translate.c new file mode 100644 index 0000000..fbf7247 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/translate.c @@ -0,0 +1,87 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* + * translate.c - functions for translating nucleic acid sequence + * created Tue Jan 12 11:27:29 1993, SRE + * + * RCS $Id: translate.c,v 1.1.1.1 2005/03/22 08:34:31 cmzmasek Exp $ + */ + +#include +#include +#include "squid.h" + + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + + + +/* Function: Translate(char *seq, char **code) + * + * Given a ptr to the start of a nucleic acid sequence, + * and a genetic code, translate the sequence into + * amino acid sequence. + * + * code is an array of 65 strings, representing + * the translations of the 64 codons, arranged + * in order AAA, AAC, AAG, AAU, ..., UUA, UUC, UUG, UUU. + * '*' or '***' is used to represent termination + * codons, usually. The final string, code[64], + * is the code for an ambiguous amino acid. + * + * Because of the way space is allocated for the amino + * acid sequence, the amino acid strings cannot be + * longer than 3 letters each. 
(I don't foresee using
+ * anything but the single- and triple- letter codes.)
+ *
+ * Returns a ptr to the translation string on success,
+ * or NULL on failure.
+ */
+char *
+Translate(char *seq, char **code)
+{
+  char *aaseq;                  /* RETURN: the translation            */
+  char *out;                    /* write position inside aaseq        */
+  int   idx;                    /* 0..63 codon index, 64 = ambiguous  */
+  int   pos;                    /* position within the current codon  */
+  int   base;                   /* 0..3 value of one base, -1 if none */
+
+  if (seq == NULL)
+    {
+      squid_errno = SQERR_NODATA;
+      return NULL;
+    }
+
+  /* One output byte per input base is enough: every codon (3 bases)
+   * yields a code string of at most 3 letters.
+   */
+  aaseq = (char *) calloc (strlen(seq) + 1, sizeof(char));
+  if (aaseq == NULL)
+    Die("calloc failed");
+
+  out = aaseq;
+  /* consume whole codons only; a trailing 1- or 2-base remainder is dropped */
+  while (seq[0] != '\0' && seq[1] != '\0' && seq[2] != '\0')
+    {
+      /* base-4 index of the codon, in A, C, G, T/U order */
+      idx = 0;
+      for (pos = 0; pos < 3; pos++)
+        {
+          switch (seq[pos]) {
+          case 'A': case 'a': base = 0;  break;
+          case 'C': case 'c': base = 1;  break;
+          case 'G': case 'g': base = 2;  break;
+          case 'T': case 't': base = 3;  break;
+          case 'U': case 'u': base = 3;  break;
+          default:            base = -1; break;
+          }
+          if (base < 0)
+            {                   /* any other symbol: use the ambiguous entry */
+              idx = 64;
+              break;
+            }
+          idx = idx * 4 + base;
+        }
+
+      strcpy(out, code[idx]);
+      out += strlen(code[idx]);
+      seq += 3;
+    }
+  return aaseq;
+}
diff --git a/forester/archive/RIO/others/hmmer/squid/translate_main.c b/forester/archive/RIO/others/hmmer/squid/translate_main.c
new file mode 100644
index 0000000..1de9505
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/squid/translate_main.c
@@ -0,0 +1,226 @@
+/*****************************************************************
+ * HMMER - Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ *     This source code is distributed under the terms of the
+ *     GNU General Public License. See the files COPYING and LICENSE
+ *     for details.
+ *****************************************************************/
+
+/* translate_main.c
+ *
+ * translate - create a file of all possible protein ORFs, given
+ *             an input nucleic acid sequence
+ *
+ *
+ * Not currently compliant w/ HMMER API.
+ *
+ * 1.02 Thu Apr 20 16:12:41 1995
+ *    + incorporated into squid
+ *    + -a, -s options added
+ *
+ * CVS $Id: translate_main.c,v 1.1.1.1 2005/03/22 08:34:27 cmzmasek Exp $
+ */
+
+/* NOTE(review): the system header names in this file were missing from this
+ * copy; restored from usage (printf/putc, malloc/atoi/exit, strtok/strlen,
+ * getopt) -- confirm against the upstream source.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "squid.h"
+#include "version.h"
+
+#ifdef NEED_GETOPTH
+#include <getopt.h>
+#endif
+
+#define OPTIONS "ahl:o:qs:"
+
+/* NOTE(review): the <...> argument placeholders were missing from this copy
+ * of the usage text; restored from the option handling in main().
+ */
+static char usage[] = "\
+Usage: translate [-options] <seqfile>\n\
+   Translate a nucleic acid sequence into protein ORFs.\n\
+   Available options are:\n\
+   -a            : translate in full, with stops; no individual ORFs\n\
+   -h            : help; show brief usage and version info\n\
+   -l <minlen>   : report only ORFs greater than minlen (default 20)\n\
+   -o <outfile>  : save results in output file\n\
+   -q            : quiet; silence banner, for piping or redirection\n\
+   -s <stopchar> : with -a, set stop character to <stopchar>\n";
+
+/* Return a pointer `frame` characters into s, but never past its
+ * terminating NUL, so that very short sequences can still be handed
+ * to Translate() in every reading frame.
+ */
+static char *
+frame_start(char *s, int frame)
+{
+  while (frame > 0 && *s != '\0')
+    {
+      s++;
+      frame--;
+    }
+  return s;
+}
+
+/* Read every sequence in <seqfile>, translate it in all six frames with
+ * stdcode1, and write either the six full translations (-a) or every ORF
+ * longer than the minimum length in FASTA format.
+ */
+int
+main(int argc, char **argv)
+{
+  char        *seqfile;         /* name of seq file to read             */
+  SQFILE      *seqfp;           /* ptr to opened seq file               */
+  int          format;          /* format of sequence file              */
+  char        *seq;             /* ptr to current sequence              */
+  SQINFO       sqinfo;          /* sequence information                 */
+  char        *revseq;          /* reverse complement of seq            */
+  int          start, end;      /* coords of ORF in current seq         */
+  int          orfnumber;       /* counter for ORFs in current seq      */
+  char        *aaseq[6];        /* full translations in all 6 frames    */
+  char        *orf;             /* ptr to translated ORF sequence       */
+  char        *sptr;            /* ptr into orf                         */
+  int          len;             /* length of an ORF                     */
+  int          frame;           /* counter for frames (3..5 are reverse)*/
+
+  int          minimum_len;     /* minimum length of ORFs to print out  */
+  char        *outfile;         /* file to save output in               */
+  FILE        *ofp;             /* where to direct output               */
+  char         stopchar;        /* what to use as a stop character      */
+  int          keepstops;       /* TRUE to do six big ORFs              */
+  int          quiet;           /* TRUE to silence banner               */
+
+  int          optchar;         /* option character */
+  extern char *optarg;          /* for getopt() */
+  extern int   optind;          /* for getopt() */
+
+  /***********************************************
+   * Parse the command line
+   ***********************************************/
+
+  format      = SQFILE_UNKNOWN; /* autodetect by default */
+  minimum_len = 20;
+  outfile     = NULL;
+  stopchar    = '*';
+  keepstops   = FALSE;
+  quiet       = FALSE;
+
+  while ((optchar = getopt(argc, argv, OPTIONS)) != -1)
+    switch (optchar) {
+
+    case 'a': keepstops   = TRUE;         break;
+    case 'l': minimum_len = atoi(optarg); break;
+    case 'o': outfile     = optarg;       break;
+    case 'q': quiet       = TRUE;         break;
+    case 's': stopchar    = *optarg;      break;
+
+    case 'h':
+      printf("translate %s, %s\n%s\n", RELEASE, RELEASEDATE, usage);
+      exit(EXIT_SUCCESS);
+    default:
+      Die("%s\n", usage);
+    }
+
+  if (argc - optind != 1)
+    Die("Incorrect number of command line arguments\n%s\n", usage);
+
+  seqfile = argv[optind];
+
+  /***********************************************
+   * Open sequence file and output file
+   ***********************************************/
+
+  seqfp = SeqfileOpen(seqfile, format, NULL);
+  if (seqfp == NULL)
+    Die("Failed to open sequence file %s\n%s\n",
+        seqfile, usage);
+
+  if (outfile != NULL)
+    {
+      if ((ofp = fopen(outfile, "w")) == NULL)
+        Die("Failed to open output file %s\n", outfile);
+    }
+  else
+    ofp = stdout;
+
+
+  /***********************************************
+   * Main routine
+   ***********************************************/
+
+  if (! quiet) printf("translate %s, %s\n", RELEASE, RELEASEDATE);
+
+  while (ReadSeq(seqfp, seqfp->format, &seq, &sqinfo))
+    {
+      s2upper(seq);
+      revseq = (char *) malloc (sqinfo.len + 1);
+      if (revseq == NULL)
+        Die("malloc failed");
+      revcomp(revseq, seq);
+      orfnumber = 1;
+
+                                /* Translate seq in all six frames.
+                                 * frame_start() keeps frames 1 and 2 from
+                                 * starting beyond the end of a sequence
+                                 * shorter than the frame offset. */
+      aaseq[0] = Translate(seq,                    stdcode1);
+      aaseq[1] = Translate(frame_start(seq, 1),    stdcode1);
+      aaseq[2] = Translate(frame_start(seq, 2),    stdcode1);
+      aaseq[3] = Translate(revseq,                 stdcode1);
+      aaseq[4] = Translate(frame_start(revseq, 1), stdcode1);
+      aaseq[5] = Translate(frame_start(revseq, 2), stdcode1);
+
+
+
+      if (keepstops)
+        {                       /* full translation including stops */
+          for (frame = 0; frame < 6; frame++)
+            {
+              fprintf(ofp, "> %s:%d", sqinfo.name, frame);
+              for (sptr = aaseq[frame]; *sptr; sptr++)
+                {
+                  if (*sptr == '*') *sptr = stopchar;
+                                /* line break before every 50th residue */
+                  if (! ((sptr - aaseq[frame]) % 50)) putc('\n', ofp);
+                  putc((int) *sptr, ofp);
+                }
+              putc('\n', ofp);
+            }
+        }
+      else
+        {                       /* Print all decent ORF's in FASTA format */
+          for (frame = 0; frame < 6; frame++)
+            {
+                                /* initialize strtok on the first ORF;
+                                   termination codons are '*' symbols */
+              orf = strtok(aaseq[frame], "*");
+              while (orf != NULL)
+                {
+                  len = strlen(orf);
+                  if (len > minimum_len)
+                    {
+                                /* calculate coords */
+                      start = (orf - aaseq[frame]) * 3 + 1;
+                      if (frame < 3) start += frame; /* frame corrections */
+                      else           start -= frame-3;
+
+                      if (frame < 3)
+                        end = start + len * 3;
+                      else
+                        {       /* reverse frames: map back onto forward coords */
+                          start = -1 * (start - sqinfo.len - 1);
+                          end = start - len * 3;
+                        }
+
+                      fprintf(ofp, "> %s.%d    length %d, nt %d..%d",
+                              sqinfo.name,
+                              orfnumber,
+                              len,
+                              start,
+                              end);
+
+                      for (sptr = orf; *sptr; sptr++)
+                        {
+                          if (! ((sptr - orf) % 50))
+                            putc('\n', ofp);
+                          putc((int) *sptr, ofp);
+                        }
+                      putc('\n', ofp);
+
+                      orfnumber++;
+                    }
+
+                                /* pick off next orf */
+                  orf = strtok(NULL, "*");
+                }
+            }
+        }
+
+      for (frame = 0; frame < 6; frame++)
+        free(aaseq[frame]);
+      FreeSequence(seq, &sqinfo);
+      free(revseq);
+    }
+
+  SeqfileClose(seqfp);
+                                /* close (and flush) the -o output file;
+                                 * stdout is left alone */
+  if (outfile != NULL && fclose(ofp) != 0)
+    Die("Failed to close output file %s\n", outfile);
+
+  /**************************************************
+   * Successful return to invocation environment
+   **************************************************/
+  return 0;
+}
+
diff --git a/forester/archive/RIO/others/hmmer/squid/types.c b/forester/archive/RIO/others/hmmer/squid/types.c
new file mode 100644
index 0000000..d1e0b16
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/squid/types.c
@@ -0,0 +1,228 @@
+/*****************************************************************
+ * HMMER - Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ *     This source code is distributed under the terms of the
+ *     GNU General Public License. See the files COPYING and LICENSE
+ *     for details.
+ *****************************************************************/
+
+/* file: types.c
+ *
+ * Finicky type checkers for strings. Return 1 (TRUE) if ok, 0 elsewise.
+ * Also, finicky type converters (sre_ntoh32() and friends)
+ *
+ * CVS $Id: types.c,v 1.1.1.1 2005/03/22 08:34:27 cmzmasek Exp $
+ */
+
+/* NOTE(review): the two system header names below were missing from this
+ * copy; restored from usage (strncmp/strlen; isspace/isdigit/isxdigit).
+ */
+#include <string.h>
+#include <ctype.h>
+#include "squid.h"
+
+/* Function: IsInt()
+ *
+ * Returns TRUE if s points to something that atoi() will parse
+ * completely and convert to an integer.
+ */
+int
+IsInt(char *s)
+{
+  int hex = 0;
+
+  if (s == NULL) {squid_errno = SQERR_PARAMETER; return 0; }
+
+                                /* skip whitespace; <ctype.h> functions need
+                                 * an unsigned char value, not a plain char */
+  while (isspace((unsigned char) *s)) s++;
+                                /* skip leading sign */
+  if (*s == '-' || *s == '+') s++;
+                                /* skip leading conversion signals */
+  if ((strncmp(s, "0x", 2) == 0 && (int) strlen(s) > 2) ||
+      (strncmp(s, "0X", 2) == 0 && (int) strlen(s) > 2))
+    {
+      s += 2;
+      hex = 1;
+    }
+  else if (*s == '0' && (int) strlen(s) > 1)
+    s++;
+                                /* nothing left to convert: the input was
+                                 * empty, all whitespace, or a bare sign */
+  if (*s == '\0') return 0;
+                                /* examine remainder for garbage chars */
+  if (!hex)
+    while (*s != '\0')
+      {
+        if (!isdigit((unsigned char) *s)) return 0;
+        s++;
+      }
+  else
+    while (*s != '\0')
+      {
+        if (!isxdigit((unsigned char) *s)) return 0;
+        s++;
+      }
+
+  return 1;
+}
+
+
+/* Function: IsReal()
+ *
+ * Purpose:  Returns TRUE if s is a string representation
+ *           of a valid floating point number.
+ *
+ *           Accepted form, with optional surrounding whitespace:
+ *             [sign] digits [ '.' digits ] [ (e|E) [sign] digits ]
+ *           At least one mantissa digit is required; either side
+ *           of the '.' may be empty ("1." and ".5" are both fine).
+ *           Anything else, including characters embedded in the
+ *           number, makes the string invalid.
+ */
+int
+IsReal(char *s)
+{
+  int ndigits = 0;              /* mantissa digits seen */
+
+  if (s == NULL) return 0;
+
+  while (isspace((unsigned char) *s)) s++;      /* skip leading whitespace */
+  if (*s == '-' || *s == '+') s++;              /* skip leading sign */
+
+                                /* mantissa: digits, at most one '.' */
+  while (isdigit((unsigned char) *s)) { ndigits++; s++; }
+  if (*s == '.')
+    {
+      s++;
+      while (isdigit((unsigned char) *s)) { ndigits++; s++; }
+    }
+  if (ndigits == 0) return 0;
+
+                                /* optional exponent; it may carry its own
+                                 * sign and must have at least one digit */
+  if (*s == 'e' || *s == 'E')
+    {
+      s++;
+      if (*s == '-' || *s == '+') s++;
+      if (!isdigit((unsigned char) *s)) return 0;
+      while (isdigit((unsigned char) *s)) s++;
+    }
+
+  while (isspace((unsigned char) *s)) s++;      /* skip trailing whitespace */
+  return (*s == '\0') ? 1 : 0;
+}
+
+
+/* Function: Byteswap()
+ *
+ * Purpose:  Swap between big-endian and little-endian.
+ *           For example:
+ *               int foo = 0x12345678;
+ *               byteswap((char *) &foo, sizeof(int));
+ *               printf("%x\n", foo)
+ *           gives 78563412.
+ *
+ *           I don't fully understand byte-swapping issues.
+ *           However, I have tested this on chars through floats,
+ *           on various machines:
+ *               SGI IRIX 4.0.5, SunOS 4.1.3, DEC Alpha OSF/1, Alliant
+ *
+ * Date: Sun Feb 12 10:26:22 1995
+ */
+void
+Byteswap(char *swap, int nbytes)
+{
+  int  x;
+  char byte;
+
+  /* reverse in place: exchange byte x with its mirror image; the middle
+   * byte of an odd-length buffer stays where it is
+   */
+  for (x = 0; x < nbytes / 2; x++)
+    {
+      byte = swap[nbytes - x - 1];
+      swap[nbytes - x - 1] = swap[x];
+      swap[x] = byte;
+    }
+}
+
+
+
+/* Functions: sre_ntoh16(), etc.
+ * Date:      SRE, Sun Dec 31 11:26:53 2000 [St. Louis]
+ *
+ * Purpose:   Provide functionality of ntohs(), etc; extended
+ *            to 64-bit unsigned ints, and explicitly provided
+ *            in case a machine doesn't have the ntohs()
+ *            family.
+ *
+ *            If we're using the host functions,
+ *            USE_HOST_BYTESWAP_FUNCTIONS was set to 1 in
+ *            squidconf.h, and we #define'd sre_hton16(x)=hton(x), etc.
+ *            in squid.h. In doing this, we assumed that the
+ *            host functions work on 16- and 32-bit unsigned quantities.
+ *            If for some reason that's not true, set
+ *            USE_HOST_BYTESWAP_FUNCTIONS to 0.
+ *
+ *            Each function returns its argument unchanged when
+ *            WORDS_BIGENDIAN is defined, and the byte-reversed
+ *            value otherwise.
+ */
+#ifndef USE_HOST_BYTESWAP_FUNCTIONS
+sqd_uint16
+sre_ntoh16(sqd_uint16 netshort)
+{
+#ifdef WORDS_BIGENDIAN
+  return netshort;
+#else
+  Byteswap((char *) &netshort, 2);
+  return netshort;
+#endif
+}
+sqd_uint32
+sre_ntoh32(sqd_uint32 netlong)
+{
+#ifdef WORDS_BIGENDIAN
+  return netlong;
+#else
+  Byteswap((char *) &netlong, 4);
+  return netlong;
+#endif
+}
+sqd_uint16
+sre_hton16(sqd_uint16 hostshort)
+{
+#ifdef WORDS_BIGENDIAN
+  return hostshort;
+#else
+  Byteswap((char *) &hostshort, 2);
+  return hostshort;
+#endif
+}
+sqd_uint32
+sre_hton32(sqd_uint32 hostlong)
+{
+#ifdef WORDS_BIGENDIAN
+  return hostlong;
+#else
+  Byteswap((char *) &hostlong, 4);
+  return hostlong;
+#endif
+}
+#endif /*USE_HOST_BYTESWAP_FUNCTIONS*/
+
+/* The 64-bit converters are compiled regardless of
+ * USE_HOST_BYTESWAP_FUNCTIONS.
+ */
+sqd_uint64
+sre_ntoh64(sqd_uint64 net_int64)
+{
+#ifdef WORDS_BIGENDIAN
+  return net_int64;
+#else
+  Byteswap((char *) &net_int64, 8);
+  return net_int64;
+#endif
+}
+sqd_uint64
+sre_hton64(sqd_uint64 host_int64)
+{
+#ifdef WORDS_BIGENDIAN
+  return host_int64;
+#else
+  Byteswap((char *) &host_int64, 8);
+  return host_int64;
+#endif
+}
+
+
+
+
diff --git a/forester/archive/RIO/others/hmmer/squid/weight.c b/forester/archive/RIO/others/hmmer/squid/weight.c
new file mode 100644
index 0000000..d33902b
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/squid/weight.c
@@ -0,0 +1,748 @@
+/*****************************************************************
+ * HMMER - Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ *     This source code is distributed under the terms of the
+ *     GNU General Public License. See the files COPYING and LICENSE
+ *     for details.
+ *****************************************************************/
+
+/* weight.c
+ * SRE, Thu Mar  3 07:56:01 1994
+ *
+ * Calculate weights for sequences in an alignment.
+ * RCS $Id: weight.c,v 1.1.1.1 2005/03/22 08:34:33 cmzmasek Exp $
+ */
+
+/* NOTE(review): the two system header names below were missing from this
+ * copy; restored from usage (isupper/isalpha/toupper; memset/strcpy).
+ */
+#include <ctype.h>
+#include <string.h>
+#include "squid.h"
+
+static void upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node);
+static void downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt,
+                       float *fwt, int node);
+static float simple_distance(char *s1, char *s2);
+static int   simple_diffmx(char **aseqs,int num, float ***ret_dmx);
+
+/* Function: GSCWeights()
+ *
+ * Purpose:  Use Erik's tree-based algorithm to set weights for
+ *           sequences in an alignment. upweight() and downweight()
+ *           are derived from Graeme Mitchison's code.
+ *
+ * Args:     aseq        - array of (0..nseq-1) aligned sequences
+ *           nseq        - number of seqs in alignment
+ *           alen        - length of alignment
+ *           wgt         - allocated [0..nseq-1] array of weights to be returned
+ *
+ * Return:   (void)
+ *           wgt is filled in.
+ */
+void
+GSCWeights(char **aseq, int nseq, int alen, float *wgt)
+{
+  float **diffmx;               /* pairwise difference matrix              */
+  struct phylo_s *tree;         /* clustering tree built from diffmx       */
+  float *left_wt;               /* branch weight to the left of each node  */
+  float *right_wt;              /* branch weight to the right of each node */
+  float *final_wt;              /* weight finally assigned to each node    */
+  int    nnodes;                /* leaves plus internal nodes: 2N-1        */
+  int    k;
+
+  /* A lone sequence simply gets weight 1.
+   */
+  if (nseq == 1)
+    {
+      wgt[0] = 1.0;
+      return;
+    }
+
+  /* The tree is clustered from a simple fractional difference matrix
+   * derived by pairwise identity (no Poisson distance correction).
+   */
+  MakeDiffMx(aseq, nseq, &diffmx);
+  if (! Cluster(diffmx, nseq, CLUSTER_MIN, &tree))
+    Die("Cluster() failed");
+
+  /* Indices 0..N-1 are the sequences themselves, N..2N-2 are the
+   * internal tree nodes.
+   */
+  nnodes   = 2 * nseq - 1;
+  left_wt  = MallocOrDie (sizeof(float) * nnodes);
+  right_wt = MallocOrDie (sizeof(float) * nnodes);
+  final_wt = MallocOrDie (sizeof(float) * nnodes);
+
+  /* Sequences carry no branch weight of their own.
+   */
+  for (k = 0; k < nseq; k++)
+    {
+      left_wt[k]  = 0.0;
+      right_wt[k] = 0.0;
+    }
+
+  /* Pass 1: accumulate branch weights upward, starting the recursion at
+   * node nseq (the root).
+   */
+  upweight(tree, nseq, left_wt, right_wt, nseq);
+
+  /* Pass 2: hand the total weight (nseq) to the root and distribute it
+   * back down across the tree.
+   */
+  final_wt[nseq] = nseq;
+  downweight(tree, nseq, left_wt, right_wt, final_wt, nseq);
+
+  /* The first nseq entries are the per-sequence weights.
+   */
+  for (k = 0; k < nseq; k++)
+    wgt[k] = final_wt[k];
+
+  FMX2Free(diffmx);
+  FreePhylo(tree, nseq);
+  free(left_wt);
+  free(right_wt);
+  free(final_wt);
+}
+
+/* Post-order walk: fill lwt[node] and rwt[node] with the total branch
+ * weight found below the left and right child of `node`. Children with
+ * an index < nseq are sequences and are not descended into.
+ */
+static void
+upweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, int node)
+{
+  struct phylo_s *self = &tree[node-nseq];
+  int left  = self->left;
+  int right = self->right;
+
+  if (left  >= nseq) upweight(tree, nseq, lwt, rwt, left);
+  if (right >= nseq) upweight(tree, nseq, lwt, rwt, right);
+
+  lwt[node] = lwt[left]  + rwt[left]  + self->lblen;
+  rwt[node] = lwt[right] + rwt[right] + self->rblen;
+}
+
+
+/* Pre-order walk: split fwt[node] between the two children of `node`,
+ * in proportion to their branch weights, or -- when both branch weights
+ * are zero -- in proportion to the incnum of each child (1 for a
+ * sequence).
+ */
+static void
+downweight(struct phylo_s *tree, int nseq, float *lwt, float *rwt, float *fwt, int node)
+{
+  int   left, right;
+  float lcount, rcount;
+
+  left  = tree[node-nseq].left;
+  right = tree[node-nseq].right;
+
+  if (lwt[node] + rwt[node] > 0.0)
+    {
+      fwt[left]  = fwt[node] * (lwt[node] / (lwt[node] + rwt[node]));
+      fwt[right] = fwt[node] * (rwt[node] / (lwt[node] + rwt[node]));
+    }
+  else
+    {
+      lcount = (left  >= nseq) ? tree[left-nseq].incnum  : 1.0;
+      rcount = (right >= nseq) ? tree[right-nseq].incnum : 1.0;
+      fwt[left]  = fwt[node] * lcount / (lcount + rcount);
+      fwt[right] = fwt[node] * rcount / (lcount + rcount);
+    }
+
+  if (left  >= nseq) downweight(tree, nseq, lwt, rwt, fwt, left);
+  if (right >= nseq) downweight(tree, nseq, lwt, rwt, fwt, right);
+}
+
+
+
+
+/* Function: VoronoiWeights()
+ *
+ * Purpose:  Calculate weights using the scheme of Sibbald &
+ *           Argos (JMB 216:813-818 1990). The scheme is
+ *           slightly modified because the original algorithm
+ *           actually doesn't work on gapped alignments.
+ *           The sequences are assumed to be protein.
+ *
+ * Args:     aseq  - array of (0..nseq-1) aligned sequences
+ *           nseq  - number of sequences
+ *           alen  - length of alignment
+ *           wgt   - allocated [0..nseq-1] array of weights to be returned
+ *
+ * Return:   void
+ *           wgt is filled in.
+ */
+void
+VoronoiWeights(char **aseq, int nseq, int alen, float *wgt)
+{
+  float **dmx;                 /* distance (difference) matrix    */
+  float  *halfmin;             /* 1/2 minimum distance to other seqs */
+  char   **psym;               /* symbols seen in each column     */
+  int     *nsym;               /* # syms seen in each column      */
+  int      symseen[27];        /* flags for observed syms         */
+  char    *randseq;            /* randomly generated sequence     */
+  int      acol;               /* pos in aligned columns          */
+  int      idx;                /* index in sequences              */
+  int      symidx;             /* 0..25 index for symbol          */
+  int      i;                  /* generic counter                 */
+  float    min;                /* minimum distance                */
+  float    dist;               /* distance between random and real */
+  float    challenge, champion; /* for resolving ties             */
+  int      itscale;            /* how many iterations per seq     */
+  int      iteration;
+  int      best;               /* index of nearest real sequence  */
+
+  /* Sanity check first
+   */
+  if (nseq == 1) { wgt[0] = 1.0; return; }
+
+  itscale = 50;
+
+  /* Precalculate 1/2 minimum distance to other
+   * sequences for each sequence
+   */
+  if (! simple_diffmx(aseq, nseq, &dmx))
+    Die("simple_diffmx() failed");
+  halfmin = MallocOrDie (sizeof(float) * nseq);
+  for (idx = 0; idx < nseq; idx++)
+    {
+      for (min = 1.0, i = 0; i < nseq; i++)
+        {
+          if (i == idx) continue;
+          if (dmx[idx][i] < min) min = dmx[idx][i];
+        }
+      halfmin[idx] = min / 2.0;
+    }
+  Free2DArray((void **) dmx, nseq);
+
+  /* Set up the random sequence generating model.
+   */
+  psym = MallocOrDie (alen * sizeof(char *));
+  nsym = MallocOrDie (alen * sizeof(int));
+  for (acol = 0; acol < alen; acol++)
+    psym[acol] = MallocOrDie (27 * sizeof(char));
+
+/* #ifdef ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
+  for (acol = 0; acol < alen; acol++)
+    {
+      memset(symseen, 0, sizeof(int) * 27);
+      for (idx = 0; idx < nseq; idx++)
+        if (! isgap(aseq[idx][acol]))
+          {
+            if (isupper((unsigned char) aseq[idx][acol]))
+              symidx = aseq[idx][acol] - 'A';
+            else
+              symidx = aseq[idx][acol] - 'a';
+            if (symidx >= 0 && symidx < 26)
+              symseen[symidx] = 1;
+          }
+        else
+          symseen[26] = 1;      /* a gap */
+
+      for (nsym[acol] = 0, i = 0; i < 26; i++)
+        if (symseen[i])
+          {
+            psym[acol][nsym[acol]] = 'A'+i;
+            nsym[acol]++;
+          }
+      if (symseen[26]) { psym[acol][nsym[acol]] = ' '; nsym[acol]++; }
+    }
+/* #endif ORIGINAL_SIBBALD_ALGORITHM_IS_BROKEN */
+
+  /* Note: the original Sibbald&Argos algorithm calls for
+   * bounding the sampled space using a template-like random
+   * sequence generator. However, this leads to one minor
+   * and one major problem. The minor problem is that
+   * exceptional amino acids in a column can have a
+   * significant effect by altering the amount of sampled
+   * sequence space; the larger the data set, the worse
+   * this problem becomes. The major problem is that
+   * there is no reasonable way to deal with gaps.
+   * Gapped sequences simply inhabit a different dimensionality
+   * and it's pretty painful to imagine calculating Voronoi
+   * volumes when the N in your N-space is varying.
+   * Note that all the examples shown by Sibbald and Argos
+   * are *ungapped* examples.
+   *
+   * The best way I've found to circumvent this problem is
+   * just not to bound the sampled space; count gaps as
+   * symbols and generate completely random sequences.
+   */
+#ifdef ALL_SEQUENCE_SPACE
+  for (acol = 0; acol < alen; acol++)
+    {
+      strcpy(psym[acol], "ACDEFGHIKLMNPQRSTVWY ");
+      nsym[acol] = 21;
+    }
+#endif
+
+  /* Sibbald and Argos algorithm:
+   *   1) assign all seqs weight 0.
+   *   2) generate a "random" sequence
+   *   3) calculate distance to every other sequence
+   *      (if we get a distance < 1/2 minimum distance
+   *       to other real seqs, we can stop)
+   *   4) if unique closest sequence, increment its weight 1.
+   *      if multiple closest seq, choose one randomly
+   *   5) repeat 2-4 for lots of iterations
+   *   6) normalize all weights to sum to nseq.
+   */
+  randseq = MallocOrDie ((alen+1) * sizeof(char));
+
+  best = 0;
+  FSet(wgt, nseq, 0.0);
+  for (iteration = 0; iteration < itscale * nseq; iteration++)
+    {
+      for (acol = 0; acol < alen; acol++)
+        randseq[acol] = (nsym[acol] == 0) ? ' ' : psym[acol][CHOOSE(nsym[acol])];
+      randseq[acol] = '\0';
+
+      champion = sre_random();
+      /* Start `min` above the largest possible distance (distances are
+       * fractions in 0..1), so the first sequence examined always becomes
+       * the provisional best. Starting at 1.0 could leave `best` unset --
+       * or stale from the previous iteration -- whenever every distance
+       * was exactly 1.0, and wgt[best] was then a write to the wrong
+       * (possibly out-of-range) element.
+       */
+      for (min = 2.0, idx = 0; idx < nseq; idx++)
+        {
+          dist = simple_distance(aseq[idx], randseq);
+          if (dist < halfmin[idx])
+            {                   /* inside this seq's cell: no need to look on */
+              best = idx;
+              break;
+            }
+          if (dist < min)
+            { champion = sre_random(); best = idx; min = dist; }
+          else if (dist == min)
+            {                   /* tie: the higher random draw wins */
+              challenge = sre_random();
+              if (challenge > champion)
+                { champion = challenge; best = idx; min = dist; }
+            }
+        }
+      wgt[best] += 1.0;
+    }
+
+  for (idx = 0; idx < nseq; idx++)
+    wgt[idx] = wgt[idx] / (float) itscale;
+
+  free(randseq);
+  free(nsym);
+  free(halfmin);
+  Free2DArray((void **) psym, alen);
+}
+
+
+/* Function: simple_distance()
+ *
+ * Purpose:  For two identical-length null-terminated strings, return
+ *           the fractional difference between them. (0..1)
+ *           (Gaps don't count toward anything.)
+ *           Returns 0.0 when no position is gap-free in both strings.
+ *           The scan length is taken from s1 only.
+ */
+static float
+simple_distance(char *s1, char *s2)
+{
+  int diff  = 0;                /* compared positions that differ  */
+  int valid = 0;                /* positions gap-free in both seqs */
+
+  for (; *s1 != '\0'; s1++, s2++)
+    {
+      if (isgap(*s1) || isgap(*s2)) continue;
+      if (*s1 != *s2) diff++;
+      valid++;
+    }
+  return (valid > 0 ? ((float) diff / (float) valid) : 0.0);
+}
+
+/* Function: simple_diffmx()
+ *
+ * Purpose:  Given a set of flushed, aligned sequences, construct
+ *           an NxN fractional difference matrix using the
+ *           simple_distance rule.
+ *
+ * Args:     aseqs        - flushed, aligned sequences
+ *           num          - number of aseqs
+ *           ret_dmx      - RETURN: difference matrix (caller must free)
+ *
+ * Return:   1 on success, 0 on failure.
+ */
+static int
+simple_diffmx(char **aseqs,
+              int num,
+              float ***ret_dmx)
+{
+  float **matrix;               /* RETURN: num x num difference matrix */
+  int     row, col;             /* counters over sequences             */
+  float   d;
+
+  /* One array of row pointers, then one row of num floats per sequence.
+   */
+  matrix = (float **) malloc (sizeof(float *) * num);
+  if (matrix == NULL)
+    Die("malloc failed");
+  for (row = 0; row < num; row++)
+    {
+      matrix[row] = (float *) malloc (sizeof(float) * num);
+      if (matrix[row] == NULL)
+        Die("malloc failed");
+    }
+
+  /* The matrix is symmetric: compute each pair once (diagonal included)
+   * and store it in both halves.
+   */
+  for (row = 0; row < num; row++)
+    for (col = row; col < num; col++)
+      {
+        d = simple_distance(aseqs[row], aseqs[col]);
+        matrix[col][row] = d;
+        matrix[row][col] = d;
+      }
+
+  *ret_dmx = matrix;
+  return 1;
+}
+
+
+
+/* Function: BlosumWeights()
+ * Date:     SRE, Fri Jul 16 17:33:59 1999 (St. Louis)
+ *
+ * Purpose:  Assign weights to a set of aligned sequences
+ *           using the BLOSUM rule:
+ *             - do single linkage clustering at some pairwise identity
+ *             - in each cluster, give each sequence 1/clustsize
+ *               total weight.
+ *
+ *           The clusters have no pairwise link >= maxid.
+ *
+ *           O(N) in memory. Probably ~O(NlogN) in time; O(N^2)
+ *           in worst case, which is no links between sequences
+ *           (e.g., values of maxid near 1.0).
+ *
+ * Args:     aseqs - alignment
+ *           nseq  - number of seqs in alignment
+ *           alen  - # of columns in alignment
+ *           maxid - fractional identity (e.g. 0.62 for BLOSUM62)
+ *           wgt   - [0..nseq-1] array of weights to be returned
+ */
+void
+BlosumWeights(char **aseqs, int nseq, int alen, float maxid, float *wgt)
+{
+  int *cluster;                 /* cluster assignment of each sequence */
+  int  nclusters;               /* number of clusters found            */
+  int *size;                    /* number of seqs in each cluster      */
+  int  k;                       /* loop counter                        */
+
+  SingleLinkCluster(aseqs, nseq, alen, maxid, &cluster, &nclusters);
+
+  FSet(wgt, nseq, 1.0);
+  size = MallocOrDie(sizeof(int) * nclusters);
+
+  /* count the members of every cluster ... */
+  for (k = 0; k < nclusters; k++)
+    size[k] = 0;
+  for (k = 0; k < nseq; k++)
+    size[cluster[k]]++;
+
+  /* ... then share one unit of weight among the members of each cluster */
+  for (k = 0; k < nseq; k++)
+    wgt[k] = 1. / (float) size[cluster[k]];
+
+  free(size);
+  free(cluster);
+  return;
+}
+
+
+/* Function: PositionBasedWeights()
+ * Date:     SRE, Fri Jul 16 17:47:22 1999 [St. Louis]
+ *
+ * Purpose:  Implementation of Henikoff and Henikoff position-based
+ *           weights (JMB 243:574-578, 1994) [Henikoff94b].
+ *
+ *           A significant advantage of this approach that Steve and Jorja
+ *           don't point out is that it is O(N) in memory, unlike
+ *           many other approaches like GSC weights or Voronoi.
+ *
+ *           A potential disadvantage that they don't point out
+ *           is that in the theoretical limit of infinite sequences
+ *           in the alignment, weights go flat: eventually every
+ *           column has at least one representative of each of 20 aa (or 4 nt)
+ *           in it.
+ *
+ *           They also don't give a rule for how to handle gaps.
+ *           The rule used here seems the obvious and sensible one
+ *           (ignore them). This means that longer sequences
+ *           initially get more weight; hence a "double
+ *           normalization" in which the weights are first divided
+ *           by sequence length (to compensate for that effect),
+ *           then normalized to sum to nseq.
+ *
+ * Limitations:
+ *           Implemented in a way that's alphabet-independent:
+ *           it uses the 26 upper case letters as "residues".
+ *           Any alphabetic character in aseq is interpreted as
+ *           a unique "residue" (case insensitively; lower case
+ *           mapped to upper case). All other characters are
+ *           interpreted as gaps.
+ *
+ *           This way, we don't have to pass around any alphabet
+ *           type info (DNA vs. RNA vs. protein) and don't have
+ *           to deal with remapping IUPAC degenerate codes
+ *           probabilistically. However, on the down side,
+ *           a sequence with a lot of degenerate IUPAC characters
+ *           will get an artifactually high PB weight.
+ *
+ * Args:     aseq   - sequence alignment to weight
+ *           nseq   - number of sequences in alignment
+ *           alen   - length of alignment
+ *           wgt    - RETURN: weights filled in (pre-allocated 0..nseq-1)
+ *
+ * Returns:  (void)
+ *           wgt is allocated (0..nseq-1) by caller, and filled in here.
+ */
+
+/* Map an alignment character to its 0..25 residue slot, or return -1 if
+ * it is not counted as a residue. The range check keeps characters that
+ * isalpha() accepts but that are not 'A'..'Z' after toupper() from
+ * indexing outside the 26-entry count table.
+ */
+static int
+pb_residue_index(char c)
+{
+  int x;
+
+  if (! isalpha((unsigned char) c)) return -1;
+  x = toupper((unsigned char) c) - 'A';
+  return (x >= 0 && x < 26) ? x : -1;
+}
+
+void
+PositionBasedWeights(char **aseq, int nseq, int alen, float *wgt)
+{
+  int   rescount[26];           /* count of A-Z residues in a column   */
+  int   nres;                   /* number of different residues in col */
+  int   idx, pos;               /* indices into aseq                   */
+  int   x;
+  int   len;                    /* dealigned length of one sequence    */
+  float total;                  /* sum of the length-corrected weights */
+
+  FSet(wgt, nseq, 0.0);
+  for (pos = 0; pos < alen; pos++)
+    {
+                                /* tally the residues in this column */
+      for (x = 0; x < 26; x++) rescount[x] = 0;
+      for (idx = 0; idx < nseq; idx++)
+        if ((x = pb_residue_index(aseq[idx][pos])) >= 0)
+          rescount[x] ++;
+
+      nres = 0;
+      for (x = 0; x < 26; x++)
+        if (rescount[x] > 0) nres++;
+
+                                /* each residue earns
+                                 * 1 / (#kinds in column * #copies of it) */
+      for (idx = 0; idx < nseq; idx++)
+        if ((x = pb_residue_index(aseq[idx][pos])) >= 0)
+          wgt[idx] += 1. / (float) (nres * rescount[x]);
+    }
+
+  /* First normalization: divide by sequence length. A sequence with no
+   * residues has accumulated no weight; leave it at 0 rather than
+   * computing 0/0, which would turn the sum below into NaN.
+   */
+  for (idx = 0; idx < nseq; idx++)
+    {
+      len = DealignedLength(aseq[idx]);
+      if (len > 0) wgt[idx] /= (float) len;
+    }
+
+  /* Second normalization: scale so the weights sum to nseq. Skipped when
+   * there is no weight to scale at all.
+   */
+  total = FSum(wgt, nseq);
+  if (total > 0.0)
+    FScale(wgt, nseq, (float) nseq / total);
+  return;
+}
+
+
+
+
+/* Function: FilterAlignment()
+ * Date:     SRE, Wed Jun 30 09:19:30 1999 [St. Louis]
+ *
+ * Purpose:  Constructs a new alignment by removing near-identical
+ *           sequences from a given alignment (where identity is
+ *           calculated *based on the alignment*).
+ *           Does not affect the given alignment.
+ *           Keeps earlier sequence, discards later one.
+ *
+ *           Usually called as an ad hoc sequence "weighting" mechanism.
+ *
+ * Limitations:
+ *           Unparsed Stockholm markup is not propagated into the
+ *           new alignment.
+ *
+ * Args:     msa      -- original alignment
+ *           cutoff   -- fraction identity cutoff. 0.8 removes sequences > 80% id.
+ *           ret_new  -- RETURN: new MSA, usually w/ fewer sequences
+ *
+ * Return:   (void)
+ *           ret_new must be free'd by caller: MSAFree().
+ */
+void
+FilterAlignment(MSA *msa, float cutoff, MSA **ret_new)
+{
+  int   *kept;                  /* indices of the sequences retained so far */
+  int    nkept;                 /* number of seqs in new alignment          */
+  int   *useme;                 /* per-sequence flag: TRUE to keep          */
+  float  ident;
+  int    cand;                  /* sequence currently being considered      */
+  int    k;
+  int    discard;
+
+  kept  = MallocOrDie (sizeof(int) * msa->nseq);
+  useme = MallocOrDie (sizeof(int) * msa->nseq);
+  for (cand = 0; cand < msa->nseq; cand++)
+    useme[cand] = FALSE;
+
+  /* Walk the sequences in order. A candidate is dropped as soon as it is
+   * more than `cutoff` identical to any sequence already kept, so the
+   * earlier member of a near-identical pair is the one that survives.
+   */
+  nkept = 0;
+  for (cand = 0; cand < msa->nseq; cand++)
+    {
+      discard = FALSE;
+      k = 0;
+      while (k < nkept && !discard)
+        {
+          ident = PairwiseIdentity(msa->aseq[cand], msa->aseq[kept[k]]);
+          if (ident > cutoff)
+            {
+              discard = TRUE;
+              printf("removing %12s -- fractional identity %.2f to %s\n",
+                     msa->sqname[cand], ident,
+                     msa->sqname[kept[k]]);
+            }
+          k++;
+        }
+
+      if (!discard)
+        {
+          kept[nkept] = cand;
+          nkept++;
+          useme[cand] = TRUE;
+        }
+    }
+
+  MSASmallerAlignment(msa, useme, ret_new);
+  free(kept);
+  free(useme);
+  return;
+}
+
+
+/* Function: SampleAlignment()
+ * Date:     SRE, Wed Jun 30 10:13:56 1999 [St. Louis]
+ *
+ * Purpose:  Constructs a new, smaller alignment by sampling a given
+ *           number of sequences at random. Does not change the
+ *           alignment nor the order of the sequences.
+ *
+ *           If you ask for a sample that is larger than nseqs,
+ *           it silently returns the original alignment.
+ *
+ *           Not really a weighting method, but this is as good
+ *           a place as any to keep it, since it's similar in
+ *           construction to FilterAlignment().
+ *
+ * Args:     msa      -- original alignment
+ *           sample   -- number of sequences in new alignment (0 < sample <= nseq)
+ *           ret_new  -- RETURN: new MSA
+ *
+ * Return:   (void)
+ *           ret_new must be free'd by caller: MSAFree().
+ */
+void
+SampleAlignment(MSA *msa, int sample, MSA **ret_new)
+{
+  int *pool;                    /* sequence indices not yet drawn        */
+  int *useme;                   /* array of flags 0..nseq-1: TRUE to use */
+  int  remaining;               /* number of live entries in pool        */
+  int  slot;                    /* randomly chosen position in pool      */
+  int  picked;                  /* sequence index found at that position */
+  int  i;
+
+  pool  = (int *) MallocOrDie (sizeof(int) * msa->nseq);
+  useme = (int *) MallocOrDie (sizeof(int) * msa->nseq);
+  for (i = 0; i < msa->nseq; i++)
+    {
+      pool[i]  = i;
+      useme[i] = FALSE;
+    }
+
+  /* Never draw more sequences than the alignment holds.
+   */
+  if (sample >= msa->nseq) sample = msa->nseq;
+
+  /* Selection without replacement: draw a slot among the live entries,
+   * flag the sequence stored there, then overwrite the slot with the
+   * last live entry and shrink the pool by one.
+   */
+  remaining = msa->nseq;
+  for (i = 0; i < sample; i++)
+    {
+      slot   = CHOOSE(remaining);
+      picked = pool[slot];
+      printf("chose %d: %s\n", picked, msa->sqname[picked]);
+      useme[picked] = TRUE;
+      remaining--;
+      pool[slot] = pool[remaining];
+    }
+
+  MSASmallerAlignment(msa, useme, ret_new);
+  free(pool);
+  free(useme);
+  return;
+}
+
+
+/* Function: SingleLinkCluster()
+ * Date:     SRE, Fri Jul 16 15:02:57 1999 [St. Louis]
+ *
+ * Purpose:  Perform simple single link clustering of seqs in a
+ *           sequence alignment. A pairwise identity threshold
+ *           defines whether two sequences are linked or not.
+ *
+ *           Important: runs in O(N) memory, unlike standard
+ *           graph decomposition algorithms that use O(N^2)
+ *           adjacency matrices or adjacency lists. Requires
+ *           O(N^2) time in worst case (which is when you have
+ *           no links at all), O(NlogN) in "average"
+ *           case, and O(N) in best case (when there is just
+ *           one cluster in a completely connected graph).
+ *
+ *           (Developed because hmmbuild could no longer deal
+ *           with GP120, a 16,013 sequence alignment.)
+ *
+ * Limitations:
+ *           CASE-SENSITIVE. Assumes aseq have been put into
+ *           either all lower or all upper case; or at least,
+ *           within a column, there's no mixed case.
+ *
+ * Algorithm:
+ *           I don't know if this algorithm is published. I
+ *           haven't seen it in graph theory books, but that might
+ *           be because it's so obvious that nobody's bothered.
+ *
+ *           In brief, we're going to do a breadth-first search
+ *           of the graph, and we're going to calculate links
+ *           on the fly rather than precalculating them into
+ *           some sort of standard adjacency structure.
+ *
+ *           While working, we keep two stacks of maximum length N:
+ *                a : list of vertices that are still unconnected.
+ *                b : list of vertices that we've connected to
+ *                    in our current breadth level, but we haven't
+ *                    yet tested for other connections to a.
+ *           The current length (number of elements in) a and b are
+ *           kept in na, nb.
+ *
+ *           We store our results in an array of length N:
+ *                c : assigns each vertex to a component. for example
+ *                    c[4] = 1 means that vertex 4 is in component 1.
+ *                    nc is the number of components. Components
+ *                    are numbered from 0 to nc-1. We return c and nc
+ *                    to our caller.
+ *
+ *           The algorithm is:
+ *
+ *           Initialisation:
+ *                a  <-- all the vertices
+ *                na <-- N
+ *                b  <-- empty set
+ *                nb <-- 0
+ *                nc <-- 0
+ *
+ *           Then:
+ *                while (a is not empty)
+ *                  pop a vertex off a, push onto b
+ *                  while (b is not empty)
+ *                    pop vertex v off b
+ *                    assign c[v] = nc
+ *                    for each vertex w in a:
+ *                       compare v,w. If w is linked to v, remove w
+ *                       from a, push onto b.
+ *                  nc++
+ *           q.e.d. :)
+ *
+ * Args:     aseq   - aligned sequences
+ *           nseq   - number of sequences in aseq
+ *           alen   - alignment length
+ *           maxid  - fractional identity threshold 0..1. if id >= maxid, seqs linked
+ *           ret_c  - RETURN: 0..nseq-1 assignments of seqs to components (clusters)
+ *           ret_nc - RETURN: number of components
+ *
+ * Returns:  void.
+ *           ret_c is allocated here.
Caller free's with free(*ret_c) + */ +void +SingleLinkCluster(char **aseq, int nseq, int alen, float maxid, + int **ret_c, int *ret_nc) +{ + int *a, na; /* stack of available vertices */ + int *b, nb; /* stack of working vertices */ + int *c; /* array of results */ + int nc; /* total number of components */ + int v,w; /* index of a working vertices */ + int i; /* loop counter */ + + /* allocations and initializations + */ + a = MallocOrDie (sizeof(int) * nseq); + b = MallocOrDie (sizeof(int) * nseq); + c = MallocOrDie (sizeof(int) * nseq); + for (i = 0; i < nseq; i++) a[i] = i; + na = nseq; + nb = 0; + nc = 0; + + /* Main algorithm + */ + while (na > 0) + { + v = a[na-1]; na--; /* pop a vertex off a, */ + b[nb] = v; nb++; /* and push onto b */ + while (nb > 0) + { + v = b[nb-1]; nb--; /* pop vertex off b */ + c[v] = nc; /* assign it to component nc */ + for (i = na-1; i >= 0; i--)/* backwards, becase of deletion/swapping we do*/ + if (simple_distance(aseq[v], aseq[a[i]]) < 1. - maxid) /* linked? */ + { + w = a[i]; a[i] = a[na-1]; na--; /* delete w from a (note swap) */ + b[nb] = w; nb++; /* push w onto b */ + } + } + nc++; + } + + /* Cleanup and return + */ + free(a); + free(b); + *ret_c = c; + *ret_nc = nc; + return; +} diff --git a/forester/archive/RIO/others/hmmer/squid/weight_main.c b/forester/archive/RIO/others/hmmer/squid/weight_main.c new file mode 100644 index 0000000..6bc3d65 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/squid/weight_main.c @@ -0,0 +1,187 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. 
+ *****************************************************************/ + +/* weight_main.c + * SRE, Thu Mar 3 13:43:39 1994 + * + * Calculate weights for a sequence alignment. + * CVS $Id: weight_main.c,v 1.1.1.1 2005/03/22 08:34:30 cmzmasek Exp $ + */ + +#include +#include +#include +#include + +#include "squid.h" +#include "msa.h" + +static char banner[] = "weight - calculate sequence weights for an alignment"; + +static char usage[] = "\ +Usage: weight [-options] \n\ + Available options:\n\ + -b : use BLOSUM weighting scheme at fractional identity\n\ + -f : filter out seqs w/ fractional ident > [0-1]\n\ + -h : help; print version and usage info\n\ + -o : save weight-annotated alignment in \n\ + -p : use position based weight scheme (Henikoff & Henikoff)\n\ + -s : sample sequences at random into a new alignment\n\ + -v : use Voronoi weight scheme (Sibbald & Argos) \n\ +"; + +static char experts[] = "\ + Expert options:\n\ + --informat : specify alignment file format \n\ + allowed formats: SELEX, MSF, Clustal, a2m, PHYLIP\n\ + --quiet : suppress verbose banner\n\ +"; + +static struct opt_s OPTIONS[] = { + { "-b", TRUE, sqdARG_FLOAT }, + { "-f", TRUE, sqdARG_FLOAT }, + { "-h", TRUE, sqdARG_NONE }, + { "-o", TRUE, sqdARG_STRING }, + { "-p", TRUE, sqdARG_NONE }, + { "-s", TRUE, sqdARG_INT }, + { "-v", TRUE, sqdARG_NONE }, + { "--informat", FALSE, sqdARG_STRING }, + { "--quiet", FALSE, sqdARG_NONE }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +int +main(int argc, char **argv) +{ + char *seqfile; /* file containing aligned seqs */ + MSAFILE *afp; /* pointer to open alignment file */ + MSA *msa; /* multiple sequence alignment */ + int fmt; /* expected format of alignment file */ + int idx; + char *outfile; /* output file for weighted alignment */ + FILE *ofp; /* open outfile */ + + int do_voronoi; /* use Sibbald/Argos Voronoi scheme */ + int do_blosum; /* use BLOSUM weighting scheme */ + int do_pbased; /* use position-based weights */ + int 
do_filter; /* use filtering scheme */ + float idlevel; /* identity level to filter at, [0-1] */ + int samplesize; /* if >0, don't weight, random sample */ + int be_quiet; /* TRUE to suppress banner */ + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + + /*********************************************** + * Parse command line + ***********************************************/ + + fmt = MSAFILE_UNKNOWN; /* autodetect file format by default */ + outfile = NULL; + do_blosum = FALSE; + do_voronoi = FALSE; + do_pbased = FALSE; + do_filter = FALSE; + samplesize = 0; + be_quiet = FALSE; + idlevel = 0.; /* just to suppress gcc uninit warnings */ + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) + { + if (strcmp(optname, "-b") == 0) + { do_blosum = TRUE; idlevel = atof(optarg); } + else if (strcmp(optname, "-f") == 0) + { do_filter = TRUE; idlevel = atof(optarg); } + else if (strcmp(optname, "-o") == 0) outfile = optarg; + else if (strcmp(optname, "-p") == 0) do_pbased = TRUE; + else if (strcmp(optname, "-s") == 0) samplesize = atoi(optarg); + else if (strcmp(optname, "-v") == 0) do_voronoi = TRUE; + else if (strcmp(optname, "--quiet") == 0) be_quiet = TRUE; + else if (strcmp(optname, "--informat") == 0) { + fmt = String2SeqfileFormat(optarg); + if (fmt == MSAFILE_UNKNOWN) + Die("unrecognized sequence file format \"%s\"", optarg); + if (! 
IsAlignmentFormat(fmt)) + Die("%s is an unaligned format, can't read as an alignment", optarg); + } + else if (strcmp(optname, "-h") == 0) + { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(EXIT_SUCCESS); + } + } + + if (argc -optind != 1) + Die("Wrong number of arguments specified on command line\n%s\n", usage); + seqfile = argv[optind]; + + if (outfile == NULL) + ofp = stdout; + else if ((ofp = fopen(outfile, "w")) == NULL) + Die("Failed to open alignment output file %s", outfile); + + if (do_voronoi + do_pbased + do_blosum + do_filter + samplesize > 1) + Die("Choose only one weighting scheme, please.\n%s\n", usage); + + if (do_voronoi || samplesize > 0) + sre_srandom(time(0)); + + if (! be_quiet) + Banner(stdout, banner); + + /*********************************************** + * Open the input alignment file and start... + * be prepared to deal with multiple entries in Stockholm files + ***********************************************/ + + if ((afp = MSAFileOpen(seqfile, fmt, NULL)) == NULL) + Die("Alignment file %s could not be opened for reading", seqfile); + + while ((msa = MSAFileRead(afp)) != NULL) + { + for (idx = 0; idx < msa->nseq; idx++) + s2upper(msa->aseq[idx]); + + if (do_filter || samplesize > 0) + { + MSA *new; + + if (do_filter) + FilterAlignment(msa, idlevel, &new); + else if (samplesize > 0) + SampleAlignment(msa, samplesize, &new); + + if (new != NULL) { + WriteStockholm(ofp, new); + MSAFree(msa); + MSAFree(new); + } + } + else + { + if (do_voronoi) VoronoiWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt); + else if (do_blosum) BlosumWeights(msa->aseq, msa->nseq, msa->alen, idlevel, msa->wgt); + else if (do_pbased) PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt); + else GSCWeights (msa->aseq, msa->nseq, msa->alen, msa->wgt); + + msa->flags |= MSA_SET_WGT; + WriteStockholm(ofp, msa); + MSAFree(msa); + } + } + MSAFileClose(afp); + fclose(ofp); + return EXIT_SUCCESS; +} + diff --git 
a/forester/archive/RIO/others/hmmer/src/Makefile.in b/forester/archive/RIO/others/hmmer/src/Makefile.in new file mode 100644 index 0000000..8113e57 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/Makefile.in @@ -0,0 +1,128 @@ +############################################################ +# Makefile for HMMER src directory +# CVS $Id: Makefile.in,v 1.1.1.1 2005/03/22 08:34:05 cmzmasek Exp $ +########### +# HMMER - Biological sequence analysis with profile HMMs +# Copyright (C) 1992-1999 Washington University School of Medicine +# All Rights Reserved +# +# This source code is distributed under the terms of the +# GNU General Public License. See the files COPYING and LICENSE +# for details. +########### + +## your compiler and compiler flags +# +CC = @CC@ +CFLAGS = @CFLAGS@ + +## other defined flags. +# DEFS contains stuff that autoconf +# decides on. MDEFS contains stuff that we added to +# the configure script tests. LIBS contains system +# libraries that the configure script decides we need. +# +MDEFS = @MDEFS@ @DEFS@ +LIBS = @LIBS@ -lm + +## archiving command, and ranlib command if you need one. +# In general, you shouldn't need to change these, and they're +# only used for building the testsuite anyway... e.g. we +# make a "libhmmer.a" library for building the testsuite. 
+# +AR = ar rcv +RANLIB = @RANLIB@ + +# Configuration for optional pthreads multiprocessor support +# +PTHREAD_LIBS = @PTHREAD_LIBS@ +PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ + + +# Configuration for optional PVM functionality +# +PVMFLAG = @PVMFLAG@ +PVMLIBDIR = @PVMLIBDIR@ +PVMINCDIR = @PVMINCDIR@ +PVMLIBS = @PVMLIBS@ +PVMPROGS = @PVMPROGS@ + +SHELL = /bin/sh +MYLIBS = -lsquid +MYLIBDIR = -L../squid +MYINCDIR = -I../squid + +PROGS = hmmalign\ + hmmbuild\ + hmmcalibrate\ + hmmconvert\ + hmmemit\ + hmmfetch\ + hmmindex\ + hmmpfam\ + hmmsearch\ + ${PVMPROGS} + +OBJS = alphabet.o\ + core_algorithms.o\ + debug.o\ + display.o\ + emit.o\ + emulation.o\ + histogram.o\ + hmmio.o\ + mathsupport.o\ + masks.o\ + misc.o\ + modelmakers.o\ + plan7.o\ + plan9.o\ + postprob.o\ + prior.o\ + pvm.o\ + threads.o\ + tophits.o\ + trace.o + +HDRS = config.h\ + funcs.h\ + globals.h\ + postprob.h\ + structs.h + +.c.o: + $(CC) $(CFLAGS) $(MDEFS) $(PTHREAD_CFLAGS) $(PVMFLAG) $(MYINCDIR) $(PVMINCDIR) -c $< + +################################################################# +## Targets defining how to make HMMER executables. +## +all: $(PROGS) + +$(PROGS): @EXEC_DEPENDENCY@ $(OBJS) + $(CC) $(CFLAGS) $(PTHREAD_CFLAGS) $(MDEFS) $(MYLIBDIR) $(PVMLIBDIR) -o $@ $@.o $(OBJS) $(PVMLIBS) $(MYLIBS) $(PTHREAD_LIBS) $(LIBS) + + +################################################################# +## Targets used in making HMMER module for testsuite compilation. +## +module: libhmmer.a + +libhmmer.a: $(OBJS) + $(AR) libhmmer.a $(OBJS) + $(RANLIB) libhmmer.a + chmod 644 libhmmer.a + + +################################################################# +## Miscellaneous targets. 
+## +distclean: + make clean + -rm -f Makefile version.h + +clean: + -rm -f *.o *~ Makefile.bak core $(PROGS) TAGS gmon.out libhmmer.a + +TAGS: + etags -t *.c *.h Makefile.in + diff --git a/forester/archive/RIO/others/hmmer/src/alphabet.c b/forester/archive/RIO/others/hmmer/src/alphabet.c new file mode 100644 index 0000000..a431207 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/alphabet.c @@ -0,0 +1,426 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* alphabet.c + * Configuration of the global symbol alphabet information. + * RCS $Id: alphabet.c,v 1.1.1.1 2005/03/22 08:34:08 cmzmasek Exp $ + */ + +#include +#include +#include +#ifdef HMMER_THREADS +#include +#endif /* HMMER_THREADS */ + +#include "config.h" +#include "structs.h" +#include "funcs.h" +#include "squid.h" + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + +static void set_degenerate(char iupac, char *syms); + + +/* Function: DetermineAlphabet() + * + * Purpose: From a set of sequences (raw or aligned), make a good + * guess whether they're Nucleic, Amino, or something + * else, and set alphabet accordingly. + * + * If Alphabet_type is already set, that means our + * autodetection was overridden from the command line, + * and we just set the other globals accordingly. + */ +void +DetermineAlphabet(char **rseqs, int nseq) +{ + int idx; + int other, nucleic, amino; + int type; + + /* Autodetection of alphabet type. 
+ */ + type = hmmNOTSETYET; + other = nucleic = amino = 0; + for (idx = 0; idx < nseq; idx++) { + switch (Seqtype(rseqs[idx])) { + case kRNA: nucleic++; break; + case kDNA: nucleic++; break; + case kAmino: amino++; break; + case kOtherSeq: other++; break; + default: Die("No such alphabet type"); + } + } + + if (nucleic == nseq) type = hmmNUCLEIC; + else if (amino == nseq) type = hmmAMINO; + else if (nucleic > amino && nucleic > other) { + Warn("Looks like nucleic acid sequence, hope that's right"); + type = hmmNUCLEIC; + } + else if (amino > nucleic && amino > other) { + Warn("Looks like amino acid sequence, hope that's right"); + type = hmmAMINO; + } + else Die("Sorry, I can't tell if that's protein or DNA"); + + /* Now set up the alphabet. + */ + SetAlphabet(type); +} + + +/* Function: SetAlphabet() + * + * Purpose: Set the alphabet globals, given an alphabet type + * of either hmmAMINO or hmmNUCLEIC. + */ +void +SetAlphabet(int type) +{ + int x; +#ifdef HMMER_THREADS + pthread_mutex_t alphabet_lock; /* alphabet is global; must protect to be threadsafe */ + int rtn; /* return code from pthreads */ + + if ((rtn = pthread_mutex_init(&alphabet_lock, NULL)) != 0) + Die("pthread_mutex_init FAILED; %s\n", strerror(rtn)); + if ((rtn = pthread_mutex_lock(&alphabet_lock)) != 0) + Die("pthread_mutex_lock FAILED: %s\n", strerror(rtn)); +#endif + + /* Because the alphabet information is global, we must + * be careful to make this a thread-safe function. The mutex + * (above) takes care of that. But, indeed, it's also + * just good sense (and more efficient) to simply never + * allow resetting the alphabet. If type is Alphabet_type, + * silently return; else die with an alphabet mismatch + * warning. 
+ */ + if (Alphabet_type != hmmNOTSETYET) + { + if (type != Alphabet_type) + Die("An alphabet type conflict occurred.\nYou probably mixed a DNA seq file with a protein model, or vice versa."); + +#ifdef HMMER_THREADS + if ((rtn = pthread_mutex_unlock(&alphabet_lock)) != 0) + Die("pthread_mutex_unlock failure: %s\n", strerror(rtn)); +#endif + return; + } + + switch(type) { /* Alphabet is not a string - careful! */ + case hmmAMINO: + Alphabet_type = type; + strncpy(Alphabet, "ACDEFGHIKLMNPQRSTVWYBZX", 23); + Alphabet_size = 20; + Alphabet_iupac = 23; + for (x = 0; x < Alphabet_iupac; x++) { + memset(Degenerate[x], 0, Alphabet_size); + } + for (x = 0; x < Alphabet_size; x++) { + Degenerate[x][x] = 1; + DegenCount[x] = 1; + } + set_degenerate('B', "ND"); + set_degenerate('Z', "QE"); + set_degenerate('X', "ACDEFGHIKLMNPQRSTVWY"); + break; + case hmmNUCLEIC: + Alphabet_type = type; + strncpy(Alphabet, "ACGTUNRYMKSWHBVDX", 17); + Alphabet_size = 4; + Alphabet_iupac = 17; + for (x = 0; x < Alphabet_iupac; x++) { + memset(Degenerate[x], 0, Alphabet_size); + } + for (x = 0; x < Alphabet_size; x++) { + Degenerate[x][x] = 1; + DegenCount[x] = 1; + } + set_degenerate('U', "T"); + set_degenerate('N', "ACGT"); + set_degenerate('X', "ACGT"); + set_degenerate('R', "AG"); + set_degenerate('Y', "CT"); + set_degenerate('M', "AC"); + set_degenerate('K', "GT"); + set_degenerate('S', "CG"); + set_degenerate('W', "AT"); + set_degenerate('H', "ACT"); + set_degenerate('B', "CGT"); + set_degenerate('V', "ACG"); + set_degenerate('D', "AGT"); + break; + default: Die("No support for non-nucleic or protein alphabets"); + } + +#ifdef HMMER_THREADS + if ((rtn = pthread_mutex_unlock(&alphabet_lock)) != 0) + Die("pthread_mutex_unlock failure: %s\n", strerror(rtn)); +#endif +} + +/* Function: SymbolIndex() + * + * Purpose: Convert a symbol to its index in Alphabet[]. + * Bogus characters are converted to 'X'. + * More robust than the SYMIDX() macro but + * presumably slower. 
+ */ +int +SymbolIndex(char sym) +{ + char *s; + return ((s = strchr(Alphabet, (char) toupper((int) sym))) == NULL) ? + Alphabet_iupac-1 : s - Alphabet; +} + + +/* Function: DigitizeSequence() + * + * Purpose: Internal representation of a sequence in HMMER is + * as a char array. 1..L are the indices + * of seq symbols in Alphabet[]. 0,L+1 are sentinel + * bytes, set to be Alphabet_iupac -- i.e. one more + * than the maximum allowed index. + * + * Assumes that 'X', the fully degenerate character, + * is the last character in the allowed alphabet. + * + * Args: seq - sequence to be digitized (0..L-1) + * L - length of sequence + * + * Return: digitized sequence, dsq. + * dsq is allocated here and must be free'd by caller. + */ +char * +DigitizeSequence(char *seq, int L) +{ + char *dsq; + int i; + + dsq = MallocOrDie (sizeof(char) * (L+2)); + dsq[0] = dsq[L+1] = (char) Alphabet_iupac; + for (i = 1; i <= L; i++) + dsq[i] = SymbolIndex(seq[i-1]); + return dsq; +} + + +/* Function: DedigitizeSequence() + * Date: SRE, Tue Dec 16 10:39:19 1997 [StL] + * + * Purpose: Returns a 0..L-1 character string, converting the + * dsq back to the real alphabet. + */ +char * +DedigitizeSequence(char *dsq, int L) +{ + char *seq; + int i; + + seq = MallocOrDie(sizeof(char) * (L+1)); + for (i = 0; i < L; i++) + seq[i] = Alphabet[(int) dsq[i+1]]; + seq[L] = '\0'; + return seq; +} + + +/* Function: DigitizeAlignment() + * + * Purpose: Given an alignment, return digitized unaligned + * sequence array. (Tracebacks are always relative + * to digitized unaligned seqs, even if they are + * faked from an existing alignment in modelmakers.c.) + * + * Args: msa - alignment to digitize + * ret_dsqs - RETURN: array of digitized unaligned sequences + * + * Return: (void) + * dsqs is alloced here. Free2DArray(dseqs, nseq). 
+ */ +void +DigitizeAlignment(MSA *msa, char ***ret_dsqs) +{ + char **dsq; + int idx; /* counter for sequences */ + int dpos; /* position in digitized seq */ + int apos; /* position in aligned seq */ + + dsq = (char **) MallocOrDie (sizeof(char *) * msa->nseq); + for (idx = 0; idx < msa->nseq; idx++) { + dsq[idx] = (char *) MallocOrDie (sizeof(char) * (msa->alen+2)); + + dsq[idx][0] = (char) Alphabet_iupac; /* sentinel byte at start */ + + for (apos = 0, dpos = 1; apos < msa->alen; apos++) { + if (! isgap(msa->aseq[idx][apos])) /* skip gaps */ + dsq[idx][dpos++] = SymbolIndex(msa->aseq[idx][apos]); + } + dsq[idx][dpos] = (char) Alphabet_iupac; /* sentinel byte at end */ + } + *ret_dsqs = dsq; +} + + +/* Function: P7CountSymbol() + * + * Purpose: Given a possibly degenerate symbol code, increment + * a symbol counter array (generally an emission + * probability vector in counts form) appropriately. + * + * Args: counters: vector to count into. [0..Alphabet_size-1] + * symidx: symbol index to count: [0..Alphabet_iupac-1] + * wt: weight to use for the count; often 1.0 + * + * Return: (void) + */ +void +P7CountSymbol(float *counters, char symidx, float wt) +{ + int x; + + if (symidx < Alphabet_size) + counters[(int) symidx] += wt; + else + for (x = 0; x < Alphabet_size; x++) { + if (Degenerate[(int) symidx][x]) + counters[x] += wt / (float) DegenCount[(int) symidx]; + } +} + + +/* Function: DefaultGeneticCode() + * + * Purpose: Configure aacode, mapping triplets to amino acids. + * Triplet index: AAA = 0, AAC = 1, ... UUU = 63. + * AA index: alphabetical: A=0,C=1... Y=19 + * Stop codon: -1. + * Uses the stdcode1[] global translation table from SQUID. 
+ * + * Args: aacode - preallocated 0.63 array for genetic code + * + * Return: (void) + */ +void +DefaultGeneticCode(int *aacode) +{ + int x; + + for (x = 0; x < 64; x++) { + if (*(stdcode1[x]) == '*') aacode[x] = -1; + else aacode[x] = SYMIDX(*(stdcode1[x])); + } +} + + +/* Function: DefaultCodonBias() + * + * Purpose: Configure a codonbias table, mapping triplets to + * probability of using the triplet for the amino acid + * it represents: P(triplet | aa). + * The default is to assume codons are used equiprobably. + * + * Args: codebias: 0..63 array of P(triplet|aa), preallocated. + * + * Return: (void) + */ +void +DefaultCodonBias(float *codebias) +{ + codebias[0] = 1./2.; /* AAA Lys 2 */ + codebias[1] = 1./2.; /* AAC Asn 2 */ + codebias[2] = 1./2.; /* AAG Lys 2 */ + codebias[3] = 1./2.; /* AAU Asn 2 */ + codebias[4] = 1./4.; /* ACA Thr 4 */ + codebias[5] = 1./4.; /* ACC Thr 4 */ + codebias[6] = 1./4.; /* ACG Thr 4 */ + codebias[7] = 1./4.; /* ACU Thr 4 */ + codebias[8] = 1./6.; /* AGA Ser 6 */ + codebias[9] = 1./6.; /* AGC Arg 6 */ + codebias[10] = 1./6.; /* AGG Ser 6 */ + codebias[11] = 1./6.; /* AGU Arg 6 */ + codebias[12] = 1./3.; /* AUA Ile 3 */ + codebias[13] = 1./3.; /* AUC Ile 3 */ + codebias[14] = 1.; /* AUG Met 1 */ + codebias[15] = 1./3.; /* AUU Ile 3 */ + codebias[16] = 1./2.; /* CAA Gln 2 */ + codebias[17] = 1./2.; /* CAC His 2 */ + codebias[18] = 1./2.; /* CAG Gln 2 */ + codebias[19] = 1./2.; /* CAU His 2 */ + codebias[20] = 1./4.; /* CCA Pro 4 */ + codebias[21] = 1./4.; /* CCC Pro 4 */ + codebias[22] = 1./4.; /* CCG Pro 4 */ + codebias[23] = 1./4.; /* CCU Pro 4 */ + codebias[24] = 1./6.; /* CGA Arg 6 */ + codebias[25] = 1./6.; /* CGC Arg 6 */ + codebias[26] = 1./6.; /* CGG Arg 6 */ + codebias[27] = 1./6.; /* CGU Arg 6 */ + codebias[28] = 1./6.; /* CUA Leu 6 */ + codebias[29] = 1./6.; /* CUC Leu 6 */ + codebias[30] = 1./6.; /* CUG Leu 6 */ + codebias[31] = 1./6.; /* CUU Leu 6 */ + codebias[32] = 1./2.; /* GAA Glu 2 */ + codebias[33] = 1./2.; /* 
GAC Asp 2 */ + codebias[34] = 1./2.; /* GAG Glu 2 */ + codebias[35] = 1./2.; /* GAU Asp 2 */ + codebias[36] = 1./4.; /* GCA Ala 4 */ + codebias[37] = 1./4.; /* GCC Ala 4 */ + codebias[38] = 1./4.; /* GCG Ala 4 */ + codebias[39] = 1./4.; /* GCU Ala 4 */ + codebias[40] = 1./4.; /* GGA Gly 4 */ + codebias[41] = 1./4.; /* GGC Gly 4 */ + codebias[42] = 1./4.; /* GGG Gly 4 */ + codebias[43] = 1./4.; /* GGU Gly 4 */ + codebias[44] = 1./4.; /* GUA Val 4 */ + codebias[45] = 1./4.; /* GUC Val 4 */ + codebias[46] = 1./4.; /* GUG Val 4 */ + codebias[47] = 1./4.; /* GUU Val 4 */ + codebias[48] = 0.; /* UAA och - */ + codebias[49] = 1./2.; /* UAC Tyr 2 */ + codebias[50] = 0.; /* UAG amb - */ + codebias[51] = 1./2.; /* UAU Tyr 2 */ + codebias[52] = 1./6.; /* UCA Ser 6 */ + codebias[53] = 1./6.; /* UCC Ser 6 */ + codebias[54] = 1./6.; /* UCG Ser 6 */ + codebias[55] = 1./6.; /* UCU Ser 6 */ + codebias[56] = 0.; /* UGA opa - */ + codebias[57] = 1./2.; /* UGC Cys 2 */ + codebias[58] = 1.; /* UGG Trp 1 */ + codebias[59] = 1./2.; /* UGU Cys 2 */ + codebias[60] = 1./6.; /* UUA Leu 6 */ + codebias[61] = 1./2.; /* UUC Phe 2 */ + codebias[62] = 1./6.; /* UUG Leu 6 */ + codebias[63] = 1./2.; /* UUU Phe 2 */ +} + + + +/* Function: set_degenerate() + * + * Purpose: convenience function for setting up + * Degenerate[][] global for the alphabet. + */ +static void +set_degenerate(char iupac, char *syms) +{ + DegenCount[strchr(Alphabet,iupac)-Alphabet] = strlen(syms); + while (*syms) { + Degenerate[strchr(Alphabet,iupac)-Alphabet] + [strchr(Alphabet,*syms)-Alphabet] = 1; + syms++; + } +} diff --git a/forester/archive/RIO/others/hmmer/src/camJul97.c b/forester/archive/RIO/others/hmmer/src/camJul97.c new file mode 100644 index 0000000..e9b364f --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/camJul97.c @@ -0,0 +1,747 @@ +/* Source code from Cambridge visit July 1997 + * + * Position-specific matrices. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "funcs.h" +#include "config.h" +#include "structs.h" +#include "squid.h" + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + +/* Function: MakeStarHMM() + * + * Purpose: Given an HMM with counts, create an HMM according + * to the star rule. In star models we typically expect + * that the counts have been collected using BLOSUM style + * weights. + * + * Args: hmm - HMM structure containing counts data + * mx - Star vectors, mx[q][x] + * pq - vector prior P(q) + * nq - number of vectors + * pri - Dirichlet priors for other parameters + * + * Return: (void) + * hmm is converted to probabilities. + */ +void +MakeStarHMM(struct plan7_s *hmm, float **mx, float *pq, int nq, struct p7prior_s *pri) +{ + int k; /* counter over model position */ + int x; /* counter over symbol/transition */ + float *pxa; /* P(x | a) : our parameter estimate */ + float *pqa; /* P(q | a) for all q */ + int q; /* counters over vectors q */ + int ai; /* counter over symbols */ + + /* Match emissions: Star rule implementation. + */ + pxa = (float *) MallocOrDie(sizeof(float) * Alphabet_size); + pqa = (float *) MallocOrDie(sizeof(float) * nq); + for (k = 1; k <= hmm->M; k++) + { + /* calculate log P(q | a) unnormalized (i.e. + log P(a))*/ + for (q = 0; q < nq; q++) + { + pqa[q] = log(pq[q]); + for (ai = 0; ai < Alphabet_size; ai++) + pqa[q] += hmm->mat[k][ai] * log(mx[q][ai]); + } + /* calculate log P(x | a) unnormalized (i.e + log P(a))*/ + for (x = 0; x < Alphabet_size; x++) + { + pxa[x] = pqa[0] + log(mx[0][x]); + for (q = 1; q < nq; q++) + pxa[x] = LogSum(pxa[x], (pqa[q] + log(mx[q][x]))); + } + /* normalize now to get P(x|a) and store */ + LogNorm(pxa, Alphabet_size); + FCopy(hmm->mat[k], pxa, Alphabet_size); + } + + + /* Everything else is done according to P7PriorifyHMM() + */ + /* Model-dependent transitions are handled simply; Laplace. 
+ */ + FSet(hmm->begin+2, hmm->M-1, 0.); /* wipe internal BM entries */ + FSet(hmm->end+1, hmm->M-1, 0.); /* wipe internal ME exits */ + hmm->tbd1 += 1.0; + hmm->begin[1] += 1.0; + + /* Main model transitions and insert emissions + */ + for (k = 1; k < hmm->M; k++) + { + P7PriorifyTransitionVector(hmm->t[k], pri); + P7PriorifyEmissionVector(hmm->ins[k], pri, pri->inum, pri->iq, pri->i, NULL); + } + + Plan7Renormalize(hmm); + free(pxa); + free(pqa); + return; +} + + + + +#ifdef SRE_REMOVED +/* Function: MakeIslandHMM() + * + * Purpose: Given a sequence alignment of i = 1..nseq sequences, + * with columns j = 1..alen; and a sequence index idx + * to build the island from. Return a Plan7 island HMM in + * probability form. + * + * Args: aseqs - alignment + * ainfo - alignment info + * idx - index of which sequence to build island from + * null - random sequence model [0..Alphabet_size-1] + * mx - probability matrices mx[q][root b][x] + * bpri - priors on root distributions bpri[q][root b] + * qpri - prior probability distribution over matrices + * nmx - number of joint probability matrices + * + * Return: a new Plan7 HMM + */ +struct plan7_s * +MakeIslandHMM(char **aseqs, AINFO *ainfo, int idx, + float null[MAXABET], float ***mx, float **bpri, + float *qpri, int nmx) +{ + struct plan7_s *hmm; /* RETURN: Plan7 HMM */ + int j; /* column position index */ + int k; /* model position index */ + int q; /* counter for matrices */ + int x; /* counter for symbols */ + float *mat; /* a match emission probability vector */ + float **probq; /* posterior P(q | column) */ + int sym; /* index of a symbol in alphabet */ + float max; + int qmax; + float **pxaq; /* P(x | a,q) vectors, [q][x] */ + int b; /* counter over root symbols */ + + /* Allocate a model which is the length of the + * raw sequence. 
+ */ + hmm = AllocPlan7(DealignedLength(aseqs[idx])); + if (ainfo->sqinfo[idx].flags & SQINFO_NAME) + Plan7SetName(hmm, ainfo->sqinfo[idx].name); + if (ainfo->sqinfo[idx].flags & SQINFO_DESC) + Plan7SetDescription(hmm, ainfo->sqinfo[idx].desc); + Plan7SetNullModel(hmm, null, 350./351.); /* p1 made up; shouldn't matter*/ + + mat = (float *) MallocOrDie( sizeof(float) * Alphabet_size); + pxaq = FMX2Alloc(nmx, Alphabet_size); + + /* Calculate the posterior probability distribution + * probq (= P(q | col)) over nmx different matrices + * at each column j -- probq[0..alen-1][0..nmx-1]; + * currently does not use the prior on q, but does a + * winner-take-all rule. + */ + probq = FMX2Alloc(ainfo->alen, nmx); + calc_probq(aseqs, ainfo, mx, bpri, qpri, nmx, probq); + + /* Debugging + */ + print_probq(stdout, probq, ainfo->alen, nmx); + + for (k = 1, j = 0; j < ainfo->alen; j++) + { + if (isgap(aseqs[idx][j])) continue; + + if (strchr(Alphabet, aseqs[idx][j]) != NULL) + sym = SYMIDX(aseqs[idx][j]); + else + Die("MakeIslandHMM() can't handle ambiguous query symbols yet"); + + + /* Calculate P(x | a, q) emission vectors for all matrices q + */ + for (q = 0; q < nmx; q++) + { + for (x = 0; x < Alphabet_size; x++) + { + pxaq[q][x] = 0.0; + for (b = 0; b < 20; b++) + pxaq[q][x] += mx[q][b][x] * mx[q][b][sym] * bpri[q][b]; + } + FNorm(pxaq[q], Alphabet_size); + } + + /* Sum P(x | a, q) emission vectors over matrices q: + * P(x | a, col) = \sum_q P(x | a, q, col) P(q | a, col) + * = \sum_q P(x | a, q) P(q | col) + */ + for (x = 0; x < Alphabet_size; x++) + { + hmm->mat[k][x] = 0.; + for (q = 0; q < nmx; q++) + hmm->mat[k][x] += probq[j][q] * pxaq[q][x]; + if (k < hmm->M) + hmm->ins[k][x] = null[x]; + } + + /* Reference annotation on columns: most probable matrix + */ + max = -FLT_MAX; + for (q = 0; q < nmx; q++) + if (probq[j][q] > max) { qmax = q; max = probq[j][q]; } + hmm->rf[k] = 'a'+(char)qmax; /* q > 9, so convert to char a-z*/ + + /* Consensus annotation on columns: 
original sequence. + */ + hmm->cs[k] = aseqs[idx][j]; + + k++; + } + + /* State transitions are set subjectively + */ + hmm->tbd1 = 0.02; + for (k = 1; k < hmm->M; k++) + { + hmm->t[k][TMM] = 0.97; + hmm->t[k][TMI] = 0.02; + hmm->t[k][TMD] = 0.01; + hmm->t[k][TIM] = 0.20; + hmm->t[k][TII] = 0.80; + hmm->t[k][TDM] = 0.90; + hmm->t[k][TDD] = 0.10; + } + + hmm->flags |= PLAN7_HASPROB | PLAN7_RF | PLAN7_CS; + + FMX2Free(pxaq); + FMX2Free(probq); + free(mat); + return hmm; +} +#endif + + +/* Function: ReadGJMMatrices() + * + * Purpose: Read GJM's file format for star-based mixture matrices. + * Very first line is nq. + * First line of a set is P(q), the prior of the matrix. + * Second line contains P(b|q), the prior of the root symbols, + * _in arbitrary order_ (the root distribution is not over AA's!) + * Third line is blank. + * Next 20 lines give a 20x20 matrix of conditional probabilities; + * rows = root symbols b; cols = leaf symbols x; + * mx[row][col] = P(x | b). + * + * Instead of storing as matrices, store as q x r vectors. + * + * Return: (void) + * mx, pq, nq are returned via passed pointers. + * Caller must free FMX2Free(mx) + * Caller must free(pq). 
+ */ +void +ReadGJMMatrices(FILE *fp, float ***ret_mx, float **ret_pq, int *ret_nq) +{ + float **mx; /* conditional p's [0..nq-1][0..19] */ + float *pq; /* priors on vectors, [0..nq-1] */ + int nq, nr; /* number of matrices, rows */ + char buf[2048]; + float tmppq; /* prior for matrix */ + int q,r; /* counter for matrices, rows */ + int x; /* counter for symbols */ + char *s; /* tmp pointer into buf */ + + + /* allocations */ + if (fgets(buf, 2048, fp) == NULL) Die("read failed"); + nr = 20; + nq = atoi(buf); + mx = FMX2Alloc(nq*nr, 20); + pq = (float *) MallocOrDie (nq*nr * sizeof(float)); + + /* parse matrices */ + for (q = 0; q < nq; q++) + { + if (fgets(buf, 2048, fp) == NULL) Die("parse failed"); + tmppq = atof(buf); + + if (fgets(buf, 2048, fp) == NULL) Die("parse failed"); + s = strtok(buf, "\n\t "); + for (r = 0; r < nr; r++) + { + pq[q*nr + r] = atof(s) * tmppq; + s = strtok(NULL, "\n\t "); + } + if (fgets(buf, 2048, fp) == NULL) Die("parse failed"); + + for (r = 0; r < 20; r++) + { + if (fgets(buf, 2048, fp) == NULL) Die("parse failed"); + s = strtok(buf, "\n\t "); + for (x = 0; x < 20; x++) + { + mx[q*nr+r][x] = atof(s); + s = strtok(NULL, "\n\t "); + } + } + /* two blank lines */ + if (fgets(buf, 2048, fp) == NULL) Die("parse failed"); + if (fgets(buf, 2048, fp) == NULL) Die("parse failed"); + } + + *ret_mx = mx; + *ret_pq = pq; + *ret_nq = nq*nr; + return; +} + + +#ifdef SRE_REMOVED +/* Function: OldReadGJMMatrices() + * + * Purpose: Read GJM's file format for joint probability matrix sets. + * + * Return: (void) + * mx, qprior, nmx are returned via passed pointers. + * Caller must free mx: each matrix by FMX2Free(), then free(mx). + * Caller must also free(qprior). 
+ */ +void +OldReadGJMMatrices(FILE *fp, float ****ret_mx, float **ret_qprior, int *ret_nmx) +{ + float ***mx; /* joint prob matrix [0..nmx-1][0..19][0..19] */ + float *qprior; /* priors on matrices, [0..nmx-1] */ + int nmx; /* number of matrices */ + char buf[2048]; + int q; /* counter for matrices */ + int idx; /* index for this matrix seen in file */ + int r,c; /* counter for row, column */ + char *s; /* tmp pointer into buf */ + + /* pass one: count matrices */ + nmx = 0; + while (fgets(buf, 2048, fp) != NULL) + if (Strparse("use [0-9]+ = .+", buf, 0) == 0) + nmx++; + rewind(fp); + /* allocations */ + qprior = (float *) MallocOrDie (20 * sizeof(float)); + mx = (float ***) MallocOrDie (nmx * sizeof(float **)); + for (q = 0; q < nmx; q++) + mx[q] = FMX2Alloc(20, 20); + + /* pass two: parse matrices */ + q = 0; + while (fgets(buf, 2048, fp) != NULL) + { + if (Strparse("use ([0-9]+) = (.+)", buf, 2) != 0) + continue; + idx = atoi(sqd_parse[1]); + qprior[q] = atof(sqd_parse[2]); + + /* skip two lines in his new format */ + if (fgets(buf, 2048, fp) == NULL) Die("ReadGJMMatrices(): parse failed"); + if (fgets(buf, 2048, fp) == NULL) Die("ReadGJMMatrices(): parse failed"); + + for (r = 0; r < 20; r++) + { + if (fgets(buf, 2048, fp) == NULL) + Die("ReadGJMMatrices(): parse failed"); + s = strtok(buf, "\n\t "); + for (c = 0; c < 20; c++) + { + mx[q][r][c] = atof(s); + s = strtok(NULL, "\n\t "); + } + } + q++; + } + + *ret_mx = mx; + *ret_qprior = qprior; + *ret_nmx = nmx; + return; +} + +/* Function: OldPrintGJMMatrix() + * + * Purpose: (debugging, basically): print out Graeme's + * joint probability matrices in log odds integer form. + * + */ +void +OldPrintGJMMatrix(FILE *fp, float **jmx, float *rnd, int N) +{ + int r, c; + + fprintf(fp, " "); + for (c = 0; c < N; c++) + fprintf(fp, " %c ", Alphabet[c]); + fprintf(fp, "\n"); + + for (r = 0; r < N; r++) + { + fprintf(fp, "%c ", Alphabet[r]); + for (c = 0; c < N; c++) + fprintf(fp, "%3d ", + (int) (10. 
* sreLOG2(jmx[r][c] / (rnd[r] * rnd[c])))); + fprintf(fp, "\n"); + } +} +#endif /* SRE_REMOVED*/ + +/* Function: Joint2SubstitutionMatrix() + * + * Purpose: Convert a joint probability matrix to a substitution + * matrix. + * + * Convention here for substitution matrices is + * smx[r][c] = r->c = P(c|r). + * + * We obtain the substitution matrix from the following logic: + * P(rc) = P(c|r) P(r); + * P(r) = \sum_c P(rc); + * thus P(c|r) = P(rc) / \sum_c P(rc) + * + * Args: jmx - NxN P(rc) joint probability matrix + * smx - NxN P(c|r) substitution matrix, alloced in caller + * N - size of matrices; typically Alphabet_size + * + * Return: (void) + * smx is filled in. + */ +void +Joint2SubstitutionMatrix(float **jmx, float **smx, int N) +{ + float pr; /* P(r) = \sum_c P(rc) */ + int r,c; /* counters for rows, columns */ + + for (r = 0; r < N; r++) + { + for (pr = 0., c = 0; c < N; c++) + pr += jmx[r][c]; + for (c = 0; c < N; c++) + smx[r][c] = jmx[r][c] / pr; + } +} + + +#ifdef SRE_REMOVED +/* Function: BlosumWeights() + * + * Purpose: Assign weights to a set of aligned sequences + * using the BLOSUM rule: + * - do single linkage clustering at some pairwise identity + * - in each cluster, give each sequence 1/clustsize + * total weight. + * + * Args: aseqs - alignment + * N - number of seqs in alignment + * maxid - fractional identity (e.g. 
0.62 for BLOSUM62) + * clust - [0..nseq-1] vector of cluster assignments, filled here (or NULL) + * ret_nc - total number of clusters found (or pass NULL) + */ +void +BlosumWeights(char **aseqs, AINFO *ainfo, float maxid, int *clust,int *ret_nc) +{ + float **dmx; /* difference matrix */ + struct phylo_s *tree; /* UPGMA tree */ + float mindiff; /* minimum distance between clusters */ + int c; /* counter for clusters */ + struct intstack_s *stack; + int node; + int i; + + mindiff = 1.0 - maxid; + /* first we do a difference matrix */ + MakeDiffMx(aseqs, ainfo->nseq, &dmx); + /* then we build a tree */ + Cluster(dmx, ainfo->nseq, CLUSTER_MIN, &tree); + + /* Find clusters below mindiff. + * The rule is: + * -traverse the tree + * -if the parent is > mindiff and current < mindiff, then + * make current node a cluster. + */ + for (i = 0; i < ainfo->nseq; i++) + { + ainfo->sqinfo[i].weight = 1.0; + ainfo->sqinfo[i].flags |= SQINFO_WGT; + } + + stack = InitIntStack(); + PushIntStack(stack, 0); /* push root on stack to start */ + c = 0; + while (PopIntStack(stack, &node)) + { + if ((node == 0 || tree[tree[node].parent-ainfo->nseq].diff > mindiff) && + tree[node].diff < mindiff) + { /* we're at a cluster */ + for (i = 0; i < ainfo->nseq; i++) + if (tree[node].is_in[i]) + { + ainfo->sqinfo[i].weight = 1.0 / (float) tree[node].incnum; + if (clust != NULL) clust[i] = c; + } + c++; + } + else /* we're not a cluster, keep traversing */ + { + if (tree[node].right >= ainfo->nseq) + PushIntStack(stack, tree[node].right - ainfo->nseq); + else + { + c++; + if (clust != NULL) clust[tree[node].right] = c; /* single seq, wgt 1.0 */ + } + + if (tree[node].left >= ainfo->nseq) + PushIntStack(stack, tree[node].left - ainfo->nseq); + else + { + c++; + if (clust != NULL) clust[tree[node].left] = c; + } + } + } + FreeIntStack(stack); + FreePhylo(tree, ainfo->nseq); + FMX2Free(dmx); + if (ret_nc != NULL) *ret_nc = c; + return; +} +#endif + + +#ifdef SRE_REMOVED +/* Function: calc_probq() + * + 
* Purpose: Calculate the posterior probability distribution + * P(q | a_j) for every column j in the alignment + * and every matrix choice q. + * + * Probabilistic, based on a star topology. + * Uses a BLOSUM-like rule to cluster the sequences in + * the alignment into groups with some seq identity (62%). + * Finds the consensus (majority rule) residue in + * each cluster as the representative. + * Then P(q | col) comes by Bayes: + * = (P(col | q) P(q) / Z + * where the likelihood + * P(col | q) = \sum_b [\prod_i P(a_i | q,b)] P(b | q) + * log P(col | q) = \logsum_b P(b|q) + \sum_i \log(P(a_i | q,b)) + * + * Args: aseqs - alignment + * ainfo - optional info for alignment + * mx - conditional probability matrices [0..nmx-1][root b][x] + * bprior- root priors [0..nmx-1][root b] + * qprior- prior prob distribution over matrices + * nmx - number of matrices + * probq - RETURN: posterior probabilities, [0..alen-1][0..nmx-1] + * alloc'ed in called, filled in here. + * + * Return: (void) + * probq is filled in. 
+ */ +static void +calc_probq(char **aseqs, AINFO *ainfo, float ***mx, float **bprior, + float *qprior, int nmx, float **probq) +{ + int q; /* counter over matrices */ + int a1; /* counter over sequences */ + int j; /* counter over columns */ + int *clust; /* assignment of seqs to clusters 0..nseq-1 */ + int nclust; /* number of clusters */ + float *wgt; /* weights on seqs, 0..nseq-1 */ + int *sym; /* symbol indices in a column */ + float obs[MAXABET]; /* number of symbols observed in a column */ + int i, x; + float maxc; + float ngap; + float bterm[20]; /* intermediate in calculation, over root b's */ + int b; /* counter over root symbols */ + + /* Use the BLOSUM rule to calculate weights and clusters + * for sequences in the alignment + */ + wgt = (float *) MallocOrDie (sizeof(float) * ainfo->nseq); + clust = (int *) MallocOrDie (sizeof(int) * ainfo->nseq); + BlosumWeights(aseqs, ainfo, 0.62, clust, wgt, &nclust); + + /* Use the BLOSUM rule to calculate a "likelihood" function + * P(column | q) for each column. + */ + sym = (int *) MallocOrDie (sizeof(int) * nclust); + for (j = 0; j < ainfo->alen; j++) + { + /* Find majority rule symbols in this col */ + for (i = 0; i < nclust; i++) + { + FSet(obs, Alphabet_size, 0.); + ngap = 0.; + for (a1 = 0; a1 < ainfo->nseq; a1++) + if (clust[a1] == i) + if (isgap(aseqs[a1][j])) ngap += 0.; + else P7CountSymbol(obs, SymbolIndex(aseqs[a1][j]), 1.0); + + maxc = -1.; + for (x = 0; x < Alphabet_size; x++) + if (obs[x] > maxc) { maxc = obs[x]; sym[i] = x; } + /* either if no symbols observed, or more gaps than syms: */ + if (ngap >= maxc) sym[i] = -1; + } + /* Calculate log likelihood + log prior */ + for (q = 0; q < nmx; q++) + { + for (b = 0; b < 20; b++) + { + bterm[b] = bprior[q][b]; + for (i = 0; i < nclust; i++) + if (sym[i] >= 0) + bterm[b] += log(mx[q][b][sym[i]]); + } + probq[j][q] = log(qprior[q]) + FLogSum(bterm, 20); + } + LogNorm(probq[j], nmx); /* normalize -> gives posterior. 
*/ + } + free(sym); + free(wgt); + free(clust); +} + + +/* Function: old_calc_probq() OBSOLETE VERSION + * + * Purpose: Calculate the posterior probability distribution + * P(q | a_j) for every column j in the alignment + * and every matrix choice q. + * + * Non-probabilistic. Uses a BLOSUM-like rule to + * find the single best matrix for a column, then + * assigns it a posterior of 1.0. + * + * This was version 1: a competitive learning rule, + * posterior either 1.0 or 0.0. + * + * Args: aseqs - alignment + * ainfo - optional info for alignment + * jmx - *joint* probability matrices [0..nmx-1][0..19][0..19] + * qprior- prior prob distribution over matrices [UNUSED] + * nmx - number of matrices + * probq - RETURN: posterior probabilities, [0..alen-1][0..nmx-1] + * alloc'ed in called, filled in here. + * + * Return: (void) + * probq is filled in. + */ +static void +old_calc_probq(char **aseqs, AINFO *ainfo, float ***jmx, float *qprior, + int nmx, float **probq) +{ + int q; /* counter over matrices */ + int a1, a2; /* counters over sequences */ + int j; /* counter over columns */ + float x; /* BLOSUM-style objective function */ + float maxx; /* maximum x so far */ + int maxq; /* maximum q so far */ + int *clust; /* assignment of seqs to clusters 0..nseq-1 */ + int nclust; /* number of clusters */ + float *wgt; /* weights on seqs, 0..nseq-1 */ + int *sym; /* symbol indices in a column */ + + + /* Use the BLOSUM rule to calculate weights and clusters + * for sequences in the alignment + */ + wgt = (float *) MallocOrDie (sizeof(float) * ainfo->nseq); + clust = (int *) MallocOrDie (sizeof(int) * ainfo->nseq); + BlosumWeights(aseqs, ainfo, 0.62, clust, wgt, &nclust); + + /* Use the BLOSUM rule to calculate a "likelihood" function + * P(column | q) for each column. 
+ */ + sym = (int *) MallocOrDie (sizeof(int) * ainfo->nseq); + for (j = 0; j < ainfo->alen; j++) + { + for (a1 = 0; a1 < ainfo->nseq; a1++) + if (!isgap(aseqs[a1][j]) && + strchr(Alphabet, aseqs[a1][j]) != NULL) + { + sym[a1] = SYMIDX(aseqs[a1][j]); + if (sym[a1] >= Alphabet_size) sym[a1] = -1; /* no degenerates */ + } + else sym[a1] = -1; + + maxx = -FLT_MAX; + for (q = 0; q < nmx; q++) + { + x = 0.; + for (a1 = 0; a1 < ainfo->nseq; a1++) + for (a2 = 0; a2 < ainfo->nseq; a2++) + if (sym[a1] >= 0 && sym[a2] >= 0 && clust[a1] != clust[a2]) + x += wgt[a1] * wgt[a2] * log(jmx[q][sym[a1]][sym[a2]]); + +#ifdef SRE_REMOVED + printf("%% col %3d mx %c x = %f\n", + j+1, 'a'+(char)q, x); +#endif + + if (x > maxx) + { + maxx = x; + maxq = q; + } + } + FSet(probq[j], nmx, 0.0); + probq[j][maxq] = 1.0; /* winner-take-all rule */ + } + + free(sym); + free(wgt); + free(clust); +} + + +/* Function: print_probq() + * + * Purpose: Debugging output. + * probq is the posterior probability P(q | column) of + * a matrix q given an observed alignment column. + * Indexed probq[0..alen-1][0..nmx-1]. 
+ */ +static void +print_probq(FILE *fp, float **probq, int alen, int nmx) +{ + int c; /* counter for columns */ + int q; /* counter for matrices */ + + fputs("### probq debugging output\n", fp); + fputs(" ", fp); + for (q = 0; q < nmx; q++) + fprintf(fp, " %c ", 'a'+(char)q); + fputs("\n", fp); + + for (c = 0; c < alen; c++) + { + fprintf(fp, "%4d ", c); + for (q = 0; q < nmx; q++) + fprintf(fp, "%5.3f ", probq[c][q]); + fputs("\n", fp); + } +} +#endif diff --git a/forester/archive/RIO/others/hmmer/src/config.h b/forester/archive/RIO/others/hmmer/src/config.h new file mode 100644 index 0000000..fb89df2 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/config.h @@ -0,0 +1,52 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* config.h + * + * Configurable compile-time parameters in HMMER. + */ + +#ifndef CONFIGH_INCLUDED +#define CONFIGH_INCLUDED + +/* RAMLIMIT determines the point at which we switch from fast, + * full dynamic programming to slow, linear-memory divide and conquer + * dynamic programming algorithms. It is the minimum amount of available + * RAM on the systems the package will run on. It can be overridden + * from the Makefile. + * By default, we assume we have 32 Mb RAM available (per thread). + */ +#ifndef RAMLIMIT +#define RAMLIMIT 32 +#endif + +/* HMMER_NCPU determines the number of threads/processors that + * a threads version will parallelize across. This can be overridden + * by -DHMMER_NCPU=x in the Makefile, and by a setenv HMMER_NCPU x + * in the environment, and usually by a command line option. 
+ * Usually we detect the number of processors dynamically, but + * on some systems (FreeBSD and Linux, notably), we can't. On + * these systems we assume 2 processors by default. That assumption + * can be overridden here if HMMER_NCPU is uncommented. + */ +/* #define HMMER_NCPU 4 */ + +#define INTSCALE 1000.0 /* scaling constant for floats to integer scores */ +#define MAXABET 20 /* maximum size of alphabet (4 or 20) */ +#define MAXCODE 23 /* maximum degenerate alphabet size (17 or 23) */ +#define MAXDCHLET 200 /* maximum # Dirichlet components in mixture prior */ +#define NINPUTS 4 /* number of inputs into structural prior */ +#define INFTY 987654321 /* infinity for purposes of integer DP cells */ +#define NXRAY 4 /* number of structural inputs */ +#define LOGSUM_TBL 20000 /* controls precision of ILogsum() */ +#define ALILENGTH 50 /* length of displayed alignment lines */ + +#endif /*CONFIGH_INCLUDED*/ + diff --git a/forester/archive/RIO/others/hmmer/src/core_algorithms.c b/forester/archive/RIO/others/hmmer/src/core_algorithms.c new file mode 100644 index 0000000..b4fc349 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/core_algorithms.c @@ -0,0 +1,2445 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* core_algorithms.c + * SRE, Mon Nov 11 15:58:52 1996 + * CVS $Id: core_algorithms.c,v 1.1.1.1 2005/03/22 08:34:11 cmzmasek Exp $ + * + * Simple and robust "research" implementations of Forward, Backward, + * and Viterbi for Plan7. 
+ */ + +#include "structs.h" +#include "config.h" +#include "funcs.h" +#include "squid.h" + +#include +#include + +static float get_wee_midpt(struct plan7_s *hmm, char *dsq, int L, + int k1, char t1, int s1, + int k3, char t3, int s3, + int *ret_k2, char *ret_t2, int *ret_s2); + + +/* Function: AllocPlan7Matrix() + * + * Purpose: Allocate a dynamic programming matrix for standard Forward, + * Backward, or Viterbi, with scores kept as scaled log-odds + * integers. Keeps 2D arrays compact in RAM in an attempt + * to maximize cache hits. Sets up individual ptrs to the + * four matrix components as a convenience. + * + * Args: rows - number of rows to allocate; typically L+1 + * M - size of model + * xmx, mmx, imx, dmx + * - RETURN: ptrs to four mx components as a convenience + * + * Return: mx + * mx is allocated here. Caller frees with FreeDPMatrix(mx). + */ + +struct dpmatrix_s * +AllocPlan7Matrix(int rows, int M, int ***xmx, int ***mmx, int ***imx, int ***dmx) +{ + struct dpmatrix_s *mx; + int i; + + mx = (struct dpmatrix_s *) MallocOrDie (sizeof(struct dpmatrix_s)); + mx->xmx = (int **) MallocOrDie (sizeof(int *) * rows); + mx->mmx = (int **) MallocOrDie (sizeof(int *) * rows); + mx->imx = (int **) MallocOrDie (sizeof(int *) * rows); + mx->dmx = (int **) MallocOrDie (sizeof(int *) * rows); + mx->xmx[0] = (int *) MallocOrDie (sizeof(int) * (rows*5)); + mx->mmx[0] = (int *) MallocOrDie (sizeof(int) * (rows*(M+2))); + mx->imx[0] = (int *) MallocOrDie (sizeof(int) * (rows*(M+2))); + mx->dmx[0] = (int *) MallocOrDie (sizeof(int) * (rows*(M+2))); + for (i = 1; i < rows; i++) + { + mx->xmx[i] = mx->xmx[0] + (i*5); + mx->mmx[i] = mx->mmx[0] + (i*(M+2)); + mx->imx[i] = mx->imx[0] + (i*(M+2)); + mx->dmx[i] = mx->dmx[0] + (i*(M+2)); + } + + if (xmx != NULL) *xmx = mx->xmx; + if (mmx != NULL) *mmx = mx->mmx; + if (imx != NULL) *imx = mx->imx; + if (dmx != NULL) *dmx = mx->dmx; + return mx; +} + +/* Function: FreePlan7Matrix() + * + * Purpose: Free a dynamic programming matrix 
allocated by AllocPlan7Matrix(). + * + * Return: (void) + */ +void +FreePlan7Matrix(struct dpmatrix_s *mx) +{ + free (mx->xmx[0]); + free (mx->mmx[0]); + free (mx->imx[0]); + free (mx->dmx[0]); + free (mx->xmx); + free (mx->mmx); + free (mx->imx); + free (mx->dmx); + free (mx); +} + +/* Function: AllocShadowMatrix() + * + * Purpose: Allocate a dynamic programming traceback pointer matrix for + * a Viterbi algorithm. + * + * Args: rows - number of rows to allocate; typically L+1 + * M - size of model + * xtb, mtb, itb, dtb + * - RETURN: ptrs to four mx components as a convenience + * + * Return: mx + * mx is allocated here. Caller frees with FreeDPMatrix(mx). + */ + +struct dpshadow_s * +AllocShadowMatrix(int rows, int M, char ***xtb, char ***mtb, char ***itb, char ***dtb) +{ + struct dpshadow_s *tb; + int i; + + tb = (struct dpshadow_s *) MallocOrDie (sizeof(struct dpshadow_s)); + tb->xtb = (char **) MallocOrDie (sizeof(char *) * rows); + tb->mtb = (char **) MallocOrDie (sizeof(char *) * rows); + tb->itb = (char **) MallocOrDie (sizeof(char *) * rows); + tb->dtb = (char **) MallocOrDie (sizeof(char *) * rows); + tb->esrc = (int *) MallocOrDie (sizeof(int) * rows); + tb->xtb[0] = (char *) MallocOrDie (sizeof(char) * (rows*5)); + tb->mtb[0] = (char *) MallocOrDie (sizeof(char) * (rows*(M+2))); + tb->itb[0] = (char *) MallocOrDie (sizeof(char) * (rows*(M+2))); + tb->dtb[0] = (char *) MallocOrDie (sizeof(char) * (rows*(M+2))); + for (i = 1; i < rows; i++) + { + tb->xtb[i] = tb->xtb[0] + (i*5); + tb->mtb[i] = tb->mtb[0] + (i*(M+2)); + tb->itb[i] = tb->itb[0] + (i*(M+2)); + tb->dtb[i] = tb->dtb[0] + (i*(M+2)); + } + + if (xtb != NULL) *xtb = tb->xtb; + if (mtb != NULL) *mtb = tb->mtb; + if (itb != NULL) *itb = tb->itb; + if (dtb != NULL) *dtb = tb->dtb; + return tb; +} + +/* Function: FreeShadowMatrix() + * + * Purpose: Free a dynamic programming matrix allocated by AllocShadowMatrix(). 
+ * + * Return: (void) + */ +void +FreeShadowMatrix(struct dpshadow_s *tb) +{ + free (tb->xtb[0]); + free (tb->mtb[0]); + free (tb->itb[0]); + free (tb->dtb[0]); + free (tb->esrc); + free (tb->xtb); + free (tb->mtb); + free (tb->itb); + free (tb->dtb); + free (tb); +} + +/* Function: P7ViterbiSize() + * Date: SRE, Fri Mar 6 15:13:20 1998 [St. Louis] + * + * Purpose: Returns the ballpark predicted memory requirement for a + * P7Viterbi() alignment, in MB. + * + * Currently L must fit in an int (< 2 GB), but we have + * to deal with LM > 2 GB - e.g. watch out for overflow, do + * the whole calculation in floating point. Bug here detected + * in 2.1.1 by David Harper, Sanger Centre. + * + * Args: L - length of sequence + * M - length of HMM + * + * Returns: # of MB + */ +int +P7ViterbiSize(int L, int M) +{ + float Mbytes; + + /* We're excessively precise here, but it doesn't cost + * us anything to be pedantic. The four terms are: + * 1. the matrix structure itself; + * 2. the O(NM) main matrix (this dominates!) + * 3. ptrs into the rows of the matrix + * 4. storage for 5 special states. (xmx) + */ + Mbytes = (float) sizeof(struct dpmatrix_s); + Mbytes += 3. * (float) (L+1) * (float) (M+2) * (float) sizeof(int); + Mbytes += 4. * (float) (L+1) * (float) sizeof(int *); + Mbytes += 5. * (float) (L+1) * (float) sizeof(int); + Mbytes /= 1048576.; + return (int) Mbytes; +} + +/* Function: P7SmallViterbiSize() + * Date: SRE, Fri Mar 6 15:20:04 1998 [St. Louis] + * + * Purpose: Returns the ballpark predicted memory requirement for + * a P7SmallViterbi() alignment, in MB. + * + * P7SmallViterbi() is a wrapper, calling both P7ParsingViterbi() + * and P7WeeViterbi(). P7ParsingViterbi() typically dominates + * the memory requirement, so the value returned + * is the P7ParsingViterbi() number. + * + * We don't (yet) worry about overflow issues like we did with + * P7ViterbiSize(). We'll have many other 32-bit int issues in the + * code if we overflow here. 
+ * + * Args: L - length of sequence + * M - length of HMM + * + * Returns: # of MB + */ +int +P7SmallViterbiSize(int L, int M) +{ + return ((2 * sizeof(struct dpmatrix_s) + + 12 * (M+2) * sizeof(int) + /* 2 matrices w/ 2 rows */ + 16 * sizeof(int *) + /* ptrs into rows of matrix */ + 20 * sizeof(int) + /* 5 special states */ + 2 * (L+1) * sizeof(int)) /* traceback indices */ + / 1000000); +} + + +/* Function: P7WeeViterbiSize() + * Date: SRE, Fri Mar 6 15:40:42 1998 [St. Louis] + * + * Purpose: Returns the ballpark predicted memory requirement for + * a P7WeeViterbi() alignment, in MB. + * + * Args: L - length of sequence + * M - length of HMM + * + * Returns: # of MB + */ +int +P7WeeViterbiSize(int L, int M) +{ + return ((2 * sizeof(struct dpmatrix_s) + + 12 * (M+2) * sizeof(int) + /* 2 matrices w/ 2 rows */ + 16 * sizeof(int *) + /* ptrs into rows of matrix */ + 20 * sizeof(int) + /* 5 special states */ + 2 * (L+2) * sizeof(int) + /* stacks for starts/ends (overkill) */ + (L+2) * sizeof(int) + /* k assignments to seq positions */ + (L+2) * sizeof(char)) /* state assignments to seq pos */ + / 1000000); +} + + +/* Function: P7Forward() + * + * Purpose: The Forward dynamic programming algorithm. + * The scaling issue is dealt with by working in log space + * and calling ILogsum(); this is a slow but robust approach. + * + * Args: dsq - sequence in digitized form + * L - length of dsq + * hmm - the model + * ret_mx - RETURN: dp matrix; pass NULL if it's not wanted + * + * Return: log P(S|M)/P(S|R), as a bit score. + */ +float +P7Forward(char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s **ret_mx) +{ + struct dpmatrix_s *mx; + int **xmx; + int **mmx; + int **imx; + int **dmx; + int i,k; + int sc; + + /* Allocate a DP matrix with 0..L rows, 0..M-1 columns. + */ + mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx); + + /* Initialization of the zero row. 
+ * Note that xmx[i][stN] = 0 by definition for all i, + * and xmx[i][stT] = xmx[i][stC], so neither stN nor stT need + * to be calculated in DP matrices. + */ + xmx[0][XMN] = 0; /* S->N, p=1 */ + xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */ + xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */ + for (k = 0; k <= hmm->M; k++) + mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */ + + /* Recursion. Done as a pull. + * Note some slightly wasteful boundary conditions: + * tsc[0] = -INFTY for all eight transitions (no node 0) + * D_M and I_M are wastefully calculated (they don't exist) + */ + for (i = 1; i <= L; i++) + { + mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY; + for (k = 1; k < hmm->M; k++) + { + mmx[i][k] = ILogsum(ILogsum(mmx[i-1][k-1] + hmm->tsc[k-1][TMM], + imx[i-1][k-1] + hmm->tsc[k-1][TIM]), + ILogsum(xmx[i-1][XMB] + hmm->bsc[k], + dmx[i-1][k-1] + hmm->tsc[k-1][TDM])); + mmx[i][k] += hmm->msc[(int) dsq[i]][k]; + + dmx[i][k] = ILogsum(mmx[i][k-1] + hmm->tsc[k-1][TMD], + dmx[i][k-1] + hmm->tsc[k-1][TDD]); + imx[i][k] = ILogsum(mmx[i-1][k] + hmm->tsc[k][TMI], + imx[i-1][k] + hmm->tsc[k][TII]); + imx[i][k] += hmm->isc[(int) dsq[i]][k]; + } + mmx[i][hmm->M] = ILogsum(ILogsum(mmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TMM], + imx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TIM]), + ILogsum(xmx[i-1][XMB] + hmm->bsc[hmm->M-1], + dmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TDM])); + mmx[i][hmm->M] += hmm->msc[(int) dsq[i]][hmm->M]; + + /* Now the special states. 
+ * remember, C and J emissions are zero score by definition + */ + xmx[i][XMN] = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]; + + xmx[i][XME] = -INFTY; + for (k = 1; k <= hmm->M; k++) + xmx[i][XME] = ILogsum(xmx[i][XME], mmx[i][k] + hmm->esc[k]); + + xmx[i][XMJ] = ILogsum(xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP], + xmx[i][XME] + hmm->xsc[XTE][LOOP]); + + xmx[i][XMB] = ILogsum(xmx[i][XMN] + hmm->xsc[XTN][MOVE], + xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]); + + xmx[i][XMC] = ILogsum(xmx[i-1][XMC] + hmm->xsc[XTC][LOOP], + xmx[i][XME] + hmm->xsc[XTE][MOVE]); + } + + sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE]; + + if (ret_mx != NULL) *ret_mx = mx; + else FreePlan7Matrix(mx); + + return Scorify(sc); /* the total Forward score. */ +} + + +/* Function: P7Viterbi() + * + * Purpose: The Viterbi dynamic programming algorithm. + * Identical to Forward() except that max's + * replace sum's. + * + * Args: dsq - sequence in digitized form + * L - length of dsq + * hmm - the model + * ret_tr - RETURN: traceback; pass NULL if it's not wanted + * + * Return: log P(S|M)/P(S|R), as a bit score + */ +float +P7Viterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr) +{ + struct dpmatrix_s *mx; + struct p7trace_s *tr; + int **xmx; + int **mmx; + int **imx; + int **dmx; + int i,k; + int sc; + + /* Allocate a DP matrix with 0..L rows, 0..M-1 columns. + */ + mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx); + + /* Initialization of the zero row. + */ + xmx[0][XMN] = 0; /* S->N, p=1 */ + xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */ + xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */ + for (k = 0; k <= hmm->M; k++) + mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */ + + /* Recursion. Done as a pull. 
+ * Note some slightly wasteful boundary conditions: + * tsc[0] = -INFTY for all eight transitions (no node 0) + * D_M and I_M are wastefully calculated (they don't exist) + */ + for (i = 1; i <= L; i++) { + mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY; + + for (k = 1; k <= hmm->M; k++) { + /* match state */ + mmx[i][k] = -INFTY; + if ((sc = mmx[i-1][k-1] + hmm->tsc[k-1][TMM]) > mmx[i][k]) + mmx[i][k] = sc; + if ((sc = imx[i-1][k-1] + hmm->tsc[k-1][TIM]) > mmx[i][k]) + mmx[i][k] = sc; + if ((sc = xmx[i-1][XMB] + hmm->bsc[k]) > mmx[i][k]) + mmx[i][k] = sc; + if ((sc = dmx[i-1][k-1] + hmm->tsc[k-1][TDM]) > mmx[i][k]) + mmx[i][k] = sc; + if (hmm->msc[(int) dsq[i]][k] != -INFTY) mmx[i][k] += hmm->msc[(int) dsq[i]][k]; + else mmx[i][k] = -INFTY; + + /* delete state */ + dmx[i][k] = -INFTY; + if ((sc = mmx[i][k-1] + hmm->tsc[k-1][TMD]) > dmx[i][k]) + dmx[i][k] = sc; + if ((sc = dmx[i][k-1] + hmm->tsc[k-1][TDD]) > dmx[i][k]) + dmx[i][k] = sc; + + /* insert state */ + if (k < hmm->M) { + imx[i][k] = -INFTY; + if ((sc = mmx[i-1][k] + hmm->tsc[k][TMI]) > imx[i][k]) + imx[i][k] = sc; + if ((sc = imx[i-1][k] + hmm->tsc[k][TII]) > imx[i][k]) + imx[i][k] = sc; + if (hmm->isc[(int)dsq[i]][k] != -INFTY) imx[i][k] += hmm->isc[(int) dsq[i]][k]; + else imx[i][k] = -INFTY; + } + } + + /* Now the special states. Order is important here. 
+ * remember, C and J emissions are zero score by definition, + */ + /* N state */ + xmx[i][XMN] = -INFTY; + if ((sc = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY) + xmx[i][XMN] = sc; + + /* E state */ + xmx[i][XME] = -INFTY; + for (k = 1; k <= hmm->M; k++) + if ((sc = mmx[i][k] + hmm->esc[k]) > xmx[i][XME]) + xmx[i][XME] = sc; + /* J state */ + xmx[i][XMJ] = -INFTY; + if ((sc = xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY) + xmx[i][XMJ] = sc; + if ((sc = xmx[i][XME] + hmm->xsc[XTE][LOOP]) > xmx[i][XMJ]) + xmx[i][XMJ] = sc; + + /* B state */ + xmx[i][XMB] = -INFTY; + if ((sc = xmx[i][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY) + xmx[i][XMB] = sc; + if ((sc = xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[i][XMB]) + xmx[i][XMB] = sc; + + /* C state */ + xmx[i][XMC] = -INFTY; + if ((sc = xmx[i-1][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY) + xmx[i][XMC] = sc; + if ((sc = xmx[i][XME] + hmm->xsc[XTE][MOVE]) > xmx[i][XMC]) + xmx[i][XMC] = sc; + } + /* T state (not stored) */ + sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE]; + + if (ret_tr != NULL) { + P7ViterbiTrace(hmm, dsq, L, mx, &tr); + *ret_tr = tr; + } + + FreePlan7Matrix(mx); + return Scorify(sc); /* the total Viterbi score. */ +} + + +/* Function: P7ViterbiTrace() + * Date: SRE, Sat Aug 23 10:30:11 1997 (St. Louis Lambert Field) + * + * Purpose: Traceback of a Viterbi matrix: i.e. retrieval + * of optimum alignment. + * + * Args: hmm - hmm, log odds form, used to make mx + * dsq - sequence aligned to (digital form) 1..N + * N - length of seq + * mx - the matrix to trace back in, N x hmm->M + * ret_tr - RETURN: traceback. + * + * Return: (void) + * ret_tr is allocated here. Free using P7FreeTrace(). 
+ */ +void +P7ViterbiTrace(struct plan7_s *hmm, char *dsq, int N, + struct dpmatrix_s *mx, struct p7trace_s **ret_tr) +{ + struct p7trace_s *tr; + int curralloc; /* current allocated length of trace */ + int tpos; /* position in trace */ + int i; /* position in seq (1..N) */ + int k; /* position in model (1..M) */ + int **xmx, **mmx, **imx, **dmx; + int sc; /* temp var for pre-emission score */ + + /* Overallocate for the trace. + * S-N-B- ... - E-C-T : 6 states + N is minimum trace; + * add N more as buffer. + */ + curralloc = N * 2 + 6; + P7AllocTrace(curralloc, &tr); + + xmx = mx->xmx; + mmx = mx->mmx; + imx = mx->imx; + dmx = mx->dmx; + + /* Initialization of trace + * We do it back to front; ReverseTrace() is called later. + */ + tr->statetype[0] = STT; + tr->nodeidx[0] = 0; + tr->pos[0] = 0; + tr->statetype[1] = STC; + tr->nodeidx[1] = 0; + tr->pos[1] = 0; + tpos = 2; + i = N; /* current i (seq pos) we're trying to assign */ + + /* Traceback + */ + while (tr->statetype[tpos-1] != STS) { + switch (tr->statetype[tpos-1]) { + case STM: /* M connects from i-1,k-1, or B */ + sc = mmx[i+1][k+1] - hmm->msc[(int) dsq[i+1]][k+1]; + if (sc == xmx[i][XMB] + hmm->bsc[k+1]) + { + /* Check for wing unfolding */ + if (Prob2Score(hmm->begin[k+1], hmm->p1) + 1 * INTSCALE <= hmm->bsc[k+1]) + while (k > 0) + { + tr->statetype[tpos] = STD; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = 0; + tpos++; + if (tpos == curralloc) + { /* grow trace if necessary */ + curralloc += N; + P7ReallocTrace(tr, curralloc); + } + } + + tr->statetype[tpos] = STB; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; + } + else if (sc == mmx[i][k] + hmm->tsc[k][TMM]) + { + tr->statetype[tpos] = STM; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = i--; + } + else if (sc == imx[i][k] + hmm->tsc[k][TIM]) + { + tr->statetype[tpos] = STI; + tr->nodeidx[tpos] = k; + tr->pos[tpos] = i--; + } + else if (sc == dmx[i][k] + hmm->tsc[k][TDM]) + { + tr->statetype[tpos] = STD; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = 0; 
+ } + else Die("traceback failed"); + break; + + case STD: /* D connects from M,D */ + if (dmx[i][k+1] == mmx[i][k] + hmm->tsc[k][TMD]) + { + tr->statetype[tpos] = STM; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = i--; + } + else if (dmx[i][k+1] == dmx[i][k] + hmm->tsc[k][TDD]) + { + tr->statetype[tpos] = STD; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = 0; + } + else Die("traceback failed"); + break; + + case STI: /* I connects from M,I */ + sc = imx[i+1][k] - hmm->isc[(int) dsq[i+1]][k]; + if (sc == mmx[i][k] + hmm->tsc[k][TMI]) + { + tr->statetype[tpos] = STM; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = i--; + } + else if (sc == imx[i][k] + hmm->tsc[k][TII]) + { + tr->statetype[tpos] = STI; + tr->nodeidx[tpos] = k; + tr->pos[tpos] = i--; + } + else Die("traceback failed"); + break; + + case STN: /* N connects from S, N */ + if (i == 0 && xmx[i][XMN] == 0) + { + tr->statetype[tpos] = STS; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; + } + else if (i > 0 && xmx[i+1][XMN] == xmx[i][XMN] + hmm->xsc[XTN][LOOP]) + { + tr->statetype[tpos] = STN; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; /* note convention adherence: */ + tr->pos[tpos-1] = i--; /* first N doesn't emit */ + } + else Die("traceback failed"); + break; + + case STB: /* B connects from N, J */ + if (xmx[i][XMB] == xmx[i][XMN] + hmm->xsc[XTN][MOVE]) + { + tr->statetype[tpos] = STN; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; + } + else if (xmx[i][XMB] == xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]) + { + tr->statetype[tpos] = STJ; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; + } + else Die("traceback failed"); + break; + + case STE: /* E connects from any M state. k set here */ + for (k = hmm->M; k >= 1; k--) + if (xmx[i][XME] == mmx[i][k] + hmm->esc[k]) + { + /* check for wing unfolding */ + if (Prob2Score(hmm->end[k], 1.) 
+ 1*INTSCALE <= hmm->esc[k]) + { + int dk; /* need a tmp k while moving thru delete wing */ + for (dk = hmm->M; dk > k; dk--) + { + tr->statetype[tpos] = STD; + tr->nodeidx[tpos] = dk; + tr->pos[tpos] = 0; + tpos++; + if (tpos == curralloc) + { /* grow trace if necessary */ + curralloc += N; + P7ReallocTrace(tr, curralloc); + } + } + } + + tr->statetype[tpos] = STM; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = i--; + break; + } + if (k < 0) Die("traceback failed"); + break; + + case STC: /* C comes from C, E */ + if (xmx[i][XMC] == xmx[i-1][XMC] + hmm->xsc[XTC][LOOP]) + { + tr->statetype[tpos] = STC; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; /* note convention adherence: */ + tr->pos[tpos-1] = i--; /* first C doesn't emit */ + } + else if (xmx[i][XMC] == xmx[i][XME] + hmm->xsc[XTE][MOVE]) + { + tr->statetype[tpos] = STE; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; /* E is a nonemitter */ + } + else Die("Traceback failed."); + break; + + case STJ: /* J connects from E, J */ + if (xmx[i][XMJ] == xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP]) + { + tr->statetype[tpos] = STJ; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; /* note convention adherence: */ + tr->pos[tpos-1] = i--; /* first J doesn't emit */ + } + else if (xmx[i][XMJ] == xmx[i][XME] + hmm->xsc[XTE][LOOP]) + { + tr->statetype[tpos] = STE; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; /* E is a nonemitter */ + } + else Die("Traceback failed."); + break; + + default: + Die("traceback failed"); + + } /* end switch over statetype[tpos-1] */ + + tpos++; + if (tpos == curralloc) + { /* grow trace if necessary */ + curralloc += N; + P7ReallocTrace(tr, curralloc); + } + + } /* end traceback, at S state; tpos == tlen now */ + tr->tlen = tpos; + P7ReverseTrace(tr); + *ret_tr = tr; +} + + +/* Function: P7SmallViterbi() + * Date: SRE, Fri Mar 6 15:29:41 1998 [St. Louis] + * + * Purpose: Wrapper function, for linear memory alignment + * with same arguments as P7Viterbi(). 
+ * + * Calls P7ParsingViterbi to break the sequence + * into fragments. Then, based on size of fragments, + * calls either P7Viterbi() or P7WeeViterbi() to + * get traces for them. Finally, assembles all these + * traces together to produce an overall optimal + * trace for the sequence. + * + * If the trace isn't needed for some reason, + * all we do is call P7ParsingViterbi. + * + * Args: dsq - sequence in digitized form + * L - length of dsq + * hmm - the model + * ret_tr - RETURN: traceback; pass NULL if it's not wanted + * + * Returns: Score of optimal alignment in bits. + */ +float +P7SmallViterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr) +{ + struct p7trace_s *ctr; /* collapsed trace of optimal parse */ + struct p7trace_s *tr; /* full trace of optimal alignment */ + struct p7trace_s **tarr; /* trace array */ + int ndom; /* number of subsequences */ + int i; /* counter over domains */ + int pos; /* position in sequence */ + int tpos; /* position in trace */ + int tlen; /* length of full trace */ + int sqlen; /* length of a subsequence */ + int totlen; /* length of L matched by model (as opposed to N/C/J) */ + float sc; /* score of optimal alignment */ + int t2; /* position in a subtrace */ + + /* Step 1. Call P7ParsingViterbi to calculate an optimal parse + * of the sequence into single-hit subsequences; this parse + * is returned in a "collapsed" trace + */ + sc = P7ParsingViterbi(dsq, L, hmm, &ctr); + + /* If we don't want full trace, we're done */ + if (ret_tr == NULL) + { + P7FreeTrace(ctr); + return sc; + } + + /* Step 2. Call either P7Viterbi or P7WeeViterbi on each subsequence + * to recover a full traceback of each, collecting them in + * an array. 
+ */ + ndom = ctr->tlen/2 - 1; + tarr = MallocOrDie(sizeof(struct p7trace_s *) * ndom); + tlen = totlen = 0; + for (i = 0; i < ndom; i++) + { + sqlen = ctr->pos[i*2+2] - ctr->pos[i*2+1]; /* length of subseq */ + + if (P7ViterbiSize(sqlen, hmm->M) > RAMLIMIT) + P7WeeViterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, &(tarr[i])); + else + P7Viterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, &(tarr[i])); + + tlen += tarr[i]->tlen - 4; /* not counting S->N,...,C->T */ + totlen += sqlen; + } + + /* Step 3. Compose the subtraces into one big final trace. + * This is wasteful because we're going to TraceDecompose() + * it again in both hmmsearch and hmmpfam to look at + * individual domains; but we do it anyway so the P7SmallViterbi + * interface looks exactly like the P7Viterbi interface. Maybe + * long traces shouldn't include all the N/J/C states anyway, + * since they're unambiguously implied. + */ + + /* Calculate total trace len and alloc; + * nonemitting SNCT + nonemitting J's + emitting NJC + */ + tlen += 4 + (ndom-1) + (L-totlen); + P7AllocTrace(tlen, &tr); + tr->tlen = tlen; + + /* Add N-terminal trace framework + */ + tr->statetype[0] = STS; + tr->nodeidx[0] = 0; + tr->pos[0] = 0; + tr->statetype[1] = STN; + tr->nodeidx[1] = 0; + tr->pos[1] = 0; + tpos = 2; + /* add implied N's */ + for (pos = 1; pos <= ctr->pos[1]; pos++) + { + tr->statetype[tpos] = STN; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = pos; + tpos++; + } + + /* Add each subseq trace in, with its appropriate + * sequence offset derived from the collapsed trace + */ + for (i = 0; i < ndom; i++) + { /* skip SN, CT framework at ends */ + for (t2 = 2; t2 < tarr[i]->tlen-2; t2++) + { + tr->statetype[tpos] = tarr[i]->statetype[t2]; + tr->nodeidx[tpos] = tarr[i]->nodeidx[t2]; + if (tarr[i]->pos[t2] > 0) + tr->pos[tpos] = tarr[i]->pos[t2] + ctr->pos[i*2+1]; + else + tr->pos[tpos] = 0; + tpos++; + } + /* add nonemitting J or C */ + tr->statetype[tpos] = (i == ndom-1) ? 
STC : STJ; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; + tpos++; + /* add implied emitting J's */ + if (i != ndom-1) + for (pos = ctr->pos[i*2+2]+1; pos <= ctr->pos[(i+1)*2+1]; pos++) + { + tr->statetype[tpos] = STJ; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = pos; + tpos++; + } + } + + /* add implied C's */ + for (pos = ctr->pos[ndom*2]+1; pos <= L; pos++) + { + tr->statetype[tpos] = STC; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = pos; + tpos++; + } + /* add terminal T */ + tr->statetype[tpos] = STT; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; + tpos++; + + for (i = 0; i < ndom; i++) P7FreeTrace(tarr[i]); + free(tarr); + P7FreeTrace(ctr); + + *ret_tr = tr; + return sc; +} + + + + +/* Function: P7ParsingViterbi() + * Date: SRE, Wed Mar 4 14:07:31 1998 [St. Louis] + * + * Purpose: The "hmmfs" linear-memory algorithm for finding + * the optimal alignment of a very long sequence to + * a looping, multihit (e.g. Plan7) model, parsing it into + * a series of nonoverlapping subsequences that match + * the model once. Other algorithms (e.g. P7Viterbi() + * or P7WeeViterbi()) are applied subsequently to + * these subsequences to recover complete alignments. + * + * The hmmfs algorithm appears briefly in [Durbin98], + * but is otherwise unpublished. + * + * The traceback structure returned is special: a + * "collapsed" trace S->B->E->...->B->E->T, where + * stateidx is unused and pos is used to indicate the + * position of B and E in the sequence. The matched + * subsequence is B_pos+1...E_pos. The number of + * matches in the trace is (tlen/2)-1. + * + * Args: dsq - sequence in digitized form + * L - length of dsq + * hmm - the model (log odds scores ready) + * ret_tr - RETURN: a collapsed traceback. + * + * Returns: Score of the optimal Viterbi alignment, in bits. 
+ */
/* Review notes for P7ParsingViterbi():
 *   - `cur` is initialised to row 0 so that an empty sequence (L == 0)
 *     reads the initialised zero row instead of an indeterminate index.
 *   - The collapsed traceback only follows xtr[][XMC] when the C state was
 *     actually reached in the final row; otherwise that cell was never
 *     written for this row and the parse is reported as "no hits".
 *   - nodeidx is set to 0 for every collapsed-trace element so that no
 *     uninitialised value is carried through P7ReverseTrace().
 */
float
P7ParsingViterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr)
{
  struct dpmatrix_s *mx;           /* two rows of score matrix */
  struct dpmatrix_s *tmx;          /* two rows of misused score matrix: traceback ptrs */
  struct p7trace_s  *tr;           /* RETURN: collapsed traceback */
  int  **xmx, **mmx, **dmx, **imx; /* convenience ptrs to score matrix */
  int  **xtr, **mtr, **dtr, **itr; /* convenience ptrs to traceback pointers */
  int   *btr, *etr;                /* O(L) trace ptrs for B, E state pts in seq */
  int    sc;                       /* integer score of optimal alignment */
  int    i,k,tpos;                 /* index for seq, model, trace position */
  int    cur, prv;                 /* indices for rolling dp matrix */
  int    curralloc;                /* size of allocation for tr */


  /* Alloc a DP matrix and traceback pointers, two rows each, O(M).
   * Alloc two O(L) arrays to trace back through the sequence thru B and E.
   */
  mx  = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx);
  tmx = AllocPlan7Matrix(2, hmm->M, &xtr, &mtr, &itr, &dtr);
  btr = MallocOrDie(sizeof(int) * (L+1));
  etr = MallocOrDie(sizeof(int) * (L+1));

  /* Initialization of the zero row.
   */
  xmx[0][XMN] = 0;                                    /* S->N, p=1 */
  xmx[0][XMB] = hmm->xsc[XTN][MOVE];                  /* S->N->B, no N-tail */
  btr[0]      = 0;
  xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY;   /* need seq to get here */
  etr[0]      = -1;
  for (k = 0; k <= hmm->M; k++)
    mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY;       /* need seq to get here */

  /* If L == 0 the recursion below never runs; the final row is row 0. */
  cur = 0;

  /* Recursion. Done as a pull. Rolling index trick. Trace ptr propagation trick.
   * Note some slightly wasteful boundary conditions:
   *    tsc[0] = -INFTY for all eight transitions (no node 0)
   *    D_M and I_M are wastefully calculated (they don't exist)
   *
   * Notes on traceback pointer propagation.
   *  - In the path B->E, we propagate the i that B was aligned to in the optimal
   *    alignment, via mtr, dtr, and itr.
   *  - When we reach an E, we record the i of the B it started from in etr.
   *  - In a looping path E->J...->B or terminal path E->C...->T, we propagate
   *    the i that E was aligned to in the optimal alignment via xtr[][XMC]
   *    and xtr[][XMJ].
   *  - When we enter B, we record the i of the best previous E, or 0 if there
   *    isn't one, in btr.
   */
  for (i = 1; i <= L; i++) {
    cur = i % 2;
    prv = !cur;

    mmx[cur][0] = imx[cur][0] = dmx[cur][0] = -INFTY;

    for (k = 1; k <= hmm->M; k++) {
      /* match state */
      mmx[cur][k] = -INFTY;
      if ((sc = mmx[prv][k-1] + hmm->tsc[k-1][TMM]) > -INFTY)
        { mmx[cur][k] = sc; mtr[cur][k] = mtr[prv][k-1]; }
      if ((sc = imx[prv][k-1] + hmm->tsc[k-1][TIM]) > mmx[cur][k])
        { mmx[cur][k] = sc; mtr[cur][k] = itr[prv][k-1]; }
      if ((sc = xmx[prv][XMB] + hmm->bsc[k]) > mmx[cur][k])
        { mmx[cur][k] = sc; mtr[cur][k] = i-1; }
      if ((sc = dmx[prv][k-1] + hmm->tsc[k-1][TDM]) > mmx[cur][k])
        { mmx[cur][k] = sc; mtr[cur][k] = dtr[prv][k-1]; }
      if (hmm->msc[(int) dsq[i]][k] != -INFTY)
        mmx[cur][k] += hmm->msc[(int) dsq[i]][k];
      else
        mmx[cur][k] = -INFTY;

      /* delete state */
      dmx[cur][k] = -INFTY;
      if ((sc = mmx[cur][k-1] + hmm->tsc[k-1][TMD]) > -INFTY)
        { dmx[cur][k] = sc; dtr[cur][k] = mtr[cur][k-1]; }
      if ((sc = dmx[cur][k-1] + hmm->tsc[k-1][TDD]) > dmx[cur][k])
        { dmx[cur][k] = sc; dtr[cur][k] = dtr[cur][k-1]; }

      /* insert state */
      if (k < hmm->M) {
        imx[cur][k] = -INFTY;
        if ((sc = mmx[prv][k] + hmm->tsc[k][TMI]) > -INFTY)
          { imx[cur][k] = sc; itr[cur][k] = mtr[prv][k]; }
        if ((sc = imx[prv][k] + hmm->tsc[k][TII]) > imx[cur][k])
          { imx[cur][k] = sc; itr[cur][k] = itr[prv][k]; }
        if (hmm->isc[(int) dsq[i]][k] != -INFTY)
          imx[cur][k] += hmm->isc[(int) dsq[i]][k];
        else
          imx[cur][k] = -INFTY;
      }
    }

    /* Now the special states. Order is important here.
     * remember, C and J emissions are zero score by definition,
     */
                                /* N state */
    xmx[cur][XMN] = -INFTY;
    if ((sc = xmx[prv][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
      xmx[cur][XMN] = sc;
                                /* E state */
    xmx[cur][XME] = -INFTY;
    for (k = 1; k <= hmm->M; k++)
      if ((sc = mmx[cur][k] + hmm->esc[k]) > xmx[cur][XME])
        { xmx[cur][XME] = sc; etr[i] = mtr[cur][k]; }
                                /* J state */
    xmx[cur][XMJ] = -INFTY;
    if ((sc = xmx[prv][XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)
      { xmx[cur][XMJ] = sc; xtr[cur][XMJ] = xtr[prv][XMJ]; }
    if ((sc = xmx[cur][XME] + hmm->xsc[XTE][LOOP]) > xmx[cur][XMJ])
      { xmx[cur][XMJ] = sc; xtr[cur][XMJ] = i; }
                                /* B state */
    xmx[cur][XMB] = -INFTY;
    if ((sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
      { xmx[cur][XMB] = sc; btr[i] = 0; }
    if ((sc = xmx[cur][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[cur][XMB])
      { xmx[cur][XMB] = sc; btr[i] = xtr[cur][XMJ]; }
                                /* C state */
    xmx[cur][XMC] = -INFTY;
    if ((sc = xmx[prv][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
      { xmx[cur][XMC] = sc; xtr[cur][XMC] = xtr[prv][XMC]; }
    if ((sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > xmx[cur][XMC])
      { xmx[cur][XMC] = sc; xtr[cur][XMC] = i; }
  }
                                /* T state (not stored) */
  sc = xmx[cur][XMC] + hmm->xsc[XTC][MOVE];

  /*****************************************************************
   * Collapsed traceback stage.
   * xtr[L%2][XMC] contains the position j of the previous E
   * etr[j]        contains the position i of the previous B
   * btr[i]        contains the position j of the previous E, or 0
   * continue until btr[i] = 0.
   *****************************************************************/

  curralloc = 2;                /* minimum: no hits */
  P7AllocTrace(curralloc, &tr);

  /* Init of collapsed trace. Back to front; we ReverseTrace() later.
   */
  tpos = 0;
  tr->statetype[tpos] = STT;
  tr->nodeidx[tpos]   = 0;
  tr->pos[tpos]       = 0;
  /* xtr[cur][XMC] is only written in a row where xmx[cur][XMC] rose above
   * -INFTY; if C was unreachable in the last row there is no E to trace
   * back from, so report an empty parse rather than follow a stale cell.
   */
  i = (xmx[cur][XMC] > -INFTY) ? xtr[cur][XMC] : 0;
  while (i > 0)
    {
      curralloc += 2;
      P7ReallocTrace(tr, curralloc);

      tpos++;
      tr->statetype[tpos] = STE;
      tr->nodeidx[tpos]   = 0;
      tr->pos[tpos]       = i;
      i = etr[i];

      tpos++;
      tr->statetype[tpos] = STB;
      tr->nodeidx[tpos]   = 0;
      tr->pos[tpos]       = i;
      i = btr[i];
    }

  tpos++;
  tr->statetype[tpos] = STS;
  tr->nodeidx[tpos]   = 0;
  tr->pos[tpos]       = 0;
  tr->tlen = tpos + 1;
  P7ReverseTrace(tr);

  FreePlan7Matrix(mx);
  FreePlan7Matrix(tmx);
  free(btr);
  free(etr);

  *ret_tr = tr;
  return Scorify(sc);
}
+ +/* Function: P7WeeViterbi() + * Date: SRE, Wed Mar 4 08:24:04 1998 [St. Louis] + * + * Purpose: Hirschberg/Myers/Miller linear memory alignment. + * See [Hirschberg75,MyM-88a] for the idea of the algorithm. + * Adapted to HMM implementation. + * + * Requires that you /know/ that there's only + * one hit to the model in the sequence: either + * because you're forcing single-hit, or you've + * previously called P7ParsingViterbi to parse + * the sequence into single-hit segments. The reason + * for this is that a cyclic model (a la Plan7) + * defeats the nice divide and conquer trick. + * (I think some trickery with propagated trace pointers + * could get around this but haven't explored it.) + * This is implemented by ignoring transitions + * to/from J state. + * + * Args: dsq - sequence in digitized form + * L - length of dsq + * hmm - the model + * ret_tr - RETURN: traceback. + * + * Returns: Score of the optimal Viterbi alignment.
+ */
/* Review notes for P7WeeViterbi():
 *   - The end wing-unfolding test now uses the node of the last match state
 *     itself (kassign[i]). It previously indexed hmm->end[] / hmm->esc[] with
 *     kassign[i-1], the node assigned to the preceding residue, which reads
 *     the never-written kassign[0] when i == 1 and disagrees with both the
 *     delete loop below and the tlen accounting (hmm->M - kassign[i]).
 *   - NOTE(review): the boundary setup writes kassign[1]/tassign[1] and
 *     kassign[L]/tassign[L] into arrays of L+1 elements, so this appears to
 *     assume L >= 2 -- confirm against callers.
 */
float
P7WeeViterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr)
{
  struct p7trace_s *tr;         /* RETURN: traceback */
  int          *kassign;        /* 0..L+1, alignment of seq positions to model nodes */
  char         *tassign;        /* 0..L+1, alignment of seq positions to state types */
  int          *endlist;        /* stack of end points on sequence to work on */
  int          *startlist;      /* stack of start points on sequence to work on */
  int          lpos;            /* position in endlist, startlist */
  int          k1, k2, k3;      /* start, mid, end in model */
  char         t1, t2, t3;      /* start, mid, end in state type */
  int          s1, s2, s3;      /* start, mid, end in sequence */
  float        sc;              /* score of segment optimal alignment */
  float        ret_sc;          /* optimal score over complete seq */
  int          tlen;            /* length needed for trace */
  int          i, k, tpos;      /* index in sequence, model, trace */


  /* Initialize.
   */
  kassign   = MallocOrDie (sizeof(int) * (L+1));
  tassign   = MallocOrDie (sizeof(char)* (L+1));
  endlist   = MallocOrDie (sizeof(int) * (L+1));
  startlist = MallocOrDie (sizeof(int) * (L+1));

  lpos = 0;
  startlist[lpos] = 1;
  endlist[lpos]   = L;
  kassign[1]      = 1;
  kassign[L]      = hmm->M;
  tassign[1]      = STS;        /* temporary boundary condition! will become N or M */
  tassign[L]      = STT;        /* temporary boundary condition! will become M or C */

  /* Recursive divide-and-conquer alignment.
   */
  while (lpos >= 0)
    {
                                /* Pop a segment off the stack */
      s1 = startlist[lpos];
      k1 = kassign[s1];
      t1 = tassign[s1];
      s3 = endlist[lpos];
      k3 = kassign[s3];
      t3 = tassign[s3];
      lpos--;
                                /* find optimal midpoint of segment */
      sc = get_wee_midpt(hmm, dsq, L, k1, t1, s1, k3, t3, s3, &k2, &t2, &s2);
      kassign[s2] = k2;
      tassign[s2] = t2;
                                /* score is valid on first pass */
      if (t1 == STS && t3 == STT) ret_sc = sc;

                                /* push N-terminal segment on stack */
      if (t2 != STN && (s2 - s1 > 1 || (s2 - s1 == 1 && t1 == STS)))
        {
          lpos++;
          startlist[lpos] = s1;
          endlist[lpos]   = s2;
        }
                                /* push C-terminal segment on stack */
      if (t2 != STC && (s3 - s2 > 1 || (s3 - s2 == 1 && t3 == STT)))
        {
          lpos++;
          startlist[lpos] = s2;
          endlist[lpos]   = s3;
        }

      if (t2 == STN)
        {                       /* if we see STN midpoint, we know the whole N-term is STN */
          for (; s2 >= s1; s2--) {
            kassign[s2] = 1;
            tassign[s2] = STN;
          }
        }
      if (t2 == STC)
        {                       /* if we see STC midpoint, we know whole C-term is STC */
          for (; s2 <= s3; s2++) {
            kassign[s2] = hmm->M;
            tassign[s2] = STC;
          }
        }
    }

  /*****************************************************************
   * Construct a traceback structure from kassign/tassign by interpolating
   * necessary states.
   * Trace allocation is as follows. We clearly need L emitting states.
   * We also need nonemitting states as follows:
   * STS,STN,STB,STE,STC,STT = 6
   * STD: count k2-k1-1 in kassign M->M's
   * Also, count N->M's and M->C's (potential wing unfoldings)...
   *   ...and be careful to check wing unfoldings when there aren't
   *      any emitting N or C flanks! (bugfix, 2.1.1b)
   *****************************************************************/

  tlen = L + 6;
  for (i = 1; i < L; i++)
    {
      if (tassign[i] == STM && tassign[i+1] == STM)
        tlen += kassign[i+1] - kassign[i] - 1;
      if (tassign[i] == STN && tassign[i+1] == STM)
        tlen += kassign[i+1] - 1;
      if (tassign[i] == STM && tassign[i+1] == STC)
        tlen += hmm->M - kassign[i];
    }
  if (tassign[1] == STM) tlen += kassign[1] - 1;
  if (tassign[L] == STM) tlen += hmm->M - kassign[L];
  P7AllocTrace(tlen, &tr);

  tr->statetype[0] = STS;
  tr->nodeidx[0]   = 0;
  tr->pos[0]       = 0;
  tr->statetype[1] = STN;
  tr->nodeidx[1]   = 0;
  tr->pos[1]       = 0;
  tpos = 2;

  for (i = 1; i <= L; i++)
    {
      switch(tassign[i]) {
      case STM:
                                /* check for first match state */
        if (tr->statetype[tpos-1] == STN) {
          tr->statetype[tpos] = STB;
          tr->nodeidx[tpos]   = 0;
          tr->pos[tpos]       = 0;
          tpos++;
                                /* check for wing unfolding */
          if (Prob2Score(hmm->begin[kassign[i]], hmm->p1) + INTSCALE <= hmm->bsc[kassign[i]])
            for (k = 1; k < kassign[i]; k++) {
              tr->statetype[tpos] = STD;
              tr->nodeidx[tpos]   = k;
              tr->pos[tpos]       = 0;
              tpos++;
            }
        }
                                /* do the match state itself */
        tr->statetype[tpos] = STM;
        tr->nodeidx[tpos]   = kassign[i];
        tr->pos[tpos]       = i;
        tpos++;
                                /* do any deletes necessary 'til next match */
        if (i < L && tassign[i+1] == STM && kassign[i+1] - kassign[i] > 1)
          for (k = kassign[i] + 1; k < kassign[i+1]; k++)
            {
              tr->statetype[tpos] = STD;
              tr->nodeidx[tpos]   = k;
              tr->pos[tpos]       = 0;
              tpos++;
            }
                                /* check for last match state */
        if (i == L || tassign[i+1] == STC) {
                                /* check for wing unfolding: test the exit score
                                 * of THIS match node, kassign[i] */
          if (Prob2Score(hmm->end[kassign[i]], 1.) + INTSCALE <= hmm->esc[kassign[i]])
            for (k = kassign[i]+1; k <= hmm->M; k++)
              {
                tr->statetype[tpos] = STD;
                tr->nodeidx[tpos]   = k;
                tr->pos[tpos]       = 0;
                tpos++;
              }
                                /* add on the end state */
          tr->statetype[tpos] = STE;
          tr->nodeidx[tpos]   = 0;
          tr->pos[tpos]       = 0;
          tpos++;
                                /* and a nonemitting C state */
          tr->statetype[tpos] = STC;
          tr->nodeidx[tpos]   = 0;
          tr->pos[tpos]       = 0;
          tpos++;
        }
        break;

      case STI:
        tr->statetype[tpos] = STI;
        tr->nodeidx[tpos]   = kassign[i];
        tr->pos[tpos]       = i;
        tpos++;
        break;

      case STN:
        tr->statetype[tpos] = STN;
        tr->nodeidx[tpos]   = 0;
        tr->pos[tpos]       = i;
        tpos++;
        break;

      case STC:
        tr->statetype[tpos] = STC;
        tr->nodeidx[tpos]   = 0;
        tr->pos[tpos]       = i;
        tpos++;
        break;

      default: Die("Bogus state %s", Statetype(tassign[i]));
      }
    }
                                /* terminate the trace */
  tr->statetype[tpos] = STT;
  tr->nodeidx[tpos]   = 0;
  tr->pos[tpos]       = 0;
  tr->tlen = tpos+1;

  *ret_tr = tr;

  free(kassign);
  free(tassign);
  free(startlist);
  free(endlist);
  return ret_sc;
}
+ + +/* Function: Plan7ESTViterbi() + * + * Purpose: Frameshift-tolerant alignment of protein model to cDNA EST. + * + * + */ +float +Plan7ESTViterbi(char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s **ret_mx) +{ + struct dpmatrix_s *mx; + int **xmx; + int **mmx; + int **imx; + int **dmx; + int i,k; + int sc; + int codon; + + /* Allocate a DP matrix with 0..L rows, 0..M+1 columns. + */ + mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx); + + /* Initialization of the zero row (DNA sequence of length 0) + * Note that xmx[i][stN] = 0 by definition for all i, + * and xmx[i][stT] = xmx[i][stC], so neither stN nor stT need + * to be calculated in DP matrices.
+ */ + xmx[0][XMN] = 0; /* S->N, p=1 */ + xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */ + xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */ + for (k = 0; k <= hmm->M; k++) + mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */ + + /* Initialization of the first row (DNA sequence of length 1); + * only N state can make this nucleotide. + */ + xmx[1][XMN] = xmx[0][XMN] + hmm->xsc[XTN][LOOP]; + xmx[1][XMB] = xmx[1][XMN] + hmm->xsc[XTN][MOVE]; + xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need 2 nt to get here */ + for (k = 0; k <= hmm->M; k++) + mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need 2 nt to get into model */ + + /* Recursion. Done as a pull. + * Note some slightly wasteful boundary conditions: + * tsc[0] = -INFTY for all eight transitions (no node 0) + * D_M and I_M are wastefully calculated (they don't exist) + */ + for (i = 2; i <= L; i++) { + mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY; + + /* crude calculation of lookup value for codon */ + if (i > 2) { + if (dsq[i-2] < 4 && dsq[i-1] < 4 && dsq[i] < 4) + codon = dsq[i-2] * 16 + dsq[i-1] * 4 + dsq[i]; + else + codon = 64; /* ambiguous codon; punt */ + } + + for (k = 1; k <= hmm->M; k++) { + /* match state */ + if (i > 2) { + mmx[i][k] = mmx[i-3][k-1] + hmm->tsc[k-1][TMM]; + if ((sc = imx[i-3][k-1] + hmm->tsc[k-1][TIM]) > mmx[i][k]) + mmx[i][k] = sc; + if ((sc = xmx[i-3][XMB] + hmm->bsc[k]) > mmx[i][k]) + mmx[i][k] = sc; + if ((sc = dmx[i-3][k-1] + hmm->tsc[k-1][TDM]) > mmx[i][k]) + mmx[i][k] = sc; + mmx[i][k] += hmm->dnam[codon][k]; + } + /* -1 frameshifts into match state */ + if ((sc = mmx[i-2][k-1] + hmm->tsc[k-1][TMM] + hmm->dna2) > mmx[i][k]) + mmx[i][k] = sc; + if ((sc = imx[i-2][k-1] + hmm->tsc[k-1][TIM] + hmm->dna2) > mmx[i][k]) + mmx[i][k] = sc; + if ((sc = xmx[i-2][XMB] + hmm->bsc[k] + hmm->dna2) > mmx[i][k]) + mmx[i][k] = sc; + if ((sc = dmx[i-2][k-1] + hmm->tsc[k-1][TDM] + hmm->dna2) > mmx[i][k]) + mmx[i][k] = sc; + + /* +1 
frameshifts into match state */ + if (i > 3) { + if ((sc = mmx[i-4][k-1] + hmm->tsc[k-1][TMM] + hmm->dna4) > mmx[i][k]) + mmx[i][k] = sc; + if ((sc = imx[i-4][k-1] + hmm->tsc[k-1][TIM] + hmm->dna4) > mmx[i][k]) + mmx[i][k] = sc; + if ((sc = xmx[i-4][XMB] + hmm->bsc[k] + hmm->dna4) > mmx[i][k]) + mmx[i][k] = sc; + if ((sc = dmx[i-4][k-1] + hmm->tsc[k-1][TDM] + hmm->dna4) > mmx[i][k]) + mmx[i][k] = sc; + } + /* delete state */ + dmx[i][k] = mmx[i][k-1] + hmm->tsc[k-1][TMD]; + if ((sc = dmx[i][k-1] + hmm->tsc[k-1][TDD]) > dmx[i][k]) + dmx[i][k] = sc; + + /* insert state */ + if (i > 2) { + imx[i][k] = mmx[i-3][k] + hmm->tsc[k][TMI]; + if ((sc = imx[i-3][k] + hmm->tsc[k][TII]) > imx[i][k]) + imx[i][k] = sc; + imx[i][k] += hmm->dnai[codon][k]; + } + + /* -1 frameshifts into insert state */ + if ((sc = mmx[i-2][k] + hmm->tsc[k][TMI] + hmm->dna2) > imx[i][k]) + imx[i][k] = sc; + if ((sc = imx[i-2][k] + hmm->tsc[k][TII] + hmm->dna2) > imx[i][k]) + imx[i][k] = sc; + + /* +1 frameshifts into insert state */ + if (i > 4) { + if ((sc = mmx[i-4][k] + hmm->tsc[k][TMI] + hmm->dna4) > imx[i][k]) + imx[i][k] = sc; + if ((sc = imx[i-4][k] + hmm->tsc[k][TII] + hmm->dna4) > imx[i][k]) + imx[i][k] = sc; + } + } + /* Now the special states. Order is important here. 
+ * remember, C and J emissions are zero score by definition, + */ + /* N state: +1 nucleotide */ + xmx[i][XMN] = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]; + /* E state: collect from M's, and last D */ + xmx[i][XME] = dmx[i][hmm->M]; /* transition prob from last D = 1.0 */ + for (k = 1; k <= hmm->M; k++) + if ((sc = mmx[i][k] + hmm->esc[k]) > xmx[i][XME]) + xmx[i][XME] = sc; + /* J state: +1 nucleotide */ + xmx[i][XMJ] = xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP]; + if ((sc = xmx[i][XME] + hmm->xsc[XTE][LOOP]) > xmx[i][XMJ]) + xmx[i][XMJ] = sc; + /* B state: collect from N,J */ + xmx[i][XMB] = xmx[i][XMN] + hmm->xsc[XTN][MOVE]; + if ((sc = xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[i][XMB]) + xmx[i][XMB] = sc; + /* C state: +1 nucleotide */ + xmx[i][XMC] = xmx[i-1][XMC] + hmm->xsc[XTC][LOOP]; + if ((sc = xmx[i][XME] + hmm->xsc[XTE][MOVE]) > xmx[i][XMC]) + xmx[i][XMC] = sc; + } + + sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE]; + + if (ret_mx != NULL) *ret_mx = mx; + else FreePlan7Matrix(mx); + + return Scorify(sc); /* the total Viterbi score. */ +} + + + +/* Function: get_wee_midpt() + * Date: SRE, Wed Mar 4 08:27:11 1998 [St. Louis] + * + * Purpose: The heart of the divide and conquer algorithm + * for P7WeeViterbi(). This function is called + * recursively to find successive optimal midpoints + * in the alignment matrix. See P7WeeViterbi() for + * further comments on the assumptions of this algorithm. 
+ * + * Args: hmm - the model, set up for integer scores + * dsq - the sequence, digitized + * L - length of the sequence + * k1 - model node to start with, 1..M + * t1 - state type to start with, STM | STI | STN | STC; STS to start + * s1 - sequence position to start with, 1..L; 1 to start + * k3 - model node to end with, 1..M + * t3 - state type to end with, STM | STI | STN | STC; STT to start + * s3 - sequence position to end with, 1..L; L to start + * ret_k2 - RETURN: optimal midpoint, node position in model + * ret_t2 - RETURN: optimal midpoint, state type + * ret_s2 - RETURN: optimal midpoint, sequence position + * + * Returns: score of optimal alignment, in bits. + */ +static float +get_wee_midpt(struct plan7_s *hmm, char *dsq, int L, + int k1, char t1, int s1, + int k3, char t3, int s3, + int *ret_k2, char *ret_t2, int *ret_s2) +{ + struct dpmatrix_s *fwd; + struct dpmatrix_s *bck; + int **xmx; /* convenience ptr into special states */ + int **mmx; /* convenience ptr into match states */ + int **imx; /* convenience ptr into insert states */ + int **dmx; /* convenience ptr into delete states */ + int k2; + char t2; + int s2; + int cur, prv, nxt; /* current, previous, next row index (0 or 1)*/ + int i,k; /* indices for seq, model */ + int sc; /* integer score */ + int max; /* maximum integer score */ + int start; /* s1 to start at (need, for STS special case) */ + + + /* Choose our midpoint. + * Special cases: s1, s3 adjacent and t1 == STS: s2 = s1 + * s1, s3 adjacent and t3 == STT: s2 = s3 + * (where we must replace STS, STT eventually) + */ + s2 = s1 + (s3-s1) / 2; + if (s3-s1 == 1 && t1 == STS) s2 = s1; + if (s3-s1 == 1 && t3 == STT) s2 = s3; + + /* STS is a special case. STS aligns to row zero by convention, + * but we'll be passed s1=1, t1=STS. We have to init on row + * zero then start DP on row 1. + */ + start = (t1 == STS) ? 0 : s1; + + /* Allocate our forward two rows. + * Initialize row zero. 
+ */ + fwd = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx); + cur = start%2; + xmx[cur][XMN] = xmx[cur][XMB] = -INFTY; + xmx[cur][XME] = xmx[cur][XMC] = -INFTY; + for (k = k1; k <= k3; k++) + mmx[cur][k] = imx[cur][k] = dmx[cur][k] = -INFTY; + + /* Where to put our zero for our start point... + * (only possible to start on an emitting state; J disallowed) + */ + switch (t1) { + case STM: mmx[cur][k1] = 0; break; + case STI: imx[cur][k1] = 0; break; + case STN: xmx[cur][XMN] = 0; break; + case STC: xmx[cur][XMC] = 0; break; + case STS: xmx[cur][XMN] = 0; break; + default: Die("you can't init get_wee_midpt with a %s\n", Statetype(t1)); + } + + /* Still initializing. + * Deal with pulling horizontal matrix moves in initial row. + * These are any transitions to nonemitters: + * STM-> E, D + * STI-> none + * STN-> B + * STC-> (T, but we never observe this in the forward pass of a d&c) + * STE-> C + * STS-> (N, already implied by setting xmx[cur][XMN] = 0) + * STB-> M + */ + if (t1 == STM) + { + for (k = k1+1; k <= k3; k++) + { /* transits into STD */ + dmx[cur][k] = -INFTY; + if ((sc = mmx[cur][k-1] + hmm->tsc[k-1][TMD]) > -INFTY) + dmx[cur][k] = sc; + if ((sc = dmx[cur][k-1] + hmm->tsc[k-1][TDD]) > dmx[cur][k]) + dmx[cur][k] = sc; + } + /* transit into STE */ + xmx[cur][XME] = -INFTY; + if ((sc = mmx[cur][k1] + hmm->esc[k1]) > -INFTY) + xmx[cur][XME] = sc; + } + /* transit into STB from STN */ + xmx[cur][XMB] = -INFTY; + if ((sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY) + xmx[cur][XMB] = sc; + /* transit into STC from STE */ + xmx[cur][XMC] = -INFTY; + if ((sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > -INFTY) + xmx[cur][XMC] = sc; + + /* Done initializing. + * Start recursive DP; sweep forward to chosen s2 midpoint. Done as a pull. + */ + for (i = start+1; i <= s2; i++) { + cur = i % 2; + prv = !cur; + + mmx[cur][k1] = imx[cur][k1] = dmx[cur][k1] = -INFTY; + + /* Insert state in column k1, and B->M transition in k1. 
+ */ + if (k1 < hmm->M) { + imx[cur][k1] = -INFTY; + if ((sc = mmx[prv][k1] + hmm->tsc[k1][TMI]) > -INFTY) + imx[cur][k1] = sc; + if ((sc = imx[prv][k1] + hmm->tsc[k1][TII]) > imx[cur][k1]) + imx[cur][k1] = sc; + if (hmm->isc[(int) dsq[i]][k1] != -INFTY) + imx[cur][k1] += hmm->isc[(int) dsq[i]][k1]; + else + imx[cur][k1] = -INFTY; + } + if ((sc = xmx[prv][XMB] + hmm->bsc[k1]) > -INFTY) + mmx[cur][k1] = sc; + if (hmm->msc[(int) dsq[i]][k1] != -INFTY) + mmx[cur][k1] += hmm->msc[(int) dsq[i]][k1]; + else + mmx[cur][k1] = -INFTY; + + /* Main chunk of recursion across model positions + */ + for (k = k1+1; k <= k3; k++) { + /* match state */ + mmx[cur][k] = -INFTY; + if ((sc = mmx[prv][k-1] + hmm->tsc[k-1][TMM]) > -INFTY) + mmx[cur][k] = sc; + if ((sc = imx[prv][k-1] + hmm->tsc[k-1][TIM]) > mmx[cur][k]) + mmx[cur][k] = sc; + if ((sc = xmx[prv][XMB] + hmm->bsc[k]) > mmx[cur][k]) + mmx[cur][k] = sc; + if ((sc = dmx[prv][k-1] + hmm->tsc[k-1][TDM]) > mmx[cur][k]) + mmx[cur][k] = sc; + if (hmm->msc[(int) dsq[i]][k] != -INFTY) + mmx[cur][k] += hmm->msc[(int) dsq[i]][k]; + else + mmx[cur][k] = -INFTY; + + /* delete state */ + dmx[cur][k] = -INFTY; + if (k < hmm->M) { + if ((sc = mmx[cur][k-1] + hmm->tsc[k-1][TMD]) > -INFTY) + dmx[cur][k] = sc; + if ((sc = dmx[cur][k-1] + hmm->tsc[k-1][TDD]) > dmx[cur][k]) + dmx[cur][k] = sc; + } + + /* insert state */ + imx[cur][k] = -INFTY; + if (k < hmm->M) { + if ((sc = mmx[prv][k] + hmm->tsc[k][TMI]) > -INFTY) + imx[cur][k] = sc; + if ((sc = imx[prv][k] + hmm->tsc[k][TII]) > imx[cur][k]) + imx[cur][k] = sc; + if (hmm->isc[(int) dsq[i]][k] != -INFTY) + imx[cur][k] += hmm->isc[(int) dsq[i]][k]; + else + imx[cur][k] = -INFTY; + } + } + /* N state */ + xmx[cur][XMN] = -INFTY; + if ((sc = xmx[prv][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY) + xmx[cur][XMN] = sc; + /* E state */ + xmx[cur][XME] = -INFTY; + for (k = k1; k <= k3 && k <= hmm->M; k++) + if ((sc = mmx[cur][k] + hmm->esc[k]) > xmx[cur][XME]) + xmx[cur][XME] = sc; + /* B state */ + 
xmx[cur][XMB] = -INFTY; + if ((sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY) + xmx[cur][XMB] = sc; + /* C state */ + xmx[cur][XMC] = -INFTY; + if ((sc = xmx[prv][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY) + xmx[cur][XMC] = sc; + if ((sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > xmx[cur][XMC]) + xmx[cur][XMC] = sc; + } + + /* Row s2%2 in fwd matrix now contains valid scores from s1 (start) to s2, + * with J transitions disallowed (no cycles through model). + */ + + /***************************************************************** + * Backwards pass. + *****************************************************************/ + + /* Allocate our backwards two rows. Init last row. + */ + bck = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx); + nxt = s3%2; + xmx[nxt][XMN] = xmx[nxt][XMB] = -INFTY; + xmx[nxt][XME] = xmx[nxt][XMC] = -INFTY; + for (k = k1; k <= k3 + 1; k++) + mmx[nxt][k] = imx[nxt][k] = dmx[nxt][k] = -INFTY; + cur = !nxt; + mmx[cur][k3+1] = imx[cur][k3+1] = dmx[cur][k3+1] = -INFTY; + + /* Where to put the zero for our end point on last row. + */ + switch (t3) { + case STM: mmx[nxt][k3] = 0; break; + case STI: imx[nxt][k3] = 0; break; + case STN: xmx[nxt][XMN] = 0; break; + case STC: xmx[nxt][XMC] = 0; break; /* must be an emitting C */ + case STT: xmx[nxt][XMC] = hmm->xsc[XTC][MOVE]; break; /* C->T implied */ + default: Die("you can't init get_wee_midpt with a %s\n", Statetype(t3)); + } + + /* Still initializing. + * In the case t3==STT, there are a few horizontal moves possible + * on row s3, because STT isn't an emitter. All other states are + * emitters, so their connections have to be to the previous row s3-1. + */ + if (t3 == STT) + { /* E->C */ + xmx[nxt][XME] = xmx[nxt][XMC] + hmm->xsc[XTE][MOVE]; + /* M->E */ + for (k = k3; k >= k1; k--) { + mmx[nxt][k] = xmx[nxt][XME] + hmm->esc[k]; + if (s3 != s2) + mmx[nxt][k] += hmm->msc[(int)dsq[s3]][k]; + } + } + + /* Start recursive DP; sweep backwards to chosen s2 midpoint. + * Done as a pull. 
M, I scores at current row do /not/ include + * emission scores. Be careful of integer underflow. + */ + for (i = s3-1; i >= s2; i--) { + /* note i < L, so i+1 is always a legal index */ + cur = i%2; + nxt = !cur; + /* C pulls from C (T is special cased) */ + xmx[cur][XMC] = -INFTY; + if ((sc = xmx[nxt][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY) + xmx[cur][XMC] = sc; + /* B pulls from M's */ + xmx[cur][XMB] = -INFTY; + for (k = k1; k <= k3; k++) + if ((sc = mmx[nxt][k] + hmm->bsc[k]) > xmx[cur][XMB]) + xmx[cur][XMB] = sc; + /* E pulls from C (J disallowed) */ + xmx[cur][XME] = -INFTY; + if ((sc = xmx[cur][XMC] + hmm->xsc[XTE][MOVE]) > -INFTY) + xmx[cur][XME] = sc; + /* N pulls from B, N */ + xmx[cur][XMN] = -INFTY; + if ((sc = xmx[cur][XMB] + hmm->xsc[XTN][MOVE]) > -INFTY) + xmx[cur][XMN] = sc; + if ((sc = xmx[nxt][XMN] + hmm->xsc[XTN][LOOP]) > xmx[cur][XMN]) + xmx[cur][XMN] = sc; + + /* Main recursion across model + */ + for (k = k3; k >= k1; k--) { + /* special case k == M */ + if (k == hmm->M) { + mmx[cur][k] = xmx[cur][XME]; /* p=1 transition to E by definition */ + dmx[cur][k] = -INFTY; /* doesn't exist */ + imx[cur][k] = -INFTY; /* doesn't exist */ + if (i != s2) + mmx[cur][k] += hmm->msc[(int)dsq[i]][k]; + continue; + } /* below this k < M, so k+1 is a legal index */ + + /* pull into match state */ + mmx[cur][k] = -INFTY; + if ((sc = xmx[cur][XME] + hmm->esc[k]) > -INFTY) + mmx[cur][k] = sc; + if ((sc = mmx[nxt][k+1] + hmm->tsc[k][TMM]) > mmx[cur][k]) + mmx[cur][k] = sc; + if ((sc = imx[nxt][k] + hmm->tsc[k][TMI]) > mmx[cur][k]) + mmx[cur][k] = sc; + if ((sc = dmx[cur][k+1] + hmm->tsc[k][TMD]) > mmx[cur][k]) + mmx[cur][k] = sc; + if (i != s2) + mmx[cur][k] += hmm->msc[(int)dsq[i]][k]; + + /* pull into delete state */ + dmx[cur][k] = -INFTY; + if ((sc = mmx[nxt][k+1] + hmm->tsc[k][TDM]) > -INFTY) + dmx[cur][k] = sc; + if ((sc = dmx[cur][k+1] + hmm->tsc[k][TDD]) > dmx[cur][k]) + dmx[cur][k] = sc; + /* pull into insert state */ + imx[cur][k] = -INFTY; + if ((sc = 
mmx[nxt][k+1] + hmm->tsc[k][TIM]) > -INFTY) + imx[cur][k] = sc; + if ((sc = imx[nxt][k] + hmm->tsc[k][TII]) > imx[cur][k]) + imx[cur][k] = sc; + if (i != s2) + imx[cur][k] += hmm->isc[(int)dsq[i]][k]; + + } + } + + /***************************************************************** + * DP complete; we have both forward and backward passes. Now we + * look across the s2 row and find the optimal emitting state. + *****************************************************************/ + + cur = s2%2; + max = -INFTY; + for (k = k1; k <= k3; k++) + { + if ((sc = fwd->mmx[cur][k] + bck->mmx[cur][k]) > max) + { k2 = k; t2 = STM; max = sc; } + if ((sc = fwd->imx[cur][k] + bck->imx[cur][k]) > max) + { k2 = k; t2 = STI; max = sc; } + } + if ((sc = fwd->xmx[cur][XMN] + bck->xmx[cur][XMN]) > max) + { k2 = 1; t2 = STN; max = sc; } + if ((sc = fwd->xmx[cur][XMC] + bck->xmx[cur][XMC]) > max) + { k2 = hmm->M; t2 = STC; max = sc; } + + /***************************************************************** + * Garbage collection, return. + *****************************************************************/ + + FreePlan7Matrix(fwd); + FreePlan7Matrix(bck); + *ret_k2 = k2; + *ret_t2 = t2; + *ret_s2 = s2; + return Scorify(max); +} + + +/* Function: P7ViterbiAlignAlignment() + * Date: SRE, Sat Jul 4 13:39:00 1998 [St. Louis] + * + * Purpose: Align a multiple alignment to an HMM without + * changing the multiple alignment itself. + * Adapted from P7Viterbi(). + * + * Heuristic; not a guaranteed optimal alignment. + * Guaranteeing an optimal alignment appears difficult. + * [cryptic note to myself:] In paths connecting to I* metastates, + * recursion breaks down; if there is a gap in the + * previous column for a given seq, we can't determine what state the + * I* metastate corresponds to for this sequence, unless we + * look back in the DP matrix. 
 *           The lookback would either involve
 *           recursing back to the previous M* metastate (giving a
 *           O(MN^2) algorithm instead of O(MN)) or expanding the I*
 *           metastate into 3^nseq separate I* metastates to keep track
 *           of which of three states each seq is in. Since the second
 *           option blows up exponentially w/ nseq, it is not attractive.
 *           If the first option were used, the correct algorithm would be related to
 *           modelmakers.c:Maxmodelmaker(), but somewhat more difficult.
 *
 *           The heuristic approach here is to calculate a "consensus"
 *           sequence from the alignment, and align the consensus to the HMM.
 *           Some hackery is employed, weighting transitions and emissions
 *           to make things work (re: con and mocc arrays).
 *
 * Args:     msa       - the multiple alignment; this routine reads
 *                       msa->aseq (aligned sequences, columns 0..alen-1),
 *                       msa->wgt (per-sequence weights), msa->nseq, msa->alen
 *           hmm       - model to align to
 *
 * Returns:  Traceback. Caller must free with P7FreeTrace().
 *           pos[] contains alignment columns, indexed 1..alen.
 *           statetype[] contains metastates M*, etc. as STM, etc.
 */
struct p7trace_s *
P7ViterbiAlignAlignment(MSA *msa, struct plan7_s *hmm)
{
  struct dpmatrix_s *mx;        /* Viterbi calculation lattice (two rows) */
  struct dpshadow_s *tb;        /* shadow matrix of traceback pointers */
  struct p7trace_s  *tr;        /* RETURN: traceback */
  int  **xmx, **mmx, **imx, **dmx;
  char **xtb, **mtb, **itb, **dtb;
  float **con;                  /* [1..alen][0..Alphabet_size-1], consensus counts */
  float  *mocc;                 /* fractional occupancy of a column; used to weight transitions */
  int     i;                    /* counter for columns */
  int     k;                    /* counter for model positions */
  int     idx;                  /* counter for seqs */
  int     sym;                  /* counter for alphabet symbols */
  int     sc;                   /* temp variable for holding score */
  float   denom;                /* total weight of seqs; used to "normalize" counts */
  int     cur, prv;             /* indices (0/1) of the current and previous DP rows */

  /* The "consensus" is a counts matrix, [1..alen][0..Alphabet_size-1].
   * Gaps are not counted explicitly, but columns with lots of gaps get
   * less total weight because they have fewer counts.
   */
                                /* allocation */
  con  = MallocOrDie(sizeof(float *) * (msa->alen+1));
  mocc = MallocOrDie(sizeof(float)   * (msa->alen+1));
  for (i = 1; i <= msa->alen; i++) {   /* con[0] is never allocated nor freed */
    con[i] = MallocOrDie(sizeof(float) * Alphabet_size);
    FSet(con[i], Alphabet_size, 0.0);
  }
  /* Sentinel. mocc[i-1] is read at i == 1 below, but only inside branches
   * guarded by imx[prv][..] > -INFTY, and row 0 of imx is set to -INFTY.
   */
  mocc[0] = -9999.;
                                /* initialization */
                                /* note: aseq is off by one, 0..alen-1 */
                                /* "normalized" to have a max total count of 1 per col */
  denom = FSum(msa->wgt, msa->nseq);
  for (i = 1; i <= msa->alen; i++)
    {
      for (idx = 0; idx < msa->nseq; idx++)
        if (! isgap(msa->aseq[idx][i-1]))
          P7CountSymbol(con[i], SYMIDX(msa->aseq[idx][i-1]), msa->wgt[idx]);
      FScale(con[i], Alphabet_size, 1./denom);
      mocc[i] = FSum(con[i], Alphabet_size);
    }

  /* Allocate a DP matrix with 2 rows, 0..M columns,
   * and a shadow matrix with 0,1..alen rows, 0..M columns.
   */
  mx = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx);
  tb = AllocShadowMatrix(msa->alen+1, hmm->M, &xtb, &mtb, &itb, &dtb);

  /* Initialization of the zero row.
   */
  xmx[0][XMN] = 0;                                   /* S->N, p=1            */
  xtb[0][XMN] = STS;
  xmx[0][XMB] = hmm->xsc[XTN][MOVE];                 /* S->N->B, no N-tail   */
  xtb[0][XMB] = STN;
  xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY;  /* need seq to get here */
  tb->esrc[0] = 0;
  xtb[0][XMC] = xtb[0][XMJ] = STBOGUS;
  for (k = 0; k <= hmm->M; k++) {
    mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY;      /* need seq to get here */
    mtb[0][k] = itb[0][k] = dtb[0][k] = STBOGUS;
  }

  /* Recursion. Done as a pull.
   * Note some slightly wasteful boundary conditions:
   *    tsc[0] = -INFTY for all eight transitions (no node 0)
   *    D_M and I_M are wastefully calculated (they don't exist)
   *
   * Scores live in a rolling two-row matrix (row i%2); the shadow
   * matrix tb keeps one traceback row per alignment column.
   */
  for (i = 1; i <= msa->alen; i++) {
    cur = i % 2;
    prv = ! cur;

    mmx[cur][0] = imx[cur][0] = dmx[cur][0] = -INFTY;
    mtb[i][0]   = itb[i][0]   = dtb[i][0]   = STBOGUS;

    for (k = 1; k <= hmm->M; k++) {
                                /* match state */
      mmx[cur][k]  = -INFTY;
      mtb[i][k]    = STBOGUS;
      if (mmx[prv][k-1] > -INFTY && hmm->tsc[k-1][TMM] > -INFTY &&
          (sc = mmx[prv][k-1] + hmm->tsc[k-1][TMM]) > mmx[cur][k])
        { mmx[cur][k] = sc; mtb[i][k] = STM; }
      /* I->M transition score is scaled by the occupancy of the previous column */
      if (imx[prv][k-1] > -INFTY && hmm->tsc[k-1][TIM] > -INFTY &&
          (sc = imx[prv][k-1] + hmm->tsc[k-1][TIM] * mocc[i-1]) > mmx[cur][k])
        { mmx[cur][k] = sc; mtb[i][k] = STI; }
      if ((sc = xmx[prv][XMB] + hmm->bsc[k]) > mmx[cur][k])
        { mmx[cur][k] = sc; mtb[i][k] = STB; }
      if (dmx[prv][k-1] > -INFTY && hmm->tsc[k-1][TDM] > -INFTY &&
          (sc = dmx[prv][k-1] + hmm->tsc[k-1][TDM]) > mmx[cur][k])
        { mmx[cur][k] = sc; mtb[i][k] = STD; }
                                /* average over "consensus" sequence */
      /* a symbol present in the column but impossible in this state kills the cell */
      for (sym = 0; sym < Alphabet_size; sym++)
        {
          if (con[i][sym] > 0 && hmm->msc[sym][k] == -INFTY) { mmx[cur][k] = -INFTY; break; }
          mmx[cur][k] += hmm->msc[sym][k] * con[i][sym];
        }

                                /* delete state */
      /* deletes pull from the current row: a D state consumes no column */
      dmx[cur][k] = -INFTY;
      dtb[i][k]   = STBOGUS;
      if (mmx[cur][k-1] > -INFTY && hmm->tsc[k-1][TMD] > -INFTY &&
          (sc = mmx[cur][k-1] + hmm->tsc[k-1][TMD]) > dmx[cur][k])
        { dmx[cur][k] = sc; dtb[i][k] = STM; }
      if (dmx[cur][k-1] > -INFTY && hmm->tsc[k-1][TDD] > -INFTY &&
          (sc = dmx[cur][k-1] + hmm->tsc[k-1][TDD]) > dmx[cur][k])
        { dmx[cur][k] = sc; dtb[i][k] = STD; }

                                /* insert state */
      if (k < hmm->M) {
        imx[cur][k] = -INFTY;
        itb[i][k]   = STBOGUS;
        /* M->I is scaled by this column's occupancy; I->I by both columns' */
        if (mmx[prv][k] > -INFTY && hmm->tsc[k][TMI] > -INFTY &&
            (sc = mmx[prv][k] + hmm->tsc[k][TMI] * mocc[i]) > imx[cur][k])
          { imx[cur][k] = sc; itb[i][k] = STM; }
        if (imx[prv][k] > -INFTY && hmm->tsc[k][TII] > -INFTY &&
            (sc = imx[prv][k] + hmm->tsc[k][TII] * mocc[i-1] * mocc[i]) > imx[cur][k])
          { imx[cur][k] = sc; itb[i][k] = STI; }
                                /* average over "consensus" sequence */
        for (sym = 0; sym < Alphabet_size; sym++)
          {
            if (con[i][sym] > 0 && hmm->isc[sym][k] == -INFTY) { imx[cur][k] = -INFTY; break; }
            imx[cur][k] += hmm->isc[sym][k] * con[i][sym];
          }
      }
    }

    /* Now the special states. Order is important here.
     * remember, N, C, and J emissions are zero score by definition.
     */
                                /* N state */
    xmx[cur][XMN] = -INFTY;
    xtb[i][XMN]   = STBOGUS;
    if (xmx[prv][XMN] > -INFTY && hmm->xsc[XTN][LOOP] > -INFTY &&
        (sc = xmx[prv][XMN] + hmm->xsc[XTN][LOOP] * mocc[i]) > -INFTY)
      { xmx[cur][XMN] = sc; xtb[i][XMN] = STN; }
                                /* E state */
    /* xtb[i][XME] stays STBOGUS; the source of E is recorded in tb->esrc[i] instead */
    xmx[cur][XME] = -INFTY;
    xtb[i][XME]   = STBOGUS;
    for (k = 1; k <= hmm->M; k++)
      if (mmx[cur][k] > -INFTY && hmm->esc[k] > -INFTY &&
          (sc =  mmx[cur][k] + hmm->esc[k]) > xmx[cur][XME])
        { xmx[cur][XME] = sc; tb->esrc[i] = k; }

                                /* we don't check J state */
                                /* B state; don't connect from J */
    xmx[cur][XMB] = -INFTY;
    xtb[i][XMB]   = STBOGUS;
    if (xmx[cur][XMN] > -INFTY && hmm->xsc[XTN][MOVE] > -INFTY &&
        (sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > xmx[cur][XMB])
      { xmx[cur][XMB] = sc; xtb[i][XMB] = STN; }

                                /* C state */
    xmx[cur][XMC] = -INFTY;
    xtb[i][XMC]   = STBOGUS;
    if (xmx[prv][XMC] > -INFTY && hmm->xsc[XTC][LOOP] > -INFTY &&
        (sc = xmx[prv][XMC] + hmm->xsc[XTC][LOOP] * mocc[i]) > -INFTY)
      { xmx[cur][XMC] = sc; xtb[i][XMC] = STC; }
    if (xmx[cur][XME] > -INFTY && hmm->xsc[XTE][MOVE] > -INFTY &&
        (sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > xmx[cur][XMC])
      { xmx[cur][XMC] = sc; xtb[i][XMC] = STE; }
  }
                                /* T state (not stored in mx) */
  /* NOTE(review): this final score is computed but not read again before return */
  sc = xmx[msa->alen%2][XMC] + hmm->xsc[XTC][MOVE];

                                /* do the traceback */
  tr = ShadowTrace(tb, hmm, msa->alen);
                                /* cleanup and return */
  FreePlan7Matrix(mx);
  FreeShadowMatrix(tb);
  for (i = 1; i <= msa->alen; i++)
    free(con[i]);
  free(con);
  free(mocc);

  return tr;
}



/* Function: ShadowTrace()
 * Date:     SRE, Sun Jul 5 11:38:24 1998 [St. Louis]
 *
 * Purpose:  Given a shadow matrix, trace it back, and return
 *           the trace.
+ * + * Args: tb - shadow matrix of traceback pointers + * hmm - the model (needed for figuring out wing unfolding) + * L - sequence length + * + * Returns: traceback. Caller must free w/ P7FreeTrace(). + */ +struct p7trace_s * +ShadowTrace(struct dpshadow_s *tb, struct plan7_s *hmm, int L) +{ + struct p7trace_s *tr; + int curralloc; /* current allocated length of trace */ + int tpos; /* position in trace */ + int i; /* position in seq (1..N) */ + int k; /* position in model (1..M) */ + char nxtstate; /* next state to assign in traceback */ + + /* Overallocate for the trace. + * S-N-B- ... - E-C-T : 6 states + L is minimum trace; + * add L more as buffer. + */ + curralloc = L * 2 + 6; + P7AllocTrace(curralloc, &tr); + + /* Initialization of trace + * We do it back to front; ReverseTrace() is called later. + */ + tr->statetype[0] = STT; + tr->nodeidx[0] = 0; + tr->pos[0] = 0; + tpos = 1; + i = L; /* current i (seq pos) we're trying to assign */ + k = 0; /* current k (model pos) we're trying to assign */ + nxtstate = STC; /* assign the C state first, for C->T */ + + /* Traceback + */ + while (nxtstate != STS) { + switch (nxtstate) { + case STM: + tr->statetype[tpos] = STM; + nxtstate = tb->mtb[i][k]; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = i--; + tpos++; + break; + + case STI: + tr->statetype[tpos] = STI; + nxtstate = tb->itb[i][k]; + tr->nodeidx[tpos] = k; + tr->pos[tpos] = i--; + tpos++; + break; + + case STD: + tr->statetype[tpos] = STD; + nxtstate = tb->dtb[i][k]; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = 0; + tpos++; + break; + + case STN: + tr->statetype[tpos] = STN; + nxtstate = tb->xtb[i][XMN]; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = (nxtstate == STN) ? i-- : 0; /* N->N; 2nd one emits. 
*/ + tpos++; + break; + + case STB: + /* Check for wing unfolding */ + if (Prob2Score(hmm->begin[k+1], hmm->p1) + 1 * INTSCALE <= hmm->bsc[k+1]) + while (k > 0) + { + tr->statetype[tpos] = STD; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = 0; + tpos++; + if (tpos == curralloc) + { /* grow trace if necessary */ + curralloc += L; + P7ReallocTrace(tr, curralloc); + } + } + + tr->statetype[tpos] = STB; + nxtstate = tb->xtb[i][XMB]; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; + tpos++; + break; + + case STJ: + tr->statetype[tpos] = STJ; + nxtstate = tb->xtb[i][XMJ]; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = (nxtstate == STJ) ? i-- : 0; /* J->J; 2nd one emits. */ + tpos++; + break; + + case STE: + tr->statetype[tpos] = STE; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; + k = tb->esrc[i]; + nxtstate = STM; + tpos++; + /* check for wing unfolding */ + if (Prob2Score(hmm->end[k], 1.) + 1*INTSCALE <= hmm->esc[k]) + { + int dk; /* need a tmp k while moving thru delete wing */ + for (dk = hmm->M; dk > k; dk--) + { + tr->statetype[tpos] = STD; + tr->nodeidx[tpos] = dk; + tr->pos[tpos] = 0; + tpos++; + if (tpos == curralloc) + { /* grow trace if necessary */ + curralloc += L; + P7ReallocTrace(tr, curralloc); + } + } + } + break; + + case STC: + tr->statetype[tpos] = STC; + nxtstate = tb->xtb[i][XMC]; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = (nxtstate == STC) ? i-- : 0; /* C->C; 2nd one emits. 
*/ + tpos++; + break; + + default: + Die("HMMER: Bad state (%s) in ShadowTrace()\n", Statetype(nxtstate)); + + } /* end switch over nxtstate */ + + if (tpos == curralloc) + { /* grow trace if necessary */ + curralloc += L; + P7ReallocTrace(tr, curralloc); + } + + } /* end traceback, just before assigning S state */ + + tr->statetype[tpos] = STS; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; + tr->tlen = tpos + 1; + + P7ReverseTrace(tr); + return tr; +} + + + +/* Function: PostprocessSignificantHit() + * Date: SRE, Wed Dec 20 12:11:01 2000 [StL] + * + * Purpose: Add a significant hit to per-seq and per-domain hit + * lists, after postprocessing the scores appropriately, + * and making sure per-domain scores add up to the per-seq + * score. + * + * [doesn't really belong in core_algorithms.c, because + * it's more of a hack than an algorithm, but on the other + * hand it's now part of the core of how HMMER scores + * things. Maybe there should be a core_hacks.c.] + * + * Given: active hit lists for per-seq and per-domain + * scores (e.g. hmmpfam and hmmsearch, collating their + * results), and a new hit that's significant enough + * that it may need to be reported in final output. + * + * Breaks the traceback into individual domain traces; + * scores each one of them, then applies null2 correction + * for biased composition. Recalculates the per-seq score + * as the sum of the per-domain scores. Stores the hits + * in the lists, for eventual sorting and output by the + * caller. + * + * Notes: In principle we've got the score, and a pvalue, and a traceback + * by doing the Viterbi algorithm, right? What else is left + * to do? Well, in practice, life is more complicated, because + * of the trace-dependent null2 score correction. + * + * After a null2 score correction is carried out on + * each domain (the default) the number of detected domains + * with scores > 0 may have decreased. 
We want the + * global (per-seq) hit list to have the recalculated number of + * domains, not necessarily what Viterbi gave us. + * + * Also, since we want the global score to be the sum of + * the individual domains, but the null2 correction is + * applied to each domain individually, we have to calculate + * an adjusted global score. (To do otherwise invites + * subtle inconsistencies; xref bug 2.) + * + * We don't have final evalues, so we may put a few + * more hits into the hit lists than we end up reporting. + * The main output routine is responsible for final + * enforcement of the thresholds. + * + * This routine is NOT THREADSAFE. When multithreaded, + * with using shared ghit/dhit output buffers, calls to + * PostprocessSignificantHit() need to be protected. + * + * Args: ghit - an active list of per-seq (global) hits + * dhit - an active list of per-domain hits + * tr - the significant HMM/seq traceback we'll report on + * hmm - ptr to the HMM + * dsq - digitized sequence (1..L) + * L - length of dsq + * seqname - name of sequence (same as targname, in hmmsearch) + * seqacc - seq's accession (or NULL) + * seqdesc - seq's description (or NULL) + * do_forward - TRUE if we've already calculated final per-seq score + * sc_override - per-seq score to use if do_forward is TRUE + * do_null2 - TRUE to apply the null2 scoring correction + * thresh - contains the threshold/cutoff information. + * hmmpfam_mode - TRUE if called by hmmpfam, else assumes hmmsearch; + * affects how the lists' sort keys are set. 
+ * + * Returns: (void) + */ +void +PostprocessSignificantHit(struct tophit_s *ghit, + struct tophit_s *dhit, + struct p7trace_s *tr, + struct plan7_s *hmm, + char *dsq, + int L, + char *seqname, + char *seqacc, + char *seqdesc, + int do_forward, + float sc_override, + int do_null2, + struct threshold_s *thresh, + int hmmpfam_mode) +{ + struct p7trace_s **tarr; /* array of per-domain traces */ + struct fancyali_s *ali; /* alignment of a domain */ + int ntr; /* number of domain traces from Viterbi */ + int tidx; /* index for traces (0..ntr-1) */ + int ndom; /* # of domains accepted in sequence */ + int didx; /* index for domains (1..ndom) */ + int k1, k2; /* start, stop coord in model */ + int i1, i2; /* start, stop in sequence */ + float whole_sc; /* whole sequence score = \sum domain scores */ + float *score; /* array of raw scores for each domain */ + int *usedomain; /* TRUE if this domain is accepted */ + double whole_pval; + double pvalue; + double sortkey; + + /* Break the trace into one or more individual domains. + */ + TraceDecompose(tr, &tarr, &ntr); + if (ntr == 0) Die("TraceDecompose() screwup"); /* "can't happen" (!) */ + + /* Rescore each domain, apply null2 correction if asked. + * Mark positive-scoring ones (we'll definitely report those), + * and include their score in the whole sequence score. + */ + score = MallocOrDie(sizeof(float) * ntr); + usedomain = MallocOrDie(sizeof(int) * ntr); + ndom = 0; + whole_sc = 0.; + for (tidx = 0; tidx < ntr; tidx++) + { + score[tidx] = P7TraceScore(hmm, dsq, tarr[tidx]); + if (do_null2) score[tidx] -= TraceScoreCorrection(hmm, tarr[tidx], dsq); + if (score[tidx] > 0.0) { + usedomain[tidx] = TRUE; + ndom++; + whole_sc += score[tidx]; + } else + usedomain[tidx] = FALSE; + } + + /* Make sure at least one positive scoring domain is in + * the trace. If not, invoke "weak single domain" rules: + * we will always report at least one domain per sequence, even + * if it has a negative score. 
(HMMER's Plan7 architecture can report + * one negative scoring domain but not more.) + */ + if (ndom == 0) { + tidx = FMax(score, ntr); + usedomain[tidx] = TRUE; + whole_sc = score[tidx]; + ndom = 1; + } + + /* Implement --do_forward: override the trace-dependent sum-of-domain + * whole score, use the P7Forward() score that the called passed + * us instead. This is a hack; null2 is trace-dependent and + * thus undefined for P7Forward() scoring; see commentary in hmmpfam.c. + */ + if (do_forward) whole_sc = sc_override; + + /* Go through and put all the accepted domains into the hit list. + */ + whole_pval = PValue(hmm, whole_sc); + for (tidx = 0, didx = 1; tidx < ntr; tidx++) { + if (! usedomain[tidx]) continue; + + TraceSimpleBounds(tarr[tidx], &i1, &i2, &k1, &k2); + pvalue = PValue(hmm, score[tidx]); + + if (pvalue <= thresh->domE && score[tidx] >= thresh->domT) { + ali = CreateFancyAli(tarr[tidx], hmm, dsq, seqname); + + if (hmmpfam_mode) + sortkey = -1.*(double)i1; /* hmmpfam: sort on position in seq */ + else + sortkey = score[tidx]; /* hmmsearch: sort on E (monotonic w/ sc) */ + + RegisterHit(dhit, sortkey, + pvalue, score[tidx], + whole_pval, whole_sc, + hmmpfam_mode ? hmm->name : seqname, + hmmpfam_mode ? hmm->acc : seqacc, + hmmpfam_mode ? hmm->desc : seqdesc, + i1,i2, L, + k1,k2, hmm->M, + didx,ndom,ali); + } + didx++; + } + + /* Now register the global hit, with the domain-derived score. + */ + + /* sorting: + * hmmpfam has to worry that score and E-value are not monotonic + * when multiple HMMs (with different EVD parameters) are potential + * targets. Therefore in hmmpfam_mode we apply a weird hack + * to sort primarily on E-value, but on score + * for really good hits with E=0.0... works because we can + * assume 100000. > -log(DBL_MIN). + * hmmsearch simply sorts on score (which for a single HMM, we + * know is monotonic with E-value). + */ + if (hmmpfam_mode) + sortkey = (whole_pval > 0.0) ? -1.*log(whole_pval) : 100000. 
+ whole_sc; + else + sortkey = whole_sc; + + /* Note: we've recalculated whole_sc and it may have decreased + * after the null2 correction was applied. For Pfam GA, TC, + * or NC cutoffs, we have to be sure that everything on the + * hitlist is correct (the hmmpfam output routine assumes it, + * otherwise it would have to reload each HMM to get its + * cutoffs). In all other cases, though, we don't care if + * the hit list has a bit too many things on it, because the + * output routine in hmmsearch or hmmpfam will check against + * the cutoffs. Hence we only need to check against globT + * (it may be set by GA, TC, or NC) but not globE. + * - SRE, CSHL genome mtg May 2001 + */ + if (whole_sc >= thresh->globT) { + RegisterHit(ghit, sortkey, + whole_pval, whole_sc, + 0., 0., /* no mother seq */ + hmmpfam_mode ? hmm->name : seqname, + hmmpfam_mode ? hmm->acc : seqacc, + hmmpfam_mode ? hmm->desc : seqdesc, + 0,0,0, /* seq positions */ + 0,0,0, /* HMM positions */ + 0, ndom, /* # domains info */ + NULL); /* alignment info */ + } + + /* Clean up and return. + */ + for (tidx = 0; tidx < ntr; tidx++) + P7FreeTrace(tarr[tidx]); + free(tarr); + free(score); + free(usedomain); + return; +} diff --git a/forester/archive/RIO/others/hmmer/src/debug.c b/forester/archive/RIO/others/hmmer/src/debug.c new file mode 100644 index 0000000..1044436 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/debug.c @@ -0,0 +1,368 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* debug.c + * Thu Nov 21 09:58:05 1996 + * + * Printing out or naming various useful things from HMMER + * innards. 
+ * + * CVS $Id: debug.c,v 1.1.1.1 2005/03/22 08:33:58 cmzmasek Exp $ + */ + +#include +#include +#include +#include + +#include "structs.h" +#include "config.h" +#include "funcs.h" +#include "squid.h" + +/* Function: Statetype() + * + * Purpose: Returns the state type in text. + * Example: Statetype(S) = "S" + */ +char * +Statetype(char st) +{ + switch (st) { + case STS: return "S"; + case STN: return "N"; + case STB: return "B"; + case STM: return "M"; + case STD: return "D"; + case STI: return "I"; + case STE: return "E"; + case STJ: return "J"; + case STC: return "C"; + case STT: return "T"; + default: return "BOGUS"; + } +} + +/* Function: AlphabetType2String() + * Date: SRE, Sun Dec 24 11:33:40 2000 [St. Louis] + * + * Purpose: Returns a string "protein" for hmmAMINO, + * "nucleic acid" for hmmNUCLEIC, etc... used + * for formatting diagnostics. + * + * Args: type - Alphabet type, e.g. hmmAMINO + * + * Returns: char * + */ +char * +AlphabetType2String(int type) +{ + switch (type) { + case hmmAMINO: return "protein"; + case hmmNUCLEIC: return "nucleic acid"; + case hmmNOTSETYET: return "unknown"; + default: return "BOGUS"; + } +} + + +/* Function: P7PrintTrace() + * + * Purpose: Print out a traceback structure. + * If hmm is non-NULL, also print transition and emission scores. + * + * Args: fp - stderr or stdout, often + * tr - trace structure to print + * hmm - NULL or hmm containing scores to print + * dsq - NULL or digitized sequence trace refers to. 
+ */ +void +P7PrintTrace(FILE *fp, struct p7trace_s *tr, struct plan7_s *hmm, char *dsq) +{ + int tpos; /* counter for trace position */ + int sym; + int sc; + + if (hmm == NULL) { + fprintf(fp, "st node rpos - traceback len %d\n", tr->tlen); + fprintf(fp, "-- ---- ------\n"); + for (tpos = 0; tpos < tr->tlen; tpos++) { + fprintf(fp, "%1s %4d %6d\n", + Statetype(tr->statetype[tpos]), + tr->nodeidx[tpos], + tr->pos[tpos]); + } + } else { + if (!(hmm->flags & PLAN7_HASBITS)) + Die("oi, you can't print scores from that hmm, it's not ready."); + + sc = 0; + fprintf(fp, "st node rpos transit emission - traceback len %d\n", tr->tlen); + fprintf(fp, "-- ---- ------ ------- --------\n"); + for (tpos = 0; tpos < tr->tlen; tpos++) { + if (dsq != NULL) sym = (int) dsq[tr->pos[tpos]]; + + fprintf(fp, "%1s %4d %6d %7d", + Statetype(tr->statetype[tpos]), + tr->nodeidx[tpos], + tr->pos[tpos], + (tpos < tr->tlen-1) ? + TransitionScoreLookup(hmm, tr->statetype[tpos], tr->nodeidx[tpos], + tr->statetype[tpos+1], tr->nodeidx[tpos+1]) : 0); + + if (tpos < tr->tlen-1) + sc += TransitionScoreLookup(hmm, tr->statetype[tpos], tr->nodeidx[tpos], + tr->statetype[tpos+1], tr->nodeidx[tpos+1]); + + if (dsq != NULL) { + if (tr->statetype[tpos] == STM) + { + fprintf(fp, " %8d %c", hmm->msc[sym][tr->nodeidx[tpos]], + Alphabet[sym]); + sc += hmm->msc[sym][tr->nodeidx[tpos]]; + } + else if (tr->statetype[tpos] == STI) + { + fprintf(fp, " %8d %c", hmm->isc[sym][tr->nodeidx[tpos]], + (char) tolower((int) Alphabet[sym])); + sc += hmm->isc[sym][tr->nodeidx[tpos]]; + } + else if ((tr->statetype[tpos] == STN && tr->statetype[tpos-1] == STN) || + (tr->statetype[tpos] == STC && tr->statetype[tpos-1] == STC) || + (tr->statetype[tpos] == STJ && tr->statetype[tpos-1] == STJ)) + { + fprintf(fp, " %8d %c", 0, (char) tolower((int) Alphabet[sym])); + } + } else { + fprintf(fp, " %8s %c", "-", '-'); + } + + + fputs("\n", fp); + } + fprintf(fp, " ------- --------\n"); + fprintf(fp, " total: %6d\n\n", sc); + } +} + 
+/* Function: P7PrintPrior() + * + * Purpose: Print out a Plan 7 prior structure. + */ +void +P7PrintPrior(FILE *fp, struct p7prior_s *pri) +{ + int q, x; /* counters for mixture component, element */ + + if (pri->strategy == PRI_DCHLET) fputs("Dirichlet\n", fp); + else if (pri->strategy == PRI_PAM) fputs("PAM\n", fp); + else Die("No such strategy."); + + if (Alphabet_type == hmmAMINO) fputs("Amino\n", fp); + else if (Alphabet_type == hmmNUCLEIC) fputs("Nucleic\n", fp); + + /* Transitions + */ + fprintf(fp, "\n%d\n", pri->tnum); + for (q = 0; q < pri->tnum; q++) + { + fprintf(fp, "%.4f\n", pri->tq[q]); + for (x = 0; x < 7; x++) + fprintf(fp, "%.4f ", pri->t[q][x]); + fputs("\n", fp); + } + + /* Match emissions + */ + fprintf(fp, "\n%d\n", pri->mnum); + for (q = 0; q < pri->mnum; q++) + { + fprintf(fp, "%.4f\n", pri->mq[q]); + for (x = 0; x < Alphabet_size; x++) + fprintf(fp, "%.4f ", pri->m[q][x]); + fputs("\n", fp); + } + + /* Insert emissions + */ + fprintf(fp, "\n%d\n", pri->inum); + for (q = 0; q < pri->inum; q++) + { + fprintf(fp, "%.4f\n", pri->iq[q]); + for (x = 0; x < Alphabet_size; x++) + fprintf(fp, "%.4f ", pri->i[q][x]); + fputs("\n", fp); + } +} + +/* Function: TraceVerify() + * Date: SRE, Mon Feb 2 07:48:52 1998 [St. Louis] + * + * Purpose: Check a traceback structure for internal consistency. + * Used in Shiva testsuite, for example. + * + * Args: tr - traceback to verify + * M - length of HMM + * N - length of sequence + * + * Returns: 1 if OK. 0 if not. + */ +int +TraceVerify(struct p7trace_s *tr, int M, int N) +{ + int tpos; /* position in trace */ + int k; /* current position in HMM nodes 1..M */ + int i; /* current position in seq 1..N */ + int nn, nc, nj; /* number of STN's, STC's, STJ's seen */ + int nm; /* number of STM's seen */ + + /* Basic checks on ends. 
+ */ + if (tr->statetype[0] != STS) return 0; + if (tr->statetype[1] != STN) return 0; + if (tr->statetype[tr->tlen-2] != STC) return 0; + if (tr->statetype[tr->tlen-1] != STT) return 0; + if (tr->pos[1] != 0) return 0; + + /* Check for consistency throughout trace + */ + k = i = nn = nc = nj = nm = 0; + for (tpos = 0; tpos < tr->tlen; tpos++) + { + switch (tr->statetype[tpos]) { + case STS: + if (tr->nodeidx[tpos] != 0) return 0; + if (tr->pos[tpos] != 0) return 0; + if (k != 0) return 0; + if (i != 0) return 0; + if (tpos != 0) return 0; + break; + + case STN: /* first N doesn't emit. */ + if (tr->nodeidx[tpos] != 0) return 0; + if (k != 0) return 0; + if (nn > 0) + { + if (tr->pos[tpos] != i+1) return 0; + i++; + } + else + { + if (tr->pos[tpos] != 0) return 0; + if (i != 0) return 0; + } + nn++; + break; + + case STB: + if (tr->nodeidx[tpos] != 0) return 0; + if (tr->pos[tpos] != 0) return 0; + nm = 0; + break; + + case STM: /* can enter anywhere on first M */ + if (tr->pos[tpos] != i+1) return 0; + if (tr->nodeidx[tpos] < 1 || tr->nodeidx[tpos] > M) return 0; + i++; + if (nm == 0) k = tr->nodeidx[tpos]; + else { + if (tr->nodeidx[tpos] != k+1) return 0; + k++; + } + nm++; + break; + + case STI: + if (tr->pos[tpos] != i+1) return 0; + if (tr->nodeidx[tpos] != k) return 0; + if (tr->nodeidx[tpos] < 1 || tr->nodeidx[tpos] > M-1) return 0; + if (k >= M) return 0; + i++; + break; + + case STD: + if (tr->pos[tpos] != 0) return 0; + if (tr->nodeidx[tpos] != k+1) return 0; + if (tr->nodeidx[tpos] < 1 || tr->nodeidx[tpos] > M) return 0; + k++; + break; + + case STE: + if (tr->nodeidx[tpos] != 0) return 0; + if (tr->pos[tpos] != 0) return 0; + nj = 0; + break; + + case STJ: + if (tr->nodeidx[tpos] != 0) return 0; + if (nj > 0) + { + if (tr->pos[tpos] != i+1) return 0; + i++; + } + else if (tr->pos[tpos] != 0) return 0; + nj++; + break; + + case STC: + if (tr->nodeidx[tpos] != 0) return 0; + if (nc > 0) + { + if (tr->pos[tpos] != i+1) return 0; + i++; + } + else if 
(tr->pos[tpos] != 0) return 0; + nc++; + break; + + case STT: + if (tpos != tr->tlen - 1) return 0; + if (tr->nodeidx[tpos] != 0) return 0; + if (tr->pos[tpos] != 0) return 0; + if (i != N) return 0; + break; + + case STBOGUS: + default: + return 0; + } /* end switch over statetypes */ + } /* end loop over trace positions */ + + return 1; +} + + +/* Function: TraceCompare() + * Date: SRE, Wed Mar 4 17:26:49 1998 [St. Louis] + * + * Purpose: Compare two tracebacks; return 1 if they're + * identical, else 0. Written for Shiva testsuite. + * + * Args: t1 - first trace + * t2 - second trace + * + * Returns: 1 if identical; 0 elsewise + */ +int +TraceCompare(struct p7trace_s *t1, struct p7trace_s *t2) +{ + int tpos; + + if (t1->tlen != t2->tlen) return 0; + + for (tpos = 0; tpos < t1->tlen; tpos++) + { + if (t1->statetype[tpos] != t2->statetype[tpos]) return 0; + if (t1->nodeidx[tpos] != t2->nodeidx[tpos]) return 0; + if (t1->pos[tpos] != t2->pos[tpos]) return 0; + } + return 1; +} + diff --git a/forester/archive/RIO/others/hmmer/src/display.c b/forester/archive/RIO/others/hmmer/src/display.c new file mode 100644 index 0000000..137180a --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/display.c @@ -0,0 +1,447 @@ +/************************************************************ + * Copyright (C) 1998 Ian Holmes + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* display.c + * Author: Ian Holmes (ihh@sanger.ac.uk, Jun 5 1998) + * Derived from core_algorithms.c (SRE, Nov 11 1996) + * Incorporated SRE, Sat Nov 6 10:09:41 1999 + * + * Functions for displaying HMMer2.0 structures. 
+ * + * RCS $Id: display.c,v 1.1.1.1 2005/03/22 08:33:59 cmzmasek Exp $ + */ + +#include "structs.h" +#include "config.h" +#include "funcs.h" +#include "squid.h" + +void PrintIscore(int sc); + +void PrintTransition(char src, + int isrc, + int ksrc, + char dest, + int idest, + int kdest, + int sc, + struct p7trace_s **alignment, + int *min, + int *max, + int *on, + int A); + + +/* Function: DisplayPlan7Posteriors() + * + * Purpose: Print out posterior transition probabilities + * in modelpost format. + * NB only prints out transitions that touch + * either the Viterbi or the optimal accuracy path. + * + * Args: L - the length of the sequence + * hmm - the model + * forward - forward matrix + * backward - backward matrix + * viterbi - Viterbi trace + * optacc - optimal accuracy trace + * + * Return: void + * + */ +void DisplayPlan7Posteriors(int L, struct plan7_s *hmm, + struct dpmatrix_s *forward, + struct dpmatrix_s *backward, + struct p7trace_s *viterbi, + struct p7trace_s *optacc) +{ + struct p7trace_s* alignment[2]; + alignment[0] = viterbi; + alignment[1] = optacc; + DisplayPlan7PostAlign (L, hmm, forward, backward, alignment, 2); +} + + +/* Function: DisplayPlan7PostAlign() + * + * Purpose: Print out posterior transition probabilities + * in modelpost format, for any set of alignments. 
+ * + * Args: L - the length of the sequence + * hmm - the model + * forward - forward matrix + * backward - backward matrix + * alignment - array of traces + * A - size of alignment array + * + * Return: void + * + */ +void DisplayPlan7PostAlign(int L, struct plan7_s *hmm, + struct dpmatrix_s *forward, + struct dpmatrix_s *backward, + struct p7trace_s **alignment, + int A) +{ + int sc; + int i; + int j; + int k; + int kmin; + int kmax; + int* min; + int* max; + int* on; + char state; + + sc = forward->xmx[L][XMC] + hmm->xsc[XTC][MOVE]; /* total Forward score */ + + min = (int*) calloc (A, sizeof(int)); + max = (int*) calloc (A, sizeof(int)); + on = (int*) calloc (A, sizeof(int)); + + for (i = 0; i <= L; i++) + { + for (j = 0; j < A; j++) { + while (alignment[j]->pos[min[j]] < i - 1 && min[j] < alignment[j]->tlen - 1) + min[j]++; + + while (alignment[j]->pos[max[j]] <= i + 1 && max[j] < alignment[j]->tlen - 1) + max[j]++; + } + + for (state = STM; state <= STJ; state++) + { + if (state == STM || state == STB) + { + kmin = 1; + kmax = hmm->M; + } + else if (state == STD) + { + kmin = 2; + kmax = hmm->M - 1; + } + else if (state == STI) + { + kmin = 1; + kmax = hmm->M - 1; + } + else + kmin = kmax = 0; + + for (k = kmin; k <= kmax; k++) + { + switch (state) + { + case STM: + if (iM) + PrintTransition (STM,i,k, STM,i+1,k+1, + forward->mmx[i][k] + hmm->tsc[k][TMM] + backward->mmx[i+1][k+1] - sc, + alignment, min, max, on, A); + + if (iM) + PrintTransition (STM,i,k, STI,i+1,k, + forward->mmx[i][k] + hmm->tsc[k][TMI] + backward->imx[i+1][k] - sc, + alignment, min, max, on, A); + + if (kM-1) + PrintTransition (STM,i,k, STD,i,k+1, + forward->mmx[i][k] + hmm->tsc[k][TMD] + backward->dmx[i][k+1] - sc, + alignment, min, max, on, A); + + PrintTransition (STM,i,k, STE,i,0, + forward->mmx[i][k] + hmm->esc[k] + backward->xmx[i][XME] - sc, + alignment, min, max, on, A); + break; + + case STD: + if (idmx[i][k] + hmm->tsc[k][TDM] + backward->mmx[i+1][k+1] - sc, + alignment, min, 
max, on, A); + + PrintTransition (STD,i,k, STD,i,k+1, + forward->dmx[i][k] + hmm->tsc[k][TDD] + backward->dmx[i][k+1] - sc, + alignment, min, max, on, A); + + break; + + case STI: + if (iimx[i][k] + hmm->tsc[k][TIM] + backward->mmx[i+1][k+1] - sc, + alignment, min, max, on, A); + + if (iimx[i][k] + hmm->tsc[k][TII] + backward->imx[i+1][k] - sc, + alignment, min, max, on, A); + + break; + + case STB: + if (ixmx[i][XMB] + hmm->bsc[k] + backward->mmx[i+1][k] - sc, + alignment, min, max, on, A); + break; + + default: + break; + + } + } + + switch (state) + { + case STN: + PrintTransition (STN,i,0, STB,i,0, + forward->xmx[i][XMN] + hmm->xsc[XTN][MOVE] + backward->xmx[i][XMB] - sc, + alignment, min, max, on, A); + + if (ixmx[i][XMN] + hmm->xsc[XTN][LOOP] + backward->xmx[i+1][XMN] - sc, + alignment, min, max, on, A); + break; + + case STJ: + PrintTransition (STJ,i,0, STB,i,0, + forward->xmx[i][XMJ] + hmm->xsc[XTJ][MOVE] + backward->xmx[i][XMB] - sc, + alignment, min, max, on, A); + + if (ixmx[i][XMJ] + hmm->xsc[XTJ][LOOP] + backward->xmx[i+1][XMJ] - sc, + alignment, min, max, on, A); + break; + + case STC: + PrintTransition (STC,i,0, STT,i,0, + forward->xmx[i][XMC] + hmm->xsc[XTC][MOVE] - sc, /* should be 1 */ + alignment, min, max, on, A); + + if (ixmx[i][XMC] + hmm->xsc[XTC][LOOP] + backward->xmx[i+1][XMC] - sc, + alignment, min, max, on, A); + break; + + case STE: + PrintTransition (STE,i,0, STC,i,0, + forward->xmx[i][XME] + hmm->xsc[XTE][MOVE] + backward->xmx[i][XMC] - sc, + alignment, min, max, on, A); + + PrintTransition (STE,i,0, STJ,i,0, + forward->xmx[i][XME] + hmm->xsc[XTE][LOOP] + backward->xmx[i][XMJ] - sc, + alignment, min, max, on, A); + break; + + case STS: + if (i == 0) + PrintTransition (STS,i,0, STN,i,0, + backward->xmx[i][XMN] - sc, /* should be 1 */ + alignment, min, max, on, A); + break; + + case STM: + case STD: + case STI: + case STB: + case STT: + break; + + default: + Die ("unknown state"); + + } + } + } + + free (min); + free (max); + free (on); 
+ +} + + + +/* Function: DisplayPlan7Matrix() + * + * Purpose: Print out a dynamic programming matrix. + * + * Args: dsq - sequence in digitized form + * L - length of dsq + * hmm - the model + * mx - dp matrix + * + * Return: void + * + * The output of this function inverts HMMer's concept of rows and columns + * (i.e. each row represents a state, and each column, a residue); + * also, probabilities are displayed as natural logs, not bit scores. + * It should probably only be used by ihh... + * + */ +void +DisplayPlan7Matrix(char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s *mx) +{ + int i; + int k; + + printf(" * "); + for (i=1;i<=L;i++) printf(" %c ",Alphabet[dsq[i]]); + printf("\nN "); + for (i=0;i<=L;i++) PrintIscore(mx->xmx[i][XMN]); + for (k=1;k<=hmm->M;k++) { + printf("\nM%-3d ",k); + for (i=0;i<=L;i++) PrintIscore(mx->mmx[i][k]); + } + for (k=1;kM;k++) { + printf("\nI%-3d ",k); + for (i=0;i<=L;i++) PrintIscore(mx->imx[i][k]); + } + printf("\nE "); + for (i=0;i<=L;i++) PrintIscore(mx->xmx[i][XME]); + printf("\nC "); + for (i=0;i<=L;i++) PrintIscore(mx->xmx[i][XMC]); + printf("\nJ "); + for (i=0;i<=L;i++) PrintIscore(mx->xmx[i][XMJ]); + printf("\nB "); + for (i=0;i<=L;i++) PrintIscore(mx->xmx[i][XMB]); + for (k=2;kM;k++) { + printf("\nD%-3d ",k); + for (i=0;i<=L;i++) PrintIscore(mx->dmx[i][k]); + } + printf("\n\n"); +} + + +void PrintIscore(int sc) { + double dsc; + double div; + dsc = (double) sc; + div = INTSCALE / 0.693147180559945; /* == INTSCALE / log(2) */ + dsc = dsc / div; + printf("%- #11.3e",dsc); +} + + +void PrintTransition(char src, + int isrc, + int ksrc, + char dest, + int idest, + int kdest, + int sc, + struct p7trace_s **alignment, + int *min, + int *max, + int *on, + int A) +{ + char src_str[6]; /* buffer for source state label */ + char dest_str[6]; /* buffer for destination state label */ + int j; + int tpos; + int tnext; + int pos; + int next; + int near; + + near = 0; + + for (j = 0; j < A; j++) { + on[j] = 0; + for (pos = 0, 
tpos = min[j]; tpos <= max[j]; tpos++) { + + if (alignment[j]->pos[tpos] != 0) + pos = alignment[j]->pos[tpos]; + + if (src == alignment[j]->statetype[tpos] + && ksrc == alignment[j]->nodeidx[tpos] + && isrc == pos) + near = TRUE; + + if (dest == alignment[j]->statetype[tpos] + && kdest == alignment[j]->nodeidx[tpos] + && idest == pos) + near = TRUE; + + if (tpos < alignment[j]->tlen - 1) + { + tnext = tpos + 1; + + /* fold up B->D->M transitions into pseudo- B->M transitions */ + + if (alignment[j]->statetype[tpos] == STB) + while (alignment[j]->statetype[tnext] == STD && tnext < alignment[j]->tlen - 1) + tnext++; + + next = alignment[j]->pos[tnext]; + if (next == 0) + next = pos; + + if (src == alignment[j]->statetype[tpos] + && ksrc == alignment[j]->nodeidx[tpos] + && isrc == pos + && dest == alignment[j]->statetype[tnext] + && kdest == alignment[j]->nodeidx[tnext] + && idest == next) + on[j] = TRUE; + } + } + } + + if (!near) return; + + switch (src) + { + case STM: sprintf (src_str, "M%d", ksrc); break; + case STD: sprintf (src_str, "D%d", ksrc); break; + case STI: sprintf (src_str, "I%d", ksrc); break; + case STS: sprintf (src_str, "S"); break; + case STN: sprintf (src_str, "N"); break; + case STB: sprintf (src_str, "B"); break; + case STE: sprintf (src_str, "E"); break; + case STC: sprintf (src_str, "C"); break; + case STJ: sprintf (src_str, "J"); break; + case STT: sprintf (src_str, "T"); break; + default: Die ("bad transition"); + } + + switch (dest) + { + case STM: sprintf (dest_str, "M%d", kdest); break; + case STD: sprintf (dest_str, "D%d", kdest); break; + case STI: sprintf (dest_str, "I%d", kdest); break; + case STS: sprintf (dest_str, "S"); break; + case STN: sprintf (dest_str, "N"); break; + case STB: sprintf (dest_str, "B"); break; + case STE: sprintf (dest_str, "E"); break; + case STC: sprintf (dest_str, "C"); break; + case STJ: sprintf (dest_str, "J"); break; + case STT: sprintf (dest_str, "T"); break; + default: Die ("bad transition"); + } + + 
printf ("%d\t%s\t%d\t%s\t%-14.7g\t", isrc, src_str, idest, dest_str, (double) Score2Prob(sc,1.)); + + for (j = 0; j < A; j++) { + if (on[j]) printf ("*"); + if (j < A - 1) printf ("\t"); + } + + printf ("\n"); + +} + diff --git a/forester/archive/RIO/others/hmmer/src/emit.c b/forester/archive/RIO/others/hmmer/src/emit.c new file mode 100644 index 0000000..e6b101c --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/emit.c @@ -0,0 +1,457 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* emit.c + * SRE, Sun Mar 8 12:26:58 1998 + * RCS $Id: emit.c,v 1.1.1.1 2005/03/22 08:34:04 cmzmasek Exp $ + * + * Generation of sequences/traces from an HMM. + */ + +#include "structs.h" +#include "config.h" +#include "funcs.h" +#include "squid.h" + +#include + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + +/* Function: EmitSequence() + * Date: SRE, Sun Mar 8 12:28:03 1998 [St. Louis] + * + * Purpose: Given a model, sample a sequence and/or traceback. 
+ * + * Args: hmm - the model + * ret_dsq - RETURN: generated digitized sequence (pass NULL if unwanted) + * ret_L - RETURN: length of generated sequence + * ret_tr - RETURN: generated trace (pass NULL if unwanted) + * + * Returns: void + */ +void +EmitSequence(struct plan7_s *hmm, char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr) +{ + struct p7trace_s *tr; + char type; /* current state type */ + int k; /* current node index */ + char *dsq; /* generated sequence, digitized */ + int L; /* length of sequence */ + int alloc_tlen; /* allocated space for traceback */ + int alloc_L; /* allocated space for sequence */ + int tpos; /* position in traceback */ + int sym; /* a generated symbol index */ + float t[4]; /* little array for choosing M transition from */ + + /* Initialize; allocations + */ + P7AllocTrace(64, &tr); + alloc_tlen = 64; + dsq = MallocOrDie(sizeof(char) * 64); + alloc_L = 64; + + TraceSet(tr, 0, STS, 0, 0); + TraceSet(tr, 1, STN, 0, 0); + dsq[0] = (char) Alphabet_iupac; + L = 1; + k = 0; + type = STN; + tpos = 2; + + while (type != STT) + { + /* Deal with state transition + */ + switch (type) { + case STB: + hmm->begin[0] = hmm->tbd1; /* begin[0] hack (documented in structs.h) */ + k = FChoose(hmm->begin, hmm->M+1); + if (k == 0) { type = STD; k = 1; } else {type = STM; } + break; + + case STI: type = (FChoose(hmm->t[k]+TIM, 2) == 0) ? STM : STI; if (type == STM) k++; break; + case STN: type = (FChoose(hmm->xt[XTN], 2) == LOOP) ? STN : STB; k = 0; break; + case STE: type = (FChoose(hmm->xt[XTE], 2) == LOOP) ? STJ : STC; k = 0; break; + case STC: type = (FChoose(hmm->xt[XTC], 2) == LOOP) ? STC : STT; k = 0; break; + case STJ: type = (FChoose(hmm->xt[XTJ], 2) == LOOP) ? STJ : STB; k = 0; break; + + case STD: + if (k < hmm->M) { + type = (FChoose(hmm->t[k]+TDM, 2) == 0) ? 
STM : STD; + k++; + } else { + type = STE; + k = 0; + } + break; + + case STM: + if (k < hmm->M) { + FCopy(t, hmm->t[k], 3); + t[3] = hmm->end[k]; + switch (FChoose(t,4)) { + case 0: k++; type = STM; break; + case 1: type = STI; break; + case 2: k++; type = STD; break; + case 3: k=0; type = STE; break; + default: Die("never happens"); + } + } else { + k = 0; + type = STE; + } + break; + + case STT: + case STBOGUS: + default: + Die("can't happen."); + } + + /* Choose a symbol emission, if necessary + */ + sym = -1; + if (type == STM) sym = FChoose(hmm->mat[k], Alphabet_size); + else if (type == STI) sym = FChoose(hmm->ins[k], Alphabet_size); + else if ((type == STN && tr->statetype[tpos-1] == STN) || + (type == STC && tr->statetype[tpos-1] == STC) || + (type == STJ && tr->statetype[tpos-1] == STJ)) + sym = FChoose(hmm->null, Alphabet_size); + + /* Add to the traceback; deal with realloc if necessary + */ + TraceSet(tr, tpos, type, k, (sym != -1) ? L : 0); + tpos++; + if (tpos == alloc_tlen) { + alloc_tlen += 64; + P7ReallocTrace(tr, alloc_tlen); + } + + /* Add to the digitized seq; deal with realloc, if necessary + */ + if (sym != -1) { + dsq[L] = (char) sym; + L++; + if (L+1 == alloc_L) { /* L+1 leaves room for sentinel byte + \0 */ + alloc_L += 64; + dsq = ReallocOrDie(dsq, sizeof(char) * alloc_L); + } + } + } + + /* Finish off the trace + */ + tr->tlen = tpos; + + /* Finish off the dsq with sentinel byte and null terminator. + * Emitted Sequence length is L-1. + */ + dsq[L] = (char) Alphabet_iupac; + dsq[L+1] = '\0'; + L--; + + /* Return + */ + if (ret_dsq != NULL) *ret_dsq = dsq; else free(dsq); + if (ret_L != NULL) *ret_L = L; + if (ret_tr != NULL) *ret_tr = tr; else P7FreeTrace(tr); + return; +} + +#ifdef SRE_REMOVED +/* Function: EmitBestSequence() + * Date: SRE, Tue Nov 10 16:21:59 1998 [St. Louis] + * + * Purpose: Given a model, emit the maximum probability sequence + * from it: argmax_{seq} P(seq | model). 
 *           This is a sensible HMM equivalent to a "consensus"
 *           sequence.
 *           The model should be Plan7NakedConfig()'ed;
 *           in particular, if we allowed B->M and M->E,
 *           the highest probability sequence would be
 *           artifactually short. (We could do the highest
 *           scoring sequence instead, to get around this problem,
 *           but the highest scoring sequence is prone to
 *           other artifacts -- any looping state N,C,J, or I
 *           with a positively scoring residue leads to
 *           an infinitely long "best scoring" sequence.)
 *
 *           NOTE: this function is compiled only under SRE_REMOVED
 *           and is unfinished: the D1 initialisation has no
 *           right-hand side, no traceback is performed, and none of
 *           the ret_* outputs is ever written.
 *
 * Args:     hmm     - the model
 *           ret_seq - RETURN: best sequence
 *           ret_L   - RETURN: length of sequence
 *           ret_tr  - RETURN: traceback of the model/seq alignment; or NULL.
 *
 * Returns:  void
 */
void
EmitBestSequence(struct plan7_s *hmm, char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr)
{
  char *seq;                    /* RETURN: best seq */
  struct p7trace_s *tr;         /* RETURN: traceback */
  float *mmx, *imx, *dmx;       /* log P forward scores for M,D,I */
  char *mtb, *itb, *dtb;        /* traceback ptrs for M,D,I */
  int x;                        /* counter for symbols */
  int k;                        /* counter for nodes */
  float sc;                     /* tmp var for a log P */
  int bestsym;
  int rpos;                     /* position in a sequence */
  int tpos;                     /* position in a trace */
  int tlen;                     /* length of the traceback */

  /* Initial allocations. We only need a 1D matrix and its shadow;
   * it's overkill to use the Plan7Matrix structures, so don't.
   */
  mmx = MallocOrDie(sizeof(float) * (hmm->M+1));
  imx = MallocOrDie(sizeof(float) * (hmm->M));
  dmx = MallocOrDie(sizeof(float) * (hmm->M));
  mtb = MallocOrDie(sizeof(char) * (hmm->M+1));
  itb = MallocOrDie(sizeof(char) * (hmm->M));
  dtb = MallocOrDie(sizeof(char) * (hmm->M));

  /* Initialization.
   * We can safely assume a max probability path of S->N->B->(M1 or D1),
   * so just init M1 and D1.
   */
  mmx[1] = log(hmm->xt[XTN][MOVE]) + log(1.F - hmm->tbd1);
  /* NOTE(review): the statement below has no right-hand side, so this
   * function does not compile if SRE_REMOVED is ever defined. */
  dmx[1] =


  /* Main recursion, done as a push.
   * The model is used in probability form; no wing folding needed.
   */
  for (k = 1; k < hmm->M; k++)
    {
      /* Transits out of match state (init with these)
       */
      mmx[k+1] = mmx[k] + log(hmm->t[k][TMM]); mtb[k+1] = STM;
      dmx[k+1] = mmx[k] + log(hmm->t[k][TMD]); dtb[k+1] = STM;
      /* NOTE(review): only the imx[k] assignment is governed by this
       * unbraced if; the itb[k] assignment on the same line always runs. */
      if (k < hmm->M-1)
        imx[k] = mmx[k] + log(hmm->t[k][TMI]); itb[k] = STM;

      /* Transits out of delete state
       */
      if ((sc = dmx[k] + log(hmm->t[k][TDM])) > mmx[k+1])
        { mmx[k+1] = sc; mtb[k+1] = STD; }
      if ((sc = dmx[k] + log(hmm->t[k][TDD])) > dmx[k+1])
        { dmx[k+1] = sc; dtb[k+1] = STD; }

      /* Transits out of insert state (self-loops are never good)
       */
      if ((sc = imx[k] + log(hmm->t[k][TIM])) > mmx[k+1])
        { mmx[k+1] = sc; mtb[k+1] = STI; }

      /* Best emissions
       */
      x = FMax(hmm->mat[k+1], Alphabet_size);
      mmx[k+1] += log(hmm->mat[k+1][x]);

      if (k < hmm->M-1) {
        x = FMax(hmm->ins[k+1], Alphabet_size);
        imx[k+1] += log(hmm->ins[k+1][x]);
      }
    }
}
#endif /* SRE_REMOVED */


/* Function: EmitConsensusSequence()
 * Date:     SRE, Wed Nov 11 11:08:59 1998 [St. Louis]
 *
 * Purpose:  Generate a "consensus sequence". For the purposes
 *           of a profile HMM, this is defined as:
 *              - for each node:
 *                 - if StateOccupancy() says that M is used
 *                     with probability >= 0.5, this M is "consensus".
 *                     Then, choose maximally likely residue.
 *                     if P>0.5 (protein) or P>0.9 (DNA), make
 *                     it upper case; else make it lower case.
 *                 - if StateOccupancy() says that I
 *                     is used with P >= 0.5, this I is "consensus";
 *                     use it 1/(1-TII) times (its expectation value).
 *                     Generate an "x" from each I.
 *
 *           The function expects that the model is config'ed
 *           by Plan7NakedConfig(): that is, for a single global pass
 *           with no N,C,J involvement.
+ * + * + * Args: hmm - the model + * ret_seq - RETURN: consensus sequence (pass NULL if unwanted) + * ret_dsq - RETURN: digitized consensus sequence (pass NULL if unwanted) + * ret_L - RETURN: length of generated sequence + * ret_tr - RETURN: generated trace (pass NULL if unwanted) + * + * Returns: void + */ +void +EmitConsensusSequence(struct plan7_s *hmm, char **ret_seq, char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr) +{ + struct p7trace_s *tr; /* RETURN: traceback */ + char *dsq, *seq; /* sequence in digitized and undigitized form */ + float *mp, *ip, *dp; /* state occupancies from StateOccupancy() */ + int nmat, ndel, nins; /* number of matches, deletes, inserts used */ + int k; /* counter for nodes */ + int tpos; /* position in trace */ + int i; /* position in seq (equiv pos in dsq is i+1 */ + int x; /* symbol choice (M) or # symbols (I) */ + float mthresh; /* >= this, show symbol as upper case */ + + if (Alphabet_type == hmmAMINO) mthresh = 0.5; + else mthresh = 0.9; + + StateOccupancy(hmm, &mp, &ip, &dp); + + /* First pass: how many states do we need in the trace? + * how long will the sequence be? + */ + nmat = ndel = nins = 0; + for (k = 1; k <= hmm->M; k++) + { + if (mp[k] >= 0.5) nmat++; else ndel++; + if (k < hmm->M && ip[k] >= 0.5) + nins += (int) (1.f / (1.f - hmm->t[k][TII])); + } + + /* Allocations + */ + P7AllocTrace(6 + nmat + ndel + nins, &tr); + dsq = MallocOrDie(sizeof(char) * (nmat+nins+3)); + seq = MallocOrDie(sizeof(char) * (nmat+nins+1)); + + /* Main pass. + * Construct consensus trace, seq, and dsq. 
+ */ + TraceSet(tr, 0, STS, 0, 0); + TraceSet(tr, 1, STN, 0, 0); + TraceSet(tr, 2, STB, 0, 0); + dsq[0] = Alphabet_iupac; /* guard byte */ + tpos = 3; + i = 0; + for (k = 1; k <= hmm->M; k++) + { + if (mp[k] >= 0.5) + { + x = FMax(hmm->mat[k], Alphabet_size); + TraceSet(tr, tpos, STM, k, i+1); + seq[i] = Alphabet[x]; + dsq[i+1] = x; + if (hmm->mat[k][x] < mthresh) + seq[i] = tolower((int) seq[i]); + i++; + tpos++; + } + else + { + TraceSet(tr, tpos, STD, k, 0); + tpos++; + } + + if (k < hmm->M && ip[k] >= 0.5) + { + x = (int) (1.f / (1.f - hmm->t[k][TII])); + while (x--) + { + TraceSet(tr, tpos, STI, k, i+1); + seq[i] = 'x'; + dsq[i+1] = Alphabet_iupac - 1; + i++; + tpos++; + } + } + } + TraceSet(tr, tpos, STE, 0, 0); tpos++; + TraceSet(tr, tpos, STC, 0, 0); tpos++; + TraceSet(tr, tpos, STT, 0, 0); tpos++; + dsq[i+1] = Alphabet_iupac; + + free(mp); + free(ip); + free(dp); + if (ret_seq != NULL) *ret_seq = seq; else free(seq); + if (ret_dsq != NULL) *ret_dsq = dsq; else free(dsq); + if (ret_L != NULL) *ret_L = i; + if (ret_tr != NULL) *ret_tr = tr; else P7FreeTrace(tr); +} + + + +/* Function: StateOccupancy() + * Date: SRE, Wed Nov 11 09:46:15 1998 [St. Louis] + * + * Purpose: Calculate the expected state occupancy for + * a given HMM in generated traces. + * + * Note that expected prob of getting into + * any special state in a trace is trivial: + * S,N,B,E,C,T = 1.0 + * J = E->J transition prob + * + * Args: hmm - the model + * ret_mp - RETURN: [1..M] prob's of occupying M + * ret_ip - RETURN: [1..M-1] prob's of occupying I + * ret_dp - RETURN: [1..M] prob's of occupying D + * + * Returns: void + * mp, ip, dp are malloc'ed here. Caller must free(). 
+ */ +void +StateOccupancy(struct plan7_s *hmm, float **ret_mp, float **ret_ip, float **ret_dp) +{ + float *fmp, *fip, *fdp; /* forward probabilities */ + int k; /* counter for nodes */ + + /* Initial allocations + */ + fmp = MallocOrDie (sizeof(float) * (hmm->M+1)); + fip = MallocOrDie (sizeof(float) * (hmm->M)); + fdp = MallocOrDie (sizeof(float) * (hmm->M+1)); + + /* Forward pass. + */ + fdp[1] = hmm->tbd1; + fmp[1] = hmm->begin[1]; + fip[1] = fmp[1] * hmm->t[1][TMI]; + for (k = 2; k <= hmm->M; k++) + { + /* M: from M,D,I at k-1, or B; count t_II as 1.0 */ + fmp[k] = fmp[k-1] * hmm->t[k-1][TMM] + + fip[k-1] + + fdp[k-1] * hmm->t[k-1][TDM] + + hmm->begin[k]; + /* D: from M,D at k-1 */ + fdp[k] = fmp[k-1] * hmm->t[k-1][TMD] + + fdp[k-1] * hmm->t[k-1][TDD]; + /* I: from M at k; don't count II */ + if (k < hmm->M) { + fip[k] = fmp[k] * hmm->t[k][TMI]; + } + + SQD_DASSERT2((fabs(1.0f - fmp[k] - fdp[k]) < 1e-6f)); + fmp[k] /= fmp[k]+fdp[k]; /* prevent propagating fp errors */ + fdp[k] /= fmp[k]+fdp[k]; + } + /* We don't need a backward pass; all backwards P's are 1.0 + * by definition (you can always get out of a state with P=1). + * The only situation where this might not be true is for + * a TII of 1.0, when TIM = 0 -- but in that case, if there's + * a finite chance of getting into that insert state, the model + * generates infinitely long sequences, so we can consider this + * situation "perverse" and disallow it elsewhere in building + * profile HMMs. + */ + + /* Return. 
+ */ + *ret_mp = fmp; + *ret_dp = fdp; + *ret_ip = fip; +} diff --git a/forester/archive/RIO/others/hmmer/src/emulation.c b/forester/archive/RIO/others/hmmer/src/emulation.c new file mode 100644 index 0000000..7de1833 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/emulation.c @@ -0,0 +1,242 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* emulation.c + * SRE, Wed Jan 21 07:50:01 1998 + * + * Interfaces between HMMER and other software packages. + * + * RCS $Id: emulation.c,v 1.1.1.1 2005/03/22 08:34:01 cmzmasek Exp $ + */ + +#include +#include + +#include "squid.h" +#include "config.h" +#include "structs.h" +#include "funcs.h" +#include "version.h" + + +/* Function: WriteProfile() + * Date: SRE, Wed Jan 21 07:58:09 1998 [St. Louis] + * + * Purpose: Given an HMM, write a GCG profile .prf file as + * output. Based on examination of Michael Gribskov's Fortran + * source in GCG 9.1; on reverse engineering + * by examination of GCG 9.1 output from "profilemake" + * and how the .prf file is used by "profilesearch"; + * and on the GCG 9.0 documentation. 
+ * + * See notes 28 Jan 98 for detail; in brief, the conversion goes like: + * + * PROF(i,k) = match score = msc(i,k) + TMM(k-1) + * + * GAP(k) = cost per insertion = TMI(k-1) + TIM(k-1) - TMM(k-1) - TII(k-1) + * LEN(k) = cost per inserted x = TII(k-1) + * + * QGAP(k) = cost per deletion = TDM(k-1) + TMD(unknown) - TMM(k-1) - TDD(k-1) + * QLEN(k) = cost per deleted k = TDD(k-1) + * + * Note that GCG affine gaps are GAP + n * LEN; + * HMMER affine gaps count (n-1) * gap-extend, thus an + * extra TII gets taken away from GAP (and TDD from QGAP), + * since GCG will charge it. + * + * Also note how the TMM transitions, which have no equivalent + * in a profile, get smuggled in OK. + * + * Also note that GCG charges gaps using the profile position + * /after/ the gap, not preceding the gap as HMMER does. + * + * Also note the TMD(unknown) in the QGAP calculation. HMMER + * distinguishes between gap-open and gap-close, but GCG does not, + * so there is a fundamental incompatibility here. Here + * we use an upper (best-scoring, minimum-cost) bound. + * + * And finally note that GCG's implementation forces GAP=QGAP and + * LEN=QLEN. Here, we upper bound again. Compugen's implementation + * allows an "extended profile" format which distinguishes between + * the two. + * + * The upper bound approach to these scores means that a + * score given by an emulated profile is an upper bound: the HMMER + * score (for a single Smith/Waterman style local alignment) + * cannot be better than this. This is intentional, so that + * the Compugen BIC can be used for rapid prefiltering of + * the database. 
+ * + * To get a close approximation of hmmsw scores, call + * profilesearch as + * profilesearch -noave -nonor -gap 10 -len 1 + * On the Compugen BIC, using extended profiles, you want: + * om -model=xsw.model -gapop=10 -gapext=1 -qgapop=10 -qgapext=1 -noave -nonor + * + * Args: fp - open FILE to write to (or stdout, possibly) + * hmm - the HMM to write + * do_xsw - TRUE to write Compugen's experimental extended profile format + * + * Returns: (void) + */ +void +WriteProfile(FILE *fp, struct plan7_s *hmm, int do_xsw) +{ + int k; /* position in model */ + int x; /* symbol index */ + int sc; /* a score to print */ + float nx; /* expected # of symbol x */ + int gap, len, qgap, qlen; /* penalties to charge */ + + P7Logoddsify(hmm, TRUE); + + /* GCG can't deal with long profiles. Their limit is 1000 + * positions. However, Compugen can. Therefore we warn, + * but don't die. + */ + if (hmm->M > 1000 && !do_xsw) + Warn("Profile %s will have more than 1000 positions. GCG won't read it; Compugen will.", + hmm->name); + + /* Header information. + * GCG will look for sequence type and length of model. + * Other than this, nothing is parsed until we get to the + * Cons line that has a ".." on it. + * Lines that begin with "!" are comments. + */ + if (Alphabet_type == hmmAMINO) fprintf(fp, "!!AA_PROFILE 1.0\n"); + else if (Alphabet_type == hmmNUCLEIC) fprintf(fp, "!!NA_PROFILE 1.0\n"); + else Die("No support for profiles with non-biological alphabets"); + + if (Alphabet_type == hmmAMINO) fprintf(fp, "(Peptide) "); + else if (Alphabet_type == hmmNUCLEIC) fprintf(fp, "(Nucleotide) "); + fprintf(fp, "HMMCONVERT v%s Length: %d %s|%s|%s\n", + RELEASE, hmm->M, hmm->name, + hmm->flags & PLAN7_ACC ? hmm->acc : "", + hmm->flags & PLAN7_DESC ? 
hmm->desc : ""); + + /* Insert some HMMER-specific commentary + */ + if (do_xsw) + { + fprintf(fp, " Profile converted from a profile HMM using HMMER v%s emulation.\n", RELEASE); + fprintf(fp, " Compugen XSW extended profile format.\n"); + fprintf(fp, " Use -model=xsw.model -nonor -noave -gapop=10 -gapext=1 -qgapop=10 -qgapext=1\n"); + fprintf(fp, " with om on the Compugen BIC to get the closest approximation to HMMER bit scores.\n"); + fprintf(fp, " WARNING: There is a loss of information in this conversion.\n"); + fprintf(fp, " Neither the scores nor even the rank order of hits will be precisely\n"); + fprintf(fp, " preserved in a comparison of HMMER hmmsearch to GCG profilesearch.\n"); + fprintf(fp, " The profile score is an approximation of the (single-hit) HMMER score.\n\n"); + } + else + { + fprintf(fp, " Profile converted from a profile HMM using HMMER v%s emulation.\n", RELEASE); + fprintf(fp, " Use -nonor -noave -gap=10 -len=1 with profilesearch and friends\n"); + fprintf(fp, " to get the closest approximation to HMMER bit scores.\n"); + fprintf(fp, " WARNING: There is a loss of information in this conversion.\n"); + fprintf(fp, " Neither the scores nor even the rank order of hits will be precisely\n"); + fprintf(fp, " preserved in a comparison of HMMER hmmsearch to GCG profilesearch.\n"); + fprintf(fp, " The profile score is an approximation of the (single-hit) HMMER score.\n\n"); + } + + + /* Do the CONS line, which gives the valid IUPAC symbols and their order + */ + fprintf(fp, "Cons"); + for (x = 0; x < Alphabet_iupac; x++) + fprintf(fp, " %c ", Alphabet[x]); + if (do_xsw) + fprintf(fp, " Gap Len QGap Qlen ..\n"); + else + fprintf(fp, " Gap Len ..\n"); + + /* Now, the profile; for each position in the HMM, write a line of profile. + */ + for (k = 1; k <= hmm->M; k++) + { + /* GCG adds some indexing as comments */ + if ((k-1)%10 == 0 && k > 10) + fprintf(fp, "! 
%d\n", k); + + /* find consensus residue by max prob */ + x = FMax(hmm->mat[k], Alphabet_size); + fprintf(fp, " %c ", Alphabet[x]); + /* generate emission score profile; + * Profiles are scaled by a factor of 100 + */ + for (x = 0; x < Alphabet_iupac; x++) + { + sc = hmm->msc[x][k]; + if (k < hmm->M) sc += hmm->tsc[k][TMM]; + sc = sc * 100 / INTSCALE; + fprintf(fp, "%5d ", sc); + } + /* Generate gap open, gap extend penalties; + note we will force profilesearch to weights of 10, 1, + and that GCG profile values are percentages + of these base penalties, 0..100.*/ + /* gap open (insertion)*/ + if (k > 1) + { + gap = -1 * (hmm->tsc[k-1][TMI] + hmm->tsc[k-1][TIM] - hmm->tsc[k-1][TMM] - hmm->tsc[k-1][TII]); + gap = gap * 100 / (10.0 * INTSCALE); + } + else gap = 100; /* doesn't matter because GAP_1 is never used */ + + /* gap extend (insertion)*/ + if (k > 1) + { + len = -1 * hmm->tsc[k-1][TII]; + len = len * 100 / (1.0 * INTSCALE); + } + else len = 100; /* again, doesn't matter because LEN_1 is never used */ + + /* gap open (deletion) */ + if (k > 1) + { + qgap = -1 * (hmm->tsc[k-1][TDM] + hmm->tsc[k-1][TMD] - hmm->tsc[k-1][TMM] - hmm->tsc[k-1][TDD]); + qgap = qgap * 100 / (10.0 * INTSCALE); + } + else qgap = 100; + /* gap extend (deletion) */ + if (k > 1) + { + qlen = -1 * hmm->tsc[k-1][TDD]; + qlen = qlen * 100 / (1.0 * INTSCALE); + } + else qlen = 100; + + + if (do_xsw) + fprintf(fp, "%5d %5d %5d %5d\n", gap, len, qgap, qlen); + else + fprintf(fp, "%5d %5d\n", gap, len); /* assume insertions >= deletions */ + } + + /* The final line of the profile is a count of the observed + * residues in the training sequences. This information is not + * available in an HMM, and I'm not sure that GCG ever uses it. + * Approximate it by calculating a /very/ rough expectation. 
+ */ + fprintf(fp, " * "); + for (x = 0; x < Alphabet_size; x++) + { + nx = 0.0; + for (k = 1; k <= hmm->M; k++) + nx += hmm->mat[k][x]; + nx *= hmm->nseq; + fprintf(fp, "%5d ", (int) nx); + } + for (; x < Alphabet_iupac; x++) + fprintf(fp, "%5d ", 0); + fprintf(fp, "\n"); + return; +} + diff --git a/forester/archive/RIO/others/hmmer/src/funcs.h b/forester/archive/RIO/others/hmmer/src/funcs.h new file mode 100644 index 0000000..81f31fb --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/funcs.h @@ -0,0 +1,350 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* funcs.h + * RCS $Id: funcs.h,v 1.1.1.1 2005/03/22 08:34:07 cmzmasek Exp $ + * + * Declarations of external functions in HMMER. + */ + +#ifndef FUNCSH_INCLUDED +#define FUNCSH_INCLUDED + +#include "config.h" +#include "structs.h" +#include "squid.h" +#include "msa.h" + +/* alphabet.c + * Configuration of global alphabet information + */ +extern void DetermineAlphabet(char **rseqs, int nseq); +extern void SetAlphabet(int type); +extern int SymbolIndex(char sym); +extern char *DigitizeSequence(char *seq, int L); +extern char *DedigitizeSequence(char *dsq, int L); +extern void DigitizeAlignment(MSA *msa, char ***ret_dsqs); +extern void P7CountSymbol(float *counters, char sym, float wt); +extern void DefaultGeneticCode(int *aacode); +extern void DefaultCodonBias(float *codebias); + +/* from core_algorithms.c + * Clean research/demonstration versions of basic algorithms. 
+ */ +extern struct dpmatrix_s *AllocPlan7Matrix(int rows, int M, int ***xmx, + int ***mmx, int ***imx, int ***dmx); +extern struct dpshadow_s *AllocShadowMatrix(int rows, int M, char ***xtb, + char ***mtb, char ***itb, char ***dtb); +extern void FreePlan7Matrix(struct dpmatrix_s *mx); +extern void FreeShadowMatrix(struct dpshadow_s *tb); +extern int P7ViterbiSize(int L, int M); +extern int P7SmallViterbiSize(int L, int M); +extern int P7WeeViterbiSize(int L, int M); +extern float P7Forward(char *dsq, int L, struct plan7_s *hmm, + struct dpmatrix_s **ret_mx); +extern float P7Viterbi(char *dsq, int L, struct plan7_s *hmm, + struct p7trace_s **ret_tr); +extern void P7ViterbiTrace(struct plan7_s *hmm, char *dsq, int L, + struct dpmatrix_s *mx, struct p7trace_s **ret_tr); +extern float P7SmallViterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr); +extern float P7ParsingViterbi(char *dsq, int L, struct plan7_s *hmm, + struct p7trace_s **ret_tr); +extern float P7WeeViterbi(char *dsq, int L, struct plan7_s *hmm, + struct p7trace_s **ret_tr); +extern float Plan7ESTViterbi(char *dsq, int L, struct plan7_s *hmm, + struct dpmatrix_s **ret_mx); +extern struct p7trace_s *P7ViterbiAlignAlignment(MSA *msa, struct plan7_s *hmm); +extern struct p7trace_s *ShadowTrace(struct dpshadow_s *tb, struct plan7_s *hmm, int L); +extern void PostprocessSignificantHit(struct tophit_s *ghit, struct tophit_s *dhit, struct p7trace_s *tr, struct plan7_s *hmm, char *dsq, int L, char *seqname, char *seqacc, char *seqdesc, int do_forward, float sc_override, int do_null2, struct threshold_s *thresh, int hmmpfam_mode); + + +/* from debug.c + * Debugging output of various sorts. 
+ */ +extern char *Statetype(char st); +extern char *AlphabetType2String(int type); +extern void P7PrintTrace(FILE *fp, struct p7trace_s *tr, + struct plan7_s *hmm, char *dsq); +extern void P7PrintPrior(FILE *fp, struct p7prior_s *pri); +extern int TraceCompare(struct p7trace_s *t1, struct p7trace_s *t2); +extern int TraceVerify(struct p7trace_s *tr, int M, int N); + +/* + * from display.c + * Ian Holmes' functions for displaying HMMER2 data structures, especially + * for posterior probabilities in alignments. + */ +extern void DisplayPlan7Matrix(char *dsq, int L, struct plan7_s *hmm, + struct dpmatrix_s *mx); +extern void DisplayPlan7Posteriors(int L, struct plan7_s *hmm, + struct dpmatrix_s *forward, struct dpmatrix_s *backward, + struct p7trace_s *viterbi, struct p7trace_s *optacc); +extern void DisplayPlan7PostAlign(int L, struct plan7_s *hmm, + struct dpmatrix_s *forward, struct dpmatrix_s *backward, + struct p7trace_s **alignment, int A); + + +/* from emit.c + * Generation of sequences/traces from an HMM + */ +extern void EmitSequence(struct plan7_s *hmm, char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr); +extern void EmitConsensusSequence(struct plan7_s *hmm, char **ret_seq, char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr); +extern void StateOccupancy(struct plan7_s *hmm, float **ret_mp, float **ret_ip, float **ret_dp); + + +/* from emulation.c + * Interfaces between HMMER and other software packages + */ +extern void WriteProfile(FILE *fp, struct plan7_s *hmm, int do_xsw); + + +/* from histogram.c + * accumulation of scores + */ +extern struct histogram_s *AllocHistogram(int min, int max, int lumpsize); +extern void FreeHistogram(struct histogram_s *h); +extern void UnfitHistogram(struct histogram_s *h); +extern void AddToHistogram(struct histogram_s *h, float sc); +extern void PrintASCIIHistogram(FILE *fp, struct histogram_s *h); +extern void PrintXMGRHistogram(FILE *fp, struct histogram_s *h); +extern void PrintXMGRDistribution(FILE *fp, struct 
histogram_s *h); +extern void PrintXMGRRegressionLine(FILE *fp, struct histogram_s *h); +extern void EVDBasicFit(struct histogram_s *h); +extern int ExtremeValueFitHistogram(struct histogram_s *h, int censor, + float high_hint); +extern void ExtremeValueSetHistogram(struct histogram_s *h, float mu, float lambda, + float low, float high, int ndegrees); +extern int GaussianFitHistogram(struct histogram_s *h, float high_hint); +extern void GaussianSetHistogram(struct histogram_s *h, float mean, float sd); +extern double EVDDensity(float x, float mu, float lambda); +extern double EVDDistribution(float x, float mu, float lambda); +extern double ExtremeValueP (float x, float mu, float lambda); +extern double ExtremeValueP2(float x, float mu, float lambda, int N); +extern double ExtremeValueE (float x, float mu, float lambda, int N); +extern float EVDrandom(float mu, float lambda); +extern int EVDMaxLikelyFit(float *x, int *y, int n, + float *ret_mu, float *ret_lambda); +extern int EVDCensoredFit(float *x, int *y, int n, int z, float c, + float *ret_mu, float *ret_lambda); +extern void Lawless416(float *x, int *y, int n, float lambda, + float *ret_f, float *ret_df); +extern void Lawless422(float *x, int *y, int n, int z, float c, + float lambda, float *ret_f, float *ret_df); + +/* from hmmio.c + * Input/output (saving/reading) of models + */ +extern HMMFILE *HMMFileOpen(char *hmmfile, char *env); +extern int HMMFileRead(HMMFILE *hmmfp, struct plan7_s **ret_hmm); +extern void HMMFileClose(HMMFILE *hmmfp); +extern int HMMFileFormat(HMMFILE *hmmfp); +extern void HMMFileRewind(HMMFILE *hmmfp); +extern int HMMFilePositionByName(HMMFILE *hmmfp, char *name); +extern int HMMFilePositionByIndex(HMMFILE *hmmfp, int idx); +extern void WriteAscHMM(FILE *fp, struct plan7_s *hmm); +extern void WriteBinHMM(FILE *fp, struct plan7_s *hmm); + +/* masks.c + * Repetitive sequence masking. 
+ */ +extern int XNU(char *dsq, int len); +extern float TraceScoreCorrection(struct plan7_s *hmm, struct p7trace_s *tr, char *dsq); + +/* mathsupport.c + * Much of this code deals with Dirichlet prior mathematics. + */ +extern int Prob2Score(float p, float null); +extern float Score2Prob(int sc, float null); +extern float Scorify(int sc); +extern double PValue(struct plan7_s *hmm, float sc); +extern float LogSum(float p1, float p2); +extern int ILogsum(int p1, int p2); +extern void LogNorm(float *vec, int n); +extern float Logp_cvec(float *cvec, int n, float *alpha); +extern void SampleDirichlet(float *alpha, int n, float *p); +extern float SampleGamma(float alpha); +extern void SampleCountvector(float *p, int n, int c, float *cvec); +extern float P_PvecGivenDirichlet(float *p, int n, float *alpha); + +/* from misc.c + * Miscellaneous functions with no home + */ +extern char *Getword(FILE *fp, int type); +extern char *Getline(char *s, int n, FILE *fp); +extern int SetAutocuts(struct threshold_s *thresh, struct plan7_s *hmm); + +/* from modelmakers.c + * Model construction algorithms + */ +extern void P7Handmodelmaker(MSA *msa, char **dsq, struct plan7_s **ret_hmm, + struct p7trace_s ***ret_tr); +extern void P7Fastmodelmaker(MSA *msa, char **dsq, + float maxgap, struct plan7_s **ret_hmm, + struct p7trace_s ***ret_tr); +extern void P7Maxmodelmaker(MSA *msa, char **dsq, + float maxgap, struct p7prior_s *prior, + float *null, float null_p1, float mpri, + struct plan7_s **ret_hmm, + struct p7trace_s ***ret_tr); + +/* from plan7.c + * Plan7 HMM structure support + */ +extern struct plan7_s *AllocPlan7(int M); +extern struct plan7_s *AllocPlan7Shell(void); +extern void AllocPlan7Body(struct plan7_s *hmm, int M); +extern void FreePlan7(struct plan7_s *hmm); +extern void ZeroPlan7(struct plan7_s *hmm); +extern void Plan7SetName(struct plan7_s *hmm, char *name); +extern void Plan7SetAccession(struct plan7_s *hmm, char *acc); +extern void Plan7SetDescription(struct plan7_s 
*hmm, char *desc); +extern void Plan7ComlogAppend(struct plan7_s *hmm, int argc, char **argv); +extern void Plan7SetCtime(struct plan7_s *hmm); +extern void Plan7SetNullModel(struct plan7_s *hmm, float null[MAXABET], float p1); +extern void P7Logoddsify(struct plan7_s *hmm, int viterbi_mode); +extern void Plan7Renormalize(struct plan7_s *hmm); +extern void Plan7RenormalizeExits(struct plan7_s *hmm); +extern void Plan7NakedConfig(struct plan7_s *hmm); +extern void Plan7GlobalConfig(struct plan7_s *hmm); +extern void Plan7LSConfig(struct plan7_s *hmm); +extern void Plan7SWConfig(struct plan7_s *hmm, float pentry, float pexit); +extern void Plan7FSConfig(struct plan7_s *hmm, float pentry, float pexit); +extern void PrintPlan7Stats(FILE *fp, struct plan7_s *hmm, char **dsq, + int nseq, struct p7trace_s **tr); +extern int DegenerateSymbolScore(float *p, float *null, int ambig); +extern void Plan9toPlan7(struct plan9_s *hmm, struct plan7_s **ret_plan7); + +/* + * from plan9.c + * Backwards compatibility for the Plan 9 data structures of HMMER 1.x + */ +extern struct plan9_s *P9AllocHMM(int M); +extern void P9ZeroHMM(struct plan9_s *hmm); +extern int P9FreeHMM(struct plan9_s *hmm); +extern void P9Renormalize(struct plan9_s *hmm); +extern void P9DefaultNullModel(float *null); + +/* + * from postprob.c + * Functions for working with posterior probabilities within alignments + */ +extern float P7OptimalAccuracy(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr); +extern float P7Backward(char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s **ret_mx); +extern void P7EmitterPosterior(int L, struct plan7_s *hmm, struct dpmatrix_s *forward, + struct dpmatrix_s *backward, struct dpmatrix_s *mx); +extern float P7FillOptimalAccuracy(int L, int M, struct dpmatrix_s *posterior, + struct dpmatrix_s *mx, struct p7trace_s **ret_tr); +extern void P7OptimalAccuracyTrace(int L, int M, struct dpmatrix_s *posterior, + struct dpmatrix_s *mx, struct p7trace_s **ret_tr); 
+extern char *PostalCode(int L, struct dpmatrix_s *mx, struct p7trace_s *tr); + +/* from prior.c + * Dirichlet priors + */ +extern struct p7prior_s *P7AllocPrior(void); +extern struct p7prior_s *P7LaplacePrior(void); +extern struct p7prior_s *P7DefaultPrior(void); +extern struct p7prior_s *P7ReadPrior(char *prifile); +extern void P7FreePrior(struct p7prior_s *pri); +extern void PAMPrior(char *pamfile, struct p7prior_s *pri, float pamwgt); +extern void P7DefaultNullModel(float *null, float *ret_p1); +extern void P7ReadNullModel(char *rndfile, float *null, float *ret_p1); +extern void P7PriorifyHMM(struct plan7_s *hmm, struct p7prior_s *pri); +extern void P7PriorifyTransitionVector(float *t, struct p7prior_s *prior, + float tq[MAXDCHLET]); +extern void P7PriorifyEmissionVector(float *vec, struct p7prior_s *pri, + int num, float eq[MAXDCHLET], + float e[MAXDCHLET][MAXABET], + float *ret_mix); + + +#ifdef HMMER_PVM +/* from pvm.c + * PVM Parallel Virtual Machine implementation + */ +extern void PVMSpawnSlaves(char *slave, int **ret_tid, int *ret_nslaves); +extern void PVMConfirmSlaves(int *slave_tid, int nslaves); +extern void PVMCheckSlaves(int *slave_tid, int nslaves); +extern void PVMKillSlaves(int *slave_tid, int nslaves); +extern int PVMPackString(char *s); +extern char * PVMUnpackString(void); +extern int PVMPackTrace(struct p7trace_s *tr); +extern struct p7trace_s *PVMUnpackTrace(void); +extern int PVMPackHMM(struct plan7_s *hmm); +extern struct plan7_s * PVMUnpackHMM(void); +#endif /*HMMER_PVM*/ + +#ifdef HMMER_THREADS +/* from threads.c + * POSIX threads implementation + */ +extern int ThreadNumber(void); +#endif /*HMMER_THREADS*/ + + +/* from tophits.c + * Support for keeping/sorting top scoring hit/alignment lists + */ +extern struct tophit_s *AllocTophits(int lumpsize); +extern void GrowTophits(struct tophit_s *h); +extern void FreeTophits(struct tophit_s *h); +extern struct fancyali_s *AllocFancyAli(void); +extern void FreeFancyAli(struct fancyali_s *ali); 
+extern void RegisterHit(struct tophit_s *h, double sortkey, + double pvalue, float score, + double motherp, float mothersc, + char *name, char *acc, char *desc, + int sqfrom, int sqto, int sqlen, + int hmmfrom, int hmmto, int hmmlen, + int domidx, int ndom, + struct fancyali_s *ali); +extern void GetRankedHit(struct tophit_s *h, int rank, + double *r_pvalue, float *r_score, + double *r_motherp, float *r_mothersc, + char **r_name, char **r_acc, char **r_desc, + int *r_sqfrom, int *r_sqto, int *r_sqlen, + int *r_hmmfrom, int *r_hmmto, int *r_hmmlen, + int *r_domidx, int *r_ndom, + struct fancyali_s **r_ali); +extern int TophitsMaxName(struct tophit_s *h); +extern void FullSortTophits(struct tophit_s *h); +extern void TophitsReport(struct tophit_s *h, double E, int nseq); + +/* from trace.c + * Support for traceback (state path) structure + */ +extern void P7AllocTrace(int tlen, struct p7trace_s **ret_tr); +extern void P7ReallocTrace(struct p7trace_s *tr, int tlen); +extern void P7FreeTrace(struct p7trace_s *tr); +extern void TraceSet(struct p7trace_s *tr, int tpos, char type, int idx, int pos); +extern struct p7trace_s **MergeTraceArrays(struct p7trace_s **t1, int n1, struct p7trace_s **t2, int n2); +extern void P7ReverseTrace(struct p7trace_s *tr); +extern void P7TraceCount(struct plan7_s *hmm, char *dsq, float wt, + struct p7trace_s *tr); +extern float P7TraceScore(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr); +extern MSA *P7Traces2Alignment(char **dsq, SQINFO *sqinfo, float *wgt, + int nseq, int M, + struct p7trace_s **tr, int matchonly); +extern int TransitionScoreLookup(struct plan7_s *hmm, char st1, + int k1, char st2, int k2); +extern struct fancyali_s *CreateFancyAli(struct p7trace_s *tr, struct plan7_s *hmm, + char *dsq, char *name); +extern void PrintFancyAli(FILE *fp, struct fancyali_s *ali); +extern void TraceDecompose(struct p7trace_s *otr, struct p7trace_s ***ret_tr, + int *ret_ntr); +extern int TraceDomainNumber(struct p7trace_s *tr); +extern 
void TraceSimpleBounds(struct p7trace_s *tr, int *ret_i1, int *ret_i2, + int *ret_k1, int *ret_k2); +extern struct p7trace_s *MasterTraceFromMap(int *map, int M, int alen); +extern void ImposeMasterTrace(char **aseq, int nseq, struct p7trace_s *mtr, + struct p7trace_s ***ret_tr); + + +#endif /*FUNCSH_INCLUDED*/ diff --git a/forester/archive/RIO/others/hmmer/src/globals.h b/forester/archive/RIO/others/hmmer/src/globals.h new file mode 100644 index 0000000..5a5ae23 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/globals.h @@ -0,0 +1,24 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* globals.h + * Mon Nov 18 13:05:03 1996 + * + * Global variable definitions. + * This file may only be included in a main() .c file. + */ + +char Alphabet[MAXCODE]; /* ACGT, for instance */ +int Alphabet_type; /* hmmNUCLEIC or hmmAMINO */ +int Alphabet_size; /* uniq alphabet size: 4 or 20 */ +int Alphabet_iupac; /* total size of alphabet + IUPAC degen. */ +char Degenerate[MAXCODE][MAXABET]; +int DegenCount[MAXCODE]; + diff --git a/forester/archive/RIO/others/hmmer/src/histogram.c b/forester/archive/RIO/others/hmmer/src/histogram.c new file mode 100644 index 0000000..88841fc --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/histogram.c @@ -0,0 +1,1369 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. 
See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* histogram.c + * SRE, Sat Jan 20 16:16:17 1996 + * + * Accumulation, printing, and fitting of score histograms + * from database searches. + * + * RCS $Id: histogram.c,v 1.1.1.1 2005/03/22 08:34:00 cmzmasek Exp $ + ************************************************************ + * Basic API: + * + * struct histogram_s *h; + * + * h = AllocHistogram(min_hint, max_hint, lumpsize); + * + * while (getting scores x) AddToHistogram(h, x); + * + * ExtremeValueFitHistogram(h, high_hint); + * PrintASCIIHistogram(fp, h); + * FreeHistogram(h); + */ + +#include +#include +#include +#include +#include + +#include "squid.h" +#include "config.h" +#include "structs.h" +#include "funcs.h" + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + +/* Function: AllocHistogram() + * + * Purpose: Allocate and return a histogram structure. + * min and max are your best guess. They need + * not be absolutely correct; the histogram + * will expand dynamically to accomodate scores + * that exceed these suggested bounds. The amount + * that the histogram grows by is set by "lumpsize". + * + * Args: min: minimum score (integer) + * max: maximum score (integer) + * lumpsize: when reallocating histogram, pad the reallocation + * by this much (saves excessive reallocation) + */ +struct histogram_s * +AllocHistogram(int min, int max, int lumpsize) +{ + struct histogram_s *h; + int newsize; + int i; + + newsize = max - min + 1; + + h = (struct histogram_s *) MallocOrDie(sizeof(struct histogram_s)); + h->min = min; + h->max = max; + h->total = 0; + h->lowscore = INT_MAX; + h->highscore = INT_MIN; + h->lumpsize = lumpsize; + h->histogram = (int *) MallocOrDie (sizeof(int) * newsize); + for (i = 0; i < newsize; i++) h->histogram[i] = 0; + + h->expect = NULL; + h->fit_type = HISTFIT_NONE; + + return h; +} + + +/* Function: FreeHistogram() + * + * Purpose: free a histogram structure. 
+ */ +void +FreeHistogram(struct histogram_s *h) +{ + free(h->histogram); + if (h->expect != NULL) free(h->expect); + free(h); +} + +/* Function: UnfitHistogram() + * + * Purpose: Free only the theoretical fit part of a histogram. + */ +void +UnfitHistogram(struct histogram_s *h) +{ + if (h->expect != NULL) free(h->expect); + h->expect = NULL; + h->fit_type = HISTFIT_NONE; +} + + +/* Function: AddToHistogram() + * + * Purpose: Bump the appropriate counter in a histogram + * structure, given a score. The score is + * rounded off from float precision to the + * next lower integer. + */ +void +AddToHistogram(struct histogram_s *h, float sc) +{ + int score; + int moveby; + int prevsize; + int newsize; + int i; + + /* Adding to a histogram conflicts with existing fit: + * prohibit this. + */ + if (h->fit_type != HISTFIT_NONE) + Die("AddToHistogram(): Can't add to a fitted histogram\n"); + + + /* histogram bins are defined as: score >= bin value, < bin+1 + * -1.9 -> -2 -0.4 -> -1 1.9 -> 1 + * -2.1 -> -3 0.4 -> 0 2.1 -> 2 + */ + score = (int) floor(sc); + + /* Check to see if we must reallocate the histogram. + */ + if (score < h->min) + { + prevsize = h->max - h->min + 1; + moveby = (h->min - score) + h->lumpsize; + newsize = prevsize + moveby; + h->min -= moveby; + + h->histogram = (int *) ReallocOrDie(h->histogram, sizeof(int) * newsize); + memmove(h->histogram+moveby, h->histogram, sizeof(int) * prevsize); + for (i = 0; i < moveby; i++) + h->histogram[i] = 0; + } + else if (score > h->max) + { + prevsize = h->max - h->min + 1; + h->max = h->lumpsize + score; + newsize = h->max - h->min + 1; + + h->histogram = (int *) ReallocOrDie(h->histogram, sizeof(int) * newsize); + for (i = prevsize; i < newsize; i++) + h->histogram[i] = 0; + } + + /* Bump the correct bin. 
+ * The bin number is score - h->min + */ + h->histogram[score - h->min]++; + h->total++; + if (score < h->lowscore) h->lowscore = score; + if (score > h->highscore) h->highscore = score; + + SQD_DPRINTF3(("AddToHistogram(): added %.1f; rounded to %d; in bin %d (%d-%d)\n", + sc, score, score-h->min, h->min, h->max)); + return; +} + + + +/* Function: PrintASCIIHistogram() + * + * Purpose: Print a "prettified" histogram to a file pointer. + * Deliberately a look-and-feel clone of Bill Pearson's + * excellent FASTA output. + * + * Args: fp - open file to print to (stdout works) + * h - histogram to print + */ +void +PrintASCIIHistogram(FILE *fp, struct histogram_s *h) +{ + int units; + int maxbar; + int num; + int i, idx; + char buffer[81]; /* output line buffer */ + int pos; /* position in output line buffer */ + int lowbound, lowcount; /* cutoffs on the low side */ + int highbound, highcount; /* cutoffs on the high side */ + int emptybins = 3; + + /* Find out how we'll scale the histogram. + * We have 59 characters to play with on a + * standard 80-column terminal display: + * leading "%5d %6d %6d|" occupies 20 chars. + * Save the peak position, we'll use it later. + */ + maxbar = 0; + for (i = h->lowscore - h->min; i <= h->highscore - h->min; i++) + if (h->histogram[i] > maxbar) + { + maxbar = h->histogram[i]; /* max height */ + lowbound = i + h->min; /* peak position */ + } + + /* Truncate histogram display on both sides, ad hoc fashion. + * Start from the peak; then move out until we see empty bins, + * and stop. 
+ */ + highbound = lowbound; /* start at peak position */ + for (num = 0; lowbound > h->lowscore; lowbound--) + { + i = lowbound - h->min; + if (h->histogram[i] > 0) { num = 0; continue; } /* reset */ + if (++num == emptybins) { lowbound += emptybins; break; } /* stop */ + } + for (num = 0; highbound < h->highscore; highbound++) + { + i = highbound - h->min; + if (h->histogram[i] > 0) { num = 0; continue; } /* reset */ + if (++num == emptybins) { highbound -= emptybins; break; } /* stop */ + } + /* collect counts outside of bounds */ + for (lowcount = 0, i = h->lowscore - h->min; i <= lowbound - h->min; i++) + lowcount += h->histogram[i]; + for (highcount = 0, i = h->highscore - h->min; i >= highbound - h->min; i--) + highcount += h->histogram[i]; + + /* maxbar might need raised now; then set our units */ + if (lowcount > maxbar) maxbar = lowcount; + if (highcount > maxbar) maxbar = highcount; + units = ((maxbar-1)/ 59) + 1; + + + /* Print the histogram + */ + fprintf(fp, "%5s %6s %6s (one = represents %d sequences)\n", + "score", "obs", "exp", units); + fprintf(fp, "%5s %6s %6s\n", "-----", "---", "---"); + buffer[80] = '\0'; + buffer[79] = '\n'; + for (i = h->lowscore; i <= h->highscore; i++) + { + memset(buffer, ' ', 79 * sizeof(char)); + idx = i - h->min; + + /* Deal with special cases at edges + */ + if (i < lowbound) continue; + else if (i > highbound) continue; + else if (i == lowbound && i != h->lowscore) + { + sprintf(buffer, "<%4d %6d %6s|", i+1, lowcount, "-"); + if (lowcount > 0) { + num = 1+(lowcount-1) / units; + if (num > 60) Die("oops"); + for (pos = 20; num > 0; num--) buffer[pos++] = '='; + } + fputs(buffer, fp); + continue; + } + else if (i == highbound && i != h->highscore) + { + sprintf(buffer, ">%4d %6d %6s|", i, highcount, "-"); + if (highcount > 0) { + num = 1+(highcount-1) / units; + for (pos = 20; num > 0; num--) buffer[pos++] = '='; + } + fputs(buffer, fp); + continue; + } + + /* Deal with most cases + */ + if (h->fit_type != 
HISTFIT_NONE) + sprintf(buffer, "%5d %6d %6d|", + i, h->histogram[idx], (int) h->expect[idx]); + else + sprintf(buffer, "%5d %6d %6s|", i, h->histogram[idx], "-"); + buffer[20] = ' '; /* sprintf writes a null char */ + + /* Mark the histogram bar for observed hits + */ + if (h->histogram[idx] > 0) { + num = 1 + (h->histogram[idx]-1) / units; + for (pos = 20; num > 0; num--) buffer[pos++] = '='; + } + + /* Mark the theoretically expected value + */ + if (h->fit_type != HISTFIT_NONE && (int) h->expect[idx] > 0) + { + pos = 20 + (int)(h->expect[idx]-1) / units; + if (pos >= 78) pos = 78; /* be careful of buffer bounds */ + buffer[pos] = '*'; + } + + /* Print the line + */ + fputs(buffer, fp); + } + + /* Print details about the statistics + */ + switch (h->fit_type) { + case HISTFIT_NONE: + fprintf(fp, "\n\n%% No statistical fit available\n"); + break; + + case HISTFIT_EVD: + fprintf(fp, "\n\n%% Statistical details of theoretical EVD fit:\n"); + fprintf(fp, " mu = %10.4f\n", h->param[EVD_MU]); + fprintf(fp, " lambda = %10.4f\n", h->param[EVD_LAMBDA]); + fprintf(fp, "chi-sq statistic = %10.4f\n", h->chisq); + fprintf(fp, " P(chi-square) = %10.4g\n", h->chip); + break; + + case HISTFIT_GAUSSIAN: + fprintf(fp, "\n\n%% Statistical details of theoretical Gaussian fit:\n"); + fprintf(fp, " mean = %10.4f\n", h->param[GAUSS_MEAN]); + fprintf(fp, " sd = %10.4f\n", h->param[GAUSS_SD]); + fprintf(fp, "chi-sq statistic = %10.4f\n", h->chisq); + fprintf(fp, " P(chi-square) = %10.4g\n", h->chip); + break; + } + return; +} + + + +/* Function: PrintXMGRHistogram() + * Date: SRE, Wed Nov 12 11:02:00 1997 [St. 
Louis] + * + * Purpose: Print an XMGR data file that contains two data sets: + * - xy data for the observed histogram + * - xy data for the theoretical histogram + */ +void +PrintXMGRHistogram(FILE *fp, struct histogram_s *h) +{ + int sc; /* integer score in histogram structure */ + double val; + + /* First data set is the observed histogram + */ + for (sc = h->lowscore; sc <= h->highscore; sc++) + if (h->histogram[sc - h->min] > 0) + fprintf(fp, "%-6d %f\n", sc, + (float) h->histogram[sc - h->min]/ (float) h->total); + fprintf(fp, "&\n"); + + /* Second data set is the theoretical histogram + */ + if (h->fit_type != HISTFIT_NONE) + { + for (sc = h->lowscore; sc <= h->highscore; sc++) + { + val = + (1. - ExtremeValueP((float)sc+1, h->param[EVD_MU], h->param[EVD_LAMBDA]))- + (1. - ExtremeValueP((float)sc, h->param[EVD_MU], h->param[EVD_LAMBDA])); + fprintf(fp, "%-6d %f\n", sc, val); + } + fprintf(fp, "&\n"); + } +} + +/* Function: PrintXMGRDistribution() + * Date: SRE, Wed Nov 12 11:02:09 1997 [St. Louis] + * + * Purpose: Print an XMGR data file that contains two data sets: + * - xy data for the observed distribution P(Slowscore; sc <= h->highscore; sc++) + { + cum += h->histogram[sc - h->min]; + fprintf(fp, "%-6d %f\n", sc + 1, (float) cum / (float) h->total); + } + fprintf(fp, "&\n"); + + /* Second data set is the theoretical histogram + */ + if (h->fit_type != HISTFIT_NONE) + { + for (sc = h->lowscore; sc <= h->highscore; sc++) + { + val = (1. - ExtremeValueP((float) sc, h->param[EVD_MU], + h->param[EVD_LAMBDA])); + fprintf(fp, "%-6d %f\n", sc, val); + } + fprintf(fp, "&\n"); + } +} + +/* Function: PrintXMGRRegressionLine() + * Date: SRE, Wed Nov 12 11:02:19 1997 [St. Louis] + * + * Purpose: Print an XMGR data file that contains two data sets: + * - xy data for log log transform of observed distribution P(Slowscore; sc <= h->highscore; sc++) + { + cum += h->histogram[sc - h->min]; + val = log (-1. 
* log((double) cum / (double) h->total)); + if (cum < h->total) + fprintf(fp, "%-6d %f\n", sc + 1, val); + } + fprintf(fp, "&\n"); + + /* Second data set is the theoretical histogram + */ + if (h->fit_type != HISTFIT_NONE) + { + for (sc = h->lowscore; sc <= h->highscore; sc++) + { + val = log(-1. * log(1. - ExtremeValueP((float) sc, h->param[EVD_MU], + h->param[EVD_LAMBDA]))); + fprintf(fp, "%-6d %f\n", sc, val); + } + fprintf(fp, "&\n"); + } +} + +/* Function: EVDBasicFit() + * Date: SRE, Wed Nov 12 11:02:27 1997 [St. Louis] + * + * Purpose: Fit a score histogram to the extreme value + * distribution. Set the parameters lambda + * and mu in the histogram structure. Fill in the + * expected values in the histogram. Calculate + * a chi-square test as a measure of goodness of fit. + * + * This is the basic version of ExtremeValueFitHistogram(), + * in a nonrobust form: simple linear regression with no + * outlier pruning. + * + * Methods: Uses a linear regression fitting method [Collins88,Lawless82] + * + * Args: h - histogram to fit + * + * Return: (void) + */ +void +EVDBasicFit(struct histogram_s *h) +{ + float *d; /* distribution P(S < x) */ + float *x; /* x-axis of P(Smin */ + float slope, intercept; /* m,b fit from Linefit() */ + float corr; /* correlation coeff of line fit, not used */ + float lambda, mu; /* slope, intercept converted to EVD params */ + + /* Allocations for x, y axes + * distribution d runs from min..max with indices 0..max-min + * i.e. score - min = index into d, x, histogram, and expect + */ + hsize = h->highscore - h->lowscore + 1; + d = (float *) MallocOrDie(sizeof(float) * hsize); + x = (float *) MallocOrDie(sizeof(float) * hsize); + for (idx = 0; idx < hsize; idx++) + d[idx] = x[idx] = 0.; + + /* Calculate P(S < x) distribution from histogram. + * note off-by-one of sc, because histogram bin contains scores between + * x and x+1. 
+ */ + sum = 0; + for (sc = h->lowscore; sc <= h->highscore; sc++) + { + sum += h->histogram[sc - h->min]; + d[sc - h->lowscore] = (float) sum / (float) h->total; + x[sc - h->lowscore] = (float) (sc + 1); + } + + /* Do a linear regression fit to the log[-log(P(Sx))] = -lambda * x + lambda * mu + * so lambda = -m and mu = b/lambda + */ + /* convert y axis to log[-log(P(Slowscore; sc < h->highscore; sc++) + d[sc - h->lowscore] = log(-1. * log(d[sc - h->lowscore])); + + /* do the linear regression */ + Linefit(x, d, hsize-1, &intercept, &slope, &corr); + /* calc mu, lambda */ + lambda = -1. * slope; + mu = intercept / lambda; + + /* Set the EVD parameters in the histogram; + * pass 2 for additional lost degrees of freedom because we fit mu, lambda. + */ + ExtremeValueSetHistogram(h, mu, lambda, h->lowscore, h->highscore, 2); + + free(x); + free(d); + return; +} + + +/* Function: ExtremeValueFitHistogram() + * Date: SRE, Sat Nov 15 17:16:15 1997 [St. Louis] + * + * Purpose: Fit a score histogram to the extreme value + * distribution. Set the parameters lambda + * and mu in the histogram structure. Calculate + * a chi-square test as a measure of goodness of fit. + * + * Methods: Uses a maximum likelihood method [Lawless82]. + * Lower outliers are removed by censoring the data below the peak. + * Upper outliers are removed iteratively using method + * described by [Mott92]. + * + * Args: h - histogram to fit + * censor - TRUE to censor data left of the peak + * high_hint - score cutoff; above this are `real' hits that aren't fit + * + * Return: 1 if fit is judged to be valid. + * else 0 if fit is invalid (too few seqs.) 
+ */
+int
+ExtremeValueFitHistogram(struct histogram_s *h, int censor, float high_hint)
+{
+  float *x = NULL;              /* array of EVD samples to fit */
+  int   *y = NULL;              /* histogram counts            */
+  int    n;                     /* number of observed samples  */
+  int    z = 0;                 /* number of censored samples; stays 0 when
+                                 * not censoring, so the n+z population
+                                 * estimate below is simply n */
+  int    hsize;                 /* size of histogram           */
+  float  lambda, mu;            /* new estimates of lambda, mu */
+  int    sc;                    /* loop index for score        */
+  int    lowbound;              /* lower bound of fitted region*/
+  int    highbound;             /* upper bound of fitted region*/
+  int    new_highbound;
+  int    iteration;
+
+  /* Determine lower bound on fitted region;
+   * if we're censoring the data, choose the peak of the histogram.
+   * if we're not, then we take the whole histogram.
+   */
+  lowbound = h->lowscore;
+  if (censor)
+    {
+      int max = -1;
+      for (sc = h->lowscore; sc <= h->highscore; sc++)
+        if (h->histogram[sc - h->min] > max)
+          {
+            max      = h->histogram[sc - h->min];
+            lowbound = sc;
+          }
+    }
+
+  /* Determine initial upper bound on fitted region.
+   */
+  highbound = MIN(high_hint, h->highscore);
+
+  /* Now, iteratively converge on our lambda, mu:
+   */
+  for (iteration = 0; iteration < 100; iteration++)
+    {
+      /* Construct x, y vectors.
+       * Reset the pointers first so the FITFAILED cleanup never sees
+       * buffers that the previous iteration already freed.
+       */
+      x = NULL;
+      y = NULL;
+      hsize = highbound - lowbound + 1;
+      if (hsize < 5) goto FITFAILED; /* require at least 5 bins or we don't fit */
+
+      x = MallocOrDie(sizeof(float) * hsize);
+      y = MallocOrDie(sizeof(int)   * hsize);
+      n = 0;
+      for (sc = lowbound; sc <= highbound; sc++)
+        {
+          x[sc-lowbound] = (float) sc + 0.5; /* crude, but tests OK */
+          y[sc-lowbound] = h->histogram[sc - h->min];
+          n             += h->histogram[sc - h->min];
+        }
+
+      if (n < 100) goto FITFAILED;  /* require fitting to at least 100 points */
+
+      /* If we're censoring, estimate z, the number of censored guys
+       * left of the bound. Our initial estimate is crudely that we're
+       * missing e^-1 of the total distribution (which would be exact
+       * if we censored exactly at mu; but we censored at the observed peak).
+       * Subsequent estimates are more exact based on our current estimate of mu.
+       */
+      if (censor)
+        {
+          if (iteration == 0)
+            z = MIN(h->total-n, (int) (0.58198 * (float) n));
+          else
+            {
+              double psx;
+              psx = EVDDistribution((float) lowbound, mu, lambda);
+              z = MIN(h->total-n, (int) ((double) n * psx / (1. - psx)));
+            }
+        }
+
+      /* Do an ML fit
+       */
+      if (censor) {
+        if (! EVDCensoredFit(x, y, hsize, z, (float) lowbound, &mu, &lambda))
+          goto FITFAILED;
+      } else
+        if (! EVDMaxLikelyFit(x, y, hsize, &mu, &lambda))
+          goto FITFAILED;
+
+      /* Find the Eval = 1 point as a new highbound;
+       * the total number of samples estimated to "belong" to the EVD is n+z
+       */
+      new_highbound = (int)
+        (mu - (log (-1. * log((double) (n+z-1) / (double)(n+z))) / lambda));
+
+      free(x);
+      free(y);
+      if (new_highbound >= highbound) break;
+      highbound = new_highbound;
+    }
+
+  /* Set the histogram parameters;
+   * - we fit from lowbound to highbound; thus we lose 2 degrees of freedom
+   *   for fitting mu, lambda, but we get 1 back because we're unnormalized
+   *   in this interval, hence we pass 2-1 = 1 as ndegrees.
+   */
+  ExtremeValueSetHistogram(h, mu, lambda, lowbound, highbound, 1);
+  return 1;
+
+FITFAILED:
+  UnfitHistogram(h);
+  if (x != NULL) free(x);
+  if (y != NULL) free(y);
+  return 0;
+}
+
+
+/* Function: ExtremeValueSetHistogram()
+ *
+ * Purpose:  Instead of fitting the histogram to an EVD,
+ *           simply set the EVD parameters from an external source.
+ * + * Args: h - the histogram to set + * mu - mu location parameter + * lambda - lambda scale parameter + * lowbound - low bound of the histogram that was fit + * highbound- high bound of histogram that was fit + * ndegrees - extra degrees of freedom to subtract in X^2 test: + * typically 0 if mu, lambda are parametric, + * else 2 if mu, lambda are estimated from data + */ +void +ExtremeValueSetHistogram(struct histogram_s *h, float mu, float lambda, + float lowbound, float highbound, int ndegrees) +{ + int sc; + int hsize, idx; + int nbins; + float delta; + + UnfitHistogram(h); + h->fit_type = HISTFIT_EVD; + h->param[EVD_LAMBDA] = lambda; + h->param[EVD_MU] = mu; + + hsize = h->max - h->min + 1; + h->expect = (float *) MallocOrDie(sizeof(float) * hsize); + for (idx = 0; idx < hsize; idx++) + h->expect[idx] = 0.; + + /* Calculate the expected values for the histogram. + */ + for (sc = h->min; sc <= h->max; sc++) + h->expect[sc - h->min] = + ExtremeValueE((float)(sc), h->param[EVD_MU], h->param[EVD_LAMBDA], + h->total) - + ExtremeValueE((float)(sc+1), h->param[EVD_MU], h->param[EVD_LAMBDA], + h->total); + + /* Calculate the goodness-of-fit (within whole region) + */ + h->chisq = 0.; + nbins = 0; + for (sc = lowbound; sc <= highbound; sc++) + if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5) + { + delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min]; + h->chisq += delta * delta / h->expect[sc-h->min]; + nbins++; + } + + /* Since we fit the whole histogram, there is at least + * one constraint on chi-square: the normalization to h->total. + */ + if (nbins > 1 + ndegrees) + h->chip = (float) IncompleteGamma((double)(nbins-1-ndegrees)/2., + (double) h->chisq/2.); + else + h->chip = 0.; +} + + + +/* Function: GaussianFitHistogram() + * + * Purpose: Fit a score histogram to a Gaussian distribution. + * Set the parameters mean and sd in the histogram + * structure, as well as a chi-squared test for + * goodness of fit. 
+ * + * Args: h - histogram to fit + * high_hint - score cutoff; above this are `real' hits that aren't fit + * + * Return: 1 if fit is judged to be valid. + * else 0 if fit is invalid (too few seqs.) + */ +int +GaussianFitHistogram(struct histogram_s *h, float high_hint) +{ + float sum; + float sqsum; + float delta; + int sc; + int nbins; + int hsize, idx; + + /* Clear any previous fitting from the histogram. + */ + UnfitHistogram(h); + + /* Determine if we have enough hits to fit the histogram; + * arbitrarily require 1000. + */ + if (h->total < 1000) { h->fit_type = HISTFIT_NONE; return 0; } + + /* Simplest algorithm for mean and sd; + * no outlier detection yet (not even using high_hint) + * + * Magic 0.5 correction is because our histogram is for + * scores between x and x+1; we estimate the expectation + * (roughly) as x + 0.5. + */ + sum = sqsum = 0.; + for (sc = h->lowscore; sc <= h->highscore; sc++) + { + delta = (float) sc + 0.5; + sum += (float) h->histogram[sc-h->min] * delta; + sqsum += (float) h->histogram[sc-h->min] * delta * delta; + } + h->fit_type = HISTFIT_GAUSSIAN; + h->param[GAUSS_MEAN] = sum / (float) h->total; + h->param[GAUSS_SD] = sqrt((sqsum - (sum*sum/(float)h->total)) / + (float)(h->total-1)); + + /* Calculate the expected values for the histogram. + * Note that the magic 0.5 correction appears again. + * Calculating difference between distribution functions for Gaussian + * would be correct but hard. + */ + hsize = h->max - h->min + 1; + h->expect = (float *) MallocOrDie(sizeof(float) * hsize); + for (idx = 0; idx < hsize; idx++) + h->expect[idx] = 0.; + + for (sc = h->min; sc <= h->max; sc++) + { + delta = (float) sc + 0.5 - h->param[GAUSS_MEAN]; + h->expect[sc - h->min] = + (float) h->total * ((1. / (h->param[GAUSS_SD] * sqrt(2.*3.14159))) * + (exp(-1.* delta*delta / (2. 
* h->param[GAUSS_SD] * h->param[GAUSS_SD])))); + } + + /* Calculate the goodness-of-fit (within region that was fitted) + */ + h->chisq = 0.; + nbins = 0; + for (sc = h->lowscore; sc <= h->highscore; sc++) + if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5) + { + delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min]; + h->chisq += delta * delta / h->expect[sc-h->min]; + nbins++; + } + /* -1 d.f. for normalization; -2 d.f. for two free parameters */ + if (nbins > 3) + h->chip = (float) IncompleteGamma((double)(nbins-3)/2., + (double) h->chisq/2.); + else + h->chip = 0.; + + return 1; +} + + +/* Function: GaussianSetHistogram() + * + * Purpose: Instead of fitting the histogram to a Gaussian, + * simply set the Gaussian parameters from an external source. + */ +void +GaussianSetHistogram(struct histogram_s *h, float mean, float sd) +{ + int sc; + int hsize, idx; + int nbins; + float delta; + + UnfitHistogram(h); + h->fit_type = HISTFIT_GAUSSIAN; + h->param[GAUSS_MEAN] = mean; + h->param[GAUSS_SD] = sd; + + /* Calculate the expected values for the histogram. + */ + hsize = h->max - h->min + 1; + h->expect = (float *) MallocOrDie(sizeof(float) * hsize); + for (idx = 0; idx < hsize; idx++) + h->expect[idx] = 0.; + + /* Note: ideally we'd use the Gaussian distribution function + * to find the histogram occupancy in the window sc..sc+1. + * However, the distribution function is hard to calculate. + * Instead, estimate the histogram by taking the density at sc+0.5. + */ + for (sc = h->min; sc <= h->max; sc++) + { + delta = ((float)sc + 0.5) - h->param[GAUSS_MEAN]; + h->expect[sc - h->min] = + (float) h->total * ((1. / (h->param[GAUSS_SD] * sqrt(2.*3.14159))) * + (exp(-1.*delta*delta / (2. * h->param[GAUSS_SD] * h->param[GAUSS_SD])))); + } + + /* Calculate the goodness-of-fit (within whole region) + */ + h->chisq = 0.; + nbins = 0; + for (sc = h->lowscore; sc <= h->highscore; sc++) + if (h->expect[sc-h->min] >= 5. 
&& h->histogram[sc-h->min] >= 5) + { + delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min]; + h->chisq += delta * delta / h->expect[sc-h->min]; + nbins++; + } + /* -1 d.f. for normalization */ + if (nbins > 1) + h->chip = (float) IncompleteGamma((double)(nbins-1)/2., + (double) h->chisq/2.); + else + h->chip = 0.; +} + + + +/* Function: EVDDensity() + * Date: SRE, Sat Nov 15 19:37:52 1997 [St. Louis] + * + * Purpose: Return the extreme value density P(S=x) at + * a given point x, for an EVD controlled by + * parameters mu and lambda. + */ +double +EVDDensity(float x, float mu, float lambda) +{ + return (lambda * exp(-1. * lambda * (x - mu) + - exp(-1. * lambda * (x - mu)))); +} + +/* Function: EVDDistribution() + * Date: SRE, Tue Nov 18 08:02:22 1997 [St. Louis] + * + * Purpose: Returns the extreme value distribution P(S < x) + * evaluated at x, for an EVD controlled by parameters + * mu and lambda. + */ +double +EVDDistribution(float x, float mu, float lambda) +{ + return (exp(-1. * exp(-1. * lambda * (x - mu)))); +} + +/* Function: ExtremeValueP() + * + * Purpose: Calculate P(S>x) according to an extreme + * value distribution, given x and the parameters + * of the distribution (characteristic + * value mu, decay constant lambda). + * + * This function is exquisitely prone to + * floating point exceptions if it isn't coded + * carefully. + * + * Args: x = score + * mu = characteristic value of extreme value distribution + * lambda = decay constant of extreme value distribution + * + * Return: P(S>x) + */ +double +ExtremeValueP(float x, float mu, float lambda) +{ + double y; + /* avoid exceptions near P=1.0 */ + /* typical 32-bit sys: if () < -3.6, return 1.0 */ + if ((lambda * (x - mu)) <= -1. * log(-1. * log(DBL_EPSILON))) return 1.0; + /* avoid underflow fp exceptions near P=0.0*/ + if ((lambda * (x - mu)) >= 2.3 * (double) DBL_MAX_10_EXP) return 0.0; + /* a roundoff issue arises; use 1 - e^-x --> x for small x */ + y = exp(-1. 
* lambda * (x - mu)); + if (y < 1e-7) return y; + else return (1.0 - exp(-1. * y)); +} + + +/* Function: ExtremeValueP2() + * + * Purpose: Calculate P(S>x) in a database of size N, + * using P(S>x) for a single sequence, according + * to a Poisson distribution. + * + * Args: x = score + * mu = characteristic value of extreme value distribution + * lambda = decay constant of extreme value distribution + * N = number of trials (number of sequences) + * + * Return: P(S>x) for database of size N + */ +double +ExtremeValueP2(float x, float mu, float lambda, int N) +{ + double y; + y = N * ExtremeValueP(x,mu,lambda); + if (y < 1e-7) return y; + else return (1.0 - exp(-1. * y)); +} + +/* Function: ExtremeValueE() + * + * Purpose: Calculate E(S>x) in a database of size N, + * using P(S>x) for a single sequence: simply np. + * + * Args: x = score + * mu = characteristic value of extreme value distribution + * lambda = decay constant of extreme value distribution + * N = number of trials (number of sequences) + * + * Return: E(S>x) for database of size N + */ +double +ExtremeValueE(float x, float mu, float lambda, int N) +{ + return (double)N * ExtremeValueP(x,mu,lambda); +} + + +/* Function: EVDrandom() + * + * Purpose: Randomly sample an x from an EVD. 
+ * Trivially done by the transformation method, since + * the distribution is analytical: + * x = \mu - \frac{\log \left[ -\log P(S= c + * lambda - a lambda to test + * ret_f - RETURN: 4.2.2 evaluated at lambda + * ret_df - RETURN: first derivative of 4.2.2 evaluated at lambda + * + * Return: (void) + */ +void +Lawless422(float *x, int *y, int n, int z, float c, + float lambda, float *ret_f, float *ret_df) +{ + double esum; /* \sum e^(-lambda xi) + z term */ + double xesum; /* \sum xi e^(-lambda xi) + z term */ + double xxesum; /* \sum xi^2 e^(-lambda xi) + z term */ + double xsum; /* \sum xi (no z term) */ + double mult; /* histogram count multiplier */ + double total; /* total samples */ + int i; + + esum = xesum = xsum = xxesum = total = 0.; + for (i = 0; i < n; i++) + { + mult = (y == NULL) ? 1. : (double) y[i]; + xsum += mult * x[i]; + esum += mult * exp(-1. * lambda * x[i]); + xesum += mult * x[i] * exp(-1. * lambda * x[i]); + xxesum += mult * x[i] * x[i] * exp(-1. * lambda * x[i]); + total += mult; + } + + /* Add z terms for censored data + */ + esum += (double) z * exp(-1. * lambda * c); + xesum += (double) z * c * exp(-1. * lambda * c); + xxesum += (double) z * c * c * exp(-1. * lambda * c); + + *ret_f = 1./lambda - xsum / total + xesum / esum; + *ret_df = ((xesum / esum) * (xesum / esum)) + - (xxesum / esum) + - (1. / (lambda * lambda)); + + return; +} + + + +/* Function: EVDMaxLikelyFit() + * Date: SRE, Fri Nov 14 07:56:29 1997 [St. Louis] + * + * Purpose: Given a list or a histogram of EVD-distributed samples, + * find maximum likelihood parameters lambda and + * mu. + * + * Algorithm: Uses approach described in [Lawless82]. Solves + * for lambda using Newton/Raphson iterations; + * then substitutes lambda into Lawless' equation 4.1.5 + * to get mu. + * + * Newton/Raphson algorithm developed from description in + * Numerical Recipes in C [Press88]. 
+ * + * Args: x - list of EVD distributed samples or x-axis of histogram + * c - NULL, or y-axis of histogram + * n - number of samples, or number of histogram bins + * ret_mu : RETURN: ML estimate of mu + * ret_lambda : RETURN: ML estimate of lambda + * + * Return: 1 on success; 0 on any failure + */ +int +EVDMaxLikelyFit(float *x, int *c, int n, float *ret_mu, float *ret_lambda) +{ + float lambda, mu; + float fx; /* f(x) */ + float dfx; /* f'(x) */ + double esum; /* \sum e^(-lambda xi) */ + double mult; + double total; + float tol = 1e-5; + int i; + + /* 1. Find an initial guess at lambda: linear regression here? + */ + lambda = 0.2; + + /* 2. Use Newton/Raphson to solve Lawless 4.1.6 and find ML lambda + */ + for (i = 0; i < 100; i++) + { + Lawless416(x, c, n, lambda, &fx, &dfx); + if (fabs(fx) < tol) break; /* success */ + lambda = lambda - fx / dfx; /* Newton/Raphson is simple */ + if (lambda <= 0.) lambda = 0.001; /* but be a little careful */ + } + + /* 2.5: If we did 100 iterations but didn't converge, Newton/Raphson failed. + * Resort to a bisection search. Worse convergence speed + * but guaranteed to converge (unlike Newton/Raphson). + * We assume (!?) that fx is a monotonically decreasing function of x; + * i.e. fx > 0 if we are left of the root, fx < 0 if we + * are right of the root. + */ + if (i == 100) + { + float left, right, mid; + SQD_DPRINTF2(("EVDMaxLikelyFit(): Newton/Raphson failed; switchover to bisection")); + + /* First we need to bracket the root */ + lambda = right = left = 0.2; + Lawless416(x, c, n, lambda, &fx, &dfx); + if (fx < 0.) + { /* fix right; search left. */ + do { + left -= 0.1; + if (left < 0.) { + SQD_DPRINTF2(("EVDMaxLikelyFit(): failed to bracket root")); + return 0; + } + Lawless416(x, c, n, left, &fx, &dfx); + } while (fx < 0.); + } + else + { /* fix left; search right. */ + do { + right += 0.1; + Lawless416(x, c, n, right, &fx, &dfx); + if (right > 100.) 
{ + SQD_DPRINTF2(("EVDMaxLikelyFit(): failed to bracket root")); + return 0; + } + } while (fx > 0.); + } + /* now we bisection search in left/right interval */ + for (i = 0; i < 100; i++) + { + mid = (left + right) / 2.; + Lawless416(x, c, n, mid, &fx, &dfx); + if (fabs(fx) < tol) break; /* success */ + if (fx > 0.) left = mid; + else right = mid; + } + if (i == 100) { + SQD_DPRINTF2(("EVDMaxLikelyFit(): even the bisection search failed")); + return 0; + } + lambda = mid; + } + + /* 3. Substitute into Lawless 4.1.5 to find mu + */ + esum = 0.; + total = 0.; + for (i = 0; i < n; i++) + { + mult = (c == NULL) ? 1. : (double) c[i]; + esum += mult * exp(-1 * lambda * x[i]); + total += mult; + } + mu = -1. * log(esum / total) / lambda; + + *ret_lambda = lambda; + *ret_mu = mu; + return 1; +} + + +/* Function: EVDCensoredFit() + * Date: SRE, Mon Nov 17 10:01:05 1997 [St. Louis] + * + * Purpose: Given a /left-censored/ list or histogram of EVD-distributed + * samples, as well as the number of censored samples z and the + * censoring value c, + * find maximum likelihood parameters lambda and + * mu. + * + * Algorithm: Uses approach described in [Lawless82]. Solves + * for lambda using Newton/Raphson iterations; + * then substitutes lambda into Lawless' equation 4.2.3 + * to get mu. + * + * Newton/Raphson algorithm developed from description in + * Numerical Recipes in C [Press88]. 
+ * + * Args: x - list of EVD distributed samples or x-axis of histogram + * y - NULL, or y-axis of histogram + * n - number of observed samples,or number of histogram bins + * z - number of censored samples + * c - censoring value (all x_i >= c) + * ret_mu : RETURN: ML estimate of mu + * ret_lambda : RETURN: ML estimate of lambda + * + * Return: (void) + */ +int +EVDCensoredFit(float *x, int *y, int n, int z, float c, + float *ret_mu, float *ret_lambda) +{ + float lambda, mu; + float fx; /* f(x) */ + float dfx; /* f'(x) */ + double esum; /* \sum e^(-lambda xi) */ + double mult; + double total; + float tol = 1e-5; + int i; + + /* 1. Find an initial guess at lambda: linear regression here? + */ + lambda = 0.2; + + /* 2. Use Newton/Raphson to solve Lawless 4.2.2 and find ML lambda + */ + for (i = 0; i < 100; i++) + { + Lawless422(x, y, n, z, c, lambda, &fx, &dfx); + if (fabs(fx) < tol) break; /* success */ + lambda = lambda - fx / dfx; /* Newton/Raphson is simple */ + if (lambda <= 0.) lambda = 0.001; /* but be a little careful */ + } + + /* 2.5: If we did 100 iterations but didn't converge, Newton/Raphson failed. + * Resort to a bisection search. Worse convergence speed + * but guaranteed to converge (unlike Newton/Raphson). + * We assume (!?) that fx is a monotonically decreasing function of x; + * i.e. fx > 0 if we are left of the root, fx < 0 if we + * are right of the root. + */ + if (i == 100) + { + float left, right, mid; + /* First we need to bracket the root */ + SQD_DPRINTF2(("EVDCensoredFit(): Newton/Raphson failed; switched to bisection")); + lambda = right = left = 0.2; + Lawless422(x, y, n, z, c, lambda, &fx, &dfx); + if (fx < 0.) + { /* fix right; search left. */ + do { + left -= 0.03; + if (left < 0.) { + SQD_DPRINTF2(("EVDCensoredFit(): failed to bracket root")); + return 0; + } + Lawless422(x, y, n, z, c, left, &fx, &dfx); + } while (fx < 0.); + } + else + { /* fix left; search right. 
*/ + do { + right += 0.1; + Lawless422(x, y, n, z, c, left, &fx, &dfx); + if (right > 100.) { + SQD_DPRINTF2(("EVDCensoredFit(): failed to bracket root")); + return 0; + } + } while (fx > 0.); + } + /* now we bisection search in left/right interval */ + for (i = 0; i < 100; i++) + { + mid = (left + right) / 2.; + Lawless422(x, y, n, z, c, left, &fx, &dfx); + if (fabs(fx) < tol) break; /* success */ + if (fx > 0.) left = mid; + else right = mid; + } + if (i == 100) { + SQD_DPRINTF2(("EVDCensoredFit(): even the bisection search failed")); + return 0; + } + lambda = mid; + } + + /* 3. Substitute into Lawless 4.2.3 to find mu + */ + esum = total = 0.; + for (i = 0; i < n; i++) + { + mult = (y == NULL) ? 1. : (double) y[i]; + esum += mult * exp(-1. * lambda * x[i]); + total += mult; + } + esum += (double) z * exp(-1. * lambda * c); /* term from censored data */ + mu = -1. * log(esum / total) / lambda; + + *ret_lambda = lambda; + *ret_mu = mu; + return 1; +} + + + diff --git a/forester/archive/RIO/others/hmmer/src/hmmalign.c b/forester/archive/RIO/others/hmmer/src/hmmalign.c new file mode 100644 index 0000000..27d0802 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/hmmalign.c @@ -0,0 +1,325 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* hmmalign.c + * SRE, Thu Dec 18 16:05:29 1997 [St. Louis] + * + * main() for aligning a set of sequences to an HMM. 
+ * RCS $Id: hmmalign.c,v 1.1.1.1 2005/03/22 08:34:00 cmzmasek Exp $ + */ + +#include +#include + +#include "structs.h" /* data structures, macros, #define's */ +#include "config.h" /* compile-time configuration constants */ +#include "funcs.h" /* function declarations */ +#include "globals.h" /* alphabet global variables */ +#include "squid.h" /* general sequence analysis library */ +#include "msa.h" /* squid's multiple alignment i/o */ + +static char banner[] = "hmmalign - align sequences to an HMM profile"; + +static char usage[] = "\ +Usage: hmmalign [-options] \n\ +Available options are:\n\ + -h : help; print brief help on version and usage\n\ + -m : only print symbols aligned to match states\n\ + -o : save alignment in file in SELEX format\n\ + -q : quiet - suppress verbose banner\n\ +"; + +static char experts[] = "\ + --informat : sequence file is in format , not FASTA\n\ + --mapali : include alignment in file using map in HMM\n\ + --withali : include alignment to (fixed) alignment in file \n\ +\n"; + +static struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, + { "-m", TRUE, sqdARG_NONE } , + { "-o", TRUE, sqdARG_STRING }, + { "-q", TRUE, sqdARG_NONE }, + { "--informat",FALSE, sqdARG_STRING }, + { "--mapali", FALSE, sqdARG_STRING }, + { "--withali", FALSE, sqdARG_STRING }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +static void include_alignment(char *seqfile, struct plan7_s *hmm, int do_mapped, + char ***rseq, char ***dsq, SQINFO **sqinfo, + struct p7trace_s ***tr, int *nseq); + +int +main(int argc, char **argv) +{ + char *hmmfile; /* file to read HMMs from */ + HMMFILE *hmmfp; /* opened hmmfile for reading */ + struct plan7_s *hmm; /* HMM to align to */ + char *seqfile; /* file to read target sequence from */ + int format; /* format of seqfile */ + char **rseq; /* raw, unaligned sequences */ + SQINFO *sqinfo; /* info associated with sequences */ + char **dsq; /* digitized raw sequences */ + int nseq; /* number of sequences */ + 
float *wgt; /* weights to assign to alignment */ + MSA *msa; /* alignment that's created */ + int i; + struct p7trace_s **tr; /* traces for aligned sequences */ + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + int be_quiet; /* TRUE to suppress verbose banner */ + int matchonly; /* TRUE to show only match state syms */ + char *outfile; /* optional alignment output file */ + FILE *ofp; /* handle on alignment output file */ + char *withali; /* name of additional alignment file to align */ + char *mapali; /* name of additional alignment file to map */ + + /*********************************************** + * Parse command line + ***********************************************/ + + format = SQFILE_UNKNOWN; /* default: autodetect format */ + matchonly = FALSE; + outfile = NULL; + be_quiet = FALSE; + withali = NULL; + mapali = NULL; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) { + if (strcmp(optname, "-m") == 0) matchonly= TRUE; + else if (strcmp(optname, "-o") == 0) outfile = optarg; + else if (strcmp(optname, "-q") == 0) be_quiet = TRUE; + else if (strcmp(optname, "--mapali") == 0) mapali = optarg; + else if (strcmp(optname, "--withali") == 0) withali = optarg; + else if (strcmp(optname, "--informat") == 0) { + format = String2SeqfileFormat(optarg); + if (format == SQFILE_UNKNOWN) + Die("unrecognized sequence file format \"%s\"", optarg); + } + else if (strcmp(optname, "-h") == 0) + { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(EXIT_SUCCESS); + } + } + if (argc - optind != 2) + Die("Incorrect number of arguments.\n%s\n", usage); + + hmmfile = argv[optind++]; + seqfile = argv[optind++]; + + /*********************************************** + * Open HMM file (might be in HMMERDB or current directory). + * Read a single HMM from it. 
+ * + * Currently hmmalign disallows the J state and + * only allows one domain per sequence. To preserve + * the S/W entry information, the J state is explicitly + * disallowed, rather than calling a Plan7*Config() function. + * this is a workaround in 2.1 for the 2.0.x "yo!" bug. + ***********************************************/ + + if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL) + Die("Failed to open HMM file %s\n%s", hmmfile, usage); + if (!HMMFileRead(hmmfp, &hmm)) + Die("Failed to read any HMMs from %s\n", hmmfile); + HMMFileClose(hmmfp); + if (hmm == NULL) + Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile); + hmm->xt[XTE][MOVE] = 1.; /* only 1 domain/sequence ("global" alignment) */ + hmm->xt[XTE][LOOP] = 0.; + P7Logoddsify(hmm, TRUE); + /* do we have the map we might need? */ + if (mapali != NULL && ! (hmm->flags & PLAN7_MAP)) + Die("HMMER: HMM file %s has no map; you can't use --mapali.", hmmfile); + + /*********************************************** + * Open sequence file in current directory. + * Read all seqs from it. + ***********************************************/ + + if (! ReadMultipleRseqs(seqfile, format, &rseq, &sqinfo, &nseq)) + Die("Failed to read any sequences from file %s", seqfile); + + /*********************************************** + * Show the banner + ***********************************************/ + + if (! be_quiet) + { + Banner(stdout, banner); + printf( "HMM file: %s\n", hmmfile); + printf( "Sequence file: %s\n", seqfile); + printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n"); + } + + /*********************************************** + * Do the work + ***********************************************/ + + /* Allocations and initializations. 
+ */ + dsq = MallocOrDie(sizeof(char *) * nseq); + tr = MallocOrDie(sizeof(struct p7trace_s *) * nseq); + + /* Align each sequence to the model, collect traces + */ + for (i = 0; i < nseq; i++) + { + dsq[i] = DigitizeSequence(rseq[i], sqinfo[i].len); + + if (P7ViterbiSize(sqinfo[i].len, hmm->M) <= RAMLIMIT) + (void) P7Viterbi(dsq[i], sqinfo[i].len, hmm, &(tr[i])); + else + (void) P7SmallViterbi(dsq[i], sqinfo[i].len, hmm, &(tr[i])); + } + + /* Include an aligned alignment, if desired. + */ + if (mapali != NULL) + include_alignment(mapali, hmm, TRUE, &rseq, &dsq, &sqinfo, &tr, &nseq); + if (withali != NULL) + include_alignment(withali, hmm, FALSE, &rseq, &dsq, &sqinfo, &tr, &nseq); + + /* Turn traces into a multiple alignment + */ + wgt = MallocOrDie(sizeof(float) * nseq); + FSet(wgt, nseq, 1.0); + msa = P7Traces2Alignment(dsq, sqinfo, wgt, nseq, hmm->M, tr, matchonly); + + /*********************************************** + * Output the alignment + ***********************************************/ + + if (outfile != NULL && (ofp = fopen(outfile, "w")) != NULL) + { + WriteStockholm(ofp, msa); + printf("Alignment saved in file %s\n", outfile); + fclose(ofp); + } + else + WriteStockholm(stdout, msa); + + /*********************************************** + * Cleanup and exit + ***********************************************/ + + for (i = 0; i < nseq; i++) + { + P7FreeTrace(tr[i]); + FreeSequence(rseq[i], &(sqinfo[i])); + free(dsq[i]); + } + MSAFree(msa); + FreePlan7(hmm); + free(sqinfo); + free(rseq); + free(dsq); + free(wgt); + free(tr); + + SqdClean(); + return 0; +} + + +/* Function: include_alignment() + * Date: SRE, Sun Jul 5 15:25:13 1998 [St. Louis] + * + * Purpose: Given the name of a multiple alignment file, + * align that alignment to the HMM, and add traces + * to an existing array of traces. If do_mapped + * is TRUE, we use the HMM's map file. If not, + * we use P7ViterbiAlignAlignment(). 
+ * + * Args: seqfile - name of alignment file + * hmm - model to align to + * do_mapped- TRUE if we're to use the HMM's alignment map + * rsq - RETURN: array of rseqs to add to + * dsq - RETURN: array of dsq to add to + * sqinfo - RETURN: array of SQINFO to add to + * tr - RETURN: array of traces to add to + * nseq - RETURN: number of seqs + * + * Returns: new, realloc'ed arrays for rsq, dsq, sqinfo, tr; nseq is + * increased to nseq+ainfo.nseq. + */ +static void +include_alignment(char *seqfile, struct plan7_s *hmm, int do_mapped, + char ***rsq, char ***dsq, SQINFO **sqinfo, + struct p7trace_s ***tr, int *nseq) +{ + int format; /* format of alignment file */ + MSA *msa; /* alignment to align to */ + MSAFILE *afp; + SQINFO *newinfo; /* sqinfo array from msa */ + char **newdsq; + char **newrseq; + int idx; /* counter over aseqs */ + struct p7trace_s *master; /* master trace */ + struct p7trace_s **addtr; /* individual traces for aseq */ + + format = MSAFILE_UNKNOWN; /* invoke Babelfish */ + if ((afp = MSAFileOpen(seqfile, format, NULL)) == NULL) + Die("Alignment file %s could not be opened for reading", seqfile); + if ((msa = MSAFileRead(afp)) == NULL) + Die("Failed to read an alignment from %s\n", seqfile); + MSAFileClose(afp); + for (idx = 0; idx < msa->nseq; idx++) + s2upper(msa->aseq[idx]); + newinfo = MSAToSqinfo(msa); + + /* Verify checksums before mapping */ + if (do_mapped && GCGMultchecksum(msa->aseq, msa->nseq) != hmm->checksum) + Die("The checksums for alignment file %s and the HMM alignment map don't match.", + seqfile); + /* Get a master trace */ + if (do_mapped) master = MasterTraceFromMap(hmm->map, hmm->M, msa->alen); + else master = P7ViterbiAlignAlignment(msa, hmm); + + /* convert to individual traces */ + ImposeMasterTrace(msa->aseq, msa->nseq, master, &addtr); + /* add those traces to existing ones */ + *tr = MergeTraceArrays(*tr, *nseq, addtr, msa->nseq); + + /* additional bookkeeping: add to dsq, sqinfo */ + *rsq = ReallocOrDie((*rsq), 
sizeof(char *) * (*nseq + msa->nseq)); + DealignAseqs(msa->aseq, msa->nseq, &newrseq); + for (idx = *nseq; idx < *nseq + msa->nseq; idx++) + (*rsq)[idx] = newrseq[idx - (*nseq)]; + free(newrseq); + + *dsq = ReallocOrDie((*dsq), sizeof(char *) * (*nseq + msa->nseq)); + DigitizeAlignment(msa, &newdsq); + for (idx = *nseq; idx < *nseq + msa->nseq; idx++) + (*dsq)[idx] = newdsq[idx - (*nseq)]; + free(newdsq); + /* unnecessarily complex, but I can't be bothered... */ + *sqinfo = ReallocOrDie((*sqinfo), sizeof(SQINFO) * (*nseq + msa->nseq)); + for (idx = *nseq; idx < *nseq + msa->nseq; idx++) + SeqinfoCopy(&((*sqinfo)[idx]), &(newinfo[idx - (*nseq)])); + + *nseq = *nseq + msa->nseq; + + /* Cleanup */ + P7FreeTrace(master); + MSAFree(msa); + /* Return */ + return; +} + + + diff --git a/forester/archive/RIO/others/hmmer/src/hmmbuild.c b/forester/archive/RIO/others/hmmer/src/hmmbuild.c new file mode 100644 index 0000000..b9308c7 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/hmmbuild.c @@ -0,0 +1,1129 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* hmmbuild.c + * SRE, Mon Nov 18 12:41:29 1996 + * + * main() for HMM construction from an alignment. 
+ * CVS $Id: hmmbuild.c,v 1.1.1.1 2005/03/22 08:34:06 cmzmasek Exp $ + */ + +#include +#include +#include + +#include "structs.h" /* data structures, macros, #define's */ +#include "config.h" /* compile-time configuration constants */ +#include "funcs.h" /* function declarations */ +#include "globals.h" /* alphabet global variables */ +#include "squid.h" /* general sequence analysis library */ +#include "msa.h" /* squid's multiple alignment i/o */ + +static char banner[] = "hmmbuild - build a hidden Markov model from an alignment"; + +static char usage[] = "\ +Usage: hmmbuild [-options] \n\ + Available options are:\n\ + -h : help; print brief help on version and usage\n\ + -n : name; name this (first) HMM \n\ + -o : re-save annotated alignment to \n\ + -A : append; append this HMM to \n\ + -F : force; allow overwriting of \n\ +\n\ + Alternative search algorithm styles: (default: hmmls domain alignment)\n\ + -f : multi-hit local (hmmfs style)\n\ + -g : global alignment (hmms style, Needleman/Wunsch)\n\ + -s : local alignment (hmmsw style, Smith/Waterman)\n\ +"; + +static char experts[] = "\ + Alternative model construction strategies: (default: MAP)\n\ + --fast : Krogh/Haussler fast heuristic construction (see --gapmax)\n\ + --hand : manual construction (requires annotated alignment)\n\ +\n\ + Expert customization of parameters and priors:\n\ + --null : read null (random sequence) model from \n\ + --pam : heuristic PAM-based prior, using BLAST PAM matrix in \n\ + --prior : read Dirichlet prior parameters from \n\ +\n\ + Alternative sequence weighting strategies: (default: GSC weights)\n\ + --wblosum : Henikoff simple filter weights (see --idlevel)\n\ + --wgsc : Gerstein/Sonnhammer/Chothia tree weights (default)\n\ + --wme : maximum entropy (ME)\n\ + --wpb : Henikoff position-based weights\n\ + --wvoronoi : Sibbald/Argos Voronoi weights\n\ + --wnone : don't do any weighting\n\ + --noeff : don't use effective sequence number; just use nseq\n\ + --pbswitch : set switch 
from GSC to position-based wgts at > n seqs\n\ +\n\ + Forcing an alphabet: (normally autodetected)\n\ + --amino : override autodetection, assert that seqs are protein\n\ + --nucleic : override autodetection, assert that seqs are DNA/RNA\n\ +\n\ + Other expert options:\n\ + --archpri : set architecture size prior to {0.85} [0..1]\n\ + --binary : save the model in binary format, not ASCII text\n\ + --cfile : save count vectors to \n\ + --gapmax : max fraction of gaps in mat column {0.50} [0..1]\n\ + --idlevel : set frac. id level used by eff. nseq and --wblosum {0.62}\n\ + --informat : input alignment is in format , not Stockholm\n\ + --pamwgt : set weight on PAM-based prior to {20.}[>=0]\n\ + --swentry : set S/W aggregate entry prob. to {0.5}\n\ + --swexit : set S/W aggregate exit prob. to {0.5}\n\ + --verbose : print boring information\n\ +\n"; + +static struct opt_s OPTIONS[] = { + { "-f", TRUE, sqdARG_NONE }, + { "-g", TRUE, sqdARG_NONE }, + { "-h", TRUE, sqdARG_NONE }, + { "-n", TRUE, sqdARG_STRING}, + { "-o", TRUE, sqdARG_STRING}, + { "-s", TRUE, sqdARG_NONE }, + { "-A", TRUE, sqdARG_NONE }, + { "-F", TRUE, sqdARG_NONE }, + { "--amino", FALSE, sqdARG_NONE }, + { "--archpri", FALSE, sqdARG_FLOAT }, + { "--binary", FALSE, sqdARG_NONE }, + { "--cfile", FALSE, sqdARG_STRING}, + { "--fast", FALSE, sqdARG_NONE}, + { "--gapmax", FALSE, sqdARG_FLOAT }, + { "--hand", FALSE, sqdARG_NONE}, + { "--idlevel", FALSE, sqdARG_FLOAT }, + { "--informat",FALSE, sqdARG_STRING }, + { "--noeff", FALSE, sqdARG_NONE }, + { "--nucleic", FALSE, sqdARG_NONE }, + { "--null", FALSE, sqdARG_STRING }, + { "--pam", FALSE, sqdARG_STRING }, + { "--pamwgt", FALSE, sqdARG_FLOAT }, + { "--pbswitch",FALSE, sqdARG_INT }, + { "--prior", FALSE, sqdARG_STRING }, + { "--swentry", FALSE, sqdARG_FLOAT }, + { "--swexit", FALSE, sqdARG_FLOAT }, + { "--verbose", FALSE, sqdARG_NONE }, + { "--wgsc", FALSE, sqdARG_NONE }, + { "--wblosum", FALSE, sqdARG_NONE }, + { "--wme", FALSE, sqdARG_NONE }, + { "--wnone", 
FALSE, sqdARG_NONE }, + { "--wpb", FALSE, sqdARG_NONE }, + { "--wvoronoi",FALSE, sqdARG_NONE }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +static void print_all_scores(FILE *fp, struct plan7_s *hmm, + char **dsq, MSA *msa, struct p7trace_s **tr); +static void save_countvectors(FILE *cfp, char *name, struct plan7_s *hmm); +static void position_average_score(struct plan7_s *hmm, char **seq, float *wgt, + int nseq, struct p7trace_s **tr, float *pernode, + float *ret_avg); +static float frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr, + float *pernode, float expected); +static void maximum_entropy(struct plan7_s *hmm, char **dsq, MSA *msa, + float eff_nseq, + struct p7prior_s *prior, struct p7trace_s **tr); + + +int +main(int argc, char **argv) +{ + char *seqfile; /* seqfile to read alignment from */ + int format; /* format of seqfile */ + MSAFILE *afp; /* open alignment file */ + MSA *msa; /* a multiple sequence alignment */ + char **dsq; /* digitized unaligned aseq's */ + struct plan7_s *hmm; /* constructed HMM; written to hmmfile */ + struct p7prior_s *pri; /* Dirichlet priors to use */ + struct p7trace_s **tr; /* fake tracebacks for aseq's */ + char *hmmfile; /* file to write HMM to */ + FILE *hmmfp; /* HMM output file handle */ + char *name; /* name of the HMM */ + int idx; /* counter for sequences */ + float randomseq[MAXABET]; /* null sequence model */ + float p1; /* null sequence model p1 transition */ + int nali; /* count number of alignments/HMMs */ + char fpopts[3]; /* options to open a file with, e.g. 
"ab" */ + int checksum; /* checksum of the alignment */ + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + + enum p7_construction c_strategy; /* construction strategy choice */ + enum p7_weight { /* weighting strategy */ + WGT_NONE, WGT_GSC, WGT_BLOSUM, WGT_PB, WGT_VORONOI, WGT_ME} w_strategy; + enum p7_config { /* algorithm configuration strategy */ + P7_BASE_CONFIG, P7_LS_CONFIG, P7_FS_CONFIG, P7_SW_CONFIG } cfg_strategy; + float gapmax; /* max frac gaps in mat col for -k */ + int overwrite_protect; /* TRUE to prevent overwriting HMM file */ + int verbose; /* TRUE to show a lot of output */ + char *rndfile; /* random sequence model file to read */ + char *prifile; /* Dirichlet prior file to read */ + char *pamfile; /* PAM matrix file for heuristic prior */ + char *align_ofile; /* name of output alignment file */ + char *cfile; /* output file for count vectors */ + FILE *alignfp; /* open filehandle for alignment resaves */ + FILE *cfp; /* open filehandle for count vector saves*/ + float archpri; /* "architecture" prior on model size */ + float pamwgt; /* weight on PAM for heuristic prior */ + int do_append; /* TRUE to append to hmmfile */ + int do_binary; /* TRUE to write in binary format */ + float blosumlevel; /* BLOSUM frac id filtering level [0.62] */ + float swentry; /* S/W aggregate entry probability */ + float swexit; /* S/W aggregate exit probability */ + int do_eff; /* TRUE to set an effective seq number */ + float eff_nseq; /* effective sequence number */ + int pbswitch; /* nseq >= this, switchover to PB weights*/ + char *setname; /* NULL, or ptr to HMM name to set */ + int gapmax_set; /* TRUE if gapmax was set on commandline */ + + /*********************************************** + * Parse command line + ***********************************************/ + + format = MSAFILE_UNKNOWN; /* autodetect format by default. 
*/ + c_strategy = P7_MAP_CONSTRUCTION; + w_strategy = WGT_GSC; + blosumlevel = 0.62; + cfg_strategy = P7_LS_CONFIG; + gapmax = 0.5; + overwrite_protect = TRUE; + verbose = FALSE; + rndfile = NULL; + prifile = NULL; + pamfile = NULL; + align_ofile = NULL; + alignfp = NULL; + cfile = NULL; + cfp = NULL; + archpri = 0.85; + pamwgt = 20.; + Alphabet_type = hmmNOTSETYET; /* initially unknown */ + name = NULL; + do_append = FALSE; + swentry = 0.5; + swexit = 0.5; + do_eff = TRUE; + do_binary = FALSE; + pbswitch = 1000; + setname = NULL; + gapmax_set = FALSE; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) { + if (strcmp(optname, "-f") == 0) cfg_strategy = P7_FS_CONFIG; + else if (strcmp(optname, "-g") == 0) cfg_strategy = P7_BASE_CONFIG; + else if (strcmp(optname, "-n") == 0) setname = optarg; + else if (strcmp(optname, "-o") == 0) align_ofile = optarg; + else if (strcmp(optname, "-s") == 0) cfg_strategy = P7_SW_CONFIG; + else if (strcmp(optname, "-A") == 0) do_append = TRUE; + else if (strcmp(optname, "-F") == 0) overwrite_protect = FALSE; + else if (strcmp(optname, "--amino") == 0) SetAlphabet(hmmAMINO); + else if (strcmp(optname, "--archpri") == 0) archpri = atof(optarg); + else if (strcmp(optname, "--binary") == 0) do_binary = TRUE; + else if (strcmp(optname, "--cfile") == 0) cfile = optarg; + else if (strcmp(optname, "--fast") == 0) c_strategy = P7_FAST_CONSTRUCTION; + else if (strcmp(optname, "--gapmax") == 0) { gapmax = atof(optarg); gapmax_set = TRUE; } + else if (strcmp(optname, "--hand") == 0) c_strategy = P7_HAND_CONSTRUCTION; + else if (strcmp(optname, "--idlevel") == 0) blosumlevel = atof(optarg); + else if (strcmp(optname, "--noeff") == 0) do_eff = FALSE; + else if (strcmp(optname, "--nucleic") == 0) SetAlphabet(hmmNUCLEIC); + else if (strcmp(optname, "--null") == 0) rndfile = optarg; + else if (strcmp(optname, "--pam") == 0) pamfile = optarg; + else if (strcmp(optname, "--pamwgt") == 0) pamwgt = atof(optarg); + else 
if (strcmp(optname, "--pbswitch")== 0) pbswitch = atoi(optarg); + else if (strcmp(optname, "--prior") == 0) prifile = optarg; + else if (strcmp(optname, "--swentry") == 0) swentry = atof(optarg); + else if (strcmp(optname, "--swexit") == 0) swexit = atof(optarg); + else if (strcmp(optname, "--verbose") == 0) verbose = TRUE; + else if (strcmp(optname, "--wgsc") == 0) w_strategy = WGT_GSC; + else if (strcmp(optname, "--wblosum") == 0) w_strategy = WGT_BLOSUM; + else if (strcmp(optname, "--wme") == 0) w_strategy = WGT_ME; + else if (strcmp(optname, "--wpb") == 0) w_strategy = WGT_PB; + else if (strcmp(optname, "--wnone") == 0) w_strategy = WGT_NONE; + else if (strcmp(optname, "--wvoronoi")== 0) w_strategy = WGT_VORONOI; + else if (strcmp(optname, "--informat") == 0) { + format = String2SeqfileFormat(optarg); + if (format == MSAFILE_UNKNOWN) + Die("unrecognized sequence file format \"%s\"", optarg); + if (! IsAlignmentFormat(format)) + Die("%s is an unaligned format, can't read as an alignment", optarg); + } + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(EXIT_SUCCESS); + } + } + if (argc - optind != 2) + Die("Incorrect number of arguments.\n%s\n", usage); + + hmmfile = argv[optind++]; + seqfile = argv[optind++]; + + if (gapmax < 0. || gapmax > 1.) + Die("--gapmax must be a value from 0 to 1\n%s\n", usage); + if (archpri < 0. || archpri > 1.) + Die("--archpri must be a value from 0 to 1\n%s\n", usage); + if (overwrite_protect && !do_append && FileExists(hmmfile)) + Die("HMM file %s already exists. Rename or delete it.", hmmfile); + if (overwrite_protect && align_ofile != NULL && FileExists(align_ofile)) + Die("Alignment resave file %s exists. 
Rename or delete it.", align_ofile); + if (gapmax_set && c_strategy != P7_FAST_CONSTRUCTION) + Die("using --gapmax only makes sense if you use --fast"); + + /*********************************************** + * Preliminaries: open our files for i/o + ***********************************************/ + + /* Open the alignment */ + if ((afp = MSAFileOpen(seqfile, format, NULL)) == NULL) + Die("Alignment file %s could not be opened for reading", seqfile); + + /* Open the HMM output file */ + if (do_append) strcpy(fpopts, "a"); + else strcpy(fpopts, "w"); + if (do_binary) strcat(fpopts, "b"); + if ((hmmfp = fopen(hmmfile, fpopts)) == NULL) + Die("Failed to open HMM file %s for %s\n", hmmfile, + do_append ? "appending" : "writing"); + + /* Open the count vector save file */ + cfp = NULL; + if (cfile != NULL) + if ((cfp = fopen(cfile, "w")) == NULL) + Die("Failed to open count vector file %s for writing\n", cfile); + + /* Open the alignment resave file */ + alignfp = NULL; + if (align_ofile != NULL) + if ((alignfp = fopen(align_ofile, "w")) == NULL) + Die("Failed to open alignment resave file %s for writing\n", align_ofile); + + /*********************************************** + * Show the banner + ***********************************************/ + + Banner(stdout, banner); + printf("Alignment file: %s\n", + seqfile); + printf("File format: %s\n", + SeqfileFormat2String(afp->format)); + + printf("Search algorithm configuration: "); + if (cfg_strategy == P7_BASE_CONFIG) puts("Global alignment (hmms)"); + else if (cfg_strategy == P7_SW_CONFIG) { + puts("Local (hmmsw)"); + printf("S/W aggregate entry probability: %.2f\n", swentry); + printf("S/W aggregate exit probability: %.2f\n", swexit); + } + else if (cfg_strategy == P7_LS_CONFIG) puts("Multiple domain (hmmls)"); + else if (cfg_strategy == P7_FS_CONFIG) { + puts("Multiple local (hmmfs)"); + printf("S/W aggregate entry probability: %.2f\n", swentry); + printf("S/W aggregate exit probability: %.2f\n", swexit); + } + + 
printf("Model construction strategy: "); + if (c_strategy == P7_HAND_CONSTRUCTION) puts("Manual, from #=RF annotation"); + else if (c_strategy==P7_FAST_CONSTRUCTION) printf("Fast/ad hoc (gapmax %.2f)\n", gapmax); + else printf("MAP (gapmax hint: %.2f)\n", gapmax); + + printf("Null model used: %s\n", + (rndfile == NULL) ? "(default)" : rndfile); + + printf("Prior used: %s\n", + (prifile == NULL) ? "(default)" : prifile); + + printf("Sequence weighting method: "); + if (w_strategy == WGT_NONE) puts("none"); + else if (w_strategy == WGT_GSC) puts("G/S/C tree weights"); + else if (w_strategy == WGT_BLOSUM) printf("BLOSUM filter at %.2f id\n", blosumlevel); + else if (w_strategy == WGT_PB) puts("Henikoff position-based"); + else if (w_strategy == WGT_VORONOI)puts("Sibbald/Argos Voronoi"); + else if (w_strategy == WGT_ME) puts("Maximum entropy"); + + printf("New HMM file: %s %s\n", + hmmfile, do_append? "[appending]" : ""); + if (cfile != NULL) + printf("Count vectors saved to: %s\n", cfile); + if (align_ofile != NULL) + printf("Annotated alignment(s) resaved to: %s\n", align_ofile); + printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n"); + + + /*********************************************** + * Get alignment(s), build HMMs one at a time + ***********************************************/ + + nali = 0; + while ((msa = MSAFileRead(afp)) != NULL) + { + /* Print some stuff about what we're about to do. + */ + if (msa->name != NULL) printf("Alignment: %s\n", msa->name); + else printf("Alignment: #%d\n", nali+1); + printf ("Number of sequences: %d\n", msa->nseq); + printf ("Number of columns: %d\n", msa->alen); + puts(""); + fflush(stdout); + + /* Make alignment upper case, because some symbol counting + * things are case-sensitive. 
+ */ + for (idx = 0; idx < msa->nseq; idx++) + s2upper(msa->aseq[idx]); + + /* Set up the alphabet globals: + * either already set by --amino or --nucleic, or + * we guess based on the first alignment we see + */ + if (Alphabet_type == hmmNOTSETYET) + DetermineAlphabet(msa->aseq, msa->nseq); + + /* Do some initialization the first time through. + * This code must be delayed until after we've seen the + * first alignment, because we have to see the alphabet type first + */ + if (nali == 0) + { + /* Set up Dirichlet priors */ + if (prifile == NULL) pri = P7DefaultPrior(); + else pri = P7ReadPrior(prifile); + + if (pamfile != NULL) PAMPrior(pamfile, pri, pamwgt); + + /* Set up the null/random seq model */ + if (rndfile == NULL) P7DefaultNullModel(randomseq, &p1); + else P7ReadNullModel(rndfile, randomseq, &p1); + } + + /* Prepare unaligned digitized sequences for internal use + */ + DigitizeAlignment(msa, &dsq); + + /* In some respects we treat DNA more crudely for now; + * for example, we can't do eff seq #, because it's + * calibrated for protein. + */ + if (Alphabet_type == hmmNUCLEIC) + do_eff = FALSE; + + /* Determine "effective sequence number". + * The BlosumWeights() routine is now an efficient O(N) + * memory clustering algorithm that doesn't blow up on, + * say, Pfam's GP120 alignment (13000+ sequences) + */ + eff_nseq = (float) msa->nseq; + if (do_eff) + { + float *wgt; + printf("%-40s ... ", "Determining effective sequence number"); + fflush(stdout); + /* dummy weights array to feed BlosumWeights*/ + wgt = MallocOrDie(sizeof(float) * msa->nseq); + BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, wgt); + eff_nseq = FSum(wgt, msa->nseq); + + free(wgt); + printf("done. [%.0f]\n", eff_nseq); + } + + + /* Weight the sequences (optional), + */ + if (w_strategy == WGT_GSC || + w_strategy == WGT_BLOSUM || + w_strategy == WGT_VORONOI || + w_strategy == WGT_PB) + { + printf("%-40s ... 
", "Weighting sequences heuristically"); + fflush(stdout); + + if (w_strategy != WGT_PB && msa->nseq >= pbswitch) + { + printf("[big alignment! doing PB]... "); + PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt); + } + else if (w_strategy == WGT_GSC) + GSCWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt); + else if (w_strategy == WGT_BLOSUM) + BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, msa->wgt); + else if (w_strategy == WGT_PB) + PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt); + else if (w_strategy == WGT_VORONOI) + VoronoiWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt); + + printf("done.\n"); + } + + /* Set the effective sequence number (if do_eff is FALSE, eff_nseq + * was set to nseq). + */ + FNorm(msa->wgt, msa->nseq); + FScale(msa->wgt, msa->nseq, eff_nseq); + + /* Build a model architecture. + * If we're not doing MD or ME, that's all we need to do. + * We get an allocated, counts-based HMM back. + * + * Because the architecture algorithms are allowed to change + * gap characters in the alignment, we have to calculate the + * alignment checksum before we enter the algorithms. + */ + printf("%-40s ... ", "Constructing model architecture"); + fflush(stdout); + checksum = GCGMultchecksum(msa->aseq, msa->nseq); + if (c_strategy == P7_FAST_CONSTRUCTION) + P7Fastmodelmaker(msa, dsq, gapmax, &hmm, &tr); + else if (c_strategy == P7_HAND_CONSTRUCTION) + P7Handmodelmaker(msa, dsq, &hmm, &tr); + else + P7Maxmodelmaker(msa, dsq, gapmax, + pri, randomseq, p1, archpri, &hmm, &tr); + hmm->checksum = checksum; + printf("done.\n"); + + /* Save the count vectors if asked. Used primarily for + * making the data files for training priors. + */ + if (cfile != NULL) + { + printf("%-40s ... ", "Saving count vector file"); + fflush(stdout); + save_countvectors(cfp, + (msa->name != NULL ? msa->name : "-"), + hmm); + printf("done. 
[%s]\n", cfile); + } + + /* Record the null model in the HMM; + * add prior contributions in pseudocounts and renormalize. + */ + printf("%-40s ... ", "Converting counts to probabilities"); + fflush(stdout); + Plan7SetNullModel(hmm, randomseq, p1); + P7PriorifyHMM(hmm, pri); + printf("done.\n"); + + /* Model configuration, temporary. + * hmmbuild assumes that it's given an alignment of single domains, + * and the alignment may contain fragments. So, for the purpose of + * scoring the sequences (or, optionally, MD/ME weighting), + * configure the model into hmmsw mode. Later we'll + * configure the model according to how the user wants to + * use it. + */ + Plan7SWConfig(hmm, 0.5, 0.5); + + /* Do model-dependent "weighting" strategies. + */ + if (w_strategy == WGT_ME) + { + printf("\n%-40s ...\n", "Maximum entropy weighting, iterative"); + maximum_entropy(hmm, dsq, msa, eff_nseq, pri, tr); + printf("----------------------------------------------\n\n"); + } + + /* Give the model a name. + * We deal with this differently depending on whether + * we're in an alignment database or a single alignment. + * + * If a single alignment, priority is: + * 1. Use -n if set. + * 2. Use msa->name (avail in Stockholm or SELEX formats only) + * 3. If all else fails, use alignment file name without + * filename extension (e.g. "globins.slx" gets named "globins" + * + * If a multiple MSA database (e.g. Stockholm/Pfam), + * only msa->name is applied. -n is not allowed. + * if msa->name is unavailable, or -n was used, + * a fatal error is thrown. + * + * Because we can't tell whether we've got more than one + * alignment 'til we're on the second one, these fatal errors + * only happen after the first HMM has already been built. + * Oh well. + */ + printf("%-40s ... ", "Setting model name, etc."); + fflush(stdout); + if (nali == 0) /* first (only?) 
HMM in file: */ + { + if (setname != NULL) name = Strdup(setname); + else if (msa->name != NULL) name = Strdup(msa->name); + else name = FileTail(seqfile, TRUE); + } + else + { + if (setname != NULL) + Die("Oops. Wait. You can't use -n with an alignment database."); + else if (msa->name != NULL) name = Strdup(msa->name); + else + Die("Oops. Wait. I need name annotation on each alignment.\n"); + } + Plan7SetName(hmm, name); + free(name); + + /* Transfer other information from the alignment to + * the HMM. This typically only works for SELEX format + * alignments, so these things are conditional/optional. + */ + if (msa->acc != NULL) Plan7SetAccession(hmm, msa->acc); + if (msa->desc != NULL) Plan7SetDescription(hmm, msa->desc); + + if (msa->flags & MSA_SET_GA) + { hmm->flags |= PLAN7_GA; hmm->ga1 = msa->ga1; hmm->ga2 = msa->ga2; } + if (msa->flags & MSA_SET_TC) + { hmm->flags |= PLAN7_TC; hmm->tc1 = msa->tc1; hmm->tc2 = msa->tc2; } + if (msa->flags & MSA_SET_NC) + { hmm->flags |= PLAN7_NC; hmm->nc1 = msa->nc1; hmm->nc2 = msa->nc2; } + + /* Record some other miscellaneous information in the HMM, + * like how/when we built it. + */ + Plan7ComlogAppend(hmm, argc, argv); + Plan7SetCtime(hmm); + hmm->nseq = msa->nseq; + printf("done. [%s]\n", hmm->name); + + /* Print information for the user + */ + printf("\nConstructed a profile HMM (length %d)\n", hmm->M); + PrintPlan7Stats(stdout, hmm, dsq, msa->nseq, tr); + printf("\n"); + + /* Configure the model for chosen algorithm + */ + printf("%-40s ... ", "Finalizing model configuration"); + fflush(stdout); + switch (cfg_strategy) { + case P7_BASE_CONFIG: Plan7GlobalConfig(hmm); break; + case P7_SW_CONFIG: Plan7SWConfig(hmm, swentry, swexit); break; + case P7_LS_CONFIG: Plan7LSConfig(hmm); break; + case P7_FS_CONFIG: Plan7FSConfig(hmm, swentry, swexit); break; + default: Die("bogus configuration choice"); + } + printf("done.\n"); + + /* Save new HMM to disk: open a file for appending or writing. + */ + printf("%-40s ... 
", "Saving model to file"); + fflush(stdout); + if (do_binary) WriteBinHMM(hmmfp, hmm); + else WriteAscHMM(hmmfp, hmm); + printf("done.\n"); + + /* the annotated alignment may be resaved */ + if (alignfp != NULL) + { + MSA *new_msa; + SQINFO *sqinfo; + + printf("%-40s ... ", "Saving annotated alignment"); + fflush(stdout); + sqinfo = MSAToSqinfo(msa); + new_msa = P7Traces2Alignment(dsq, sqinfo, msa->wgt, msa->nseq, + hmm->M, tr, FALSE); + + WriteStockholm(alignfp, new_msa); + MSAFree(new_msa); + for (idx = 0; idx < msa->nseq; idx++) + FreeSequence(NULL, &(sqinfo[idx])); + free(sqinfo); + printf("done.\n"); + } + + /* Verbose output; show scores for each sequence + */ + if (verbose) + print_all_scores(stdout, hmm, dsq, msa, tr); + + /* Clean up before moving on to next alignment + */ + for (idx = 0; idx < msa->nseq; idx++) P7FreeTrace(tr[idx]); + free(tr); + FreePlan7(hmm); + MSAFree(msa); + Free2DArray((void **) dsq, msa->nseq); + fflush(hmmfp); + if (cfp != NULL) fflush(cfp); + if (alignfp != NULL) fflush(alignfp); + + puts("//\n"); + nali++; + } + + + + /* Clean up and exit + */ + MSAFileClose(afp); + fclose(hmmfp); + if (cfp != NULL) fclose(cfp); + if (alignfp != NULL) fclose(alignfp); + P7FreePrior(pri); + SqdClean(); + return 0; +} + + +/* Function: print_all_scores() + * + * Purpose: For each training sequence, print its score under + * the final model. + * + * Args: fp - where to print the output (usu. stdout) + * hmm - newly constructed HMM, with prob's. + * dsq - digitized unaligned training sequences. 
+ * msa - alignment and associated info + * tr - array of tracebacks + * + * Return: (void) + */ +static void +print_all_scores(FILE *fp, struct plan7_s *hmm, + char **dsq, MSA *msa, struct p7trace_s **tr) +{ + int idx; /* counter for sequences */ + + /* make sure model scores are ready */ + P7Logoddsify(hmm, TRUE); + /* header */ + fputs("**\n", fp); + fputs("Individual training sequence scores:\n", fp); + /* score for each sequence */ + for (idx = 0; idx < msa->nseq; idx++) + { + fprintf(fp, "%7.2f %-12s %s\n", + P7TraceScore(hmm, dsq[idx], tr[idx]), + msa->sqname[idx], + (MSAGetSeqDescription(msa,idx) != NULL) ? + MSAGetSeqDescription(msa,idx) : ""); + P7PrintTrace(fp, tr[idx], hmm, dsq[idx]); + } + fputs("\n", fp); +} + + + +/* Function: save_countvectors() + * + * Purpose: Save emission/transition count vectors to a file. + * Used for gathering the data on which to train a + * prior (e.g. mixture Dirichlet, etc.) + * + * The format of the file is one vector per line: + * M ...: 20 match emission counts in order AC..WY. + * followed by two chars of CS, CA annotation. + * I ...: 20 insert emission counts in order AC..WY. + * followed by two chars of CS, CA annotation. + * T ...: 7 transition counts in order TMM, TMI, TMD, + * TIM, TII, TDM, TDD. (see structs.h) + * followed by four chars of structure + * annotation: CS, CS of M+1; CA, CA of M+1. 
+ * + * Args: cfp - open counts file + * name - name of alignment or HMM to associate with these vectors + * hmm - counts-based HMM + */ +static void +save_countvectors(FILE *cfp, char *name, struct plan7_s *hmm) +{ + int k, x; + /* match emission vectors */ + for (k = 1; k <= hmm->M; k++) + { + fputs("M ", cfp); + for (x = 0; x < Alphabet_size; x++) + fprintf(cfp, "%8.2f ", hmm->mat[k][x]); + + fprintf(cfp, "%15s %6d %6d ", name, hmm->map[k], k); + if ((hmm->flags & PLAN7_CS) && hmm->flags & PLAN7_CA) + fprintf(cfp, "%c %c", hmm->cs[k], hmm->ca[k]); + else + fputs("- -", cfp); + fputs("\n", cfp); + } + /* insert emission vectors */ + for (k = 1; k < hmm->M; k++) + { + fputs("I ", cfp); + for (x = 0; x < Alphabet_size; x++) + fprintf(cfp, "%8.2f ", hmm->ins[k][x]); + + fprintf(cfp, "%15s %6d %6d ", name, hmm->map[k], k); + if ((hmm->flags & PLAN7_CS) && hmm->flags & PLAN7_CA) + fprintf(cfp, "%c %c", hmm->cs[k], hmm->ca[k]); + else + fputs("- -", cfp); + + fputs("\n", cfp); + } + /* transition vectors */ + for (k = 1; k < hmm->M; k++) + { + fputs("T ", cfp); + + for (x = 0; x < 7; x++) + fprintf(cfp, "%8.2f ", hmm->t[k][x]); + + fprintf(cfp, "%15s %6d %6d ", name, hmm->map[k], k); + if ((hmm->flags & PLAN7_CS) && hmm->flags & PLAN7_CA) + fprintf(cfp, "%c %c %c %c", + hmm->cs[k], hmm->cs[k+1], + hmm->ca[k], hmm->ca[k+1]); + else + fputs("- -", cfp); + fputs("\n", cfp); + } +} + + +/* Function: position_average_score() + * Date: Wed Dec 31 09:36:35 1997 [StL] + * + * Purpose: Calculate scores from tracebacks, keeping them + * in a position specific array. The final array + * is normalized position-specifically too, according + * to how many sequences contributed data to this + * position. Used for compensating for sequence + * fragments in ME and MD score optimization. + * Very much ad hoc. + * + * Code related to (derived from) TraceScore(). 
+ * + * Args: hmm - HMM structure, scores valid + * dsq - digitized unaligned sequences + * wgt - weights on the sequences + * nseq - number of sequences + * tr - array of nseq tracebacks that aligns each dsq to hmm + * pernode - RETURN: [0]1..M array of position-specific avg scores + * ret_avg - RETURN: overall average full-length, one-domain score + * + * Return: 1 on success, 0 on failure. + * pernode is malloc'ed [0]1..M by CALLER and filled here. + */ +static void +position_average_score(struct plan7_s *hmm, + char **dsq, + float *wgt, + int nseq, + struct p7trace_s **tr, + float *pernode, + float *ret_avg) +{ + int pos; /* position in seq */ + int sym; + int tpos; /* position in trace/state sequence */ + float *counts; /* counts at each position */ + float avg; /* RETURN: average overall */ + int k; /* counter for model position */ + int idx; /* counter for sequence number */ + + /* Allocations + */ + counts = MallocOrDie ((hmm->M+1) * sizeof(float)); + FSet(pernode, hmm->M+1, 0.); + FSet(counts, hmm->M+1, 0.); + + /* Loop over traces, accumulate weighted scores per position + */ + for (idx = 0; idx < nseq; idx++) + for (tpos = 0; tpos < tr[idx]->tlen; tpos++) + { + pos = tr[idx]->pos[tpos]; + sym = (int) dsq[idx][tr[idx]->pos[tpos]]; + k = tr[idx]->nodeidx[tpos]; + + /* Counts: how many times did we use this model position 1..M? + * (weighted) + */ + if (tr[idx]->statetype[tpos] == STM || tr[idx]->statetype[tpos] == STD) + counts[k] += wgt[idx]; + + /* Emission scores. + */ + if (tr[idx]->statetype[tpos] == STM) + pernode[k] += wgt[idx] * Scorify(hmm->msc[sym][k]); + else if (tr[idx]->statetype[tpos] == STI) + pernode[k] += wgt[idx] * Scorify(hmm->isc[sym][k]); + + /* Transition scores. 
+ */ + if (tr[idx]->statetype[tpos] == STM || + tr[idx]->statetype[tpos] == STD || + tr[idx]->statetype[tpos] == STI) + pernode[k] += wgt[idx] * + Scorify(TransitionScoreLookup(hmm, tr[idx]->statetype[tpos], tr[idx]->nodeidx[tpos], + tr[idx]->statetype[tpos+1],tr[idx]->nodeidx[tpos+1])); + } + + /* Divide accumulated scores by accumulated weighted counts + */ + avg = 0.; + for (k = 1; k <= hmm->M; k++) + { + pernode[k] /= counts[k]; + avg += pernode[k]; + } + + free(counts); + *ret_avg = avg; + return; +} + + +/* Function: frag_trace_score() + * Date: SRE, Wed Dec 31 10:03:47 1997 [StL] + * + * Purpose: Allow MD/ME optimization to be used for alignments + * that include fragments and multihits -- estimate a full-length + * per-domain score. + * + * + * + * Return: "corrected" score. + */ +static float +frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr, + float *pernode, float expected) +{ + float sc; /* corrected score */ + float fragexp; /* expected score for a trace like this */ + int tpos; /* position in trace */ + + /* get uncorrected score */ + sc = P7TraceScore(hmm, dsq, tr); + + /* calc expected score for trace like this */ + fragexp = 0.; + for (tpos = 0; tpos < tr->tlen; tpos++) + if (tr->statetype[tpos] == STM || tr->statetype[tpos] == STD) + fragexp += pernode[tr->nodeidx[tpos]]; + + /* correct for multihits */ + fragexp /= (float) TraceDomainNumber(tr); + + /* extrapolate to full-length, one-hit score */ + sc = sc * expected / fragexp; + return sc; +} + + +/* Function: maximum_entropy() + * Date: SRE, Fri Jan 2 10:56:00 1998 [StL] + * + * Purpose: Optimizes a model according to maximum entropy weighting. + * See Krogh and Mitchison (1995). + * + * [Actually, we do minimum relative entropy, rather than + * maximum entropy. Same thing, though we refer to "ME" + * weights and models. The optimization is a steepest + * descents minimization of the relative entropy.] 
+ * + * Expects to be called shortly after a Maxmodelmaker() + * or Handmodelmaker(), so that both a new model architecture + * (with MAP parameters) and fake tracebacks are available. + * + * Prints a summary of optimization progress to stdout. + * + * Args: hmm - model. allocated, set with initial MAP parameters. + * dsq - dealigned digitized seqs the model is based on + * ainfo - extra info for aseqs + * nseq - number of aseqs + * eff_nseq- effective sequence number; weights normalize up to this. + * prior - prior distributions for parameterizing model + * tr - array of fake traces for each sequence + * + * Return: (void) + * hmm changed to an ME HMM + * ainfo changed, contains ME weights + */ +static void +maximum_entropy(struct plan7_s *hmm, char **dsq, MSA *msa, + float eff_nseq, struct p7prior_s *prior, struct p7trace_s **tr) +{ + float *wgt; /* current best set of ME weights */ + float *new_wgt; /* new set of ME weights to try */ + float *sc; /* log-odds score of each sequence */ + float *grad; /* gradient */ + float epsilon; /* steepness of descent */ + float relative_entropy; /* current best relative entropy */ + float new_entropy; /* relative entropy at new weights */ + float last_new_entropy; /* last new_entropy we calc'ed */ + float use_epsilon; /* current epsilon value in use */ + int idx; /* counter over sequences */ + int i1, i2; /* counters for iterations */ + + float converge_criterion; + float minw, maxw; /* min, max weight */ + int posw, highw; /* number of positive weights */ + float mins, maxs, avgs; /* min, max, avg score */ + float *pernode; /* expected score per node of HMM */ + float expscore; /* expected score of complete HMM */ + int max_iter; /* bulletproof against infinite loop bugs */ + + epsilon = 0.2; /* works fine */ + max_iter = 666; + + /* Allocations + */ + sc = MallocOrDie (sizeof(float) * msa->nseq); + wgt = MallocOrDie (sizeof(float) * msa->nseq); + new_wgt = MallocOrDie (sizeof(float) * msa->nseq); + grad = MallocOrDie 
(sizeof(float) * msa->nseq); + pernode = MallocOrDie (sizeof(float) * (hmm->M+1)); + + /* Initialization. Start with all weights == 1.0. + * Find relative entropy and gradient. + */ + Plan7SWConfig(hmm, 0.5, 0.5); + P7Logoddsify(hmm, TRUE); + + FSet(wgt, msa->nseq, 1.0); + position_average_score(hmm, dsq, wgt, msa->nseq, tr, pernode,&expscore); + for (idx = 0; idx < msa->nseq; idx++) + sc[idx] = frag_trace_score(hmm, dsq[idx], tr[idx], pernode, expscore); + relative_entropy = FSum(sc, msa->nseq) / (float) msa->nseq; + for (idx = 0; idx < msa->nseq; idx++) + grad[idx] = relative_entropy - sc[idx]; + + + printf("iter avg-sc min-sc max-sc min-wgt max-wgt +wgt ++wgt rel.ent convergence\n"); + printf("---- ------ ------ ------ ------- ------- ---- ----- ------- -----------\n"); + mins = maxs = avgs = sc[0]; + for (idx = 1; idx < msa->nseq; idx++) + { + if (sc[idx] < mins) mins = sc[idx]; + if (sc[idx] > maxs) maxs = sc[idx]; + avgs += sc[idx]; + } + avgs /= (float) msa->nseq; + printf("%4d %6.1f %6.1f %6.1f %7.2f %7.2f %4d %5d %7.2f %8s\n", + 0, avgs, mins, maxs, 1.0, 1.0, msa->nseq, 0, relative_entropy, "-"); + + + /* Steepest descents optimization; + * iterate until relative entropy converges. + */ + i1 = 0; + while (++i1 < max_iter) + { + /* Gradient gives us a line of steepest descents. + * (Roughly speaking, anyway. We actually have a constraint + * that weights are nonnegative and normalized, and the + * gradient doesn't take these into account.) + * Look along this line, a distance of epsilon * gradient: + * if new point is better, accept; if new point is worse, + * move back along the line by half the distance and re-evaluate. 
+ */ + use_epsilon = epsilon; + new_entropy = relative_entropy + 1.0; /* just ensure new > old */ + + i2 = 0; + while (new_entropy > relative_entropy && ++i2 < max_iter) + { + last_new_entropy = new_entropy; + + /* find a new point in weight space */ + for (idx = 0; idx < msa->nseq; idx++) + { + new_wgt[idx] = wgt[idx] + use_epsilon * grad[idx]; + if (new_wgt[idx] < 0.) new_wgt[idx] = 0.0; + } + FNorm(new_wgt, msa->nseq); + FScale(new_wgt, msa->nseq, (float) msa->nseq); + + /* Make new HMM using these weights */ + ZeroPlan7(hmm); + for (idx = 0; idx < msa->nseq; idx++) + P7TraceCount(hmm, dsq[idx], new_wgt[idx], tr[idx]); + P7PriorifyHMM(hmm, prior); + + + /* Evaluate new point */ + Plan7SWConfig(hmm, 0.5, 0.5); + P7Logoddsify(hmm, TRUE); + position_average_score(hmm, dsq, new_wgt, msa->nseq, tr, pernode, &expscore); + for (idx = 0; idx < msa->nseq; idx++) + sc[idx] = frag_trace_score(hmm, dsq[idx], tr[idx], pernode, expscore); + new_entropy = FDot(sc, new_wgt, msa->nseq) / (float) msa->nseq; + + use_epsilon /= 2.0; + /* Failsafe: we're not converging. Set epsilon to zero, + * do one more round. + */ + if (use_epsilon < 1e-6) use_epsilon = 0.0; + if (use_epsilon == 0.0) break; + + /* Failsafe: avoid infinite loops. Sometimes the + new entropy converges without ever being better + than the previous point, probably as a result + of minor roundoff error. */ + if (last_new_entropy == new_entropy) break; + } + if (i2 == max_iter) printf(" -- exceeded maximum iterations; giving up --\n"); + + /* Evaluate convergence before accepting the new weights; + * then, accept the new point and evaluate the gradient there. 
+ */ + converge_criterion = fabs((relative_entropy-new_entropy)/relative_entropy); + relative_entropy = new_entropy; + FCopy(wgt, new_wgt, msa->nseq); + for (idx = 0; idx < msa->nseq; idx++) + grad[idx] = relative_entropy - sc[idx]; + + /* Print some statistics about this iteration + */ + mins = maxs = avgs = sc[0]; + minw = maxw = wgt[0]; + posw = (wgt[0] > 0.0) ? 1 : 0; + highw = (wgt[0] > 1.0) ? 1 : 0; + for (idx = 1; idx < msa->nseq; idx++) + { + if (sc[idx] < mins) mins = sc[idx]; + if (sc[idx] > maxs) maxs = sc[idx]; + if (wgt[idx] < minw) minw = wgt[idx]; + if (wgt[idx] > maxw) maxw = wgt[idx]; + if (wgt[idx] > 0.0) posw++; + if (wgt[idx] > 1.0) highw++; + avgs += sc[idx]; + } + avgs /= (float) msa->nseq; + printf("%4d %6.1f %6.1f %6.1f %7.2f %7.2f %4d %5d %7.2f %8.5f\n", + i1, + avgs, mins, maxs, + minw, maxw, posw, highw, + relative_entropy, converge_criterion); + + if (converge_criterion < 1e-5) break; + } + if (i1 == max_iter) printf(" -- exceeded maximum iterations; giving up --\n"); + + /* Renormalize weights to sum to eff_nseq, and save. 
+ */ + FNorm(wgt, msa->nseq); + FScale(wgt, msa->nseq, (float) eff_nseq); + FCopy(msa->wgt, wgt, msa->nseq); + /* Make final HMM using these adjusted weights */ + ZeroPlan7(hmm); + for (idx = 0; idx < msa->nseq; idx++) + P7TraceCount(hmm, dsq[idx], wgt[idx], tr[idx]); + P7PriorifyHMM(hmm, prior); + + /* Cleanup and return + */ + free(pernode); + free(new_wgt); + free(grad); + free(wgt); + free(sc); + return; +} diff --git a/forester/archive/RIO/others/hmmer/src/hmmcalibrate-pvm.c b/forester/archive/RIO/others/hmmer/src/hmmcalibrate-pvm.c new file mode 100644 index 0000000..52824c6 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/hmmcalibrate-pvm.c @@ -0,0 +1,209 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +#ifdef HMMER_PVM + +/* hmmcalibrate-pvm.c + * SRE, Tue Aug 18 15:19:28 1998 + * Redesigned for better parallelization: SRE, Wed Dec 1 09:48:58 1999 + * + * Design: + * Initialization: + * receive parameters of random sequence synthesis, and an HMM. + * send an OK signal to the master. + * + * Main loop: + * receive work packet: # of seqs to make + * Synthesize and score # seqs + * send results: # raw scores. + * + * Termination: + * master sends a shutdown signal instead of a work packet. + * + * PVM slave for hmmcalibrate. 
+ * RCS $Id: hmmcalibrate-pvm.c,v 1.1.1.1 2005/03/22 08:34:12 cmzmasek Exp $ + */ + +#include +#include +#include +#include + +#include "version.h" +#include "structs.h" /* data structures, macros, #define's */ +#include "config.h" /* compile-time configuration constants */ +#include "funcs.h" /* function declarations */ +#include "globals.h" /* alphabet global variables */ +#include "squid.h" /* general sequence analysis library */ +#include "stopwatch.h" /* CPU timing routines */ + +static void leave_pvm(void); + +int +main(void) +{ + int master_tid; /* PVM TID of our master */ + int slaveidx; /* my slave index (0..nslaves-1) */ + struct plan7_s *hmm; /* HMM to calibrate, sent from master */ + char *seq; /* synthetic random sequence */ + char *dsq; /* digitized seq */ + int len; /* length of seq */ + float *sc; /* scores of seqs */ + int seed; /* random number seed */ + int nsample; /* number of seqs to sample */ + int fixedlen; /* if nonzero, fixed length of seq */ + float lenmean; /* Gaussian mean length of seq */ + float lensd; /* Gaussian length std. dev. for seq */ + float randomseq[MAXABET]; /* iid frequencies of residues */ + float p1; + int alphatype; /* alphabet type, hmmAMINO or hmmNUCLEIC */ + int idx; + int code; + Stopwatch_t stopwatch; /* CPU timings */ + + /* Register leave_pvm() cleanup function so any exit() call + * first calls pvm_exit(). + */ + if (atexit(leave_pvm) != 0) { + pvm_exit(); Die("slave couldn't register leave_pvm()"); + } + + /***************************************************************** + * initialization. + * Master broadcasts the problem to us: + * an HMM; + * parameters of the HMM calibration. + * We send back: + * an OK flag, and our RELEASE, for some sanity checking. + ******************************************************************/ + + StopwatchStart(&stopwatch); + + master_tid = pvm_parent(); /* who's our master? 
*/ + + pvm_recv(master_tid, HMMPVM_INIT); + pvm_upkfloat(&lenmean, 1, 1); /* mean length of random seqs */ + pvm_upkfloat(&lensd, 1, 1); /* std. dev. of random seq len */ + pvm_upkint(&fixedlen, 1, 1); /* if non-zero, override lenmean */ + pvm_upkint(&alphatype, 1, 1); /* alphabet type, hmmAMINO or hmmNUCLEIC */ + pvm_upkint(&seed, 1, 1); /* random number seed */ + SetAlphabet(alphatype); /* must set alphabet before reading HMM! */ + hmm = PVMUnpackHMM(); + if (hmm == NULL) Die("oh no, the HMM never arrived"); + + P7DefaultNullModel(randomseq, &p1); + P7Logoddsify(hmm, TRUE); + + /* tell the master we're OK and ready to go (or not) + */ + code = HMMPVM_OK; + pvm_initsend(PvmDataDefault); + pvm_pkint(&code, 1, 1); + PVMPackString(RELEASE); + pvm_send(master_tid, HMMPVM_RESULTS); + + /***************************************************************** + * Main loop. + * Receive: a number of sequences we're supposed to do. + * If we receive a 0, we have no work, so wait for shutdown; + * if we receive a -1, shut down. 
+ *****************************************************************/ + slaveidx = -1; + for (;;) + { + pvm_recv(master_tid, HMMPVM_WORK); + pvm_upkint(&nsample, 1, 1); + pvm_upkint(&idx, 1, 1); + + if (nsample == 0) continue; /* go into stasis */ + if (nsample == -1) break; /* shut down */ + + if (slaveidx == -1) { /* first time: set id, seed sre_random */ + slaveidx = idx; + sre_srandom(seed+idx); /* unique seed in current PVM */ + } + + sc = MallocOrDie(sizeof(float) * nsample); + for (idx = 0; idx < nsample; idx++) + { + /* choose length of random sequence */ + if (fixedlen) len = fixedlen; + else do len = (int) Gaussrandom(lenmean, lensd); while (len < 1); + /* generate it */ + seq = RandomSequence(Alphabet, randomseq, Alphabet_size, len); + dsq = DigitizeSequence(seq, len); + SQD_DPRINTF2(("slave %d seq: %d : %20.20s...\n", slaveidx, len, seq)); + + if (P7ViterbiSize(len, hmm->M) <= RAMLIMIT) + sc[idx] = P7Viterbi(dsq, len, hmm, NULL); + else + sc[idx] = P7SmallViterbi(dsq, len, hmm, NULL); + + free(seq); + free(dsq); + } + + /* Return output to master, some of which is sanity checking. + * 1. our slave index. + * 2. how many seqs we simulated. + * 3. the array of scores we got, so the master can stuff + * them into a histogram. + */ + pvm_initsend(PvmDataDefault); + pvm_pkint(&slaveidx, 1, 1); + pvm_pkint(&nsample, 1, 1); + pvm_pkfloat(sc, nsample,1); + pvm_send(master_tid, HMMPVM_RESULTS); + + /* cleanup + */ + free(sc); + } + + /*********************************************** + * Cleanup, return. + ***********************************************/ + + FreePlan7(hmm); + StopwatchStop(&stopwatch); + + /* tell the master we heard his shutdown signal, and + * give him our CPU times; then exit. + */ + pvm_initsend(PvmDataDefault); + pvm_pkint(&slaveidx, 1, 1); + StopwatchPVMPack(&stopwatch); + pvm_send(master_tid, HMMPVM_RESULTS); + + return 0; /* pvm_exit() is called by atexit() registration. 
*/ +} + +/* Function: leave_pvm() + * + * Purpose: Cleanup function, to deal with crashes. We register + * this function using atexit() so it gets called before + * the slave dies. + */ +void leave_pvm(void) +{ + SQD_DPRINTF1(("slave leaving PVM.\n")); + pvm_exit(); +} + +#else /* if HMMER_PVM not defined: include a dummy */ + +#include +int main(void) +{ + printf("hmmcalibrate-pvm disabled. PVM support was not compiled into HMMER.\n"); + exit(0); +} + +#endif diff --git a/forester/archive/RIO/others/hmmer/src/hmmcalibrate.c b/forester/archive/RIO/others/hmmer/src/hmmcalibrate.c new file mode 100644 index 0000000..b003f8b --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/hmmcalibrate.c @@ -0,0 +1,957 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* hmmcalibrate.c + * SRE, Fri Oct 31 09:25:21 1997 [St. Louis] + * + * Score an HMM against random sequence data sets; + * set histogram fitting parameters. 
+ * + * CVS $Id: hmmcalibrate.c,v 1.1.1.1 2005/03/22 08:34:03 cmzmasek Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HMMER_THREADS +#include +#endif +#ifdef HMMER_PVM +#include +#endif + +#include "squid.h" /* general sequence analysis library */ +#include "config.h" /* compile-time configuration constants */ +#include "structs.h" /* data structures, macros, #define's */ +#include "funcs.h" /* function declarations */ +#include "globals.h" /* alphabet global variables */ +#include "version.h" /* release version info */ +#include "stopwatch.h" /* process timings */ + +static char banner[] = "hmmcalibrate -- calibrate HMM search statistics"; + +static char usage[] = "\ +Usage: hmmcalibrate [-options] \n\ +Available options are:\n\ + -h : print short usage and version info, then exit\n\ +"; + +static char experts[] = "\ + --cpu : run threads in parallel (if threaded)\n\ + --fixed : fix random sequence length at \n\ + --histfile : save histogram(s) to file \n\ + --mean : set random seq length mean at [350]\n\ + --num : set number of sampled seqs to [5000]\n\ + --pvm : run on a Parallel Virtual Machine (PVM)\n\ + --sd : set random seq length std. dev to [350]\n\ + --seed : set random seed to [time()]\n\ +"; + +static struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, + { "--cpu", FALSE, sqdARG_INT }, + { "--fixed", FALSE, sqdARG_INT }, + { "--histfile", FALSE, sqdARG_STRING }, + { "--mean", FALSE, sqdARG_FLOAT }, + { "--num", FALSE, sqdARG_INT }, + { "--pvm", FALSE, sqdARG_NONE }, + { "--sd", FALSE, sqdARG_FLOAT }, + { "--seed", FALSE, sqdARG_INT}, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + + +static void main_loop_serial(struct plan7_s *hmm, int seed, int nsample, + float lenmean, float lensd, int fixedlen, + struct histogram_s **ret_hist, float *ret_max); + +#ifdef HMMER_THREADS +/* A structure of this type is shared by worker threads in the POSIX + * threads parallel version. 
+ */ +struct workpool_s { + /* Static configuration: + */ + struct plan7_s *hmm; /* ptr to single HMM to search with */ + int fixedlen; /* if >0, fix random seq len to this */ + float lenmean; /* mean of Gaussian for random seq len */ + float lensd; /* s.d. of Gaussian for random seq len */ + float *randomseq; /* 0..Alphabet_size-1 i.i.d. probs */ + int nsample; /* number of random seqs to do */ + + /* Shared (mutex-protected) input: + */ + int nseq; /* current number of seqs searched */ + + /* Shared (mutex-protected) output: + */ + struct histogram_s *hist; /* histogram */ + float max_score; /* maximum score seen */ + Stopwatch_t watch; /* Timings accumulated for threads */ + + /* Thread pool information: + */ + pthread_t *thread; /* our pool of threads */ + int num_threads; /* number of threads */ + pthread_mutex_t input_lock; /* a mutex protecting input fields */ + pthread_mutex_t output_lock; /* a mutex protecting output fields */ +}; +static void main_loop_threaded(struct plan7_s *hmm, int seed, int nsample, + float lenmean, float lensd, int fixedlen, + int nthreads, + struct histogram_s **ret_hist, float *ret_max, + Stopwatch_t *twatch); +static struct workpool_s *workpool_start(struct plan7_s *hmm, + float lenmean, float lensd, int fixedlen, + float *randomseq, int nsample, + struct histogram_s *hist, + int num_threads); +static void workpool_stop(struct workpool_s *wpool); +static void workpool_free(struct workpool_s *wpool); +static void *worker_thread(void *ptr); +#endif /* HMMER_THREADS */ + +#ifdef HMMER_PVM +static void main_loop_pvm(struct plan7_s *hmm, int seed, int nsample, + int lumpsize, + float lenmean, float lensd, int fixedlen, + struct histogram_s **ret_hist, float *ret_max, + Stopwatch_t *extrawatch, int *ret_nslaves); +#endif /* HMMER_PVM */ + + +int +main(int argc, char **argv) +{ + char *hmmfile; /* HMM file to open */ + char *tmpfile; /* temporary calibrated HMM file */ + HMMFILE *hmmfp; /* opened hmm file pointer */ + FILE *outfp; /* 
for writing HMM(s) into tmpfile */ + char *mode; /* write mode, "w" or "wb" */ + struct plan7_s *hmm; /* the hidden Markov model */ + int idx; /* counter over sequences */ + sigset_t blocksigs; /* list of signals to protect from */ + int nhmm; /* number of HMMs calibrated */ + + struct histogram_s *hist; /* a resulting histogram */ + float max; /* maximum score from an HMM */ + char *histfile; /* histogram save file */ + FILE *hfp; /* open file pointer for histfile */ + + Stopwatch_t stopwatch; /* main stopwatch for process */ + Stopwatch_t extrawatch; /* stopwatch for threads/PVM slaves */ + + float *mu; /* array of EVD mu's for HMMs */ + float *lambda; /* array of EVD lambda's for HMMs */ + int mu_lumpsize; /* allocation lumpsize for mu, lambda */ + + int nsample; /* number of random seqs to sample */ + int seed; /* random number seed */ + int fixedlen; /* fixed length, or 0 if unused */ + float lenmean; /* mean of length distribution */ + float lensd; /* std dev of length distribution */ + int do_pvm; /* TRUE to use PVM */ + int pvm_lumpsize; /* # of seqs to do per PVM slave exchange */ + int pvm_nslaves; /* number of slaves used in the PVM */ + + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + + int num_threads; /* number of worker threads */ + + + /*********************************************** + * Parse the command line + ***********************************************/ + StopwatchStart(&stopwatch); + StopwatchZero(&extrawatch); + + nsample = 5000; + fixedlen = 0; + lenmean = 325.; + lensd = 200.; + seed = (int) time ((time_t *) NULL); + histfile = NULL; + do_pvm = FALSE; + pvm_lumpsize = 20; /* 20 seqs/PVM exchange: sets granularity */ + mu_lumpsize = 100; +#ifdef HMMER_THREADS + num_threads = ThreadNumber(); /* only matters if we're threaded */ +#else + num_threads = 0; +#endif + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) 
+ { + if (strcmp(optname, "--cpu") == 0) num_threads = atoi(optarg); + else if (strcmp(optname, "--fixed") == 0) fixedlen = atoi(optarg); + else if (strcmp(optname, "--histfile") == 0) histfile = optarg; + else if (strcmp(optname, "--mean") == 0) lenmean = atof(optarg); + else if (strcmp(optname, "--num") == 0) nsample = atoi(optarg); + else if (strcmp(optname, "--pvm") == 0) do_pvm = TRUE; + else if (strcmp(optname, "--sd") == 0) lensd = atof(optarg); + else if (strcmp(optname, "--seed") == 0) seed = atoi(optarg); + else if (strcmp(optname, "-h") == 0) + { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(0); + } + } + + if (argc - optind != 1) Die("Incorrect number of arguments.\n%s\n", usage); + hmmfile = argv[optind++]; + +#ifndef HMMER_PVM + if (do_pvm) Die("PVM support is not compiled into HMMER; --pvm doesn't work."); +#endif +#ifndef HMMER_THREADS + if (num_threads) Die("Posix threads support is not compiled into HMMER; --cpu doesn't have any effect"); +#endif + + /*********************************************** + * Open our i/o file pointers, make sure all is well + ***********************************************/ + + /* HMM file */ + if ((hmmfp = HMMFileOpen(hmmfile, NULL)) == NULL) + Die("failed to open HMM file %s for reading.", hmmfile); + + /* histogram file */ + hfp = NULL; + if (histfile != NULL) { + if ((hfp = fopen(histfile, "w")) == NULL) + Die("Failed to open histogram save file %s for writing\n", histfile); + } + + /* Generate calibrated HMM(s) in a tmp file in the current + * directory. When we're finished, we delete the original + * HMM file and rename() this one. That way, the worst + * effect of a catastrophic failure should be that we + * leave a tmp file lying around, but the original HMM + * file remains uncorrupted. tmpnam() doesn't work portably here, + * because it'll put the file in /tmp and we won't + * necessarily be able to rename() it from there. 
+ */ + tmpfile = MallocOrDie(strlen(hmmfile) + 5); + strcpy(tmpfile, hmmfile); + strcat(tmpfile, ".xxx"); /* could be more inventive here... */ + if (FileExists(tmpfile)) + Die("temporary file %s already exists; please delete it first", tmpfile); + if (hmmfp->is_binary) mode = "wb"; + else mode = "w"; + + /*********************************************** + * Show the banner + ***********************************************/ + + Banner(stdout, banner); + printf("HMM file: %s\n", hmmfile); + if (fixedlen) + printf("Length fixed to: %d\n", fixedlen); + else { + printf("Length distribution mean: %.0f\n", lenmean); + printf("Length distribution s.d.: %.0f\n", lensd); + } + printf("Number of samples: %d\n", nsample); + printf("random seed: %d\n", seed); + printf("histogram(s) saved to: %s\n", + histfile != NULL ? histfile : "[not saved]"); + if (do_pvm) + printf("PVM: ACTIVE\n"); + else if (num_threads > 0) + printf("POSIX threads: %d\n", num_threads); + printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n"); + + /*********************************************** + * Read the HMMs one at a time, and send them off + * in probability form to one of the main loops. + * The main loop functions are responsible for + * synthesizing random sequences and returning + * a score histogram for each HMM. + ***********************************************/ + + nhmm = 0; + mu = MallocOrDie(sizeof(float) * mu_lumpsize); + lambda = MallocOrDie(sizeof(float) * mu_lumpsize); + + while (HMMFileRead(hmmfp, &hmm)) + { + if (hmm == NULL) + Die("HMM file may be corrupt or in incorrect format; parse failed"); + + if (! 
do_pvm && num_threads == 0) + main_loop_serial(hmm, seed, nsample, lenmean, lensd, fixedlen, + &hist, &max); +#ifdef HMMER_PVM + else if (do_pvm) { + pvm_nslaves = 0; /* solely to silence compiler warnings */ + main_loop_pvm(hmm, seed, nsample, pvm_lumpsize, + lenmean, lensd, fixedlen, + &hist, &max, &extrawatch, &pvm_nslaves); + } +#endif +#ifdef HMMER_THREADS + else if (num_threads > 0) + main_loop_threaded(hmm, seed, nsample, lenmean, lensd, fixedlen, + num_threads, &hist, &max, &extrawatch); +#endif + else + Die("wait. that can't happen. I didn't do anything."); + + + /* Fit an EVD to the observed histogram. + * The TRUE left-censors and fits only the right slope of the histogram. + * The 9999. is an arbitrary high number that means we won't trim + * outliers on the right. + */ + if (! ExtremeValueFitHistogram(hist, TRUE, 9999.)) + Die("fit failed; -n may be set too small?\n"); + + mu[nhmm] = hist->param[EVD_MU]; + lambda[nhmm] = hist->param[EVD_LAMBDA]; + nhmm++; + if (nhmm % 100 == 0) { + mu = ReallocOrDie(mu, sizeof(float) * (nhmm+mu_lumpsize)); + lambda = ReallocOrDie(lambda, sizeof(float) * (nhmm+mu_lumpsize)); + } + + /* Output + */ + printf("HMM : %s\n", hmm->name); + printf("mu : %12f\n", hist->param[EVD_MU]); + printf("lambda : %12f\n", hist->param[EVD_LAMBDA]); + printf("max : %12f\n", max); + printf("//\n"); + + if (hfp != NULL) + { + fprintf(hfp, "HMM: %s\n", hmm->name); + PrintASCIIHistogram(hfp, hist); + fprintf(hfp, "//\n"); + } + + FreeHistogram(hist); + } + SQD_DPRINTF1(("Main body believes it has calibrations for %d HMMs\n", nhmm)); + + /***************************************************************** + * Rewind the HMM file for a second pass. + * Write a temporary HMM file with new mu, lambda values in it + *****************************************************************/ + + HMMFileRewind(hmmfp); + if (FileExists(tmpfile)) + Die("Ouch. 
Temporary file %s appeared during the run.", tmpfile); + if ((outfp = fopen(tmpfile, mode)) == NULL) + Die("Ouch. Temporary file %s couldn't be opened for writing.", tmpfile); + + for (idx = 0; idx < nhmm; idx++) + { + /* Sanity checks + */ + if (!HMMFileRead(hmmfp, &hmm)) + Die("Ran out of HMMs too early in pass 2"); + if (hmm == NULL) + Die("HMM file %s was corrupted? Parse failed in pass 2", hmmfile); + + /* Put results in HMM + */ + hmm->mu = mu[idx]; + hmm->lambda = lambda[idx]; + hmm->flags |= PLAN7_STATS; + Plan7ComlogAppend(hmm, argc, argv); + + /* Save HMM to tmpfile + */ + if (hmmfp->is_binary) WriteBinHMM(outfp, hmm); + else WriteAscHMM(outfp, hmm); + + FreePlan7(hmm); + } + + /***************************************************************** + * Now, carefully remove original file and replace it + * with the tmpfile. Note the protection from signals; + * we wouldn't want a user to ctrl-C just as we've deleted + * their HMM file but before the new one is moved. + *****************************************************************/ + + HMMFileClose(hmmfp); + if (fclose(outfp) != 0) PANIC; + + if (sigemptyset(&blocksigs) != 0) PANIC; + if (sigaddset(&blocksigs, SIGINT) != 0) PANIC; + if (sigprocmask(SIG_BLOCK, &blocksigs, NULL) != 0) PANIC; + if (remove(hmmfile) != 0) PANIC; + if (rename(tmpfile, hmmfile) != 0) PANIC; + if (sigprocmask(SIG_UNBLOCK, &blocksigs, NULL) != 0) PANIC; + + /*********************************************** + * Exit + ***********************************************/ + + StopwatchStop(&stopwatch); + if (do_pvm > 0) { + printf("PVM processors used: %d\n", pvm_nslaves); + StopwatchInclude(&stopwatch, &extrawatch); + } +#ifdef PTHREAD_TIMES_HACK + else if (num_threads > 0) StopwatchInclude(&stopwatch, &extrawatch); +#endif + + /* StopwatchDisplay(stdout, "CPU Time: ", &stopwatch); */ + + free(mu); + free(lambda); + free(tmpfile); + if (hfp != NULL) fclose(hfp); + SqdClean(); + return 0; +} + +/* Function: main_loop_serial() + * Date: SRE, 
Tue Aug 18 16:18:28 1998 [St. Louis] + * + * Purpose: Given an HMM and parameters for synthesizing random + * sequences; return a histogram of scores. + * (Serial version) + * + * Args: hmm - an HMM to calibrate. + * seed - random number seed + * nsample - number of seqs to synthesize + * lenmean - mean length of random sequence + * lensd - std dev of random seq length + * fixedlen - if nonzero, override lenmean, always this len + * ret_hist - RETURN: the score histogram + * ret_max - RETURN: highest score seen in simulation + * + * Returns: (void) + * hist is alloc'ed here, and must be free'd by caller. + */ +static void +main_loop_serial(struct plan7_s *hmm, int seed, int nsample, + float lenmean, float lensd, int fixedlen, + struct histogram_s **ret_hist, float *ret_max) +{ + struct histogram_s *hist; + float randomseq[MAXABET]; + float p1; + float max; + char *seq; + char *dsq; + float score; + int sqlen; + int idx; + + /* Initialize. + * We assume we've already set the alphabet (safe, because + * HMM input sets the alphabet). + */ + sre_srandom(seed); + P7Logoddsify(hmm, TRUE); + P7DefaultNullModel(randomseq, &p1); + hist = AllocHistogram(-200, 200, 100); + max = -FLT_MAX; + + for (idx = 0; idx < nsample; idx++) + { + /* choose length of random sequence */ + if (fixedlen) sqlen = fixedlen; + else do sqlen = (int) Gaussrandom(lenmean, lensd); while (sqlen < 1); + /* generate it */ + seq = RandomSequence(Alphabet, randomseq, Alphabet_size, sqlen); + dsq = DigitizeSequence(seq, sqlen); + + if (P7ViterbiSize(sqlen, hmm->M) <= RAMLIMIT) + score = P7Viterbi(dsq, sqlen, hmm, NULL); + else + score = P7SmallViterbi(dsq, sqlen, hmm, NULL); + + AddToHistogram(hist, score); + if (score > max) max = score; + + free(dsq); + free(seq); + } + + *ret_hist = hist; + *ret_max = max; + return; +} + + +#ifdef HMMER_THREADS +/* Function: main_loop_threaded() + * Date: SRE, Wed Dec 1 12:43:09 1999 [St. 
Louis] + * + * Purpose: Given an HMM and parameters for synthesizing random + * sequences; return a histogram of scores. + * (Threaded version.) + * + * Args: hmm - an HMM to calibrate. + * seed - random number seed + * nsample - number of seqs to synthesize + * lenmean - mean length of random sequence + * lensd - std dev of random seq length + * fixedlen - if nonzero, override lenmean, always this len + * nthreads - number of threads to start + * ret_hist - RETURN: the score histogram + * ret_max - RETURN: highest score seen in simulation + * twatch - RETURN: accumulation of thread times + * + * Returns: (void) + * hist is alloc'ed here, and must be free'd by caller. + */ +static void +main_loop_threaded(struct plan7_s *hmm, int seed, int nsample, + float lenmean, float lensd, int fixedlen, + int nthreads, + struct histogram_s **ret_hist, float *ret_max, + Stopwatch_t *twatch) +{ + struct histogram_s *hist; + float randomseq[MAXABET]; + float p1; + struct workpool_s *wpool; /* pool of worker threads */ + + /* Initialize. + * We assume we've already set the alphabet (safe, because + * HMM input sets the alphabet). + */ + sre_srandom(seed); + P7Logoddsify(hmm, TRUE); + P7DefaultNullModel(randomseq, &p1); + hist = AllocHistogram(-200, 200, 100); + + wpool = workpool_start(hmm, lenmean, lensd, fixedlen, randomseq, nsample, + hist, nthreads); + workpool_stop(wpool); + + *ret_hist = hist; + *ret_max = wpool->max_score; + StopwatchInclude(twatch, &(wpool->watch)); + + workpool_free(wpool); + return; +} + +/***************************************************************** + * POSIX threads implementation. + * API: + * workpool_start() (makes a workpool_s structure. Starts calculations.) + * workpool_stop() (waits for threads to finish.) + * [process histogram] + * workpool_free() (destroys the structure) + * + * Threads: + * worker_thread() (the actual parallelized worker thread). 
+ *****************************************************************/ + +/* Function: workpool_start() + * Date: SRE, Thu Jul 16 11:09:05 1998 [St. Louis] + * + * Purpose: Initialize a workpool_s structure, and return it. + * + * Args: hmm - the HMM to calibrate + * fixedlen - 0, or a fixed length for seqs (bypass of Gaussian) + * lenmean - mean sequence length + * lensd - std. dev. for sequence length + * randomseq- i.i.d. frequencies for residues, 0..Alphabet_size-1 + * nsample - how many seqs to calibrate on + * hist - histogram structure for storing results + * num_threads - how many processors to run on + * + * Returns: ptr to struct workpool_s. + * Caller must wait for threads to finish with workpool_stop(), + * then free the structure with workpool_free(). + */ +static struct workpool_s * +workpool_start(struct plan7_s *hmm, float lenmean, float lensd, int fixedlen, + float *randomseq, int nsample, struct histogram_s *hist, + int num_threads) +{ + struct workpool_s *wpool; + pthread_attr_t attr; + int i; + int rtn; + + wpool = MallocOrDie(sizeof(struct workpool_s)); + wpool->thread = MallocOrDie(num_threads * sizeof(pthread_t)); + wpool->hmm = hmm; + wpool->fixedlen = fixedlen; + wpool->lenmean = lenmean; + wpool->lensd = lensd; + wpool->randomseq = randomseq; + wpool->nsample = nsample; + + wpool->nseq = 0; + wpool->hist = hist; + wpool->max_score = -FLT_MAX; + wpool->num_threads= num_threads; + + StopwatchZero(&(wpool->watch)); + + if ((rtn = pthread_mutex_init(&(wpool->input_lock), NULL)) != 0) + Die("pthread_mutex_init FAILED; %s\n", strerror(rtn)); + if ((rtn = pthread_mutex_init(&(wpool->output_lock), NULL)) != 0) + Die("pthread_mutex_init FAILED; %s\n", strerror(rtn)); + + /* Create slave threads. + * Note the crazy machinations we have to go through to achieve concurrency. + * You'd think that POSIX threads were portable... ha. 
+ * On IRIX 6.5, system scope threads are only available to root, or if + * /etc/capability has been configured specially, so to avoid strange + * permissions errors we can't set PTHREAD_SCOPE_SYSTEM for IRIX. + * On IRIX pre-6.5, we can't get good concurrency, period. As of 6.5, + * SGI provides the nonportable pthread_setconcurrency() call. + * On FreeBSD (3.0 snapshots), the pthread_attr_setscope() call isn't + * even provided, apparently on grounds of "if it doesn't do anything, + * why provide it?" Hello? POSIX compliance, perhaps? + * On Sun Solaris, we need to set system scope to achieve concurrency. + * Linux and DEC Digital UNIX seem to work fine in either process scope + * or system scope, without a pthread_setconcurrency call. + */ + pthread_attr_init(&attr); +#ifndef __sgi +#ifdef HAVE_PTHREAD_ATTR_SETSCOPE + pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); +#endif +#endif +#ifdef HAVE_PTHREAD_SETCONCURRENCY + pthread_setconcurrency(num_threads+1); +#endif + for (i = 0; i < num_threads; i++) + if ((rtn = pthread_create(&(wpool->thread[i]), &attr, + worker_thread , (void *) wpool)) != 0) + Die("Failed to create thread %d; return code %d\n", i, rtn); + + pthread_attr_destroy(&attr); + + return wpool; +} + +/* Function: workpool_stop() + * Date: SRE, Thu Jul 16 11:20:16 1998 [St. Louis] + * + * Purpose: Waits for threads in a workpool to finish. + * + * Args: wpool -- ptr to the workpool structure + * + * Returns: (void) + */ +static void +workpool_stop(struct workpool_s *wpool) +{ + int i; + /* wait for threads to stop */ + for (i = 0; i < wpool->num_threads; i++) + if (pthread_join(wpool->thread[i],NULL) != 0) + Die("pthread_join failed"); + return; +} + +/* Function: workpool_free() + * Date: SRE, Thu Jul 16 11:26:27 1998 [St. Louis] + * + * Purpose: Free a workpool_s structure, after the threads + * have finished. + * + * Args: wpool -- ptr to the workpool. 
+ * + * Returns: (void) + */ +static void +workpool_free(struct workpool_s *wpool) +{ + free(wpool->thread); /* the thread array */ + free(wpool); /* the struct itself; nothing else is released here */ + return; +} + +/* Function: worker_thread() + * Date: SRE, Thu Jul 16 10:41:02 1998 [St. Louis] + * + * Purpose: The procedure executed by the worker threads: each pass claims the next sequence number and synthesizes a random sequence under input_lock, scores it with no lock held, then records the score under output_lock. Once nseq exceeds nsample the thread adds its stopwatch to wpool->watch and exits. + * + * Args: ptr - (void *) that is recast to a pointer to + * the workpool. + * + * Returns: (void *) -- NULL; the thread ends in pthread_exit(NULL) + */ +void * +worker_thread(void *ptr) +{ + struct plan7_s *hmm; + struct workpool_s *wpool; + char *seq; + char *dsq; + int len; + float sc; + int rtn; + Stopwatch_t thread_watch; + + StopwatchStart(&thread_watch); + wpool = (struct workpool_s *) ptr; + hmm = wpool->hmm; + for (;;) + { + /* 1. Synthesize a random sequence. + * The input sequence number is a shared resource, + * and sre_random() isn't thread-safe, so protect + * the whole section with mutex. + */ + /* acquire a lock */ + if ((rtn = pthread_mutex_lock(&(wpool->input_lock))) != 0) + Die("pthread_mutex_lock failure: %s\n", strerror(rtn)); + /* generate a sequence */ + wpool->nseq++; + if (wpool->nseq > wpool->nsample) + { /* we're done; release input lock, break loop */ + if ((rtn = pthread_mutex_unlock(&(wpool->input_lock))) != 0) + Die("pthread_mutex_unlock failure: %s\n", strerror(rtn)); + break; + } + if (wpool->fixedlen) len = wpool->fixedlen; + else do len = (int) Gaussrandom(wpool->lenmean, wpool->lensd); while (len < 1); /* redraw until len is at least 1 */ + seq = RandomSequence(Alphabet, wpool->randomseq, Alphabet_size, len); + + /* release the lock */ + if ((rtn = pthread_mutex_unlock(&(wpool->input_lock))) != 0) + Die("pthread_mutex_unlock failure: %s\n", strerror(rtn)); + + /* 2. Score the sequence against the model; P7SmallViterbi() is used when P7ViterbiSize() exceeds RAMLIMIT. + */ + dsq = DigitizeSequence(seq, len); + + if (P7ViterbiSize(len, hmm->M) <= RAMLIMIT) + sc = P7Viterbi(dsq, len, hmm, NULL); + else + sc = P7SmallViterbi(dsq, len, hmm, NULL); + free(dsq); + free(seq); + + /* 3. Save the output; hist and max_score are shared, + * so protect this section with the output mutex. 
+ */ + /* acquire lock on the output queue */ + if ((rtn = pthread_mutex_lock(&(wpool->output_lock))) != 0) + Die("pthread_mutex_lock failure: %s\n", strerror(rtn)); + /* save output */ + AddToHistogram(wpool->hist, sc); + if (sc > wpool->max_score) wpool->max_score = sc; + /* release our lock */ + if ((rtn = pthread_mutex_unlock(&(wpool->output_lock))) != 0) + Die("pthread_mutex_unlock failure: %s\n", strerror(rtn)); + } + + StopwatchStop(&thread_watch); + /* acquire lock on the output queue */ + if ((rtn = pthread_mutex_lock(&(wpool->output_lock))) != 0) + Die("pthread_mutex_lock failure: %s\n", strerror(rtn)); + /* accumulate cpu time into main stopwatch (shared, hence the output lock) */ + StopwatchInclude(&(wpool->watch), &thread_watch); + /* release our lock */ + if ((rtn = pthread_mutex_unlock(&(wpool->output_lock))) != 0) + Die("pthread_mutex_unlock failure: %s\n", strerror(rtn)); + + pthread_exit(NULL); + return NULL; /* solely to silence compiler warnings */ +} +#endif /* HMMER_THREADS */ + + + +#ifdef HMMER_PVM +/* Function: main_loop_pvm() + * Date: SRE, Wed Aug 19 13:59:54 1998 [St. Louis] + * + * Purpose: Given an HMM and parameters for synthesizing random + * sequences; return a histogram of scores. + * (PVM version) + * + * Args: hmm - an HMM to calibrate. + * seed - random number seed + * nsample - number of seqs to synthesize + * lumpsize- # of seqs per slave exchange; controls granularity + * lenmean - mean length of random sequence + * lensd - std dev of random seq length + * fixedlen- if nonzero, override lenmean, always this len + * hist - RETURN: the score histogram + * ret_max - RETURN: highest score seen in simulation + * extrawatch - RETURN: total CPU time spend in slaves. + * ret_nslaves- RETURN: number of PVM slaves run. + * + * Returns: (void) + * hist is alloc'ed here, and must be free'd by caller. 
+ */ +static void +main_loop_pvm(struct plan7_s *hmm, int seed, int nsample, int lumpsize, + float lenmean, float lensd, int fixedlen, + struct histogram_s **ret_hist, float *ret_max, + Stopwatch_t *extrawatch, int *ret_nslaves) +{ + struct histogram_s *hist; + int master_tid; + int *slave_tid; + int nslaves; + int nsent; /* # of seqs we've asked for so far */ + int ndone; /* # of seqs we've got results for so far */ + int packet; /* # of seqs to have a slave do */ + float max; + int slaveidx; /* id of a slave */ + float *sc; /* scores returned by a slave */ + Stopwatch_t slavewatch; + int i; + + StopwatchZero(extrawatch); + hist = AllocHistogram(-200, 200, 100); + max = -FLT_MAX; /* lowest start value for the running max */ + + /* Initialize PVM + */ + if ((master_tid = pvm_mytid()) < 0) + Die("pvmd not responding -- do you have PVM running?"); +#if DEBUGLEVEL >= 1 + pvm_catchout(stderr); /* catch output for debugging */ +#endif + PVMSpawnSlaves("hmmcalibrate-pvm", &slave_tid, &nslaves); + + /* Initialize the slaves: pack lenmean, lensd, fixedlen, Alphabet_type, seed and the HMM, then pvm_mcast() them to the slaves + */ + pvm_initsend(PvmDataDefault); + pvm_pkfloat(&lenmean, 1, 1); + pvm_pkfloat(&lensd, 1, 1); + pvm_pkint( &fixedlen, 1, 1); + pvm_pkint( &Alphabet_type, 1, 1); + pvm_pkint( &seed, 1, 1); + if (! PVMPackHMM(hmm)) Die("Failed to pack the HMM"); + pvm_mcast(slave_tid, nslaves, HMMPVM_INIT); + SQD_DPRINTF1(("Initialized %d slaves\n", nslaves)); + + /* Confirm slaves' OK status. + */ + PVMConfirmSlaves(slave_tid, nslaves); + SQD_DPRINTF1(("Slaves confirm that they're ok...\n")); + + /* Load the slaves: each gets a first packet of at most lumpsize seqs + */ + nsent = ndone = 0; + for (slaveidx = 0; slaveidx < nslaves; slaveidx++) + { + packet = (nsample - nsent > lumpsize ? 
lumpsize : nsample - nsent); + + pvm_initsend(PvmDataDefault); + pvm_pkint(&packet, 1, 1); + pvm_pkint(&slaveidx, 1, 1); + pvm_send(slave_tid[slaveidx], HMMPVM_WORK); + nsent += packet; + } + SQD_DPRINTF1(("Loaded %d slaves\n", nslaves)); + + /* Receive/send loop: whichever slave reports back is sent the next packet. NOTE(review): sc holds lumpsize floats; assumes no slave reports a larger packet -- confirm + */ + sc = MallocOrDie(sizeof(float) * lumpsize); + while (nsent < nsample) + { + /* integrity check of slaves */ + PVMCheckSlaves(slave_tid, nslaves); + + /* receive results: slave index, packet size, then packet scores */ + SQD_DPRINTF2(("Waiting for results...\n")); + pvm_recv(-1, HMMPVM_RESULTS); + pvm_upkint(&slaveidx, 1, 1); + pvm_upkint(&packet, 1, 1); + pvm_upkfloat(sc, packet, 1); + SQD_DPRINTF2(("Got results.\n")); + ndone += packet; + + /* store results */ + for (i = 0; i < packet; i++) { + AddToHistogram(hist, sc[i]); + if (sc[i] > max) max = sc[i]; + } + /* send new work */ + packet = (nsample - nsent > lumpsize ? lumpsize : nsample - nsent); + + pvm_initsend(PvmDataDefault); + pvm_pkint(&packet, 1, 1); + pvm_pkint(&slaveidx, 1, 1); + pvm_send(slave_tid[slaveidx], HMMPVM_WORK); + SQD_DPRINTF2(("Told slave %d to do %d more seqs.\n", slaveidx, packet)); + nsent += packet; + } + + /* Wait for the last output to come in. + */ + while (ndone < nsample) + { + /* integrity check of slaves */ + PVMCheckSlaves(slave_tid, nslaves); + + /* receive results */ + SQD_DPRINTF1(("Waiting for final results...\n")); + pvm_recv(-1, HMMPVM_RESULTS); + pvm_upkint(&slaveidx, 1, 1); + pvm_upkint(&packet, 1, 1); + pvm_upkfloat(sc, packet, 1); + SQD_DPRINTF2(("Got some final results.\n")); + ndone += packet; + /* store results */ + for (i = 0; i < packet; i++) { + AddToHistogram(hist, sc[i]); + if (sc[i] > max) max = sc[i]; + } + } + + /* Shut down the slaves: send -1,-1,-1. + */ + pvm_initsend(PvmDataDefault); + packet = -1; + pvm_pkint(&packet, 1, 1); + pvm_pkint(&packet, 1, 1); + pvm_pkint(&packet, 1, 1); + pvm_mcast(slave_tid, nslaves, HMMPVM_WORK); + + /* Collect stopwatch results; quit the VM; return. 
+ */ + for (i = 0; i < nslaves; i++) + { + pvm_recv(-1, HMMPVM_RESULTS); + pvm_upkint(&slaveidx, 1, 1); + StopwatchPVMUnpack(&slavewatch); + + SQD_DPRINTF1(("Slave %d finished; says it used %.2f cpu, %.2f sys\n", + slaveidx, slavewatch.user, slavewatch.sys)); + + StopwatchInclude(extrawatch, &slavewatch); + } + + free(slave_tid); /* filled in by PVMSpawnSlaves() above */ + free(sc); + pvm_exit(); + *ret_hist = hist; /* caller frees hist */ + *ret_max = max; + *ret_nslaves = nslaves; + return; +} +#endif /* HMMER_PVM */ + + + diff --git a/forester/archive/RIO/others/hmmer/src/hmmconvert.c b/forester/archive/RIO/others/hmmer/src/hmmconvert.c new file mode 100644 index 0000000..6a9ea06 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/hmmconvert.c @@ -0,0 +1,209 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* hmmconvert.c + * SRE, Thu Oct 30 08:56:22 1997; St. Louis + * + * main() for converting between HMM file formats, and + * for converting HMMs to other software formats like GCG profiles. 
+ * + * CVS $Id: hmmconvert.c,v 1.1.1.1 2005/03/22 08:33:58 cmzmasek Exp $ + */ + +#include +#include + +#include "structs.h" /* data structures, macros, #define's */ +#include "config.h" /* compile-time configuration constants */ +#include "funcs.h" /* function declarations */ +#include "globals.h" /* alphabet global variables */ +#include "squid.h" /* general sequence analysis library */ + +static char banner[] = "hmmconvert - convert between profile HMM file formats"; + +static char usage[] = "\ +Usage: hmmconvert [-options] \n\ + Available options are:\n\ + -h : help; print brief help on version and usage\n\ +\n\ + -a : convert to HMMER ASCII file (the default)\n\ + -b : convert to HMMER binary file\n\ + -p : convert to GCG Profile .prf format\n\ + -P : convert to Compugen extended .eprf profile format\n\ +\n\ + -A : append mode; append to \n\ + -F : force mode; allow overwriting of existing files\n\ +"; + +static char experts[] = "\ +\n"; + + +static struct opt_s OPTIONS[] = { + { "-a", TRUE, sqdARG_NONE }, + { "-b", TRUE, sqdARG_NONE }, + { "-h", TRUE, sqdARG_NONE }, + { "-p", TRUE, sqdARG_NONE }, + { "-A", TRUE, sqdARG_NONE }, + { "-F", TRUE, sqdARG_NONE }, + { "-P", TRUE, sqdARG_NONE }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +int +main(int argc, char **argv) +{ + char *infile; /* name of input HMM file */ + char *outfile; /* name of output HMM file */ + HMMFILE *infp; /* input HMM file ptr */ + FILE *outfp; /* output HMM file ptr */ + char *mode; /* fopen() mode for outfile: a, ab, w or wb */ + struct plan7_s *hmm; /* a profile HMM structure */ + int nhmm; /* number of HMMs converted */ + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + + int do_append; /* TRUE to append to existing outfile */ + int do_force; /* TRUE to allow overwriting */ + enum hmmfmt_e { P7ASCII, P7BINARY, GCGPROFILE, BICPROFILE } + outfmt; /* output format */ + + 
/*********************************************** + * Parse command line (defaults: ASCII output, no append, no force) + ***********************************************/ + + outfmt = P7ASCII; + do_append = FALSE; + do_force = FALSE; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) { + if (strcmp(optname, "-a") == 0) { outfmt = P7ASCII; } + else if (strcmp(optname, "-b") == 0) { outfmt = P7BINARY; } + else if (strcmp(optname, "-p") == 0) { outfmt = GCGPROFILE; } + else if (strcmp(optname, "-A") == 0) { do_append = TRUE; } + else if (strcmp(optname, "-F") == 0) { do_force = TRUE; } + else if (strcmp(optname, "-P") == 0) { outfmt = BICPROFILE; } + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(0); + } + } + if (argc - optind != 2) + Die("Incorrect number of arguments.\n%s\n", usage); + + infile = argv[optind++]; + outfile = argv[optind++]; + + /*********************************************** + * Open input HMM database (might be in HMMERDB or current directory) + ***********************************************/ + + if ((infp = HMMFileOpen(infile, "HMMERDB")) == NULL) + Die("Failed to open HMM database %s\n%s", infile, usage); + + /*********************************************** + * Open output HMM file + ***********************************************/ + + if (do_append) + { /* If we're appending to a file, it needs to be Plan7 format */ + HMMFILE *test; + + if (FileExists(outfile)) { + test = HMMFileOpen(outfile, NULL); /* NOTE(review): the message below mentions stdout, but no stdout fallback is visible here -- confirm */ + if (test == NULL) + Die("%s not an HMM file; I refuse to append to it; using stdout instead", + outfile); + + /* bug #14 fix. 12/24/00, xref STL3 p.133. Appended HMMs must match the format of the existing file (binary vs. ASCII). */ + if (test->is_binary && outfmt != P7BINARY) + Die("File %s is in Plan 7 binary format; must append the same fmt.", outfile); + else if (! 
test->is_binary && outfmt != P7ASCII) + Die("File %s is in Plan 7 ASCII format; must append the same fmt.", outfile); + + HMMFileClose(test); + } + switch (outfmt) { + case P7ASCII: mode = "a"; break; + case P7BINARY: mode = "ab"; break; + case GCGPROFILE: Die("You cannot append GCG profiles"); /* no break: relies on Die() not returning -- TODO confirm */ + case BICPROFILE: Die("You cannot append Compugen extended profiles"); + default: Die("unexpected format"); + } + } + else + { /* else, we're writing a new file */ + if (! do_force && FileExists(outfile)) + Die("Output HMM file %s already exists. Please rename or delete it.", outfile); + switch (outfmt) { + case P7ASCII: mode = "w"; break; + case P7BINARY: mode = "wb"; break; + case GCGPROFILE: mode = "w"; break; + case BICPROFILE: mode = "w"; break; + default: Die("unexpected format"); + } + } + if ((outfp = fopen(outfile, mode)) == NULL) + Die("Failed to open output file %s for writing", outfile); + + /*********************************************** + * Show the banner + ***********************************************/ + + Banner(stdout, banner); + printf( "Input HMM file: %s\n", infile); + printf( "Output HMM file: %s\n", outfile); + printf( "Converting to: "); + switch (outfmt) { + case P7ASCII: puts("HMMER Plan7 ASCII"); break; + case P7BINARY: puts("HMMER Plan7 binary"); break; + case GCGPROFILE: puts("GCG Profile .prf"); break; + case BICPROFILE: puts("Compugen .eprf profile"); break; + default: Die("unexpected fault"); + } + printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n"); + + /*********************************************** + * Do the conversion: one output record per HMM read from infp + ***********************************************/ + + nhmm = 0; + while (HMMFileRead(infp, &hmm)) { + if (hmm == NULL) + Die("HMM file %s may be corrupt or in incorrect format; parse failed", infile); + + switch(outfmt) { + case P7ASCII: WriteAscHMM(outfp, hmm); break; + case P7BINARY: WriteBinHMM(outfp, hmm); break; + case GCGPROFILE: WriteProfile(outfp, hmm, FALSE); break; + case BICPROFILE: 
WriteProfile(outfp, hmm, TRUE); break; + default: Die("unexpected format"); + } + + printf(" - converted %s\n", hmm->name); + FreePlan7(hmm); + nhmm++; + } + printf("\n%d HMM(s) converted and written to %s\n", nhmm, outfile); + + /*********************************************** + * Clean-up and exit. + ***********************************************/ + + HMMFileClose(infp); + fclose(outfp); /* NOTE(review): return value is not checked, so a failed final flush would go unreported */ + SqdClean(); + return EXIT_SUCCESS; +} diff --git a/forester/archive/RIO/others/hmmer/src/hmmemit.c b/forester/archive/RIO/others/hmmer/src/hmmemit.c new file mode 100644 index 0000000..857b61c --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/hmmemit.c @@ -0,0 +1,267 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* hmmemit.c + * SRE, Sun Mar 8 14:11:24 1998 [St. 
Louis] + * + * main() for generating sequences from an HMM + * CVS $Id: hmmemit.c,v 1.1.1.1 2005/03/22 08:34:09 cmzmasek Exp $ + */ + +#include +#include +#include + +#include "structs.h" /* data structures, macros, #define's */ +#include "config.h" /* compile-time configuration constants */ +#include "funcs.h" /* function declarations */ +#include "globals.h" /* alphabet global variables */ +#include "squid.h" /* general sequence analysis library */ +#include "msa.h" /* squid's multiple sequence i/o */ + +static char banner[] = "hmmemit - generate sequences from a profile HMM"; + +static char usage[] = "\ +Usage: hmmemit [-options] \n\ +Available options are:\n\ + -a : write generated sequences as an alignment, not FASTA\n\ + -c : generate a single \"consensus\" sequence\n\ + -h : help; print brief help on version and usage\n\ + -n : emit sequences (default 10)\n\ + -o : save sequences in file \n\ + -q : quiet - suppress verbose banner\n\ +"; + +static char experts[] = "\ + --seed : set random number seed to \n\ +"; + +static struct opt_s OPTIONS[] = { + { "-a", TRUE, sqdARG_NONE }, + { "-c", TRUE, sqdARG_NONE }, + { "-h", TRUE, sqdARG_NONE }, + { "-n", TRUE, sqdARG_INT}, + { "-o", TRUE, sqdARG_STRING}, + { "-q", TRUE, sqdARG_NONE}, + { "--seed", FALSE, sqdARG_INT}, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +int +main(int argc, char **argv) +{ + char *hmmfile; /* file to read HMMs from */ + HMMFILE *hmmfp; /* opened hmmfile for reading */ + struct plan7_s *hmm; /* HMM to generate from */ + FILE *fp; /* output file handle */ + int L; /* length of a sequence */ + int i; /* counter over sequences */ + int nhmm; /* counter over HMMs */ + + char *ofile; /* output sequence file; NULL means stdout */ + int nseq; /* number of seqs to sample */ + int seed; /* random number generator seed; defaults to the current time, set by --seed */ + int be_quiet; /* TRUE to silence header/footer */ + int do_alignment;/* TRUE to output in aligned format */ + int do_consensus;/* TRUE to do a single consensus seq */ + + char 
*optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + + /*********************************************** + * Parse command line + ***********************************************/ + + nseq = 10; + seed = time ((time_t *) NULL); + be_quiet = FALSE; + do_alignment = FALSE; + do_consensus = FALSE; + ofile = NULL; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) { + if (strcmp(optname, "-a") == 0) do_alignment = TRUE; + else if (strcmp(optname, "-c") == 0) do_consensus = TRUE; + else if (strcmp(optname, "-n") == 0) nseq = atoi(optarg); /* NOTE(review): nseq is not range-checked here, yet it later sizes the -a allocations */ + else if (strcmp(optname, "-o") == 0) ofile = optarg; + else if (strcmp(optname, "-q") == 0) be_quiet = TRUE; + else if (strcmp(optname, "--seed") == 0) seed = atoi(optarg); + else if (strcmp(optname, "-h") == 0) + { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(0); + } + } + if (argc - optind != 1) + Die("Incorrect number of arguments.\n%s\n", usage); + + hmmfile = argv[optind++]; + + sre_srandom(seed); + + if (do_alignment && do_consensus) + Die("Sorry, -a and -c are incompatible.\nUsage:\n%s", usage); + if (nseq != 10 && do_consensus) + Warn("-c (consensus) overrides -n (# of sampled seqs)"); + + /*********************************************** + * Open HMM file (might be in HMMERDB or current directory). + * Open output file, if needed. + ***********************************************/ + + if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL) + Die("Failed to open HMM file %s\n%s", hmmfile, usage); + + if (ofile == NULL) fp = stdout; + else { + if ((fp = fopen(ofile, "w")) == NULL) + Die("Failed to open output file %s for writing", ofile); + } + + /*********************************************** + * Show the options banner + ***********************************************/ + + if (! be_quiet) + { + Banner(stdout, banner); + printf("HMM file: %s\n", hmmfile); + if (! 
do_consensus) { + printf("Number of seqs: %d\n", nseq); + printf("Random seed: %d\n", seed); + } else { + printf("Generating consensus sequence.\n"); + } + printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n"); + } + + /*********************************************** + * For every HMM in the file, do some emission. + ***********************************************/ + + nhmm = 0; + while (HMMFileRead(hmmfp, &hmm)) { + if (hmm == NULL) + Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile); + + /* Configure the HMM to shut off N,J,C emission: so we + * do a simple single pass through the model. + */ + Plan7NakedConfig(hmm); + Plan7Renormalize(hmm); + + /*********************************************** + * Do the work. + * If we're generating an alignment, we have to collect + * all our traces, then output. If we're generating unaligned + * sequences, we can emit one at a time. + ***********************************************/ + + if (do_consensus) + { + char *seq; + SQINFO sqinfo; /* info about sequence (name/desc) */ + + EmitConsensusSequence(hmm, &seq, NULL, &L, NULL); + strcpy(sqinfo.name, hmm->name); /* NOTE(review): unbounded copy; the size of sqinfo.name is not visible here -- confirm hmm->name always fits */ + strcpy(sqinfo.desc, "profile HMM generated consensus sequence [hmmemit]"); + + sqinfo.len = L; + sqinfo.flags = SQINFO_NAME | SQINFO_DESC | SQINFO_LEN; + + WriteSeq(fp, SQFILE_FASTA, seq, &sqinfo); + free(seq); + } + else if (do_alignment) + { + struct p7trace_s **tr; /* traces for aligned sequences */ + char **dsq; /* digitized sequences */ + SQINFO *sqinfo; /* info about sequences (name/desc) */ + MSA *msa; /* alignment */ + float *wgt; /* sequence weights; FSet() below is called with 1.0 -- presumably uniform weights */ + + dsq = MallocOrDie(sizeof(char *) * nseq); + tr = MallocOrDie(sizeof(struct p7trace_s *) * nseq); + sqinfo = MallocOrDie(sizeof(SQINFO) * nseq); + wgt = MallocOrDie(sizeof(float) * nseq); + FSet(wgt, nseq, 1.0); + + for (i = 0; i < nseq; i++) + { + EmitSequence(hmm, &(dsq[i]), &L, &(tr[i])); + sprintf(sqinfo[i].name, "seq%d", i+1); + sqinfo[i].len = L; + sqinfo[i].flags = SQINFO_NAME 
| SQINFO_LEN; + } + + msa = P7Traces2Alignment(dsq, sqinfo, wgt, nseq, hmm->M, tr, FALSE); + msa->name = sre_strdup(hmm->name, -1); + msa->desc = sre_strdup("Synthetic sequence alignment generated by hmmemit", -1); + + /* Output the alignment */ + WriteStockholm(fp, msa); + + /* Free memory + */ + for (i = 0; i < nseq; i++) + { + P7FreeTrace(tr[i]); + free(dsq[i]); + } + MSAFree(msa); + free(sqinfo); + free(dsq); + free(wgt); + free(tr); + } + else /* unaligned sequence output */ + { + struct p7trace_s *tr; /* generated trace */ + char *dsq; /* digitized sequence */ + char *seq; /* alphabetic sequence */ + SQINFO sqinfo; /* info about sequence (name/len) */ + + for (i = 0; i < nseq; i++) + { + EmitSequence(hmm, &dsq, &L, &tr); + sprintf(sqinfo.name, "%s-%d", hmm->name, i+1); /* NOTE(review): unbounded sprintf into sqinfo.name -- confirm the name always fits */ + sqinfo.len = L; + sqinfo.flags = SQINFO_NAME | SQINFO_LEN; + + seq = DedigitizeSequence(dsq, L); + + WriteSeq(fp, SQFILE_FASTA, seq, &sqinfo); + + P7FreeTrace(tr); + free(dsq); + free(seq); + } + } + nhmm++; + FreePlan7(hmm); + } + + /* We're done; clean up and exit. + */ + if (nhmm == 0) + Die("Failed to read any HMMs from %s\n", hmmfile); + if (ofile != NULL) { /* fp is stdout when ofile is NULL, so only a real file is closed */ + fclose(fp); + if (!be_quiet) printf("Output saved in file %s\n", ofile); + } + HMMFileClose(hmmfp); + SqdClean(); + return 0; +} + diff --git a/forester/archive/RIO/others/hmmer/src/hmmfetch.c b/forester/archive/RIO/others/hmmer/src/hmmfetch.c new file mode 100644 index 0000000..3f06f7c --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/hmmfetch.c @@ -0,0 +1,130 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. 
+ ************************************************************/ + +/* hmmfetch.c + * SRE, Wed Aug 5 14:26:51 1998 [St. Louis] + * + * Recover a specific HMM from an HMM database, using + * an SSI index (created with hmmindex); the HMM is written to stdout in ASCII format. + * + * CVS $Id: hmmfetch.c,v 1.1.1.1 2005/03/22 08:34:14 cmzmasek Exp $ + */ + +#include +#include +#include + +#include "squid.h" +#include "config.h" +#include "structs.h" +#include "funcs.h" +#include "version.h" + +#include "globals.h" + +static char banner[] = "hmmfetch -- retrieve specific HMM from an HMM database"; + +static char usage[] = "\ +Usage: hmmfetch [-options] \n\ +Available options are:\n\ + -h : print short usage and version info, then exit\n\ + -n : interpret instead as an HMM number\n\ +"; + +static char experts[] = "\ +"; + +static struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, + { "-n", TRUE, sqdARG_NONE }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + + +int +main(int argc, char **argv) +{ + char *hmmfile; /* HMM file to open */ + char *key; /* HMM name to retrieve, or HMM number if -n */ + HMMFILE *hmmfp; /* opened hmm file pointer */ + struct plan7_s *hmm; /* a hidden Markov model */ + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + + int by_number; /* fetch by number, not name */ + int nhmm; /* hmm number */ + + /*********************************************** + * Parse the command line + ***********************************************/ + + by_number = FALSE; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) + { + if (strcmp(optname, "-n") == 0) by_number = TRUE; + else if (strcmp(optname, "-h") == 0) + { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(0); + } + } + + if (argc - optind != 2) Die("Incorrect number of arguments.\n%s\n", usage); + hmmfile = argv[optind++]; + key = argv[optind++]; + + /*********************************************** + * Open 
HMM file, make sure SSI index exists + ***********************************************/ + + if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL) + Die("failed to open HMM file %s for reading.", hmmfile); + if (hmmfp->ssi == NULL) + Die("There is no SSI index for %s; you need to use hmmindex on it.", hmmfile); + + /*********************************************** + * find key in hmmfile (by index with -n, else by name); get HMM; show as ASCII + ***********************************************/ + + if (by_number) { + if (! IsInt(key)) Die("%s does not appear to be a number.", key); + nhmm = atoi(key); + if (! HMMFilePositionByIndex(hmmfp, nhmm)) + Die("failed to position %s to HMM #%d", hmmfile, nhmm); + } else { + if (! HMMFilePositionByName(hmmfp, key)) + Die("No such hmm %s in HMM file %s\n", key, hmmfile); + } + + if (! HMMFileRead(hmmfp, &hmm)) + Die("Unexpected end of HMM file"); + if (hmm == NULL) + Die("HMM file %s may be corrupt or in incorrect format; parse failed", hmmfile); + + WriteAscHMM(stdout, hmm); + + FreePlan7(hmm); + HMMFileClose(hmmfp); + + /*********************************************** + * Exit + ***********************************************/ + + SqdClean(); + return 0; +} + + diff --git a/forester/archive/RIO/others/hmmer/src/hmmindex.c b/forester/archive/RIO/others/hmmer/src/hmmindex.c new file mode 100644 index 0000000..8323687 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/hmmindex.c @@ -0,0 +1,166 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* hmmindex.c + * SRE, Wed Aug 5 11:05:03 1998 [St. Louis] + * + * Create an SSI index file for an HMM database. 
+ * + * CVS $Id: hmmindex.c,v 1.1.1.1 2005/03/22 08:34:03 cmzmasek Exp $ + */ + +#include +#include +#include + +#include "squid.h" +#include "config.h" +#include "structs.h" +#include "funcs.h" +#include "version.h" +#include "globals.h" +#include "ssi.h" + +static char banner[] = "hmmindex -- create SSI index for an HMM database"; + +static char usage[] = "\ +Usage: hmmindex [-options] \n\ +Available options are:\n\ + -h : print short usage and version info, then exit\n\ +"; + +static char experts[] = "\ +"; + +static struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +int +main(int argc, char **argv) +{ + char *hmmfile; /* HMM file to open */ + SSIINDEX *ssi; /* SSI index in memory */ + char *ssifile; /* name of SSI index on disk: hmmfile plus .ssi */ + HMMFILE *hmmfp; /* opened hmm file pointer */ + struct plan7_s *hmm; /* a hidden Markov model */ + int idx, nhmm; /* counter over HMMs; idx is not used in this function */ + int npri, nsec; /* # of names, accessions */ + int fh; /* file handle assigned by SSIAddFileToIndex() */ + int status; /* return status from SSI call */ + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + + /*********************************************** + * Parse the command line + ***********************************************/ + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) + { + if (strcmp(optname, "-h") == 0) + { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(0); + } + } + + if (argc - optind != 1) Die("Incorrect number of arguments.\n%s\n", usage); + hmmfile = argv[optind++]; + + /*********************************************** + * Open our input HMM file, make sure all is well with the output SSI filename + ***********************************************/ + + if ((hmmfp = HMMFileOpen(hmmfile, NULL)) == NULL) + Die("failed to open HMM file %s for reading.", hmmfile); + if (hmmfp->ssi != NULL) + 
Die("SSI index already exists for %s.\nPlease delete it first.", hmmfile); + + ssifile = MallocOrDie(strlen(hmmfile) + 5); /* +5: four chars of .ssi plus the terminating NUL */ + sprintf(ssifile, "%s%s", hmmfile, ".ssi"); + if (FileExists(ssifile)) /* shouldn't happen when hmmfp->ssi is NULL, unless an existing index failed to open -- TODO confirm */ + Die("An SSI file %s already exists; please delete it first", ssifile); + + if ((ssi = SSICreateIndex(hmmfp->mode)) == NULL) + Die("Failed to initialize the SSI index structure"); + if (SSIAddFileToIndex(ssi, hmmfile, hmmfp->is_binary, &fh) != 0) + Die("SSIAddFileToIndex() failed"); + + /*********************************************** + * Show the banner + ***********************************************/ + + Banner(stdout, banner); + printf("HMM file: %s\n", hmmfile); + if (hmmfp->mode == SSI_OFFSET_I64) + printf("Index file mode: 64-bit (large HMM file)\n"); + printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n"); + + /*********************************************** + * Get offsets and names for every model; store in keylist + ***********************************************/ + + printf("Determining offsets for %s, please be patient...\n", hmmfile); + + nhmm = npri = nsec = 0; + while (HMMFileRead(hmmfp, &hmm)) + { + if (hmm == NULL) + Die("HMM file %s may be corrupt or in incorrect format; parse failed", hmmfile); + + /* record name of HMM as the primary retrieval key. NOTE(review): hmmfp->offset is read after HMMFileRead() returns; presumably it still marks the start of this record -- confirm */ + status = SSIAddPrimaryKeyToIndex(ssi, hmm->name, fh, &(hmmfp->offset), NULL, 0); + if (status != 0) Die("SSIAddPrimaryKeyToIndex() failed"); + npri++; + + /* record accession of HMM as a secondary retrieval key (only when the PLAN7_ACC flag is set) */ + if (hmm->flags & PLAN7_ACC) { + status = SSIAddSecondaryKeyToIndex(ssi, hmm->acc, hmm->name); + if (status != 0) Die("SSIAddSecondaryKeyToIndex() failed"); + nsec++; + } + + nhmm++; + FreePlan7(hmm); + } + HMMFileClose(hmmfp); + + /*********************************************** + * Output the SSI file + ***********************************************/ + + status = SSIWriteIndex(ssifile, ssi); + if (status != 0) Die("SSIWriteIndex() failed"); + 
+ printf("Complete.\n"); + printf("HMM file: %s\n", hmmfile); + printf("SSI index: %s\n", ssifile); + printf("# of HMMS: %d\n", nhmm); + printf("HMM names: %d\n", npri); + printf("HMM accessions: %d\n", nsec); + + + /*********************************************** + * Exit: release the index filename and the in-memory index + ***********************************************/ + + free(ssifile); + SSIFreeIndex(ssi); + SqdClean(); + return 0; +} + + diff --git a/forester/archive/RIO/others/hmmer/src/hmmio.c b/forester/archive/RIO/others/hmmer/src/hmmio.c new file mode 100644 index 0000000..f2d6da5 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/hmmio.c @@ -0,0 +1,1744 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* hmmio.c + * + * Input/output of HMMs. + * + * As of HMMER 2.0, HMMs are saved by default in a tabular ASCII format + * as log-odds or log probabilities scaled to an integer. A binary save + * file format is also available which is faster to access (a + * consideration which might be important for HMM library applications). + * HMMs can be concatenated into HMM libraries. + * + * A comment on loss of accuracy. Storing a number as a scaled log + * probability guarantees us an error of about 0.035% or + * less in the retrieved probability. We are relatively invulnerable + * to the truncation errors which HMMER 1.8 was vulnerable to. + * + * Magic numbers (both for the ASCII and binary save formats) are used + * to label save files with a major version number. This simplifies the task of + * backwards compatibility as new versions of the program are created. + * Reverse but not forward compatibility is guaranteed. I.e. 
HMMER 2.0 + * can read `1.7' save files, but not vice versa. Note that the major + * version number in the save files is NOT the version of the software + * that generated it; rather, the number of the last major version in which + * save format changed. + * + ****************************************************************** + * + * The HMM input API: + * + * HMMFILE *hmmfp; + * char *hmmfile; + * struct plan7_s *hmm; + * char env[] = "HMMERDB"; (a la BLASTDB) + * + * hmmfp = HMMFileOpen(hmmfile, env) NULL on failure + * while (HMMFileRead(hmmfp, &hmm)) 0 if no more HMMs + * if (hmm == NULL) Die(); NULL on file parse failure + * whatever; + * FreeHMM(hmm); + * } + * HMMFileClose(hmmfp); + * + ***************************************************************** + * + * The HMM output API: + * + * FILE *ofp; + * struct plan7_s *hmm; + * + * WriteAscHMM(ofp, hmm); to write/append an HMM to open file + * or WriteBinHMM(ofp, hmm); to write/append binary format HMM to open file + * + ***************************************************************** + * + * V1.0: original implementation + * V1.1: regularizers removed from model structure + * V1.7: ref and cs annotation lines added from alignment, one + * char per match state 1..M + * V1.9: null model and name added to HMM structure. ASCII format changed + * to compact tabular one. + * V2.0: Plan7. Essentially complete rewrite. + */ + +#include +#include +#include +#include +#include +#include /* to get SEEK_CUR definition on silly Suns */ + +#include "squid.h" +#include "config.h" +#include "structs.h" +#include "funcs.h" +#include "version.h" +#include "ssi.h" + +/* Magic numbers identifying binary formats. + * Do not change the old magics! Necessary for backwards compatibility. 
+ */ +static unsigned int v10magic = 0xe8ededb1; /* v1.0 binary: "hmm1" + 0x80808080 */ +static unsigned int v10swap = 0xb1edede8; /* byteswapped v1.0 */ +static unsigned int v11magic = 0xe8ededb2; /* v1.1 binary: "hmm2" + 0x80808080 */ +static unsigned int v11swap = 0xb2edede8; /* byteswapped v1.1 */ +static unsigned int v17magic = 0xe8ededb3; /* v1.7 binary: "hmm3" + 0x80808080 */ +static unsigned int v17swap = 0xb3edede8; /* byteswapped v1.7 */ +static unsigned int v19magic = 0xe8ededb4; /* V1.9 binary: "hmm4" + 0x80808080 */ +static unsigned int v19swap = 0xb4edede8; /* V1.9 binary, byteswapped */ +static unsigned int v20magic = 0xe8ededb5; /* V2.0 binary: "hmm5" + 0x80808080 */ +static unsigned int v20swap = 0xb5edede8; /* V2.0 binary, byteswapped */ + +/* Old HMMER 1.x file formats. + */ +#define HMMER1_0B 1 /* binary HMMER 1.0 */ +#define HMMER1_0F 2 /* flat ascii HMMER 1.0 */ +#define HMMER1_1B 3 /* binary HMMER 1.1 */ +#define HMMER1_1F 4 /* flat ascii HMMER 1.1 */ +#define HMMER1_7B 5 /* binary HMMER 1.7 */ +#define HMMER1_7F 6 /* flat ascii HMMER 1.7 */ +#define HMMER1_9B 7 /* HMMER 1.9 binary */ +#define HMMER1_9F 8 /* HMMER 1.9 flat ascii */ + +static int read_asc20hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm); +static int read_bin20hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm); +static int read_asc19hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm); +static int read_bin19hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm); +static int read_asc17hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm); +static int read_bin17hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm); +static int read_asc11hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm); +static int read_bin11hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm); +static int read_asc10hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm); +static int read_bin10hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm); + +static void byteswap(char *swap, int nbytes); +static char *prob2ascii(float p, float null); +static float ascii2prob(char *s, 
float null); +static void write_bin_string(FILE *fp, char *s); +static int read_bin_string(FILE *fp, int doswap, char **ret_s); +static void multiline(FILE *fp, char *pfx, char *s); + +static struct plan9_s *read_plan9_binhmm(FILE *fp, int version, int swapped); +static struct plan9_s *read_plan9_aschmm(FILE *fp, int version); + +/***************************************************************** + * HMM input API functions: + * HMMFileOpen() + * HMMFileRead() + * HMMFileClose() + * HMMFileRewind() + *****************************************************************/ + +/* Function: HMMFileOpen() + * + * Purpose: Open an HMM file for reading. The file may be either + * an index for a library of HMMs, or an HMM. + * + * Args: hmmfile - name of file + * env - NULL, or environment variable for HMM database. + * + * Return: Valid HMMFILE *, or NULL on failure. + */ +HMMFILE * +HMMFileOpen(char *hmmfile, char *env) +{ + HMMFILE *hmmfp; + unsigned int magic; + char buf[512]; + char *ssifile; + char *dir; /* dir name in which HMM file was found */ + int status; + + hmmfp = (HMMFILE *) MallocOrDie (sizeof(HMMFILE)); + hmmfp->f = NULL; + hmmfp->parser = NULL; + hmmfp->is_binary = FALSE; + hmmfp->byteswap = FALSE; + hmmfp->is_seekable= TRUE; /* always; right now, an HMM must always be in a file. */ + + /* Open the file. Look in current directory. + * If that doesn't work, check environment var for + * a second possible directory (usually the location + * of a system-wide HMM library). + * Using dir name if necessary, construct correct SSI file name. 
+ */ + hmmfp->f = NULL; + hmmfp->ssi = NULL; + if ((hmmfp->f = fopen(hmmfile, "r")) != NULL) + { + ssifile = MallocOrDie(sizeof(char) * (strlen(hmmfile) + 5)); + sprintf(ssifile, "%s.ssi", hmmfile); + + if ((hmmfp->mode = SSIRecommendMode(hmmfile)) == -1) + Die("SSIRecommendMode() failed"); + } + else if ((hmmfp->f = EnvFileOpen(hmmfile, env, &dir)) != NULL) + { + char *full; + full = FileConcat(dir, hmmfile); + + ssifile = MallocOrDie(sizeof(char) * (strlen(full) + strlen(hmmfile) + 5)); + sprintf(ssifile, "%s.ssi", full); + + if ((hmmfp->mode = SSIRecommendMode(full)) == -1) + Die("SSIRecommendMode() failed"); + + free(full); + free(dir); + } + else return NULL; + + /* Open the SSI index file. If it doesn't exist, or it's corrupt, or + * some error happens, hmmfp->ssi stays NULL. + */ + SQD_DPRINTF1(("Opening ssifile %s...\n", ssifile)); + SSIOpen(ssifile, &(hmmfp->ssi)); + free(ssifile); + + /* Initialize the disk offset stuff. + */ + status = SSIGetFilePosition(hmmfp->f, hmmfp->mode, &(hmmfp->offset)); + if (status != 0) Die("SSIGetFilePosition() failed"); + + /* Check for binary or byteswapped binary format + * by peeking at first 4 bytes. + */ + if (! 
fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) { + HMMFileClose(hmmfp); + return NULL; + } + rewind(hmmfp->f); + + if (magic == v20magic) { + hmmfp->parser = read_bin20hmm; + hmmfp->is_binary = TRUE; + return hmmfp; + } + else if (magic == v20swap) { + SQD_DPRINTF1(("Opened a HMMER 2.0 binary file [byteswapped]\n")); + hmmfp->parser = read_bin20hmm; + hmmfp->is_binary = TRUE; + hmmfp->byteswap = TRUE; + return hmmfp; + } + else if (magic == v19magic) { + hmmfp->parser = read_bin19hmm; + hmmfp->is_binary = TRUE; + return hmmfp; + } + else if (magic == v19swap) { + hmmfp->parser = read_bin19hmm; + hmmfp->is_binary = TRUE; + hmmfp->byteswap = TRUE; + return hmmfp; + } + else if (magic == v17magic) { + hmmfp->parser = read_bin17hmm; + hmmfp->is_binary = TRUE; + return hmmfp; + } + else if (magic == v17swap) { + hmmfp->parser = read_bin17hmm; + hmmfp->is_binary = TRUE; + hmmfp->byteswap = TRUE; + return hmmfp; + } + else if (magic == v11magic) { + hmmfp->parser = read_bin11hmm; + hmmfp->is_binary = TRUE; + return hmmfp; + } + else if (magic == v11swap) { + hmmfp->parser = read_bin11hmm; + hmmfp->is_binary = TRUE; + hmmfp->byteswap = TRUE; + return hmmfp; + } + else if (magic == v10magic) { + hmmfp->parser = read_bin10hmm; + hmmfp->is_binary = TRUE; + return hmmfp; + } + else if (magic == v10swap) { + hmmfp->parser = read_bin10hmm; + hmmfp->is_binary = TRUE; + hmmfp->byteswap = TRUE; + return hmmfp; + } + /* else we fall thru; it may be an ASCII file. */ + + /* If magic looks binary but we don't recognize it, choke and die. + */ + if (magic & 0x80000000) { + Warn("\ +%s appears to be a binary but format is not recognized\n\ +It may be from a HMMER version more recent than yours,\n\ +or may be a different kind of binary altogether.\n", hmmfile); + HMMFileClose(hmmfp); + return NULL; + } + + /* Check for ASCII format by peeking at first word. 
+ */ + if (fgets(buf, 512, hmmfp->f) == NULL) { + HMMFileClose(hmmfp); + return NULL; + } + rewind(hmmfp->f); + + if (strncmp("HMMER2.0", buf, 8) == 0) { + hmmfp->parser = read_asc20hmm; + return hmmfp; + } else if (strncmp("HMMER v1.9", buf, 10) == 0) { + hmmfp->parser = read_asc19hmm; + return hmmfp; + } else if (strncmp("# HMM v1.7", buf, 10) == 0) { + hmmfp->parser = read_asc17hmm; + return hmmfp; + } else if (strncmp("# HMM v1.1", buf, 10) == 0) { + hmmfp->parser = read_asc11hmm; + return hmmfp; + } else if (strncmp("# HMM v1.0", buf, 10) == 0) { + hmmfp->parser = read_asc10hmm; + return hmmfp; + } + + /* If we haven't recognized it yet, it's bogus. + */ + HMMFileClose(hmmfp); + return NULL; +} +int +HMMFileRead(HMMFILE *hmmfp, struct plan7_s **ret_hmm) +{ + int status; + /* Set the disk position marker. */ + if (hmmfp->is_seekable) { + status = SSIGetFilePosition(hmmfp->f, hmmfp->mode, &(hmmfp->offset)); + if (status != 0) Die("SSIGetFilePosition() failed"); + } + /* Parse the HMM and return it. */ + return (*hmmfp->parser)(hmmfp, ret_hmm); +} +void +HMMFileClose(HMMFILE *hmmfp) +{ + if (hmmfp->f != NULL) fclose(hmmfp->f); + if (hmmfp->ssi != NULL) SSIClose(hmmfp->ssi); + free(hmmfp); +} +void +HMMFileRewind(HMMFILE *hmmfp) +{ + rewind(hmmfp->f); +} +int +HMMFilePositionByName(HMMFILE *hmmfp, char *name) +{ + SSIOFFSET offset; /* offset in hmmfile, from SSI */ + int fh; /* ignored. 
*/ + + if (hmmfp->ssi == NULL) return 0; + if (SSIGetOffsetByName(hmmfp->ssi, name, &fh, &offset) != 0) return 0; + if (SSISetFilePosition(hmmfp->f, &offset) != 0) return 0; + return 1; +} +int +HMMFilePositionByIndex(HMMFILE *hmmfp, int idx) +{ /* idx runs from 0..nhmm-1 */ + int fh; /* file handle is ignored; only one HMM file */ + SSIOFFSET offset; /* file position of HMM */ + + if (hmmfp->ssi == NULL) return 0; + if (SSIGetOffsetByNumber(hmmfp->ssi, idx, &fh, &offset) != 0) return 0; + if (SSISetFilePosition(hmmfp->f, &offset) != 0) return 0; + return 1; +} + +/***************************************************************** + * HMM output API: + * WriteAscHMM() + * WriteBinHMM() + * + *****************************************************************/ + +/* Function: WriteAscHMM() + * + * Purpose: Save an HMM in flat text ASCII format. + * + * Args: fp - open file for writing + * hmm - HMM to save + */ +void +WriteAscHMM(FILE *fp, struct plan7_s *hmm) +{ + int k; /* counter for nodes */ + int x; /* counter for symbols */ + int ts; /* counter for state transitions */ + + fprintf(fp, "HMMER2.0 [%s]\n", RELEASE); /* magic header */ + + /* write header information + */ + fprintf(fp, "NAME %s\n", hmm->name); + if (hmm->flags & PLAN7_ACC) + fprintf(fp, "ACC %s\n", hmm->acc); + if (hmm->flags & PLAN7_DESC) + fprintf(fp, "DESC %s\n", hmm->desc); + fprintf(fp, "LENG %d\n", hmm->M); + fprintf(fp, "ALPH %s\n", + (Alphabet_type == hmmAMINO) ? "Amino":"Nucleic"); + fprintf(fp, "RF %s\n", (hmm->flags & PLAN7_RF) ? "yes" : "no"); + fprintf(fp, "CS %s\n", (hmm->flags & PLAN7_CS) ? "yes" : "no"); + fprintf(fp, "MAP %s\n", (hmm->flags & PLAN7_MAP) ? 
"yes" : "no"); + multiline(fp, "COM ", hmm->comlog); + fprintf(fp, "NSEQ %d\n", hmm->nseq); + fprintf(fp, "DATE %s\n", hmm->ctime); + fprintf(fp, "CKSUM %d\n", hmm->checksum); + if (hmm->flags & PLAN7_GA) + fprintf(fp, "GA %.1f %.1f\n", hmm->ga1, hmm->ga2); + if (hmm->flags & PLAN7_TC) + fprintf(fp, "TC %.1f %.1f\n", hmm->tc1, hmm->tc2); + if (hmm->flags & PLAN7_NC) + fprintf(fp, "NC %.1f %.1f\n", hmm->nc1, hmm->nc2); + + /* Specials + */ + fputs("XT ", fp); + for (k = 0; k < 4; k++) + for (x = 0; x < 2; x++) + fprintf(fp, "%6s ", prob2ascii(hmm->xt[k][x], 1.0)); + fputs("\n", fp); + + /* Save the null model first, so HMM readers can decode + * log odds scores on the fly. Save as log odds probabilities + * relative to 1/Alphabet_size (flat distribution) + */ + fprintf(fp, "NULT "); + fprintf(fp, "%6s ", prob2ascii(hmm->p1, 1.0)); /* p1 */ + fprintf(fp, "%6s\n", prob2ascii(1.0-hmm->p1, 1.0)); /* p2 */ + fputs("NULE ", fp); + for (x = 0; x < Alphabet_size; x++) + fprintf(fp, "%6s ", prob2ascii(hmm->null[x], 1/(float)(Alphabet_size))); + fputs("\n", fp); + + /* EVD statistics + */ + if (hmm->flags & PLAN7_STATS) + fprintf(fp, "EVD %10f %10f\n", hmm->mu, hmm->lambda); + + /* Print header + */ + fprintf(fp, "HMM "); + for (x = 0; x < Alphabet_size; x++) fprintf(fp, " %c ", Alphabet[x]); + fprintf(fp, "\n"); + fprintf(fp, " %6s %6s %6s %6s %6s %6s %6s %6s %6s\n", + "m->m", "m->i", "m->d", "i->m", "i->i", "d->m", "d->d", "b->m", "m->e"); + + /* Print HMM parameters (main section of the save file) + */ + fprintf(fp, " %6s %6s ", prob2ascii(1-hmm->tbd1, 1.0), "*"); + fprintf(fp, "%6s\n", prob2ascii(hmm->tbd1, 1.0)); + for (k = 1; k <= hmm->M; k++) + { + /* Line 1: k, match emissions, map */ + fprintf(fp, " %5d ", k); + for (x = 0; x < Alphabet_size; x++) + fprintf(fp, "%6s ", prob2ascii(hmm->mat[k][x], hmm->null[x])); + if (hmm->flags & PLAN7_MAP) fprintf(fp, "%5d", hmm->map[k]); + fputs("\n", fp); + /* Line 2: RF and insert emissions */ + fprintf(fp, " %5c ", hmm->flags & 
PLAN7_RF ? hmm->rf[k] : '-'); + for (x = 0; x < Alphabet_size; x++) + fprintf(fp, "%6s ", (k < hmm->M) ? prob2ascii(hmm->ins[k][x], hmm->null[x]) : "*"); + fputs("\n", fp); + /* Line 3: CS and transition probs */ + fprintf(fp, " %5c ", hmm->flags & PLAN7_CS ? hmm->cs[k] : '-'); + for (ts = 0; ts < 7; ts++) + fprintf(fp, "%6s ", (k < hmm->M) ? prob2ascii(hmm->t[k][ts], 1.0) : "*"); + fprintf(fp, "%6s ", prob2ascii(hmm->begin[k], 1.0)); + fprintf(fp, "%6s ", prob2ascii(hmm->end[k], 1.0)); + + fputs("\n", fp); + } + fputs("//\n", fp); +} + +/* Function: WriteBinHMM() + * + * Purpose: Write an HMM in binary format. + */ +void +WriteBinHMM(FILE *fp, struct plan7_s *hmm) +{ + int k; + + /* ye olde magic number */ + fwrite((char *) &(v20magic), sizeof(unsigned int), 1, fp); + + /* header section + */ + fwrite((char *) &(hmm->flags), sizeof(int), 1, fp); + write_bin_string(fp, hmm->name); + if (hmm->flags & PLAN7_ACC) write_bin_string(fp, hmm->acc); + if (hmm->flags & PLAN7_DESC) write_bin_string(fp, hmm->desc); + fwrite((char *) &(hmm->M), sizeof(int), 1, fp); + fwrite((char *) &(Alphabet_type), sizeof(int), 1, fp); + if (hmm->flags & PLAN7_RF) fwrite((char *) hmm->rf, sizeof(char), hmm->M+1, fp); + if (hmm->flags & PLAN7_CS) fwrite((char *) hmm->cs, sizeof(char), hmm->M+1, fp); + if (hmm->flags & PLAN7_MAP) fwrite((char *) hmm->map, sizeof(int), hmm->M+1, fp); + write_bin_string(fp, hmm->comlog); + fwrite((char *) &(hmm->nseq), sizeof(int), 1, fp); + write_bin_string(fp, hmm->ctime); + fwrite((char *) &(hmm->checksum), sizeof(int), 1, fp); + if (hmm->flags & PLAN7_GA) { + fwrite((char *) &(hmm->ga1), sizeof(float), 1, fp); + fwrite((char *) &(hmm->ga2), sizeof(float), 1, fp); + } + if (hmm->flags & PLAN7_TC) { + fwrite((char *) &(hmm->tc1), sizeof(float), 1, fp); + fwrite((char *) &(hmm->tc2), sizeof(float), 1, fp); + } + if (hmm->flags & PLAN7_NC) { + fwrite((char *) &(hmm->nc1), sizeof(float), 1, fp); + fwrite((char *) &(hmm->nc2), sizeof(float), 1, fp); + } + + /* 
Specials */ + for (k = 0; k < 4; k++) + fwrite((char *) hmm->xt[k], sizeof(float), 2, fp); + + /* Null model */ + fwrite((char *)&(hmm->p1), sizeof(float), 1, fp); + fwrite((char *) hmm->null, sizeof(float), Alphabet_size, fp); + + /* EVD stats */ + if (hmm->flags & PLAN7_STATS) { + fwrite((char *) &(hmm->mu), sizeof(float), 1, fp); + fwrite((char *) &(hmm->lambda), sizeof(float), 1, fp); + } + + /* entry/exit probabilities + */ + fwrite((char *)&(hmm->tbd1),sizeof(float), 1, fp); + fwrite((char *) hmm->begin, sizeof(float), hmm->M+1, fp); + fwrite((char *) hmm->end, sizeof(float), hmm->M+1, fp); + + /* main model + */ + for (k = 1; k <= hmm->M; k++) + fwrite((char *) hmm->mat[k], sizeof(float), Alphabet_size, fp); + for (k = 1; k < hmm->M; k++) + fwrite((char *) hmm->ins[k], sizeof(float), Alphabet_size, fp); + for (k = 1; k < hmm->M; k++) + fwrite((char *) hmm->t[k], sizeof(float), 7, fp); +} + + +/***************************************************************** + * + * Internal: HMM file parsers for various releases of HMMER. + * + * read_{asc,bin}xxhmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm) + * + * Upon return, *ret_hmm is an allocated Plan7 HMM. + * Return 0 if no more HMMs in the file (normal). + * Return 1 and *ret_hmm = something if we got an HMM (normal) + * Return 1 if an error occurs (meaning "I tried to + * read something...") and *ret_hmm == NULL (meaning + * "...but it wasn't an HMM"). I know, this is a funny + * way to handle errors. 
+ * + *****************************************************************/ + +static int +read_asc20hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm) +{ + struct plan7_s *hmm; + char buffer[512]; + char *s; + int M; + float p; + int k, x; + int atype; /* alphabet type, hmmAMINO or hmmNUCLEIC */ + + hmm = NULL; + if (feof(hmmfp->f) || fgets(buffer, 512, hmmfp->f) == NULL) return 0; + if (strncmp(buffer, "HMMER2.0", 8) != 0) goto FAILURE; + + /* Get the header information: tag/value pairs in any order, + * ignore unknown tags, stop when "HMM" is reached (signaling + * start of main model) + */ + hmm = AllocPlan7Shell(); + M = -1; + while (fgets(buffer, 512, hmmfp->f) != NULL) { + if (strncmp(buffer, "NAME ", 5) == 0) Plan7SetName(hmm, buffer+6); + else if (strncmp(buffer, "ACC ", 5) == 0) Plan7SetAccession(hmm, buffer+6); + else if (strncmp(buffer, "DESC ", 5) == 0) Plan7SetDescription(hmm, buffer+6); + else if (strncmp(buffer, "LENG ", 5) == 0) M = atoi(buffer+6); + else if (strncmp(buffer, "NSEQ ", 5) == 0) hmm->nseq = atoi(buffer+6); + else if (strncmp(buffer, "ALPH ", 5) == 0) + { /* Alphabet type */ + s2upper(buffer+6); + if (strncmp(buffer+6, "AMINO", 5) == 0) atype = hmmAMINO; + else if (strncmp(buffer+6, "NUCLEIC", 7) == 0) atype = hmmNUCLEIC; + else goto FAILURE; + + if (Alphabet_type == hmmNOTSETYET) SetAlphabet(atype); + else if (atype != Alphabet_type) + Die("Alphabet mismatch error.\nI thought we were working with %s, but tried to read a %s HMM.\n", AlphabetType2String(Alphabet_type), AlphabetType2String(atype)); + } + else if (strncmp(buffer, "RF ", 5) == 0) + { /* Reference annotation present? */ + if (sre_toupper(*(buffer+6)) == 'Y') hmm->flags |= PLAN7_RF; + } + else if (strncmp(buffer, "CS ", 5) == 0) + { /* Consensus annotation present? */ + if (sre_toupper(*(buffer+6)) == 'Y') hmm->flags |= PLAN7_CS; + } + else if (strncmp(buffer, "MAP ", 5) == 0) + { /* Map annotation present? 
*/ + if (sre_toupper(*(buffer+6)) == 'Y') hmm->flags |= PLAN7_MAP; + } + else if (strncmp(buffer, "COM ", 5) == 0) + { /* Command line log */ + StringChop(buffer+6); + if (hmm->comlog == NULL) + hmm->comlog = Strdup(buffer+6); + else + { + hmm->comlog = ReallocOrDie(hmm->comlog, sizeof(char *) * + (strlen(hmm->comlog) + 1 + strlen(buffer+6))); + strcat(hmm->comlog, "\n"); + strcat(hmm->comlog, buffer+6); + } + } + else if (strncmp(buffer, "DATE ", 5) == 0) + { /* Date file created */ + StringChop(buffer+6); + hmm->ctime= Strdup(buffer+6); + } + else if (strncmp(buffer, "GA ", 5) == 0) + { + if ((s = strtok(buffer+6, " \t\n")) == NULL) goto FAILURE; + hmm->ga1 = atof(s); + if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE; + hmm->ga2 = atof(s); + hmm->flags |= PLAN7_GA; + } + else if (strncmp(buffer, "TC ", 5) == 0) + { + if ((s = strtok(buffer+6, " \t\n")) == NULL) goto FAILURE; + hmm->tc1 = atof(s); + if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE; + hmm->tc2 = atof(s); + hmm->flags |= PLAN7_TC; + } + else if (strncmp(buffer, "NC ", 5) == 0) + { + if ((s = strtok(buffer+6, " \t\n")) == NULL) goto FAILURE; + hmm->nc1 = atof(s); + if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE; + hmm->nc2 = atof(s); + hmm->flags |= PLAN7_NC; + } + else if (strncmp(buffer, "XT ", 5) == 0) + { /* Special transition section */ + if ((s = strtok(buffer+6, " \t\n")) == NULL) goto FAILURE; + for (k = 0; k < 4; k++) + for (x = 0; x < 2; x++) + { + if (s == NULL) goto FAILURE; + hmm->xt[k][x] = ascii2prob(s, 1.0); + s = strtok(NULL, " \t\n"); + } + } + else if (strncmp(buffer, "NULT ", 5) == 0) + { /* Null model transitions */ + if ((s = strtok(buffer+6, " \t\n")) == NULL) goto FAILURE; + hmm->p1 = ascii2prob(s, 1.); + if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE; + hmm->p1 = hmm->p1 / (hmm->p1 + ascii2prob(s, 1.0)); + } + else if (strncmp(buffer, "NULE ", 5) == 0) + { /* Null model emissions */ + if (Alphabet_type == hmmNOTSETYET) + Die("ALPH must precede NULE in 
HMM save files"); + s = strtok(buffer+6, " \t\n"); + for (x = 0; x < Alphabet_size; x++) { + if (s == NULL) goto FAILURE; + hmm->null[x] = ascii2prob(s, 1./(float)Alphabet_size); + s = strtok(NULL, " \t\n"); + } + } + else if (strncmp(buffer, "EVD ", 5) == 0) + { /* EVD parameters */ + hmm->flags |= PLAN7_STATS; + if ((s = strtok(buffer+6, " \t\n")) == NULL) goto FAILURE; + hmm->mu = atof(s); + if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE; + hmm->lambda = atof(s); + } + else if (strncmp(buffer, "CKSUM", 5) == 0) hmm->checksum = atoi(buffer+6); + else if (strncmp(buffer, "HMM ", 5) == 0) break; + } + + /* partial check for mandatory fields */ + if (feof(hmmfp->f)) goto FAILURE; + if (M < 1) goto FAILURE; + if (hmm->name == NULL) goto FAILURE; + if (Alphabet_type == hmmNOTSETYET) goto FAILURE; + + /* Main model section. Read as integer log odds, convert + * to probabilities + */ + AllocPlan7Body(hmm, M); + /* skip an annotation line */ + if (fgets(buffer, 512, hmmfp->f) == NULL) goto FAILURE; + /* parse tbd1 line */ + if (fgets(buffer, 512, hmmfp->f) == NULL) goto FAILURE; + if ((s = strtok(buffer, " \t\n")) == NULL) goto FAILURE; + p = ascii2prob(s, 1.0); + if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE; + if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE; + hmm->tbd1 = ascii2prob(s, 1.0); + hmm->tbd1 = hmm->tbd1 / (p + hmm->tbd1); + + /* main model */ + for (k = 1; k <= hmm->M; k++) { + /* Line 1: k, match emissions, map */ + if (fgets(buffer, 512, hmmfp->f) == NULL) goto FAILURE; + if ((s = strtok(buffer, " \t\n")) == NULL) goto FAILURE; + if (atoi(s) != k) goto FAILURE; + for (x = 0; x < Alphabet_size; x++) { + if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE; + hmm->mat[k][x] = ascii2prob(s, hmm->null[x]); + } + if (hmm->flags & PLAN7_MAP) { + if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE; + hmm->map[k] = atoi(s); + } + /* Line 2: RF and insert emissions */ + if (fgets(buffer, 512, hmmfp->f) == NULL) goto FAILURE; + if ((s = 
strtok(buffer, " \t\n")) == NULL) goto FAILURE; + if (hmm->flags & PLAN7_RF) hmm->rf[k] = *s; + if (k < hmm->M) { + for (x = 0; x < Alphabet_size; x++) { + if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE; + hmm->ins[k][x] = ascii2prob(s, hmm->null[x]); + } + } + /* Line 3: CS and transitions */ + if (fgets(buffer, 512, hmmfp->f) == NULL) goto FAILURE; + if ((s = strtok(buffer, " \t\n")) == NULL) goto FAILURE; + if (hmm->flags & PLAN7_CS) hmm->cs[k] = *s; + for (x = 0; x < 7; x++) { + if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE; + if (k < hmm->M) hmm->t[k][x] = ascii2prob(s, 1.0); + } + if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE; + hmm->begin[k] = ascii2prob(s, 1.0); + if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE; + hmm->end[k] = ascii2prob(s, 1.0); + + } /* end loop over main model */ + + /* Advance to record separator + */ + while (fgets(buffer, 512, hmmfp->f) != NULL) + if (strncmp(buffer, "//", 2) == 0) break; + + Plan7Renormalize(hmm); /* Paracel reported bug 6/11/99 */ + + /* Set flags and return + */ + hmm->flags |= PLAN7_HASPROB; /* probabilities are valid */ + hmm->flags &= ~PLAN7_HASBITS; /* scores are not valid */ + + *ret_hmm = hmm; + return 1; + +FAILURE: + if (hmm != NULL) FreePlan7(hmm); + *ret_hmm = NULL; + return 1; +} + + +static int +read_bin20hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm) +{ + struct plan7_s *hmm; + int k,x; + int type; + unsigned int magic; + + hmm = NULL; + + /* Header section + */ + if (feof(hmmfp->f)) return 0; + if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) return 0; + + if (hmmfp->byteswap) byteswap((char *)&magic, sizeof(unsigned int)); + if (magic != v20magic) goto FAILURE; + /* allocate HMM shell for header info */ + hmm = AllocPlan7Shell(); + /* flags */ + if (! fread((char *) &(hmm->flags), sizeof(int), 1, hmmfp->f)) goto FAILURE; + if (hmmfp->byteswap) byteswap((char *)&(hmm->flags), sizeof(int)); + /* name */ + if (! 
read_bin_string(hmmfp->f, hmmfp->byteswap, &(hmm->name))) goto FAILURE; + + /* optional accession */ + if ((hmm->flags & PLAN7_ACC) && + ! read_bin_string(hmmfp->f, hmmfp->byteswap, &(hmm->acc))) goto FAILURE; + /* optional description */ + if ((hmm->flags & PLAN7_DESC) && + ! read_bin_string(hmmfp->f, hmmfp->byteswap, &(hmm->desc))) goto FAILURE; + /* length of model */ + if (! fread((char *) &hmm->M, sizeof(int), 1, hmmfp->f)) goto FAILURE; + if (hmmfp->byteswap) byteswap((char *)&(hmm->M), sizeof(int)); + /* alphabet type */ + if (! fread((char *) &type, sizeof(int), 1, hmmfp->f)) goto FAILURE; + if (hmmfp->byteswap) byteswap((char *)&type, sizeof(int)); + if (Alphabet_type == hmmNOTSETYET) SetAlphabet(type); + else if (type != Alphabet_type) + Die("Alphabet mismatch error.\nI thought we were working with %s, but tried to read a %s HMM.\n", AlphabetType2String(Alphabet_type), AlphabetType2String(type)); + + /* now allocate for rest of model */ + AllocPlan7Body(hmm, hmm->M); + + /* optional #=RF alignment annotation */ + if ((hmm->flags & PLAN7_RF) && + !fread((char *) hmm->rf, sizeof(char), hmm->M+1, hmmfp->f)) goto FAILURE; + hmm->rf[hmm->M+1] = '\0'; + /* optional #=CS alignment annotation */ + if ((hmm->flags & PLAN7_CS) && + !fread((char *) hmm->cs, sizeof(char), hmm->M+1, hmmfp->f)) goto FAILURE; + hmm->cs[hmm->M+1] = '\0'; + /* optional alignment map annotation */ + if ((hmm->flags & PLAN7_MAP) && + !fread((char *) hmm->map, sizeof(int), hmm->M+1, hmmfp->f)) goto FAILURE; + if (hmmfp->byteswap) + for (k = 1; k <= hmm->M; k++) + byteswap((char*)&(hmm->map[k]), sizeof(int)); + /* command line log */ + if (!read_bin_string(hmmfp->f, hmmfp->byteswap, &(hmm->comlog))) goto FAILURE; + /* nseq */ + if (!fread((char *) &(hmm->nseq),sizeof(int), 1, hmmfp->f)) goto FAILURE; + if (hmmfp->byteswap) byteswap((char *)&(hmm->nseq), sizeof(int)); + /* creation time */ + if (!read_bin_string(hmmfp->f, hmmfp->byteswap, &(hmm->ctime))) goto FAILURE; + /* checksum */ + if 
(!fread((char *) &(hmm->checksum),sizeof(int), 1, hmmfp->f)) goto FAILURE; + if (hmmfp->byteswap) byteswap((char *)&(hmm->checksum), sizeof(int)); + + /* Pfam gathering thresholds */ + if (hmm->flags & PLAN7_GA) { + if (! fread((char *) &(hmm->ga1), sizeof(float), 1, hmmfp->f)) goto FAILURE; + if (! fread((char *) &(hmm->ga2), sizeof(float), 1, hmmfp->f)) goto FAILURE; + if (hmmfp->byteswap) { + byteswap((char *) &(hmm->ga1), sizeof(float)); + byteswap((char *) &(hmm->ga2), sizeof(float)); + } + } + /* Pfam trusted cutoffs */ + if (hmm->flags & PLAN7_TC) { + if (! fread((char *) &(hmm->tc1), sizeof(float), 1, hmmfp->f)) goto FAILURE; + if (! fread((char *) &(hmm->tc2), sizeof(float), 1, hmmfp->f)) goto FAILURE; + if (hmmfp->byteswap) { + byteswap((char *) &(hmm->tc1), sizeof(float)); + byteswap((char *) &(hmm->tc2), sizeof(float)); + } + } + /* Pfam noise cutoffs */ + if (hmm->flags & PLAN7_NC) { + if (! fread((char *) &(hmm->nc1), sizeof(float), 1, hmmfp->f)) goto FAILURE; + if (! fread((char *) &(hmm->nc2), sizeof(float), 1, hmmfp->f)) goto FAILURE; + if (hmmfp->byteswap) { + byteswap((char *) &(hmm->nc1), sizeof(float)); + byteswap((char *) &(hmm->nc2), sizeof(float)); + } + } + + /* specials */ + for (k = 0; k < 4; k++) + { + if (! fread((char *) hmm->xt[k], sizeof(float), 2, hmmfp->f)) goto FAILURE; + if (hmmfp->byteswap) { + for (x = 0; x < 2; x++) + byteswap((char *)&(hmm->xt[k][x]), sizeof(float)); + } + } + + /* null model */ + if (!fread((char *) &(hmm->p1),sizeof(float), 1, hmmfp->f)) goto FAILURE; + if (!fread((char *)hmm->null,sizeof(float),Alphabet_size,hmmfp->f))goto FAILURE; + + /* EVD stats */ + if (hmm->flags & PLAN7_STATS) { + if (! fread((char *) &(hmm->mu), sizeof(float), 1, hmmfp->f))goto FAILURE; + if (! 
fread((char *) &(hmm->lambda), sizeof(float), 1, hmmfp->f))goto FAILURE; + + if (hmmfp->byteswap) { + byteswap((char *)&(hmm->mu), sizeof(float)); + byteswap((char *)&(hmm->lambda), sizeof(float)); + } + } + + /* entry/exit probabilities + */ + if (! fread((char *)&(hmm->tbd1), sizeof(float), 1, hmmfp->f)) goto FAILURE; + if (! fread((char *) hmm->begin, sizeof(float), hmm->M+1, hmmfp->f)) goto FAILURE; + if (! fread((char *) hmm->end, sizeof(float), hmm->M+1, hmmfp->f)) goto FAILURE; + + /* main model */ + for (k = 1; k <= hmm->M; k++) + if (! fread((char *) hmm->mat[k], sizeof(float), Alphabet_size, hmmfp->f)) goto FAILURE; + for (k = 1; k < hmm->M; k++) + if (! fread((char *) hmm->ins[k], sizeof(float), Alphabet_size, hmmfp->f)) goto FAILURE; + for (k = 1; k < hmm->M; k++) + if (! fread((char *) hmm->t[k], sizeof(float), 7, hmmfp->f)) goto FAILURE; + + /* byteswapping + */ + if (hmmfp->byteswap) { + for (x = 0; x < Alphabet_size; x++) + byteswap((char *) &(hmm->null[x]), sizeof(float)); + byteswap((char *)&(hmm->p1), sizeof(float)); + byteswap((char *)&(hmm->tbd1), sizeof(float)); + + for (k = 1; k <= hmm->M; k++) + { + for (x = 0; x < Alphabet_size; x++) + byteswap((char *)&(hmm->mat[k][x]), sizeof(float)); + if (k < hmm->M) + for (x = 0; x < Alphabet_size; x++) + byteswap((char *)&(hmm->ins[k][x]), sizeof(float)); + byteswap((char *)&(hmm->begin[k]), sizeof(float)); + byteswap((char *)&(hmm->end[k]), sizeof(float)); + if (k < hmm->M) + for (x = 0; x < 7; x++) + byteswap((char *)&(hmm->t[k][x]), sizeof(float)); + } + } + + + /* set flags and return + */ + hmm->flags |= PLAN7_HASPROB; /* probabilities are valid */ + hmm->flags &= ~PLAN7_HASBITS; /* scores are not yet valid */ + *ret_hmm = hmm; + return 1; + +FAILURE: + if (hmm != NULL) FreePlan7(hmm); + *ret_hmm = NULL; + return 1; +} + + + + + +/* Function: read_asc19hmm() + * Date: Tue Apr 7 17:11:29 1998 [StL] + * + * Purpose: Read ASCII-format tabular (1.9 and later) save files. 
+ * + * HMMER 1.9 was only used internally at WashU, as far as + * I know, so this code shouldn't be terribly important + * to anyone. + */ +static int +read_asc19hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm) +{ + struct plan7_s *hmm; + FILE *fp; + char buffer[512]; + char *s; + int M; /* length of model */ + int k; /* state number */ + int x; /* symbol number */ + int atype; /* Alphabet type */ + + hmm = NULL; + fp = hmmfp->f; + if (feof(fp) || fgets(buffer, 512, fp) == NULL) return 0; + if (strncmp(buffer, "HMMER v1.9", 10) != 0) goto FAILURE; + + hmm = AllocPlan7Shell(); + /* read M from first line */ + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; M = atoi(s); /* model length */ + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; /* ignore alphabet size */ + if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; Plan7SetName(hmm, s); /* name */ + if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; /* alphabet type */ + s2upper(s); + if (strcmp(s, "AMINO") == 0) atype = hmmAMINO; + else if (strcmp(s, "NUCLEIC") == 0) atype = hmmNUCLEIC; + else goto FAILURE; + + if (Alphabet_type == hmmNOTSETYET) SetAlphabet(atype); + else if (atype != Alphabet_type) + Die("Alphabet mismatch error.\nI thought we were working with %s, but tried to read a %s HMM.\n", AlphabetType2String(Alphabet_type), AlphabetType2String(atype)); + + /* read alphabet, make sure it's Plan7-compatible... */ + if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; + if (strncmp(s, Alphabet, Alphabet_size) != 0) goto FAILURE; + + /* whether we have ref, cs info */ + if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; + if (strcmp(s, "yes") == 0) hmm->flags |= PLAN7_RF; + if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; + if (strcmp(s, "yes") == 0) hmm->flags |= PLAN7_CS; + + /* null model. 1.9 has emissions only. invent transitions. 
*/ + if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; + if (strcmp(s, "null") != 0) goto FAILURE; + for (x = 0; x < Alphabet_size; x++) { + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; + hmm->null[x] = ascii2prob(s, 1.0); + } + hmm->p1 = (Alphabet_type == hmmAMINO)? 350./351. : 1000./1001.; + + /* Done with header; check some stuff before proceeding + */ + if (feof(hmmfp->f)) goto FAILURE; + if (M < 1) goto FAILURE; + if (hmm->name == NULL) goto FAILURE; + if (Alphabet_type == hmmNOTSETYET) goto FAILURE; + + /* Allocate the model. Set up the probabilities that Plan9 + * doesn't set. + */ + AllocPlan7Body(hmm, M); + ZeroPlan7(hmm); + Plan7LSConfig(hmm); + + /* The zero row has: 4 or 20 unused scores for nonexistent M0 state + * then: B->M, tbd1, a B->I that Plan7 doesn't have; + * three unused D-> transitions; then three I0 transitions that Plan7 doesn't have; + * then two unused rf, cs annotations. + */ + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; /* position index ignored */ + for (x = 0; x < Alphabet_size; x++) + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; /* emissions ignored */ + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; + hmm->begin[1] = ascii2prob(s, 1.0); + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; + hmm->tbd1 = ascii2prob(s, 1.0); + /* renormalize */ + hmm->begin[1] = hmm->begin[1] / (hmm->begin[1] + hmm->tbd1); + hmm->tbd1 = hmm->tbd1 / (hmm->begin[1] + hmm->tbd1); + /* skip rest of line, seven integer fields, two char fields */ + for (x = 0; x < 7; x++) + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; + if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; + if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; + + /* main model: table of emissions, transitions, annotation */ + for (k = 1; k <= hmm->M; k++) + { + /* position index ignored */ + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; + /* match emissions */ + for (x = 0; x < Alphabet_size; 
x++) { + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; + hmm->mat[k][x] = ascii2prob(s, hmm->null[x]); + } + /* nine transitions; two are ignored */ + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; + if (k < hmm->M) hmm->t[k][TMM] = ascii2prob(s, 1.0); + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; + if (k < hmm->M) hmm->t[k][TMD] = (k == hmm->M) ? 0.0 : ascii2prob(s, 1.0); + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; + if (k < hmm->M) hmm->t[k][TMI] = ascii2prob(s, 1.0); + + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; + if (k < hmm->M) hmm->t[k][TDM] = ascii2prob(s, 1.0); + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; + if (k < hmm->M) hmm->t[k][TDD] = (k == hmm->M) ? 0.0 : ascii2prob(s, 1.0); + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;/* TDI ignored. */ + + /* no insert state at k == M, be careful */ + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; + if (k < hmm->M) hmm->t[k][TIM] = ascii2prob(s, 1.0); + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; /* TID ignored. 
*/ + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; + if (k < hmm->M) hmm->t[k][TII] = ascii2prob(s, 1.0); + + /* annotations */ + if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; + if (hmm->flags & PLAN7_RF) hmm->rf[k] = *s; + if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; + if (hmm->flags & PLAN7_CS) hmm->cs[k] = *s; + } + /* table of insert emissions; + * Plan7 has no insert state at 0 or M */ + for (k = 0; k <= hmm->M; k++) + { + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; /* position index ignored */ + for (x = 0; x < Alphabet_size; x++) { + if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; + if (k > 0 && k < hmm->M) + hmm->ins[k][x] = ascii2prob(s, hmm->null[x]); + } + } + + /* Set flags and return + */ + hmm->flags |= PLAN7_HASPROB; /* probabilities are valid */ + hmm->flags &= ~PLAN7_HASBITS; /* scores are not valid */ + Plan7Renormalize(hmm); + hmm->comlog = Strdup("[converted from an old Plan9 HMM]"); + Plan7SetCtime(hmm); + *ret_hmm = hmm; + return 1; + +FAILURE: + if (hmm != NULL) FreePlan7(hmm); + *ret_hmm = NULL; + return 1; +} + +static int +read_bin19hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm) +{ + unsigned int magic; + struct plan7_s *hmm; /* plan7 HMM */ + struct plan9_s *p9hmm; /* old style 1.x HMM */ + + /* Read the magic number; if we don't see it, then we + * must be out of data in the file. + */ + if (feof(hmmfp->f)) return 0; + if (! 
fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) return 0; + + p9hmm = read_plan9_binhmm(hmmfp->f, HMMER1_9B, hmmfp->byteswap); + if (p9hmm == NULL) { *ret_hmm = NULL; return 1; } + + Plan9toPlan7(p9hmm, &hmm); + + hmm->comlog = Strdup("[converted from an old Plan9 HMM]"); + Plan7SetCtime(hmm); + + P9FreeHMM(p9hmm); + *ret_hmm = hmm; + return 1; +} +static int +read_asc17hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm) +{ + struct plan7_s *hmm; /* plan7 HMM */ + struct plan9_s *p9hmm; /* old style 1.x HMM */ + char buffer[512]; + + /* Read the magic header; if we don't see it, then + * we must be out of data in the file. + */ + if (feof(hmmfp->f) || fgets(buffer, 512, hmmfp->f) == NULL) return 0; + + p9hmm = read_plan9_aschmm(hmmfp->f, HMMER1_7F); + if (p9hmm == NULL) { *ret_hmm = NULL; return 1; } + + Plan9toPlan7(p9hmm, &hmm); + + hmm->comlog = Strdup("[converted from an old Plan9 HMM]"); + Plan7SetCtime(hmm); + + P9FreeHMM(p9hmm); + Plan7Renormalize(hmm); + *ret_hmm = hmm; + return 1; +} + +static int +read_bin17hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm) +{ + unsigned int magic; + struct plan7_s *hmm; /* plan7 HMM */ + struct plan9_s *p9hmm; /* old style 1.x HMM */ + + /* Read the magic number; if we don't see it, then we + * must be out of data in the file. + */ + if (feof(hmmfp->f)) return 0; + if (! 
fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) return 0; + + p9hmm = read_plan9_binhmm(hmmfp->f, HMMER1_7B, hmmfp->byteswap); + if (p9hmm == NULL) { *ret_hmm = NULL; return 1; } + + Plan9toPlan7(p9hmm, &hmm); + + hmm->comlog = Strdup("[converted from an old Plan9 HMM]"); + Plan7SetCtime(hmm); + + P9FreeHMM(p9hmm); + *ret_hmm = hmm; + return 1; +} + +static int +read_asc11hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm) +{ + Die("1.1 ASCII HMMs unsupported"); + return 1; +} +static int +read_bin11hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm) +{ + unsigned int magic; + struct plan7_s *hmm; /* plan7 HMM */ + struct plan9_s *p9hmm; /* old style 1.x HMM */ + + /* Read the magic number; if we don't see it, then we + * must be out of data in the file. + */ + if (feof(hmmfp->f)) return 0; + if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) return 0; + + p9hmm = read_plan9_binhmm(hmmfp->f, HMMER1_1B, hmmfp->byteswap); + if (p9hmm == NULL) { *ret_hmm = NULL; return 1; } + + Plan9toPlan7(p9hmm, &hmm); + + hmm->comlog = Strdup("[converted from an old Plan9 HMM]"); + Plan7SetCtime(hmm); + + P9FreeHMM(p9hmm); + *ret_hmm = hmm; + return 1; +} + +static int +read_asc10hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm) +{ + Die("1.0 ASCII HMMs unsupported"); + return 1; +} + +static int +read_bin10hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm) +{ + unsigned int magic; + struct plan7_s *hmm; /* plan7 HMM */ + struct plan9_s *p9hmm; /* old style 1.x HMM */ + + /* Read the magic number; if we don't see it, then we + * must be out of data in the file. + */ + if (feof(hmmfp->f)) return 0; + if (! 
fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) return 0; + + p9hmm = read_plan9_binhmm(hmmfp->f, HMMER1_0B, hmmfp->byteswap); + if (p9hmm == NULL) { *ret_hmm = NULL; return 1; } + + Plan9toPlan7(p9hmm, &hmm); + + hmm->comlog = Strdup("[converted from an old Plan9 HMM]"); + Plan7SetCtime(hmm); + + P9FreeHMM(p9hmm); + *ret_hmm = hmm; + return 1; +} + +/***************************************************************** + * Some miscellaneous utility functions + *****************************************************************/ + +/* Function: prob2ascii() + * + * Purpose: Format a probability for output to an ASCII save + * file. Returns a ptr to a static internal buffer. + * + */ +static char * +prob2ascii(float p, float null) +{ + static char buffer[8]; + + if (p == 0.0) return "*"; + sprintf(buffer, "%6d", Prob2Score(p, null)); + return buffer; +} + + +/* Function: ascii2prob() + * + * Purpose: Convert a saved string back to a probability. + */ +static float +ascii2prob(char *s, float null) +{ + return (*s == '*') ? 0. : Score2Prob(atoi(s), null); +} + +/* Function: byteswap() + * + * Purpose: Swap between big-endian and little-endian. + * For example: + * int foo = 0x12345678; + * byteswap((char *) &foo, sizeof(int)); + * printf("%x\n", foo) + * gives 78563412. + * + * I don't fully understand byte-swapping issues. + * However, I have tested this on chars through floats, + * on various machines: + * SGI IRIX 4.0.5, SunOS 4.1.3, DEC Alpha OSF/1, Alliant + * + * Note: this is only a partial solution to the problem of + * binary file portability. 32 bit integers are assumed by HMMER, + * for instance. This should be true for all UNIX, VAX, and WinNT + * platforms, I believe. 
+ *
+ * Date: Sun Feb 12 10:26:22 1995
+ */
+static void
+byteswap(char *swap, int nbytes)
+{
+  int  x;
+  char byte;
+
+  /* reverse the buffer in place by exchanging mirrored bytes */
+  for (x = 0; x < nbytes / 2; x++)
+    {
+      byte = swap[nbytes - x - 1];
+      swap[nbytes - x - 1] = swap[x];
+      swap[x] = byte;
+    }
+}
+
+/* Function: write_bin_string()
+ * Date:     SRE, Wed Oct 29 13:49:27 1997 [TWA 721 over Canada]
+ *
+ * Purpose:  Write a string in binary save format: an integer
+ *           for the string length (including \0), followed by
+ *           the string. A NULL string is written as a length
+ *           of 0 with no string data after it.
+ */
+static void
+write_bin_string(FILE *fp, char *s)
+{
+  int len;
+  if (s != NULL)
+    {
+      len = strlen(s) + 1;
+      fwrite((char *) &len, sizeof(int),  1,   fp);
+      fwrite((char *) s,    sizeof(char), len, fp);
+    }
+  else
+    {
+      len = 0;
+      fwrite((char *) &len, sizeof(int), 1, fp);
+    }
+}
+
+/* Function: read_bin_string()
+ * Date:     SRE, Wed Oct 29 14:03:23 1997 [TWA 721]
+ *
+ * Purpose:  Read in a string from a binary file, where
+ *           the first integer is the length (including '\0').
+ *
+ * Args:     fp      - FILE to read from
+ *           doswap  - TRUE to byteswap
+ *           ret_s   - string to read into
+ *
+ * Return:   0 on failure (EOF, negative length, or truncated data).
+ *           1 on success. ret_s is malloc'ed here, except that a
+ *           stored length of 0 (what write_bin_string() emits for a
+ *           NULL string) is returned as a NULL ret_s.
+ */
+static int
+read_bin_string(FILE *fp, int doswap, char **ret_s)
+{
+  char *s;
+  int   len;
+
+  if (fread((char *) &len, sizeof(int), 1, fp) != 1) return 0;
+  if (doswap) byteswap((char *) &len, sizeof(int));
+
+  if (len < 0) return 0;        /* corrupt length field: don't try to allocate it */
+  if (len == 0)                 /* mirror of write_bin_string()'s NULL case */
+    {
+      *ret_s = NULL;
+      return 1;
+    }
+
+  s = MallocOrDie (sizeof(char) * len);
+  /* fread() returns the number of items read; anything short of len
+   * means the file is truncated, which the old "! fread()" test missed.
+   */
+  if (fread((char *) s, sizeof(char), len, fp) != (size_t) len)
+    {
+      free(s);
+      return 0;
+    }
+  s[len-1] = '\0';              /* stored length includes the terminator; enforce it */
+
+  *ret_s = s;
+  return 1;
+}
+
+/* Function: multiline()
+ * Date:     Mon Jan 5 14:57:50 1998 [StL]
+ *
+ * Purpose:  Given a record (like the comlog) that contains
+ *           multiple lines, print it as multiple lines with
+ *           a given prefix. e.g.:
+ *
+ *           given: "COM ", "foo\nbar\nbaz"
+ *           print: COM foo
+ *                  COM bar
+ *                  COM baz
+ *
+ *
+ *           Used to print the command log to ASCII save files.
+ *
+ * Args:     fp:   FILE to print to
+ *           pfx:  prefix for each line
+ *           s:    line to break up and print; tolerates a NULL
+ *
+ * Return:   (void)
+ */
+static void
+multiline(FILE *fp, char *pfx, char *s)
+{
+  char *buf;
+  char *sptr;
+
+  if (s == NULL) return;
+  buf  = Strdup(s);             /* strtok() writes into its argument: work on a copy */
+  sptr = strtok(buf, "\n");
+  while (sptr != NULL)
+    {
+      fprintf(fp, "%s%s\n", pfx, sptr);
+      sptr = strtok(NULL, "\n");
+    }
+  free(buf);
+}
+
+
+/*****************************************************************
+ * HMMER 1.x save file reading functions, modified from the
+ * corpse of 1.9m.
+ *****************************************************************/
+
+
+/* Function: read_bin_floats()
+ *
+ * Purpose:  Read n floats from a binary save file into dst,
+ *           byteswapping each one when swapped is TRUE.
+ *
+ * Return:   1 on success; 0 if fewer than n floats could be read.
+ */
+static int
+read_bin_floats(FILE *fp, float *dst, int n, int swapped)
+{
+  int i;
+
+  if (fread((char *) dst, sizeof(float), n, fp) != (size_t) n) return 0;
+  if (swapped)
+    for (i = 0; i < n; i++)
+      byteswap((char *) &(dst[i]), sizeof(float));
+  return 1;
+}
+
+/* Function: read_plan9_binhmm()
+ *
+ * Read old (Plan9) binary HMM save files from HMMER 1.9 and earlier.
+ * V1.0 saved regularizer and sympvec info, which V1.1 ignores.
+ * V1.7 and later may include optional ref, cs annotation lines.
+ * V1.9 added name, null model.
+ *
+ * Returns pointer to the HMM on success; NULL
+ * on failure. Sets global alphabet information based on
+ * whether it reads 4 or 20 as alphabet size (don't rely
+ * on ancient HMMER macro definitions).
+ *
+ * A partially read model is freed before NULL is returned, and a
+ * short read or failed seek anywhere in the file counts as failure.
+ */
+static struct plan9_s *
+read_plan9_binhmm(FILE *fp, int version, int swapped)
+{
+  struct plan9_s *hmm = NULL;
+  int   M;                      /* length of model                       */
+  int   k;                      /* state number                          */
+  int   len;                    /* length of variable length string      */
+  int   asize;                  /* alphabet size                         */
+  int   atype;                  /* alphabet type (read but ignored)      */
+  char  abet[20];               /* alphabet (read but ignored)           */
+
+  /* read M and alphabet size */
+  if (fread((char *) &(M),   sizeof(int), 1, fp) != 1) return NULL;
+  if (fread((char *) &asize, sizeof(int), 1, fp) != 1) return NULL;
+  if (swapped) {
+    byteswap((char *) &M,     sizeof(int));
+    byteswap((char *) &asize, sizeof(int));
+  }
+  if (M < 0) return NULL;       /* corrupt header: never hand a negative size to the allocator */
+
+  /* Set global alphabet information
+   */
+  if      (asize == 4)  atype = hmmNUCLEIC;
+  else if (asize == 20) atype = hmmAMINO;
+  else Die("A nonbiological alphabet size of %d; so I can't convert plan9 to plan7", asize);
+  if (Alphabet_type == hmmNOTSETYET) SetAlphabet(atype);
+  else if (atype != Alphabet_type)
+    Die("Alphabet mismatch error.\nI thought we were working with %s, but tried to read a %s HMM.\n", AlphabetType2String(Alphabet_type), AlphabetType2String(atype));
+
+  /* now, create space for hmm */
+  if ((hmm = P9AllocHMM(M)) == NULL)
+    Die("malloc failed for reading hmm in\n");
+
+  /* version 1.9+ files have a name */
+  if (version == HMMER1_9B) {
+    if (fread((char *) &len, sizeof(int), 1, fp) != 1) goto FAILURE;
+    if (swapped) byteswap((char *) &len, sizeof(int));
+    if (len < 0) goto FAILURE;
+    hmm->name = (char *) ReallocOrDie (hmm->name, sizeof(char) * (len+1));
+    /* a stored length of 0 now yields an empty name instead of a failure */
+    if (fread((char *) hmm->name, sizeof(char), len, fp) != (size_t) len) goto FAILURE;
+    hmm->name[len] = '\0';
+  }
+
+  /* read alphabet_type and alphabet, but ignore: we've already set them */
+  if (fread((char *) &atype, sizeof(int), 1, fp) != 1) goto FAILURE;
+  if (fread((char *) abet, sizeof(char), Alphabet_size, fp) != (size_t) Alphabet_size) goto FAILURE;
+
+  /* skip the random symbol frequencies in V1.0 */
+  if (version == HMMER1_0B &&
+      fseek(fp, (long) (sizeof(float) * Alphabet_size), SEEK_CUR) != 0) goto FAILURE;
+
+  /* Get optional info in V1.7 and later
+   */
+  if (version == HMMER1_7B || version == HMMER1_9B)
+    {
+      if (fread((char *) &(hmm->flags), sizeof(int), 1, fp) != 1) goto FAILURE;
+      if (swapped) byteswap((char *) &hmm->flags, sizeof(int));
+      if ((hmm->flags & HMM_REF) &&
+          fread((char *) hmm->ref, sizeof(char), hmm->M+1, fp) != (size_t) (hmm->M+1)) goto FAILURE;
+      hmm->ref[hmm->M+1] = '\0';
+      if ((hmm->flags & HMM_CS) &&
+          fread((char *) hmm->cs,  sizeof(char), hmm->M+1, fp) != (size_t) (hmm->M+1)) goto FAILURE;
+      hmm->cs[hmm->M+1]  = '\0';
+    }
+
+  /* Get the null model in V1.9 and later
+   */
+  if (version == HMMER1_9B)
+    {
+      if (! read_bin_floats(fp, hmm->null, Alphabet_size, swapped)) goto FAILURE;
+    }
+  else P9DefaultNullModel(hmm->null);
+
+  /* everything else is states */
+  for (k = 0; k <= hmm->M; k++)
+    {
+      /* get match state info */
+      if (! read_bin_floats(fp, &(hmm->mat[k].t[MATCH]),  1, swapped)) goto FAILURE;
+      if (! read_bin_floats(fp, &(hmm->mat[k].t[DELETE]), 1, swapped)) goto FAILURE;
+      if (! read_bin_floats(fp, &(hmm->mat[k].t[INSERT]), 1, swapped)) goto FAILURE;
+      if (! read_bin_floats(fp, hmm->mat[k].p, Alphabet_size, swapped)) goto FAILURE;
+
+      /* skip the regularizer info in V1.0 */
+      if (version == HMMER1_0B &&
+          fseek(fp, (long)(sizeof(float) * (3 + Alphabet_size)), SEEK_CUR) != 0) goto FAILURE;
+
+      /* get delete state info */
+      if (! read_bin_floats(fp, &(hmm->del[k].t[MATCH]),  1, swapped)) goto FAILURE;
+      if (! read_bin_floats(fp, &(hmm->del[k].t[DELETE]), 1, swapped)) goto FAILURE;
+      if (! read_bin_floats(fp, &(hmm->del[k].t[INSERT]), 1, swapped)) goto FAILURE;
+
+      /* skip the regularizer info in V1.0 */
+      if (version == HMMER1_0B &&
+          fseek(fp, (long)(sizeof(float) * 3), SEEK_CUR) != 0) goto FAILURE;
+
+      /* get insert state info */
+      if (! read_bin_floats(fp, &(hmm->ins[k].t[MATCH]),  1, swapped)) goto FAILURE;
+      if (! read_bin_floats(fp, &(hmm->ins[k].t[DELETE]), 1, swapped)) goto FAILURE;
+      if (! read_bin_floats(fp, &(hmm->ins[k].t[INSERT]), 1, swapped)) goto FAILURE;
+      if (! read_bin_floats(fp, hmm->ins[k].p, Alphabet_size, swapped)) goto FAILURE;
+
+      /* skip the regularizer info in V1.0 */
+      if (version == HMMER1_0B &&
+          fseek(fp, (long)(sizeof(float) * (3 + Alphabet_size)), SEEK_CUR) != 0) goto FAILURE;
+    }
+  P9Renormalize(hmm);
+  return hmm;
+
+FAILURE:
+  /* the model was allocated above; release it rather than leak it */
+  P9FreeHMM(hmm);
+  return NULL;
+}
+
+
+/* Function: read_plan9_aschmm()
+ *
+ * Purpose:  Read ASCII-format save files from 1.8.4 and earlier.
+ *           V1.0 contained sympvec and regularizers; these are ignored
+ *           in V1.1 and later
+ *           V1.7 and later contain ref and cs annotation.
+ *
+ * Args:     fp      - open save file, header has been read already
+ *           version - HMMER1_7F, for instance
+ *
+ * Returns ptr to the (allocated) new HMM on success,
+ * or NULL on failure.
+ */
+static int read_asc_float(FILE *fp, float *ret);
+static int skip_asc_lines(FILE *fp, int n);
+
+static struct plan9_s *
+read_plan9_aschmm(FILE *fp, int version)
+{
+  struct plan9_s *hmm = NULL;
+  int   M;                      /* length of model  */
+  char  buffer[512];
+  char *statetype;
+  char *s;
+  int   k;                      /* state number  */
+  int   i;                      /* symbol number */
+  int   asize;                  /* Alphabet size */
+  int   atype;                  /* Alphabet type */
+
+  /* read M from first line */
+  if (fgets(buffer, 512, fp) == NULL) return NULL;
+  if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
+  if (!isdigit((unsigned char) (*s))) return NULL;
+  M = atoi(s);
+  /* read alphabet_length */
+  if (fgets(buffer, 512, fp) == NULL) return NULL;
+  if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
+  if (!isdigit((unsigned char) (*s))) return NULL;
+  asize = atoi(s);
+
+  /* Set global alphabet information
+   */
+  if      (asize == 4)  atype = hmmNUCLEIC;
+  else if (asize == 20) atype = hmmAMINO;
+  else Die("A nonbiological alphabet size of %d; so I can't convert plan9 to plan7", asize);
+  if (Alphabet_type == hmmNOTSETYET) SetAlphabet(atype);
+  else if (atype != Alphabet_type)
+    Die("Alphabet mismatch error.\nI thought we were working with %s, but tried to read a %s HMM.\n", AlphabetType2String(Alphabet_type), AlphabetType2String(atype));
+
+  /* now, create space for hmm */
+  if ((hmm = P9AllocHMM(M)) == NULL)
+    Die("malloc failed for reading hmm in\n");
+
+  /* From here on the model exists, so every failure goes through
+   * FAILURE to free it instead of leaking it.
+   */
+
+  /* read alphabet_type but ignore */
+  if (fgets(buffer, 512, fp) == NULL) goto FAILURE;
+  if ((s = strtok(buffer, " \t\n")) == NULL) goto FAILURE;
+  if (!isdigit((unsigned char) (*s))) goto FAILURE;
+  /* read alphabet but ignore */
+  if (fgets(buffer, 512, fp) == NULL) goto FAILURE;
+  if ((s = strtok(buffer, " \t\n")) == NULL) goto FAILURE;
+
+  /* skip the random symbol frequencies in V1.0 files. now unused */
+  if (version == HMMER1_0F && ! skip_asc_lines(fp, Alphabet_size)) goto FAILURE;
+
+  /* V1.7 has lines for whether we have valid ref, cs info
+   */
+  if (version == HMMER1_7F)
+    {
+      if (fgets(buffer, 512, fp) == NULL) goto FAILURE;
+      if (strncmp(buffer, "yes", 3) == 0) hmm->flags |= HMM_REF;
+      if (fgets(buffer, 512, fp) == NULL) goto FAILURE;
+      if (strncmp(buffer, "yes", 3) == 0) hmm->flags |= HMM_CS;
+    }
+
+  /* everything else is states */
+  while (fgets(buffer, 512, fp) != NULL)
+    {
+      /* get state type and index info */
+      if ((statetype = strtok(buffer, " \t\n")) == NULL) goto FAILURE;
+      if ((s = strtok((char *) NULL, " \t\n")) == NULL) goto FAILURE;
+      if (!isdigit((unsigned char) (*s))) goto FAILURE;
+      k = atoi(s);
+      if (k < 0 || k > hmm->M+1) goto FAILURE;
+
+      if (strcmp(statetype, "###MATCH_STATE") == 0)
+        {
+          /* V1.7: get ref, cs info: */
+          /* ###MATCH_STATE 16 (x) (H) */
+          if (version == HMMER1_7F)
+            {
+              /* strtok() returns NULL when nothing follows the index */
+              if ((s = strtok(NULL, "\n")) == NULL) goto FAILURE;
+              while (*s != '(' && *s != '\0') s++;
+              if (*s != '(') goto FAILURE;
+              hmm->ref[k] = *(s+1);
+              /* Step off the first '(' before looking for the second one;
+               * without this the scan stops where it started and cs[k]
+               * is just a copy of ref[k].
+               */
+              s++;
+              while (*s != '(' && *s != '\0') s++;
+              if (*s != '(') goto FAILURE;
+              hmm->cs[k] = *(s+1);
+            }
+
+          if (! read_asc_float(fp, &(hmm->mat[k].t[MATCH])))  goto FAILURE;
+          if (! read_asc_float(fp, &(hmm->mat[k].t[DELETE]))) goto FAILURE;
+          if (! read_asc_float(fp, &(hmm->mat[k].t[INSERT]))) goto FAILURE;
+          for (i = 0; i < Alphabet_size; i++)
+            if (! read_asc_float(fp, &(hmm->mat[k].p[i]))) goto FAILURE;
+
+          /* Skip all regularizer info for V1.0 */
+          if (version == HMMER1_0F && ! skip_asc_lines(fp, Alphabet_size + 3)) goto FAILURE;
+        }
+      else if (strcmp(statetype, "###INSERT_STATE") == 0)
+        {
+          if (! read_asc_float(fp, &(hmm->ins[k].t[MATCH])))  goto FAILURE;
+          if (! read_asc_float(fp, &(hmm->ins[k].t[DELETE]))) goto FAILURE;
+          if (! read_asc_float(fp, &(hmm->ins[k].t[INSERT]))) goto FAILURE;
+          for (i = 0; i < Alphabet_size; i++)
+            if (! read_asc_float(fp, &(hmm->ins[k].p[i]))) goto FAILURE;
+
+          /* Skip all regularizer info in V1.0 files */
+          if (version == HMMER1_0F && ! skip_asc_lines(fp, Alphabet_size + 3)) goto FAILURE;
+        }
+      else if (strcmp(statetype, "###DELETE_STATE") == 0)
+        {
+          if (! read_asc_float(fp, &(hmm->del[k].t[MATCH])))  goto FAILURE;
+          if (! read_asc_float(fp, &(hmm->del[k].t[DELETE]))) goto FAILURE;
+          if (! read_asc_float(fp, &(hmm->del[k].t[INSERT]))) goto FAILURE;
+
+          /* Skip all regularizer info in V1.0 files*/
+          if (version == HMMER1_0F && ! skip_asc_lines(fp, 3)) goto FAILURE;
+        }
+      else
+        goto FAILURE;
+    }
+
+  P9DefaultNullModel(hmm->null);
+  P9Renormalize(hmm);
+  return hmm;
+
+FAILURE:
+  P9FreeHMM(hmm);
+  return NULL;
+}
+
+/* Function: read_asc_float()
+ *
+ * Purpose:  Read one line from fp and convert its first
+ *           whitespace-delimited token to a float.
+ *
+ * Return:   1 on success; 0 on EOF or if the line has no token.
+ */
+static int
+read_asc_float(FILE *fp, float *ret)
+{
+  char  line[512];
+  char *tok;
+
+  if (fgets(line, 512, fp) == NULL)          return 0;
+  if ((tok = strtok(line, " \t\n")) == NULL) return 0;
+  *ret = (float) atof(tok);
+  return 1;
+}
+
+/* Function: skip_asc_lines()
+ *
+ * Purpose:  Read and discard n fgets()-sized lines from fp.
+ *
+ * Return:   1 on success; 0 if EOF is reached first.
+ */
+static int
+skip_asc_lines(FILE *fp, int n)
+{
+  char line[512];
+
+  while (n-- > 0)
+    if (fgets(line, 512, fp) == NULL) return 0;
+  return 1;
+}
diff --git a/forester/archive/RIO/others/hmmer/src/hmmpfam-pvm.c b/forester/archive/RIO/others/hmmer/src/hmmpfam-pvm.c
new file mode 100644
index 0000000..ea75d20
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/src/hmmpfam-pvm.c
@@ -0,0 +1,229 @@
+/************************************************************
+ * HMMER - 
Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ * This source code is distributed under the terms of the
+ * GNU General Public License. See the files COPYING and LICENSE
+ * for details.
+ ************************************************************/
+
+#ifdef HMMER_PVM
+
+/* hmmslave-pvm.c
+ * SRE, Sun Jul 12 17:15:36 1998
+ *
+ * PVM slave for hmmpfam-pvm and hmmsearch-pvm.
+ * RCS $Id: hmmpfam-pvm.c,v 1.1.1.1 2005/03/22 08:34:15 cmzmasek Exp $
+ */
+
+/* NOTE(review): the three system header names were lost from this copy of
+ * the patch; they are restored from what the code below uses (atexit(),
+ * the pvm_*() calls, printf-style debugging) - confirm against upstream.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <pvm3.h>
+
+#include "version.h"
+#include "structs.h"            /* data structures, macros, #define's */
+#include "config.h"             /* compile-time configuration constants */
+#include "funcs.h"              /* function declarations */
+#include "globals.h"            /* alphabet global variables */
+#include "squid.h"              /* general sequence analysis library */
+
+static void leave_pvm(void);
+
+int
+main(void)
+{
+  struct p7trace_s *tr;         /* traceback of an alignment */
+  int      master_tid;          /* PVM TID of our master */
+  char    *hmmfile;             /* file to read HMM(s) from */
+  HMMFILE *hmmfp;               /* opened hmmfile for reading */
+  struct plan7_s *hmm;
+  char    *seq;
+  char    *dsq;
+  int      len;
+  int      nhmm;                /* number of HMM to work on */
+  float    sc;
+  int      my_idx = -1;         /* my index, 0..nslaves-1 */
+  double   pvalue;              /* Z*pvalue = Evalue */
+  double   evalue;              /* upper bound on evalue */
+  struct threshold_s thresh;    /* threshold settings */
+  int      send_trace;          /* TRUE if score is significant */
+  int      do_xnu;              /* TRUE to do XNU filter on seq */
+  int      do_forward;          /* TRUE to use Forward() scores not Viterbi */
+  int      do_null2;            /* TRUE to correct scores w/ ad hoc null2 */
+  int      alphatype;           /* alphabet type, hmmAMINO or hmmNUCLEIC */
+  int      code;                /* return code after initialization */
+
+
+  SQD_DPRINTF1(("a slave reporting for duty!\n"));
+
+  /* Register leave_pvm() cleanup function so any exit() call
+   * first calls pvm_exit().
+   */
+  if (atexit(leave_pvm) != 0) { pvm_exit(); Die("slave couldn't register leave_pvm()"); }
+
+  /*****************************************************************
+   * initialization.
+   * Master broadcasts to us:
+   *     1) len of HMM file name        (int)
+   *     2) name of HMM file            (string)
+   *     3) length of sequence string   (int)
+   *     4) sequence                    (string)
+   *     5) globT threshold
+   *     6) globE threshold
+   *     7) Z
+   *     8) autocut setting
+   *     9) do_xnu flag
+   *    10) do_forward flag
+   *    11) do_null2 flag
+   *    12) alphabet type
+   * We receive the broadcast and open the files.
+   ******************************************************************/
+
+  master_tid = pvm_parent();    /* who's our master? */
+  SQD_DPRINTF1(("I know my master is %d\n", master_tid));
+
+  pvm_recv(master_tid, HMMPVM_INIT);
+  pvm_upkint(&len, 1, 1);
+  /* these are character buffers: size them in chars, not pointers */
+  hmmfile = MallocOrDie(sizeof(char) * (len+1));
+  pvm_upkstr(hmmfile);
+  pvm_upkint(&len, 1, 1);
+  seq = MallocOrDie(sizeof(char) * (len+1));
+  pvm_upkstr(seq);
+  pvm_upkfloat(&(thresh.globT), 1, 1);
+  pvm_upkdouble(&(thresh.globE), 1, 1);
+  pvm_upkint(&(thresh.Z), 1, 1);
+  pvm_upkint((int *) &(thresh.autocut), 1, 1);
+  pvm_upkint(&do_xnu, 1, 1);
+  pvm_upkint(&do_forward, 1, 1);
+  pvm_upkint(&do_null2, 1, 1);
+  pvm_upkint(&alphatype, 1, 1);
+  SQD_DPRINTF1(("My master has told me how to initialize, and I am happy.\n"));
+
+  SetAlphabet(alphatype);
+                                /* Open HMM file (maybe in HMMERDB) */
+  code = HMMPVM_OK;
+  if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
+    code = HMMPVM_NO_HMMFILE;
+  else if (hmmfp->ssi == NULL)
+    code = HMMPVM_NO_INDEX;
+
+  /* report our status.
+   */
+  pvm_initsend(PvmDataDefault);
+  pvm_pkint(&code, 1, 1);
+  PVMPackString(RELEASE);       /* proofing against bug#1 */
+  pvm_send(master_tid, HMMPVM_RESULTS);
+  SQD_DPRINTF1(("I have told my master my initialization status and I await his command.\n"));
+
+  dsq = DigitizeSequence(seq, len);
+  if (do_xnu) XNU(dsq, len);
+
+  /*****************************************************************
+   * Main loop.
+   * Receive an integer 0..nhmm-1 for which HMM to search against.
+   * If we receive a -1, we shut down.
+   *****************************************************************/
+
+  for (;;)
+    {
+      pvm_recv(master_tid, HMMPVM_WORK);
+      pvm_upkint(&nhmm, 1, 1);
+      if (my_idx < 0) my_idx = nhmm; /* first time thru, remember what index we are. */
+
+      if (nhmm == -1) {         /* shutdown signal */
+        SQD_DPRINTF1(("I've been told to shut down."));
+        break;
+      }
+
+      /* The open may have failed above (we reported that to the master);
+       * being handed work anyway is fatal, not a NULL dereference.
+       */
+      if (hmmfp == NULL) Die("master sent work, but the HMM file was never opened");
+
+      /* move to our assigned HMM in the HMM file, and read it
+       */
+      SQD_DPRINTF1(("The master says to do HMM #%d - I hear and obey\n", nhmm));
+      if (! HMMFilePositionByIndex(hmmfp, nhmm)) Die("didn't position the HMM file");
+      if (! HMMFileRead(hmmfp, &hmm))            Die("unexpected end of HMM file");
+      if (hmm == NULL)                           Die("unexpected failure to parse HMM file");
+      P7Logoddsify(hmm, TRUE);
+
+      /* set Pfam specific score thresholds if needed */
+      if (! SetAutocuts(&thresh, hmm))
+        Die("HMM %s doesn't have the score cutoffs you wanted", hmm->name);
+
+      /* Score sequence, do alignment (Viterbi), recover trace
+       */
+      if (P7ViterbiSize(len, hmm->M) <= RAMLIMIT)
+        {
+          SQD_DPRINTF1(("P7Viterbi(): Estimated size %d Mb\n", P7ViterbiSize(len, hmm->M)));
+          sc = P7Viterbi(dsq, len, hmm, &tr);
+        }
+      else
+        {
+          SQD_DPRINTF1(("P7SmallViterbi() called; %d Mb > %d\n", P7ViterbiSize(len, hmm->M), RAMLIMIT));
+          sc = P7SmallViterbi(dsq, len, hmm, &tr);
+        }
+
+      /* The Forward score override.
+       * See comments in hmmpfam.c in serial version.
+       */
+      if (do_forward) {
+        sc = P7Forward(dsq, len, hmm, NULL);
+        if (do_null2) sc -= TraceScoreCorrection(hmm, tr, dsq);
+      }
+
+      pvalue = PValue(hmm, sc);
+      evalue = thresh.Z ? (double) thresh.Z * pvalue : (double) nhmm * pvalue;
+      send_trace = (sc >= thresh.globT && evalue <= thresh.globE) ? 1 : 0;
+
+      /* return output
+       */
+      pvm_initsend(PvmDataDefault);
+      pvm_pkint(&my_idx, 1, 1); /* tell master who we are */
+      pvm_pkstr(hmm->name);     /* double check that we did the right thing */
+      pvm_pkfloat(&sc, 1, 1);
+      pvm_pkdouble(&pvalue, 1, 1);
+      pvm_pkint(&send_trace, 1, 1); /* flag for whether a trace structure is coming */
+      if (send_trace) PVMPackTrace(tr);
+      pvm_send(master_tid, HMMPVM_RESULTS);
+
+      /* cleanup
+       */
+      FreePlan7(hmm);
+      P7FreeTrace(tr);
+    }
+
+  /***********************************************
+   * Cleanup, return.
+   ***********************************************/
+
+  if (hmmfp != NULL) HMMFileClose(hmmfp); /* NULL if the open failed during init */
+  free(seq);
+  free(dsq);
+  free(hmmfile);
+  return 0;
+}
+
+
+/* Function: leave_pvm()
+ *
+ * Purpose:  Cleanup function, to deal with crashes. We register
+ *           this function using atexit() so it gets called before
+ *           the slave dies.
+ */
+static void leave_pvm(void)
+{
+  SQD_DPRINTF1(("slave leaving PVM.\n"));
+  pvm_exit();
+}
+
+
+
+#else /* if HMMER_PVM not defined: include a dummy */
+
+#include <stdio.h>
+#include <stdlib.h>             /* exit() */
+int main(void)
+{
+  printf("hmmpfam-slave is disabled. PVM support was not compiled into HMMER.\n");
+  exit(0);
+}
+
+#endif
+
diff --git a/forester/archive/RIO/others/hmmer/src/hmmpfam.c b/forester/archive/RIO/others/hmmer/src/hmmpfam.c
new file mode 100644
index 0000000..4d49f71
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/src/hmmpfam.c
@@ -0,0 +1,1094 @@
+/************************************************************
+ * HMMER - Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ * This source code is distributed under the terms of the
+ * GNU General Public License. 
See the files COPYING and LICENSE
+ * for details.
+ ************************************************************/
+
+/* hmmpfam.c
+ * SRE, Mon Aug 25 17:03:14 1997 [Denver]
+ *
+ * Search a single sequence against an HMM database.
+ * Conditionally includes PVM parallelization when HMMER_PVM is defined
+ * at compile time; hmmpfam --pvm runs the PVM version.
+ *
+ * CVS $Id: hmmpfam.c,v 1.1.1.1 2005/03/22 08:34:13 cmzmasek Exp $
+ */
+
+/* NOTE(review): the system header names were lost from this copy of the
+ * patch. They are restored from what main() uses: printf/puts, atoi/atof/
+ * exit, strcmp, FLT_MAX, INT_MAX, pthread_mutex_t and the PVM build -
+ * confirm against upstream.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <float.h>
+#include <limits.h>
+#ifdef HMMER_THREADS
+#include <pthread.h>
+#endif
+#ifdef HMMER_PVM
+#include <pvm3.h>
+#endif
+
+#include "squid.h"              /* general sequence analysis library */
+#include "config.h"             /* compile-time configuration constants */
+#include "structs.h"            /* data structures, macros, #define's */
+#include "funcs.h"              /* function declarations */
+#include "globals.h"            /* alphabet global variables */
+#include "version.h"            /* version info */
+
+static char banner[] = "hmmpfam - search one or more sequences against HMM database";
+
+/* NOTE(review): the <...> argument placeholders below were also lost from
+ * this copy; they are restored to match the argument types in OPTIONS[]
+ * (<n> int, <x> float, <s> string) and the two positional arguments that
+ * main() requires - confirm the exact wording against upstream.
+ */
+static char usage[] = "\
+Usage: hmmpfam [-options] <hmm database> <sequence file or database>\n\
+  Available options are:\n\
+   -h        : help; print brief help on version and usage\n\
+   -n        : nucleic acid models/sequence (default protein)\n\
+   -A <n>    : sets alignment output limit to <n> best domain alignments\n\
+   -E <x>    : sets E value cutoff (globE) to <x>; default 10\n\
+   -T <x>    : sets T bit threshold (globT) to <x>; no threshold by default\n\
+   -Z <n>    : sets Z (# models) for E-value calculation\n\
+";
+
+static char experts[] = "\
+   --acc         : use HMM accession numbers instead of names in output\n\
+   --compat      : make best effort to use last version's output style\n\
+   --cpu <n>     : run <n> threads in parallel (if threaded)\n\
+   --cut_ga      : use Pfam GA gathering threshold cutoffs\n\
+   --cut_nc      : use Pfam NC noise threshold cutoffs\n\
+   --cut_tc      : use Pfam TC trusted threshold cutoffs\n\
+   --domE <x>    : sets domain Eval cutoff (2nd threshold) to <x>\n\
+   --domT <x>    : sets domain T bit thresh (2nd threshold) to <x>\n\
+   --forward     : use the full Forward() algorithm instead of Viterbi\n\
+   --informat <s>: sequence file is in format <s>, not FASTA\n\
+   --null2       : turn OFF the post hoc second null model\n\
+   --pvm         : run on a PVM (Parallel Virtual Machine) cluster\n\
+   --xnu         : turn ON XNU filtering of query protein sequence\n\
+\n";
+
+
+/* Option table handed to Getopt(): option name, a TRUE/FALSE flag, and the
+ * type of argument the option takes.
+ * NOTE(review): the flag is TRUE for exactly the single-dash options here;
+ * presumably it marks single-letter options - verify against squid's opt_s.
+ */
+static struct opt_s OPTIONS[] = {
+  { "-h",        TRUE,  sqdARG_NONE },
+  { "-n",        TRUE,  sqdARG_NONE },
+  { "-A",        TRUE,  sqdARG_INT  },
+  { "-E",        TRUE,  sqdARG_FLOAT},
+  { "-T",        TRUE,  sqdARG_FLOAT},
+  { "-Z",        TRUE,  sqdARG_INT  },
+  { "--acc",     FALSE, sqdARG_NONE },
+  { "--compat",  FALSE, sqdARG_NONE },
+  { "--cpu",     FALSE, sqdARG_INT  },
+  { "--cut_ga",  FALSE, sqdARG_NONE },
+  { "--cut_nc",  FALSE, sqdARG_NONE },
+  { "--cut_tc",  FALSE, sqdARG_NONE },
+  { "--domE",    FALSE, sqdARG_FLOAT},
+  { "--domT",    FALSE, sqdARG_FLOAT},
+  { "--forward", FALSE, sqdARG_NONE },
+  { "--informat",FALSE, sqdARG_STRING},
+  { "--null2",   FALSE, sqdARG_NONE },
+  { "--pvm",     FALSE, sqdARG_NONE },
+  { "--xnu",     FALSE, sqdARG_NONE },
+};
+#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
+
+
+
+#ifdef HMMER_THREADS
+/* POSIX threads version:
+ * the threads share a workpool_s structure amongst themselves,
+ * for obtaining locks on input HMM file and output histogram and
+ * tophits structures. 
 */
/* State shared between the caller and every worker thread of one search.
 * One instance is built by workpool_start() and handed to each
 * worker_thread() as its single argument.
 */
struct workpool_s {
  /* Shared configuration resources that don't change:
   */
  char *hmmfile;                /* name of HMM file */
  char *dsq;                    /* digitized query sequence */
  char *seqname;                /* sequence name */
  int L;                        /* length of dsq */
  int do_forward;               /* TRUE to score using Forward */
  int do_null2;                 /* TRUE to apply null2 correction */
  struct threshold_s *thresh;   /* score/evalue cutoff information */

  /* Shared (mutex-protected) input resources:
   */
  HMMFILE *hmmfp;               /* ptr to open HMM file */
  int nhmm;                     /* number of HMMs searched so far */
  pthread_mutex_t input_lock;   /* mutex for locking input */

  /* Shared (mutex-protected) output resources:
   */
  struct tophit_s *ghit;        /* per-sequence top hits */
  struct tophit_s *dhit;        /* per-domain top hits */
  pthread_mutex_t output_lock;  /* mutex for locking output */

  /* Thread pool information
   */
  pthread_t *thread;            /* our pool of threads */
  int num_threads;              /* number of threads */
};

static struct workpool_s *workpool_start(char *hmmfile, HMMFILE *hmmfp,
                                         char *dsq, char *seqname, int L,
                                         int do_forward, int do_null2,
                                         struct threshold_s *thresh,
                                         struct tophit_s *ghit, struct tophit_s *dhit,
                                         int num_threads);
static void workpool_stop(struct workpool_s *wpool);
static void workpool_free(struct workpool_s *wpool);
static void *worker_thread(void *ptr);
#endif /* HMMER_THREADS */


#ifdef HMMER_PVM
static void main_loop_pvm(char *hmmfile, HMMFILE *hmmfp, char *seq, SQINFO *sqinfo,
                          struct threshold_s *thresh, int do_xnu, int do_forward, int do_null2,
                          struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nhmm);
#endif
static void main_loop_serial(char *hmmfile, HMMFILE *hmmfp, char *seq, SQINFO *sqinfo,
                             struct threshold_s *thresh, int do_xnu, int do_forward, int do_null2,
                             int num_threads,
                             struct tophit_s *ghit, struct tophit_s *dhit, int *nhmm);

/* Function: main()
 *
 * Purpose:  Command-line driver. Parses the options, opens the sequence
 *           file and the HMM database, and then, for every sequence read
 *           from the sequence file, searches it against all HMMs (via
 *           main_loop_serial() or main_loop_pvm()) and prints three
 *           reports: per-sequence scores, per-domain scores, and the
 *           domain alignments.
 *
 * Args:     argc, argv - command line; after the options exactly two
 *                        arguments must remain: the HMM file and the
 *                        sequence file, in that order.
 *
 * Returns:  0 after all sequences are processed. Usage and I/O problems
 *           are reported through Die().
 */
int
main(int argc, char **argv)
{
  char *hmmfile;                /* file to read HMMs from */
  HMMFILE *hmmfp;               /* opened hmmfile for reading */
  char *seqfile;                /* file to read target sequence from */
  SQFILE *sqfp;                 /* opened seqfile for reading */
  int format;                   /* format of seqfile */
  char *seq;                    /* target sequence */
  SQINFO sqinfo;                /* optional info for seq */
  struct fancyali_s *ali;       /* an alignment for display */
  struct tophit_s *ghit;        /* list of top hits and alignments for seq */
  struct tophit_s *dhit;        /* list of top hits/alignments for domains */

  float sc;                     /* log-odds score in bits */
  double pvalue;                /* pvalue of an HMM score */
  double evalue;                /* evalue of an HMM score */
  double motherp;               /* pvalue of a whole seq HMM score */
  float mothersc;               /* score of a whole seq parent of domain */
  int sqfrom, sqto;             /* coordinates in sequence */
  int hmmfrom, hmmto;           /* coordinate in HMM */
  char *name, *acc, *desc;      /* hit HMM name, accession, description */
  int hmmlen;                   /* length of HMM hit */
  int nhmm;                     /* number of HMMs searched */
  int domidx;                   /* number of this domain */
  int ndom;                     /* total # of domains in this seq */
  int namewidth;                /* max width of printed HMM name */
  int descwidth;                /* max width of printed description */

  int Alimit;                   /* A parameter limiting output alignments */
  struct threshold_s thresh;    /* contains all threshold (cutoff) info */

  char *optname;                /* name of option found by Getopt() */
  char *optarg;                 /* argument found by Getopt() */
  int optind;                   /* index in argv[] */
  int do_forward;               /* TRUE to use Forward() not Viterbi() */
  int do_nucleic;               /* TRUE to do DNA/RNA instead of protein */
  int do_null2;                 /* TRUE to adjust scores with null model #2 */
  int do_pvm;                   /* TRUE to run on PVM */
  int do_xnu;                   /* TRUE to do XNU filtering */
  int be_backwards;             /* TRUE to be backwards-compatible in output*/
  int show_acc;                 /* TRUE to sub HMM accessions for names */
  int i;
  int nreported;

  int num_threads;              /* number of worker threads */

  /***********************************************
   * Parse command line
   ***********************************************/

  format = SQFILE_UNKNOWN;      /* default: autodetect format w/ Babelfish */
  do_forward = FALSE;
  do_nucleic = FALSE;
  do_null2 = TRUE;
  do_pvm = FALSE;
  do_xnu = FALSE;
  be_backwards= FALSE;
  show_acc = FALSE;

  Alimit = INT_MAX;             /* no limit on alignment output */
  thresh.globE = 10.0;          /* use a reasonable Eval threshold; */
  thresh.globT = -FLT_MAX;      /* but no bit threshold, */
  thresh.domT = -FLT_MAX;       /* no domain bit threshold, */
  thresh.domE = FLT_MAX;        /* and no domain Eval threshold. */
  thresh.autocut = CUT_NONE;    /* and no Pfam cutoffs used. */
  thresh.Z = 0;                 /* Z not preset, so determined by # of HMMs */

#ifdef HMMER_THREADS
  num_threads = ThreadNumber(); /* only matters if we're threaded */
#else
  num_threads = 0;
#endif

  while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
                &optind, &optname, &optarg)) {
    if (strcmp(optname, "-n") == 0) do_nucleic = TRUE;
    else if (strcmp(optname, "-A") == 0) Alimit = atoi(optarg);
    else if (strcmp(optname, "-E") == 0) thresh.globE = atof(optarg);
    else if (strcmp(optname, "-T") == 0) thresh.globT = atof(optarg);
    else if (strcmp(optname, "-Z") == 0) thresh.Z = atoi(optarg);
    else if (strcmp(optname, "--acc") == 0) show_acc = TRUE;
    else if (strcmp(optname, "--compat") == 0) be_backwards = TRUE;
    else if (strcmp(optname, "--cpu") == 0) num_threads = atoi(optarg);
    else if (strcmp(optname, "--cut_ga") == 0) thresh.autocut = CUT_GA;
    else if (strcmp(optname, "--cut_nc") == 0) thresh.autocut = CUT_NC;
    else if (strcmp(optname, "--cut_tc") == 0) thresh.autocut = CUT_TC;
    else if (strcmp(optname, "--domE") == 0) thresh.domE = atof(optarg);
    else if (strcmp(optname, "--domT") == 0) thresh.domT = atof(optarg);
    else if (strcmp(optname, "--forward") == 0) do_forward = TRUE;
    /* note: --null2 switches the null2 correction OFF (it is on by default) */
    else if (strcmp(optname, "--null2") == 0) do_null2 = FALSE;
    else if (strcmp(optname, "--pvm") == 0) do_pvm = TRUE;
    else if (strcmp(optname, "--xnu") == 0) do_xnu = TRUE;
    else if (strcmp(optname, "--informat") == 0) {
      format = String2SeqfileFormat(optarg);
      if (format == SQFILE_UNKNOWN)
        Die("unrecognized sequence file format \"%s\"", optarg);
    }
    else if (strcmp(optname, "-h") == 0) {
      Banner(stdout, banner);
      puts(usage);
      puts(experts);
      exit(0);
    }
  }
  if (argc - optind != 2)
    Die("Incorrect number of arguments.\n%s\n", usage);

  hmmfile = argv[optind++];
  seqfile = argv[optind++];

  /* Reject options whose implementation was compiled out. */
#ifndef HMMER_PVM
  if (do_pvm) Die("PVM support is not compiled into HMMER; --pvm doesn't work.");
#endif
#ifndef HMMER_THREADS
  if (num_threads) Die("Posix threads support is not compiled into HMMER; --cpu doesn't have any effect");
#endif

  /***********************************************
   * Open sequence database (must be in curr directory);
   * get target sequence.
   ***********************************************/

  if (do_nucleic) SetAlphabet(hmmNUCLEIC);
  else SetAlphabet(hmmAMINO);

  if (do_nucleic && do_xnu)
    Die("You can't use -n and --xnu together: I can't xnu DNA data.");

  if ((sqfp = SeqfileOpen(seqfile, format, NULL)) == NULL)
    Die("Failed to open sequence file %s\n%s\n", seqfile, usage);

  /***********************************************
   * Open HMM database (might be in HMMERDB or current directory)
   ***********************************************/

  if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL)
    Die("Failed to open HMM database %s\n%s", hmmfile, usage);

  /***********************************************
   * Show the banner
   ***********************************************/

  Banner(stdout, banner);
  printf( "HMM file: %s\n", hmmfile);
  printf( "Sequence file: %s\n", seqfile);
  if (do_pvm)
    printf( "PVM: ACTIVE\n");
  printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n");

  /***********************************************
   * Search each HMM against each sequence
   ***********************************************/

  while (ReadSeq(sqfp, format, &seq, &sqinfo))
    {
      /* fresh hit lists for every query; both are freed at the end of
       * this iteration */
      ghit = AllocTophits(20);  /* keeps full seq scores */
      dhit = AllocTophits(20);  /* keeps domain scores */

      /* 1. Search sequence against all HMMs.
       * Significant scores+alignments accumulate in ghit, dhit.
       */
      if (!do_pvm)
        main_loop_serial(hmmfile, hmmfp, seq, &sqinfo,
                         &thresh, do_xnu, do_forward, do_null2, num_threads,
                         ghit, dhit, &nhmm);
#ifdef HMMER_PVM
      else if (do_pvm)
        {
          SQD_DPRINTF1(("Entering PVM main loop\n"));
          main_loop_pvm(hmmfile, hmmfp, seq, &sqinfo,
                        &thresh, do_xnu, do_forward, do_null2,
                        ghit, dhit, &nhmm);
        }
#endif
      else Die("wait. that can't happen. I didn't do anything.");

      /* set Z for good now that we're done */
      /* (once set here, the same Z is used for all later queries too) */
      if (!thresh.Z) thresh.Z = nhmm;

      /* 2. (Done searching all HMMs for this query seq; start output)
       * Report the overall sequence hits, sorted by significance.
       */
      if (be_backwards)
        {
          printf("Query: %s %s\n", sqinfo.name,
                 sqinfo.flags & SQINFO_DESC ? sqinfo.desc : "");
        }
      else
        {
          printf("\nQuery sequence: %s\n", sqinfo.name);
          printf("Accession: %s\n", sqinfo.flags &SQINFO_ACC ? sqinfo.acc : "[none]");
          printf("Description: %s\n", sqinfo.flags &SQINFO_DESC? sqinfo.desc : "[none]");
        }
      /* We'll now sort the global hit list by evalue...
       * (not score! that was bug #12. in hmmpfam, score and evalue are not
       * monotonic.)
       */
      FullSortTophits(ghit);
      namewidth = MAX(8, TophitsMaxName(ghit)); /* must print whole name, no truncation */
      descwidth = MAX(52-namewidth, 11);        /* may truncate desc, but avoid neg len! */

      printf("\nScores for sequence family classification (score includes all domains):\n");
      printf("%-*s %-*s %7s %10s %3s\n", namewidth, "Model", descwidth, "Description", "Score", "E-value", " N ");
      printf("%-*s %-*s %7s %10s %3s\n", namewidth, "--------", descwidth, "-----------", "-----", "-------", "---");
      for (i = 0, nreported = 0; i < ghit->num; i++)
        {
          char *safedesc;
          GetRankedHit(ghit, i,
                       &pvalue, &sc, NULL, NULL,
                       &name, &acc, &desc,
                       NULL, NULL, NULL,        /* seq positions */
                       NULL, NULL, NULL,        /* HMM positions */
                       NULL, &ndom,             /* domain info */
                       NULL);                   /* alignment info*/

          /* E-value = P-value scaled by the number of HMMs (Z) */
          evalue = pvalue * (double) thresh.Z;

          /* safedesc is a workaround for an apparent Linux printf()
           * bug with the *.*s format. dbmalloc crashes with a memchr() ptr out of bounds
           * flaw if the malloc'ed space for desc is short. The workaround
           * is to make sure the ptr for *.* has a big malloc space.
           */
          /* NOTE(review): when desc is NULL the else branch calls
           * Strdup(NULL); presumably Strdup() tolerates that - confirm. */
          if (desc != NULL && strlen(desc) < 80)
            {
              safedesc = MallocOrDie(sizeof(char) * 80);
              strcpy(safedesc, desc);
            }
          else safedesc = Strdup(desc);

          /* sneaky trick warning:
           * if we're using dynamic Pfam score cutoffs (GA, TC, NC),
           * then the list of hits is already correct and does not
           * need any score cutoffs. Unset the thresholds. They'll
           * be reset in the main_loop if we still have sequences
           * to process.
           */
          if (thresh.autocut != CUT_NONE) {
            thresh.globE = thresh.domE = FLT_MAX;
            thresh.globT = thresh.domT = -FLT_MAX;
          }

          /* recheck the global thresholds now that Z is final */
          if (evalue <= thresh.globE && sc >= thresh.globT)
            {
              printf("%-*s %-*.*s %7.1f %10.2g %3d\n",
                     namewidth,
                     (show_acc && acc != NULL) ? acc : name,
                     descwidth, descwidth, safedesc != NULL ? safedesc : "",
                     sc, evalue, ndom);
              nreported++;
            }
          free(safedesc);
        }
      if (nreported == 0) printf("\t[no hits above thresholds]\n");

      /* 3. Report domain hits (sorted on sqto coordinate)
       */
      FullSortTophits(dhit);
      namewidth = MAX(8, TophitsMaxName(dhit)); /* must print whole name, no truncation */

      printf("\nParsed for domains:\n");
      printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n",
             namewidth, "Model", "Domain ", "seq-f", "seq-t", "hmm-f", "hmm-t", "score", "E-value");
      printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n",
             namewidth, "--------", "-------", "-----", "-----", "-----", "-----", "-----", "-------");

      for (i = 0, nreported = 0; i < dhit->num; i++)
        {
          GetRankedHit(dhit, i,
                       &pvalue, &sc, &motherp, &mothersc,
                       &name, &acc, NULL,
                       &sqfrom, &sqto, NULL,
                       &hmmfrom, &hmmto, &hmmlen,
                       &domidx, &ndom,
                       NULL);
          evalue = pvalue * (double) thresh.Z;

          /* Does the "mother" (complete) sequence satisfy global thresholds? */
          if (motherp * (double)thresh.Z > thresh.globE || mothersc < thresh.globT)
            continue;
          else if (evalue <= thresh.domE && sc >= thresh.domT) {
            /* '[' / ']' mark a domain that reaches the first / last
             * position of the sequence (or of the HMM); '.' otherwise */
            printf("%-*s %3d/%-3d %5d %5d %c%c %5d %5d %c%c %7.1f %8.2g\n",
                   namewidth,
                   (show_acc && acc != NULL) ? acc : name,
                   domidx, ndom,
                   sqfrom, sqto,
                   sqfrom == 1 ? '[' : '.', sqto == sqinfo.len ? ']' : '.',
                   hmmfrom, hmmto,
                   hmmfrom == 1 ? '[':'.', hmmto == hmmlen ? ']' : '.',
                   sc, evalue);
            nreported++;
          }
        }
      if (nreported == 0) printf("\t[no hits above thresholds]\n");


      /* 4. Alignment output, also by domain.
       * dhits is already sorted and namewidth is set, from above code.
       * Number of displayed alignments is limited by Alimit parameter;
       * also by domE (evalue threshold), domT (score threshold).
       */
      if (Alimit != 0)
        {
          printf("\nAlignments of top-scoring domains:\n");
          for (i = 0, nreported = 0; i < dhit->num; i++)
            {
              if (nreported == Alimit) break; /* limit to Alimit output alignments */
              GetRankedHit(dhit, i,
                           &pvalue, &sc, &motherp, &mothersc,
                           &name, &acc, NULL,
                           &sqfrom, &sqto, NULL,        /* seq position info */
                           &hmmfrom, &hmmto, &hmmlen,   /* HMM position info */
                           &domidx, &ndom,              /* domain info */
                           &ali);                       /* alignment info */
              evalue = pvalue * (double) thresh.Z;

              /* same two-stage filter as the domain table above */
              if (motherp * (double) thresh.Z > thresh.globE || mothersc < thresh.globT)
                continue;
              else if (evalue <= thresh.domE && sc >= thresh.domT)
                {
                  printf("%s: domain %d of %d, from %d to %d: score %.1f, E = %.2g\n",
                         (show_acc && acc != NULL) ? acc : name,
                         domidx, ndom, sqfrom, sqto, sc, evalue);
                  PrintFancyAli(stdout, ali);
                  nreported++;
                }
            }
          if (nreported == 0) printf("\t[no hits above thresholds]\n");
          /* NOTE(review): this message also appears when exactly Alimit
           * alignments qualified and nothing was actually cut off. */
          if (nreported == Alimit) printf("\t[output cut off at A = %d top alignments]\n", Alimit);
        }


      printf("//\n");
      FreeSequence(seq, &sqinfo);
      FreeTophits(ghit);
      FreeTophits(dhit);

      /* the next query sequence must see the HMM file from its start */
      HMMFileRewind(hmmfp);
    }

  /***********************************************
   * Clean-up and exit.
   ***********************************************/
  SeqfileClose(sqfp);
  HMMFileClose(hmmfp);
  SqdClean();

  return 0;
}


/* Function: main_loop_serial()
 * Date:     SRE, Fri Aug 7 13:46:48 1998 [St. Louis]
 *
 * Purpose:  Search a sequence against an HMM database;
 *           main loop for the non-PVM version. Runs the
 *           search serially, or through a pool of worker
 *           threads when thread support is compiled in and
 *           num_threads > 0.
 *
 *           On return, ghit and dhit contain info for all hits
 *           that satisfy the set thresholds. If an evalue
 *           cutoff is used at all, the lists will be overestimated --
 *           because the evalue will be underestimated until
 *           we know the final Z. (Thus the main program must recheck
 *           thresholds before printing any results.)
If only + * score cutoffs are used, then the lists are correct, + * and may be printed exactly as they come (after + * appropriate sorting, anyway). This is especially + * important for dynamic thresholding using Pfam + * score cutoffs -- the main caller cannot afford to + * rescan the HMM file just to get the GA/TC/NC cutoffs + * back out for each HMM, and neither do I want to + * burn the space to store them as I make a pass thru + * Pfam. + * + * Args: hmmfile - name of HMM file + * hmmfp - open HMM file (and at start of file) + * dsq - digitized sequence + * sqinfo - ptr to SQINFO optional info for dsq + * thresh - score/evalue threshold information + * do_xnu - TRUE to apply XNU filter to sequence + * do_forward - TRUE to use Forward() scores + * do_null2 - TRUE to adjust scores w/ ad hoc null2 model + * num_threads- number of threads, if threaded + * ghit - global hits list + * dhit - domain hits list + * ret_nhmm - number of HMMs searched. + * + * Returns: (void) + */ +static void +main_loop_serial(char *hmmfile, HMMFILE *hmmfp, char *seq, SQINFO *sqinfo, + struct threshold_s *thresh, int do_xnu, int do_forward, int do_null2, + int num_threads, + struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nhmm) +{ + char *dsq; /* digitized sequence */ + int nhmm; /* number of HMMs searched */ +#ifdef HMMER_THREADS + struct workpool_s *wpool; /* pool of worker threads */ +#endif + struct plan7_s *hmm; /* current HMM to search with */ + struct p7trace_s *tr; /* traceback of alignment */ + float sc; /* an alignment score */ + double pvalue; /* pvalue of an HMM score */ + double evalue; /* evalue of an HMM score */ + + /* Prepare sequence. 
+ */ + dsq = DigitizeSequence(seq, sqinfo->len); + if (do_xnu && Alphabet_type == hmmAMINO) XNU(dsq, sqinfo->len); + +#ifdef HMMER_THREADS + if (num_threads > 0) { + wpool = workpool_start(hmmfile, hmmfp, dsq, sqinfo->name, sqinfo->len, + do_forward, do_null2, thresh, + ghit, dhit, num_threads); + workpool_stop(wpool); + nhmm = wpool->nhmm; + workpool_free(wpool); + + free(dsq); + *ret_nhmm = nhmm; + return; + } +#endif + /* unthreaded code: */ + nhmm = 0; + while (HMMFileRead(hmmfp, &hmm)) { + if (hmm == NULL) + Die("HMM file %s may be corrupt or in incorrect format; parse failed", hmmfile); + P7Logoddsify(hmm, !(do_forward)); + + if (! SetAutocuts(thresh, hmm)) + Die("HMM %s did not contain the GA, TC, or NC cutoffs you needed", + hmm->name); + + /* Score sequence, do alignment (Viterbi), recover trace + */ + if (P7ViterbiSize(sqinfo->len, hmm->M) <= RAMLIMIT) + sc = P7Viterbi(dsq, sqinfo->len, hmm, &tr); + else + sc = P7SmallViterbi(dsq, sqinfo->len, hmm, &tr); + + /* Implement do_forward; we'll override the whole_sc with a P7Forward() + * calculation. + * HMMER is so trace- (alignment-) dependent that this gets a bit hacky. + * Some important implications: + * 1) if --do_forward is selected, the domain (Viterbi) scores do not + * necessarily add up to the whole sequence (Forward) score. + * 2) The implementation of null2 for a Forward score is undefined, + * since the null2 correction is trace-dependent. As a total hack, + * we use a null2 correction derived from the whole trace + * (which was the behavior of HMMER 2.1.1 and earlier, anyway). + * This could put the sum of domain scores and whole seq score even + * further in disagreement. + * + * Note that you can't move the Forward calculation into + * PostprocessSignificantHit(). The Forward score will exceed the + * Viterbi score, so you can't apply thresholds until you + * know the Forward score. 
Also, since PostprocessSignificantHit() + * is wrapped by a mutex in the threaded implementation, + * you'd destroy all useful parallelism if PostprocessSignificantHit() + * did anything compute intensive. + */ + if (do_forward) { + sc = P7Forward(dsq, sqinfo->len, hmm, NULL); + if (do_null2) sc -= TraceScoreCorrection(hmm, tr, dsq); + } + + /* Store scores/pvalue for each HMM aligned to this sequence, overall + */ + pvalue = PValue(hmm, sc); + evalue = thresh->Z ? (double) thresh->Z * pvalue : (double) nhmm * pvalue; + if (sc >= thresh->globT && evalue <= thresh->globE) { + PostprocessSignificantHit(ghit, dhit, + tr, hmm, dsq, sqinfo->len, + sqinfo->name, NULL, NULL, /* won't need acc or desc even if we have 'em */ + do_forward, sc, + do_null2, + thresh, + TRUE); /* TRUE -> hmmpfam mode */ + } + P7FreeTrace(tr); + FreePlan7(hmm); + nhmm++; + } + + free(dsq); + *ret_nhmm = nhmm; + return; +} + + +#ifdef HMMER_PVM +/***************************************************************** + * PVM specific functions + ****************************************************************/ + +/* Function: main_loop_pvm() + * Date: SRE, Fri Aug 7 13:58:34 1998 [St. Louis] + * + * Purpose: Search a sequence against an HMM database; + * main loop for the PVM version. + * + * Args: hmmfile - name of HMM file + * hmmfp - open HMM file (and at start of file) + * seq - sequence to search against + * sqinfo - ptr to SQINFO optional info for dsq + * thresh - score/evalue threshold settings + * do_xnu - TRUE to apply XNU filter to sequence + * do_forward - TRUE to use Forward() scores + * do_null2 - TRUE to adjust scores w/ ad hoc null2 model + * ghit - global hits list + * dhit - domain hits list + * nhmm - number of HMMs searched. 
 *
 * Returns:  (void)
 */
static void
main_loop_pvm(char *hmmfile, HMMFILE *hmmfp, char *seq, SQINFO *sqinfo,
              struct threshold_s *thresh, int do_xnu, int do_forward, int do_null2,
              struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nhmm)
{
  struct plan7_s *hmm;          /* HMM that was searched with */
  struct p7trace_s *tr;         /* a traceback structure */
  char *dsq;                    /* digitized sequence */
  float sc;                     /* score of an HMM match */
  int master_tid;               /* master's ID */
  int *slave_tid;               /* array of slave IDs */
  int *hmmlist;                 /* array of hmm indexes being worked on by slaves */
  int nslaves;                  /* number of slaves in virtual machine */
  int nhmm;                     /* number of HMMs searched */
  int slaveidx;                 /* index of a slave wanting work */
  int slave, msg;
  int sent_trace;               /* TRUE if slave sent us a trace */
  /* NOTE(review): fixed 32-byte buffer filled by pvm_upkstr() below with
   * no length argument; an HMM name longer than 31 characters would
   * presumably overflow it - confirm against the slave's packing code. */
  char slavename[32];           /* name of HMM that slave actually did */
  double pvalue;                /* pvalue of HMM score */
  int arglen;

  /* Sanity checks.
   */
  if (hmmfp->ssi == NULL)
    Die("HMM file %s needs an SSI index to use PVM. See: hmmindex.", hmmfile);

  /* Prepare sequence.
   */
  dsq = DigitizeSequence(seq, sqinfo->len);
  if (do_xnu && Alphabet_type == hmmAMINO) XNU(dsq, sqinfo->len);

  /* Initialize PVM
   */
  master_tid = pvm_mytid();
#if DEBUGLEVEL >= 1
  pvm_catchout(stderr);         /* catch output for debugging */
#endif
  SQD_DPRINTF1(("Spawning slaves...\n"));
  PVMSpawnSlaves("hmmpfam-pvm", &slave_tid, &nslaves);
  hmmlist = MallocOrDie(sizeof(int) * nslaves);
  SQD_DPRINTF1(("Spawned a total of %d slaves...\n", nslaves));

  /* Initialize the slaves
   */
  /* One multicast message carries the whole job description; the
   * packing order below is the wire format the slaves have to unpack. */
  SQD_DPRINTF1(("Broadcasting to %d slaves...\n", nslaves));
  pvm_initsend(PvmDataDefault);
  arglen = strlen(hmmfile);
  pvm_pkint(&arglen, 1, 1);
  pvm_pkstr(hmmfile);
  pvm_pkint(&(sqinfo->len), 1, 1);
  pvm_pkstr(seq);
  pvm_pkfloat(&(thresh->globT), 1, 1);
  pvm_pkdouble(&(thresh->globE), 1, 1);
  pvm_pkint(&(thresh->Z), 1, 1);
  pvm_pkint((int *)&(thresh->autocut), 1, 1);
  pvm_pkint(&do_xnu, 1, 1);
  pvm_pkint(&do_forward, 1, 1);
  pvm_pkint(&do_null2, 1, 1);
  pvm_pkint(&Alphabet_type, 1, 1);
  pvm_mcast(slave_tid, nslaves, HMMPVM_INIT);
  SQD_DPRINTF1(("Slaves should be ready...\n"));
  /* get their OK codes. */
  PVMConfirmSlaves(slave_tid, nslaves);
  SQD_DPRINTF1(("Slaves confirm that they're ok...\n"));

  /* Load the slaves.
   * For efficiency reasons, we don't want the master to
   * load HMMs from disk until she absolutely needs them.
   */
  /* Work items are HMM indexes; slave i starts on HMM i. */
  for (nhmm = 0; nhmm < nslaves && nhmm < hmmfp->ssi->nprimary; nhmm++) {
    pvm_initsend(PvmDataDefault);
    pvm_pkint(&nhmm, 1, 1);     /* side effect: also tells him what number he is. */
    pvm_send(slave_tid[nhmm], HMMPVM_WORK);
    hmmlist[nhmm] = nhmm;
  }
  SQD_DPRINTF1(("%d slaves are loaded\n", nhmm));


  /* Receive/send loop
   */
  /* Each pass takes one result from whichever slave answers and
   * immediately hands that slave the next HMM index. */
  for (; nhmm < hmmfp->ssi->nprimary; nhmm++)
    {
      /* check slaves before blocking */
      PVMCheckSlaves(slave_tid, nslaves);
      /* receive output */
      SQD_DPRINTF1(("Waiting for a slave to give me output...\n"));
      pvm_recv(-1, HMMPVM_RESULTS);
      pvm_upkint(&slaveidx, 1, 1);      /* # of slave who's sending us stuff */
      pvm_upkstr(slavename);            /* name of HMM that slave did */
      pvm_upkfloat(&sc, 1, 1);          /* score */
      pvm_upkdouble(&pvalue, 1, 1);     /* P-value */
      pvm_upkint(&sent_trace, 1, 1);    /* TRUE if trace is coming */
      tr = (sent_trace) ? PVMUnpackTrace() : NULL;
      SQD_DPRINTF1(("Slave %d finished %s for me...\n", slaveidx, slavename));

      /* send new work */
      pvm_initsend(PvmDataDefault);
      pvm_pkint(&nhmm, 1, 1);
      pvm_send(slave_tid[slaveidx], HMMPVM_WORK);
      SQD_DPRINTF1(("Assigned %d -> slave %d\n", nhmm, slaveidx));

      /* process output */
      /* 1b. Store scores/pvalue for each HMM aligned to this sequence, overall
       */
      SQD_DPRINTF1(("%15s : %2d : %f\n", slavename, slaveidx, sc));
      if (sent_trace)
        {
          /* now load the HMM, because the hit is significant */
          /* hmmlist[slaveidx] still holds the index of the HMM the slave
           * just finished; it is only overwritten after this block */
          HMMFilePositionByIndex(hmmfp, hmmlist[slaveidx]);
          if (!HMMFileRead(hmmfp, &hmm))
            { pvm_exit(); Die("Unexpected failure to read HMM file %s", hmmfile); }
          if (hmm == NULL)
            { pvm_exit(); Die("HMM file %s may be corrupt; parse failed", hmmfile); }
          P7Logoddsify(hmm, TRUE);
          if (! SetAutocuts(thresh, hmm))
            Die("HMM %s did not contain your GA, NC, or TC cutoffs", hmm->name);

          PostprocessSignificantHit(ghit, dhit,
                                    tr, hmm, dsq, sqinfo->len,
                                    sqinfo->name,
                                    sqinfo->flags & SQINFO_ACC ? sqinfo->acc : NULL,
                                    sqinfo->flags & SQINFO_DESC ? sqinfo->desc : NULL,
                                    do_forward, sc,
                                    do_null2,
                                    thresh,
                                    TRUE);      /* TRUE -> hmmpfam mode */

          FreePlan7(hmm);
          P7FreeTrace(tr);
        }
      hmmlist[slaveidx] = nhmm;
    }

  /* Collect the output. all n slaves are still working, so wait for them.
   */
  /* One final result is expected from every slave that was given work;
   * each is answered with the shutdown flag (-1) instead of new work. */
  for (slave = 0; slave < nslaves && slave < nhmm; slave++)
    {
      /* don't check slaves (they're exiting normally);
         window of vulnerability here to slave crashes */
      /* receive output */
      pvm_recv(-1, HMMPVM_RESULTS);
      pvm_upkint(&slaveidx, 1, 1);      /* slave who's sending us stuff */
      pvm_upkstr(slavename);
      pvm_upkfloat(&sc, 1, 1);          /* one score */
      pvm_upkdouble(&pvalue, 1, 1);     /* P-value */
      pvm_upkint(&sent_trace, 1, 1);    /* TRUE if trace is coming */
      tr = (sent_trace) ? PVMUnpackTrace() : NULL;

      /* process output */
      SQD_DPRINTF1(("%15s : %2d : %f\n", slavename, slaveidx, sc));
      if (sent_trace)
        {
          /* now load the HMM, because the hit is significant */
          HMMFilePositionByIndex(hmmfp, hmmlist[slaveidx]);
          if (!HMMFileRead(hmmfp, &hmm))
            { pvm_exit(); Die("Unexpected failure to read HMM file %s", hmmfile);}
          if (hmm == NULL)
            { pvm_exit(); Die("HMM file %s may be corrupt; parse failed", hmmfile); }
          P7Logoddsify(hmm, TRUE);
          if (! SetAutocuts(thresh, hmm))
            Die("HMM %s did not contain your GA, NC, or TC cutoffs", hmm->name);

          PostprocessSignificantHit(ghit, dhit,
                                    tr, hmm, dsq, sqinfo->len,
                                    sqinfo->name, NULL, NULL, /* won't need acc or desc even if we have 'em */
                                    do_forward, sc,
                                    do_null2,
                                    thresh,
                                    TRUE);      /* TRUE -> hmmpfam mode */

          FreePlan7(hmm);
          P7FreeTrace(tr);
        }
      /* send cleanup/shutdown flag */
      pvm_initsend(PvmDataDefault);
      msg = -1;
      pvm_pkint(&msg, 1, 1);
      pvm_send(slave_tid[slaveidx], HMMPVM_WORK);
    }

  /* Cleanup; quit the VM; and return
   */
  free(slave_tid);
  free(hmmlist);
  free(dsq);
  pvm_exit();
  *ret_nhmm = nhmm;
  return;
}

#endif /*HMMER_PVM*/


#ifdef HMMER_THREADS
/*****************************************************************
 * POSIX threads implementation.
 *
 * API:
 *   workpool_start() (makes a workpool_s structure. Starts calculations.)
 *   workpool_stop()  (waits for threads to finish.)
+ * workpool_free() (destroys the structure) + * + * Threads: + * worker_thread() (the actual parallelized worker thread). + *****************************************************************/ + +/* Function: workpool_start() + * Date: SRE, Mon Sep 28 11:10:58 1998 [St. Louis] + * + * Purpose: Initialize a workpool_s structure, and return it. + * + * Args: hmmfile - name of HMM file + * hmmfp - open HMM file, at start + * dsq - ptr to sequence to search + * seqname - ptr to name of dsq + * L - length of dsq + * do_forward - TRUE to score using Forward + * do_null2 - TRUE to apply null2 ad hoc correction + * threshold - evalue/score threshold settings + * ghit - per-seq hit list + * dhit - per-domain hit list + * num_threads- number of worker threads to run. + * + * Returns: ptr to struct workpool_s. + * Caller must wait for threads to finish with workpool_stop(), + * then free the structure with workpool_free(). + */ +static struct workpool_s * +workpool_start(char *hmmfile, HMMFILE *hmmfp, char *dsq, char *seqname, int L, + int do_forward, int do_null2, struct threshold_s *thresh, + struct tophit_s *ghit, struct tophit_s *dhit, + int num_threads) +{ + struct workpool_s *wpool; + pthread_attr_t attr; + int i; + int rtn; + + wpool = MallocOrDie(sizeof(struct workpool_s)); + wpool->thread = MallocOrDie(num_threads * sizeof(pthread_t)); + wpool->hmmfile = hmmfile; + wpool->dsq = dsq; + wpool->L = L; + wpool->seqname = seqname; + wpool->do_forward = do_forward; + wpool->do_null2 = do_null2; + wpool->thresh = thresh; + + wpool->hmmfp = hmmfp; + wpool->nhmm = 0; + if ((rtn = pthread_mutex_init(&(wpool->input_lock), NULL)) != 0) + Die("pthread_mutex_init FAILED; %s\n", strerror(rtn)); + + wpool->ghit = ghit; + wpool->dhit = dhit; + if ((rtn = pthread_mutex_init(&(wpool->output_lock), NULL)) != 0) + Die("pthread_mutex_init FAILED; %s\n", strerror(rtn)); + + wpool->num_threads= num_threads; + + /* Create slave threads. 
See comments in hmmcalibrate.c at + * this step regarding concurrency and system scope. + */ + pthread_attr_init(&attr); +#ifndef __sgi +#ifdef HAVE_PTHREAD_ATTR_SETSCOPE + pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); +#endif +#endif +#ifdef HAVE_PTHREAD_SETCONCURRENCY + pthread_setconcurrency(num_threads+1); +#endif + for (i = 0; i < num_threads; i++) + if ((rtn = pthread_create(&(wpool->thread[i]), &attr, + worker_thread , (void *) wpool)) != 0) + Die("Failed to create thread %d; return code %d\n", i, rtn); + + pthread_attr_destroy(&attr); + return wpool; +} + +/* Function: workpool_stop() + * Date: SRE, Thu Jul 16 11:20:16 1998 [St. Louis] + * + * Purpose: Waits for threads in a workpool to finish. + * + * Args: wpool -- ptr to the workpool structure + * + * Returns: (void) + */ +static void +workpool_stop(struct workpool_s *wpool) +{ + int i; + /* wait for threads to stop */ + for (i = 0; i < wpool->num_threads; i++) + if (pthread_join(wpool->thread[i],NULL) != 0) + Die("pthread_join failed"); + return; +} + +/* Function: workpool_free() + * Date: SRE, Thu Jul 16 11:26:27 1998 [St. Louis] + * + * Purpose: Free a workpool_s structure, after the threads + * have finished. + * + * Args: wpool -- ptr to the workpool. + * + * Returns: (void) + */ +static void +workpool_free(struct workpool_s *wpool) +{ + free(wpool->thread); + free(wpool); + return; +} + + +/* Function: worker_thread() + * Date: SRE, Mon Sep 28 10:48:29 1998 [St. Louis] + * + * Purpose: The procedure executed by the worker threads. + * + * Args: ptr - (void *) that is recast to a pointer to + * the workpool. 
+ * + * Returns: (void *) + */ +void * +worker_thread(void *ptr) +{ + struct workpool_s *wpool; /* our working threads structure */ + struct plan7_s *hmm; /* an HMM to search with */ + struct p7trace_s *tr; /* traceback from an alignment */ + float sc; /* score of an alignment */ + int rtn; /* a return code from pthreads lib */ + double pvalue; /* P-value of score */ + double evalue; /* E-value of a score */ + struct threshold_s thresh; /* a local copy of thresholds */ + + wpool = (struct workpool_s *) ptr; + /* Because we might dynamically change the thresholds using + * Pfam GA/NC/TC cutoffs, we make a local copy of the threshold + * structure in this thread. + */ + thresh.globT = wpool->thresh->globT; + thresh.globE = wpool->thresh->globE; + thresh.domT = wpool->thresh->domT; + thresh.domE = wpool->thresh->domE; + thresh.autocut = wpool->thresh->autocut; + thresh.Z = wpool->thresh->Z; + for (;;) { + + /* 1. acquire lock on HMM input, and get + * the next HMM to work on. + */ + /* acquire a lock */ + if ((rtn = pthread_mutex_lock(&(wpool->input_lock))) != 0) + Die("pthread_mutex_lock failure: %s\n", strerror(rtn)); + wpool->nhmm++; + + if (! HMMFileRead(wpool->hmmfp, &hmm)) + { /* we're done. release lock, exit thread */ + if ((rtn = pthread_mutex_unlock(&(wpool->input_lock))) != 0) + Die("pthread_mutex_unlock failure: %s\n", strerror(rtn)); + pthread_exit(NULL); + } + SQD_DPRINTF1(("a thread is working on %s\n", hmm->name)); + /* release the lock */ + if ((rtn = pthread_mutex_unlock(&(wpool->input_lock))) != 0) + Die("pthread_mutex_unlock failure: %s\n", strerror(rtn)); + + if (hmm == NULL) + Die("HMM file %s may be corrupt or in incorrect format; parse failed", wpool->hmmfile); + P7Logoddsify(hmm, !(wpool->do_forward)); + + if (!SetAutocuts(&thresh, hmm)) + Die("HMM %s did not have the right GA, NC, or TC cutoffs", hmm->name); + + /* 2. We have an HMM in score form. + * Score the sequence. 
+ */ + if (P7ViterbiSize(wpool->L, hmm->M) <= RAMLIMIT) + sc = P7Viterbi(wpool->dsq, wpool->L, hmm, &tr); + else + sc = P7SmallViterbi(wpool->dsq, wpool->L, hmm, &tr); + + /* The Forward score override (see comments in serial vers) + */ + if (wpool->do_forward) { + sc = P7Forward(wpool->dsq, wpool->L, hmm, NULL); + if (wpool->do_null2) sc -= TraceScoreCorrection(hmm, tr, wpool->dsq); + } + + /* 3. Save the output in tophits structures, after acquiring a lock + */ + if ((rtn = pthread_mutex_lock(&(wpool->output_lock))) != 0) + Die("pthread_mutex_lock failure: %s\n", strerror(rtn)); + SQD_DPRINTF1(("model %s scores %f\n", hmm->name, sc)); + + pvalue = PValue(hmm, sc); + evalue = thresh.Z ? (double) thresh.Z * pvalue : (double) wpool->nhmm * pvalue; + if (sc >= thresh.globT && evalue <= thresh.globE) + { + PostprocessSignificantHit(wpool->ghit, wpool->dhit, + tr, hmm, wpool->dsq, wpool->L, + wpool->seqname, + NULL, NULL, /* won't need seq's acc or desc */ + wpool->do_forward, sc, + wpool->do_null2, + &thresh, + TRUE); /* TRUE -> hmmpfam mode */ + } + if ((rtn = pthread_mutex_unlock(&(wpool->output_lock))) != 0) + Die("pthread_mutex_unlock failure: %s\n", strerror(rtn)); + + P7FreeTrace(tr); + FreePlan7(hmm); + + } /* end 'infinite' loop over HMMs in this thread */ +} + +#endif /* HMMER_THREADS */ diff --git a/forester/archive/RIO/others/hmmer/src/hmmpostal.c b/forester/archive/RIO/others/hmmer/src/hmmpostal.c new file mode 100644 index 0000000..3e56af5 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/hmmpostal.c @@ -0,0 +1,1108 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. 
+ ************************************************************/ + +/* Derived from code developed by Ian Holmes (Sanger Centre and UC Berkeley) + * Copyright (C) 1998 Ian Holmes + * Distributed under the GNU General Public License + */ + +#include +#include +#include + +#include "structs.h" /* data structures, macros, #define's */ +#include "config.h" /* compile-time configuration constants */ +#include "funcs.h" /* function declarations */ +#include "globals.h" /* alphabet global variables */ +#include "squid.h" /* general sequence analysis library */ + +static char banner[] = "hmmbuild - build a hidden Markov model from an alignment"; + +static char usage[] = "\ +Usage: hmmbuildpost [-options] \n\ + Available options are:\n\ + -h : help; print brief help on version and usage\n\ + -n : name; name this HMM \n\ + -r : read HMM from instead of building\n\ + -m : save HMM to \n\ + -o : re-save annotated alignment to \n\ + -A : append; append this HMM to \n\ + -F : force; allow overwriting of \n\ +\n\ + Alternative search algorithm styles: (default: hmmls domain alignment)\n\ + -f : multi-hit local (hmmfs style)\n\ + -g : global alignment (hmms style, Needleman/Wunsch)\n\ + -s : local alignment (hmmsw style, Smith/Waterman)\n\ +"; + +static char experts[] = "\ + Optional re-alignment of sequences to model:\n\ + --viterbi : standard max-likelihood (Viterbi) algorithm\n\ + --optacc : optimal accuracy algorithm\n\ +\n\ + Alternative model construction strategies: (default: MAP)\n\ + --fast : Krogh/Haussler fast heuristic construction (see --gapmax)\n\ + --hand : manual construction (requires SELEX file, #=RF annotation)\n\ +\n\ + Expert customization of parameters and priors:\n\ + --null : read null (random sequence) model from \n\ + --pam : heuristic PAM-based prior, using BLAST PAM matrix in \n\ + --prior : read Dirichlet prior parameters from \n\ +\n\ + Alternative sequence weighting strategies: (default: GSC weights)\n\ + --wblosum : Henikoff simple filter weights 
(see --idlevel)\n\ + --wgsc : Gerstein/Sonnhammer/Chothia tree weights (default)\n\ + --wme : maximum entropy (ME)\n\ + --wvoronoi : Sibbald/Argos Voronoi weights\n\ + --wnone : don't do any weighting\n\ + --noeff : don't use effective sequence number; just use nseq\n\ +\n\ + Forcing an alphabet: (normally autodetected)\n\ + --amino : override autodetection, assert that seqs are protein\n\ + --nucleic : override autodetection, assert that seqs are DNA/RNA\n\ +\n\ + Other expert options:\n\ + --archpri : set architecture size prior to {0.85} [0..1]\n\ + --binary : save the model in binary format, not ASCII text\n\ + --cfile : save count vectors to \n\ + --gapmax : max fraction of gaps in mat column {0.50} [0..1]\n\ + --idlevel : set frac. id level used by eff. nseq and --wblosum {0.62}\n\ + --informat : input alignment is in format , not Stockholm\n\ + --pamwgt : set weight on PAM-based prior to {20.}[>=0]\n\ + --swentry : set S/W aggregate entry prob. to {0.5}\n\ + --swexit : set S/W aggregate exit prob. 
to {0.5}\n\ + --verbose : print a lot of boring information\n\ +\n"; + +static struct opt_s OPTIONS[] = { + { "-f", TRUE, sqdARG_NONE }, + { "-g", TRUE, sqdARG_NONE }, + { "-h", TRUE, sqdARG_NONE }, + { "-n", TRUE, sqdARG_STRING}, + { "-r", TRUE, sqdARG_STRING}, + { "-m", TRUE, sqdARG_STRING}, + { "-o", TRUE, sqdARG_STRING}, + { "-s", TRUE, sqdARG_NONE }, + { "-A", TRUE, sqdARG_NONE }, + { "-F", TRUE, sqdARG_NONE }, + { "--amino", FALSE, sqdARG_NONE }, + { "--archpri", FALSE, sqdARG_FLOAT }, + { "--binary", FALSE, sqdARG_NONE }, + { "--cfile", FALSE, sqdARG_STRING}, + { "--fast", FALSE, sqdARG_NONE}, + { "--gapmax", FALSE, sqdARG_FLOAT }, + { "--hand", FALSE, sqdARG_NONE}, + { "--idlevel", FALSE, sqdARG_FLOAT }, + { "--informat",FALSE, sqdARG_STRING }, + { "--noeff", FALSE, sqdARG_NONE }, + { "--nucleic", FALSE, sqdARG_NONE }, + { "--null", FALSE, sqdARG_STRING }, + { "--optacc", FALSE, sqdARG_NONE }, + { "--pam", FALSE, sqdARG_STRING }, + { "--pamwgt", FALSE, sqdARG_FLOAT }, + { "--prior", FALSE, sqdARG_STRING }, + { "--swentry", FALSE, sqdARG_FLOAT }, + { "--swexit", FALSE, sqdARG_FLOAT }, + { "--verbose", FALSE, sqdARG_NONE }, + { "--viterbi", FALSE, sqdARG_NONE }, + { "--wgsc", FALSE, sqdARG_NONE }, + { "--wblosum", FALSE, sqdARG_NONE }, + { "--wme", FALSE, sqdARG_NONE }, + { "--wnone", FALSE, sqdARG_NONE }, + { "--wvoronoi",FALSE, sqdARG_NONE }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +static void save_model(struct plan7_s *hmm, char *hmmfile, int do_append, int do_binary); +static void print_all_scores(FILE *fp, struct plan7_s *hmm, + AINFO *ainfo, char **dsq, int nseq, + struct p7trace_s **tr); +static void save_countvectors(char *cfile, struct plan7_s *hmm); +static void position_average_score(struct plan7_s *hmm, char **seq, float *wgt, + int nseq, struct p7trace_s **tr, float *pernode, + float *ret_avg); +static float frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr, + float *pernode, float expected); +static 
void maximum_entropy(struct plan7_s *hmm, char **dsq, AINFO *ainfo, + int nseq, float eff_nseq, + struct p7prior_s *prior, struct p7trace_s **tr); + +extern void Postcode(int L, struct dpmatrix_s *mx, struct p7trace_s *tr); + +int +main(int argc, char **argv) +{ + char *seqfile; /* seqfile to read alignment from */ + int format; /* format of seqfile */ + MSAFILE *afp; /* open alignment file */ + MSA *msa; /* a multiple sequence alignment */ + char **dsq; /* digitized unaligned aseq's */ + struct plan7_s *hmm; /* constructed HMM; written to hmmfile */ + struct p7prior_s *pri; /* Dirichlet priors to use */ + struct p7trace_s **tr; /* fake tracebacks for aseq's */ + char *readfile; /* file to read HMM from */ + HMMFILE *hmmfp; /* opened hmmfile for reading */ + char *hmmfile; /* file to write HMM to */ + FILE *fp; /* OUTPUT file handle (misc.) */ + char *name; /* name of the HMM */ + int idx; /* counter for sequences */ + float randomseq[MAXABET]; /* null sequence model */ + float p1; /* null sequence model p1 transition */ + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + enum p7_construction c_strategy; /* construction strategy choice */ + enum p7_weight { /* weighting strategy */ + WGT_NONE, WGT_GSC, WGT_BLOSUM, WGT_VORONOI, WGT_ME} w_strategy; + enum p7_config { /* algorithm configuration strategy */ + P7_BASE_CONFIG, P7_LS_CONFIG, P7_FS_CONFIG, P7_SW_CONFIG } cfg_strategy; + float gapmax; /* max frac gaps in mat col for -k */ + int overwrite_protect; /* TRUE to prevent overwriting HMM file */ + enum realignment_strategy { /* re-alignment strategy */ + REALIGN_NONE, REALIGN_VITERBI, REALIGN_OPTACC } r_strategy; + int verbose; /* TRUE to show a lot of output */ + char *align_ofile; /* name of output alignment file */ + char *rndfile; /* random sequence model file to read */ + char *prifile; /* Dirichlet prior file to read */ + char *pamfile; /* PAM matrix file for 
heuristic prior */ + char *cfile; /* output file for count vectors */ + float archpri; /* "architecture" prior on model size */ + float pamwgt; /* weight on PAM for heuristic prior */ + int do_append; /* TRUE to append to hmmfile */ + int do_binary; /* TRUE to write in binary format */ + float blosumlevel; /* BLOSUM frac id filtering level [0.62] */ + float swentry; /* S/W aggregate entry probability */ + float swexit; /* S/W aggregate exit probability */ + int do_eff; /* TRUE to set an effective seq number */ + float eff_nseq; /* effective sequence number */ + int checksum; + int len; + + struct dpmatrix_s *forward_mx; /* Forward matrix */ + struct dpmatrix_s *backward_mx; /* Backward matrix */ + struct dpmatrix_s *posterior_mx; /* Posterior matrix */ + struct dpmatrix_s *optacc_mx; /* Optimal accuracy matrix */ + + /*********************************************** + * Parse command line + ***********************************************/ + + format = MSAFILE_UNKNOWN; + c_strategy = P7_MAP_CONSTRUCTION; + w_strategy = WGT_GSC; + blosumlevel = 0.62; + cfg_strategy = P7_LS_CONFIG; + gapmax = 0.5; + overwrite_protect = TRUE; + r_strategy = REALIGN_NONE; + verbose = FALSE; + readfile = NULL; + hmmfile = NULL; + align_ofile = NULL; + rndfile = NULL; + prifile = NULL; + pamfile = NULL; + cfile = NULL; + archpri = 0.85; + pamwgt = 20.; + Alphabet_type = hmmNOTSETYET; /* initially unknown */ + name = NULL; + do_append = FALSE; + swentry = 0.5; + swexit = 0.5; + do_eff = TRUE; + do_binary = FALSE; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) { + if (strcmp(optname, "-f") == 0) cfg_strategy = P7_FS_CONFIG; + else if (strcmp(optname, "-g") == 0) cfg_strategy = P7_BASE_CONFIG; + else if (strcmp(optname, "-n") == 0) name = Strdup(optarg); + else if (strcmp(optname, "-r") == 0) readfile = optarg; + else if (strcmp(optname, "-m") == 0) hmmfile = optarg; + else if (strcmp(optname, "-o") == 0) align_ofile = optarg; + else if (strcmp(optname, 
"-r") == 0) rndfile = optarg; + else if (strcmp(optname, "-s") == 0) cfg_strategy = P7_SW_CONFIG; + else if (strcmp(optname, "-A") == 0) do_append = TRUE; + else if (strcmp(optname, "-F") == 0) overwrite_protect = FALSE; + else if (strcmp(optname, "--amino") == 0) SetAlphabet(hmmAMINO); + else if (strcmp(optname, "--archpri") == 0) archpri = atof(optarg); + else if (strcmp(optname, "--binary") == 0) do_binary = TRUE; + else if (strcmp(optname, "--cfile") == 0) cfile = optarg; + else if (strcmp(optname, "--fast") == 0) c_strategy = P7_FAST_CONSTRUCTION; + else if (strcmp(optname, "--hand") == 0) c_strategy = P7_HAND_CONSTRUCTION; + else if (strcmp(optname, "--gapmax") == 0) gapmax = atof(optarg); + else if (strcmp(optname, "--idlevel") == 0) blosumlevel = atof(optarg); + else if (strcmp(optname, "--noeff") == 0) do_eff = FALSE; + else if (strcmp(optname, "--nucleic") == 0) SetAlphabet(hmmNUCLEIC); + else if (strcmp(optname, "--optacc") == 0) r_strategy = REALIGN_OPTACC; + else if (strcmp(optname, "--pam") == 0) pamfile = optarg; + else if (strcmp(optname, "--pamwgt") == 0) pamwgt = atof(optarg); + else if (strcmp(optname, "--prior") == 0) prifile = optarg; + else if (strcmp(optname, "--swentry") == 0) swentry = atof(optarg); + else if (strcmp(optname, "--swexit") == 0) swexit = atof(optarg); + else if (strcmp(optname, "--verbose") == 0) verbose = TRUE; + else if (strcmp(optname, "--viterbi") == 0) r_strategy = REALIGN_VITERBI; + else if (strcmp(optname, "--wgsc") == 0) w_strategy = WGT_GSC; + else if (strcmp(optname, "--wblosum") == 0) w_strategy = WGT_BLOSUM; + else if (strcmp(optname, "--wme") == 0) w_strategy = WGT_ME; + else if (strcmp(optname, "--wnone") == 0) w_strategy = WGT_NONE; + else if (strcmp(optname, "--wvoronoi")== 0) w_strategy = WGT_VORONOI; + else if (strcmp(optname, "--informat") == 0) { + format = String2SeqfileFormat(optarg); + if (format == MSAFILE_UNKNOWN) + Die("unrecognized sequence file format \"%s\"", optarg); + if (! 
IsAlignmentFormat(format)) + Die("%s is an unaligned format, can't read as an alignment", optarg); + } + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(0); + } + } + if (argc - optind != 1) + Die("Incorrect number of arguments.\n%s\n", usage); + + seqfile = argv[optind++]; + + if (readfile != NULL && r_strategy == REALIGN_NONE) + r_strategy = REALIGN_VITERBI; + + if (gapmax < 0. || gapmax > 1.) + Die("--gapmax must be a value from 0 to 1\n%s\n", usage); + if (archpri < 0. || archpri > 1.) + Die("--archpri must be a value from 0 to 1\n%s\n", usage); + if (overwrite_protect && hmmfile && !do_append && FileExists(hmmfile)) + Die("HMM file %s already exists. Rename or delete it.", hmmfile); + if (overwrite_protect && align_ofile != NULL && FileExists(align_ofile)) + Die("Alignment resave file %s exists. Rename or delete it.", align_ofile); + + /*********************************************** + * Get sequence data + ***********************************************/ + + /* Open the alignment */ + if ((afp = MSAFileOpen(seqfile, format, NULL)) == NULL) + Die("Alignment file %s could not be opened for reading", seqfile); + + /* read the alignment from file */ + if ((msa = MSAFileRead(afp)) == NULL) + Die("Failed to read aligned sequence file %s", seqfile); + for (idx = 0; idx < msa->nseq; idx++) + s2upper(msa->aseq[idx]); + MSAFileClose(afp); + /* Set up the alphabet globals */ + if (Alphabet_type == hmmNOTSETYET) + DetermineAlphabet(msa->aseq, msa->nseq); + + /* Set up Dirichlet priors */ + if (prifile == NULL) pri = P7DefaultPrior(); + else pri = P7ReadPrior(prifile); + + if (pamfile != NULL) PAMPrior(pamfile, pri, pamwgt); + + /* Set up the null/random seq model */ + if (rndfile == NULL) P7DefaultNullModel(randomseq, &p1); + else P7ReadNullModel(rndfile, randomseq, &p1); + + /* Prepare sequences for internal use */ + DigitizeAlignment(msa, &dsq); + + /* In some respects we treat DNA more crudely... 
*/ + if (Alphabet_type == hmmNUCLEIC) + { + do_eff = FALSE; /* don't do effective seq #; it's calibrated for protein */ + } + + /*********************************************** + * Either read in an HMM or build from alignment, + * depending on user specifications. + ***********************************************/ + + if (readfile != NULL) { + + /*********************************************** + * Open HMM file (might be in HMMERDB or current directory). + * Read a single HMM from it. + ***********************************************/ + + if ((hmmfp = HMMFileOpen(readfile, "HMMERDB")) == NULL) + Die("Failed to open HMM file %s\n%s", readfile, usage); + if (!HMMFileRead(hmmfp, &hmm)) + Die("Failed to read any HMMs from %s\n", readfile); + HMMFileClose(hmmfp); + if (hmm == NULL) + Die("HMM file %s corrupt or in incorrect format? Parse failed", readfile); + + tr = (struct p7trace_s **) MallocOrDie (sizeof(struct p7trace_s *) * msa->nseq); + for (idx = 0; idx < msa->nseq; idx++) + tr[idx] = 0; + + } else { + + /*********************************************** + * Build an HMM + ***********************************************/ + + /* Determine the effective sequence number to use (optional) + */ + eff_nseq = (float) msa->nseq; + if (do_eff) + { + float *wgt; + printf("%-40s ... ", "Determining effective sequence number"); + fflush(stdout); + /* dummy weights array to feed BlosumWeights*/ + wgt = MallocOrDie(sizeof(float) * msa->nseq); + BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, wgt); + eff_nseq = FSum(wgt, msa->nseq); + + free(wgt); + printf("done. [%.0f]\n", eff_nseq); + } + + + /* Weight the sequences (optional), + */ + /* Weight the sequences (optional), + */ + if (w_strategy == WGT_GSC || + w_strategy == WGT_BLOSUM || + w_strategy == WGT_VORONOI) + { + printf("%-40s ... 
", "Weighting sequences heuristically"); + fflush(stdout); + + if (w_strategy == WGT_GSC) + GSCWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt); + else if (w_strategy == WGT_BLOSUM) + BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, msa->wgt); + else if (w_strategy == WGT_VORONOI) + VoronoiWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt); + + printf("done.\n"); + } + + /* Set the effective sequence number (if do_eff is FALSE, eff_nseq + * was set to nseq). + */ + FNorm(msa->wgt, msa->nseq); + FScale(msa->wgt, msa->nseq, eff_nseq); + + + /* Build a model architecture. + * If we're not doing MD or ME, that's all we need to do. + * We get an allocated, counts-based HMM back. + */ + printf("%-40s ... ", "Constructing model architecture"); + fflush(stdout); + checksum = GCGMultchecksum(msa->aseq, msa->nseq); + if (c_strategy == P7_FAST_CONSTRUCTION) + P7Fastmodelmaker(msa, dsq, gapmax, &hmm, &tr); + else if (c_strategy == P7_HAND_CONSTRUCTION) + P7Handmodelmaker(msa, dsq, &hmm, &tr); + else + P7Maxmodelmaker(msa, dsq, gapmax, + pri, randomseq, p1, archpri, &hmm, &tr); + hmm->checksum = checksum; + printf("done.\n"); + + /* Save the count vectors if asked. Used primarily for + * making the data files for training priors. + */ + if (cfile != NULL) + { + save_countvectors(cfile, hmm); + } + + /* Record the null model in the HMM; + * add prior contributions in pseudocounts and renormalize. + */ + Plan7SetNullModel(hmm, randomseq, p1); + P7PriorifyHMM(hmm, pri); + + + /* Model configuration, temporary. + * hmmbuild assumes that it's given an alignment of single domains, + * and the alignment may contain fragments. So, for the purpose of + * scoring the sequences (or, optionally, MD/ME weighting), + * configure the model into hmmsw mode. Later we'll + * configure the model according to how the user wants to + * use it. + */ + Plan7SWConfig(hmm, 0.5, 0.5); + + /* Do model-dependent "weighting" strategies. 
+ */ + /* + if (w_strategy == WGT_ME) + { + maximum_entropy(hmm, dsq, &ainfo, ainfo.nseq, eff_nseq, pri, tr); + } + */ + + /* Give the model a name; by default, the name of the alignment file + * without any filename extension (i.e. "globins.slx" becomes "globins" + */ + if (name == NULL) name = FileTail(seqfile, TRUE); + Plan7SetName(hmm, name); + Plan7ComlogAppend(hmm, argc, argv); + Plan7SetCtime(hmm); + hmm->nseq = msa->nseq; + free(name); + + /* Configure the model for chosen algorithm + */ + switch (cfg_strategy) { + case P7_BASE_CONFIG: Plan7GlobalConfig(hmm); break; + case P7_SW_CONFIG: Plan7SWConfig(hmm, swentry, swexit); break; + case P7_LS_CONFIG: Plan7LSConfig(hmm); break; + case P7_FS_CONFIG: Plan7FSConfig(hmm, swentry, swexit); break; + default: Die("bogus configuration choice"); + } + + } + + /* Optionally save new HMM to disk: open a file for appending or writing. + */ + P7Logoddsify(hmm, TRUE); + if (hmmfile) + save_model(hmm, hmmfile, do_append, do_binary); + + /* Display posterior probabilities for each sequence, + re-aligning them to the model if user requested that + */ + + for (idx = 0; idx < msa->nseq; idx++) { + printf ("#\n# Sequence %d: %s\n#\n", idx + 1, msa->sqname[idx]); + + len = DealignedLength(msa->aseq[idx]); + if (P7ViterbiSize(len, hmm->M) * 2 > RAMLIMIT) + Die("insufficient memory"); + + (void) P7Forward (dsq[idx], len, hmm, &forward_mx); + (void) P7Backward (dsq[idx],len, hmm, &backward_mx); + + if (r_strategy == REALIGN_VITERBI) { + + if (tr[idx]) P7FreeTrace (tr[idx]); + + if (P7ViterbiSize(len, hmm->M) * 3 <= RAMLIMIT) + (void) P7Viterbi(dsq[idx], len, hmm, &(tr[idx])); + else + (void) P7SmallViterbi(dsq[idx], len, hmm, &(tr[idx])); + + } else if (r_strategy == REALIGN_OPTACC) { + + if (tr[idx]) P7FreeTrace (tr[idx]); + + if (P7ViterbiSize(len, hmm->M) * 4 > RAMLIMIT) + Die("insufficient memory"); + + posterior_mx = AllocPlan7Matrix (len + 1, hmm->M, 0, 0, 0, 0); + P7EmitterPosterior (len, hmm, forward_mx, backward_mx, + 
posterior_mx); + + optacc_mx = AllocPlan7Matrix (len + 1, hmm->M, 0, 0, 0, 0); + (void) P7FillOptimalAccuracy (len, hmm->M, posterior_mx, + optacc_mx, &(tr[idx])); + + FreePlan7Matrix (posterior_mx); + FreePlan7Matrix (optacc_mx); + + } + + posterior_mx = AllocPlan7Matrix (len + 1, hmm->M, 0, 0, 0, 0); + P7EmitterPosterior (len, hmm, forward_mx, backward_mx, + posterior_mx); + + Postcode(len, posterior_mx, tr[idx]); + /* DisplayPlan7Matrix(dsq[idx], len, hmm, posterior_mx); */ + + + /* DisplayPlan7PostAlign (len, hmm, + forward_mx, backward_mx, + &(tr[idx]), 1); + */ + + FreePlan7Matrix (backward_mx); + FreePlan7Matrix (forward_mx); + + } + + /* the annotated alignment may be resaved */ + if (align_ofile != NULL) { + MSA *new_msa; + SQINFO *sqinfo; + + sqinfo = MSAToSqinfo(msa); + new_msa = P7Traces2Alignment(dsq, sqinfo, msa->wgt, msa->nseq, + hmm->M, tr, FALSE); + if ((fp = fopen(align_ofile, "w")) == NULL) { + Warn("Failed to open alignment resave file %s; using stdout instead", + align_ofile); + fp = stdout; + } + WriteStockholm(fp, new_msa); + MSAFree(new_msa); + for (idx = 0; idx < msa->nseq; idx++) + FreeSequence(NULL, &(sqinfo[idx])); + free(sqinfo); + if (fp != stdout) fclose(fp); + } + + /* Verbose output; show scores for each sequence + */ + /* + if (verbose) + print_all_scores(stdout, hmm, dsq, msq, tr); + */ + + /* Clean up and exit + */ + for (idx = 0; idx < msa->nseq; idx++) P7FreeTrace(tr[idx]); + free(tr); + FreePlan7(hmm); + P7FreePrior(pri); + Free2DArray((void **) dsq, msa->nseq); + MSAFree(msa); + SqdClean(); + + return 0; +} + +/* Function: save_model() + * + * Purpose: Save the new model to a file. 
+ * + * Args: hmm - model to save + * hmmfile - file to save to (if NULL, use stdout) + * do_append - TRUE to append to file + * do_binary - TRUE to write a binary file + * + * Return: (void) + */ +static void +save_model(struct plan7_s *hmm, char *hmmfile, int do_append, int do_binary) +{ + FILE *fp; + + if (hmmfile == NULL) + fp = stdout; + else if (do_append) + { + /* check that it looks like an HMM file */ +#ifdef REMOVED /* This code induces an unresolved Linux/SGI NFS bug! */ + if (FileExists(hmmfile)) + { + HMMFILE *hmmfp; + hmmfp = HMMFileOpen(hmmfile, NULL); + if (hmmfp == NULL) { + Warn("%s not an HMM file; can't append to it; using stdout instead", + hmmfile); + fp = stdout; + puts(""); /* do a newline before stdout HMM starts */ + } else { + HMMFileClose(hmmfp); + } + } +#endif + + if ((fp = fopen(hmmfile, "a")) == NULL) { + Warn("hey, where'd your HMM file go? Using stdout instead."); + fp = stdout; + puts(""); /* do a newline before stdout HMM starts */ + } + } + else + { + if ((fp = fopen(hmmfile, "w")) == NULL) { + Warn("Failed to open HMM save file %s; using stdout instead", hmmfile); + fp = stdout; + puts(""); /* do a newline before stdout HMM starts */ + } + } + + if (do_binary) WriteBinHMM(fp, hmm); + else WriteAscHMM(fp, hmm); + + if (fp != stdout) fclose(fp); + return; +} + + + + + +/* Function: print_all_scores() + * + * Purpose: For each training sequence, print its score under + * the final model. + * + * Args: fp - where to print the output (usu. stdout) + * hmm - newly constructed HMM, with prob's. + * ainfo- info with aseq + * dsq - digitized unaligned training sequences. 
+ * nseq - number of training sequences + * tr - array of tracebacks + * + * Return: (void) + */ +static void +print_all_scores(FILE *fp, struct plan7_s *hmm, + AINFO *ainfo, char **dsq, int nseq, struct p7trace_s **tr) +{ + int idx; /* counter for sequences */ + + /* make sure model scores are ready */ + P7Logoddsify(hmm, TRUE); + /* header */ + fputs("**\n", fp); + fputs("Individual training sequence scores:\n", fp); + /* score for each sequence */ + for (idx = 0; idx < nseq; idx++) + { + fprintf(fp, "%7.2f %-12s %s\n", + P7TraceScore(hmm, dsq[idx], tr[idx]), + ainfo->sqinfo[idx].name, + (ainfo->sqinfo[idx].flags & SQINFO_DESC) ? + ainfo->sqinfo[idx].desc : ""); + P7PrintTrace(fp, tr[idx], hmm, dsq[idx]); + } + fputs("\n", fp); +} + + + +/* Function: save_countvectors() + * + * Purpose: Save emission/transition count vectors to a file. + * Used for gathering the data on which to train a + * prior (e.g. mixture Dirichlet, etc.) + * + * The format of the file is one vector per line: + * M ...: 20 match emission counts in order AC..WY. + * I ...: 20 insert emission counts in order AC..WY. + * T ...: 7 transition counts in order TMM, TMI, TMD, + * TIM, TII, TDM, TDD. 
(see structs.h) + * + * Args: cfile - counts file to make + * hmm - counts-based HMM + */ +static void +save_countvectors(char *cfile, struct plan7_s *hmm) +{ + FILE *fp; + int k, x; + + if ((fp = fopen(cfile, "w")) == NULL) + Die("failed to open count vector file %s for writing", cfile); + + /* match emission vectors */ + for (k = 1; k <= hmm->M; k++) + { + fputs("M ", fp); + for (x = 0; x < Alphabet_size; x++) + fprintf(fp, "%.2f ", hmm->mat[k][x]); + fputs("\n", fp); + } + /* insert emission vectors */ + for (k = 1; k < hmm->M; k++) + { + fputs("I ", fp); + for (x = 0; x < Alphabet_size; x++) + fprintf(fp, "%.2f ", hmm->ins[k][x]); + fputs("\n", fp); + } + /* transition vectors */ + for (k = 1; k < hmm->M; k++) + { + fputs("T ", fp); + for (x = 0; x < 7; x++) + fprintf(fp, "%.2f ", hmm->t[k][x]); + fputs("\n", fp); + } + + fclose(fp); +} + + +/* Function: position_average_score() + * Date: Wed Dec 31 09:36:35 1997 [StL] + * + * Purpose: Calculate scores from tracebacks, keeping them + * in a position specific array. The final array + * is normalized position-specifically too, according + * to how many sequences contributed data to this + * position. Used for compensating for sequence + * fragments in ME and MD score optimization. + * Very much ad hoc. + * + * Code related to (derived from) TraceScore(). + * + * Args: hmm - HMM structure, scores valid + * dsq - digitized unaligned sequences + * wgt - weights on the sequences + * nseq - number of sequences + * tr - array of nseq tracebacks that aligns each dsq to hmm + * pernode - RETURN: [0]1..M array of position-specific avg scores + * ret_avg - RETURN: overall average full-length, one-domain score + * + * Return: 1 on success, 0 on failure. + * pernode is malloc'ed [0]1..M by CALLER and filled here. 
+ */ +static void +position_average_score(struct plan7_s *hmm, + char **dsq, + float *wgt, + int nseq, + struct p7trace_s **tr, + float *pernode, + float *ret_avg) +{ + int pos; /* position in seq */ + int sym; + int tpos; /* position in trace/state sequence */ + float *counts; /* counts at each position */ + float avg; /* RETURN: average overall */ + int k; /* counter for model position */ + int idx; /* counter for sequence number */ + + /* Allocations + */ + counts = MallocOrDie ((hmm->M+1) * sizeof(float)); + FSet(pernode, hmm->M+1, 0.); + FSet(counts, hmm->M+1, 0.); + + /* Loop over traces, accumulate weighted scores per position + */ + for (idx = 0; idx < nseq; idx++) + for (tpos = 0; tpos < tr[idx]->tlen; tpos++) + { + pos = tr[idx]->pos[tpos]; + sym = (int) dsq[idx][tr[idx]->pos[tpos]]; + k = tr[idx]->nodeidx[tpos]; + + /* Counts: how many times did we use this model position 1..M? + * (weighted) + */ + if (tr[idx]->statetype[tpos] == STM || tr[idx]->statetype[tpos] == STD) + counts[k] += wgt[idx]; + + /* Emission scores. + */ + if (tr[idx]->statetype[tpos] == STM) + pernode[k] += wgt[idx] * Scorify(hmm->msc[sym][k]); + else if (tr[idx]->statetype[tpos] == STI) + pernode[k] += wgt[idx] * Scorify(hmm->isc[sym][k]); + + /* Transition scores. 
+ */ + if (tr[idx]->statetype[tpos] == STM || + tr[idx]->statetype[tpos] == STD || + tr[idx]->statetype[tpos] == STI) + pernode[k] += wgt[idx] * + Scorify(TransitionScoreLookup(hmm, tr[idx]->statetype[tpos], tr[idx]->nodeidx[tpos], + tr[idx]->statetype[tpos+1],tr[idx]->nodeidx[tpos+1])); + } + + /* Divide accumulated scores by accumulated weighted counts + */ + avg = 0.; + for (k = 1; k <= hmm->M; k++) + { + pernode[k] /= counts[k]; + avg += pernode[k]; + } + + free(counts); + *ret_avg = avg; + return; +} + + +/* Function: frag_trace_score() + * Date: SRE, Wed Dec 31 10:03:47 1997 [StL] + * + * Purpose: Allow MD/ME optimization to be used for alignments + * that include fragments and multihits -- estimate a full-length + * per-domain score. + * + * + * + * Return: "corrected" score. + */ +static float +frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr, + float *pernode, float expected) +{ + float sc; /* corrected score */ + float fragexp; /* expected score for a trace like this */ + int tpos; /* position in trace */ + + /* get uncorrected score */ + sc = P7TraceScore(hmm, dsq, tr); + + /* calc expected score for trace like this */ + fragexp = 0.; + for (tpos = 0; tpos < tr->tlen; tpos++) + if (tr->statetype[tpos] == STM || tr->statetype[tpos] == STD) + fragexp += pernode[tr->nodeidx[tpos]]; + + /* correct for multihits */ + fragexp /= (float) TraceDomainNumber(tr); + + /* extrapolate to full-length, one-hit score */ + sc = sc * expected / fragexp; + return sc; +} + + +/* Function: maximum_entropy() + * Date: SRE, Fri Jan 2 10:56:00 1998 [StL] + * + * Purpose: Optimizes a model according to maximum entropy weighting. + * See Krogh and Mitchison (1995). + * + * [Actually, we do minimum relative entropy, rather than + * maximum entropy. Same thing, though we refer to "ME" + * weights and models. The optimization is a steepest + * descents minimization of the relative entropy.] 
+ * + * Expects to be called shortly after a Maxmodelmaker() + * or Handmodelmaker(), so that both a new model architecture + * (with MAP parameters) and fake tracebacks are available. + * + * Prints a summary of optimization progress to stdout. + * + * Args: hmm - model. allocated, set with initial MAP parameters. + * dsq - dealigned digitized seqs the model is based on + * ainfo - extra info for aseqs + * nseq - number of aseqs + * eff_nseq- effective sequence number; weights normalize up to this. + * prior - prior distributions for parameterizing model + * tr - array of fake traces for each sequence + * + * Return: (void) + * hmm changed to an ME HMM + * ainfo changed, contains ME weights + */ +static void +maximum_entropy(struct plan7_s *hmm, char **dsq, AINFO *ainfo, int nseq, + float eff_nseq, struct p7prior_s *prior, struct p7trace_s **tr) +{ + float *wgt; /* current best set of ME weights */ + float *new_wgt; /* new set of ME weights to try */ + float *sc; /* log-odds score of each sequence */ + float *grad; /* gradient */ + float epsilon; /* steepness of descent */ + float relative_entropy; /* current best relative entropy */ + float new_entropy; /* relative entropy at new weights */ + float last_new_entropy; /* last new_entropy we calc'ed */ + float use_epsilon; /* current epsilon value in use */ + int idx; /* counter over sequences */ + int i1, i2; /* counters for iterations */ + + float converge_criterion; + float minw, maxw; /* min, max weight */ + int posw, highw; /* number of positive weights */ + float mins, maxs, avgs; /* min, max, avg score */ + float *pernode; /* expected score per node of HMM */ + float expscore; /* expected score of complete HMM */ + int max_iter; /* bulletproof against infinite loop bugs */ + + epsilon = 0.2; /* works fine */ + max_iter = 666; + + /* Allocations + */ + sc = MallocOrDie (sizeof(float) * nseq); + wgt = MallocOrDie (sizeof(float) * nseq); + new_wgt = MallocOrDie (sizeof(float) * nseq); + grad = MallocOrDie 
(sizeof(float) * nseq); + pernode = MallocOrDie (sizeof(float) * (hmm->M+1)); + + /* Initialization. Start with all weights == 1.0. + * Find relative entropy and gradient. + */ + Plan7SWConfig(hmm, 0.5, 0.5); + P7Logoddsify(hmm, TRUE); + + FSet(wgt, nseq, 1.0); + position_average_score(hmm, dsq, wgt, nseq, tr, pernode, &expscore); + for (idx = 0; idx < nseq; idx++) + sc[idx] = frag_trace_score(hmm, dsq[idx], tr[idx], pernode, expscore); + relative_entropy = FSum(sc, nseq) / (float) nseq; + for (idx = 0; idx < nseq; idx++) + grad[idx] = relative_entropy - sc[idx]; + + + /* + * printf statements commented out: + * + * printf("iter avg-sc min-sc max-sc min-wgt max-wgt +wgt ++wgt rel.ent convergence\n"); + * printf("---- ------ ------ ------ ------- ------- ---- ----- ------- -----------\n"); + * + */ + mins = maxs = avgs = sc[0]; + for (idx = 1; idx < nseq; idx++) + { + if (sc[idx] < mins) mins = sc[idx]; + if (sc[idx] > maxs) maxs = sc[idx]; + avgs += sc[idx]; + } + avgs /= nseq; + + /* + * printf statement commented out: + * + * printf("%4d %6.1f %6.1f %6.1f %7.2f %7.2f %4d %5d %7.2f %8s\n", + * 0, avgs, mins, maxs, 1.0, 1.0, nseq, 0, relative_entropy, "-"); + * + */ + + + /* Steepest descents optimization; + * iterate until relative entropy converges. + */ + i1 = 0; + while (++i1 < max_iter) + { + /* Gradient gives us a line of steepest descents. + * (Roughly speaking, anyway. We actually have a constraint + * that weights are nonnegative and normalized, and the + * gradient doesn't take these into account.) + * Look along this line, a distance of epsilon * gradient: + * if new point is better, accept; if new point is worse, + * move back along the line by half the distance and re-evaluate. 
+ */ + use_epsilon = epsilon; + new_entropy = relative_entropy + 1.0; /* just ensure new > old */ + + i2 = 0; + while (new_entropy > relative_entropy && ++i2 < max_iter) + { + last_new_entropy = new_entropy; + + /* find a new point in weight space */ + for (idx = 0; idx < nseq; idx++) + { + new_wgt[idx] = wgt[idx] + use_epsilon * grad[idx]; + if (new_wgt[idx] < 0.) new_wgt[idx] = 0.0; + } + FNorm(new_wgt, nseq); + FScale(new_wgt, nseq, (float) nseq); + + /* Make new HMM using these weights */ + ZeroPlan7(hmm); + for (idx = 0; idx < nseq; idx++) + P7TraceCount(hmm, dsq[idx], new_wgt[idx], tr[idx]); + P7PriorifyHMM(hmm, prior); + + + /* Evaluate new point */ + Plan7SWConfig(hmm, 0.5, 0.5); + P7Logoddsify(hmm, TRUE); + position_average_score(hmm, dsq, new_wgt, nseq, tr, pernode, &expscore); + for (idx = 0; idx < nseq; idx++) + sc[idx] = frag_trace_score(hmm, dsq[idx], tr[idx], pernode, expscore); + new_entropy = FDot(sc, new_wgt, nseq) / nseq; + + use_epsilon /= 2.0; + /* Failsafe: we're not converging. Set epsilon to zero, + * do one more round. + */ + if (use_epsilon < 1e-6) use_epsilon = 0.0; + if (use_epsilon == 0.0) break; + + /* Failsafe: avoid infinite loops. Sometimes the + new entropy converges without ever being better + than the previous point, probably as a result + of minor roundoff error. */ + if (last_new_entropy == new_entropy) break; + } + /* + * printf statement commented out: + * + * if (i2 == max_iter) printf(" -- exceeded maximum iterations; giving up --\n"); + * + */ + + /* Evaluate convergence before accepting the new weights; + * then, accept the new point and evaluate the gradient there. + */ + converge_criterion = fabs((relative_entropy-new_entropy)/relative_entropy); + relative_entropy = new_entropy; + FCopy(wgt, new_wgt, nseq); + for (idx = 0; idx < nseq; idx++) + grad[idx] = relative_entropy - sc[idx]; + + /* Print some statistics about this iteration + */ + mins = maxs = avgs = sc[0]; + minw = maxw = wgt[0]; + posw = (wgt[0] > 0.0) ? 
1 : 0; + highw = (wgt[0] > 1.0) ? 1 : 0; + for (idx = 1; idx < nseq; idx++) + { + if (sc[idx] < mins) mins = sc[idx]; + if (sc[idx] > maxs) maxs = sc[idx]; + if (wgt[idx] < minw) minw = wgt[idx]; + if (wgt[idx] > maxw) maxw = wgt[idx]; + if (wgt[idx] > 0.0) posw++; + if (wgt[idx] > 1.0) highw++; + avgs += sc[idx]; + } + avgs /= nseq; + + + /* + * printf statement commented out: + * + * printf("%4d %6.1f %6.1f %6.1f %7.2f %7.2f %4d %5d %7.2f %8.5f\n", + * i1, + * avgs, mins, maxs, + * minw, maxw, posw, highw, + * relative_entropy, converge_criterion); + * + */ + + if (converge_criterion < 1e-5) break; + } + /* + * printf statement commented out: + * + * if (i1 == max_iter) printf(" -- exceeded maximum iterations; giving up --\n"); + * + */ + + /* Renormalize weights to sum to eff_nseq, and save. + */ + FNorm(wgt, nseq); + FScale(wgt, nseq, (float) eff_nseq); + FCopy(ainfo->wgt, wgt, nseq); + /* Make final HMM using these adjusted weights */ + ZeroPlan7(hmm); + for (idx = 0; idx < nseq; idx++) + P7TraceCount(hmm, dsq[idx], wgt[idx], tr[idx]); + P7PriorifyHMM(hmm, prior); + + /* Cleanup and return + */ + free(pernode); + free(new_wgt); + free(grad); + free(wgt); + free(sc); + return; +} diff --git a/forester/archive/RIO/others/hmmer/src/hmmsearch-pvm.c b/forester/archive/RIO/others/hmmer/src/hmmsearch-pvm.c new file mode 100644 index 0000000..7acde9b --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/hmmsearch-pvm.c @@ -0,0 +1,180 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +#ifdef HMMER_PVM + +/* hmmsearch-pvm.c + * SRE, Wed Sep 23 09:30:53 1998 + * + * PVM slave for hmmsearch. 
+ * RCS $Id: hmmsearch-pvm.c,v 1.1.1.1 2005/03/22 08:34:12 cmzmasek Exp $
+ */
+
+/* NOTE(review): the names of the four system headers below were missing in
+ * this copy (bare "#include" lines). They are restored from the symbols this
+ * file uses -- exit()/atexit(), printf(), pvm_*() -- plus <float.h>, which is
+ * presumed. Confirm against the upstream HMMER source.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <float.h>
+#include <pvm3.h>
+
+#include "version.h"
+#include "structs.h"		/* data structures, macros, #define's   */
+#include "config.h"		/* compile-time configuration constants */
+#include "funcs.h"		/* function declarations                */
+#include "globals.h"		/* alphabet global variables            */
+#include "squid.h"		/* general sequence analysis library    */
+
+static void leave_pvm(void);
+
+/* Function: main()
+ *
+ * Purpose:  PVM slave process for hmmsearch. Receives the search options
+ *           and the HMM from the master in one HMMPVM_INIT message,
+ *           reports HMMPVM_OK or HMMPVM_BAD_INIT back, then scores each
+ *           digitized sequence that arrives as an HMMPVM_WORK message and
+ *           returns score, P-value and (for significant hits) the trace.
+ *           A sequence number of -1 from the master is the shutdown signal.
+ *
+ * Returns:  never returns normally; the process leaves through exit(),
+ *           which runs leave_pvm() via the atexit() registration.
+ */
+int
+main(void)
+{
+  struct plan7_s *hmm;		/* HMM to search with                        */
+  struct p7trace_s *tr;		/* trace structure for a Viterbi alignment   */
+  int master_tid;		/* PVM TID of our master                     */
+  int alphatype;		/* alphabet type                             */
+  int code;			/* status code for whether we're ok          */
+  int my_idx;			/* my slave index: 0..nslaves-1, master assigns */
+  int L;			/* length of sequence                        */
+  char *dsq;			/* digitized sequence 1..L                   */
+  float sc;			/* log odds score for seq + HMM              */
+  double pvalue;		/* P-value of sc                             */
+  double evalue;		/* bounded E-value of sc (we don't know nseq yet) */
+  int do_forward;		/* TRUE to score using Forward()             */
+  int do_null2;			/* TRUE to use null2 ad hoc correction       */
+  float globT;			/* T parameter: keep only hits > globT bits  */
+  double globE;			/* E parameter: keep hits < globE E-value    */
+  int Z;			/* nseq to base E value calculation on       */
+  int nseq;			/* actual nseq so far (master keeps updating this) */
+  int send_trace;		/* TRUE if sc looks significant and we return tr */
+
+  /* Register leave_pvm() cleanup function so any exit() call
+   * first calls pvm_exit().
+   */
+  if (atexit(leave_pvm) != 0) { pvm_exit(); Die("slave couldn't register leave_pvm()"); }
+
+  /*****************************************************************
+   * Initialization.
+   * Master broadcasts the problem to us:
+   *    globT, globE, Z, do_forward, do_null2, alphabet type, HMM,
+   ******************************************************************/
+
+  master_tid = pvm_parent();	/* who's our master? */
+  my_idx = -1;
+
+  /* wait for a HMMPVM_INIT message, and unpack it;
+   * get options, set alphabet type, get HMM.
+   */
+  pvm_recv(master_tid, HMMPVM_INIT);
+  pvm_upkfloat(&globT, 1, 1);
+  pvm_upkdouble(&globE, 1, 1);
+  pvm_upkint(&Z, 1, 1);
+  pvm_upkint(&do_forward, 1, 1);
+  pvm_upkint(&do_null2, 1, 1);
+  pvm_upkint(&alphatype, 1, 1);
+  SetAlphabet(alphatype);
+  hmm = PVMUnpackHMM();
+
+  /* Only touch the model if the unpack succeeded; a NULL hmm is reported
+   * to the master below instead of being dereferenced here.
+   */
+  if (hmm != NULL) P7Logoddsify(hmm, TRUE);
+
+  /* tell the master we're OK and ready to go (or not)
+   */
+  code = HMMPVM_OK;
+  if (hmm == NULL) code = HMMPVM_BAD_INIT;
+  pvm_initsend(PvmDataDefault);
+  pvm_pkint(&code, 1, 1);
+  PVMPackString(RELEASE);
+  pvm_send(master_tid, HMMPVM_RESULTS);
+
+  /* Without a model there is nothing this slave can score: the work loop
+   * reads hmm->M for every sequence. Leave now that the master has been
+   * told; exit() still runs leave_pvm().
+   */
+  if (hmm == NULL) exit(EXIT_FAILURE);
+
+  /*****************************************************************
+   * Main loop.
+   * Receive a digitized sequence to search against.
+   *****************************************************************/
+
+  for (;;)
+    {
+      SQD_DPRINTF1(("Slave about to do a blocking receive, waiting for input.\n"));
+      pvm_recv(master_tid, HMMPVM_WORK);
+      pvm_upkint(&nseq, 1, 1);
+      if (nseq == -1) break;	/* shutdown signal */
+      if (my_idx == -1) my_idx = nseq;	/* first job's number doubles as our index */
+      pvm_upkint(&L, 1, 1);
+      SQD_DPRINTF1(("Slave received nseq=%d L=%d my_idx=%d\n", nseq, L, my_idx));
+      dsq = MallocOrDie(sizeof(char) * (L + 2));
+      pvm_upkbyte(dsq, L+2, 1);
+      SQD_DPRINTF1(("Slave unpacked a seq of %d bytes; beginning processing\n", L+2));
+
+      /* Score sequence, do alignment (Viterbi), recover trace
+       */
+      if (P7ViterbiSize(L, hmm->M) <= RAMLIMIT)
+	{
+	  SQD_DPRINTF1(("Slave doing Viterbi after estimating %d MB\n", (P7ViterbiSize(L, hmm->M))));
+	  sc = P7Viterbi(dsq, L, hmm, &tr);
+	}
+      else
+	{
+	  SQD_DPRINTF1(("Slave going small after estimating %d MB\n", (P7ViterbiSize(L, hmm->M))));
+	  sc = P7SmallViterbi(dsq, L, hmm, &tr);
+	}
+
+      if (do_forward) sc  = P7Forward(dsq, L, hmm, NULL);
+      if (do_null2)   sc -= TraceScoreCorrection(hmm, tr, dsq);
+
+      /* Use the preset Z if the master sent one, else the running count. */
+      pvalue     = PValue(hmm, sc);
+      evalue     = Z ? (double) Z * pvalue : (double) nseq * pvalue;
+      send_trace = (sc >= globT && evalue <= globE) ? 1 : 0;
+
+      /* return output
+       */
+      SQD_DPRINTF1(("Slave has a result (sc = %.1f); sending back to master\n", sc));
+      pvm_initsend(PvmDataDefault);
+      pvm_pkint   (&my_idx, 1, 1);
+      pvm_pkfloat (&sc,     1, 1);
+      pvm_pkdouble(&pvalue, 1, 1);
+      pvm_pkint(&send_trace, 1, 1);	/* flag for whether a trace structure is coming */
+      if (send_trace) PVMPackTrace(tr);
+      pvm_send(master_tid, HMMPVM_RESULTS);
+
+      /* cleanup
+       */
+      free(dsq);
+      P7FreeTrace(tr);
+    }
+
+  /***********************************************
+   * Cleanup, return.
+   ***********************************************/
+
+  SQD_DPRINTF1(("Slave is done; performing a normal exit.\n"));
+  FreePlan7(hmm);
+  exit(0);			/* pvm_exit() gets called by atexit() registration. */
+}
+
+/* Function: leave_pvm()
+ *
+ * Purpose:  Cleanup function, to deal with crashes. We register
+ *           this function using atexit() so it gets called before
+ *           the slave dies.
+ */
+static void leave_pvm(void)
+{
+  SQD_DPRINTF1(("slave leaving PVM.\n"));
+  pvm_exit();
+}
+
+
+#else /* if HMMER_PVM not defined: include a dummy */
+
+/* NOTE(review): header name was missing here as well; <stdio.h> for printf()
+ * and <stdlib.h> for exit() are what the dummy below needs.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+int main(void)
+{
+  printf("hmmsearch-pvm is disabled. PVM support was not compiled into HMMER.\n");
+  exit(0);
+}
+
+#endif
diff --git a/forester/archive/RIO/others/hmmer/src/hmmsearch.c b/forester/archive/RIO/others/hmmer/src/hmmsearch.c
new file mode 100644
index 0000000..1c5ba7a
--- /dev/null
+++ b/forester/archive/RIO/others/hmmer/src/hmmsearch.c
@@ -0,0 +1,1101 @@
+/************************************************************
+ * HMMER - Biological sequence analysis with profile HMMs
+ * Copyright (C) 1992-1999 Washington University School of Medicine
+ * All Rights Reserved
+ *
+ * This source code is distributed under the terms of the
+ * GNU General Public License. See the files COPYING and LICENSE
+ * for details.
+ ************************************************************/ + +/* hmmsearch.c + * SRE, Tue Jan 7 17:19:20 1997 [St. Louis] + * + * Search a sequence database with a profile HMM. + * Conditionally includes PVM parallelization when HMMER_PVM is defined + * at compile time; hmmsearch --pvm runs the PVM version. + * + * CVS $Id: hmmsearch.c,v 1.1.1.1 2005/03/22 08:34:05 cmzmasek Exp $ + */ + +#include +#include +#include +#include +#include +#ifdef HMMER_THREADS +#include +#endif +#ifdef HMMER_PVM +#include +#endif + +#include "squid.h" /* general sequence analysis library */ +#include "config.h" /* compile-time configuration constants */ +#include "structs.h" /* data structures, macros, #define's */ +#include "funcs.h" /* function declarations */ +#include "globals.h" /* alphabet global variables */ +#include "version.h" /* version info */ + +static char banner[] = "hmmsearch - search a sequence database with a profile HMM"; + +static char usage[] = "\ +Usage: hmmsearch [-options] \n\ + Available options are:\n\ + -h : help; print brief help on version and usage\n\ + -A : sets alignment output limit to best domain alignments\n\ + -E : sets E value cutoff (globE) to <= x\n\ + -T : sets T bit threshold (globT) to >= x\n\ + -Z : sets Z (# seqs) for E-value calculation\n\ +"; + +static char experts[] = "\ + --compat : make best effort to use last version's output style\n\ + --cpu : run threads in parallel (if threaded)\n\ + --cut_ga : use Pfam GA gathering threshold cutoffs\n\ + --cut_nc : use Pfam NC noise threshold cutoffs\n\ + --cut_tc : use Pfam TC trusted threshold cutoffs\n\ + --domE : sets domain Eval cutoff (2nd threshold) to <= x\n\ + --domT : sets domain T bit thresh (2nd threshold) to >= x\n\ + --forward : use the full Forward() algorithm instead of Viterbi\n\ + --informat : sequence file is in format , not FASTA\n\ + --null2 : turn OFF the post hoc second null model\n\ + --pvm : run on a Parallel Virtual Machine (PVM)\n\ + --xnu : turn ON XNU filtering of 
target protein sequences\n\ +"; + +static struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, + { "-A", TRUE, sqdARG_INT }, + { "-E", TRUE, sqdARG_FLOAT}, + { "-T", TRUE, sqdARG_FLOAT}, + { "-Z", TRUE, sqdARG_INT }, + { "--compat", FALSE, sqdARG_NONE }, + { "--cpu", FALSE, sqdARG_INT }, + { "--cut_ga", FALSE, sqdARG_NONE }, + { "--cut_nc", FALSE, sqdARG_NONE }, + { "--cut_tc", FALSE, sqdARG_NONE }, + { "--domE", FALSE, sqdARG_FLOAT}, + { "--domT", FALSE, sqdARG_FLOAT}, + { "--forward", FALSE, sqdARG_NONE }, + { "--informat",FALSE, sqdARG_STRING}, + { "--null2", FALSE, sqdARG_NONE }, + { "--pvm", FALSE, sqdARG_NONE }, + { "--xnu", FALSE, sqdARG_NONE }, + +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + + +#ifdef HMMER_THREADS +/* POSIX threads version: + * the threads share a workpool_s structure amongst themselves, + * for obtaining locks on input HMM file and output histogram and + * tophits structures. + */ +struct workpool_s { + /* Shared configuration resources which don't change: + */ + struct plan7_s *hmm; /* HMM to search with */ + int do_xnu; /* TRUE to apply XNU filter */ + int do_forward; /* TRUE to score using Forward */ + int do_null2; /* TRUE to apply null2 ad hoc correction */ + struct threshold_s *thresh; /* score/evalue threshold info */ + + /* Shared (mutex-protected) input resources: + */ + SQFILE *sqfp; /* ptr to open sequence file */ + int nseq; /* number of seqs searched so far */ + pthread_mutex_t input_lock; /* mutex for locking input */ + + /* Shared (mutex-protected) output resources: + */ + struct tophit_s *ghit; /* per-sequence top hits */ + struct tophit_s *dhit; /* per-domain top hits */ + struct histogram_s *hist; /* histogram of scores */ + pthread_mutex_t output_lock; /* mutex for locking output */ + + /* Thread pool information + */ + pthread_t *thread; /* our pool of threads */ + int num_threads; /* number of threads */ +}; +static struct workpool_s *workpool_start(struct plan7_s *hmm, SQFILE *sqfp, + int 
do_xnu, int do_forward, int do_null2, + struct threshold_s *thresh, + struct tophit_s *ghit, struct tophit_s *dhit, + struct histogram_s *hist, int num_threads); +static void workpool_stop(struct workpool_s *wpool); +static void workpool_free(struct workpool_s *wpool); +static void *worker_thread(void *ptr); +#endif /* HMMER_THREADS */ + +static void main_loop_serial(struct plan7_s *hmm, SQFILE *sqfp, struct threshold_s *thresh, int do_forward, + int do_null2, int do_xnu, int num_threads, + struct histogram_s *histogram, struct tophit_s *ghit, + struct tophit_s *dhit, int *ret_nseq); +#ifdef HMMER_PVM +static void main_loop_pvm(struct plan7_s *hmm, SQFILE *sqfp, struct threshold_s *thresh, int do_forward, + int do_null2, int do_xnu, struct histogram_s *histogram, + struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nseq); +#endif + + +int +main(int argc, char **argv) +{ + char *hmmfile; /* file to read HMM(s) from */ + HMMFILE *hmmfp; /* opened hmmfile for reading */ + char *seqfile; /* file to read target sequence(s) from */ + SQFILE *sqfp; /* opened seqfile for reading */ + int format; /* format of seqfile */ + int i; + struct plan7_s *hmm; /* HMM to search with */ + struct histogram_s *histogram;/* histogram of all scores */ + struct fancyali_s *ali; /* displayed alignment info */ + struct tophit_s *ghit; /* list of top hits for whole sequences */ + struct tophit_s *dhit; /* list of top hits for domains */ + + float sc; /* score of an HMM search */ + double pvalue; /* pvalue of an HMM score */ + double evalue; /* evalue of an HMM score */ + double motherp; /* pvalue of a whole seq HMM score */ + float mothersc; /* score of a whole seq parent of domain */ + int sqfrom, sqto; /* coordinates in sequence */ + int hmmfrom, hmmto; /* coordinate in HMM */ + char *name, *acc, *desc; /* hit sequence name and description */ + int sqlen; /* length of seq that was hit */ + int nseq; /* number of sequences searched */ + int Z; /* # of seqs for purposes of E-val calc */ + 
int domidx; /* number of this domain */ + int ndom; /* total # of domains in this seq */ + int namewidth; /* max width of sequence name */ + int descwidth; /* max width of description */ + int nreported; /* # of hits reported in a list */ + + int Alimit; /* A parameter limiting output alignments */ + struct threshold_s thresh; /* contains all threshold (cutoff) info */ + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + int do_null2; /* TRUE to adjust scores with null model #2 */ + int do_forward; /* TRUE to use Forward() not Viterbi() */ + int do_xnu; /* TRUE to filter sequences thru XNU */ + int do_pvm; /* TRUE to run on Parallel Virtual Machine */ + int be_backwards; /* TRUE to be backwards-compatible in output*/ + int num_threads; /* number of worker threads */ + + /*********************************************** + * Parse command line + ***********************************************/ + + format = SQFILE_UNKNOWN; /* default: autodetect seq file format w/ Babelfish */ + do_forward = FALSE; + do_null2 = TRUE; + do_xnu = FALSE; + do_pvm = FALSE; + Z = 0; + be_backwards= FALSE; + + Alimit = INT_MAX; /* no limit on alignment output */ + thresh.globE = 10.0; /* use a reasonable Eval threshold; */ + thresh.globT = -FLT_MAX; /* but no bit threshold, */ + thresh.domT = -FLT_MAX; /* no domain bit threshold, */ + thresh.domE = FLT_MAX; /* and no domain Eval threshold. 
*/ + thresh.autocut = CUT_NONE; /* and no Pfam cutoffs used */ + thresh.Z = 0; /* Z not preset; use actual # of seqs */ + +#ifdef HMMER_THREADS + num_threads = ThreadNumber(); /* only matters if we're threaded */ +#else + num_threads = 0; +#endif + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) { + if (strcmp(optname, "-A") == 0) Alimit = atoi(optarg); + else if (strcmp(optname, "-E") == 0) thresh.globE = atof(optarg); + else if (strcmp(optname, "-T") == 0) thresh.globT = atof(optarg); + else if (strcmp(optname, "-Z") == 0) thresh.Z = atoi(optarg); + else if (strcmp(optname, "--compat") == 0) be_backwards = TRUE; + else if (strcmp(optname, "--cpu") == 0) num_threads = atoi(optarg); + else if (strcmp(optname, "--cut_ga") == 0) thresh.autocut = CUT_GA; + else if (strcmp(optname, "--cut_nc") == 0) thresh.autocut = CUT_NC; + else if (strcmp(optname, "--cut_tc") == 0) thresh.autocut = CUT_TC; + else if (strcmp(optname, "--domE") == 0) thresh.domE = atof(optarg); + else if (strcmp(optname, "--domT") == 0) thresh.domT = atof(optarg); + else if (strcmp(optname, "--forward") == 0) do_forward = TRUE; + else if (strcmp(optname, "--null2") == 0) do_null2 = FALSE; + else if (strcmp(optname, "--pvm") == 0) do_pvm = TRUE; + else if (strcmp(optname, "--xnu") == 0) do_xnu = TRUE; + else if (strcmp(optname, "--informat") == 0) { + format = String2SeqfileFormat(optarg); + if (format == SQFILE_UNKNOWN) + Die("unrecognized sequence file format \"%s\"", optarg); + } + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(0); + } + } + if (argc - optind != 2) + Die("Incorrect number of arguments.\n%s\n", usage); + + hmmfile = argv[optind++]; + seqfile = argv[optind++]; + +#ifndef HMMER_PVM + if (do_pvm) Die("PVM support is not compiled into your HMMER software; --pvm doesn't work."); +#endif +#ifndef HMMER_THREADS + if (num_threads) Die("Posix threads support is not compiled into HMMER; --cpu doesn't 
have any effect"); +#endif + + + /*********************************************** + * Open sequence database (might be in BLASTDB or current directory) + ***********************************************/ + + if ((sqfp = SeqfileOpen(seqfile, format, "BLASTDB")) == NULL) + Die("Failed to open sequence database file %s\n%s\n", seqfile, usage); + + /*********************************************** + * Open HMM file (might be in HMMERDB or current directory). + * Read a single HMM from it. (Config HMM, if necessary). + * Alphabet globals are set by reading the HMM. + ***********************************************/ + + if ((hmmfp = HMMFileOpen(hmmfile, "HMMERDB")) == NULL) + Die("Failed to open HMM file %s\n%s", hmmfile, usage); + if (!HMMFileRead(hmmfp, &hmm)) + Die("Failed to read any HMMs from %s\n", hmmfile); + if (hmm == NULL) + Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile); + P7Logoddsify(hmm, !do_forward); + + if (do_xnu && Alphabet_type == hmmNUCLEIC) + Die("The HMM is a DNA model, and you can't use the --xnu filter on DNA data"); + + /***************************************************************** + * Set up optional Pfam score thresholds. + * Can do this before starting any searches, since we'll only use 1 HMM. + *****************************************************************/ + + if (! 
SetAutocuts(&thresh, hmm)) + Die("HMM %s did not contain the GA, TC, or NC cutoffs you needed", + hmm->name); + + /*********************************************** + * Show the banner + ***********************************************/ + + Banner(stdout, banner); + printf( "HMM file: %s [%s]\n", hmmfile, hmm->name); + printf( "Sequence database: %s\n", seqfile); + if (do_pvm) + printf( "PVM: ACTIVE\n"); + printf( "per-sequence score cutoff: "); + if (thresh.globT == -FLT_MAX) printf("[none]\n"); + else { + printf(">= %.1f", thresh.globT); + if (thresh.autocut == CUT_GA) printf(" [GA1]\n"); + else if (thresh.autocut == CUT_NC) printf(" [NC1]\n"); + else if (thresh.autocut == CUT_TC) printf(" [TC1]\n"); + else printf("\n"); + } + printf( "per-domain score cutoff: "); + if (thresh.domT == -FLT_MAX) printf("[none]\n"); + else { + printf(">= %.1f", thresh.domT); + if (thresh.autocut == CUT_GA) printf(" [GA2]\n"); + else if (thresh.autocut == CUT_NC) printf(" [NC2]\n"); + else if (thresh.autocut == CUT_TC) printf(" [TC2]\n"); + else printf("\n"); + } + printf( "per-sequence Eval cutoff: "); + if (thresh.globE == FLT_MAX) printf("[none]\n"); + else printf("<= %-10.2g\n", thresh.globE); + + printf( "per-domain Eval cutoff: "); + if (thresh.domE == FLT_MAX) printf("[none]\n"); + else printf("<= %10.2g\n", thresh.domE); + printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n"); + + /*********************************************** + * Search HMM against each sequence + ***********************************************/ + + /* set up structures for storing output */ + histogram = AllocHistogram(-200, 200, 100); /* keeps full histogram */ + ghit = AllocTophits(200); /* per-seq hits: 200=lumpsize */ + dhit = AllocTophits(200); /* domain hits: 200=lumpsize */ + + if (! 
do_pvm) + main_loop_serial(hmm, sqfp, &thresh, do_forward, do_null2, do_xnu, num_threads, + histogram, ghit, dhit, &nseq); +#ifdef HMMER_PVM + else + main_loop_pvm(hmm, sqfp, &thresh, do_forward, do_null2, do_xnu, + histogram, ghit, dhit, &nseq); +#endif + + /*********************************************** + * Process hit lists, produce text output + ***********************************************/ + + /* Set the theoretical EVD curve in our histogram using + * calibration in the HMM, if available. + */ + if (hmm->flags & PLAN7_STATS) + ExtremeValueSetHistogram(histogram, hmm->mu, hmm->lambda, + histogram->lowscore, histogram->highscore, 0); + if (!thresh.Z) thresh.Z = nseq; /* set Z for good now that we're done. */ + + /* Format and report our output + */ + /* 1. Report overall sequence hits (sorted on E-value) */ + if (be_backwards) + { + printf("\nQuery HMM: %s|%s|%s\n", + hmm->name, + hmm->flags & PLAN7_ACC ? hmm->acc : "", + hmm->flags & PLAN7_DESC ? hmm->desc : ""); + } + else + { + printf("\nQuery HMM: %s\n", hmm->name); + printf("Accession: %s\n", hmm->flags & PLAN7_ACC ? hmm->acc : "[none]"); + printf("Description: %s\n", hmm->flags & PLAN7_DESC ? hmm->desc : "[none]"); + } + + if (hmm->flags & PLAN7_STATS) + printf(" [HMM has been calibrated; E-values are empirical estimates]\n"); + else + printf(" [No calibration for HMM; E-values are upper bounds]\n"); + + FullSortTophits(ghit); + namewidth = MAX(8, TophitsMaxName(ghit)); /* cannot truncate name. 
*/ + descwidth = MAX(52-namewidth, 11);/* may truncate desc, but need strlen("Description") */ + + printf("\nScores for complete sequences (score includes all domains):\n"); + printf("%-*s %-*s %7s %10s %3s\n", namewidth, "Sequence", descwidth, "Description", "Score", "E-value", " N "); + printf("%-*s %-*s %7s %10s %3s\n", namewidth, "--------", descwidth, "-----------", "-----", "-------", "---"); + for (i = 0, nreported = 0; i < ghit->num; i++) + { + char *safedesc; + GetRankedHit(ghit, i, + &pvalue, &sc, NULL, NULL, + &name, NULL, &desc, + NULL, NULL, NULL, /* sequence positions */ + NULL, NULL, NULL, /* HMM positions */ + NULL, &ndom, /* domain info */ + NULL); /* alignment info */ + evalue = pvalue * (double) thresh.Z; + + /* safedesc is a workaround for an apparent Linux printf() + * bug with the *.*s format. dbmalloc crashes with a memchr() ptr out of bounds + * flaw if the malloc'ed space for desc is short. The workaround + * is to make sure the ptr for *.* has a big malloc space. + */ + if (desc != NULL && strlen(desc) < 80) + { + safedesc = MallocOrDie(sizeof(char) * 80); + strcpy(safedesc, desc); + } + else safedesc = Strdup(desc); + + if (evalue <= thresh.globE && sc >= thresh.globT) { + printf("%-*s %-*.*s %7.1f %10.2g %3d\n", + namewidth, name, + descwidth, descwidth, safedesc != NULL ? safedesc : "", + sc, evalue, ndom); + nreported++; + } + free(safedesc); + } + if (nreported == 0) printf("\t[no hits above thresholds]\n"); + + + /* 2. 
Report domain hits (also sorted on E-value) */ + FullSortTophits(dhit); + namewidth = MAX(8, TophitsMaxName(dhit)); + + printf("\nParsed for domains:\n"); + printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n", + namewidth, "Sequence", "Domain ", "seq-f", "seq-t", "hmm-f", "hmm-t", "score", "E-value"); + printf("%-*s %7s %5s %5s %5s %5s %7s %8s\n", + namewidth, "--------", "-------", "-----", "-----", "-----", "-----", "-----", "-------"); + + for (i = 0, nreported = 0; i < dhit->num; i++) + { + GetRankedHit(dhit, i, + &pvalue, &sc, &motherp, &mothersc, + &name, NULL, NULL, + &sqfrom, &sqto, &sqlen, /* seq position info */ + &hmmfrom, &hmmto, NULL, /* HMM position info */ + &domidx, &ndom, /* domain info */ + NULL); /* alignment info */ + evalue = pvalue * (double) thresh.Z; + + if (motherp * (double) thresh.Z > thresh.globE || mothersc < thresh.globT) + continue; + else if (evalue <= thresh.domE && sc >= thresh.domT) { + printf("%-*s %3d/%-3d %5d %5d %c%c %5d %5d %c%c %7.1f %8.2g\n", + namewidth, name, + domidx, ndom, + sqfrom, sqto, + sqfrom == 1 ? '[' : '.', sqto == sqlen ? ']' : '.', + hmmfrom, hmmto, + hmmfrom == 1 ? '[':'.', hmmto == hmm->M ? ']' : '.', + sc, evalue); + nreported++; + } + } + if (nreported == 0) printf("\t[no hits above thresholds]\n"); + + + /* 3. Alignment output, also by domain. + * dhits is already sorted and namewidth is set, from above code. + * Number of displayed alignments is limited by Alimit parameter; + * also by domE (evalue threshold), domT (score theshold). 
+ */ + if (Alimit != 0) + { + printf("\nAlignments of top-scoring domains:\n"); + for (i = 0, nreported = 0; i < dhit->num; i++) + { + if (nreported == Alimit) break; /* limit to Alimit output alignments */ + GetRankedHit(dhit, i, + &pvalue, &sc, &motherp, &mothersc, + &name, NULL, NULL, + &sqfrom, &sqto, &sqlen, /* seq position info */ + &hmmfrom, &hmmto, NULL, /* HMM position info */ + &domidx, &ndom, /* domain info */ + &ali); /* alignment info */ + evalue = pvalue * (double) thresh.Z; + + if (motherp * (double) thresh.Z > thresh.globE || mothersc < thresh.globT) + continue; + else if (evalue <= thresh.domE && sc >= thresh.domT) + { + printf("%s: domain %d of %d, from %d to %d: score %.1f, E = %.2g\n", + name, domidx, ndom, sqfrom, sqto, sc, evalue); + PrintFancyAli(stdout, ali); + nreported++; + } + } + if (nreported == 0) printf("\t[no hits above thresholds]\n"); + if (nreported == Alimit) printf("\t[output cut off at A = %d top alignments]\n", Alimit); + } + + /* 4. Histogram output */ + printf("\nHistogram of all scores:\n"); + PrintASCIIHistogram(stdout, histogram); + + /* 5. Tophits summaries, while developing... + */ + printf("\nTotal sequences searched: %d\n", nseq); + printf("\nWhole sequence top hits:\n"); + TophitsReport(ghit, thresh.globE, nseq); + printf("\nDomain top hits:\n"); + TophitsReport(dhit, thresh.domE, nseq); + + /*********************************************** + * Clean-up and exit. + ***********************************************/ + + FreeHistogram(histogram); + HMMFileClose(hmmfp); + SeqfileClose(sqfp); + FreeTophits(ghit); + FreeTophits(dhit); + FreePlan7(hmm); + SqdClean(); + + return 0; +} + + +/* Function: main_loop_serial() + * Date: SRE, Wed Sep 23 10:20:49 1998 [St. Louis] + * + * Purpose: Search an HMM against a sequence database. + * main loop for the serial (non-PVM, non-threads) + * version. + * + * In: HMM and open sqfile, plus options + * Out: histogram, global hits list, domain hits list, nseq. 
+ *
+ * Args:     hmm        - the HMM to search with.
+ *           sqfp       - open SQFILE for sequence database
+ *           thresh     - score/evalue threshold info
+ *           do_forward - TRUE to score using Forward()
+ *           do_null2   - TRUE to use ad hoc null2 score correction
+ *           do_xnu     - TRUE to apply XNU mask
+ *           num_threads- number of worker threads to start, or 0
+ *                        (only read when HMMER_THREADS is defined)
+ *           histogram  - RETURN: score histogram
+ *           ghit       - RETURN: ranked global scores
+ *           dhit       - RETURN: ranked domain scores
+ *           ret_nseq   - RETURN: actual number of seqs searched
+ *                        (zero-length records are not counted)
+ *
+ * Returns:  (void)
+ */
+static void
+main_loop_serial(struct plan7_s *hmm, SQFILE *sqfp, struct threshold_s *thresh, int do_forward,
+		 int do_null2, int do_xnu, int num_threads,
+		 struct histogram_s *histogram,
+		 struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nseq)
+{
+#ifdef HMMER_THREADS
+  struct workpool_s *wpool;	/* pool of worker threads  */
+#else
+  struct p7trace_s *tr;		/* traceback               */
+  char   *seq;			/* target sequence         */
+  char   *dsq;			/* digitized target sequence */
+  SQINFO  sqinfo;		/* optional info for seq   */
+  float   sc;			/* score of an HMM search  */
+  double  pvalue;		/* pvalue of an HMM score  */
+  double  evalue;		/* evalue of an HMM score  */
+#endif
+  int     nseq;			/* number of sequences searched */
+
+#ifdef HMMER_THREADS
+  /* Threaded build: the pool does all the work; we only collect its count. */
+  wpool = workpool_start(hmm, sqfp, do_xnu, do_forward, do_null2, thresh,
+			 ghit, dhit, histogram, num_threads);
+  workpool_stop(wpool);
+  nseq = wpool->nseq;
+  workpool_free(wpool);
+
+#else /* unthreaded code: */
+  nseq = 0;
+  while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo))
+    {
+      /* Silently skip length 0 seqs.
+       * What, you think this doesn't occur? Welcome to genomics,
+       * young grasshopper.
+       * The record still came from ReadSeq(), so release it the same way
+       * the bottom of the loop does for sequences we score.
+       */
+      if (sqinfo.len == 0)
+	{
+	  FreeSequence(seq, &sqinfo);
+	  continue;
+	}
+
+      nseq++;
+      dsq = DigitizeSequence(seq, sqinfo.len);
+
+      if (do_xnu && Alphabet_type == hmmAMINO) XNU(dsq, sqinfo.len);
+
+      /* 1. Recover a trace by Viterbi.
+       *    Full Viterbi if its estimated size fits under RAMLIMIT,
+       *    otherwise the small-memory variant.
+       */
+      if (P7ViterbiSize(sqinfo.len, hmm->M) <= RAMLIMIT)
+	sc = P7Viterbi(dsq, sqinfo.len, hmm, &tr);
+      else
+	sc = P7SmallViterbi(dsq, sqinfo.len, hmm, &tr);
+
+      /* 2. If we're using Forward scores, calculate the
+       *    whole sequence score; this overrides anything
+       *    PostprocessSignificantHit() is going to do to the per-seq score.
+       */
+      if (do_forward) {
+	sc  = P7Forward(dsq, sqinfo.len, hmm, NULL);
+	if (do_null2)   sc -= TraceScoreCorrection(hmm, tr, dsq);
+      }
+
+#if DEBUGLEVEL >= 2
+      P7PrintTrace(stdout, tr, hmm, dsq);
+#endif
+
+      /* 3. Store score/pvalue for global alignment; will sort on score,
+       *    which in hmmsearch is monotonic with E-value.
+       *    Keep all domains in a significant sequence hit.
+       *    We can only make a lower bound estimate of E-value since
+       *    we don't know the final value of nseq yet, so the list
+       *    of hits we keep in memory is >= the list we actually
+       *    output.
+       */
+      pvalue = PValue(hmm, sc);
+      evalue = thresh->Z ? (double) thresh->Z * pvalue : (double) nseq * pvalue;
+      if (sc >= thresh->globT && evalue <= thresh->globE)
+	{
+	  PostprocessSignificantHit(ghit, dhit,
+				    tr, hmm, dsq, sqinfo.len,
+				    sqinfo.name,
+				    sqinfo.flags & SQINFO_ACC  ? sqinfo.acc  : NULL,
+				    sqinfo.flags & SQINFO_DESC ? sqinfo.desc : NULL,
+				    do_forward, sc,
+				    do_null2,
+				    thresh,
+				    FALSE); /* FALSE-> not hmmpfam mode, hmmsearch mode */
+	}
+      AddToHistogram(histogram, sc);	/* every scored sequence, significant or not */
+      FreeSequence(seq, &sqinfo);
+      P7FreeTrace(tr);
+      free(dsq);
+    }
+#endif
+
+  *ret_nseq = nseq;
+  return;
+}
+
+
+
+#ifdef HMMER_PVM
+/*****************************************************************
+ * PVM specific functions
+ ****************************************************************/
+
+/* Function: main_loop_pvm()
+ * Date:     SRE, Wed Sep 23 10:36:44 1998 [St. Louis]
+ *
+ * Purpose:  Search an HMM against a sequence database.
+ *           main loop for the PVM version.
+ *
+ * In:       HMM and open sqfile, plus options
+ * Out:      histogram, global hits list, domain hits list, nseq.
+ *
+ * Args:     hmm        - the HMM to search with. scoring form.
+ *           sqfp       - open SQFILE for sequence database
+ *           thresh     - score/evalue threshold information
+ *           do_forward - TRUE to score using Forward()
+ *           do_null2   - TRUE to use ad hoc null2 score correction
+ *           do_xnu     - TRUE to apply XNU mask
+ *           histogram  - RETURN: score histogram
+ *           ghit       - RETURN: ranked global scores
+ *           dhit       - RETURN: ranked domain scores
+ *           ret_nseq   - RETURN: actual number of seqs searched
+ *                        (zero-length records are not counted)
+ *
+ * Returns:  (void)
+ */
+static void
+main_loop_pvm(struct plan7_s *hmm, SQFILE *sqfp, struct threshold_s *thresh, int do_forward,
+	      int do_null2, int do_xnu, struct histogram_s *histogram,
+	      struct tophit_s *ghit, struct tophit_s *dhit, int *ret_nseq)
+{
+  char   *seq;			/* target sequence                 */
+  char   *dsq;			/* digitized target seq            */
+  SQINFO  sqinfo;		/* optional info about target seq  */
+  int     master_tid;		/* master's (my) PVM TID           */
+  int    *slave_tid;		/* array of slave TID's            */
+  int     nslaves;		/* number of slaves                */
+  int     nloaded;		/* number of slaves that were given a first seq */
+  int     code;			/* status code rec'd from a slave  */
+  int     nseq;			/* number of sequences searched    */
+  int     sent_trace;		/* TRUE if slave gave us a trace   */
+  char  **dsqlist;		/* remember what seqs slaves are doing */
+  char  **namelist;		/* remember what seq names slaves are doing */
+  char  **acclist ;		/* remember what seq accessions slaves are doing */
+  char  **desclist;		/* remember what seq desc's slaves are doing */
+  int    *lenlist;		/* remember lengths of seqs slaves are doing */
+  int     slaveidx;		/* counter for slaves              */
+  float   sc;			/* score of an alignment           */
+  double  pvalue;		/* P-value of a score of an alignment */
+  struct p7trace_s *tr;		/* Viterbi traceback of an alignment */
+  int     i;			/* generic counter                 */
+
+  /* Initialize PVM.
+   */
+  SQD_DPRINTF1(("Requesting master TID...\n"));
+  master_tid = pvm_mytid();
+#if DEBUGLEVEL >= 1
+  pvm_catchout(stderr);		/* catch output for debugging */
+#endif
+  SQD_DPRINTF1(("Spawning slaves...\n"));
+  PVMSpawnSlaves("hmmsearch-pvm", &slave_tid, &nslaves);
+  SQD_DPRINTF1(("Spawned a total of %d slaves...\n", nslaves));
+
+  /* Initialize the slaves by broadcast.
+   */
+  SQD_DPRINTF1(("Broadcasting to %d slaves...\n", nslaves));
+  pvm_initsend(PvmDataDefault);
+  pvm_pkfloat(&(thresh->globT), 1, 1);
+  pvm_pkdouble(&(thresh->globE), 1, 1);
+  pvm_pkint(&(thresh->Z), 1, 1);
+  pvm_pkint(&do_forward, 1, 1);
+  pvm_pkint(&do_null2, 1, 1);
+  pvm_pkint(&Alphabet_type, 1, 1);
+  PVMPackHMM(hmm);
+  pvm_mcast(slave_tid, nslaves, HMMPVM_INIT);
+  SQD_DPRINTF1(("Slaves should be ready...\n"));
+
+  /* Confirm slaves' OK status.
+   */
+  PVMConfirmSlaves(slave_tid, nslaves);
+  SQD_DPRINTF1(("Slaves confirm that they're ok...\n"));
+
+  /* Alloc arrays for remembering what seq each
+   * slave was working on.
+   */
+  namelist = MallocOrDie(sizeof(char *) * nslaves);
+  acclist  = MallocOrDie(sizeof(char *) * nslaves);
+  desclist = MallocOrDie(sizeof(char *) * nslaves);
+  dsqlist  = MallocOrDie(sizeof(char *) * nslaves);
+  lenlist  = MallocOrDie(sizeof(int) * nslaves);
+
+  /* Load the slaves.
+   * Give them all a sequence number and a digitized sequence
+   * to work on.
+   * A side effect of the seq number is that we assign each slave
+   * a number from 0..nslaves-1.
+   */
+  for (nseq = 0; nseq < nslaves; nseq++)
+    {
+      if (! ReadSeq(sqfp, sqfp->format, &seq, &sqinfo)) break;
+      if (sqinfo.len == 0)
+	{
+	  /* Skipped record: release it as the bottom of the loop would,
+	   * and keep this slave slot for the next real sequence.
+	   */
+	  FreeSequence(seq, &sqinfo);
+	  nseq--;
+	  continue;
+	}
+
+      dsq = DigitizeSequence(seq, sqinfo.len);
+      if (do_xnu && Alphabet_type == hmmAMINO) XNU(dsq, sqinfo.len);
+
+      pvm_initsend(PvmDataDefault);
+      pvm_pkint(&nseq, 1, 1);
+      pvm_pkint(&(sqinfo.len), 1, 1);
+      pvm_pkbyte(dsq, sqinfo.len+2, 1);
+      pvm_send(slave_tid[nseq], HMMPVM_WORK);
+      SQD_DPRINTF1(("sent a dsq : %d bytes\n", sqinfo.len+2));
+
+      namelist[nseq] = Strdup(sqinfo.name);
+      acclist[nseq]  = (sqinfo.flags & SQINFO_ACC)  ? Strdup(sqinfo.acc)  : NULL;
+      desclist[nseq] = (sqinfo.flags & SQINFO_DESC) ? Strdup(sqinfo.desc) : NULL;
+      lenlist[nseq]  = sqinfo.len;
+      dsqlist[nseq]  = dsq;
+
+      FreeSequence(seq, &sqinfo);
+    }
+  nloaded = nseq;		/* slaves nloaded..nslaves-1 got no work */
+  SQD_DPRINTF1(("%d slaves are loaded\n", nseq));
+
+  /* main receive/send loop
+   */
+  while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo))
+    {
+      if (sqinfo.len == 0)
+	{
+	  FreeSequence(seq, &sqinfo);	/* skipped, but still ours to release */
+	  continue;
+	}
+      nseq++;
+				/* check slaves before blocking */
+      PVMCheckSlaves(slave_tid, nslaves);
+
+				/* receive output */
+      SQD_DPRINTF1(("Waiting for a slave to give me output...\n"));
+      pvm_recv(-1, HMMPVM_RESULTS);
+      pvm_upkint(&slaveidx, 1, 1);	/* # of slave who's sending us stuff */
+      pvm_upkfloat(&sc, 1, 1);		/* score   */
+      pvm_upkdouble(&pvalue, 1, 1);	/* P-value */
+      pvm_upkint(&sent_trace, 1, 1);	/* TRUE if trace is coming */
+      tr = (sent_trace) ? PVMUnpackTrace() : NULL;
+      SQD_DPRINTF1(("Slave %d finished %s for me...\n", slaveidx, namelist[slaveidx]));
+
+				/* send new work; same XNU guard as the load loop */
+      dsq = DigitizeSequence(seq, sqinfo.len);
+      if (do_xnu && Alphabet_type == hmmAMINO) XNU(dsq, sqinfo.len);
+
+      pvm_initsend(PvmDataDefault);
+      pvm_pkint(&nseq, 1, 1);
+      pvm_pkint(&(sqinfo.len), 1, 1);
+      pvm_pkbyte(dsq, sqinfo.len+2, 1);
+      pvm_send(slave_tid[slaveidx], HMMPVM_WORK);
+
+				/* process output */
+      if (sent_trace)
+	{
+	  PostprocessSignificantHit(ghit, dhit,
+				    tr, hmm, dsqlist[slaveidx], lenlist[slaveidx],
+				    namelist[slaveidx], acclist[slaveidx], desclist[slaveidx],
+				    do_forward, sc,
+				    do_null2,
+				    thresh,
+				    FALSE); /* FALSE-> not hmmpfam mode, hmmsearch mode */
+	  P7FreeTrace(tr);
+	}
+      AddToHistogram(histogram, sc);
+
+				/* record seq info for seq we just sent */
+      free(namelist[slaveidx]);
+      if (acclist[slaveidx]  != NULL) free(acclist[slaveidx]);
+      if (desclist[slaveidx] != NULL) free(desclist[slaveidx]);
+      free(dsqlist[slaveidx]);
+
+      dsqlist[slaveidx]  = dsq;
+      namelist[slaveidx] = Strdup(sqinfo.name);
+      acclist[slaveidx]  = (sqinfo.flags & SQINFO_ACC)  ? Strdup(sqinfo.acc)  : NULL;
+      desclist[slaveidx] = (sqinfo.flags & SQINFO_DESC) ? Strdup(sqinfo.desc) : NULL;
+      lenlist[slaveidx]  = sqinfo.len;
+
+      FreeSequence(seq, &sqinfo);
+    }
+  SQD_DPRINTF1(("End of receive/send loop\n"));
+
+  /* Collect the output. Every slave that was loaded is still working
+   * on exactly one sequence.
+   */
+  for (i = 0; i < nloaded; i++)
+    {
+				/* don't check slaves (they're exiting normally);
+				   window of vulnerability here to slave crashes */
+				/* receive output */
+      pvm_recv(-1, HMMPVM_RESULTS);
+      pvm_upkint(&slaveidx, 1, 1);	/* # of slave who's sending us stuff */
+      pvm_upkfloat(&sc, 1, 1);		/* score   */
+      pvm_upkdouble(&pvalue, 1, 1);	/* P-value */
+      pvm_upkint(&sent_trace, 1, 1);	/* TRUE if trace is coming */
+      tr = (sent_trace) ? PVMUnpackTrace() : NULL;
+      SQD_DPRINTF1(("Slave %d finished %s for me...\n", slaveidx, namelist[slaveidx]));
+
+				/* process output */
+      if (sent_trace)
+	{
+	  PostprocessSignificantHit(ghit, dhit,
+				    tr, hmm, dsqlist[slaveidx], lenlist[slaveidx],
+				    namelist[slaveidx], acclist[slaveidx], desclist[slaveidx],
+				    do_forward, sc,
+				    do_null2,
+				    thresh,
+				    FALSE); /* FALSE-> not hmmpfam mode, hmmsearch mode */
+	  P7FreeTrace(tr);
+	}
+      AddToHistogram(histogram, sc);
+
+				/* free seq info */
+      free(namelist[slaveidx]);
+      if (acclist[slaveidx]  != NULL) free(acclist[slaveidx]);
+      if (desclist[slaveidx] != NULL) free(desclist[slaveidx]);
+      free(dsqlist[slaveidx]);
+
+				/* send cleanup/shutdown flag to slave */
+      pvm_initsend(PvmDataDefault);
+      code = -1;
+      pvm_pkint(&code, 1, 1);
+      pvm_send(slave_tid[slaveidx], HMMPVM_WORK);
+    }
+
+  /* If there were fewer sequences than slaves, the remaining slaves never
+   * got an HMMPVM_WORK message and would wait for one indefinitely. Send
+   * them the same -1 shutdown flag.
+   */
+  for (i = nloaded; i < nslaves; i++)
+    {
+      pvm_initsend(PvmDataDefault);
+      code = -1;
+      pvm_pkint(&code, 1, 1);
+      pvm_send(slave_tid[i], HMMPVM_WORK);
+    }
+
+
+  /* Cleanup; quit the VM; and return
+   */
+  free(slave_tid);
+  free(dsqlist);
+  free(namelist);
+  free(acclist);
+  free(desclist);
+  free(lenlist);
+  pvm_exit();
+  *ret_nseq = nseq;
+  return;
+}
+#endif /* HMMER_PVM */
+
+#ifdef HMMER_THREADS
+/*****************************************************************
+ * POSIX threads implementation.
+ *
+ * API:
+ *      workpool_start()   (makes a workpool_s structure. Starts calculations.)
+ *      workpool_stop()    (waits for threads to finish.)
+ *      workpool_free()    (destroys the structure)
+ *
+ * Threads:
+ *      worker_thread()    (the actual parallelized worker thread).
+ *****************************************************************/
+
+/* Function: workpool_start()
+ * Date:     SRE, Mon Oct  5 16:44:53 1998
+ *
+ * Purpose:  Initialize a workpool_s structure, and return it.
+ * + * Args: sqfp - open sequence file, at start + * do_xnu - TRUE to apply XNU filter + * do_forward - TRUE to score using Forward + * do_null2 - TRUE to apply null2 ad hoc correction + * thresh - score/evalue threshold info + * ghit - per-seq hit list + * dhit - per-domain hit list + * hist - histogram (alloced but empty) + * num_threads- number of worker threads to run. + * + * Returns: ptr to struct workpool_s. + * Caller must wait for threads to finish with workpool_stop(), + * then free the structure with workpool_free(). + */ +static struct workpool_s * +workpool_start(struct plan7_s *hmm, SQFILE *sqfp, int do_xnu, + int do_forward, int do_null2, struct threshold_s *thresh, + struct tophit_s *ghit, struct tophit_s *dhit, + struct histogram_s *hist, int num_threads) +{ + struct workpool_s *wpool; + pthread_attr_t attr; + int i; + int rtn; + + wpool = MallocOrDie(sizeof(struct workpool_s)); + wpool->thread = MallocOrDie(num_threads * sizeof(pthread_t)); + wpool->hmm = hmm; + + wpool->do_xnu = do_xnu; + wpool->do_forward = do_forward; + wpool->do_null2 = do_null2; + wpool->thresh = thresh; + + wpool->sqfp = sqfp; + wpool->nseq = 0; + if ((rtn = pthread_mutex_init(&(wpool->input_lock), NULL)) != 0) + Die("pthread_mutex_init FAILED; %s\n", strerror(rtn)); + + wpool->ghit = ghit; + wpool->dhit = dhit; + wpool->hist = hist; + if ((rtn = pthread_mutex_init(&(wpool->output_lock), NULL)) != 0) + Die("pthread_mutex_init FAILED; %s\n", strerror(rtn)); + + wpool->num_threads= num_threads; + + /* Create slave threads. See comments in hmmcalibrate.c at this + * step, regarding concurrency, system scope, and portability + * amongst various UNIX implementations of pthreads. 
+ */ + pthread_attr_init(&attr); +#ifndef __sgi +#ifdef HAVE_PTHREAD_ATTR_SETSCOPE + pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); +#endif +#endif +#ifdef HAVE_PTHREAD_SETCONCURRENCY + pthread_setconcurrency(num_threads+1); +#endif + /* pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); */ + for (i = 0; i < num_threads; i++) + if ((rtn = pthread_create(&(wpool->thread[i]), &attr, + worker_thread , (void *) wpool)) != 0) + Die("Failed to create thread %d; return code %d\n", i, rtn); + + pthread_attr_destroy(&attr); + return wpool; +} +/* Function: workpool_stop() + * Date: SRE, Thu Jul 16 11:20:16 1998 [St. Louis] + * + * Purpose: Waits for threads in a workpool to finish. + * + * Args: wpool -- ptr to the workpool structure + * + * Returns: (void) + */ +static void +workpool_stop(struct workpool_s *wpool) +{ + int i; + /* wait for threads to stop */ + for (i = 0; i < wpool->num_threads; i++) + if (pthread_join(wpool->thread[i],NULL) != 0) + Die("pthread_join failed"); + return; +} + +/* Function: workpool_free() + * Date: SRE, Thu Jul 16 11:26:27 1998 [St. Louis] + * + * Purpose: Free a workpool_s structure, after the threads + * have finished. + * + * Args: wpool -- ptr to the workpool. + * + * Returns: (void) + */ +static void +workpool_free(struct workpool_s *wpool) +{ + free(wpool->thread); + free(wpool); + return; +} + + +/* Function: worker_thread() + * Date: SRE, Mon Sep 28 10:48:29 1998 [St. Louis] + * + * Purpose: The procedure executed by the worker threads. + * + * Args: ptr - (void *) that is recast to a pointer to + * the workpool. 
+ * + * Returns: (void *) + */ +void * +worker_thread(void *ptr) +{ + struct workpool_s *wpool; /* our working threads structure */ + char *seq; /* target sequence */ + SQINFO sqinfo; /* information assoc w/ seq */ + char *dsq; /* digitized sequence */ + struct p7trace_s *tr; /* traceback from an alignment */ + float sc; /* score of an alignment */ + int rtn; /* a return code from pthreads lib */ + double pvalue; /* P-value of score */ + double evalue; /* E-value of score */ + + wpool = (struct workpool_s *) ptr; + for (;;) { + + /* 1. acquire lock on sequence input, and get + * the next seq to work on. + */ + /* acquire a lock */ + if ((rtn = pthread_mutex_lock(&(wpool->input_lock))) != 0) + Die("pthread_mutex_lock failure: %s\n", strerror(rtn)); + if (! ReadSeq(wpool->sqfp, wpool->sqfp->format, &seq, &sqinfo)) + { /* we're done. release lock, exit thread */ + if ((rtn = pthread_mutex_unlock(&(wpool->input_lock))) != 0) + Die("pthread_mutex_unlock failure: %s\n", strerror(rtn)); + pthread_exit(NULL); + } + SQD_DPRINTF1(("a thread is working on %s\n", sqinfo.name)); + wpool->nseq++; + /* release the lock */ + if ((rtn = pthread_mutex_unlock(&(wpool->input_lock))) != 0) + Die("pthread_mutex_unlock failure: %s\n", strerror(rtn)); + + if (sqinfo.len == 0) continue; /* silent skip of len=0 seqs (wormpep!?!) */ + + dsq = DigitizeSequence(seq, sqinfo.len); + if (wpool->do_xnu) XNU(dsq, sqinfo.len); + + /* 1. Recover a trace by Viterbi. + */ + if (P7ViterbiSize(sqinfo.len, wpool->hmm->M) <= RAMLIMIT) + sc = P7Viterbi(dsq, sqinfo.len, wpool->hmm, &tr); + else + sc = P7SmallViterbi(dsq, sqinfo.len, wpool->hmm, &tr); + + /* 2. If we're using Forward scores, do another DP + * to get it; else, we already have a Viterbi score + * in sc. + */ + if (wpool->do_forward) sc = P7Forward(dsq, sqinfo.len, wpool->hmm, NULL); + if (wpool->do_null2) sc -= TraceScoreCorrection(wpool->hmm, tr, dsq); + + /* 3. 
Save the output in tophits and histogram structures, after acquiring a lock + */ + if ((rtn = pthread_mutex_lock(&(wpool->output_lock))) != 0) + Die("pthread_mutex_lock failure: %s\n", strerror(rtn)); + SQD_DPRINTF1(("seq %s scores %f\n", sqinfo.name, sc)); + + pvalue = PValue(wpool->hmm, sc); + evalue = wpool->thresh->Z ? (double) wpool->thresh->Z * pvalue : (double) wpool->nseq * pvalue; + + if (sc >= wpool->thresh->globT && evalue <= wpool->thresh->globE) + { + PostprocessSignificantHit(wpool->ghit, wpool->dhit, + tr, wpool->hmm, dsq, sqinfo.len, + sqinfo.name, + sqinfo.flags & SQINFO_ACC ? sqinfo.acc : NULL, + sqinfo.flags & SQINFO_DESC ? sqinfo.desc : NULL, + wpool->do_forward, sc, + wpool->do_null2, + wpool->thresh, + FALSE); /* FALSE-> not hmmpfam mode, hmmsearch mode */ + } + AddToHistogram(wpool->hist, sc); + if ((rtn = pthread_mutex_unlock(&(wpool->output_lock))) != 0) + Die("pthread_mutex_unlock failure: %s\n", strerror(rtn)); + + P7FreeTrace(tr); + FreeSequence(seq, &sqinfo); + free(dsq); + } /* end 'infinite' loop over seqs in this thread */ +} + +#endif /* HMMER_THREADS */ + diff --git a/forester/archive/RIO/others/hmmer/src/masks.c b/forester/archive/RIO/others/hmmer/src/masks.c new file mode 100644 index 0000000..68eb09f --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/masks.c @@ -0,0 +1,367 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* masks.c + * SRE, Tue Nov 18 10:12:28 1997 + * + * Sequence masking routines; corrections for biased composition + * target sequences. 
+ * + * The Claverie/States XNU code is not used by default because I + * consider X'ing out sequence to be too black/white and too + * aggressive, but it's available as an option. + * + * The Wooton/Federhen SEG code was studied, but deemed too + * nonportable to include; it would've suffered the same drawback + * as XNU. + * + * The TraceScoreCorrection() code is the default. + * + * RCS $Id: masks.c,v 1.1.1.1 2005/03/22 08:34:02 cmzmasek Exp $ + */ + +#include +#include +#include + +#include "squid.h" +#include "config.h" +#include "structs.h" +#include "funcs.h" + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + +/* The PAM120 score matrix, in HMMER's AMINO_ALPHABET alphabetic order + */ +static int xpam120[23][23] = { + { 3, -3, 0, 0, -4, 1, -3, -1, -2, -3, -2, -1, 1, -1, -3, 1, 1, 0, -7, -4, 1, 0, 0 }, + {-3, 9, -7, -7, -6, -4, -4, -3, -7, -7, -6, -5, -4, -7, -4, 0, -3, -3, -8, -1, -4, -6, 0 }, + { 0, -7, 5, 3, -7, 0, 0, -3, -1, -5, -4, 2, -3, 1, -3, 0, -1, -3, -8, -5, 5, 3, 0 }, + { 0, -7, 3, 5, -7, -1, -1, -3, -1, -4, -3, 1, -2, 2, -3, -1, -2, -3, -8, -5, 3, 5, 0 }, + {-4, -6, -7, -7, 8, -5, -3, 0, -7, 0, -1, -4, -5, -6, -5, -3, -4, -3, -1, 4, -4, -5, 0 }, + { 1, -4, 0, -1, -5, 5, -4, -4, -3, -5, -4, 0, -2, -3, -4, 1, -1, -2, -8, -6, 1, -1, 0 }, + {-3, -4, 0, -1, -3, -4, 7, -4, -2, -3, -4, 2, -1, 3, 1, -2, -3, -3, -3, -1, 2, 2, 0 }, + {-1, -3, -3, -3, 0, -4, -4, 6, -3, 1, 1, -2, -3, -3, -2, -2, 0, 3, -6, -2, -2, -2, 0 }, + {-2, -7, -1, -1, -7, -3, -2, -3, 5, -4, 0, 1, -2, 0, 2, -1, -1, -4, -5, -5, 1, 0, 0 }, + {-3, -7, -5, -4, 0, -5, -3, 1, -4, 5, 3, -4, -3, -2, -4, -4, -3, 1, -3, -2, -3, -2, 0 }, + {-2, -6, -4, -3, -1, -4, -4, 1, 0, 3, 8, -3, -3, -1, -1, -2, -1, 1, -6, -4, -3, -1, 0 }, + {-1, -5, 2, 1, -4, 0, 2, -2, 1, -4, -3, 4, -2, 0, -1, 1, 0, -3, -4, -2, 4, 1, 0 }, + { 1, -4, -3, -2, -5, -2, -1, -3, -2, -3, -3, -2, 6, 0, -1, 1, -1, -2, -7, -6, -1, 0, 0 }, + {-1, -7, 1, 2, -6, -3, 3, -3, 0, -2, -1, 0, 0, 6, 1, -2, -2, -3, -6, -5, 1, 5, 0 }, + {-3, -4, 
-3, -3, -5, -4, 1, -2, 2, -4, -1, -1, -1, 1, 6, -1, -2, -3, 1, -5, -1, 0, 0 }, + { 1, 0, 0, -1, -3, 1, -2, -2, -1, -4, -2, 1, 1, -2, -1, 3, 2, -2, -2, -3, 1, 0, 0 }, + { 1, -3, -1, -2, -4, -1, -3, 0, -1, -3, -1, 0, -1, -2, -2, 2, 4, 0, -6, -3, 1, -1, 0 }, + { 0, -3, -3, -3, -3, -2, -3, 3, -4, 1, 1, -3, -2, -3, -3, -2, 0, 5, -8, -3, -2, -2, 0 }, + {-7, -8, -8, -8, -1, -8, -3, -6, -5, -3, -6, -4, -7, -6, 1, -2, -6, -8, 12, -2, -5, -6, 0 }, + {-4, -1, -5, -5, 4, -6, -1, -2, -5, -2, -4, -2, -6, -5, -5, -3, -3, -3, -2, 8, -2, -4, 0 }, + { 1, -4, 5, 3, -4, 1, 2, -2, 1, -3, -3, 4, -1, 1, -1, 1, 1, -2, -5, -2, 6, 4, 0 }, + { 0, -6, 3, 5, -5, -1, 2, -2, 0, -2, -1, 1, 0, 5, 0, 0, -1, -2, -6, -4, 4, 6, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, +}; + + +/* Function: XNU() + * Date: 18 Nov 1997 [StL] + * + * Purpose: x-out of repetitive sequence. XNU tends to be + * good at x'ing out short period tandem repeats. + * + * Note: Apply /only/ to protein sequence. + * + * Args: dsq: 1..len digitized sequence + * len: length of dsq + * + * Return: number of characters x'ed out. 
+ */ +int +XNU(char *dsq, int len) +{ + int i,k,off,sum,beg,end,top; + int topcut,fallcut; + double s0; + int noff = 4; /* maximum search offset */ + int mcut = 1; + double pcut = 0.01; + int *hit; + double lambda = 0.346574; + double K = 0.2; + double H = 0.664; + int xnum = 0; + + if (len == 0) return 0; + + hit = MallocOrDie(sizeof(int) * (len+1)); + for (i=1; i<=len; i++) hit[i]=0; + + /* + ** Determine the score cutoff so that pcut will be the fraction + ** of random sequence eliminated assuming lambda, K, and H are + ** characteristic of the database as a whole + */ + s0 = - log( pcut*H / (noff*K) ) / lambda; + if (s0>0) topcut = floor(s0 + log(s0)/lambda + 0.5); + else topcut = 0; + fallcut = (int)log(K/0.001)/lambda; + + for (off=mcut; off<=noff; off++) { + sum=top=0; + beg=off; + end=0; + + for (i=off+1; i<=len; i++) { + sum += xpam120[(int) dsq[i]][(int) dsq[i-off]]; + if (sum>top) { + top=sum; + end=i; + } + if (top>=topcut && top-sum>fallcut) { + for (k=beg; k<=end; k++) + hit[k] = hit[k-off] = 1; + sum=top=0; + beg=end=i+1; + } else if (top-sum>fallcut) { + sum=top=0; + beg=end=i+1; + } + if (sum<0) { + beg=end=i+1; + sum=top=0; + } + } + if (top>=topcut) { + for (k=beg; k<=end; k++) + hit[k] = hit[k-off] = 1; + } + } + + /* Now mask off detected repeats + */ + for (i=1; i<=len; i++) + if (hit[i]) { xnum++; dsq[i] = Alphabet_iupac-1;} /* e.g. 'X' */ + + free(hit); + return xnum; +} + + +/* Function: TraceScoreCorrection() + * Date: Sun Dec 21 12:05:47 1997 [StL] + * + * Purpose: Calculate a correction (in integer log_2 odds) to be + * applied to a sequence, using a second null model, + * based on a traceback. M/I emissions are corrected; + * C/N/J are not -- as if the nonmatching part and + * matching part were each generated by the best null model. + * The null model is constructed /post hoc/ as the + * average over all the M,I distributions used by the trace. + * + * Return: the log_2-odds score correction. 
+ */ +float +TraceScoreCorrection(struct plan7_s *hmm, struct p7trace_s *tr, char *dsq) +{ + float p[MAXABET]; /* null model distribution */ + int sc[MAXCODE]; /* null model scores */ + int x; + int tpos; + int score; + + /* Set up model: average over the emission distributions of + * all M, I states that appear in the trace. Ad hoc? Sure, you betcha. + */ + FSet(p, Alphabet_size, 0.0); + for (tpos = 0; tpos < tr->tlen; tpos++) + if (tr->statetype[tpos] == STM) + FAdd(p, hmm->mat[tr->nodeidx[tpos]], Alphabet_size); + else if (tr->statetype[tpos] == STI) + FAdd(p, hmm->ins[tr->nodeidx[tpos]], Alphabet_size); + FNorm(p, Alphabet_size); + + for (x = 0; x < Alphabet_size; x++) + sc[x] = Prob2Score(p[x], hmm->null[x]); + /* could avoid this chunk if we knew + we didn't need any degenerate char scores */ + for (x = Alphabet_size; x < Alphabet_iupac; x++) + sc[x] = DegenerateSymbolScore(p, hmm->null, x); + + + /* Score all the M,I state emissions that appear in the trace. + */ + score = 0; + for (tpos = 0; tpos < tr->tlen; tpos++) + if (tr->statetype[tpos] == STM || tr->statetype[tpos] == STI) + score += sc[(int) dsq[tr->pos[tpos]]]; + + /* Apply an ad hoc 8 bit fudge factor penalty; + * interpreted as a prior, saying that the second null model is + * 1/2^8 (1/256) as likely as the standard null model + */ + score -= 8 * INTSCALE; + + /* Return the correction to the bit score. + */ + return Scorify(ILogsum(0, score)); +} + + +/* THE FOLLOWING CODE IS IN DEVELOPMENT. + * it is commented out of the current release deliberately. + * If you activate it, I'm not responsible for the consequences. + */ +#if MICHAEL_JORDAN_BUYS_THE_PACERS +/* Function: NewTraceScoreCorrection() + * Date: Wed Feb 17 14:32:45 1999 [StL] + * + * Purpose: Calculate a correction (in integer log_2 odds) to be + * applied to a sequence, using a second null model, + * based on sequence endpoints. 
M/I emissions are corrected; + * C/N/J are not -- as if the nonmatching part and + * matching part were each generated by the best null model. + * Each null model is constructed /post hoc/ from the + * sequence composition of each matching domain (e.g. + * a null2 model is constructed for each domain in a + * multihit trace). + * + * Constraints on the construction of this function include: + * 1) Paracel hardware can't deal with trace-dependent + * null2 models. Original implementation of + * TraceScoreCorrection() was dependent on traceback + * and could not be reproduced on GeneMatcher. + * GeneMatcher may be able to deal w/ sequence endpoint + * dependent rescoring, though. + * Although this function looks like it's trace- + * dependent (because it's being passed a p7trace_s + * structure), it's really not; only the sequence + * endpoints are being used. + * + * 2) It is desirable that for multihit traces, + * per-domain scores sum to the per-sequence score. + * Otherwise people see this as a "bug" (cf. + * bug #2, David Kerk, NRC). HMMER calculates the + * per-domain scores by going through a separate + * TraceScore() call for each one and separately + * correcting them with TraceScoreCorrection(), + * so we have to do each domain in a full trace + * by a similar mechanism -- even if this means that + * we're adopting a very dubiously post hoc + * null model. + * + * Return: the log_2-odds score correction. 
+ */ +float +NewTraceScoreCorrection(struct plan7_s *hmm, struct p7trace_s *tr, char *dsq) +{ + float ct[MAXABET]; /* counts of observed residues */ + float p[MAXABET]; /* null2 model distribution (also counts) */ + float sc[MAXCODE]; /* null2 model scores (as floats not int) */ + + int x; + int tpos; + int score; /* tmp score for real HMM, integer logodds */ + float hmmscore; /* score for real HMM for this domain */ + float null2score; /* score for null2 model for this domain */ + + + float totscore; /* overall score for trace */ + float maxscore; /* best score so far for single domain */ + int in_domain; /* flag for whether we're counting this domain */ + int sym; /* digitized symbol in dsq */ + int ndom; /* number of domains counted towards score */ + + int nsym; /* number of symbols in this alignment */ + + totscore = 0.; + maxscore = -FLT_MAX; + in_domain = FALSE; + ndom = 0; + for (tpos = 0; tpos < tr->tlen; tpos++) + { + /* detect start of domain; start at N or J */ + if (tpos < tr->tlen-1 && tr->statetype[tpos+1] == STB) + { + FCopy(ct, hmm->null, Alphabet_size); /* simple Dirichlet prior */ + score = 0; + null2score = 0.; + nsym = 0; + in_domain = TRUE; + } + /* Count stuff in domain starting with N->B or J->B transition */ + if (in_domain) { + sym = (int) dsq[tr->pos[tpos]]; + + /* count emitted symbols in domain */ + if (tr->statetype[tpos] == STM || tr->statetype[tpos] == STI) + { + P7CountSymbol(ct, sym, 1.0); + nsym++; + } + + /* score emitted symbols in domain towards HMM */ + if (tr->statetype[tpos] == STM) + score += hmm->msc[sym][tr->nodeidx[tpos]]; + else if (tr->statetype[tpos] == STI) + score += hmm->isc[sym][tr->nodeidx[tpos]]; + /* score transitions in domain towards HMM */ + score += TransitionScoreLookup(hmm, + tr->statetype[tpos], tr->nodeidx[tpos], + tr->statetype[tpos+1], tr->nodeidx[tpos+1]); + } + + + if (tr->statetype[tpos] == STE) /* done w/ a domain; calc its score */ + { + /* convert counts to null2 prob distribution */ + FCopy(p, 
ct, Alphabet_size); + FNorm(p, Alphabet_size); + /* Convert probs to log-odds_e scores */ + /* p can't be zero, because of prior */ + for (x = 0; x < Alphabet_size; x++) + sc[x] = log(p[x] / hmm->null[x]); + /* null2 score = counts \dot scores */ + null2score = FDot(ct, sc, Alphabet_size); + + printf("NSYM = %d NULL2 = %.1f\n", nsym, null2score); + + /* Apply an ad hoc 12 bit fudge factor penalty, per domain. + * Interpreted probabilistically, saying that there's about + * a 1/256 probability to transition into the second null model. + */ + null2score -= 12.; + + /* Now correct score1 using the null2 score. + * If it's still > 0, add it to accumulated score. + */ + hmmscore = Scorify(score); + hmmscore -= 1.44269504 * LogSum(0, null2score); + if (hmmscore > 0.) { totscore += hmmscore; ndom++; } + if (hmmscore > maxscore) maxscore = hmmscore; + + in_domain = FALSE; + } + } + + /* Single domain special case. + */ + if (ndom == 0) totscore = maxscore; + + /* Return the correction to the bit score + */ + return (P7TraceScore(hmm, dsq, tr) - totscore); +} +#endif /*0*/ + + +float +SantaCruzCorrection(struct plan7_s *hmm, struct p7trace_s *tr, char *dsq) +{ + return 0.0; /* UNFINISHED CODE */ +} diff --git a/forester/archive/RIO/others/hmmer/src/mathsupport.c b/forester/archive/RIO/others/hmmer/src/mathsupport.c new file mode 100644 index 0000000..5938463 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/mathsupport.c @@ -0,0 +1,362 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + + +/* mathsupport.c + * SRE, Mon Nov 11 15:07:33 1996 + * + * Miscellaneous mathematical functions. 
+ * General functions are in the SQUID library sre_math.c. + * These functions are too HMM-specific to warrant being in the + * SQUID library. + * + */ + + +#include +#include +#ifdef HMMER_THREADS +#include +#endif +#include "funcs.h" +#include "config.h" +#include "structs.h" +#include "squid.h" + +/* Function: Prob2Score() + * + * Purpose: Convert a probability to a scaled integer log_2 odds score. + * Round to nearest integer (i.e. note use of +0.5 and floor()) + * Return the score. + */ +int +Prob2Score(float p, float null) +{ + if (p == 0.0) return -INFTY; + else return (int) floor(0.5 + INTSCALE * sreLOG2(p/null)); +} + +/* Function: Score2Prob() + * + * Purpose: Convert an integer log_2 odds score back to a probability; + * needs the null model probability, if any, to do the conversion. + */ +float +Score2Prob(int sc, float null) +{ + if (sc == -INFTY) return 0.; + else return (null * sreEXP2((float) sc / INTSCALE)); +} + + +/* Function: Scorify() + * + * Purpose: Convert a scaled integer log-odds score to a floating + * point score for output. (could be a macro but who cares.) + */ +float +Scorify(int sc) +{ + return ((float) sc / INTSCALE); +} + + +/* Function: PValue() + * Date: SRE, Mon Oct 27 12:21:02 1997 [Sanger Centre, UK] + * + * Purpose: Convert an HMM score to a P-value. + * We know P(S>x) is bounded by 1 / (1 + exp_2^x) for a bit score of x. + * We can also use EVD parameters for a tighter bound if we have + * them available. + * + * Args: hmm - model structure, contains EVD parameters + * sc - score in bits + * + * Returns: P value for score significance. + */ +double +PValue(struct plan7_s *hmm, float sc) +{ + double pval; + double pval2; + /* the bound from Bayes */ + if (sc >= sreLOG2(DBL_MAX)) pval = 0.0; + else pval = 1. 
/ (1.+sreEXP2(sc)); + + /* try for a better estimate from EVD fit */ + if (hmm != NULL && (hmm->flags & PLAN7_STATS)) + { + pval2 = ExtremeValueP(sc, hmm->mu, hmm->lambda); + if (pval2 < pval) pval = pval2; + } + return pval; +} + +/* Function: LogSum() + * + * Purpose: Returns the log of the sum of two log probabilities. + * log(exp(p1)+exp(p2)) = p1 + log(1 + exp(p2-p1)) for p1 > p2 + * Note that this is in natural log space, not log_2. + */ +float +LogSum(float p1, float p2) +{ + if (p1 > p2) + return (p1-p2 > 50.) ? p1 + log(1. + exp(p2-p1)) : p1; + else + return (p2-p1 > 50.) ? p2 + log(1. + exp(p1-p2)) : p2; +} + + +/* Function: ILogsum() + * + * Purpose: Return the scaled integer log probability of + * the sum of two probabilities p1 and p2, where + * p1 and p2 are also given as scaled log probabilities. + * + * log(exp(p1)+exp(p2)) = p1 + log(1 + exp(p2-p1)) for p1 > p2 + * + * For speed, builds a lookup table the first time it's called. + * LOGSUM_TBL is set to 20000 by default, in config.h. + * + * Because of the one-time initialization, we have to + * be careful in a multithreaded implementation... hence + * the use of pthread_once(), which forces us to put + * the initialization routine and the lookup table outside + * ILogsum(). (Thanks to Henry Gabb at Intel for pointing + * out this problem.) + * + * Args: p1,p2 -- scaled integer log_2 probabilities to be summed + * in probability space. + * + * Return: scaled integer log_2 probability of the sum. 
+ */ +static int ilogsum_lookup[LOGSUM_TBL]; +static void +init_ilogsum(void) +{ + int i; + for (i = 0; i < LOGSUM_TBL; i++) + ilogsum_lookup[i] = (int) (INTSCALE * 1.44269504 * + (log(1.+exp(0.69314718 * (float) -i/INTSCALE)))); +} +int +ILogsum(int p1, int p2) +{ + int diff; +#ifdef HMMER_THREADS + static pthread_once_t firsttime = PTHREAD_ONCE_INIT; + pthread_once(&firsttime, init_ilogsum); +#else + static int firsttime = 1; + if (firsttime) { init_ilogsum(); firsttime = 0; } +#endif + + diff = p1-p2; + if (diff >= LOGSUM_TBL) return p1; + else if (diff <= -LOGSUM_TBL) return p2; + else if (diff > 0) return p1 + ilogsum_lookup[diff]; + else return p2 + ilogsum_lookup[-diff]; +} + +/* Function: LogNorm() + * + * Purpose: Normalize a vector of log likelihoods, changing it + * to a probability vector. Be careful of overflowing exp(). + * Implementation adapted from Graeme Mitchison. + * + * Args: vec - vector destined to become log probabilities + * n - length of vec + */ +void +LogNorm(float *vec, int n) +{ + int x; + float max = -1.0e30; + float denom = 0.; + + for (x = 0; x < n; x++) + if (vec[x] > max) max = vec[x]; + for (x = 0; x < n; x++) + if (vec[x] > max - 50.) + denom += exp(vec[x] - max); + for (x = 0; x < n; x++) + if (vec[x] > max - 50.) + vec[x] = exp(vec[x] - max) / denom; + else + vec[x] = 0.0; +} + + +/* Function: Logp_cvec() + * + * Purpose: Calculates ln P(cvec|dirichlet), the log probability of a + * count vector given a Dirichlet distribution. Adapted + * from an implementation by Graeme Mitchison. 
+ * + * Args: cvec - count vector + * n - length of cvec + * alpha - Dirichlet alpha terms + * + * Return: log P(cvec|dirichlet) + */ +float +Logp_cvec(float *cvec, int n, float *alpha) +{ + float lnp; /* log likelihood of P(cvec | Dirichlet) */ + float sum1, sum2, sum3; + int x; + + sum1 = sum2 = sum3 = lnp = 0.0; + for (x = 0; x < n; x++) + { + sum1 += cvec[x] + alpha[x]; + sum2 += alpha[x]; + sum3 += cvec[x]; + lnp += Gammln(alpha[x] + cvec[x]); + lnp -= Gammln(cvec[x] + 1.); + lnp -= Gammln(alpha[x]); + } + lnp -= Gammln(sum1); + lnp += Gammln(sum2); + lnp += Gammln(sum3 + 1.); + return lnp; +} + +/* Function: SampleDirichlet() + * + * Purpose: Given a Dirichlet distribution defined by + * a vector of n alpha terms, sample of probability + * distribution of dimension n. + * + * This code was derived from source provided + * by Betty Lazareva, from Gary Churchill's group. + * + * Args: alpha - vector of Dirichlet alphas components + * n - number of components + * ret_p - RETURN: sampled probability vector. + * + * Return: (void) + * ret_p, an n-dimensional array alloced by the caller, + * is filled. + */ +void +SampleDirichlet(float *alpha, int n, float *p) +{ + int x; + + for (x = 0; x < n; x++) + p[x] = SampleGamma(alpha[x]); + FNorm(p, n); +} + + +/* Function: SampleGamma() + * + * Purpose: Return a random deviate distributed as Gamma(alpha, 1.0). 
+ * Uses two different accept/reject algorithms, one + * for 0= 1.0) + { + /*CONSTCOND*/ while (1) + { + lambda = sqrt(2.0*alpha -1.0); + U = sre_random(); + V = U/(1-U); + X = alpha * pow(V, 1/lambda); + W = .25*exp(-X+alpha)*pow(V,1.0+alpha/lambda)*pow(1.0+1.0/V, 2.0); + if (sre_random() <= W) + return X; + } + } + else if (alpha > 0.0) + { + /*CONSTCOND*/ while (1) + { + U = sre_random(); + V = U*(1+ alpha/exp(1.0)); + if (V > 1.0) + { + X = -log( (1-V+alpha/exp(1.0))/alpha); + if (sre_random() <= pow(X, alpha-1.0)) + return X; + } + else + { + X = pow(V,1.0/alpha); + if (sre_random() <= exp(-X)) + return X; + } + } + } + Die("Invalid argument alpha < 0.0 to SampleGamma()"); + /*NOTREACHED*/ + return 0.0; +} + +/* Function: SampleCountvector() + * + * Purpose: Given a probability vector p of dimensionality + * n, sample c counts and store them in cvec. + * cvec is n-dimensional and is alloced by the caller. + */ +void +SampleCountvector(float *p, int n, int c, float *cvec) +{ + int i; + + FSet(cvec, n, 0.0); + for (i = 0; i < c; i++) + cvec[FChoose(p,n)] += 1.0; +} + + + +/* Function: P_PvecGivenDirichlet() + * + * Purpose: Calculate the log probability of a probability + * vector given a single Dirichlet component, alpha. + * Follows Sjolander (1996) appendix, lemma 2. 
+ * + * Return: log P(p | alpha) + */ +float +P_PvecGivenDirichlet(float *p, int n, float *alpha) +{ + float sum; /* for Gammln(|alpha|) in Z */ + float logp; /* RETURN: log P(p|alpha) */ + int x; + + sum = logp = 0.0; + for (x = 0; x < n; x++) + if (p[x] > 0.0) /* any param that is == 0.0 doesn't exist */ + { + logp += (alpha[x]-1.0) * log(p[x]); + logp -= Gammln(alpha[x]); + sum += alpha[x]; + } + logp += Gammln(sum); + return logp; +} + + diff --git a/forester/archive/RIO/others/hmmer/src/misc.c b/forester/archive/RIO/others/hmmer/src/misc.c new file mode 100644 index 0000000..9a7cf26 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/misc.c @@ -0,0 +1,140 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* misc.c + * SRE, Thu Jul 15 18:49:19 1993 + * + * Functions that I don't know quite where to put yet. + */ + +#include +#include +#include +#include +#include +#include + +#include "squid.h" +#include "config.h" +#include "structs.h" +#include "version.h" + +/* Function: Getword() + * + * Purpose: little function used by ReadPrior() and ReadHMM() to parse + * next valid field out of an open file, ignoring + * comments. '#' marks the beginning of a comment. 
+ * + * Arg: fp - open file for reading + * type - sqdARG_INT, sqdARG_FLOAT, or sqdARG_STRING from squid.h + */ +char * +Getword(FILE *fp, int type) +{ + static char buffer[512]; + static char *sptr = NULL; + + if (sptr != NULL) sptr = strtok(NULL, " \t\n"); + + while (sptr == NULL) + { + if ((sptr = fgets(buffer, 512, fp)) == NULL) return NULL; + if ((sptr = strchr(buffer, '#')) != NULL) *sptr = '\0'; + sptr = strtok(buffer, " \t\n"); + } + + switch (type) { + case sqdARG_STRING: + if (strlen(sptr) == 0) { + Warn("Parse failed: expected string, got nothing"); + sptr = NULL; + } + break; + case sqdARG_INT: + if (!IsInt(sptr)) { + Warn("Parse failed: expected integer, got %s", sptr); + sptr = NULL; + } + break; + case sqdARG_FLOAT: + if (!IsReal(sptr)) { + Warn("Parse failed: expected real value, got %s", sptr); + sptr = NULL; + } + break; + } + + return sptr; +} + + +/* Function: Getline() + * + * Purpose: Get the next non-blank, non-comment line from an open file. + * A comment line has '#' as the first non-whitespace character. + * Returns NULL if no line is found. + * Syntax is the same as fgets(). + * + * Args: s - allocated storage for line + * n - number of characters allocated for s + * fp - open FILE * + * + * Return: Either s, or NULL if no new line is found. + */ +char * +Getline(char *s, int n, FILE *fp) +{ + char *first; + + do { + if (fgets(s, n, fp) == NULL) return NULL; + first = s; while (isspace((int) (*first))) first++; + } while (*first == '#' || *first == '\0'); + return s; +} + + +/* Function: SetAutocuts() + * Date: SRE, Thu Jun 8 08:19:46 2000 [TW721 over Ireland] + * + * Purpose: Set score thresholds using the GA, TC, or NC information + * in an HMM. + * + * Args: thresh - score threshold structure. autocut must be set + * properly (CUT_GA, CUT_NC, or CUT_TC). + * hmm - HMM containing appropriate score cutoff info + * + * Returns: 1 on success. 
+ * 0 if HMM does not have the score cutoffs available -- caller + * will have to decide on a fallback plan. + * Has no effect (and returns success) if autocut is + * CUT_NONE. + */
int
SetAutocuts(struct threshold_s *thresh, struct plan7_s *hmm)
{
  int applied = 0;              /* set once a pair of bit-score cutoffs was copied */

  if (thresh->autocut == CUT_GA)
    {                           /* gathering cutoffs */
      if ((hmm->flags & PLAN7_GA) == 0) return 0;
      thresh->globT = hmm->ga1;
      thresh->domT  = hmm->ga2;
      applied = 1;
    }
  else if (thresh->autocut == CUT_NC)
    {                           /* noise cutoffs */
      if ((hmm->flags & PLAN7_NC) == 0) return 0;
      thresh->globT = hmm->nc1;
      thresh->domT  = hmm->nc2;
      applied = 1;
    }
  else if (thresh->autocut == CUT_TC)
    {                           /* trusted cutoffs */
      if ((hmm->flags & PLAN7_TC) == 0) return 0;
      thresh->globT = hmm->tc1;
      thresh->domT  = hmm->tc2;
      applied = 1;
    }

  /* When bit-score cutoffs are in force, open the E-value thresholds fully. */
  if (applied)
    thresh->globE = thresh->domE = FLT_MAX;

  return 1;
}
diff --git a/forester/archive/RIO/others/hmmer/src/modelmakers.c b/forester/archive/RIO/others/hmmer/src/modelmakers.c new file mode 100644 index 0000000..8e5eab8 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/modelmakers.c @@ -0,0 +1,940 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* modelmakers.c + * SRE, Fri Nov 15 10:00:04 1996 + * + * Construction of models from multiple alignments. Three versions: + * Handmodelmaker() -- use #=RF annotation to indicate match columns + * Fastmodelmaker() -- Krogh/Haussler heuristic + * Maxmodelmaker() -- MAP model construction algorithm (Eddy, + * unpublished) + * + * The meat of the model construction code is in matassign2hmm(). 
+ * The three model construction strategies simply label which columns + * are supposed to be match states, and then hand this info to + * matassign2hmm(). + * + * Two wrinkles to watch for: + * 1) The alignment is assumed to contain sequence fragments. Look in + * fake_tracebacks() for how internal entry/exit points are handled. + * 2) Plan7 disallows DI and ID transitions, but an alignment may + * imply these. Look in trace_doctor() for how DI and ID transitions + * are removed. + */ + +#include +#include +#include +#include +#include +#include + +#include "structs.h" +#include "config.h" +#include "funcs.h" +#include "squid.h" +#include "msa.h" + +/* flags used for matassign[] arrays -- + * assignment of aligned columns to match/insert states + */ +#define ASSIGN_MATCH (1<<0) +#define FIRST_MATCH (1<<1) +#define LAST_MATCH (1<<2) +#define ASSIGN_INSERT (1<<3) +#define EXTERNAL_INSERT_N (1<<4) +#define EXTERNAL_INSERT_C (1<<5) + +static int build_cij(char **aseqs, int nseq, int *insopt, int i, int j, + float *wgt, float *cij); +static int estimate_model_length(MSA *msa); +static void matassign2hmm(MSA *msa, char **dsq, + int *matassign, struct plan7_s **ret_hmm, + struct p7trace_s ***ret_tr); +static void fake_tracebacks(char **aseq, int nseq, int alen, int *matassign, + struct p7trace_s ***ret_tr); +static void trace_doctor(struct p7trace_s *tr, int M, int *ret_ndi, + int *ret_nid); +static void annotate_model(struct plan7_s *hmm, int *matassign, MSA *msa); +static void print_matassign(int *matassign, int alen); + + + +/* Function: P7Handmodelmaker() + * + * Purpose: Manual model construction: + * Construct an HMM from an alignment, where the #=RF line + * of a HMMER alignment file is given to indicate + * the columns assigned to matches vs. inserts. + * + * NOTE: Handmodelmaker() will slightly revise the alignment + * if necessary, if the assignment of columns implies + * DI and ID transitions. 
+ * + * Returns both the HMM in counts form (ready for applying + * Dirichlet priors as the next step), and fake tracebacks + * for each aligned sequence. + * + * Args: msa - multiple sequence alignment + * dsq - digitized unaligned aseq's + * ret_hmm - RETURN: counts-form HMM + * ret_tr - RETURN: array of tracebacks for aseq's + * + * Return: (void) + * ret_hmm and ret_tr alloc'ed here; FreeTrace(tr[i]), free(tr), + * FreeHMM(hmm). + */ +void +P7Handmodelmaker(MSA *msa, char **dsq, + struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr) +{ + int *matassign; /* MAT state assignments if 1; 1..alen */ + int apos; /* counter for aligned columns */ + + /* Make sure we have all the info about the alignment that we need */ + if (msa->rf == NULL) + Die("Alignment must have RF annotation to hand-build an HMM"); + + /* Allocation */ + matassign = (int *) MallocOrDie (sizeof(int) * (msa->alen+1)); + + /* Determine match assignment from optional annotation + */ + matassign[0] = 0; + for (apos = 0; apos < msa->alen; apos++) + { + matassign[apos+1] = 0; + if (!isgap(msa->rf[apos])) + matassign[apos+1] |= ASSIGN_MATCH; + else + matassign[apos+1] |= ASSIGN_INSERT; + } + + /* Hand matassign off for remainder of model construction + */ + /* print_matassign(matassign, msa->alen); */ + matassign2hmm(msa, dsq, matassign, ret_hmm, ret_tr); + + free(matassign); + return; +} + + +/* Function: P7Fastmodelmaker() + * + * Purpose: Heuristic model construction: + * Construct an HMM from an alignment using the original + * Krogh/Haussler heuristic; any column with more + * symbols in it than a given fraction is assigned to + * match. + * + * NOTE: Fastmodelmaker() will slightly revise the + * alignment if the assignment of columns implies + * DI and ID transitions. + * + * Returns the HMM in counts form (ready for applying Dirichlet + * priors as the next step). Also returns fake traceback + * for each training sequence. 
+ * + * Args: msa - multiple sequence alignment + * dsq - digitized unaligned aseq's + * maxgap - if more gaps than this, column becomes insert. + * ret_hmm - RETURN: counts-form HMM + * ret_tr - RETURN: array of tracebacks for aseq's + * + * Return: (void) + * ret_hmm and ret_tr alloc'ed here; FreeTrace(tr[i]), free(tr), + * FreeHMM(hmm). + */ +void +P7Fastmodelmaker(MSA *msa, char **dsq, float maxgap, + struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr) +{ + int *matassign; /* MAT state assignments if 1; 1..alen */ + int idx; /* counter over sequences */ + int apos; /* counter for aligned columns */ + int ngap; /* number of gaps in a column */ + + /* Allocations: matassign is 1..alen array of bit flags + */ + matassign = (int *) MallocOrDie (sizeof(int) * (msa->alen+1)); + + /* Determine match assignment by counting symbols in columns + */ + matassign[0] = 0; + for (apos = 0; apos < msa->alen; apos++) { + matassign[apos+1] = 0; + + ngap = 0; + for (idx = 0; idx < msa->nseq; idx++) + if (isgap(msa->aseq[idx][apos])) + ngap++; + + if ((float) ngap / (float) msa->nseq > maxgap) + matassign[apos+1] |= ASSIGN_INSERT; + else + matassign[apos+1] |= ASSIGN_MATCH; + } + + /* Once we have matassign calculated, all modelmakers behave + * the same; matassign2hmm() does this stuff (traceback construction, + * trace counting) and sets up ret_hmm and ret_tr. + */ + matassign2hmm(msa, dsq, matassign, ret_hmm, ret_tr); + + free(matassign); + return; +} + + +/* Function: P7Maxmodelmaker() + * + * Purpose: The Unholy Beast of HMM model construction algorithms -- + * maximum a posteriori construction. A tour de force and + * probably overkill. MAP construction for Krogh + * HMM-profiles is fairly straightforward, but MAP construction of + * Plan 7 HMM-profiles is, er, intricate. + * + * Given a multiple alignment, construct an optimal (MAP) model + * architecture. Return a counts-based HMM. 
+ * + * Args: msa - multiple sequence alignment + * dsq - digitized, unaligned seqs + * maxgap - above this, trailing columns are assigned to C + * prior - priors on parameters to use for model construction + * null - random sequence model emissions + * null_p1 - random sequence model p1 transition + * mpri - prior on architecture: probability of new match node + * ret_hmm - RETURN: new hmm (counts form) + * ret_tr - RETURN: array of tracebacks for aseq's + * + * Return: (void) + * ret_hmm and ret_tr (if !NULL) must be free'd by the caller. + */
void
P7Maxmodelmaker(MSA *msa, char **dsq, float maxgap,
		struct p7prior_s *prior,
		float *null, float null_p1, float mpri,
		struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr)
{
  int     idx;			/* counter for seqs                */
  int     i, j;			/* positions in alignment          */
  int     x;			/* counter for syms or transitions */
  float **matc;			/* count vectors: [1..alen][0..19] */
  float   cij[8], tij[8];	/* count and score transit vectors */
  float   matp[MAXABET];        /* match emission vector           */
  float   insp[MAXABET];	/* insert score vector             */
  float   insc[MAXABET];	/* insert count vector             */
  float  *sc;			/* DP scores [0,1..alen,alen+1]    */
  int    *tbck;			/* traceback ptrs for sc           */
  int    *matassign;            /* match assignments [1..alen]     */
  int    *insopt;		/* number of inserted chars [0..nseq-1] */
  int     first, last;		/* positions of first and last cols [1..alen] */
  float   bm1, bm2;		/* estimates for start,internal b->m t's  */
  int     est_M;		/* estimate for the size of the model */
  float   new, bestsc;		/* new score, best score so far     */
  int     code;			/* optimization: return code from build_cij() */
  int     ngap;			/* gap count in a column                      */
  float   wgtsum;		/* sum of weights; do not assume it is nseq   */

  /* Allocations
   */
  matc      = (float **) MallocOrDie (sizeof(float *) * (msa->alen+1));
  sc        = (float *)  MallocOrDie (sizeof(float)   * (msa->alen+2));
  tbck      = (int *)    MallocOrDie (sizeof(int)     * (msa->alen+2));
  matassign = (int *)    MallocOrDie (sizeof(int)     * (msa->alen+1));
  insopt    = (int *)    MallocOrDie (sizeof(int)     * msa->nseq);
  for (i = 0; i < msa->alen; i++) {
    matc[i+1] = (float *) MallocOrDie (Alphabet_size * sizeof(float));
    FSet(matc[i+1], Alphabet_size, 0.);
  }

  /* Precalculations: weighted symbol counts per column, log2 of the
   * architecture prior, and log-odds insert emission scores.
   */
  for (i = 0; i < msa->alen; i++)
    for (idx = 0; idx < msa->nseq; idx++)
      if (!isgap(msa->aseq[idx][i]))
	P7CountSymbol(matc[i+1], SymbolIndex(msa->aseq[idx][i]), msa->wgt[idx]);
  mpri = sreLOG2(mpri);

  FCopy(insp, prior->i[0], Alphabet_size);
  FNorm(insp, Alphabet_size);
  wgtsum = FSum(msa->wgt, msa->nseq);
  for (x = 0; x < Alphabet_size; x++)
    insp[x] = sreLOG2(insp[x] / null[x]);

  /* Estimate the relevant special transitions.
   * NOTE(review): est_M == 1 makes the divisor zero; behaviour for that
   * case is unchanged from the original.
   */
  est_M = estimate_model_length(msa);
  bm1   = 0.5;
  bm2   = 0.5 / (float) (est_M-1);
  bm1   = sreLOG2(bm1 / null_p1);
  bm2   = sreLOG2(bm2 / null_p1);

  /* Estimate the position of the last match-assigned column
   * by counting gap frequencies.
   * NOTE: the caller-supplied maxgap is overridden here, as in the
   * original code.
   */
  maxgap = 0.5;
  for (last = msa->alen; last >= 1; last--) {
    ngap = 0;
    for (idx = 0; idx < msa->nseq; idx++)
      if (isgap(msa->aseq[idx][last-1])) ngap++;
    if ((float) ngap / (float) msa->nseq <= maxgap)
      break;
  }

  /* Initialization
   */
  sc[last]   = 0.;
  tbck[last] = 0;

  /* Set ME gaps to '_'
   */
  for (idx = 0; idx < msa->nseq; idx++)
    for (i = last; i > 0 && isgap(msa->aseq[idx][i-1]); i--)
      msa->aseq[idx][i-1] = '_';

  /* Main recursion moves from right to left.
   */
  for (i = last-1; i > 0; i--) {
				/* Calculate match emission scores for i  */
    FCopy(matp, matc[i], Alphabet_size);
    P7PriorifyEmissionVector(matp, prior, prior->mnum, prior->mq, prior->m, NULL);
    for (x = 0; x < Alphabet_size; x++)
      matp[x] = sreLOG2(matp[x] / null[x]);

				/* Initialize insert counters to zero */
    FSet(insc, Alphabet_size, 0.);
    for (idx = 0; idx < msa->nseq; idx++) insopt[idx] = 0;

    sc[i]   = -FLT_MAX;
    tbck[i] = 0;		/* defined value even if no j is usable */
    for (j = i+1; j <= last; j++) {
				/* build transition matrix for column pair i,j */
      code = build_cij(msa->aseq, msa->nseq, insopt, i, j, msa->wgt, cij);
      if (code == -1) break;	/* no j to our right can work for us */
      if (code == 1) {
	FCopy(tij, cij, 7);
	P7PriorifyTransitionVector(tij, prior, prior->tq);
	FNorm(tij, 3);
	tij[TMM] = sreLOG2(tij[TMM] / null_p1);
	tij[TMI] = sreLOG2(tij[TMI] / null_p1);
	tij[TMD] = sreLOG2(tij[TMD]);
	tij[TIM] = sreLOG2(tij[TIM] / null_p1);
	tij[TII] = sreLOG2(tij[TII] / null_p1);
	tij[TDM] = sreLOG2(tij[TDM] / null_p1);
	tij[TDD] = sreLOG2(tij[TDD]);
				/* calculate the score of using this j. */
	new = sc[j] + FDot(tij, cij, 7) + FDot(insp, insc, Alphabet_size);

	SQD_DPRINTF2(("%3d %3d new=%6.2f scj=%6.2f m=%6.2f i=%6.2f t=%6.2f\n",
	       i, j, new, sc[j], FDot(matp, matc[i], Alphabet_size),
	       FDot(insp, insc, Alphabet_size), FDot(tij, cij, 7)));

				/* keep it if it's better */
	if (new > sc[i]) {
	  sc[i]   = new;
	  tbck[i] = j;
	}
      }
				/* bump insc, insopt insert symbol counters */
      FAdd(insc, matc[j], Alphabet_size);
      for (idx = 0; idx < msa->nseq; idx++)
	if (!isgap(msa->aseq[idx][j-1])) insopt[idx]++;
    }
				/* add in constant contributions for col i */
				/* note ad hoc scaling of mpri by wgtsum (us. nseq)*/
    sc[i] += FDot(matp, matc[i], Alphabet_size) + mpri * wgtsum;
  } /* end loop over start positions i */

  /* Termination: place the begin state.
   * log odds score for S->N->B is all zero except for NB transition, which
   * is a constant. So we only have to evaluate BM transitions.
   *
   * The column examined for each candidate begin position is i itself;
   * the original code indexed with the stale inner-loop variable j.
   * first defaults to 0 so the traceback below is a no-op when there is
   * no candidate column at all (last == 0).
   */
  first  = 0;
  bestsc = -FLT_MAX;
  for (i = 1; i <= last; i++) {
    new = sc[i];
    for (idx = 0; idx < msa->nseq; idx++) {
      if (isgap(msa->aseq[idx][i-1]))
	new += bm2;		/* internal B->M transition */
      else
	new += bm1;		/* B->M1 transition         */
    }
    if (new > bestsc) {
      bestsc = new;
      first  = i;
    }
  }

  /* Traceback: every column starts as insert; columns on the optimal
   * path from first (following tbck until 0) become match.
   */
  matassign[0] = 0;
  for (i = 1; i <= msa->alen; i++) matassign[i] = ASSIGN_INSERT;
  for (i = first; i != 0; i = tbck[i]) {
    matassign[i] &= ~ASSIGN_INSERT;
    matassign[i] |= ASSIGN_MATCH;
  }

  /* Hand matassign off for remainder of model construction
   */
  /*  print_matassign(matassign, ainfo->alen); */
  matassign2hmm(msa, dsq, matassign, ret_hmm, ret_tr);

  /* Clean up.
   */
  for (i = 1; i <= msa->alen; i++) free(matc[i]);
  free(matc);
  free(sc);
  free(tbck);
  free(matassign);
  free(insopt);
}


+/* Function: build_cij() + * + * Purpose: Construct a counts vector for transitions between + * column i and column j in a multiple alignment. + * + * '_' gap characters indicate "external" gaps which + * are to be dealt with by B->M and M->E transitions. + * These characters must be placed by a preprocessor. + * + * insopt is an "insert optimization" -- an incrementor + * which keeps track of the number of insert symbols + * between i and j. + * + * Args: aseqs - multiple alignment. [0.nseq-1][0.alen-1] + * nseq - number of seqs in aseqs + * insopt - number of inserts per seq between i/j [0.nseq-1] + * i - i column [1.alen], off by one from aseqs + * j - j column [1.alen], off by one from aseqs + * wgt - per-seq weights [0.nseq-1] + * cij - transition count vectors [0..7] + * + * Return: -1 if an illegal transition was seen for this i/j assignment *and* + * we are guaranteed that any j to the right will also + * have illegal transitions. + * 0 if an illegal transition was seen, but a j further to the + * right may work. + * 1 if all transitions were legal. 
+ */ +static int +build_cij(char **aseqs, int nseq, int *insopt, int i, int j, + float *wgt, float *cij) +{ + int idx; /* counter for seqs */ + + i--; /* make i,j relative to aseqs [0..alen-1] */ + j--; + FSet(cij, 8, 0.); /* zero cij */ + for (idx = 0; idx < nseq; idx++) { + if (insopt[idx] > 0) { + if (isgap(aseqs[idx][i])) return -1; /* D->I prohibited. */ + if (isgap(aseqs[idx][j])) return 0; /* I->D prohibited. */ + cij[TMI] += wgt[idx]; + cij[TII] += (insopt[idx]-1) * wgt[idx]; + cij[TIM] += wgt[idx]; + } else { + if (!isgap(aseqs[idx][i])) { + if (aseqs[idx][j] == '_') ; /* YO! what to do with trailer? */ + else if (isgap(aseqs[idx][j])) cij[TMD] += wgt[idx]; + else cij[TMM] += wgt[idx]; + } else { /* ignores B->E possibility */ + if (aseqs[idx][j] == '_') continue; + else if (isgap(aseqs[idx][j])) cij[TDD] += wgt[idx]; + else cij[TDM] += wgt[idx]; + } + } + } + return 1; +} + + +/* Function: estimate_model_length() + * + * Purpose: Return a decent guess about the length of the model, + * based on the lengths of the sequences. + * + * Algorithm is dumb: use weighted average length. + * + * Don't assume that weights sum to nseq! + */ +static int +estimate_model_length(MSA *msa) +{ + int idx; + float total = 0.; + float wgtsum = 0.; + + for (idx = 0; idx < msa->nseq; idx++) + { + total += msa->wgt[idx] * DealignedLength(msa->aseq[idx]); + wgtsum += msa->wgt[idx]; + } + + return (int) (total / wgtsum); +} + + +/* Function: matassign2hmm() + * + * Purpose: Given an assignment of alignment columns to match vs. + * insert, finish the final part of the model construction + * calculation that is constant between model construction + * algorithms. 
+ * + * Args: msa - multiple sequence alignment + * dsq - digitized unaligned aseq's + * matassign - 1..alen bit flags for column assignments + * ret_hmm - RETURN: counts-form HMM + * ret_tr - RETURN: array of tracebacks for aseq's + * + * Return: (void) + * ret_hmm and ret_tr alloc'ed here for the calling + * modelmaker function. + */ +static void +matassign2hmm(MSA *msa, char **dsq, int *matassign, + struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr) +{ + struct plan7_s *hmm; /* RETURN: new hmm */ + struct p7trace_s **tr; /* fake tracebacks for each seq */ + int M; /* length of new model in match states */ + int idx; /* counter over sequences */ + int apos; /* counter for aligned columns */ + + /* how many match states in the HMM? */ + M = 0; + for (apos = 1; apos <= msa->alen; apos++) { + if (matassign[apos] & ASSIGN_MATCH) + M++; + } + /* delimit N-terminal tail */ + for (apos=1; matassign[apos] & ASSIGN_INSERT && apos <= msa->alen; apos++) + matassign[apos] |= EXTERNAL_INSERT_N; + if (apos <= msa->alen) matassign[apos] |= FIRST_MATCH; + + /* delimit C-terminal tail */ + for (apos=msa->alen; matassign[apos] & ASSIGN_INSERT && apos > 0; apos--) + matassign[apos] |= EXTERNAL_INSERT_C; + if (apos > 0) matassign[apos] |= LAST_MATCH; + + /* print_matassign(matassign, msa->alen); */ + + /* make fake tracebacks for each seq */ + fake_tracebacks(msa->aseq, msa->nseq, msa->alen, matassign, &tr); + /* build model from tracebacks */ + hmm = AllocPlan7(M); + ZeroPlan7(hmm); + for (idx = 0; idx < msa->nseq; idx++) { + /* P7PrintTrace(stdout, tr[idx], NULL, NULL); */ + P7TraceCount(hmm, dsq[idx], msa->wgt[idx], tr[idx]); + } + /* annotate new model */ + annotate_model(hmm, matassign, msa); + + /* Set #=RF line of alignment to reflect our assignment + * of match, delete. matassign is valid from 1..alen and is off + * by one from msa->rf. 
+ */ + if (msa->rf != NULL) free(msa->rf); + msa->rf = (char *) MallocOrDie (sizeof(char) * (msa->alen + 1)); + for (apos = 0; apos < msa->alen; apos++) + msa->rf[apos] = matassign[apos+1] & ASSIGN_MATCH ? 'x' : '.'; + msa->rf[msa->alen] = '\0'; + + /* Cleanup and return. */ + if (ret_tr != NULL) *ret_tr = tr; + else { for (idx = 0; idx < msa->nseq; idx++) P7FreeTrace(tr[idx]); free(tr); } + if (ret_hmm != NULL) *ret_hmm = hmm; else FreePlan7(hmm); + return; +} + + + +/* Function: fake_tracebacks() + * + * Purpose: From a consensus assignment of columns to MAT/INS, construct fake + * tracebacks for each individual sequence. + * + * Note: Fragment tolerant by default. Internal entries are + * B->M_x, instead of B->D1->D2->...->M_x; analogously + * for internal exits. + * + * Args: aseqs - alignment [0..nseq-1][0..alen-1] + * nseq - number of seqs in alignment + * alen - length of alignment in columns + * matassign - assignment of column; [1..alen] (off one from aseqs) + * ret_tr - RETURN: array of tracebacks + * + * Return: (void) + * ret_tr is alloc'ed here. Caller must free. + */ +static void +fake_tracebacks(char **aseq, int nseq, int alen, int *matassign, + struct p7trace_s ***ret_tr) +{ + struct p7trace_s **tr; + int idx; /* counter over sequences */ + int i; /* position in raw sequence (1..L) */ + int k; /* position in HMM */ + int apos; /* position in alignment columns */ + int tpos; /* position in traceback */ + + tr = (struct p7trace_s **) MallocOrDie (sizeof(struct p7trace_s *) * nseq); + + for (idx = 0; idx < nseq; idx++) + { + P7AllocTrace(alen+6, &tr[idx]); /* allow room for S,N,B,E,C,T */ + + /* all traces start with S state... 
*/ + tr[idx]->statetype[0] = STS; + tr[idx]->nodeidx[0] = 0; + tr[idx]->pos[0] = 0; + /* ...and transit to N state; N-term tail + is emitted on N->N transitions */ + tr[idx]->statetype[1] = STN; + tr[idx]->nodeidx[1] = 0; + tr[idx]->pos[1] = 0; + + i = 1; + k = 0; + tpos = 2; + for (apos = 0; apos < alen; apos++) + { + tr[idx]->statetype[tpos] = STBOGUS; /* bogus, deliberately, to debug */ + + if (matassign[apos+1] & FIRST_MATCH) + { /* BEGIN */ + tr[idx]->statetype[tpos] = STB; + tr[idx]->nodeidx[tpos] = 0; + tr[idx]->pos[tpos] = 0; + tpos++; + } + + if (matassign[apos+1] & ASSIGN_MATCH && ! isgap(aseq[idx][apos])) + { /* MATCH */ + k++; /* move to next model pos */ + tr[idx]->statetype[tpos] = STM; + tr[idx]->nodeidx[tpos] = k; + tr[idx]->pos[tpos] = i; + i++; + tpos++; + } + else if (matassign[apos+1] & ASSIGN_MATCH) + { /* DELETE */ + /* being careful about S/W transitions; no B->D transitions */ + k++; /* *always* move on model when ASSIGN_MATCH */ + if (tr[idx]->statetype[tpos-1] != STB) + { + tr[idx]->statetype[tpos] = STD; + tr[idx]->nodeidx[tpos] = k; + tr[idx]->pos[tpos] = 0; + tpos++; + } + } + else if (matassign[apos+1] & EXTERNAL_INSERT_N && + ! isgap(aseq[idx][apos])) + { /* N-TERMINAL TAIL */ + tr[idx]->statetype[tpos] = STN; + tr[idx]->nodeidx[tpos] = 0; + tr[idx]->pos[tpos] = i; + i++; + tpos++; + } + else if (matassign[apos+1] & EXTERNAL_INSERT_C && + ! isgap(aseq[idx][apos])) + { /* C-TERMINAL TAIL */ + tr[idx]->statetype[tpos] = STC; + tr[idx]->nodeidx[tpos] = 0; + tr[idx]->pos[tpos] = i; + i++; + tpos++; + } + else if (! 
isgap(aseq[idx][apos])) + { /* INSERT */ + tr[idx]->statetype[tpos] = STI; + tr[idx]->nodeidx[tpos] = k; + tr[idx]->pos[tpos] = i; + i++; + tpos++; + } + + if (matassign[apos+1] & LAST_MATCH) + { /* END */ + /* be careful about S/W transitions; may need to roll + * back over some D's because there's no D->E transition + */ + while (tr[idx]->statetype[tpos-1] == STD) + tpos--; + tr[idx]->statetype[tpos] = STE; + tr[idx]->nodeidx[tpos] = 0; + tr[idx]->pos[tpos] = 0; + tpos++; + /* and then transit E->C; + alignments that use J are undefined; + C-term tail is emitted on C->C transitions */ + tr[idx]->statetype[tpos] = STC; + tr[idx]->nodeidx[tpos] = 0; + tr[idx]->pos[tpos] = 0; + tpos++; + } + } + /* all traces end with T state */ + tr[idx]->statetype[tpos] = STT; + tr[idx]->nodeidx[tpos] = 0; + tr[idx]->pos[tpos] = 0; + tr[idx]->tlen = ++tpos; + /* deal with DI, ID transitions */ + /* k == M here */ + trace_doctor(tr[idx], k, NULL, NULL); + + } /* end for sequence # idx */ + + *ret_tr = tr; + return; +} + +/* Function: trace_doctor() + * + * Purpose: Plan 7 disallows D->I and I->D "chatter" transitions. + * However, these transitions may be implied by many + * alignments for hand- or heuristic- built HMMs. + * trace_doctor() collapses I->D or D->I into a + * single M position in the trace. + * Similarly, B->I and I->E transitions may be implied + * by an alignment. + * + * trace_doctor does not examine any scores when it does + * this. In ambiguous situations (D->I->D) the symbol + * will be pulled arbitrarily to the left, regardless + * of whether that's the best column to put it in or not. 
+ * + * Args: tr - trace to doctor + * M - length of model that traces are for + * ret_ndi - number of DI transitions doctored + * ret_nid - number of ID transitions doctored + * + * Return: (void) + * tr is modified + */ +static void +trace_doctor(struct p7trace_s *tr, int mlen, int *ret_ndi, int *ret_nid) +{ + int opos; /* position in old trace */ + int npos; /* position in new trace (<= opos) */ + int ndi, nid; /* number of DI, ID transitions doctored */ + + /* overwrite the trace from left to right */ + ndi = nid = 0; + opos = npos = 0; + while (opos < tr->tlen) { + /* fix implied D->I transitions; D transforms to M, I pulled in */ + if (tr->statetype[opos] == STD && tr->statetype[opos+1] == STI) { + tr->statetype[npos] = STM; + tr->nodeidx[npos] = tr->nodeidx[opos]; /* D transforms to M */ + tr->pos[npos] = tr->pos[opos+1]; /* insert char moves back */ + opos += 2; + npos += 1; + ndi++; + } /* fix implied I->D transitions; D transforms to M, I is pushed in */ + else if (tr->statetype[opos]== STI && tr->statetype[opos+1]== STD) { + tr->statetype[npos] = STM; + tr->nodeidx[npos] = tr->nodeidx[opos+1];/* D transforms to M */ + tr->pos[npos] = tr->pos[opos]; /* insert char moves up */ + opos += 2; + npos += 1; + nid++; + } /* fix implied B->I transitions; pull I back to its M */ + else if (tr->statetype[opos]== STI && tr->statetype[opos-1]== STB) { + tr->statetype[npos] = STM; + tr->nodeidx[npos] = tr->nodeidx[opos]; /* offending I transforms to M */ + tr->pos[npos] = tr->pos[opos]; + opos++; + npos++; + } /* fix implied I->E transitions; push I to next M */ + else if (tr->statetype[opos]== STI && tr->statetype[opos+1]== STE) { + tr->statetype[npos] = STM; + tr->nodeidx[npos] = tr->nodeidx[opos]+1;/* offending I transforms to M */ + tr->pos[npos] = tr->pos[opos]; + opos++; + npos++; + } /* rare: N-N-B-E becomes N-B-M_1-E (swap B,N) */ + else if (tr->statetype[opos]==STB && tr->statetype[opos+1]==STE + && tr->statetype[opos-1]==STN && tr->pos[opos-1] > 0) { + 
tr->statetype[npos] = STM; + tr->nodeidx[npos] = 1; + tr->pos[npos] = tr->pos[opos-1]; + tr->statetype[npos-1] = STB; + tr->nodeidx[npos-1] = 0; + tr->pos[npos-1] = 0; + opos++; + npos++; + } /* rare: B-E-C-C-x becomes B-M_M-E-C-x (swap E,C) */ + else if (tr->statetype[opos]==STE && tr->statetype[opos-1]==STB + && tr->statetype[opos+1]==STC + && tr->statetype[opos+2]==STC) { + tr->statetype[npos] = STM; + tr->nodeidx[npos] = mlen; + tr->pos[npos] = tr->pos[opos+2]; + tr->statetype[npos+1] = STE; + tr->nodeidx[npos+1] = 0; + tr->pos[npos+1] = 0; + tr->statetype[npos+2] = STC; /* first C must be a nonemitter */ + tr->nodeidx[npos+2] = 0; + tr->pos[npos+2] = 0; + opos+=3; + npos+=3; + } /* everything else is just copied */ + else { + tr->statetype[npos] = tr->statetype[opos]; + tr->nodeidx[npos] = tr->nodeidx[opos]; + tr->pos[npos] = tr->pos[opos]; + opos++; + npos++; + } + } + tr->tlen = npos; + + if (ret_ndi != NULL) *ret_ndi = ndi; + if (ret_nid != NULL) *ret_nid = nid; + return; +} + + +/* Function: annotate_model() + * + * Purpose: Add rf, cs optional annotation to a new model. + * + * Args: hmm - new model + * matassign - which alignment columns are MAT; [1..alen] + * msa - alignment, including annotation to transfer + * + * Return: (void) + */ +static void +annotate_model(struct plan7_s *hmm, int *matassign, MSA *msa) +{ + int apos; /* position in matassign, 1.alen */ + int k; /* position in model, 1.M */ + char *pri; /* X-PRM, X-PRI, X-PRT annotation */ + + /* Transfer reference coord annotation from alignment, + * if available + */ + if (msa->rf != NULL) { + hmm->rf[0] = ' '; + for (apos = k = 1; apos <= msa->alen; apos++) + if (matassign[apos] & ASSIGN_MATCH) /* ainfo is off by one from HMM */ + hmm->rf[k++] = (msa->rf[apos-1] == ' ') ? '.' 
: msa->rf[apos-1]; + hmm->rf[k] = '\0'; + hmm->flags |= PLAN7_RF; + } + + /* Transfer consensus structure annotation from alignment, + * if available + */ + if (msa->ss_cons != NULL) { + hmm->cs[0] = ' '; + for (apos = k = 1; apos <= msa->alen; apos++) + if (matassign[apos] & ASSIGN_MATCH) + hmm->cs[k++] = (msa->ss_cons[apos-1] == ' ') ? '.' : msa->ss_cons[apos-1]; + hmm->cs[k] = '\0'; + hmm->flags |= PLAN7_CS; + } + + /* Transfer surface accessibility annotation from alignment, + * if available + */ + if (msa->sa_cons != NULL) { + hmm->ca[0] = ' '; + for (apos = k = 1; apos <= msa->alen; apos++) + if (matassign[apos] & ASSIGN_MATCH) + hmm->ca[k++] = (msa->sa_cons[apos-1] == ' ') ? '.' : msa->sa_cons[apos-1]; + hmm->ca[k] = '\0'; + hmm->flags |= PLAN7_CA; + } + + /* Store the alignment map + */ + for (apos = k = 1; apos <= msa->alen; apos++) + if (matassign[apos] & ASSIGN_MATCH) + hmm->map[k++] = apos; + hmm->flags |= PLAN7_MAP; + + /* Translate and transfer X-PRM annotation. + * 0-9,[a-zA-Z] are legal; translate as 0-9,10-35 into hmm->mpri. + * Any other char is translated as -1, and this will be interpreted + * as a flag that means "unknown", e.g. use the normal mixture Dirichlet + * procedure for this column. 
+ */ + if ((pri = MSAGetGC(msa, "X-PRM")) != NULL) + { + hmm->mpri = MallocOrDie(sizeof(int) * (hmm->M+1)); + for (apos = k = 1; apos <= msa->alen; apos++) + if (matassign[apos] & ASSIGN_MATCH) + { + if (isdigit((int) pri[apos-1])) hmm->mpri[k] = pri[apos-1] - '0'; + else if (islower((int) pri[apos-1])) hmm->mpri[k] = pri[apos-1] - 'a' + 10; + else if (isupper((int) pri[apos-1])) hmm->mpri[k] = pri[apos-1] - 'A' + 10; + else hmm->mpri[k] = -1; + k++; + } + } + /* And again for X-PRI annotation on insert priors: + */ + if ((pri = MSAGetGC(msa, "X-PRI")) != NULL) + { + hmm->ipri = MallocOrDie(sizeof(int) * (hmm->M+1)); + for (apos = k = 1; apos <= msa->alen; apos++) + if (matassign[apos] & ASSIGN_MATCH) + { + if (isdigit((int) pri[apos-1])) hmm->ipri[k] = pri[apos-1] - '0'; + else if (islower((int) pri[apos-1])) hmm->ipri[k] = pri[apos-1] - 'a' + 10; + else if (isupper((int) pri[apos-1])) hmm->ipri[k] = pri[apos-1] - 'A' + 10; + else hmm->ipri[k] = -1; + k++; + } + } + /* And one last time for X-PRT annotation on transition priors: + */ + if ((pri = MSAGetGC(msa, "X-PRT")) != NULL) + { + hmm->tpri = MallocOrDie(sizeof(int) * (hmm->M+1)); + for (apos = k = 1; apos <= msa->alen; apos++) + if (matassign[apos] & ASSIGN_MATCH) + { + if (isdigit((int) pri[apos-1])) hmm->tpri[k] = pri[apos-1] - '0'; + else if (islower((int) pri[apos-1])) hmm->tpri[k] = pri[apos-1] - 'a' + 10; + else if (isupper((int) pri[apos-1])) hmm->tpri[k] = pri[apos-1] - 'A' + 10; + else hmm->tpri[k] = -1; + k++; + } + } + +} + +static void +print_matassign(int *matassign, int alen) +{ + int apos; + + for (apos = 0; apos <= alen; apos++) { + printf("%3d %c %c %c\n", + apos, + (matassign[apos] & ASSIGN_MATCH) ? 'x':' ', + (matassign[apos] & FIRST_MATCH || matassign[apos] & LAST_MATCH) ? '<' : ' ', + (matassign[apos] & EXTERNAL_INSERT_N || + matassign[apos] & EXTERNAL_INSERT_C) ? 
'|':' '); + } +} diff --git a/forester/archive/RIO/others/hmmer/src/plan7.c b/forester/archive/RIO/others/hmmer/src/plan7.c new file mode 100644 index 0000000..6f5eed1 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/plan7.c @@ -0,0 +1,1036 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + + +/* plan7.c + * SRE, Sat Nov 16 14:19:56 1996 + * + * Support for Plan 7 HMM data structure, plan7_s. + */ + +#include +#include +#include +#include + +#include "funcs.h" +#include "config.h" +#include "structs.h" +#include "squid.h" + +/* Functions: AllocPlan7(), AllocPlan7Shell(), AllocPlan7Body(), FreePlan7() + * + * Purpose: Allocate or free a Plan7 HMM structure. + * Can either allocate all at one (AllocPlan7()) or + * in two steps (AllocPlan7Shell(), AllocPlan7Body()). + * The two step method is used in hmmio.c where we start + * parsing the header of an HMM file but don't + * see the size of the model 'til partway thru the header. 
+ */ +struct plan7_s * +AllocPlan7(int M) +{ + struct plan7_s *hmm; + + hmm = AllocPlan7Shell(); + AllocPlan7Body(hmm, M); + return hmm; +} +struct plan7_s * +AllocPlan7Shell(void) +{ + struct plan7_s *hmm; + + hmm = (struct plan7_s *) MallocOrDie (sizeof(struct plan7_s)); + hmm->M = 0; + + hmm->name = NULL; + hmm->acc = NULL; + hmm->desc = NULL; + hmm->rf = NULL; + hmm->cs = NULL; + hmm->ca = NULL; + hmm->comlog = NULL; + hmm->nseq = 0; + hmm->ctime = NULL; + hmm->map = NULL; + hmm->checksum = 0; + + hmm->tpri = NULL; + hmm->mpri = NULL; + hmm->ipri = NULL; + + hmm->ga1 = hmm->ga2 = 0.0; + hmm->tc1 = hmm->tc2 = 0.0; + hmm->nc1 = hmm->nc2 = 0.0; + + hmm->t = NULL; + hmm->tsc = NULL; + hmm->mat = NULL; + hmm->ins = NULL; + hmm->msc = NULL; + hmm->isc = NULL; + + hmm->begin = NULL; + hmm->bsc = NULL; + hmm->end = NULL; + hmm->esc = NULL; + /* DNA translation is not enabled by default */ + hmm->dnam = NULL; + hmm->dnai = NULL; + hmm->dna2 = -INFTY; + hmm->dna4 = -INFTY; + /* statistical parameters set to innocuous empty values */ + hmm->mu = 0.; + hmm->lambda = 0.; + + hmm->flags = 0; + return hmm; +} +void +AllocPlan7Body(struct plan7_s *hmm, int M) +{ + int k, x; + + hmm->M = M; + + hmm->rf = MallocOrDie ((M+2) * sizeof(char)); + hmm->cs = MallocOrDie ((M+2) * sizeof(char)); + hmm->ca = MallocOrDie ((M+2) * sizeof(char)); + hmm->map = MallocOrDie ((M+1) * sizeof(int)); + + hmm->t = MallocOrDie (M * sizeof(float *)); + hmm->tsc = MallocOrDie (M * sizeof(int *)); + hmm->mat = MallocOrDie ((M+1) * sizeof(float *)); + hmm->ins = MallocOrDie (M * sizeof(float *)); + hmm->msc = MallocOrDie (MAXCODE * sizeof(int *)); + hmm->isc = MallocOrDie (MAXCODE * sizeof(int *)); + hmm->t[0] = MallocOrDie ((7*M) * sizeof(float)); + hmm->tsc[0] = MallocOrDie ((7*M) * sizeof(int)); + hmm->mat[0] = MallocOrDie ((MAXABET*(M+1)) * sizeof(float)); + hmm->ins[0] = MallocOrDie ((MAXABET*M) * sizeof(float)); + hmm->msc[0] = MallocOrDie ((MAXCODE*(M+1)) * sizeof(int)); + hmm->isc[0] = 
MallocOrDie ((MAXCODE*M) * sizeof(int)); + + /* note allocation strategy for important 2D arrays -- trying + * to keep locality as much as possible, cache efficiency etc. + */ + for (k = 1; k <= M; k++) { + hmm->mat[k] = hmm->mat[0] + k * MAXABET; + if (k < M) { + hmm->ins[k] = hmm->ins[0] + k * MAXABET; + hmm->t[k] = hmm->t[0] + k * 7; + hmm->tsc[k] = hmm->tsc[0] + k * 7; + } + } + for (x = 1; x < MAXCODE; x++) { + hmm->msc[x] = hmm->msc[0] + x * (M+1); + hmm->isc[x] = hmm->isc[0] + x * M; + } + /* tsc[0] is used as a boundary condition sometimes [Viterbi()], + * so set to -inf always. + */ + for (x = 0; x < 7; x++) + hmm->tsc[0][x] = -INFTY; + + hmm->begin = MallocOrDie ((M+1) * sizeof(float)); + hmm->bsc = MallocOrDie ((M+1) * sizeof(int)); + hmm->end = MallocOrDie ((M+1) * sizeof(float)); + hmm->esc = MallocOrDie ((M+1) * sizeof(int)); + + return; +} + + +void +FreePlan7(struct plan7_s *hmm) +{ + if (hmm->name != NULL) free(hmm->name); + if (hmm->desc != NULL) free(hmm->desc); + if (hmm->rf != NULL) free(hmm->rf); + if (hmm->cs != NULL) free(hmm->cs); + if (hmm->ca != NULL) free(hmm->ca); + if (hmm->comlog != NULL) free(hmm->comlog); + if (hmm->ctime != NULL) free(hmm->ctime); + if (hmm->map != NULL) free(hmm->map); + if (hmm->tpri != NULL) free(hmm->tpri); + if (hmm->mpri != NULL) free(hmm->mpri); + if (hmm->ipri != NULL) free(hmm->ipri); + if (hmm->bsc != NULL) free(hmm->bsc); + if (hmm->begin != NULL) free(hmm->begin); + if (hmm->esc != NULL) free(hmm->esc); + if (hmm->end != NULL) free(hmm->end); + if (hmm->msc != NULL) free(hmm->msc[0]); + if (hmm->mat != NULL) free(hmm->mat[0]); + if (hmm->isc != NULL) free(hmm->isc[0]); + if (hmm->ins != NULL) free(hmm->ins[0]); + if (hmm->tsc != NULL) free(hmm->tsc[0]); + if (hmm->t != NULL) free(hmm->t[0]); + if (hmm->msc != NULL) free(hmm->msc); + if (hmm->mat != NULL) free(hmm->mat); + if (hmm->isc != NULL) free(hmm->isc); + if (hmm->ins != NULL) free(hmm->ins); + if (hmm->tsc != NULL) free(hmm->tsc); + if (hmm->t != 
NULL) free(hmm->t); + if (hmm->dnam != NULL) free(hmm->dnam); + if (hmm->dnai != NULL) free(hmm->dnai); + free(hmm); +} + +/* Function: ZeroPlan7() + * + * Purpose: Zeros the counts/probabilities fields in a model. + * Leaves null model untouched. + */ +void +ZeroPlan7(struct plan7_s *hmm) +{ + int k; + for (k = 1; k < hmm->M; k++) + { + FSet(hmm->t[k], 7, 0.); + FSet(hmm->mat[k], Alphabet_size, 0.); + FSet(hmm->ins[k], Alphabet_size, 0.); + } + FSet(hmm->mat[hmm->M], Alphabet_size, 0.); + hmm->tbd1 = 0.; + FSet(hmm->begin+1, hmm->M, 0.); + FSet(hmm->end+1, hmm->M, 0.); + for (k = 0; k < 4; k++) + FSet(hmm->xt[k], 2, 0.); + hmm->flags &= ~PLAN7_HASBITS; /* invalidates scores */ + hmm->flags &= ~PLAN7_HASPROB; /* invalidates probabilities */ +} + + +/* Function: Plan7SetName() + * + * Purpose: Change the name of a Plan7 HMM. Convenience function. + * + * Note: Trailing whitespace and \n's are chopped. + */ +void +Plan7SetName(struct plan7_s *hmm, char *name) +{ + if (hmm->name != NULL) free(hmm->name); + hmm->name = Strdup(name); + StringChop(hmm->name); +} +/* Function: Plan7SetAccession() + * + * Purpose: Change the accession number of a Plan7 HMM. Convenience function. + * + * Note: Trailing whitespace and \n's are chopped. + */ +void +Plan7SetAccession(struct plan7_s *hmm, char *acc) +{ + if (hmm->acc != NULL) free(hmm->acc); + hmm->acc = Strdup(acc); + StringChop(hmm->acc); + hmm->flags |= PLAN7_ACC; +} + +/* Function: Plan7SetDescription() + * + * Purpose: Change the description line of a Plan7 HMM. Convenience function. + * + * Note: Trailing whitespace and \n's are chopped. + */ +void +Plan7SetDescription(struct plan7_s *hmm, char *desc) +{ + if (hmm->desc != NULL) free(hmm->desc); + hmm->desc = Strdup(desc); + StringChop(hmm->desc); + hmm->flags |= PLAN7_DESC; +} + +/* Function: Plan7ComlogAppend() + * Date: SRE, Wed Oct 29 09:57:30 1997 [TWA 721 over Greenland] + * + * Purpose: Concatenate command line options and append to the + * command line log. 
+ */ +void +Plan7ComlogAppend(struct plan7_s *hmm, int argc, char **argv) +{ + int len; + int i; + + /* figure out length of command line, w/ spaces and \n */ + len = argc; + for (i = 0; i < argc; i++) + len += strlen(argv[i]); + + /* allocate */ + if (hmm->comlog != NULL) + { + len += strlen(hmm->comlog); + hmm->comlog = ReallocOrDie(hmm->comlog, sizeof(char)* (len+1)); + } + else + { + hmm->comlog = MallocOrDie(sizeof(char)* (len+1)); + *(hmm->comlog) = '\0'; /* need this to make strcat work */ + } + + /* append */ + strcat(hmm->comlog, "\n"); + for (i = 0; i < argc; i++) + { + strcat(hmm->comlog, argv[i]); + if (i < argc-1) strcat(hmm->comlog, " "); + } +} + +/* Function: Plan7SetCtime() + * Date: SRE, Wed Oct 29 11:53:19 1997 [TWA 721 over the Atlantic] + * + * Purpose: Set the ctime field in a new HMM to the current time. + */ +void +Plan7SetCtime(struct plan7_s *hmm) +{ + time_t date = time(NULL); + if (hmm->ctime != NULL) free(hmm->ctime); + hmm->ctime = Strdup(ctime(&date)); + StringChop(hmm->ctime); +} + + +/* Function: Plan7SetNullModel() + * + * Purpose: Set the null model section of an HMM. + * Convenience function. + */ +void +Plan7SetNullModel(struct plan7_s *hmm, float null[MAXABET], float p1) +{ + int x; + for (x = 0; x < Alphabet_size; x++) + hmm->null[x] = null[x]; + hmm->p1 = p1; +} + + +/* Function: P7Logoddsify() + * + * Purpose: Take an HMM with valid probabilities, and + * fill in the integer log-odds score section of the model. + * + * Notes on log-odds scores: + * type of parameter probability score + * ----------------- ----------- ------ + * any emission p_x log_2 p_x/null_x + * N,J,C /assume/ p_x = null_x so /always/ score zero. + * transition to emitters t_x log_2 t_x/p1 + * (M,I; N,C; J) + * NN and CC loops are often equal to p1, so usu. score zero. + * C->T transition t_x log_2 t_x/p2 + * often zero, usu. C->T = p2. 
+ * all other transitions t_x log_2 t_x + * (no null model counterpart, so null prob is 1) + * + * Notes on entry/exit scores, B->M and M->E: + * The probability form model includes delete states 1 and M. + * these states are removed from a search form model to + * prevent B->D...D->E->J->B mute cycles, which would complicate + * dynamic programming algorithms. The data-independent + * S/W B->M and M->E transitions are folded together with + * data-dependent B->D...D->M and M->D...D->E paths. + * + * This process is referred to in the code as "wing folding" + * or "wing retraction"... the analogy is to a swept-wing + * fighter in landing vs. high speed flight configuration. + * + * Note on Viterbi vs. forward flag: + * Wing retraction must take forward vs. Viterbi + * into account. If forward, sum two paths; if Viterbi, take + * max. I tried to slide this by as a sum, without + * the flag, but Alex detected it as a bug, because you can + * then find cases where the Viterbi score doesn't match + * the P7TraceScore(). + * + * Args: hmm - the hmm to calculate scores in. + * viterbi_mode - TRUE to fold wings in Viterbi configuration. + * + * Return: (void) + * hmm scores are filled in. 
+ */ +void +P7Logoddsify(struct plan7_s *hmm, int viterbi_mode) +{ + int k; /* counter for model position */ + int x; /* counter for symbols */ + float accum; + float tbm, tme; + + if (hmm->flags & PLAN7_HASBITS) return; + + /* Symbol emission scores + */ + for (k = 1; k <= hmm->M; k++) + { + /* match/insert emissions in main model */ + for (x = 0; x < Alphabet_size; x++) + { + hmm->msc[x][k] = Prob2Score(hmm->mat[k][x], hmm->null[x]); + if (k < hmm->M) + hmm->isc[x][k] = Prob2Score(hmm->ins[k][x], hmm->null[x]); + } + /* degenerate match/insert emissions */ + for (x = Alphabet_size; x < Alphabet_iupac; x++) + { + hmm->msc[x][k] = DegenerateSymbolScore(hmm->mat[k], hmm->null, x); + if (k < hmm->M) + hmm->isc[x][k] = DegenerateSymbolScore(hmm->ins[k], hmm->null, x); + } + } + + /* State transitions. + * + * A note on "folding" of D_1 and D_M. + * These two delete states are folded out of search form models + * in order to prevent null cycles in the dynamic programming + * algorithms (see code below). However, we use their log transitions + * when we save the model! So the following log transition probs + * are used *only* in save files, *never* in search algorithms: + * log (tbd1), D1 -> M2, D1 -> D2 + * Mm-1 -> Dm, Dm-1 -> Dm + * + * In a search algorithm, these have to be interpreted as -INFTY + * because their contributions are folded into bsc[] and esc[] + * entry/exit scores. They can't be set to -INFTY here because + * we need them in save files. + */ + for (k = 1; k < hmm->M; k++) + { + hmm->tsc[k][TMM] = Prob2Score(hmm->t[k][TMM], hmm->p1); + hmm->tsc[k][TMI] = Prob2Score(hmm->t[k][TMI], hmm->p1); + hmm->tsc[k][TMD] = Prob2Score(hmm->t[k][TMD], 1.0); + hmm->tsc[k][TIM] = Prob2Score(hmm->t[k][TIM], hmm->p1); + hmm->tsc[k][TII] = Prob2Score(hmm->t[k][TII], hmm->p1); + hmm->tsc[k][TDM] = Prob2Score(hmm->t[k][TDM], hmm->p1); + hmm->tsc[k][TDD] = Prob2Score(hmm->t[k][TDD], 1.0); + } + + /* B->M entry transitions. Note how D_1 is folded out. 
+ * M1 is just B->M1 + * M2 is sum (or max) of B->M2 and B->D1->M2 + * M_k is sum (or max) of B->M_k and B->D1...D_k-1->M_k + * These have to be done in log space, else you'll get + * underflow errors; and we also have to watch for log(0). + * A little sloppier than it probably has to be; historically, + * doing in this in log space was in response to a bug report. + */ + accum = hmm->tbd1 > 0.0 ? log(hmm->tbd1) : -9999.; + for (k = 1; k <= hmm->M; k++) + { + tbm = hmm->begin[k] > 0. ? log(hmm->begin[k]) : -9999.; /* B->M_k part */ + + /* B->D1...D_k-1->M_k part we get from accum*/ + if (k > 1 && accum > -9999.) + { + if (hmm->t[k-1][TDM] > 0.0) + { + if (viterbi_mode) tbm = MAX(tbm, accum + log(hmm->t[k-1][TDM])); + else tbm = LogSum(tbm, accum + log(hmm->t[k-1][TDM])); + } + + accum = (hmm->t[k-1][TDD] > 0.0) ? accum + log(hmm->t[k-1][TDD]) : -9999.; + } + /* Convert from log_e to scaled integer log_2 odds. */ + if (tbm > -9999.) + hmm->bsc[k] = (int) floor(0.5 + INTSCALE * 1.44269504 * (tbm - log(hmm->p1))); + else + hmm->bsc[k] = -INFTY; + } + + /* M->E exit transitions. Note how D_M is folded out. + * M_M is 1 by definition + * M_M-1 is sum of M_M-1->E and M_M-1->D_M->E, where D_M->E is 1 by definition + * M_k is sum of M_k->E and M_k->D_k+1...D_M->E + * Must be done in log space to avoid underflow errors. + * A little sloppier than it probably has to be; historically, + * doing in this in log space was in response to a bug report. + */ + hmm->esc[hmm->M] = 0; + accum = 0.; + for (k = hmm->M-1; k >= 1; k--) + { + tme = hmm->end[k] > 0. ? log(hmm->end[k]) : -9999.; + if (accum > -9999.) + { + if (hmm->t[k][TMD] > 0.0) + { + if (viterbi_mode) tme = MAX(tme, accum + log(hmm->t[k][TMD])); + else tme = LogSum(tme, accum + log(hmm->t[k][TMD])); + } + accum = (hmm->t[k][TDD] > 0.0) ? accum + log(hmm->t[k][TDD]) : -9999.; + } + /* convert from log_e to scaled integer log odds. */ + hmm->esc[k] = (tme > -9999.) ? 
(int) floor(0.5 + INTSCALE * 1.44269504 * tme) : -INFTY; + } + + /* special transitions */ + hmm->xsc[XTN][LOOP] = Prob2Score(hmm->xt[XTN][LOOP], hmm->p1); + hmm->xsc[XTN][MOVE] = Prob2Score(hmm->xt[XTN][MOVE], 1.0); + hmm->xsc[XTE][LOOP] = Prob2Score(hmm->xt[XTE][LOOP], 1.0); + hmm->xsc[XTE][MOVE] = Prob2Score(hmm->xt[XTE][MOVE], 1.0); + hmm->xsc[XTC][LOOP] = Prob2Score(hmm->xt[XTC][LOOP], hmm->p1); + hmm->xsc[XTC][MOVE] = Prob2Score(hmm->xt[XTC][MOVE], 1.-hmm->p1); + hmm->xsc[XTJ][LOOP] = Prob2Score(hmm->xt[XTJ][LOOP], hmm->p1); + hmm->xsc[XTJ][MOVE] = Prob2Score(hmm->xt[XTJ][MOVE], 1.0); + + hmm->flags |= PLAN7_HASBITS; /* raise the log-odds ready flag */ +} + + + +/* Function: Plan7Renormalize() + * + * Purpose: Take an HMM in counts form, and renormalize + * all of its probability vectors. Also enforces + * Plan7 restrictions on nonexistent transitions. + * + * Args: hmm - the model to renormalize. + * + * Return: (void) + * hmm is changed. + */ +void +Plan7Renormalize(struct plan7_s *hmm) +{ + int k; /* counter for model position */ + int st; /* counter for special states */ + float d; /* denominator */ + + /* match emissions */ + for (k = 1; k <= hmm->M; k++) + FNorm(hmm->mat[k], Alphabet_size); + /* insert emissions */ + for (k = 1; k < hmm->M; k++) + FNorm(hmm->ins[k], Alphabet_size); + /* begin transitions */ + d = FSum(hmm->begin+1, hmm->M) + hmm->tbd1; + FScale(hmm->begin+1, hmm->M, 1./d); + hmm->tbd1 /= d; + /* main model transitions */ + for (k = 1; k < hmm->M; k++) + { + d = FSum(hmm->t[k], 3) + hmm->end[k]; + FScale(hmm->t[k], 3, 1./d); + hmm->end[k] /= d; + + FNorm(hmm->t[k]+3, 2); /* insert */ + FNorm(hmm->t[k]+5, 2); /* delete */ + } + /* null model emissions */ + FNorm(hmm->null, Alphabet_size); + /* special transitions */ + for (st = 0; st < 4; st++) + FNorm(hmm->xt[st], 2); + /* enforce nonexistent transitions */ + /* (is this necessary?) 
*/ + hmm->t[0][TDM] = hmm->t[0][TDD] = 0.0; + + hmm->flags &= ~PLAN7_HASBITS; /* clear the log-odds ready flag */ + hmm->flags |= PLAN7_HASPROB; /* set the probabilities OK flag */ +} + + +/* Function: Plan7RenormalizeExits() + * Date: SRE, Fri Aug 14 11:22:19 1998 [St. Louis] + * + * Purpose: Renormalize just the match state transitions; + * for instance, after a Config() function has + * modified the exit distribution. + * + * Args: hmm - hmm to renormalize + * + * Returns: void + */ +void +Plan7RenormalizeExits(struct plan7_s *hmm) +{ + int k; + float d; + + for (k = 1; k < hmm->M; k++) + { + d = FSum(hmm->t[k], 3); + FScale(hmm->t[k], 3, 1./(d + d*hmm->end[k])); + } +} + + +/***************************************************************** + * Plan7 configuration functions + * The following few functions are the Plan7 equivalent of choosing + * different alignment styles (fully local, fully global, global/local, + * multihit, etc.) + * + * There is (at least) one constraint worth noting. + * If you want per-domain scores to sum up to per-sequence scores, + * then one of the following two sets of conditions must be met: + * + * 1) t(E->J) = 0 + * e.g. no multidomain hits + * + * 2) t(N->N) = t(C->C) = t(J->J) = hmm->p1 + * e.g. unmatching sequence scores zero, and + * N->B first-model score is equal to J->B another-model score. + * + * These constraints are obeyed in the default Config() functions below, + * but in the future (when HMM editing may be allowed) we'll have + * to remember this. Non-equality of the summed domain scores and + * the total sequence score is a really easy "red flag" for people to + * notice and report as a bug, even if it may make probabilistic + * sense not to meet either constraint for certain modeling problems. 
+ ***************************************************************** + */ + +/* Function: Plan7NakedConfig() + * + * Purpose: Set the alignment-independent, algorithm-dependent parameters + * of a Plan7 model so that no special states (N,C,J) emit anything: + * one simple, full global pass through the model. + * + * Args: hmm - the plan7 model + * + * Return: (void) + * The HMM is modified; algorithm dependent parameters are set. + * Previous scores are invalidated if they existed. + */ +void +Plan7NakedConfig(struct plan7_s *hmm) +{ + hmm->xt[XTN][MOVE] = 1.; /* disallow N-terminal tail */ + hmm->xt[XTN][LOOP] = 0.; + hmm->xt[XTE][MOVE] = 1.; /* only 1 domain/sequence ("global" alignment) */ + hmm->xt[XTE][LOOP] = 0.; + hmm->xt[XTC][MOVE] = 1.; /* disallow C-terminal tail */ + hmm->xt[XTC][LOOP] = 0.; + hmm->xt[XTJ][MOVE] = 0.; /* J state unused */ + hmm->xt[XTJ][LOOP] = 1.; + FSet(hmm->begin+2, hmm->M-1, 0.); /* disallow internal entries. */ + hmm->begin[1] = 1. - hmm->tbd1; + FSet(hmm->end+1, hmm->M-1, 0.); /* disallow internal exits. */ + hmm->end[hmm->M] = 1.; + Plan7RenormalizeExits(hmm); + hmm->flags &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */ +} + +/* Function: Plan7GlobalConfig() + * + * Purpose: Set the alignment-independent, algorithm-dependent parameters + * of a Plan7 model to global (Needleman/Wunsch) configuration. + * + * Like a non-looping hmmls, since we actually allow flanking + * N and C terminal sequence. + * + * Args: hmm - the plan7 model + * + * Return: (void) + * The HMM is modified; algorithm dependent parameters are set. + * Previous scores are invalidated if they existed. + */ +void +Plan7GlobalConfig(struct plan7_s *hmm) +{ + hmm->xt[XTN][MOVE] = 1. - hmm->p1; /* allow N-terminal tail */ + hmm->xt[XTN][LOOP] = hmm->p1; + hmm->xt[XTE][MOVE] = 1.; /* only 1 domain/sequence ("global" alignment) */ + hmm->xt[XTE][LOOP] = 0.; + hmm->xt[XTC][MOVE] = 1. 
- hmm->p1; /* allow C-terminal tail */ + hmm->xt[XTC][LOOP] = hmm->p1; + hmm->xt[XTJ][MOVE] = 0.; /* J state unused */ + hmm->xt[XTJ][LOOP] = 1.; + FSet(hmm->begin+2, hmm->M-1, 0.); /* disallow internal entries. */ + hmm->begin[1] = 1. - hmm->tbd1; + FSet(hmm->end+1, hmm->M-1, 0.); /* disallow internal exits. */ + hmm->end[hmm->M] = 1.; + Plan7RenormalizeExits(hmm); + hmm->flags &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */ +} + +/* Function: Plan7LSConfig() + * + * Purpose: Set the alignment independent parameters of a Plan7 model + * to hmmls (global in HMM, local in sequence) configuration. + * + * Args: hmm - the plan7 model + * + * Return: (void); + * the HMM probabilities are modified. + */ +void +Plan7LSConfig(struct plan7_s *hmm) +{ + hmm->xt[XTN][MOVE] = 1.-hmm->p1; /* allow N-terminal tail */ + hmm->xt[XTN][LOOP] = hmm->p1; + hmm->xt[XTE][MOVE] = 0.5; /* expectation 2 domains/seq */ + hmm->xt[XTE][LOOP] = 0.5; + hmm->xt[XTC][MOVE] = 1.-hmm->p1; /* allow C-terminal tail */ + hmm->xt[XTC][LOOP] = hmm->p1; + hmm->xt[XTJ][MOVE] = 1.-hmm->p1; /* allow J junction state */ + hmm->xt[XTJ][LOOP] = hmm->p1; + FSet(hmm->begin+2, hmm->M-1, 0.); /* start at M1/D1 */ + hmm->begin[1] = 1. - hmm->tbd1; + FSet(hmm->end+1, hmm->M-1, 0.); /* end at M_m/D_m */ + hmm->end[hmm->M] = 1.; + Plan7RenormalizeExits(hmm); + hmm->flags &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */ +} + + +/* Function: Plan7SWConfig() + * + * Purpose: Set the alignment independent parameters of + * a Plan7 model to hmmsw (Smith/Waterman) configuration. + * + * Notes: entry/exit is asymmetric because of the left/right + * nature of the HMM/profile. Entry probability is distributed + * simply by assigning p_x = pentry / (M-1) to M-1 + * internal match states. However, the same approach doesn't + * lead to a flat distribution over exit points. Exit p's + * must be corrected for the probability of a previous exit + * from the model. 
Requiring a flat distribution over exit + * points leads to an easily solved piece of algebra, giving: + * p_1 = pexit / (M-1) + * p_x = p_1 / (1 - (x-1) p_1) + * + * Args: hmm - the Plan7 model w/ data-dep prob's valid + * pentry - probability of an internal entry somewhere; + * will be evenly distributed over M-1 match states + * pexit - probability of an internal exit somewhere; + * will be distributed over M-1 match states. + * + * Return: (void) + * HMM probabilities are modified. + */ +void +Plan7SWConfig(struct plan7_s *hmm, float pentry, float pexit) +{ + float basep; /* p1 for exits: the base p */ + int k; /* counter over states */ + + /* Configure special states. + */ + hmm->xt[XTN][MOVE] = 1-hmm->p1; /* allow N-terminal tail */ + hmm->xt[XTN][LOOP] = hmm->p1; + hmm->xt[XTE][MOVE] = 1.; /* disallow jump state */ + hmm->xt[XTE][LOOP] = 0.; + hmm->xt[XTC][MOVE] = 1-hmm->p1; /* allow C-terminal tail */ + hmm->xt[XTC][LOOP] = hmm->p1; + hmm->xt[XTJ][MOVE] = 1.; /* J is unused */ + hmm->xt[XTJ][LOOP] = 0.; + + /* Configure entry. + */ + hmm->begin[1] = (1. - pentry) * (1. - hmm->tbd1); + FSet(hmm->begin+2, hmm->M-1, (pentry * (1.- hmm->tbd1)) / (float)(hmm->M-1)); + + /* Configure exit. + */ + hmm->end[hmm->M] = 1.0; + basep = pexit / (float) (hmm->M-1); + for (k = 1; k < hmm->M; k++) + hmm->end[k] = basep / (1. - basep * (float) (k-1)); + Plan7RenormalizeExits(hmm); + hmm->flags &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */ +} + +/* Function: Plan7FSConfig() + * Date: SRE, Fri Jan 2 15:34:40 1998 [StL] + * + * Purpose: Set the alignment independent parameters of + * a Plan7 model to hmmfs (multihit Smith/Waterman) configuration. + * + * See comments on Plan7SWConfig() for explanation of + * how pentry and pexit are used. 
+ * + * Args: hmm - the Plan7 model w/ data-dep prob's valid + * pentry - probability of an internal entry somewhere; + * will be evenly distributed over M-1 match states + * pexit - probability of an internal exit somewhere; + * will be distributed over M-1 match states. + * + * Return: (void) + * HMM probabilities are modified. + */ +void +Plan7FSConfig(struct plan7_s *hmm, float pentry, float pexit) +{ + float basep; /* p1 for exits: the base p */ + int k; /* counter over states */ + + /* Configure special states. + */ + hmm->xt[XTN][MOVE] = 1-hmm->p1; /* allow N-terminal tail */ + hmm->xt[XTN][LOOP] = hmm->p1; + hmm->xt[XTE][MOVE] = 0.5; /* allow loops / multihits */ + hmm->xt[XTE][LOOP] = 0.5; + hmm->xt[XTC][MOVE] = 1-hmm->p1; /* allow C-terminal tail */ + hmm->xt[XTC][LOOP] = hmm->p1; + hmm->xt[XTJ][MOVE] = 1.-hmm->p1; /* allow J junction between domains */ + hmm->xt[XTJ][LOOP] = hmm->p1; + + /* Configure entry. + */ + hmm->begin[1] = (1. - pentry) * (1. - hmm->tbd1); + FSet(hmm->begin+2, hmm->M-1, (pentry * (1.-hmm->tbd1)) / (float)(hmm->M-1)); + + /* Configure exit. + */ + hmm->end[hmm->M] = 1.0; + basep = pexit / (float) (hmm->M-1); + for (k = 1; k < hmm->M; k++) + hmm->end[k] = basep / (1. - basep * (float) (k-1)); + Plan7RenormalizeExits(hmm); + hmm->flags &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */ +} + + + + +/* Function: Plan7ESTConfig() + * + * Purpose: Configure a Plan7 model for EST Smith/Waterman + * analysis. + * + * OUTDATED; DO NOT USE WITHOUT RECHECKING + * + * Args: hmm - hmm to configure. + * aacode - 0..63 vector mapping genetic code to amino acids + * estmodel - 20x64 translation matrix, w/ codon bias and substitution error + * dna2 - probability of a -1 frameshift in a triplet + * dna4 - probability of a +1 frameshift in a triplet + */ +void +Plan7ESTConfig(struct plan7_s *hmm, int *aacode, float **estmodel, + float dna2, float dna4) +{ + int k; + int x; + float p; + float *tripnull; /* UNFINISHED!!! 
*/ + + /* configure specials */ + hmm->xt[XTN][MOVE] = 1./351.; + hmm->xt[XTN][LOOP] = 350./351.; + hmm->xt[XTE][MOVE] = 1.; + hmm->xt[XTE][LOOP] = 0.; + hmm->xt[XTC][MOVE] = 1./351.; + hmm->xt[XTC][LOOP] = 350./351.; + hmm->xt[XTJ][MOVE] = 1.; + hmm->xt[XTJ][LOOP] = 0.; + /* configure entry/exit */ + hmm->begin[1] = 0.5; + FSet(hmm->begin+2, hmm->M-1, 0.5 / ((float)hmm->M - 1.)); + hmm->end[hmm->M] = 1.; + FSet(hmm->end, hmm->M-1, 0.5 / ((float)hmm->M - 1.)); + + /* configure dna triplet/frameshift emissions */ + for (k = 1; k <= hmm->M; k++) + { + /* translate aa to triplet probabilities */ + for (x = 0; x < 64; x++) { + p = hmm->mat[k][aacode[x]] * estmodel[aacode[x]][x] * (1.-dna2-dna4); + hmm->dnam[x][k] = Prob2Score(p, tripnull[x]); + + p = hmm->ins[k][aacode[x]] * estmodel[aacode[x]][x] * (1.-dna2-dna4); + hmm->dnai[x][k] = Prob2Score(p, tripnull[x]); + } + hmm->dnam[64][k] = 0; /* ambiguous codons score 0 (danger?) */ + hmm->dna2 = Prob2Score(dna2, 1.); + hmm->dna4 = Prob2Score(dna4, 1.); + } +} + +/* Function: PrintPlan7Stats() + * + * Purpose: Given a newly constructed HMM and the tracebacks + * of the sequences it was trained on, print out all + * the interesting information at the end of hmmb + * and hmmt runs that convinces the user we actually + * did something. + * + * Args: fp - where to send the output (stdout, usually) + * hmm - the new HMM, probability form + * dsq - digitized training seqs + * nseq - number of dsq's + * tr - array of tracebacks for dsq + * + * Return: (void) + */ +void +PrintPlan7Stats(FILE *fp, struct plan7_s *hmm, char **dsq, int nseq, + struct p7trace_s **tr) +{ + int idx; /* counter for sequences */ + float score; /* an individual trace score */ + float total, best, worst; /* for the avg. and range of the scores */ + float sqsum, stddev; /* for the std. 
deviation of the scores */ + + P7Logoddsify(hmm, TRUE); /* make sure model scores are ready */ + + /* find individual trace scores */ + score = P7TraceScore(hmm, dsq[0], tr[0]); + total = best = worst = score; + sqsum = score * score; + for (idx = 1; idx < nseq; idx++) { + /* P7PrintTrace(stdout, tr[idx], hmm, dsq[idx]); */ + score = P7TraceScore(hmm, dsq[idx], tr[idx]); + total += score; + sqsum += score * score; + if (score > best) best = score; + if (score < worst) worst = score; + } + if (nseq > 1) { + stddev = (sqsum - (total * total / (float) nseq)) / ((float) nseq - 1.); + stddev = (stddev > 0) ? sqrt(stddev) : 0.0; + } else stddev = 0.0; + /* print out stuff. */ + fprintf(fp, "Average score: %10.2f bits\n", total / (float) nseq); + fprintf(fp, "Minimum score: %10.2f bits\n", worst); + fprintf(fp, "Maximum score: %10.2f bits\n", best); + fprintf(fp, "Std. deviation: %10.2f bits\n", stddev); +} + +/* Function: DegenerateSymbolScore() + * + * Purpose: Given a sequence character x and an hmm emission probability + * vector, calculate the log-odds (base 2) score of + * the symbol. + * + * Easy if x is in the emission alphabet, but not so easy + * is x is a degenerate symbol. The "correct" Bayesian + * philosophy is to calculate score(X) by summing over + * p(x) for all x in the degenerate symbol X to get P(X), + * doing the same sum over the prior to get F(X), and + * doing log_2 (P(X)/F(X)). This gives an X a zero score, + * for instance. + * + * Though this is correct in a formal Bayesian sense -- + * we have no information on the sequence, so we can't + * say if it's random or model, so it scores zero -- + * it sucks, big time, for scoring biological sequences. + * Sequences with lots of X's score near zero, while + * real sequences have average scores that are negative -- + * so the X-laden sequences appear to be lifted out + * of the noise of a full histogram of a database search. + * Correct or not, this is highly undesirable. 
+ * + * So therefore we calculated the expected score of + * the degenerate symbol by summing over all x in X: + * e_x log_2 (p(x)/f(x)) + * where the expectation of x, e_x, is calculated from + * the random model. + * + * Empirically, this works; it also has a wooly hand-waving + * probabilistic justification that I'm happy enough about. + * + * Args: p - probabilities of normal symbols + * null - null emission model + * ambig - index of the degenerate character in Alphabet[] + * + * Return: the integer log odds score of x given the emission + * vector and the null model, scaled up by INTSCALE. + */ +int +DegenerateSymbolScore(float *p, float *null, int ambig) +{ + int x; + float numer = 0.; + float denom = 0.; + + for (x = 0; x < Alphabet_size; x++) { + if (Degenerate[ambig][x]) { + numer += null[x] * sreLOG2(p[x] / null[x]); + denom += null[x]; + } + } + return (int) (INTSCALE * numer / denom); +} + +/***************************************************************** + * + * Plan9/Plan7 interface + * + * Very important code during the evolutionary takeover by Plan7 -- + * convert between Krogh/Haussler and Plan7 models. + *****************************************************************/ + +/* Function: Plan9toPlan7() + * + * Purpose: Convert an old HMM into Plan7. Configures it in + * ls mode. + * + * Args: hmm - old ugly plan9 style HMM + * ret_plan7 - new wonderful Plan7 HMM + * + * Return: (void) + * Plan7 HMM is allocated here. Free w/ FreePlan7(). 
+ */ +void +Plan9toPlan7(struct plan9_s *hmm, struct plan7_s **ret_plan7) +{ + struct plan7_s *plan7; + int k, x; + + plan7 = AllocPlan7(hmm->M); + + for (k = 1; k < hmm->M; k++) + { + plan7->t[k][TMM] = hmm->mat[k].t[MATCH]; + plan7->t[k][TMD] = hmm->mat[k].t[DELETE]; + plan7->t[k][TMI] = hmm->mat[k].t[INSERT]; + plan7->t[k][TDM] = hmm->del[k].t[MATCH]; + plan7->t[k][TDD] = hmm->del[k].t[DELETE]; + plan7->t[k][TIM] = hmm->ins[k].t[MATCH]; + plan7->t[k][TII] = hmm->ins[k].t[INSERT]; + } + + for (k = 1; k <= hmm->M; k++) + for (x = 0; x < Alphabet_size; x++) + plan7->mat[k][x] = hmm->mat[k].p[x]; + + for (k = 1; k < hmm->M; k++) + for (x = 0; x < Alphabet_size; x++) + plan7->ins[k][x] = hmm->ins[k].p[x]; + + plan7->tbd1 = hmm->mat[0].t[DELETE] / (hmm->mat[0].t[DELETE] + hmm->mat[0].t[MATCH]); + + /* We have to make up the null transition p1; use default */ + P7DefaultNullModel(plan7->null, &(plan7->p1)); + for (x = 0; x < Alphabet_size; x++) + plan7->null[x] = hmm->null[x]; + + if (hmm->name != NULL) + Plan7SetName(plan7, hmm->name); + if (hmm->flags & HMM_REF) { + strcpy(plan7->rf, hmm->ref); + plan7->flags |= PLAN7_RF; + } + if (hmm->flags & HMM_CS) { + strcpy(plan7->cs, hmm->cs); + plan7->flags |= PLAN7_CS; + } + + Plan7LSConfig(plan7); /* configure specials for ls-style alignment */ + Plan7Renormalize(plan7); /* mainly to correct for missing ID and DI */ + plan7->flags |= PLAN7_HASPROB; /* probabilities are valid */ + plan7->flags &= ~PLAN7_HASBITS; /* scores are not valid */ + *ret_plan7 = plan7; +} + + diff --git a/forester/archive/RIO/others/hmmer/src/plan9.c b/forester/archive/RIO/others/hmmer/src/plan9.c new file mode 100644 index 0000000..f7bf7bb --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/plan9.c @@ -0,0 +1,141 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This 
source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* plan9.c + * SRE, Wed Apr 8 07:35:30 1998 + * + * alloc, free, and initialization of old Plan9 (HMMER 1.x) functions. + * Rescued from the wreckage of HMMER 1.9m code. + */ + +#include +#include +#include +#include +#include "squid.h" +#include "config.h" +#include "structs.h" +#include "funcs.h" + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + + +struct plan9_s * +P9AllocHMM(int M) /* length of model to make */ +{ + struct plan9_s *hmm; /* RETURN: blank HMM */ + + hmm = (struct plan9_s *) MallocOrDie (sizeof(struct plan9_s)); + hmm->ins = (struct basic_state *) MallocOrDie (sizeof(struct basic_state) * (M+2)); + hmm->del = (struct basic_state *) MallocOrDie (sizeof(struct basic_state) * (M+2)); + hmm->mat = (struct basic_state *) MallocOrDie (sizeof(struct basic_state) * (M+2)); + hmm->ref = (char *) MallocOrDie ((M+2) * sizeof(char)); + hmm->cs = (char *) MallocOrDie ((M+2) * sizeof(char)); + hmm->xray = (float *) MallocOrDie ((M+2) * sizeof(float) * NINPUTS); + hmm->M = M; + hmm->name = Strdup("unnamed"); /* name is not optional. */ + + hmm->flags = 0; + P9ZeroHMM(hmm); + return hmm; +} +int +P9FreeHMM(struct plan9_s *hmm) +{ + if (hmm == NULL) return 0; + free(hmm->ref); + free(hmm->cs); + free(hmm->xray); + free(hmm->name); + if (hmm->mat != NULL) free (hmm->mat); + if (hmm->ins != NULL) free (hmm->ins); + if (hmm->del != NULL) free (hmm->del); + free(hmm); + return 1; +} + + +/* Function: P9ZeroHMM() + * + * Purpose: Zero emission and transition counts in an HMM. 
+ */ +void +P9ZeroHMM(struct plan9_s *hmm) +{ + int k, ts, idx; + + for (k = 0; k <= hmm->M+1; k++) + { + for (ts = 0; ts < 3; ts++) + { + hmm->mat[k].t[ts] = 0.0; + hmm->ins[k].t[ts] = 0.0; + hmm->del[k].t[ts] = 0.0; + } + for (idx = 0; idx < Alphabet_size; idx++) + { + hmm->mat[k].p[idx] = 0.0; + hmm->ins[k].p[idx] = 0.0; + hmm->del[k].p[idx] = 0.0; + } + } +} + + + + + +/* Function: P9Renormalize() + * + * Normalize all P distributions so they sum to 1. + * P distributions that are all 0, or contain negative + * probabilities, are left untouched. + * + * Returns 1 on success, or 0 on failure. + */ +void +P9Renormalize(struct plan9_s *hmm) +{ + int k; /* counter for states */ + + for (k = 0; k <= hmm->M ; k++) + { + /* match state transition frequencies */ + FNorm(hmm->mat[k].t, 3); + FNorm(hmm->ins[k].t, 3); + if (k > 0) FNorm(hmm->del[k].t, 3); + + if (k > 0) FNorm(hmm->mat[k].p, Alphabet_size); + FNorm(hmm->ins[k].p, Alphabet_size); + } +} + +/* Function: P9DefaultNullModel() + * + * Purpose: Set up a default random sequence model, using + * global aafq[]'s for protein or 0.25 for nucleic + * acid. randomseq is alloc'ed in caller. Alphabet information + * must already be known. 
+ */ +void +P9DefaultNullModel(float *null) +{ + int x; + if (Alphabet_type == hmmAMINO) + for (x = 0; x < Alphabet_size; x++) + null[x] = aafq[x]; + else if (Alphabet_type == hmmNUCLEIC) + for (x = 0; x < Alphabet_size; x++) + null[x] = 0.25; + else + Die("No support for non-protein, non-nucleic acid alphabets."); +} diff --git a/forester/archive/RIO/others/hmmer/src/postprob.c b/forester/archive/RIO/others/hmmer/src/postprob.c new file mode 100644 index 0000000..e57b1fc --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/postprob.c @@ -0,0 +1,709 @@ +/************************************************************ + * Copyright (C) 1998 Ian Holmes (ihh@sanger.ac.uk) + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* postprob.c + * Author: Ian Holmes (ihh@sanger.ac.uk, Jun 5 1998) + * Derived from core_algorithms.c (SRE, Nov 11 1996) + * Incorporated SRE, Sat Nov 6 09:07:12 1999 [Cold Spring Harbor] + * + * RCS $Id: postprob.c,v 1.1.1.1 2005/03/22 08:34:15 cmzmasek Exp $ + ***************************************************************** + * IHH's notes: + * + * Functions for working with posterior probabilities, + * including unfussed "backwards" and "optimal accuracy" + * implementations. 
+ ***************************************************************** + * SRE's notes: + * + * Simple API example: + * struct p7trace_s *tr; + * struct dpmatrix_s *fwd; + * struct dpmatrix_s *bck; + * struct dpmatrix_s *posterior; + * char *postcode; + * + * (get a traceback from somewhere: P7Viterbi() or a modelmaker) + * (get an HMM from somewhere: read file or construct it) + * P7Forward (dsq, len, hmm, &fwd); + * P7Backward(dsq, len, hmm, &bck); + * posterior = bck; -- can alloc posterior, but also can re-use bck -- + * P7EmitterPosterior(len, hmm, fwd, bck, posterior); + * postcode = PostalCode(len, posterior, tr); + * + * MSAAppendGR(msa, "POST", seqidx, postcode); -- or a similar annotation call -- + * + * free(postcode); + * FreePlan7Matrix(fwd); + * FreePlan7Matrix(bck); + * + * P7OptimalAccuracy() - the Durbin/Holmes optimal accuracy + * alignment algorithm. Takes a sequence + * and an HMM, returns an alignment as + * a trace structure. + * + * P7Backward() - The Backward() algorithm, counterpart + * of P7Forward() in core_algorithms.c. + * + * P7EmitterPosterior()- The heart of postprob.c: given a Forward + * and a Backward matrix, calculate a new matrix + * that contains the posterior probabilities + * for each symbol i being emitted by + * state k (so, \sum_k p(k | x_i) = 1.0). + * + * P7FillOptimalAccuracy() - The core DP algorithm called by + * P7OptimalAccuracy(). + * + * P7OptimalAccuracyTrace() - the traceback algorithm called by + * P7FillOptimalAccuracy(). + * + * PostalCode() - Create a character string for annotating + * an alignment. + * + * No small memory variants of these algorithms are available + * right now. + */ + +#include "structs.h" +#include "config.h" +#include "funcs.h" +#include "squid.h" + + +/* Function: P7OptimalAccuracy() + * + * Purpose: The optimal accuracy dynamic programming algorithm. + * Identical to Viterbi() except that posterior residue + * label probabilities are used as scores. 
+ * + * Args: dsq - sequence in digitized form + * L - length of dsq + * hmm - the model + * ret_tr - RETURN: traceback; pass NULL if it's not wanted + * + * Return: log ( sum_{residues} P(label|M,D) ), as a bit score + * (i.e. log of expected accuracy) + */ +float +P7OptimalAccuracy(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr) +{ + double sc; + struct dpmatrix_s *forward; + struct dpmatrix_s *backward; + + (void) P7Forward(dsq, L, hmm, &forward); + (void) P7Backward(dsq, L, hmm, &backward); + + P7EmitterPosterior(L, hmm, forward, backward, backward); /* Re-use backward matrix for posterior scores */ + + sc = P7FillOptimalAccuracy(L, hmm->M, backward, forward, ret_tr); /* Re-use forward matrix for optimal accuracy scores */ + + FreePlan7Matrix(forward); + FreePlan7Matrix(backward); + + return sc; +} + + + +/* Function: P7Backward() + * + * Purpose: The Backward dynamic programming algorithm. + * The scaling issue is dealt with by working in log space + * and calling ILogsum(); this is a slow but robust approach. + * + * Args: dsq - sequence in digitized form + * L - length of dsq + * hmm - the model + * ret_mx - RETURN: dp matrix; pass NULL if it's not wanted + * + * Return: log P(S|M)/P(S|R), as a bit score. + */ +float +P7Backward(char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s **ret_mx) +{ + struct dpmatrix_s *mx; + int **xmx; + int **mmx; + int **imx; + int **dmx; + int i,k; + int sc; + + /* Allocate a DP matrix with 0..L rows, 0..M-1 columns. + */ + mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx); + + /* Initialization of the L row. + * Note that xmx[i][stS] = xmx[i][stN] by definition for all i, + * so stS need not be calculated in backward DP matrices. 
+ */ + xmx[L][XMC] = hmm->xsc[XTC][MOVE]; /* C<-T */ + xmx[L][XME] = xmx[L][XMC] + hmm->xsc[XTE][MOVE]; /* E<-C, no C-tail */ + xmx[L][XMJ] = xmx[L][XMB] = xmx[L][XMN] = -INFTY; /* need seq to get out from here */ + for (k = hmm->M; k >= 1; k--) { + mmx[L][k] = xmx[L][XME] + hmm->esc[k]; /* M<-E ... */ + mmx[L][k] += hmm->msc[(int) dsq[L]][k]; /* ... + emitted match symbol */ + imx[L][k] = dmx[L][k] = -INFTY; /* need seq to get out from here */ + } + + /* Recursion. Done as a pull. + * Note slightly wasteful boundary conditions: + * M_M precalculated, D_M set to -INFTY, + * D_1 wastefully calculated. + * Scores for transitions to D_M also have to be hacked to -INFTY, + * as Plan7Logoddsify does not do this for us (I think? - ihh). + */ + hmm->tsc[hmm->M-1][TDD] = hmm->tsc[hmm->M-1][TMD] = -INFTY; /* no D_M state -- HACK -- should be in Plan7Logoddsify */ + for (i = L-1; i >= 0; i--) + { + /* Do the special states first. + * remember, C, N and J emissions are zero score by definition + */ + xmx[i][XMC] = xmx[i+1][XMC] + hmm->xsc[XTC][LOOP]; + + xmx[i][XMB] = -INFTY; + /* The following section has been hacked to fit a bug in core_algorithms.c + * The "correct" code is: + * for (k = hmm->M; k >= 1; k--) + * xmx[i][XMB] = ILogsum(xmx[i][XMB], mmx[i+1][k] + hmm->bsc[k]; + * + * The following code gives the same results as core_algorithms.c: + */ + xmx[i][XMB] = ILogsum(xmx[i][XMB], mmx[i+1][hmm->M] + hmm->bsc[hmm->M-1]); + for (k = hmm->M-1; k >= 1; k--) + xmx[i][XMB] = ILogsum(xmx[i][XMB], mmx[i+1][k] + hmm->bsc[k]); + + xmx[i][XMJ] = ILogsum(xmx[i][XMB] + hmm->xsc[XTJ][MOVE], + xmx[i+1][XMJ] + hmm->xsc[XTJ][LOOP]); + + xmx[i][XME] = ILogsum(xmx[i][XMC] + hmm->xsc[XTE][MOVE], + xmx[i][XMJ] + hmm->xsc[XTE][LOOP]); + + xmx[i][XMN] = ILogsum(xmx[i][XMB] + hmm->xsc[XTN][MOVE], + xmx[i+1][XMN] + hmm->xsc[XTN][LOOP]); + + /* Now the main states. Note the boundary conditions at M. 
+ */ + + if (i>0) { + mmx[i][hmm->M] = xmx[i][XME] + hmm->esc[hmm->M] + hmm->msc[(int) dsq[i]][hmm->M]; + dmx[i][hmm->M] = -INFTY; + for (k = hmm->M-1; k >= 1; k--) + { + mmx[i][k] = ILogsum(ILogsum(xmx[i][XME] + hmm->esc[k], + mmx[i+1][k+1] + hmm->tsc[k][TMM]), + ILogsum(imx[i+1][k] + hmm->tsc[k][TMI], + dmx[i][k+1] + hmm->tsc[k][TMD])); + mmx[i][k] += hmm->msc[(int) dsq[i]][k]; + + imx[i][k] = ILogsum(imx[i+1][k] + hmm->tsc[k][TII], + mmx[i+1][k+1] + hmm->tsc[k][TIM]); + imx[i][k] += hmm->isc[(int) dsq[i]][k]; + + dmx[i][k] = ILogsum(dmx[i][k+1] + hmm->tsc[k][TDD], + mmx[i+1][k+1] + hmm->tsc[k][TDM]); + + } + } + + } + + sc = xmx[0][XMN]; + + if (ret_mx != NULL) *ret_mx = mx; + else FreePlan7Matrix(mx); + + return Scorify(sc); /* the total Backward score. */ +} + + +/* Function: P7EmitterPosterior() + * + * Purpose: Combines Forward and Backward matrices into a posterior + * probability matrix. + * The entries in row i of this matrix are the logs of the + * posterior probabilities of each state emitting symbol i of + * the sequence, i.e. all entries for non-emitting states are -INFTY. + * The caller must allocate space for the matrix, although the + * backward matrix can be used instead (overwriting it will not + * compromise the algorithm). 
+ * + * Args: L - length of sequence + * hmm - the model + * forward - pre-calculated forward matrix + * backward - pre-calculated backward matrix + * mx - pre-allocated dynamic programming matrix + * + * Return: void + */ +void +P7EmitterPosterior(int L, + struct plan7_s *hmm, + struct dpmatrix_s *forward, + struct dpmatrix_s *backward, + struct dpmatrix_s *mx) +{ + int i; + int k; + int sc; + + sc = backward->xmx[0][XMN]; + + for (i = L; i >= 1; i--) + { + mx->xmx[i][XMC] = forward->xmx[i-1][XMC] + hmm->xsc[XTC][LOOP] + backward->xmx[i][XMC] - sc; + + mx->xmx[i][XMJ] = forward->xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP] + backward->xmx[i][XMJ] - sc; + + mx->xmx[i][XMN] = forward->xmx[i-1][XMN] + hmm->xsc[XTN][LOOP] + backward->xmx[i][XMN] - sc; + + mx->xmx[i][XMB] = mx->xmx[i][XME] = -INFTY; + + for (k = 1; k < hmm->M; k++) { + mx->mmx[i][k] = backward->mmx[i][k]; + mx->mmx[i][k] += ILogsum(ILogsum(forward->mmx[i-1][k-1] + hmm->tsc[k-1][TMM], + forward->imx[i-1][k-1] + hmm->tsc[k-1][TIM]), + ILogsum(forward->xmx[i-1][XMB] + hmm->bsc[k], + forward->dmx[i-1][k-1] + hmm->tsc[k-1][TDM])); + mx->mmx[i][k] -= sc; + + mx->imx[i][k] = backward->imx[i][k]; + mx->imx[i][k] += ILogsum(forward->mmx[i-1][k] + hmm->tsc[k][TMI], + forward->imx[i-1][k] + hmm->tsc[k][TII]); + mx->imx[i][k] -= sc; + + mx->dmx[i][k] = -INFTY; + } + mx->mmx[i][hmm->M] = backward->mmx[i][hmm->M]; + mx->mmx[i][hmm->M] += ILogsum(ILogsum(forward->mmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TMM], + forward->imx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TIM]), + ILogsum(forward->xmx[i-1][XMB] + hmm->bsc[hmm->M], + forward->dmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TDM])); + mx->mmx[i][hmm->M] -= sc; + + mx->imx[i][hmm->M] = mx->dmx[i][hmm->M] = mx->dmx[i][0] = -INFTY; + + } +} + + +/* Function: P7FillOptimalAccuracy() + * + * Purpose: The core of the optimal accuracy dynamic programming algorithm. + * Identical to Viterbi() except that scores are given by a + * posterior matrix (that the caller must pre-calculate). 
+ * Also, the caller must pre-allocate the optimal accuracy matrix + * (this allows the forward matrix to be re-used). + * P7OptimalAccuracy() does all this for you and cleans up. + * + * + * Args: L - length of sequence + * M - length of model + * posterior - pre-calculated emitter posterior matrix + * mx - pre-allocated dynamic programming matrix + * ret_tr - RETURN: traceback; pass NULL if it's not wanted + * + * Return: log ( sum_{residues} P(label|M,D) ), as a bit score + * (i.e. log of expected accuracy) + */ +float P7FillOptimalAccuracy(int L, + int M, + struct dpmatrix_s *posterior, + struct dpmatrix_s *mx, + struct p7trace_s **ret_tr) +{ + struct p7trace_s *tr; + int **xmx; + int **mmx; + int **imx; + int **dmx; + int i,k; + int sc; + + xmx = mx->xmx; + mmx = mx->mmx; + imx = mx->imx; + dmx = mx->dmx; + + /* Initialization of the zero row. + * Each cell in the optimal accuracy matrix holds the log of the expected + * of correctly assigned symbols up to that point. + * To begin with, everything is log(0) = -INFTY. + */ + xmx[0][XMN] = xmx[0][XMB] = xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; + for (k = 0; k <= M; k++) + mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; + + /* Recursion. Done as a pull. 
+ * Note some slightly wasteful boundary conditions: + * D_M and I_M are wastefully calculated (they don't exist) + */ + for (i = 1; i <= L; i++) + { + mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY; + + for (k = 1; k <= M; k++) + { + /* match state */ + mmx[i][k] = -INFTY; + if ((sc = mmx[i-1][k-1]) > mmx[i][k]) + mmx[i][k] = sc; + if ((sc = imx[i-1][k-1]) > mmx[i][k]) + mmx[i][k] = sc; + if ((sc = dmx[i-1][k-1]) > mmx[i][k]) + mmx[i][k] = sc; + if ((sc = xmx[i-1][XMB]) > mmx[i][k]) + mmx[i][k] = sc; + mmx[i][k] = ILogsum(mmx[i][k], posterior->mmx[i][k]); + + /* delete state */ + dmx[i][k] = -INFTY; + if ((sc = mmx[i][k-1]) > dmx[i][k]) + dmx[i][k] = sc; + if ((sc = dmx[i][k-1]) > dmx[i][k]) + dmx[i][k] = sc; + + /* insert state */ + imx[i][k] = -INFTY; + if ((sc = mmx[i-1][k]) > imx[i][k]) + imx[i][k] = sc; + if ((sc = imx[i-1][k]) > imx[i][k]) + imx[i][k] = sc; + imx[i][k] = ILogsum(imx[i][k], posterior->imx[i][k]); + } + + /* Now the special states. Order is important here. + * remember, C and J emissions are zero score by definition, + */ + + /* N state */ + xmx[i][XMN] = -INFTY; + if ((sc = ILogsum(xmx[i-1][XMN], posterior->xmx[i][XMN])) > -INFTY) + xmx[i][XMN] = sc; + + /* E state */ + xmx[i][XME] = -INFTY; + for (k = 1; k <= M; k++) + if ((sc = mmx[i][k]) > xmx[i][XME]) + xmx[i][XME] = sc; + + /* J state */ + xmx[i][XMJ] = -INFTY; + if ((sc = ILogsum(xmx[i-1][XMJ], posterior->xmx[i][XMJ])) > -INFTY) + xmx[i][XMJ] = sc; + if ((sc = xmx[i][XME]) > xmx[i][XMJ]) /* no E->J emission */ + xmx[i][XMJ] = sc; + + /* B state */ + xmx[i][XMB] = -INFTY; + if ((sc = xmx[i][XMN]) > -INFTY) + xmx[i][XMB] = sc; + if ((sc = xmx[i][XMJ]) > xmx[i][XMB]) + xmx[i][XMB] = sc; + + /* C state */ + xmx[i][XMC] = -INFTY; + if ((sc = ILogsum(xmx[i-1][XMC], posterior->xmx[i][XMC])) > -INFTY) + xmx[i][XMC] = sc; + if ((sc = xmx[i][XME]) > xmx[i][XMC]) /* no E->C emission */ + xmx[i][XMC] = sc; + } + + /* T state (not stored) */ + sc = xmx[L][XMC]; + + if (ret_tr != NULL) { + 
P7OptimalAccuracyTrace(L, M, posterior, mx, &tr); + *ret_tr = tr; + } + + return Score2Prob(sc,1); /* the log of the expected accuracy. */ +} + + +/* Function: P7OptimalAccuracyTrace() + * + * Purpose: Traceback of an optimal accuracy matrix: i.e. retrieval + * of optimum alignment. + * + * Args: L - length of sequence + * M - length of HMM + * posterior - the posterior matrix + * mx - the matrix to trace back in, (L+1) x M + * ret_tr - RETURN: traceback. + * + * Return: (void) + * ret_tr is allocated here. Free using P7FreeTrace(). + */ +void +P7OptimalAccuracyTrace(int L, + int M, + struct dpmatrix_s *posterior, + struct dpmatrix_s *mx, + struct p7trace_s **ret_tr) +{ + struct p7trace_s *tr; + int curralloc; /* current allocated length of trace */ + int tpos; /* position in trace */ + int i; /* position in seq (1..L) */ + int k; /* position in model (1..M) */ + int **xmx, **mmx, **imx, **dmx; + int sc; /* temp var for pre-emission score */ + + /* Overallocate for the trace. + * S-N-B- ... - E-C-T : 6 states + L is minimum trace; + * add L more as buffer. + */ + curralloc = L * 2 + 6; + P7AllocTrace(curralloc, &tr); + + xmx = mx->xmx; + mmx = mx->mmx; + imx = mx->imx; + dmx = mx->dmx; + + /* Initialization of trace + * We do it back to front; ReverseTrace() is called later. 
+ */ + tr->statetype[0] = STT; + tr->nodeidx[0] = 0; + tr->pos[0] = 0; + tr->statetype[1] = STC; + tr->nodeidx[1] = 0; + tr->pos[1] = 0; + tpos = 2; + i = L; /* current i (seq pos) we're trying to assign */ + + /* Traceback + */ + while (tr->statetype[tpos-1] != STS) { + switch (tr->statetype[tpos-1]) { + case STM: /* M connects from i-1,k-1, or B */ + sc = mmx[i+1][k+1]; + if (sc == ILogsum(mmx[i][k], posterior->mmx[i+1][k+1]) && i > 0 && k > 0) + { + tr->statetype[tpos] = STM; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = i--; + } + else if (sc == ILogsum(imx[i][k], posterior->mmx[i+1][k+1]) && i > 0 && k > 0) + { + tr->statetype[tpos] = STI; + tr->nodeidx[tpos] = k; + tr->pos[tpos] = i--; + } + else if (sc == ILogsum(dmx[i][k], posterior->mmx[i+1][k+1]) && i > 0 && k > 1) + { + tr->statetype[tpos] = STD; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = 0; + } + else if (sc == ILogsum(xmx[i][XMB], posterior->mmx[i+1][k+1])) + { + tr->statetype[tpos] = STB; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; + } + else Die("traceback failed"); + break; + + case STD: /* D connects from M,D */ + if (dmx[i][k+1] == mmx[i][k] && i > 0 && k > 0) + { + tr->statetype[tpos] = STM; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = i--; + } + else if (dmx[i][k+1] == dmx[i][k] && k > 1) + { + tr->statetype[tpos] = STD; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = 0; + } + else Die("traceback failed"); + break; + + case STI: /* I connects from M,I */ + sc = imx[i+1][k]; + if (sc == ILogsum(mmx[i][k], posterior->imx[i+1][k]) && i > 0 && k > 0) + { + tr->statetype[tpos] = STM; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = i--; + } + else if (sc == ILogsum(imx[i][k], posterior->imx[i+1][k]) && i > 0 && k > 0) + { + tr->statetype[tpos] = STI; + tr->nodeidx[tpos] = k; + tr->pos[tpos] = i--; + } + else Die("traceback failed"); + break; + + case STN: /* N connects from S, N */ + if (i == 0 && xmx[i][XMN] == -INFTY) + { + tr->statetype[tpos] = STS; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; + } + 
else if (i > 0 && xmx[i+1][XMN] == ILogsum(xmx[i][XMN], posterior->xmx[i+1][XMN]) && i > 0) + { + tr->statetype[tpos] = STN; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; /* note convention adherence: */ + tr->pos[tpos-1] = i--; /* first N doesn't emit */ + } + else Die("traceback failed"); + break; + + case STB: /* B connects from N, J */ + if (xmx[i][XMB] == xmx[i][XMN]) + { + tr->statetype[tpos] = STN; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; + } + else if (xmx[i][XMB] == xmx[i][XMJ]) + { + tr->statetype[tpos] = STJ; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; + } + else Die("traceback failed"); + break; + + case STE: /* E connects from any M state. k set here */ + for (k = M; k >= 1; k--) + if (xmx[i][XME] == mmx[i][k] && i > 0) + { + tr->statetype[tpos] = STM; + tr->nodeidx[tpos] = k--; + tr->pos[tpos] = i--; + break; + } + if (k <= 0) Die("traceback failed"); + break; + + case STC: /* C comes from C, E */ + if (xmx[i][XMC] == ILogsum(xmx[i-1][XMC], posterior->xmx[i][XMC]) && i > 0) + { + tr->statetype[tpos] = STC; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; /* note convention adherence: */ + tr->pos[tpos-1] = i--; /* first C doesn't emit */ + } + else if (xmx[i][XMC] == xmx[i][XME]) + { + tr->statetype[tpos] = STE; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; /* E is a nonemitter */ + } + else Die("Traceback failed."); + break; + + case STJ: /* J connects from E, J */ + if (xmx[i][XMJ] == ILogsum(xmx[i-1][XMJ], posterior->xmx[i][XMJ]) && i > 0) + { + tr->statetype[tpos] = STJ; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; /* note convention adherence: */ + tr->pos[tpos-1] = i--; /* first J doesn't emit */ + } + else if (xmx[i][XMJ] == xmx[i][XME]) + { + tr->statetype[tpos] = STE; + tr->nodeidx[tpos] = 0; + tr->pos[tpos] = 0; /* E is a nonemitter */ + } + else Die("Traceback failed."); + break; + + default: + Die("traceback failed"); + + } /* end switch over statetype[tpos-1] */ + + tpos++; + if (tpos == curralloc) + { /* grow trace if necessary */ + 
curralloc += L; + P7ReallocTrace(tr, curralloc); + } + + } /* end traceback, at S state; tpos == tlen now */ + tr->tlen = tpos; + P7ReverseTrace(tr); + *ret_tr = tr; + +} + + +/* Function: PostalCode() + * Date: SRE, Sun Nov 7 15:31:35 1999 [Cold Spring Harbor] + * + * Purpose: Given a traceback and one of Ian's posterior + * probability matrices, calculate a string that + * represents the confidence values on each + * residue in the sequence. + * + * The code string is 0..L-1 (L = len of target seq), + * so it's in the coordinate system of the sequence string; + * off by one from dsq; and convertible to the coordinate + * system of aseq using MakeAlignedString(). + * + * Values are 0-9,* + * for example, 9 means with >=90% posterior probabiility, + * residue i is aligned to the state k that it + * is assigned to in the given trace. + * + * Args: L - length of seq + * mx - posterior prob matrix: see P7EmitterPosterior() + * tr - a traceback to get a Postal code string for. + * + * Returns: char * array of codes, 0..L-1 + * Caller is responsible for free'ing it. + */ +static char +score2postcode(int sc) +{ + char i; + i = (char) (Score2Prob(sc, 1.) * 10.); + return ((i > 9) ? 
'*' : '0'+i); +} +char * +PostalCode(int L, struct dpmatrix_s *mx, struct p7trace_s *tr) +{ + int tpos; + int i; + int k; + char *postcode; + + postcode = MallocOrDie((L+1) * sizeof(char)); + for (tpos = 0; tpos < tr->tlen; tpos++) + { + i = tr->pos[tpos]; + k = tr->nodeidx[tpos]; + if (i == 0) continue; + + switch (tr->statetype[tpos]) { + case STM: postcode[i-1] = score2postcode(mx->mmx[i][k]); break; + case STI: postcode[i-1] = score2postcode(mx->imx[i][k]); break; + case STN: postcode[i-1] = score2postcode(mx->xmx[i][XMN]); break; + case STC: postcode[i-1] = score2postcode(mx->xmx[i][XMC]); break; + case STJ: postcode[i-1] = score2postcode(mx->xmx[i][XMJ]); break; + } + } + postcode[L] = '\0'; + + return postcode; +} diff --git a/forester/archive/RIO/others/hmmer/src/postprob.h b/forester/archive/RIO/others/hmmer/src/postprob.h new file mode 100644 index 0000000..b09c036 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/postprob.h @@ -0,0 +1,55 @@ +/************************************************************ + * Copyright (C) 1998 Ian Holmes (ihh@sanger.ac.uk) + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* postprob.h + * Author: Ian Holmes (ihh@sanger.ac.uk, Jun 5 1998) + * Derived from core_algorithms.c (SRE, Nov 11 1996) + * Incorporated SRE, Sat Nov 6 09:07:02 1999 + * + * Functions for working with posterior probabilities, + * including unfussed "backwards" and "optimal accuracy" + * implementations. + */ + +#ifndef POSTPROB_INCLUDED +#define POSTPROB_INCLUDED + +#include "structs.h" +#include "config.h" +#include "funcs.h" +#include "squid.h" + +/* Extra algorithms to work with posterior probabilities. 
+ */ + +extern float P7OptimalAccuracy(char *dsq, int L, struct plan7_s *hmm, + struct p7trace_s **ret_tr); + +extern float P7Backward(char *dsq, int L, struct plan7_s *hmm, + struct dpmatrix_s **ret_mx); + +extern void P7EmitterPosterior(int L, struct plan7_s *hmm, + struct dpmatrix_s *forward, + struct dpmatrix_s *backward, + struct dpmatrix_s *mx); + +extern float P7FillOptimalAccuracy(int L, int M, + struct dpmatrix_s *posterior, + struct dpmatrix_s *mx, + struct p7trace_s **ret_tr); + +extern void P7OptimalAccuracyTrace(int L, int M, + struct dpmatrix_s *posterior, + struct dpmatrix_s *mx, + struct p7trace_s **ret_tr); + +#endif + diff --git a/forester/archive/RIO/others/hmmer/src/prior.c b/forester/archive/RIO/others/hmmer/src/prior.c new file mode 100644 index 0000000..b2475ac --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/prior.c @@ -0,0 +1,725 @@ +/***************************************************************** + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + *****************************************************************/ + +/* prior.c + * SRE, Mon Nov 18 15:44:08 1996 + * + * Support for Dirichlet prior data structure, p7prior_s. + */ + +#include "config.h" +#include "structs.h" +#include "funcs.h" +#include "squid.h" + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + +static struct p7prior_s *default_amino_prior(void); +static struct p7prior_s *default_nucleic_prior(void); + +/* Function: P7AllocPrior(), P7FreePrior() + * + * Purpose: Allocation and free'ing of a prior structure. + * Very simple, but might get more complex someday. 
+ */ +struct p7prior_s * +P7AllocPrior(void) +{ return (struct p7prior_s *) MallocOrDie (sizeof(struct p7prior_s)); } +void +P7FreePrior(struct p7prior_s *pri) +{ free(pri); } + + +/* Function: P7LaplacePrior() + * + * Purpose: Create a Laplace plus-one prior. (single component Dirichlets). + * Global alphabet info is assumed to have been set already. + * + * Args: (void) + * + * Return: prior. Allocated here; call FreePrior() to free it. + */ +struct p7prior_s * +P7LaplacePrior(void) +{ + struct p7prior_s *pri; + + pri = P7AllocPrior(); + pri->strategy = PRI_DCHLET; + + pri->tnum = 1; + pri->tq[0] = 1.; + FSet(pri->t[0], 8, 1.); + + pri->mnum = 1; + pri->mq[0] = 1.; + FSet(pri->m[0], Alphabet_size, 1.); + + pri->inum = 1; + pri->iq[0] = 1.; + FSet(pri->i[0], Alphabet_size, 1.); + + return pri; +} + +/* Function: P7DefaultPrior() + * + * Purpose: Set up a somewhat more realistic single component + * Dirichlet prior than Laplace. + */ +struct p7prior_s * +P7DefaultPrior(void) +{ + switch (Alphabet_type) { + case hmmAMINO: return default_amino_prior(); + case hmmNUCLEIC: return default_nucleic_prior(); + case hmmNOTSETYET: Die("Can't set prior; alphabet type not set yet"); + } + /*NOTREACHED*/ + return NULL; +} + +/* Function: P7ReadPrior() + * + * Purpose: Input a prior from disk file. 
+ */ +struct p7prior_s * +P7ReadPrior(char *prifile) +{ + FILE *fp; + struct p7prior_s *pri; + char *sptr; + int q, x; + + if ((fp = fopen(prifile, "r")) == NULL) + Die("Failed to open HMMER prior file %s\n", prifile); + pri = P7AllocPrior(); + + /* First entry is the strategy: + * Only standard Dirichlet prior (simple or mixture) is supported in Plan7 so far + */ + sptr = Getword(fp, sqdARG_STRING); + s2upper(sptr); + if (strcmp(sptr, "DIRICHLET") == 0) pri->strategy = PRI_DCHLET; + else Die("No such prior strategy %s; failed to parse file %s", sptr, prifile); + + /* Second entry is the alphabet type: + * Amino or Nucleic + */ + sptr = Getword(fp, sqdARG_STRING); + s2upper(sptr); + if (strcmp(sptr, "AMINO") == 0) + { + if (Alphabet_type != hmmAMINO) + Die("HMM and/or sequences are DNA/RNA; can't use protein prior %s", prifile); + } + else if (strcmp(sptr, "NUCLEIC") == 0) + { + if (Alphabet_type != hmmNUCLEIC) + Die("HMM and/or sequences are protein; can't use DNA/RNA prior %s", prifile); + } + else + Die("Alphabet \"%s\" in prior file %s isn't valid.", sptr, prifile); + + /* State transition priors: + * # of mixtures. + * then for each mixture: + * prior P(q) + * Dirichlet terms for Tmm, Tmi, Tmd, Tim, Tii, Tid, Tdm, Tdi, Tdd + */ + pri->tnum = atoi(Getword(fp, sqdARG_INT)); + if (pri->tnum < 0) + Die("%d is bad; need at least one state transition mixture component", pri->tnum); + if (pri->tnum > MAXDCHLET) + Die("%d is bad, too many transition components (MAXDCHLET = %d)\n", MAXDCHLET); + for (q = 0; q < pri->tnum; q++) + { + pri->tq[q] = (float) atof(Getword(fp, sqdARG_FLOAT)); + for (x = 0; x < 7; x++) + pri->t[q][x] = (float) atof(Getword(fp, sqdARG_FLOAT)); + } + + /* Match emission priors: + * # of mixtures. 
+ * then for each mixture: + * prior P(q) + * Dirichlet terms for Alphabet_size symbols in Alphabet + */ + pri->mnum = atoi(Getword(fp, sqdARG_INT)); + if (pri->mnum < 0) + Die("%d is bad; need at least one match emission mixture component", pri->mnum); + if (pri->mnum > MAXDCHLET) + Die("%d is bad; too many match components (MAXDCHLET = %d)\n", pri->mnum, MAXDCHLET); + + for (q = 0; q < pri->mnum; q++) + { + pri->mq[q] = (float) atof(Getword(fp, sqdARG_FLOAT)); + for (x = 0; x < Alphabet_size; x++) + pri->m[q][x] = (float) atof(Getword(fp, sqdARG_FLOAT)); + } + + /* Insert emission priors: + * # of mixtures. + * then for each mixture component: + * prior P(q) + * Dirichlet terms for Alphabet_size symbols in Alphabet + */ + pri->inum = atoi(Getword(fp, sqdARG_INT)); + if (pri->inum < 0) + Die("%d is bad; need at least one insert emission mixture component", pri->inum); + if (pri->inum > MAXDCHLET) + Die("%d is bad; too many insert components (MAXDCHLET = %d)\n", pri->inum, MAXDCHLET); + for (q = 0; q < pri->inum; q++) + { + pri->iq[q] = (float) atof(Getword(fp, sqdARG_FLOAT)); + for (x = 0; x < Alphabet_size; x++) + pri->i[q][x] = (float) atof(Getword(fp, sqdARG_FLOAT)); + } + + fclose(fp); + return pri; +} + + +/* Function: PAMPrior() + * + * Purpose: Produces an ad hoc "Dirichlet mixture" prior for + * match emissions, using a PAM matrix. + * + * Side effect notice: PAMPrior() replaces the match + * emission section of an existing Dirichlet prior, + * which is /expected/ to be a simple one-component + * kind of prior. The insert emissions /must/ be a + * one-component prior (because of details in how + * PriorifyEmissionVector() is done). However, + * the transitions /could/ be a mixture Dirichlet prior + * without causing problems. In other words, the + * -p and -P options of hmmb can coexist, but there + * may be conflicts. 
PAMPrior() checks for these, + * so there's no serious problem, except that the + * error message from PAMPrior() might be confusing to + * a user. + */ +void +PAMPrior(char *pamfile, struct p7prior_s *pri, float wt) +{ + FILE *fp; + char *blastpamfile; /* BLAST looks in aa/ subdirectory of BLASTMAT */ + int **pam; + float scale; + int xi, xj; + int idx1, idx2; + + if (Alphabet_type != hmmAMINO) + Die("PAM prior is only valid for protein sequences"); + if (pri->strategy != PRI_DCHLET) + Die("PAM prior may only be applied over an existing Dirichlet prior"); + if (pri->inum != 1) + Die("PAM prior requires that the insert emissions be a single Dirichlet"); + if (MAXDCHLET < 20) + Die("Whoa, code is misconfigured; MAXDCHLET must be >= 20 for PAM prior"); + + blastpamfile = FileConcat("aa", pamfile); + + if ((fp = fopen(pamfile, "r")) == NULL && + (fp = EnvFileOpen(pamfile, "BLASTMAT", NULL)) == NULL && + (fp = EnvFileOpen(blastpamfile, "BLASTMAT", NULL)) == NULL) + Die("Failed to open PAM scoring matrix file %s", pamfile); + if (! ParsePAMFile(fp, &pam, &scale)) + Die("Failed to parse PAM scoring matrix file %s", pamfile); + fclose(fp); + free(blastpamfile); + + pri->strategy = PRI_PAM; + pri->mnum = 20; + + /* Convert PAM entries back to conditional prob's P(xj | xi), + * which we'll use as "pseudocounts" weighted by wt. + */ + for (xi = 0; xi < Alphabet_size; xi++) + for (xj = 0; xj < Alphabet_size; xj++) + { + idx1 = Alphabet[xi] - 'A'; + idx2 = Alphabet[xj] - 'A'; + pri->m[xi][xj] = aafq[xj] * exp((float) pam[idx1][idx2] * scale); + } + + /* Normalize so that rows add up to wt. + * i.e. Sum(xj) mat[xi][xj] = wt for every row xi + */ + for (xi = 0; xi < Alphabet_size; xi++) + { + pri->mq[xi] = 1. 
/ Alphabet_size; + FNorm(pri->m[xi], Alphabet_size); + FScale(pri->m[xi], Alphabet_size, wt); + } + + Free2DArray((void **)pam,27); +} + + +/* Function: P7DefaultNullModel() + * + * Purpose: Set up a default random sequence model, using + * global aafq[]'s for protein or 1/Alphabet_size for anything + * else. randomseq is alloc'ed in caller. Alphabet information + * must already be known. + */ +void +P7DefaultNullModel(float *null, float *ret_p1) +{ + int x; + if (Alphabet_type == hmmAMINO) { + for (x = 0; x < Alphabet_size; x++) + null[x] = aafq[x]; + *ret_p1 = 350./351.; /* rationale: approx avg protein length. */ + } else { + for (x = 0; x < Alphabet_size; x++) + null[x] = 1.0 / (float) Alphabet_size; + *ret_p1 = 1000./1001.; /* rationale: approx inter-Alu distance. */ + } +} + +void +P7ReadNullModel(char *rndfile, float *null, float *ret_p1) +{ + FILE *fp; + char *s; + int x; + int type = 0; + + if ((fp = fopen(rndfile, "r")) == NULL) + Die("Failed to open null model file %s\n", rndfile); + if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; + s2upper(s); + if (strcmp(s, "NUCLEIC") == 0) type = hmmNUCLEIC; + else if (strcmp(s, "AMINO") == 0) type = hmmAMINO; + else goto FAILURE; + /* check/set alphabet type */ + if (Alphabet_type == 0) + SetAlphabet(type); + else if (Alphabet_type != type) + Die("Alphabet type conflict; null model in %s is inappropriate\n", rndfile); + /* parse the file */ + for (x = 0; x < Alphabet_size; x++) { + if ((s = Getword(fp, sqdARG_FLOAT)) == NULL) goto FAILURE; + null[x] = atof(s); + } + if ((s = Getword(fp, sqdARG_FLOAT)) == NULL) goto FAILURE; + *ret_p1 = atof(s); + + fclose(fp); + return; + +FAILURE: + fclose(fp); + Die("%s is not in HMMER null model file format", rndfile); +} + + +/* Function: P7PriorifyHMM() + * + * Purpose: Add pseudocounts to an HMM using Dirichlet priors, + * and renormalize the HMM. 
+ * + * Args: hmm -- the HMM to add counts to (counts form) + * pri -- the Dirichlet prior to use + * + * Return: (void) + * HMM returns in probability form. + */ +void +P7PriorifyHMM(struct plan7_s *hmm, struct p7prior_s *pri) +{ + int k; /* counter for model position */ + float d; /* a denominator */ + float tq[MAXDCHLET]; /* prior distribution over mixtures */ + float mq[MAXDCHLET]; /* prior distribution over mixtures */ + float iq[MAXDCHLET]; /* prior distribution over mixtures */ + + /* Model-dependent transitions are handled simply; Laplace. + */ + FSet(hmm->begin+2, hmm->M-1, 0.); /* wipe internal BM entries */ + FSet(hmm->end+1, hmm->M-1, 0.); /* wipe internal ME exits */ + d = hmm->tbd1 + hmm->begin[1] + 2.; + hmm->tbd1 = (hmm->tbd1 + 1.)/ d; + hmm->begin[1] = (hmm->begin[1] + 1.)/ d; + hmm->end[hmm->M] = 1.0; + + /* Main model transitions and emissions + */ + for (k = 1; k < hmm->M; k++) + { + /* The following code chunk is experimental. + * Collaboration with Michael Asman, Erik Sonnhammer, CGR Stockholm. + * Only activated if X-PR* annotation has been used, in which + * priors are overridden and a single Dirichlet component is + * specified for each column (using structural annotation). + * If X-PR* annotation is not used, which is usually the case, + * the following code has no effect (observe how the real prior + * distributions are copied into tq, mq, iq). 
+ */ + if (hmm->tpri != NULL && hmm->tpri[k] >= 0) + { + if (hmm->tpri[k] >= pri->tnum) Die("X-PRT annotation out of range"); + FSet(tq, pri->tnum, 0.0); + tq[hmm->tpri[k]] = 1.0; + } + else + FCopy(tq, pri->tq, pri->tnum); + if (hmm->mpri != NULL && hmm->mpri[k] >= 0) + { + if (hmm->mpri[k] >= pri->mnum) Die("X-PRM annotation out of range"); + FSet(mq, pri->mnum, 0.0); + mq[hmm->mpri[k]] = 1.0; + } + else + FCopy(mq, pri->mq, pri->mnum); + if (hmm->ipri != NULL && hmm->ipri[k] >= 0) + { + if (hmm->ipri[k] >= pri->inum) Die("X-PRI annotation out of range"); + FSet(iq, pri->inum, 0.0); + iq[hmm->ipri[k]] = 1.0; + } + else + FCopy(iq, pri->iq, pri->inum); + + /* This is the main line of the code: + */ + P7PriorifyTransitionVector(hmm->t[k], pri, tq); + P7PriorifyEmissionVector(hmm->mat[k], pri, pri->mnum, mq, pri->m, NULL); + P7PriorifyEmissionVector(hmm->ins[k], pri, pri->inum, iq, pri->i, NULL); + } + + /* We repeat the above steps just for the final match state, M. + */ + if (hmm->mpri != NULL && hmm->mpri[hmm->M] >= 0) + { + if (hmm->mpri[hmm->M] >= pri->mnum) Die("X-PRM annotation out of range"); + FSet(mq, pri->mnum, 0.0); + mq[hmm->mpri[hmm->M]] = 1.0; + } + else + FCopy(mq, pri->mq, pri->mnum); + + P7PriorifyEmissionVector(hmm->mat[hmm->M], pri, pri->mnum, mq, pri->m, NULL); + + /* Now we're done. Convert the counts-based HMM to probabilities. + */ + Plan7Renormalize(hmm); +} + + +/* Function: P7PriorifyEmissionVector() + * + * Purpose: Add prior pseudocounts to an observed + * emission count vector and renormalize. + * + * Can return the posterior mixture probabilities + * P(q | counts) if ret_mix[MAXDCHLET] is passed. + * Else, pass NULL. 
+ * + * Args: vec - the 4 or 20-long vector of counts to modify + * pri - prior data structure + * num - pri->mnum or pri->inum; # of mixtures + * eq - pri->mq or pri->iq; prior mixture probabilities + * e - pri->i or pri->m; Dirichlet components + * ret_mix - filled with posterior mixture probabilities, or NULL + * + * Return: (void) + * The counts in vec are changed and normalized to probabilities. + */ +void +P7PriorifyEmissionVector(float *vec, struct p7prior_s *pri, + int num, float eq[MAXDCHLET], float e[MAXDCHLET][MAXABET], + float *ret_mix) +{ + int x; /* counter over vec */ + int q; /* counter over mixtures */ + float mix[MAXDCHLET]; /* posterior distribution over mixtures */ + float totc; /* total counts */ + float tota; /* total alpha terms */ + float xi; /* X_i term, Sjolander eq. 41 */ + + /* Calculate mix[], which is the posterior probability + * P(q | n) of mixture component q given the count vector n + * + * (side effect note: note that an insert vector in a PAM prior + * is passed with num = 1, bypassing pam prior code; this means + * that inserts cannot be mixture Dirichlets...) + * [SRE, 12/24/00: the above comment is cryptic! what the hell does that + * mean, inserts can't be mixtures? doesn't seem to be true. it + * may mean that in a PAM prior, you can't have a mixture for inserts, + * but I don't even understand that. The insert vectors aren't passed + * with num=1!!] + */ + mix[0] = 1.0; + if (pri->strategy == PRI_DCHLET && num > 1) + { + for (q = 0; q < num; q++) + { + mix[q] = eq[q] > 0.0 ? 
log(eq[q]) : -999.; + mix[q] += Logp_cvec(vec, Alphabet_size, e[q]); + } + LogNorm(mix, num); /* now mix[q] is P(component_q | n) */ + } + else if (pri->strategy == PRI_PAM && num > 1) + { /* pam prior uses aa frequencies as `P(q|n)' */ + for (q = 0; q < Alphabet_size; q++) + mix[q] = vec[q]; + FNorm(mix, Alphabet_size); + } + + /* Convert the counts to probabilities, following Sjolander (1996) + */ + totc = FSum(vec, Alphabet_size); + for (x = 0; x < Alphabet_size; x++) { + xi = 0.0; + for (q = 0; q < num; q++) { + tota = FSum(e[q], Alphabet_size); + xi += mix[q] * (vec[x] + e[q][x]) / (totc + tota); + } + vec[x] = xi; + } + FNorm(vec, Alphabet_size); + + if (ret_mix != NULL) + for (q = 0; q < num; q++) + ret_mix[q] = mix[q]; +} + + + +/* Function: P7PriorifyTransitionVector() + * + * Purpose: Add prior pseudocounts to transition vector, + * which contains three different probability vectors + * for m, d, and i. + * + * Args: t - state transitions, counts: 3 for M, 2 for I, 2 for D. + * prior - Dirichlet prior information + * tq - prior distribution over Dirichlet components. + * (overrides prior->iq[]; used for alternative + * methods of conditioning prior on structural data) + * + * Return: (void) + * t is changed, and renormalized -- comes back as + * probability vectors. + */ +void +P7PriorifyTransitionVector(float *t, struct p7prior_s *prior, + float tq[MAXDCHLET]) +{ + int ts; + int q; + float mix[MAXDCHLET]; + float totm, totd, toti; /* total counts in three transition vecs */ + float xi; /* Sjolander's X_i term */ + + mix[0] = 1.0; /* default is simple one component */ + if ((prior->strategy == PRI_DCHLET || prior->strategy == PRI_PAM) && prior->mnum > 1) + { + for (q = 0; q < prior->tnum; q++) + { + mix[q] = tq[q] > 0.0 ? 
log(tq[q]) : -999.; + mix[q] += Logp_cvec(t, 3, prior->t[q]); /* 3 match */ + mix[q] += Logp_cvec(t+3, 2, prior->t[q]+3); /* 2 insert */ + mix[q] += Logp_cvec(t+5, 2, prior->t[q]+5); /* 2 delete */ + } + LogNorm(mix, prior->tnum); /* mix[q] is now P(q | counts) */ + } + /* precalc some denominators */ + totm = FSum(t,3); + toti = t[TIM] + t[TII]; + totd = t[TDM] + t[TDD]; + + for (ts = 0; ts < 7; ts++) + { + xi = 0.0; + for (q = 0; q < prior->tnum; q++) + { + switch (ts) { + case TMM: case TMI: case TMD: + xi += mix[q] * (t[ts] + prior->t[q][ts]) / + (totm + FSum(prior->t[q], 3)); + break; + case TIM: case TII: + xi += mix[q] * (t[ts] + prior->t[q][ts]) / + (toti + prior->t[q][TIM] + prior->t[q][TII]); + break; + case TDM: case TDD: + xi += mix[q] * (t[ts] + prior->t[q][ts]) / + (totd + prior->t[q][TDM] + prior->t[q][TDD]); + break; + } + } + t[ts] = xi; + } + FNorm(t, 3); /* match */ + FNorm(t+3, 2); /* insert */ + FNorm(t+5, 2); /* delete */ +} + + +/* Function: default_amino_prior() + * + * Purpose: Set the default protein prior. 
+ */ +static struct p7prior_s * +default_amino_prior(void) +{ + struct p7prior_s *pri; + int q, x; + /* default match mixture coefficients */ + static float defmq[9] = { + 0.178091, 0.056591, 0.0960191, 0.0781233, 0.0834977, + 0.0904123, 0.114468, 0.0682132, 0.234585 }; + + /* default match mixture Dirichlet components */ + static float defm[9][20] = { + { 0.270671, 0.039848, 0.017576, 0.016415, 0.014268, + 0.131916, 0.012391, 0.022599, 0.020358, 0.030727, + 0.015315, 0.048298, 0.053803, 0.020662, 0.023612, + 0.216147, 0.147226, 0.065438, 0.003758, 0.009621 }, + { 0.021465, 0.010300, 0.011741, 0.010883, 0.385651, + 0.016416, 0.076196, 0.035329, 0.013921, 0.093517, + 0.022034, 0.028593, 0.013086, 0.023011, 0.018866, + 0.029156, 0.018153, 0.036100, 0.071770, 0.419641 }, + { 0.561459, 0.045448, 0.438366, 0.764167, 0.087364, + 0.259114, 0.214940, 0.145928, 0.762204, 0.247320, + 0.118662, 0.441564, 0.174822, 0.530840, 0.465529, + 0.583402, 0.445586, 0.227050, 0.029510, 0.121090 }, + { 0.070143, 0.011140, 0.019479, 0.094657, 0.013162, + 0.048038, 0.077000, 0.032939, 0.576639, 0.072293, + 0.028240, 0.080372, 0.037661, 0.185037, 0.506783, + 0.073732, 0.071587, 0.042532, 0.011254, 0.028723 }, + { 0.041103, 0.014794, 0.005610, 0.010216, 0.153602, + 0.007797, 0.007175, 0.299635, 0.010849, 0.999446, + 0.210189, 0.006127, 0.013021, 0.019798, 0.014509, + 0.012049, 0.035799, 0.180085, 0.012744, 0.026466 }, + { 0.115607, 0.037381, 0.012414, 0.018179, 0.051778, + 0.017255, 0.004911, 0.796882, 0.017074, 0.285858, + 0.075811, 0.014548, 0.015092, 0.011382, 0.012696, + 0.027535, 0.088333, 0.944340, 0.004373, 0.016741 }, + { 0.093461, 0.004737, 0.387252, 0.347841, 0.010822, + 0.105877, 0.049776, 0.014963, 0.094276, 0.027761, + 0.010040, 0.187869, 0.050018, 0.110039, 0.038668, + 0.119471, 0.065802, 0.025430, 0.003215, 0.018742 }, + { 0.452171, 0.114613, 0.062460, 0.115702, 0.284246, + 0.140204, 0.100358, 0.550230, 0.143995, 0.700649, + 0.276580, 0.118569, 0.097470, 0.126673, 0.143634, + 
0.278983, 0.358482, 0.661750, 0.061533, 0.199373 }, + { 0.005193, 0.004039, 0.006722, 0.006121, 0.003468, + 0.016931, 0.003647, 0.002184, 0.005019, 0.005990, + 0.001473, 0.004158, 0.009055, 0.003630, 0.006583, + 0.003172, 0.003690, 0.002967, 0.002772, 0.002686 }, + }; + + pri = P7AllocPrior(); + pri->strategy = PRI_DCHLET; + + /* Transition priors are subjective, but borrowed from GJM's estimations + * on Pfam + */ + pri->tnum = 1; + pri->tq[0] = 1.0; + pri->t[0][TMM] = 0.7939; + pri->t[0][TMI] = 0.0278; + pri->t[0][TMD] = 0.0135; + pri->t[0][TIM] = 0.1551; + pri->t[0][TII] = 0.1331; + pri->t[0][TDM] = 0.9002; + pri->t[0][TDD] = 0.5630; + + /* Match emission priors are a mixture Dirichlet, + * from Kimmen Sjolander (Blocks9) + */ + pri->mnum = 9; + for (q = 0; q < pri->mnum; q++) + { + pri->mq[q] = defmq[q]; + for (x = 0; x < 20; x++) + pri->m[q][x] = defm[q][x]; + } + + /* These insert emission priors are subjective. Observed frequencies + * were obtained from PFAM 1.0, 10 Nov 96; + * see ~/projects/plan7/InsertStatistics. + * Inserts are slightly biased towards polar residues and away from + * hydrophobic residues. + */ + pri->inum = 1; + pri->iq[0] = 1.; + pri->i[0][0] = 681.; /* A */ + pri->i[0][1] = 120.; /* C */ + pri->i[0][2] = 623.; /* D */ + pri->i[0][3] = 651.; /* E */ + pri->i[0][4] = 313.; /* F */ + pri->i[0][5] = 902.; /* G */ + pri->i[0][6] = 241.; /* H */ + pri->i[0][7] = 371.; /* I */ + pri->i[0][8] = 687.; /* K */ + pri->i[0][9] = 676.; /* L */ + pri->i[0][10] = 143.; /* M */ + pri->i[0][11] = 548.; /* N */ + pri->i[0][12] = 647.; /* P */ + pri->i[0][13] = 415.; /* Q */ + pri->i[0][14] = 551.; /* R */ + pri->i[0][15] = 926.; /* S */ + pri->i[0][16] = 623.; /* T */ + pri->i[0][17] = 505.; /* V */ + pri->i[0][18] = 102.; /* W */ + pri->i[0][19] = 269.; /* Y */ + + return pri; +} + + +/* Function: default_nucleic_prior() + * + * Purpose: Set the default DNA prior. 
(for now, almost a Laplace) + */ +static struct p7prior_s * +default_nucleic_prior(void) +{ + struct p7prior_s *pri; + + pri = P7AllocPrior(); + pri->strategy = PRI_DCHLET; + + /* The use of the Pfam-trained amino acid transition priors + * here is TOTALLY bogus. But it works better than a straight + * Laplace, esp. for Maxmodelmaker(). For example, a Laplace + * prior builds M=1 models for a single sequence GAATTC (at + * one time an open "bug"). + */ + pri->tnum = 1; + pri->tq[0] = 1.; + pri->t[0][TMM] = 0.7939; + pri->t[0][TMI] = 0.0278; + pri->t[0][TMD] = 0.0135; + pri->t[0][TIM] = 0.1551; + pri->t[0][TII] = 0.1331; + pri->t[0][TDM] = 0.9002; + pri->t[0][TDD] = 0.5630; + + pri->mnum = 1; + pri->mq[0] = 1.; + FSet(pri->m[0], Alphabet_size, 1.); + + pri->inum = 1; + pri->iq[0] = 1.; + FSet(pri->i[0], Alphabet_size, 1.); + + return pri; +} + diff --git a/forester/archive/RIO/others/hmmer/src/pvm.c b/forester/archive/RIO/others/hmmer/src/pvm.c new file mode 100644 index 0000000..fd5b4bb --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/pvm.c @@ -0,0 +1,453 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* pvm.c + * SRE, Wed Aug 5 15:40:09 1998 [St. Louis] + * + * PVM code shared amongst pvm masters and slaves. + * + * CVS $Id: pvm.c,v 1.1.1.1 2005/03/22 08:34:00 cmzmasek Exp $ + */ +#ifdef HMMER_PVM + +#include +#include +#include +#include +#include + +#include "version.h" +#include "structs.h" +#include "funcs.h" +#include "squid.h" +#include "sqfuncs.h" + +/* Function: PVMSpawnSlaves() + * Date: SRE, Wed Aug 19 14:01:39 1998 [St. Louis] + * + * Purpose: Spawn the slaves. 
+ * We use the "speed" field for each host to + * determine how many tasks should be started + * on it. 1000 indicates a single processor; + * 2000 indicates a dual processor; etc. + * Since hmmpfam-pvm load balances automatically, + * the relative speed of the processor(s) is + * irrelevant. + * + * Args: slave - name of slave process to spawn ("hmmpfam-slave") + * ret_tid - RETURN: malloc'ed list of slave tid's. + * ret_nslaves - RETURN: total number of slaves. + * + * Returns: (void). + * caller must free() ret_tid. + */ +void +PVMSpawnSlaves(char *slave, int **ret_tid, int *ret_nslaves) +{ + struct pvmhostinfo *hostp; + int nodes; /* total number of nodes in the VM */ + int nslaves; /* RETURN: total number of slaves */ + int ntasks; /* number of tasks to start on this node */ + int code; /* a code returned from a PVM call */ + int *tid; /* array of slave task tids */ + int *dtid; /* array of host PVMD tids; for pvm_notify() */ + int i; + + SQD_DPRINTF1(("requesting PVM configuration...\n")); + if (pvm_config(&nodes, NULL, &hostp) != 0) Die("PVM not responding"); + dtid = MallocOrDie(sizeof(int) * nodes); + nslaves = 0; + for (i = 0; i < nodes; i++) + { + dtid[i] = hostp[i].hi_tid; + ntasks = hostp[i].hi_speed / 1000; + if (ntasks == 0) continue; + + if (nslaves == 0) tid = MallocOrDie(sizeof(int) * ntasks); + else tid = ReallocOrDie(tid, sizeof(int) * (ntasks+nslaves)); + code = pvm_spawn(slave, NULL, PvmTaskHost, hostp[i].hi_name, ntasks, tid + nslaves); + if (code < ntasks) { /* Careful error diagnostics. Important! 
*/ + pvm_exit(); + switch (*(tid+nslaves)) { + case PvmBadParam: + Die("pvm_spawn claims PvmBadParam - code error?"); + case PvmNoHost: + Die("pvm_spawn: host %d (%s): not in virtual machine", + i, hostp[i].hi_name); + case PvmNoFile: + Die("pvm_spawn: host %d (%s): %s not in path", + i+1, hostp[i].hi_name, slave); + case PvmNoMem: + Die("pvm_spawn claims that host %s has insufficient memory", + hostp[i].hi_name); + case PvmSysErr: + Die("pvm_spawn: host %d (%s): pvmd not responding", + i+1, hostp[i].hi_name); + case PvmOutOfRes: + Die("pvm_spawn claims it is out of resources."); + default: + Die("Spawned too few slaves on node %s; expected %d got %d\n", hostp[i].hi_name, ntasks, code); + } + } + nslaves += ntasks; + SQD_DPRINTF1(("Spawned %d slaves on host %s...\n", ntasks, hostp[i].hi_name)); + } + if (nslaves == 0) { pvm_exit(); Die("No slaves were spawned"); } + + /* Arrange to be notified in case of trouble + */ + if (pvm_notify(PvmTaskExit, HMMPVM_TASK_TROUBLE, nslaves, tid) != 0) + { pvm_exit(); Die("pvm_notify() unexpectedly failed"); } + if (pvm_notify(PvmHostDelete, HMMPVM_HOST_TROUBLE, nodes, dtid) != 0) + { pvm_exit(); Die("pvm_notify() unexpectedly failed"); } + + *ret_tid = tid; + *ret_nslaves = nslaves; + free(dtid); + return; +} + +/* Function: PVMConfirmSlaves() + * Date: SRE, Mon Oct 26 17:31:42 1998 [St. Louis] + * + * Purpose: Make sure all the slaves initialized properly; + * after the master spawns and initializes them, + * they're supposed to send back a code. Valid + * codes are in structs.h and include: + * HMMPVM_OK everything's fine + * HMMPVM_NO_HMMFILE file not found (hmmpfam) + * HMMPVM_NO_INDEX no SSI file found (hmmpfam) + * HMMPVM_BAD_INIT miscellaneous error + * They also send back the RELEASE code, which + * must match the master. This was added as an + * integrity check for bug#1. + * + * Args: slave_tid array of nslaves TIDs + * nslaves number of slaves + * + * Returns: (void) + * If everything isn't OK, we Die() here. 
+ */ +void +PVMConfirmSlaves(int *slave_tid, int nslaves) +{ + struct pvmhostinfo *hostp; + int nodes; + int i; + struct timeval tmout; + int code; /* code returned by slave */ + int bufid; + char *slaverelease; + + tmout.tv_sec = 5; /* wait 5 sec before giving up on a slave. */ + tmout.tv_usec = 0; + + SQD_DPRINTF1(("requesting PVM configuration...\n")); + if (pvm_config(&nodes, NULL, &hostp) != 0) Die("PVM not responding"); + SQD_DPRINTF1(("Slaves, count off!\n")); + for (i = 0; i < nslaves; i++) + { + /* Do a timeout receive. If we don't hear back pronto + * from our slaves, we've got a problem. + */ + if ((bufid = pvm_trecv(-1, HMMPVM_RESULTS, &tmout)) <= 0) + { + SQD_DPRINTF1(("Slave %d (%s) gives bufid %d.\n", i, hostp[i].hi_name, bufid)); + PVMKillSlaves(slave_tid, nslaves); + pvm_exit(); + Die("One or more slaves started but died before initializing."); + } + + SQD_DPRINTF1(("Slave %d (%s): present, sir!\n", i, hostp[i].hi_name)); + pvm_upkint(&code, 1, 1); + slaverelease = PVMUnpackString(); + + if (code != HMMPVM_OK) + { + PVMKillSlaves(slave_tid, nslaves); + pvm_exit(); + switch (code) { + case HMMPVM_NO_HMMFILE: + Die("One or more PVM slaves couldn't open hmm file. Check installation."); + case HMMPVM_NO_INDEX: + Die("One or more PVM slaves couldn't open SSI index for hmm file. Check installation."); + case HMMPVM_BAD_INIT: + Die("One or more PVM slaves reports a failure to initialize."); + default: + Die("Unknown error code. A slave is confused."); + } + } + + if (strcmp(slaverelease, RELEASE) != 0) + { + PVMKillSlaves(slave_tid, nslaves); + pvm_exit(); + Die("Slave %d reports that it's running release %s, which doesn't match the master (%s)", i, slaverelease, RELEASE); + } + } +} + + + +/* Function: PVMCheckSlaves() + * Date: SRE, Fri Aug 14 09:04:25 1998 [St. Louis] + * + * Purpose: Make sure all the slaves are alive. If they + * aren't, kill the rest, and die. 
+ * + * Args: slave_tid - array of slave TIDs + * nslaves - number of slaves + * + * Returns: void + */ +void +PVMCheckSlaves(int *slave_tid, int nslaves) +{ + int trouble; /* non-zero if a trouble message is waiting */ + + trouble = pvm_nrecv(-1, HMMPVM_TASK_TROUBLE); + if (trouble > 0) + { + PVMKillSlaves(slave_tid, nslaves); + pvm_exit(); Die("One or more slave tasks exited prematurely. Shutting down."); + } + trouble = pvm_nrecv(-1, HMMPVM_HOST_TROUBLE); + if (trouble > 0) + { + PVMKillSlaves(slave_tid, nslaves); + pvm_exit(); Die("One or more hosts left the PVM unexpectedly. Shutting down."); + } +} + +/* Function: PVMKillSlaves() + * Date: SRE, Thu Aug 13 16:27:40 1998 [St. Louis] + * + * Purpose: shut down the slaves, after a fatal error. + * + * Args: slave_tid - array of slave tids + * nslaves - number of slaves + * + * Returns: void + */ +void +PVMKillSlaves(int *slave_tid, int nslaves) +{ + int i; + + for (i = 0; i < nslaves; i++) + if (pvm_kill(slave_tid[i]) != 0) + Warn("a slave refuses to die"); + return; +} + + +/* Function: PVMPackString() + * Date: SRE, Tue Aug 18 14:08:05 1998 [St. Louis] + * + * Purpose: pack a variable length string for sending over PVM, + * sending its length first so the receiver can + * malloc appropriately. + * + * Args: s - the string to send + * + * Returns: 1 on success. 0 on failure. + */ +int +PVMPackString(char *s) +{ + int len; + + len = (s == NULL) ? -1 : strlen(s); + if (pvm_pkint(&len, 1, 1) != 0) return 0; + if (len >= 0) + if (pvm_pkstr(s) != 0) return 0; + return 1; +} + +/* Function: PVMUnpackString() + * Date: SRE, Tue Aug 18 14:11:04 1998 [St. Louis] + * + * Purpose: unpack a string. + * + * Args: (void) + * + * Returns: ptr to string. 
+ */ +char * +PVMUnpackString(void) +{ + int len; + char *s; + + if (pvm_upkint(&len, 1, 1) != 0) return NULL; + if (len == -1) return NULL; + + s = MallocOrDie(sizeof(char) * (len+1)); + if (pvm_upkstr(s) != 0) return NULL; + return s; +} + + +/* Function: PVMPackTrace() + * Date: SRE, Wed Aug 5 15:41:36 1998 [St. Louis] + * + * Purpose: Pack a trace structure for a PVM send. + * The caller is responsible for calling pvm_initsend() before, + * and pvm_send() after packing. + * + * Args: tr - the trace structure to pack. + * + * Returns: 1 on success, 0 on failure. + */ +int +PVMPackTrace(struct p7trace_s *tr) +{ + if (pvm_pkint(&(tr->tlen), 1, 1) < 0) return 0; + if (pvm_pkbyte(tr->statetype, tr->tlen, 1) < 0) return 0; + if (pvm_pkint(tr->nodeidx, tr->tlen, 1) < 0) return 0; + if (pvm_pkint(tr->pos, tr->tlen, 1) < 0) return 0; + return 1; +} + +/* Function: PVMUnpackTrace() + * Date: SRE, Wed Aug 5 15:51:03 1998 [St. Louis] + * + * Purpose: Unpack a trace structure from a PVM send. + * Caller is responsible for calling for a pvm_recv() + * before calling this. + * + * Args: none. + * + * Returns: ptr to alloc'ed trace, or NULL on failure. + * caller free's returned trace with P7FreeTrace(). + */ +struct p7trace_s * +PVMUnpackTrace(void) +{ + struct p7trace_s *tr; + int tlen; + + pvm_upkint(&tlen, 1, 1); + P7AllocTrace(tlen, &tr); + if (pvm_upkbyte(tr->statetype, tlen, 1) < 0) { P7FreeTrace(tr); return NULL;} + if (pvm_upkint(tr->nodeidx, tlen, 1) < 0) { P7FreeTrace(tr); return NULL;} + if (pvm_upkint(tr->pos, tlen, 1) < 0) { P7FreeTrace(tr); return NULL;} + tr->tlen = tlen; + return tr; +} + + +/* Function: PVMPackHMM() + * Date: SRE, Tue Aug 18 11:47:44 1998 [St. Louis] + * + * Purpose: Pack an HMM for sending over PVM. + * + * Args: hmm - the HMM to send. 
+ * + * Returns: 1 on success, 0 on failure + */ +int +PVMPackHMM(struct plan7_s *hmm) +{ + int k; + int sendflags; /* HMM flags to send */ + + sendflags = hmm->flags; + sendflags &= ~PLAN7_HASBITS; /* no log odds scores sent */ + sendflags &= ~PLAN7_HASDNA; /* no DNA scores sent */ + + if (pvm_pkint(&(hmm->M), 1, 1) != 0) return 0; + if (pvm_pkint(&sendflags, 1, 1) != 0) return 0; + if (! PVMPackString(hmm->name)) return 0; + if (hmm->flags & PLAN7_DESC) { if (!PVMPackString(hmm->desc)) return 0; } + if (hmm->flags & PLAN7_RF) { if (!PVMPackString(hmm->rf)) return 0; } + if (hmm->flags & PLAN7_CS) { if (!PVMPackString(hmm->cs)) return 0; } + if (! PVMPackString(hmm->comlog)) return 0; + if (pvm_pkint(&(hmm->nseq), 1, 1) != 0) return 0; + if (!PVMPackString(hmm->ctime)) return 0; + if (hmm->flags & PLAN7_MAP) { if (pvm_pkint(hmm->map, hmm->M+1, 1) != 0) return 0; } + if (pvm_pkint(&(hmm->checksum), 1, 1) != 0) return 0; + + for (k = 1; k < hmm->M; k++) + if (pvm_pkfloat(hmm->t[k], 7, 1) != 0) return 0; + for (k = 1; k <= hmm->M; k++) + if (pvm_pkfloat(hmm->mat[k], Alphabet_size, 1) != 0) return 0; + for (k = 1; k < hmm->M; k++) + if (pvm_pkfloat(hmm->ins[k], Alphabet_size, 1) != 0) return 0; + if (pvm_pkfloat(&(hmm->tbd1), 1, 1) != 0) return 0; + for (k = 0; k < 4; k++) + if (pvm_pkfloat(hmm->xt[k], 2, 1) != 0) return 0; + if (pvm_pkfloat(hmm->begin, hmm->M+1, 1) != 0) return 0; + if (pvm_pkfloat(hmm->end, hmm->M+1, 1) != 0) return 0; + if (pvm_pkfloat(hmm->null, Alphabet_size, 1) != 0) return 0; + if (pvm_pkfloat(&(hmm->p1), 1, 1) != 0) return 0; + if (hmm->flags & PLAN7_STATS) + { + if (pvm_pkfloat(&(hmm->mu), 1, 1) != 0) return 0; + if (pvm_pkfloat(&(hmm->lambda), 1, 1) != 0) return 0; + } + return 1; +} + + +/* Function: PVMUnpackHMM() + * Date: SRE, Tue Aug 18 13:56:13 1998 [St. Louis] + * + * Purpose: Unpack an HMM from PVM. 
+ * + * Args: (void) + * + * Returns: ptr to HMM, or NULL + */ +struct plan7_s * +PVMUnpackHMM(void) +{ + struct plan7_s *hmm; + int k; + int M; + + if (pvm_upkint(&(M), 1, 1) != 0) return NULL; + hmm = AllocPlan7(M); + + if (pvm_upkint(&(hmm->flags), 1, 1) != 0) return NULL; + if ((hmm->name = PVMUnpackString()) == NULL) return NULL; + if (hmm->flags & PLAN7_DESC) { if ((hmm->desc = PVMUnpackString()) == NULL) return NULL; } + if (hmm->flags & PLAN7_RF) { if ((hmm->rf = PVMUnpackString()) == NULL) return NULL; } + if (hmm->flags & PLAN7_CS) { if ((hmm->cs = PVMUnpackString()) == NULL) return NULL; } + + if ((hmm->comlog = PVMUnpackString()) == NULL) return NULL; + if (pvm_upkint(&(hmm->nseq), 1, 1) != 0) return NULL; + if ((hmm->ctime = PVMUnpackString()) == NULL) return NULL; + if (hmm->flags & PLAN7_MAP) { if (pvm_upkint(hmm->map, hmm->M+1, 1) != 0) return NULL; } + if (pvm_upkint(&(hmm->checksum), 1, 1) != 0) return NULL; + + for (k = 1; k < hmm->M; k++) + if (pvm_upkfloat(hmm->t[k], 7, 1) != 0) return NULL; + for (k = 1; k <= hmm->M; k++) + if (pvm_upkfloat(hmm->mat[k], Alphabet_size, 1) != 0) return NULL; + for (k = 1; k < hmm->M; k++) + if (pvm_upkfloat(hmm->ins[k], Alphabet_size, 1) != 0) return NULL; + if (pvm_upkfloat(&(hmm->tbd1), 1, 1) != 0) return NULL; + for (k = 0; k < 4; k++) + if (pvm_upkfloat(hmm->xt[k], 2, 1) != 0) return NULL; + if (pvm_upkfloat(hmm->begin, hmm->M+1, 1) != 0) return NULL; + if (pvm_upkfloat(hmm->end, hmm->M+1, 1) != 0) return NULL; + if (pvm_upkfloat(hmm->null, Alphabet_size, 1) != 0) return NULL; + if (pvm_upkfloat(&(hmm->p1), 1, 1) != 0) return NULL; + if (hmm->flags & PLAN7_STATS) + { + if (pvm_upkfloat(&(hmm->mu), 1, 1) != 0) return NULL; + if (pvm_upkfloat(&(hmm->lambda), 1, 1) != 0) return NULL; + } + return hmm; +} + + +#endif /* HMMER_PVM */ diff --git a/forester/archive/RIO/others/hmmer/src/states.c b/forester/archive/RIO/others/hmmer/src/states.c new file mode 100644 index 0000000..0fb7e50 --- /dev/null +++ 
b/forester/archive/RIO/others/hmmer/src/states.c @@ -0,0 +1,444 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile-HMMs + * Copyright (C) 1992-1997 Sean R. Eddy + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and + * GNULICENSE for details. + * + ************************************************************/ + +/* states.c + * + * alloc, free, and initialization of state structures + */ + +#include +#include +#include +#include +#include "squid.h" +#include "config.h" +#include "structs.h" +#include "funcs.h" + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + + +struct hmm_struc * +AllocHMM(int M) /* length of model to make */ +{ + struct hmm_struc *hmm; /* RETURN: blank HMM */ + + hmm = (struct hmm_struc *) MallocOrDie (sizeof(struct hmm_struc)); + hmm->ins = (struct basic_state *) MallocOrDie (sizeof(struct basic_state) * (M+2)); + hmm->del = (struct basic_state *) MallocOrDie (sizeof(struct basic_state) * (M+2)); + hmm->mat = (struct basic_state *) MallocOrDie (sizeof(struct basic_state) * (M+2)); + hmm->ref = (char *) MallocOrDie ((M+2) * sizeof(char)); + hmm->cs = (char *) MallocOrDie ((M+2) * sizeof(char)); + hmm->xray = (float *) MallocOrDie ((M+2) * sizeof(float) * NINPUTS); + hmm->M = M; + hmm->name = Strdup("unnamed"); /* name is not optional. */ + + hmm->flags = 0; + ZeroHMM(hmm); + return hmm; +} + +/* Function: ZeroHMM() + * + * Purpose: Zero emission and transition counts in an HMM. 
+ */ +void +ZeroHMM(struct hmm_struc *hmm) +{ + int k, ts, idx; + + for (k = 0; k <= hmm->M+1; k++) + { + for (ts = 0; ts < 3; ts++) + { + hmm->mat[k].t[ts] = 0.0; + hmm->ins[k].t[ts] = 0.0; + hmm->del[k].t[ts] = 0.0; + } + for (idx = 0; idx < Alphabet_size; idx++) + { + hmm->mat[k].p[idx] = 0.0; + hmm->ins[k].p[idx] = 0.0; + hmm->del[k].p[idx] = 0.0; + } + } +} + + +/* Function: LogifyHMM() + * + * Purpose: Convert a probability-form HMM to log probabilities. + * Best to do this on a modifiable copy of an HMM. + */ +void +LogifyHMM(struct hmm_struc *hmm) +{ + int k, ts, idx; + + for (k = 0; k <= hmm->M+1; k++) + { + for (ts = 0; ts < 3; ts++) + { + hmm->mat[k].t[ts] = sreLOG2(hmm->mat[k].t[ts]); + hmm->ins[k].t[ts] = sreLOG2(hmm->ins[k].t[ts]); + hmm->del[k].t[ts] = sreLOG2(hmm->del[k].t[ts]); + } + for (idx = 0; idx < Alphabet_size; idx++) + { + hmm->mat[k].p[idx] = sreLOG2(hmm->mat[k].p[idx]); + hmm->ins[k].p[idx] = sreLOG2(hmm->ins[k].p[idx]); + } + } +} + +/* Function: LogoddsifyHMM() + * + * Convert a probability form HMM to log odds scores. + * Best to do this on a modifiable copy of an HMM. + */ +void +LogoddsifyHMM(struct hmm_struc *hmm) +{ + int k, ts, x; + + for (k = 0; k <= hmm->M+1; k++) + { + for (ts = 0; ts < 3; ts++) + { + hmm->mat[k].t[ts] = sreLOG2(hmm->mat[k].t[ts]); + hmm->ins[k].t[ts] = sreLOG2(hmm->ins[k].t[ts]); + hmm->del[k].t[ts] = sreLOG2(hmm->del[k].t[ts]); + } + for (x = 0; x < Alphabet_size; x++) + { + hmm->mat[k].p[x] = sreLOG2(hmm->mat[k].p[x]) - sreLOG2(hmm->null[x]); + hmm->ins[k].p[x] = sreLOG2(hmm->ins[k].p[x]) - sreLOG2(hmm->null[x]); + } + } +} + + +/* Function: WriteFlatPriorHMM() + * + * Purpose: Fill an HMM with expected probabilities according + * to a given prior. Used to construct "flat" initial + * models for hmmt. 
+ */ +int +WriteFlatPriorHMM(struct hmm_struc *hmm, struct prior_s *prior) +{ + int k; /* counter across model */ + int q; /* counter over mixtures */ + int x; /* counter over symbols or transitions */ + float malpha; /* alpha for mixture */ + float ialpha; /* alpha for insert mixture */ + float dalpha; /* alpha for delete mixture */ + + for (k = 0; k <= hmm->M; k++) + { + /* xray info for structure prior */ + if (prior->strategy == PRI_STRUCT) + { + hmm->xray[k*NINPUTS + XRAY_bias] = 1.0; + hmm->xray[k*NINPUTS + XRAY_E] = 0.0; + hmm->xray[k*NINPUTS + XRAY_H] = 0.0; + hmm->xray[k*NINPUTS + XRAY_SA] = 0.0; + } + /* match symbol emissions */ + for (x = 0; x < Alphabet_size; x++) + hmm->mat[k].p[x] = 0.0; + if (k > 0) + for (q = 0; q < prior->mnum; q++) + { + if (prior->strategy == PRI_STRUCT) + prior->mq[q] = 1.0 / prior->mnum; + malpha = 0.0; + for (x = 0; x < Alphabet_size; x++) + malpha += prior->mat[q][x]; + for (x = 0; x < Alphabet_size; x++) + hmm->mat[k].p[x] += prior->mq[q] * prior->mat[q][x] / malpha; + } + /* insert emissions */ + for (x = 0; x < Alphabet_size; x++) + hmm->ins[k].p[x] = 0.0; + for (q = 0; q < prior->inum; q++) + { + if (prior->strategy == PRI_STRUCT) + prior->iq[q] = 1.0 / prior->inum; + ialpha = 0.0; + for (x = 0; x < Alphabet_size; x++) + ialpha += prior->ins[q][x]; + for (x = 0; x < Alphabet_size; x++) + hmm->ins[k].p[x] += prior->iq[q] * prior->ins[q][x] / ialpha; + } + + /* state transitions */ + for (x = 0; x < 3; x++) + hmm->mat[k].t[x] = hmm->ins[k].t[x] = hmm->del[k].t[x] = 0.0; + for (q = 0; q < prior->tnum; q++) + { + if (prior->strategy == PRI_STRUCT) + prior->tq[q] = 1.0 / prior->tnum; + malpha = ialpha = dalpha = 0.0; + for (x = 0; x < 3; x++) + { + malpha += prior->tm[q][x]; + ialpha += prior->ti[q][x]; + dalpha += prior->td[q][x]; + } + for (x = 0; x < 3; x++) + { + hmm->mat[k].t[x] += prior->tq[q] * prior->tm[q][x] / malpha; + hmm->ins[k].t[x] += prior->tq[q] * prior->ti[q][x] / ialpha; + if (k > 0) hmm->del[k].t[x] += 
prior->tq[q] * prior->td[q][x] / dalpha; + } + } + } + /* the final state never transits to d+1 */ + hmm->mat[hmm->M].t[DELETE] = 0.0; + hmm->ins[hmm->M].t[DELETE] = 0.0; + hmm->del[hmm->M].t[DELETE] = 0.0; + Renormalize(hmm); + return 1; +} + + +/* Function: HMMDup() + * + * Purpose: Create a duplicate copy of an HMM. + * + * Return: Pointer to the duplicate. + * Caller is responsible for free'ing the duplicate. + */ +struct hmm_struc * +HMMDup(struct hmm_struc *hmm) +{ + struct hmm_struc *newhmm; + + if ((newhmm = AllocHMM(hmm->M)) == NULL) + Die("AllocHMM() failed"); + HMMCopy(newhmm, hmm); + return newhmm; +} + + +/* Function: HMMCopy() + * + * Purpose: Make a copy of hmm2 in hmm1. + * + * Return: (void) + * Caller promises that hmm1 and hmm2 have identical architectures. + */ +void +HMMCopy(struct hmm_struc *hmm1, struct hmm_struc *hmm2) +{ + int k, x, ts; + + hmm1->flags = hmm2->flags; + if (hmm1->name != NULL) free(hmm1->name); + hmm1->name = Strdup(hmm2->name); + + if (hmm2->flags & HMM_REF) strcpy(hmm1->ref, hmm2->ref); + if (hmm2->flags & HMM_CS) strcpy(hmm1->cs, hmm2->cs); + if (hmm2->flags & HMM_XRAY) + memcpy(hmm1->xray, hmm2->xray, NINPUTS * (hmm2->M+2) * sizeof(float)); + memcpy(hmm1->null, hmm2->null, sizeof(float) * Alphabet_size); + + for (k = 0; k <= hmm2->M+1; k++) + { + /* copy transition T's */ + for (ts = 0; ts < 3; ts++) + { + hmm1->mat[k].t[ts] = hmm2->mat[k].t[ts]; + hmm1->ins[k].t[ts] = hmm2->ins[k].t[ts]; + hmm1->del[k].t[ts] = hmm2->del[k].t[ts]; + } + /* copy symbol P tables */ + for (x = 0; x < Alphabet_size; x++) + { + hmm1->mat[k].p[x] = hmm2->mat[k].p[x]; + hmm1->ins[k].p[x] = hmm2->ins[k].p[x]; + } + } + return; +} + + +int +FreeHMM(struct hmm_struc *hmm) +{ + if (hmm == NULL) return 0; + free(hmm->ref); + free(hmm->cs); + free(hmm->xray); + free(hmm->name); + if (hmm->mat != NULL) free (hmm->mat); + if (hmm->ins != NULL) free (hmm->ins); + if (hmm->del != NULL) free (hmm->del); + free(hmm); + return 1; +} + + +struct shmm_s * 
+AllocSearchHMM(int M) +{ + struct shmm_s *shmm; + int x; + + if ((shmm = (struct shmm_s *) malloc (sizeof(struct shmm_s))) == NULL) + Die("malloc failed"); + for (x = 0; x < 26; x++) + if ((shmm->m_emit[x] = (int *) calloc (M+1, sizeof(int))) == NULL || + (shmm->i_emit[x] = (int *) calloc (M+1, sizeof(int))) == NULL) + Die("malloc failed"); + if ((shmm->t = (int *) malloc (sizeof(int) * (9*(M+1)))) == NULL || + (shmm->ref = (char *) malloc (sizeof(char) * (M+2))) == NULL || + (shmm->cs = (char *) malloc (sizeof(char) * (M+2))) == NULL) + Die("malloc failed"); + shmm->flags = 0; + shmm->name = Strdup("nameless"); + shmm->M = M; + return shmm; +} + +void +FreeSearchHMM(struct shmm_s *shmm) +{ + int x; + + for (x = 0; x < 26; x++) + { + free(shmm->m_emit[x]); + free(shmm->i_emit[x]); + } + free(shmm->t); + free(shmm->ref); + free(shmm->cs); + free(shmm->name); + free(shmm); +} + + +/* Function: CountSymbol() + * + * Purpose: Given an observed symbol, and a number of counts to + * distribute (typically just 1.0), bump the appropriate counter(s). + * + * This is completely trivial only so long as the symbols + * always come from the expected alphabet; since we also + * have to deal with degenerate symbols for both nucleic + * acid and protein languages, we make a function to deal + * with this. + * + * Args: sym - observed symbol, e.g. `A' or `X' + * wt - number of counts to distribute (e.g. 1.0) + * counters - array of 4 or 20 counters to increment + * + * Return: Returns 1 on success and bumps the necessary counters. + * Returns 0 on failure and bumps each counter evenly, as + * if it saw a completely ambiguous symbol; this lets + * the caller silently accept garbage symbols, if it cares to. + */ +int +CountSymbol(char sym, float wt, float *counters) +{ + char *sptr; /* pointer into symbol in hmm->alphabet */ + int status; /* RETURN: status; did we recognize the symbol? 
*/ + char symidx; /* index of symbol in Alphabet_iupac */ + + if ((sptr = strchr(Alphabet,sym)) != NULL) + { + symidx = (char) (sptr - Alphabet); + status = 1; + } + else + { + symidx = (char) (Alphabet_iupac - 1); + Warn("unrecognized character %c in CountSymbol()\n", sym); + status = 0; + } + P7CountSymbol(counters, symidx, wt); + return status; +} + + +/* Function: HMMDistance() + * + * Purpose: Test two models for how different they are, using + * a simple squared difference measure on all homologous + * parameters. They must have the same architecture: + * i.e. check that newhmm->M == oldhmm->M before calling. + * + * Args: newhmm - new HMM, probability form + * oldhmm - old HMM, probability form + * + * Return: distance. + */ +float +HMMDistance(struct hmm_struc *newhmm, struct hmm_struc *oldhmm) +{ + int k,x, ts; + float distance = 0.0; + + for (k = 0; k <= newhmm->M; k++) + { + /* state transition distances */ + if (k > 0) + { + for (ts = 0; ts < 3; ts++) + distance += SQR( 100. * (newhmm->del[k].t[ts] - oldhmm->del[k].t[ts])); + } + for (ts = 0; ts < 3; ts++) + distance += SQR( 100. * (newhmm->mat[k].t[ts] - oldhmm->mat[k].t[ts])); + for (ts = 0; ts < 3; ts++) + distance += SQR( 100. * (newhmm->ins[k].t[ts] - oldhmm->ins[k].t[ts])); + + /* symbol emission distances */ + if (k > 0) + for (x = 0; x < Alphabet_size; x++) + distance += SQR( 100. * (newhmm->mat[k].p[x] - oldhmm->mat[k].p[x])); + for (x = 0; x < Alphabet_size; x++) + distance += SQR( 100. * (newhmm->ins[k].p[x] - oldhmm->ins[k].p[x])); + } + distance = sqrt(distance) / newhmm->M; + return distance; +} + + + + +/* Function: Renormalize() + * + * Normalize all P distributions so they sum to 1. + * P distributions that are all 0, or contain negative + * probabilities, are left untouched. + * + * Returns (void). 
+ */ +void +Renormalize(struct hmm_struc *hmm) +{ + int k; /* counter for states */ + + for (k = 0; k <= hmm->M ; k++) + { + /* match state transition frequencies */ + FNorm(hmm->mat[k].t, 3); + FNorm(hmm->ins[k].t, 3); + if (k > 0) FNorm(hmm->del[k].t, 3); + + if (k > 0) FNorm(hmm->mat[k].p, Alphabet_size); + FNorm(hmm->ins[k].p, Alphabet_size); + } +} + diff --git a/forester/archive/RIO/others/hmmer/src/structs.h b/forester/archive/RIO/others/hmmer/src/structs.h new file mode 100644 index 0000000..105ebec --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/structs.h @@ -0,0 +1,564 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* structs.h + * + * Data structures used in HMMER. + * Also, a few miscellaneous macros and global variable declarations. + * + * RCS $Id: structs.h,v 1.1.1.1 2005/03/22 08:34:01 cmzmasek Exp $ + */ + +#ifndef STRUCTSH_INCLUDED +#define STRUCTSH_INCLUDED + +#include "squid.h" +#include "config.h" +#include "ssi.h" + +/* Miscellaneous math macros used in the package + */ +#define sreLOG2(x) ((x) > 0 ? log(x) * 1.44269504 : -9999.) +#define sreEXP2(x) (exp((x) * 0.69314718 )) +#define SQR(x) ((x) * (x)) + +/* an idiom for determining a symbol's position in the array + * by pointer arithmetic. + * does no error checking, so caller must already be damned sure x is + * valid in the alphabet! + */ +#define SYMIDX(x) (strchr(Alphabet, (x)) - Alphabet) + +/* The symbol alphabet. + * Must deal with IUPAC degeneracies. Nondegenerate symbols + * come first in Alphabet[], followed by degenerate symbols. 
+ * Nucleic alphabet also must deal with other common symbols + * like U (in RNA) and X (often misused for N). + * Example: + * Nucleic: "ACGTUNRYMKSWHBVDX" size=4 iupac=17 + * Amino: "ACDEFGHIKLMNPQRSTVWYBZX" size=20 iupac=23 + * + * Parts of the code assume that the last symbol is a + * symbol for an unknown residue, i.e. 'X'. + * + * MAXCODE and MAXABET constants are defined in config.h + */ +extern char Alphabet[MAXCODE]; /* "ACDEFGHIKLMNPQRSTVWYBZX" for example */ +extern int Alphabet_type; /* hmmNUCLEIC or hmmAMINO */ +extern int Alphabet_size; /* uniq alphabet size: 4 or 20 */ +extern int Alphabet_iupac; /* total size of alphabet + IUPAC degen. */ +extern char Degenerate[MAXCODE][MAXABET]; +extern int DegenCount[MAXCODE]; +#define hmmNOTSETYET 0 +#define hmmNUCLEIC 2 /* compatibility with squid's kRNA */ +#define hmmAMINO 3 /* compatibility with squid's kAmino */ + +/********************************************************************** + * + * Plan7 + * Implementation of the new Plan7 HMM architecture. + * Fully probabilistic even for hmmsw, hmmls, and hmmfs; + * No insert->delete or delete->insert transitions; + * Improved structure layout. + * + * The strategy is to infiltrate plan7 code into HMMER in + * an evolutionary rather than revolutionary manner. + * + **********************************************************************/ + +/* Plan 7 construction strategies. + */ +enum p7_construction { + P7_MAP_CONSTRUCTION, /* maximum a posteriori architecture */ + P7_HAND_CONSTRUCTION, /* hand specified architecture */ + P7_FAST_CONSTRUCTION /* fast ad hoc architecture */ +}; + +/* Plan 7 parameter optimization strategies + */ +enum p7_param { + P7_MAP_PARAM, /* standard maximum a posteriori */ + P7_MD_PARAM, /* maximum discrimination */ + P7_MRE_PARAM, /* maximum relative entropy */ + P7_WMAP_PARAM /* ad hoc weighted MAP */ +}; + +/* Structure: plan7_s + * + * Declaration of a Plan 7 profile-HMM. + */ +struct plan7_s { + /* Annotation on the model. 
A name is mandatory. + * Other fields are optional; whether they are present is + * flagged in the flags bit field. + * + * desc is only valid if PLAN7_DESC is set in flags. + * acc is only valid if PLAN7_ACC is set in flags. + * rf is only valid if PLAN7_RF is set in flags. + * cs is only valid if PLAN7_CS is set in flags. + * ca is only valid if PLAN7_CA is set in flags. + * map is only valid if PLAN7_MAP is set in flags. + */ + char *name; /* name of the model +*/ + char *acc; /* accession number of model (Pfam) +*/ + char *desc; /* brief description of model +*/ + char *rf; /* reference line from alignment 0..M +*/ + char *cs; /* consensus structure line 0..M +*/ + char *ca; /* consensus accessibility line 0..M */ + char *comlog; /* command line(s) that built model +*/ + int nseq; /* number of training sequences +*/ + char *ctime; /* creation date +*/ + int *map; /* map of alignment cols onto model 1..M+*/ + int checksum; /* checksum of training sequences +*/ + + /* The following are annotations added to support work by Michael Asman, + * CGR Stockholm. They are not stored in model files; they are only + * used in model construction. + * + * #=GC X-PRM (PRT,PRI) annotation is picked up by hmmbuild and interpreted + * as specifying which mixture Dirichlet component to use. If these flags + * are non-NULL, the normal mixture Dirichlet code is bypassed, and a + * single specific Dirichlet is used at each position. + */ + int *tpri; /* which transition mixture prior to use */ + int *mpri; /* which match mixture prior to use */ + int *ipri; /* which insert mixture prior to use */ + + /* Pfam-specific score cutoffs. + * + * ga1, ga2 are valid if PLAN7_GA is set in flags. + * tc1, tc2 are valid if PLAN7_TC is set in flags. + * nc1, nc2 are valid if PLAN7_NC is set in flags. 
+ */ + float ga1, ga2; /* per-seq/per-domain gathering thresholds (bits) +*/ + float tc1, tc2; /* per-seq/per-domain trusted cutoff (bits) +*/ + float nc1, nc2; /* per-seq/per-domain noise cutoff (bits) +*/ + + /* The main model in probability form: data-dependent probabilities. + * This is the core Krogh/Haussler model. + * Transition probabilities are usually accessed as a + * two-D array: hmm->t[k][TMM], for instance. They are allocated + * such that they can also be stepped through in 1D by pointer + * manipulations, for efficiency in DP algorithms. + */ + int M; /* length of the model (# nodes) +*/ + float **t; /* transition prob's. t[1..M-1][0..6] +*/ + float **mat; /* match emissions. mat[1..M][0..19] +*/ + float **ins; /* insert emissions. ins[1..M-1][0..19] +*/ + float tbd1; /* B->D1 prob (data dependent) +*/ + + /* The unique states of Plan 7 in probability form. + * These are the algorithm-dependent, data-independent probabilities. + * Some parts of the code may briefly use a trick of copying tbd1 + * into begin[0]; this makes it easy to call FChoose() or FNorm() + * on the resulting vector. However, in general begin[0] is not + * a valid number. + */ + float xt[4][2]; /* N,E,C,J extra states: 2 transitions +*/ + float *begin; /* 1..M B->M state transitions +*/ + float *end; /* 1..M M->E state transitions (!= a dist!) +*/ + + /* The null model probabilities. + */ + float null[MAXABET]; /* "random sequence" emission prob's +*/ + float p1; /* null model loop probability +*/ + + /* The model in log-odds score form. + * These are created from the probabilities by LogoddsifyHMM(). + * By definition, null[] emission scores are all zero. + * Note that emission distributions are over 26 upper-case letters, + * not just the unambiguous protein or DNA alphabet: we + * precalculate the scores for all IUPAC degenerate symbols we + * may see. Non-IUPAC symbols simply have a -INFTY score. + * Note the reversed indexing on msc and isc -- for efficiency reasons. 
+ * + * Only valid if PLAN7_HASBITS is set. + */ + int **tsc; /* transition scores [1.M-1][0.6] -*/ + int **msc; /* match emission scores [0.MAXCODE-1][1.M] -*/ + int **isc; /* ins emission scores [0.MAXCODE-1][1.M-1] -*/ + int xsc[4][2]; /* N,E,C,J transitions -*/ + int *bsc; /* begin transitions [1.M] -*/ + int *esc; /* end transitions [1.M] -*/ + + /* DNA translation scoring parameters + * For aligning protein Plan7 models to DNA sequence. + * Lookup value for a codon is calculated by pos1 * 16 + pos2 * 4 + pos3, + * where 'pos1' is the digitized value of the first nucleotide position; + * if any of the positions are ambiguous codes, lookup value 64 is used + * (which will generally have a score of zero) + * + * Only valid if PLAN7_HASDNA is set. + */ + int **dnam; /* triplet match scores [0.64][1.M] -*/ + int **dnai; /* triplet insert scores [0.64][1.M] -*/ + int dna2; /* -1 frameshift, doublet emission, M or I -*/ + int dna4; /* +1 frameshift, doublet emission, M or I -*/ + + /* P-value and E-value statistical parameters + * Only valid if PLAN7_STATS is set. + */ + float mu; /* EVD mu +*/ + float lambda; /* EVD lambda +*/ + + int flags; /* bit flags indicating state of HMM, valid data +*/ +}; + +/* Flags for plan7->flags. + * Note: Some models have scores but no probabilities (for instance, + * after reading from an HMM save file). Other models have + * probabilities but no scores (for instance, during training + * or building). Since it costs time to convert either way, + * I use PLAN7_HASBITS and PLAN7_HASPROB flags to defer conversion + * until absolutely necessary. This means I have to be careful + * about keeping these flags set properly when I fiddle a model. 
+ */ +#define PLAN7_HASBITS (1<<0) /* raised if model has log-odds scores */ +#define PLAN7_DESC (1<<1) /* raised if description exists */ +#define PLAN7_RF (1<<2) /* raised if #RF annotation available */ +#define PLAN7_CS (1<<3) /* raised if #CS annotation available */ +#define PLAN7_XRAY (1<<4) /* raised if structural data available */ +#define PLAN7_HASPROB (1<<5) /* raised if model has probabilities */ +#define PLAN7_HASDNA (1<<6) /* raised if protein HMM->DNA seq params set*/ +#define PLAN7_STATS (1<<7) /* raised if EVD parameters are available */ +#define PLAN7_MAP (1<<8) /* raised if alignment map is available */ +#define PLAN7_ACC (1<<9) /* raised if accession number is available */ +#define PLAN7_GA (1<<10) /* raised if gathering thresholds available */ +#define PLAN7_TC (1<<11) /* raised if trusted cutoffs available */ +#define PLAN7_NC (1<<12) /* raised if noise cutoffs available */ +#define PLAN7_CA (1<<13) /* raised if surface accessibility avail. */ + +/* Indices for special state types, I: used for dynamic programming xmx[][] + * mnemonic: eXtra Matrix for B state = XMB + */ +#define XMB 0 +#define XME 1 +#define XMC 2 +#define XMJ 3 +#define XMN 4 + +/* Indices for special state types, II: used for hmm->xt[] indexing + * mnemonic: eXtra Transition for N state = XTN + */ +#define XTN 0 +#define XTE 1 +#define XTC 2 +#define XTJ 3 + +/* Indices for Plan7 main model state transitions. + * Used for indexing hmm->t[k][] + * mnemonic: Transition from Match to Match = TMM + */ +#define TMM 0 +#define TMI 1 +#define TMD 2 +#define TIM 3 +#define TII 4 +#define TDM 5 +#define TDD 6 + +/* Indices for extra state transitions + * Used for indexing hmm->xt[][]. + */ +#define MOVE 0 /* trNB, trEC, trCT, trJB */ +#define LOOP 1 /* trNN, trEJ, trCC, trJJ */ + +/* Declaration of Plan7 dynamic programming matrix structure. 
+ */ +struct dpmatrix_s { + int **xmx; /* special scores [0.1..N][BECJN] */ + int **mmx; /* match scores [0.1..N][0.1..M] */ + int **imx; /* insert scores [0.1..N][0.1..M-1.M] */ + int **dmx; /* delete scores [0.1..N][0.1..M-1.M] */ +}; + +/* Declaration of Plan7 shadow matrix structure. + * In general, allowed values are STM, STI, etc. + * However, E state has M possible sources, from 1..M match states; + * hence the esrc array. + */ +struct dpshadow_s { + char **xtb; /* special state traces [0.1..N][BECJN] */ + char **mtb; /* match state traces [0.1..N][0.1..M] */ + char **itb; /* insert state traces [0.1..N][0.1..M-1.M] */ + char **dtb; /* delete state traces [0.1..N][0.1..M-1.M] */ + int *esrc; /* E trace is special; must store a M state number 1..M */ +}; + +/* Structure: HMMFILE + * + * Purpose: An open HMM file or HMM library. See hmmio.c. + */ +struct hmmfile_s { + FILE *f; /* pointer to file opened for reading */ + SSIFILE *ssi; /* pointer to open SSI index, or NULL */ + int (*parser)(struct hmmfile_s *, struct plan7_s **); /* parsing function */ + int is_binary; /* TRUE if format is a binary one */ + int byteswap; /* TRUE if binary and byteswapped */ + + /* Ewan (GeneWise) needs the input API to know the offset of each + * HMM on the disk, as it's being read. This might be enough + * support for him. hmmindex also uses this. Ewan, see + * HMMFilePositionByIndex() for an example of how to use this + * opaque offset type in the SSI API - the call you need + * is SSISetFilePosition(). 
+ */ + int is_seekable; /* TRUE if we use offsets in this HMM file */ + int mode; /* type of offset */ + SSIOFFSET offset; /* Disk offset for beginning of the current HMM */ +}; +typedef struct hmmfile_s HMMFILE; + + +/* Plan 7 model state types + * used in traceback structure + */ +#define STBOGUS 0 +#define STM 1 +#define STD 2 +#define STI 3 +#define STS 4 +#define STN 5 +#define STB 6 +#define STE 7 +#define STC 8 +#define STT 9 +#define STJ 10 + +/* Structure: p7trace_s + * + * Traceback structure for alignments of model to sequence. + * Each array in a p7trace_s is 0..tlen-1. + * Element 0 is always state STS. Element tlen-1 is always state STT. + */ +struct p7trace_s { + int tlen; /* length of traceback */ + char *statetype; /* state type used for alignment */ + int *nodeidx; /* index of aligned node, 1..M (if M,D,I), or 0 */ + int *pos; /* position in dsq, 1..L, or 0 if none */ +}; + +/* Structure: p7prior_s + * + * Dirichlet priors on HMM parameters. + */ +struct p7prior_s { + int strategy; /* PRI_DCHLET, etc. */ + + int tnum; /* number of transition Dirichlet mixtures */ + float tq[MAXDCHLET]; /* probabilities of tnum components */ + float t[MAXDCHLET][7]; /* transition terms per mix component */ + + int mnum; /* number of mat emission Dirichlet mixtures */ + float mq[MAXDCHLET]; /* probabilities of mnum components */ + float m[MAXDCHLET][MAXABET]; /* match emission terms per mix component */ + + int inum; /* number of insert emission Dirichlet mixes */ + float iq[MAXDCHLET]; /* probabilities of inum components */ + float i[MAXDCHLET][MAXABET]; /* insert emission terms */ +}; +#define PRI_DCHLET 0 /* simple or mixture Dirichlets */ +#define PRI_PAM 1 /* PAM prior hack */ + + +/********************************************************************** + * Other structures, not having to do with HMMs. + **********************************************************************/ + +/* Structure: histogram_s + * + * Keep a score histogram. 
+ * + * The main implementation issue here is that the range of + * scores is unknown, and will go negative. histogram is + * a 0..max-min array that represents the range min..max. + * A given score is indexed in histogram array as score-min. + * The AddToHistogram() function deals with dynamically + * resizing the histogram array when necessary. + */ +struct histogram_s { + int *histogram; /* counts of hits */ + int min; /* elem 0 of histogram == min */ + int max; /* last elem of histogram == max */ + int highscore; /* highest active elem has this score */ + int lowscore; /* lowest active elem has this score */ + int lumpsize; /* when resizing, overalloc by this */ + int total; /* total # of hits counted */ + + float *expect; /* expected counts of hits */ + int fit_type; /* flag indicating distribution type */ + float param[3]; /* parameters used for fits */ + float chisq; /* chi-squared val for goodness of fit*/ + float chip; /* P value for chisquared */ +}; +#define HISTFIT_NONE 0 /* no fit done yet */ +#define HISTFIT_EVD 1 /* fit type = extreme value dist */ +#define HISTFIT_GAUSSIAN 2 /* fit type = Gaussian */ +#define EVD_MU 0 /* EVD fit parameter mu */ +#define EVD_LAMBDA 1 /* EVD fit parameter lambda */ +#define EVD_WONKA 2 /* EVD fit fudge factor */ +#define GAUSS_MEAN 0 /* Gaussian parameter mean */ +#define GAUSS_SD 1 /* Gaussian parameter std. dev. */ + +/* Structure: fancyali_s + * + * Alignment of a hit to an HMM, for printing. + */ +struct fancyali_s { + char *rfline; /* reference coord info */ + char *csline; /* consensus structure info */ + char *model; /* aligned query consensus sequence */ + char *mline; /* "identities", conservation +'s, etc. 
*/ + char *aseq; /* aligned target sequence */ + int len; /* length of strings */ + char *query; /* name of query HMM */ + char *target; /* name of target sequence */ + int sqfrom; /* start position on sequence (1..L) */ + int sqto; /* end position on sequence (1..L) */ +}; + +/* Structure: hit_s + * + * Info about a high-scoring database hit. + * We keep this info in memory, so we can output a + * sorted list of high hits at the end. + * + * sqfrom and sqto are the coordinates that will be shown + * in the results, not coords in arrays... therefore, reverse + * complements have sqfrom > sqto + */ +struct hit_s { + double sortkey; /* number to sort by; big is better */ + float score; /* score of the hit */ + double pvalue; /* P-value of the hit */ + float mothersc; /* score of whole sequence */ + double motherp; /* P-value of whole sequence */ + char *name; /* name of the target */ + char *acc; /* accession of the target */ + char *desc; /* description of the target */ + int sqfrom; /* start position in seq (1..N) */ + int sqto; /* end position in seq (1..N) */ + int sqlen; /* length of sequence (N) */ + int hmmfrom; /* start position in HMM (1..M) */ + int hmmto; /* end position in HMM (1..M) */ + int hmmlen; /* length of HMM (M) */ + int domidx; /* index of this domain */ + int ndom; /* total # of domains in this seq */ + struct fancyali_s *ali; /* ptr to optional alignment info */ +}; + + +/* Structure: tophit_s + * + * Array of high scoring hits, suitable for efficient sorting + * when we prepare to output results. "hit" list is NULL and + * unavailable until after we do a sort. + */ +struct tophit_s { + struct hit_s **hit; /* array of ptrs to top scoring hits */ + struct hit_s *unsrt; /* unsorted array */ + int alloc; /* current allocation size */ + int num; /* number of hits in list now */ + int lump; /* allocation lumpsize */ +}; + +/* struct threshold_s + * Contains score/evalue threshold settings. 
+ * + * made first for hmmpfam: + * Since we're going to loop over all HMMs in a Pfam (or pfam-like) + * database in main_loop_{serial,pvm}, and we're going to + * allow autocutoffs using Pfam GA, NC, TC lines, we will need + * to reset those cutoffs with each HMM in turn. Therefore the + * main loops need to know whether they're supposed to be + * doing autocutoff. This amount of info was unwieldy enough + * to pass through the argument list that I put it + * in a structure. + */ +struct threshold_s { + float globT; /* T parameter: keep only hits > globT bits */ + double globE; /* E parameter: keep hits < globE E-value */ + float domT; /* T parameter for individual domains */ + double domE; /* E parameter for individual domains */ + /* autosetting of cutoffs using Pfam annot: */ + enum { CUT_NONE, CUT_GA, CUT_NC, CUT_TC } autocut; + int Z; /* nseq to base E value calculation on */ +}; + +/********************************************************** + * PVM parallelization + **********************************************************/ +#ifdef HMMER_PVM + +/* Message tags + */ +#define HMMPVM_INIT 0 /* an initialization packet to all slaves */ +#define HMMPVM_WORK 1 /* a work packet sent to a slave */ +#define HMMPVM_RESULTS 2 /* a results packet sent back to master */ +#define HMMPVM_TASK_TROUBLE 3 /* a notification of bad things in a slave task */ +#define HMMPVM_HOST_TROUBLE 4 /* a notification of bad things in a PVM host */ + +/* error codes + */ +#define HMMPVM_OK 0 +#define HMMPVM_NO_HMMFILE 1 +#define HMMPVM_NO_INDEX 2 +#define HMMPVM_BAD_INIT 3 /* failed to initialize a slave somehow */ + +#endif + + +/********************************************************** + * Plan 9: obsolete HMMER1.x code. We still need these structures + * for reading old HMM files (e.g. 
backwards compatibility) + **********************************************************/ + +/* We define a "basic" state, which covers the basic match, insert, and + * delete states from the Haussler paper. Numbers are stored as + * pre-calculated negative logs. + */ +struct basic_state { + float t[3]; /* state transitions to +1 M, +0 I, +1 D */ + float p[MAXABET]; /* symbol emission probabilities */ +}; + +/* A complete hidden Markov model + */ +struct plan9_s { + int M; /* length of the model */ + struct basic_state *ins; /* insert states 0..M+1 */ + struct basic_state *mat; /* match 0..M+1; 0 = BEGIN, M+1 = END */ + struct basic_state *del; /* delete 0..M+1 */ + + float null[MAXABET]; /* the *suggested* null model */ + + /* Optional annotation on the HMM, taken from alignment + */ + char *name; /* a name for the HMM */ + char *ref; /* reference coords and annotation */ + char *cs; /* consensus structure annotation */ + float *xray; /* Structural annotation: xray[0..M+1][NINPUTS], indexed manually */ + + int flags; /* flags for what optional info is in HMM */ +}; + +/* Flags for optional info in an HMM structure + */ +#define HMM_REF (1<<0) +#define HMM_CS (1<<1) +#define HMM_XRAY (1<<2) + +#define MATCH 0 +#define INSERT 1 +#define DELETE 2 +#define BEGIN MATCH +#define END MATCH + +#endif /* STRUCTSH_INCLUDED */ diff --git a/forester/archive/RIO/others/hmmer/src/threads.c b/forester/archive/RIO/others/hmmer/src/threads.c new file mode 100644 index 0000000..d2eb450 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/threads.c @@ -0,0 +1,90 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. 
+ ************************************************************/ + +/* threads.c + * SRE, Fri Jul 10 10:05:44 1998 + * + * Pthreads code shared by hmmsearch, hmmcalibrate, and hmmpfam + * to coarse-grain parallelize on platforms capable of POSIX + * threads. Most of the threads code, however, is in the respective + * main's, i.e. hmmsearch.c, hmmpfam.c, hmmcalibrate.c + * + * RCS $Id: threads.c,v 1.1.1.1 2005/03/22 08:34:02 cmzmasek Exp $ + */ + +#ifdef HMMER_THREADS /* conditional inclusion of the entire file */ + +#include +#include +#include +#include + +#include "structs.h" +#include "funcs.h" +#include "squid.h" +#include "sqfuncs.h" + + +/* Function: ThreadNumber() + * Date: SRE, Sat Jul 11 11:03:50 1998 [St. Louis] + * + * Purpose: Recommend how many threads to use. + * + * - if we can determine the number of processors + * on the machine by SQD_NPROC, use that. This + * should succeed for SGI IRIX, Digital UNIX, and + * Sun Solaris platforms. + * - if not, assume two processors. We're probably + * on a FreeBSD or Linux box, and odds are that it's + * a dualprocessor. + * - if HMMER_NCPU is defined in config.h, use that + * number instead; allows Linux or FreeBSD machines + * to compile code for a quadprocessor, for instance. + * That define can be overridden at compile + * time by a -DHMMER_NCPU=x, where x is the + * number of threads. + * - if HMMER_NCPU is defined in the environment, + * use that number, overriding all others. + * + * Typically, we'll set the default number of + * threads with ThreadNumber() but allow it + * to be overridden at the command line with --cpu. + * + * Summarizing priority: + * --cpu option + * environment variable, setenv HMMER_NCPU x + * compile-time, MDEFS=HMMER_NCPU=x + * compile-time, config.h definition of HMMER_NCPU + * SQD_NPROC, or 2 if SQD_NPROC doesn't work. 
+ * + * Args: void + * + * Returns: >= 1, recommended number of threads + */ +int +ThreadNumber(void) +{ + int num; + char *env; + + num = SQD_NPROC; /* SGI, Sun, Digital: get # of available CPUs */ + if (num == -1) num = 2; /* Linux, FreeBSD: assume dualprocessor */ +#ifdef HMMER_NCPU + num = HMMER_NCPU; /* allow config.h to override; usually we don't */ +#endif + /* allow environment variable to override */ + if ((env = getenv("HMMER_NCPU")) != NULL) + num = atoi(env); + if (num <= 0) num = 1; /* silent sanity check */ + SQD_DPRINTF1(("ThreadNumber(): setting number of threads to %d\n", num)); + return num; +} + +#endif /*HMMER_THREADS*/ diff --git a/forester/archive/RIO/others/hmmer/src/tophits.c b/forester/archive/RIO/others/hmmer/src/tophits.c new file mode 100644 index 0000000..bbd1b05 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/tophits.c @@ -0,0 +1,376 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. + ************************************************************/ + +/* tophits.c + * + * Routines for storing, sorting, displaying high scoring hits + * and alignments. + * + ***************************************************************************** + * + * main API: + * + * AllocTophits() - allocation + * FreeTophits() - free'ing + * RegisterHit() - put information about a hit in the list + * GetRankedHit() - recovers information about a hit + * FullSortTophits() - sorts the top H hits. 
+ * + ***************************************************************************** + * Brief example of use: + * + * struct tophit_s *yourhits; // list of hits + * struct fancyali_s *ali; // (optional structure) alignment of a hit + * + * yourhits = AllocTophits(200); + * (for every hit in a search) { + * if (do_alignments) + * ali = Trace2FancyAli(); // You provide a function/structure here + * if (score > threshold) + * RegisterHit(yourhits, ...) + * } + * + * FullSortTophits(yourhits); // Sort hits by evalue + * for (i = 0; i < 100; i++) // Recover hits out in ranked order + * { + * GetRankedHit(yourhits, i, ...); + * // Presumably you'd print here... + * } + * FreeTophits(yourhits); + *************************************************************************** + * + * Estimated storage per hit: + * coords: 16 bytes + * scores: 8 bytes + * name/acc/desc: 192 bytes + * alignment: 1000 bytes total = ~1200 bytes with alignment; + * = ~200 bytes without + * Designed for: 10^5 hits (20 MB) or 10^4 alignments (10 MB) + */ + +#include +#include +#include +#include "structs.h" +#include "funcs.h" + +/* Function: AllocTophits() + * + * Purpose: Allocate a struct tophit_s, for maintaining + * a list of top-scoring hits in a database search. + * + * Args: lumpsize - allocation lumpsize + * + * Return: An allocated struct tophit_s. Caller must free with FreeTophits(). 
+ */ +struct tophit_s * +AllocTophits(int lumpsize) +{ + struct tophit_s *hitlist; + + hitlist = MallocOrDie (sizeof(struct tophit_s)); + hitlist->hit = NULL; + hitlist->unsrt = MallocOrDie (lumpsize * sizeof(struct hit_s)); + hitlist->alloc = lumpsize; + hitlist->num = 0; + hitlist->lump = lumpsize; + return hitlist; +} +void +GrowTophits(struct tophit_s *h) +{ + h->unsrt = ReallocOrDie(h->unsrt,(h->alloc + h->lump) * sizeof(struct hit_s)); + h->alloc += h->lump; +} +void +FreeTophits(struct tophit_s *h) +{ + int pos; + for (pos = 0; pos < h->num; pos++) + { + if (h->unsrt[pos].ali != NULL) FreeFancyAli(h->unsrt[pos].ali); + if (h->unsrt[pos].name != NULL) free(h->unsrt[pos].name); + if (h->unsrt[pos].acc != NULL) free(h->unsrt[pos].acc); + if (h->unsrt[pos].desc != NULL) free(h->unsrt[pos].desc); + } + free(h->unsrt); + if (h->hit != NULL) free(h->hit); + free(h); +} + +struct fancyali_s * +AllocFancyAli(void) +{ + struct fancyali_s *ali; + + ali = MallocOrDie (sizeof(struct fancyali_s)); + ali->rfline = ali->csline = ali->model = ali->mline = ali->aseq = NULL; + ali->query = ali->target = NULL; + ali->sqfrom = ali->sqto = 0; + return ali; +} +void +FreeFancyAli(struct fancyali_s *ali) +{ + if (ali != NULL) { + if (ali->rfline != NULL) free(ali->rfline); + if (ali->csline != NULL) free(ali->csline); + if (ali->model != NULL) free(ali->model); + if (ali->mline != NULL) free(ali->mline); + if (ali->aseq != NULL) free(ali->aseq); + if (ali->query != NULL) free(ali->query); + if (ali->target != NULL) free(ali->target); + free(ali); + } +} + +/* Function: RegisterHit() + * + * Purpose: Add a new hit to a list of top hits. + * + * "ali", if provided, is a pointer to allocated memory + * for an alignment output structure. + * Management is turned over to the top hits structure. + * Caller should not free them; they will be free'd by + * the FreeTophits() call. + * + * In contrast, "name", "acc", and "desc" are copied, so caller + * is still responsible for these. 
+ * + * Number of args is unwieldy. + * + * Args: h - active top hit list + * key - value to sort by: bigger is better + * pvalue - P-value of this hit + * score - score of this hit + * motherp - P-value of parent whole sequence + * mothersc - score of parent whole sequence + * name - name of target + * acc - accession of target (may be NULL) + * desc - description of target (may be NULL) + * sqfrom - 1..L pos in target seq of start + * sqto - 1..L pos; sqfrom > sqto if rev comp + * sqlen - length of sequence, L + * hmmfrom - 0..M+1 pos in HMM of start + * hmmto - 0..M+1 pos in HMM of end + * hmmlen - length of HMM, M + * domidx - number of this domain + * ndom - total # of domains in sequence + * ali - optional printable alignment info + * + * Return: (void) + * hitlist is modified and possibly reallocated internally. + */ +void +RegisterHit(struct tophit_s *h, double key, + double pvalue, float score, double motherp, float mothersc, + char *name, char *acc, char *desc, + int sqfrom, int sqto, int sqlen, + int hmmfrom, int hmmto, int hmmlen, + int domidx, int ndom, + struct fancyali_s *ali) +{ + /* Check to see if list is full and we must realloc. 
+ */ + if (h->num == h->alloc) GrowTophits(h); + + h->unsrt[h->num].name = Strdup(name); + h->unsrt[h->num].acc = Strdup(acc); + h->unsrt[h->num].desc = Strdup(desc); + h->unsrt[h->num].sortkey = key; + h->unsrt[h->num].pvalue = pvalue; + h->unsrt[h->num].score = score; + h->unsrt[h->num].motherp = motherp; + h->unsrt[h->num].mothersc= mothersc; + h->unsrt[h->num].sqfrom = sqfrom; + h->unsrt[h->num].sqto = sqto; + h->unsrt[h->num].sqlen = sqlen; + h->unsrt[h->num].hmmfrom = hmmfrom; + h->unsrt[h->num].hmmto = hmmto; + h->unsrt[h->num].hmmlen = hmmlen; + h->unsrt[h->num].domidx = domidx; + h->unsrt[h->num].ndom = ndom; + h->unsrt[h->num].ali = ali; + h->num++; + return; +} + +/* Function: GetRankedHit() + * Date: SRE, Tue Oct 28 10:06:48 1997 [Newton Institute, Cambridge UK] + * + * Purpose: Recover the data from the i'th ranked hit. + * Any of the data ptrs may be passed as NULL for fields + * you don't want. hitlist must have been sorted first. + * + * name, acc, desc, and ali are returned as pointers, not copies; + * don't free them! 
+ */ +void +GetRankedHit(struct tophit_s *h, int rank, + double *r_pvalue, float *r_score, + double *r_motherp, float *r_mothersc, + char **r_name, char **r_acc, char **r_desc, + int *r_sqfrom, int *r_sqto, int *r_sqlen, + int *r_hmmfrom, int *r_hmmto, int *r_hmmlen, + int *r_domidx, int *r_ndom, + struct fancyali_s **r_ali) +{ + if (r_pvalue != NULL) *r_pvalue = h->hit[rank]->pvalue; + if (r_score != NULL) *r_score = h->hit[rank]->score; + if (r_motherp != NULL) *r_motherp = h->hit[rank]->motherp; + if (r_mothersc!= NULL) *r_mothersc= h->hit[rank]->mothersc; + if (r_name != NULL) *r_name = h->hit[rank]->name; + if (r_acc != NULL) *r_acc = h->hit[rank]->acc; + if (r_desc != NULL) *r_desc = h->hit[rank]->desc; + if (r_sqfrom != NULL) *r_sqfrom = h->hit[rank]->sqfrom; + if (r_sqto != NULL) *r_sqto = h->hit[rank]->sqto; + if (r_sqlen != NULL) *r_sqlen = h->hit[rank]->sqlen; + if (r_hmmfrom != NULL) *r_hmmfrom = h->hit[rank]->hmmfrom; + if (r_hmmto != NULL) *r_hmmto = h->hit[rank]->hmmto; + if (r_hmmlen != NULL) *r_hmmlen = h->hit[rank]->hmmlen; + if (r_domidx != NULL) *r_domidx = h->hit[rank]->domidx; + if (r_ndom != NULL) *r_ndom = h->hit[rank]->ndom; + if (r_ali != NULL) *r_ali = h->hit[rank]->ali; +} + +/* Function: TophitsMaxName() + * + * Purpose: Returns the maximum name length in a top hits list; + * doesn't need to be sorted yet. + */ +int +TophitsMaxName(struct tophit_s *h) +{ + int i; + int len, maxlen; + + maxlen = 0; + for (i = 0; i < h->num; i++) + { + len = strlen(h->unsrt[i].name); + if (len > maxlen) maxlen = len; + } + return maxlen; +} + +/* Function: FullSortTophits() + * + * Purpose: Completely sort the top hits list. Calls + * qsort() to do the sorting, and uses + * hit_comparison() to do the comparison. + * + * Args: h - top hits structure + */ +int +hit_comparison(const void *vh1, const void *vh2) +{ + /* don't ask. don't change. and, Don't Panic. 
*/ + struct hit_s *h1 = *((struct hit_s **) vh1); + struct hit_s *h2 = *((struct hit_s **) vh2); + + if (h1->sortkey < h2->sortkey) return 1; + else if (h1->sortkey > h2->sortkey) return -1; + else if (h1->sortkey == h2->sortkey) return 0; + /*NOTREACHED*/ + return 0; +} +void +FullSortTophits(struct tophit_s *h) +{ + int i; + + /* If we don't have /any/ hits, then don't + * bother. + */ + if (h->num == 0) return; + + /* Assign the ptrs in h->hit. + */ + h->hit = MallocOrDie(h->num * sizeof(struct hit_s *)); + for (i = 0; i < h->num; i++) + h->hit[i] = &(h->unsrt[i]); + + /* Sort the pointers. Don't bother if we've only got one. + */ + if (h->num > 1) + qsort(h->hit, h->num, sizeof(struct hit_s *), hit_comparison); +} + + + +/* Function: TophitsReport() + * Date: Thu Dec 18 13:19:18 1997 + * + * Purpose: Generate a printout summarizing how much + * memory is used by a tophits structure, + * how many hits are stored, and how much + * waste there is from not knowing nseqs. + * + * Args: h - the sorted tophits list + * E - the cutoff in Evalue + * nseq - the final number of seqs used for Eval + * + * Return: (void) + * Prints information on stdout + */ +void +TophitsReport(struct tophit_s *h, double E, int nseq) +{ + int i; + int memused; + int x; + int n; + + /* Count up how much memory is used + * in the whole list. 
+ */ + memused = sizeof(struct hit_s) * h->alloc + sizeof(struct tophit_s); + for (i = 0; i < h->num; i++) + { + if (h->unsrt[i].name != NULL) + memused += strlen(h->unsrt[i].name) + 1; + if (h->unsrt[i].acc != NULL) + memused += strlen(h->unsrt[i].acc) + 1; + if (h->unsrt[i].desc != NULL) + memused += strlen(h->unsrt[i].desc) + 1; + if (h->unsrt[i].ali != NULL) + { + memused += sizeof(struct fancyali_s); + x = 0; + if (h->unsrt[i].ali->rfline != NULL) x++; + if (h->unsrt[i].ali->csline != NULL) x++; + if (h->unsrt[i].ali->model != NULL) x++; + if (h->unsrt[i].ali->mline != NULL) x++; + if (h->unsrt[i].ali->aseq != NULL) x++; + memused += x * (h->unsrt[i].ali->len + 1); + + if (h->unsrt[i].ali->query != NULL) + memused += strlen(h->unsrt[i].ali->query) + 1; + if (h->unsrt[i].ali->target != NULL) + memused += strlen(h->unsrt[i].ali->target) + 1; + } + } + + /* Count how many hits actually satisfy the E cutoff. + */ + n = 0; + for (i = 0; i < h->num; i++) + { + if (h->hit[i]->pvalue * (double) nseq >= E) break; + n++; + } + + /* Format and print a summary + */ + printf("tophits_s report:\n"); + printf(" Total hits: %d\n", h->num); + printf(" Satisfying E cutoff: %d\n", n); + printf(" Total memory: %dK\n", memused / 1000); +} diff --git a/forester/archive/RIO/others/hmmer/src/trace.c b/forester/archive/RIO/others/hmmer/src/trace.c new file mode 100644 index 0000000..424d3ba --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/trace.c @@ -0,0 +1,1203 @@ +/************************************************************ + * HMMER - Biological sequence analysis with profile HMMs + * Copyright (C) 1992-1999 Washington University School of Medicine + * All Rights Reserved + * + * This source code is distributed under the terms of the + * GNU General Public License. See the files COPYING and LICENSE + * for details. 
+ ************************************************************/ + +/* trace.c + * SRE, Sat Nov 16 12:34:57 1996 + * RCS $Id: trace.c,v 1.1.1.1 2005/03/22 08:34:07 cmzmasek Exp $ + * + * Support for Plan 7 traceback data structure, p7trace_s. + */ + +#include +#include +#include + +#include "structs.h" +#include "config.h" +#include "squid.h" +#include "funcs.h" +#include "version.h" + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + +static void rightjustify(char *s, int n); + +/* Function: P7AllocTrace(), P7ReallocTrace(), P7FreeTrace() + * + * Purpose: allocation and freeing of traceback structures + */ +void +P7AllocTrace(int tlen, struct p7trace_s **ret_tr) +{ + struct p7trace_s *tr; + + tr = MallocOrDie (sizeof(struct p7trace_s)); + tr->statetype = MallocOrDie (sizeof(char) * tlen); + tr->nodeidx = MallocOrDie (sizeof(int) * tlen); + tr->pos = MallocOrDie (sizeof(int) * tlen); + *ret_tr = tr; +} +void +P7ReallocTrace(struct p7trace_s *tr, int tlen) +{ + tr->statetype = ReallocOrDie (tr->statetype, tlen * sizeof(char)); + tr->nodeidx = ReallocOrDie (tr->nodeidx, tlen * sizeof(int)); + tr->pos = ReallocOrDie (tr->pos, tlen * sizeof(int)); +} +void +P7FreeTrace(struct p7trace_s *tr) +{ + free(tr->pos); + free(tr->nodeidx); + free(tr->statetype); + free(tr); +} + +/* Function: TraceSet() + * Date: SRE, Sun Mar 8 12:39:00 1998 [St. Louis] + * + * Purpose: Convenience function; set values at position tpos + * in a trace. + * + * + * Args: tr - trace object to write to + * tpos - ptr to position in trace to set + * type - statetype e.g. STS, etc. + * idx - nodeidx 1..M or 0 + * pos - seq position 1..L or 0 + * + * Returns: void + */ +void +TraceSet(struct p7trace_s *tr, int tpos, char type, int idx, int pos) +{ + tr->statetype[tpos] = type; + tr->nodeidx[tpos] = idx; + tr->pos[tpos] = pos; +} + + +/* Function: MergeTraceArrays() + * Date: SRE, Sun Jul 5 15:09:10 1998 [St. Louis] + * + * Purpose: Combine two arrays of traces into a single array. 
+ * Used in hmmalign to merge traces from a fixed alignment + * with traces from individual unaligned seqs. + * + * t1 traces always precede t2 traces in the resulting array. + * + * Args: t1 - first set of traces + * n1 - number of traces in t1 + * t2 - second set of traces + * n2 - number of traces in t2 + * + * Returns: pointer to new array of traces. + * Both t1 and t2 are free'd here! Do not reuse. + */ +struct p7trace_s ** +MergeTraceArrays(struct p7trace_s **t1, int n1, struct p7trace_s **t2, int n2) +{ + struct p7trace_s **tr; + int i; /* index in traces */ + + tr = MallocOrDie(sizeof(struct p7trace_s *) * (n1+n2)); + for (i = 0; i < n1; i++) tr[i] = t1[i]; + for (i = 0; i < n2; i++) tr[n1+i] = t2[i]; + free(t1); + free(t2); + return tr; +} + + + +/* Function: P7ReverseTrace() + * Date: SRE, Mon Aug 25 12:57:29 1997; Denver CO. + * + * Purpose: Reverse the arrays in a traceback structure. + * Tracebacks from Forward() and Viterbi() are + * collected backwards, and call this function + * when they're done. + * + * It's possible to reverse the arrays in place + * more efficiently; but the realloc/copy strategy + * has the advantage of reallocating the trace + * into the right size of memory. (Tracebacks + * overallocate.) + * + * Args: tr - the traceback to reverse. tr->tlen must be set. + * + * Return: (void) + * tr is modified. + */ +void +P7ReverseTrace(struct p7trace_s *tr) +{ + char *statetype; + int *nodeidx; + int *pos; + int opos, npos; + + /* Allocate + */ + statetype = MallocOrDie (sizeof(char)* tr->tlen); + nodeidx = MallocOrDie (sizeof(int) * tr->tlen); + pos = MallocOrDie (sizeof(int) * tr->tlen); + + /* Reverse the trace. + */ + for (opos = tr->tlen-1, npos = 0; npos < tr->tlen; npos++, opos--) + { + statetype[npos] = tr->statetype[opos]; + nodeidx[npos] = tr->nodeidx[opos]; + pos[npos] = tr->pos[opos]; + } + + /* Swap old, new arrays. 
+ */ + free(tr->statetype); + free(tr->nodeidx); + free(tr->pos); + tr->statetype = statetype; + tr->nodeidx = nodeidx; + tr->pos = pos; +} + + + +/* Function: P7TraceCount() + * + * Purpose: Count a traceback into a count-based HMM structure. + * (Usually as part of a model parameter re-estimation.) + * + * Args: hmm - counts-based HMM + * dsq - digitized sequence that traceback aligns to the HMM (1..L) + * wt - weight on the sequence + * tr - alignment of seq to HMM + * + * Return: (void) + */ +void +P7TraceCount(struct plan7_s *hmm, char *dsq, float wt, struct p7trace_s *tr) +{ + int tpos; /* position in tr */ + int i; /* symbol position in seq */ + + for (tpos = 0; tpos < tr->tlen; tpos++) + { + i = tr->pos[tpos]; + + /* Emission counts. + * Don't bother counting null states N,J,C. + */ + if (tr->statetype[tpos] == STM) + P7CountSymbol(hmm->mat[tr->nodeidx[tpos]], dsq[i], wt); + else if (tr->statetype[tpos] == STI) + P7CountSymbol(hmm->ins[tr->nodeidx[tpos]], dsq[i], wt); + + /* State transition counts + */ + switch (tr->statetype[tpos]) { + case STS: + break; /* don't bother; P=1 */ + case STN: + switch (tr->statetype[tpos+1]) { + case STB: hmm->xt[XTN][MOVE] += wt; break; + case STN: hmm->xt[XTN][LOOP] += wt; break; + default: + Die("illegal state transition %s->%s in traceback", + Statetype(tr->statetype[tpos]), + Statetype(tr->statetype[tpos+1])); + } + break; + case STB: + switch (tr->statetype[tpos+1]) { + case STM: hmm->begin[tr->nodeidx[tpos+1]] += wt; break; + case STD: hmm->tbd1 += wt; break; + default: + Die("illegal state transition %s->%s in traceback", + Statetype(tr->statetype[tpos]), + Statetype(tr->statetype[tpos+1])); + } + break; + case STM: + switch (tr->statetype[tpos+1]) { + case STM: hmm->t[tr->nodeidx[tpos]][TMM] += wt; break; + case STI: hmm->t[tr->nodeidx[tpos]][TMI] += wt; break; + case STD: hmm->t[tr->nodeidx[tpos]][TMD] += wt; break; + case STE: hmm->end[tr->nodeidx[tpos]] += wt; break; + default: + Die("illegal state transition 
%s->%s in traceback", + Statetype(tr->statetype[tpos]), + Statetype(tr->statetype[tpos+1])); + } + break; + case STI: + switch (tr->statetype[tpos+1]) { + case STM: hmm->t[tr->nodeidx[tpos]][TIM] += wt; break; + case STI: hmm->t[tr->nodeidx[tpos]][TII] += wt; break; + default: + Die("illegal state transition %s->%s in traceback", + Statetype(tr->statetype[tpos]), + Statetype(tr->statetype[tpos+1])); + } + break; + case STD: + switch (tr->statetype[tpos+1]) { + case STM: hmm->t[tr->nodeidx[tpos]][TDM] += wt; break; + case STD: hmm->t[tr->nodeidx[tpos]][TDD] += wt; break; + case STE: /* ignore; p(D->E) = 1.0 */ break; + default: + Die("illegal state transition %s->%s in traceback", + Statetype(tr->statetype[tpos]), + Statetype(tr->statetype[tpos+1])); + } + break; + case STE: + switch (tr->statetype[tpos+1]) { + case STC: hmm->xt[XTE][MOVE] += wt; break; + case STJ: hmm->xt[XTE][LOOP] += wt; break; + default: + Die("illegal state transition %s->%s in traceback", + Statetype(tr->statetype[tpos]), + Statetype(tr->statetype[tpos+1])); + } + break; + case STJ: + switch (tr->statetype[tpos+1]) { + case STB: hmm->xt[XTJ][MOVE] += wt; break; + case STJ: hmm->xt[XTJ][LOOP] += wt; break; + default: + Die("illegal state transition %s->%s in traceback", + Statetype(tr->statetype[tpos]), + Statetype(tr->statetype[tpos+1])); + } + break; + case STC: + switch (tr->statetype[tpos+1]) { + case STT: hmm->xt[XTC][MOVE] += wt; break; + case STC: hmm->xt[XTC][LOOP] += wt; break; + default: + Die("illegal state transition %s->%s in traceback", + Statetype(tr->statetype[tpos]), + Statetype(tr->statetype[tpos+1])); + } + break; + case STT: + break; /* T is the last. It makes no transitions. */ + default: + Die("illegal state %s in traceback", + Statetype(tr->statetype[tpos])); + } + } +} + + +/* Function: P7TraceScore() + * + * Purpose: Score a traceback and return the score in scaled bits. + * + * Args: hmm - HMM with valid log odds scores. 
+ * dsq - digitized sequence that traceback aligns to the HMM (1..L) + * tr - alignment of seq to HMM + * + * Return: (void) + */ +float +P7TraceScore(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr) +{ + int score; /* total score as a scaled integer */ + int tpos; /* position in tr */ + int sym; /* digitized symbol in dsq */ + + /* P7PrintTrace(stdout, tr, hmm, dsq); */ + score = 0; + for (tpos = 0; tpos < tr->tlen-1; tpos++) + { + sym = (int) dsq[tr->pos[tpos]]; + + /* Emissions. + * Don't bother counting null states N,J,C. + */ + if (tr->statetype[tpos] == STM) + score += hmm->msc[sym][tr->nodeidx[tpos]]; + else if (tr->statetype[tpos] == STI) + score += hmm->isc[sym][tr->nodeidx[tpos]]; + + /* State transitions. + */ + score += TransitionScoreLookup(hmm, + tr->statetype[tpos], tr->nodeidx[tpos], + tr->statetype[tpos+1], tr->nodeidx[tpos+1]); + } + return Scorify(score); +} + + + +/* Function: P7Traces2Alignment() + * + * Purpose: Convert an array of traceback structures for a set + * of sequences into a new multiple alignment. + * + * Insertions are put into lower case and + * are not aligned; instead, Nterm is right-justified, + * Cterm is left-justified, and internal insertions + * are split in half and the halves are justified in + * each direction (the objective being to increase + * the chances of getting insertions aligned well enough + * for them to become a match). SAM gap char conventions + * are used: - in match columns, . in insert columns + * + * NOTE: Does not recognize J state. + * + * Args: dsq - digitized unaligned sequences + * sqinfo - array of info about the sequences + * wgt - weights on seqs + * nseq - number of sequences + * mlen - length of model (number of match states) + * tr - array of tracebacks + * matchonly - TRUE if we don't print insert-generated symbols at all + * Return: MSA structure; NULL on failure. 
+ * Caller responsible for freeing msa with MSAFree(msa); + */ +MSA * +P7Traces2Alignment(char **dsq, SQINFO *sqinfo, float *wgt, int nseq, int mlen, + struct p7trace_s **tr, int matchonly) +{ + MSA *msa; /* RETURN: new alignment */ + int idx; /* counter for sequences */ + int alen; /* width of alignment */ + int *inserts; /* array of max gaps between aligned columns */ + int *matmap; /* matmap[k] = apos of match k [1..M] */ + int nins; /* counter for inserts */ + int apos; /* position in aligned sequence (0..alen-1)*/ + int rpos; /* position in raw digital sequence (1..L)*/ + int tpos; /* position counter in traceback */ + int statetype; /* type of current state, e.g. STM */ + int k; /* counter over states in model */ + + /* Here's the problem. We want to align the match states in columns, + * but some sequences have inserted symbols in them; we need some + * sort of overall knowledge of where the inserts are and how long + * they are in order to create the alignment. + * + * Here's our trick. inserts[] is a 0..hmm->M array; inserts[i] stores + * the maximum number of times insert substate i was used. This + * is the maximum number of gaps to insert between canonical + * column i and i+1. inserts[0] is the N-term tail; inserts[M] is + * the C-term tail. + * + * Remember that N and C emit on transition, hence the check for an + * N->N or C->C transition before bumping nins. + */ + inserts = (int *) MallocOrDie (sizeof(int) * (mlen+1)); + for (k = 0; k <= mlen; k++) + inserts[k] = 0; + for (idx = 0; idx < nseq; idx++) { + nins = 0; + for (tpos = 0; tpos < tr[idx]->tlen; tpos++) { + switch (tr[idx]->statetype[tpos]) { + case STI: nins++; break; + case STN: if (tr[idx]->statetype[tpos-1] == STN) nins++; break; + case STC: if (tr[idx]->statetype[tpos-1] == STC) nins++; break; + case STM: + case STD: /* M,D: record max. reset ctr. 
*/ + if (nins > inserts[tr[idx]->nodeidx[tpos]-1]) + inserts[tr[idx]->nodeidx[tpos]-1] = nins; + nins = 0; + break; + case STB: /* B; record N-tail max, reset ctr */ + if (nins > inserts[0]) + inserts[0] = nins; + nins = 0; + break; + case STT: /* T: record C-tail max */ + if (nins > inserts[mlen]) + inserts[mlen] = nins; + break; + case STS: case STE: break; /* ignore other states */ + case STJ: + Die("yo! you don't support J in Traces2Alignment(), remember?"); + default: + Die("Traces2Alignment reports unrecognized statetype %c", + Statetype(tr[idx]->statetype[tpos])); + } + } + } + + /* Insert compression option. */ + if (matchonly) + for (k = 0; k <= mlen; k++) + if (inserts[k] > 1) + inserts[k] = 1; + + /*********************************************** + * Construct the alignment + ***********************************************/ + /* calculate alignment length and matmap */ + matmap= (int *) MallocOrDie (sizeof(int) * (mlen+1)); + matmap[0] = -1; + alen = inserts[0]; + for (k = 1; k <= mlen ; k++) { + matmap[k] = alen; + alen += inserts[k] + 1; + } + /* allocation for new alignment */ + msa = MSAAlloc(nseq, alen); + + for (idx = 0; idx < nseq; idx++) { + /* blank an aseq */ + for (apos = 0; apos < alen; apos++) + msa->aseq[idx][apos] = '.'; + for (k = 1; k <= mlen; k++) + msa->aseq[idx][matmap[k]] = '-'; + msa->aseq[idx][alen] = '\0'; + /* align the sequence */ + apos = 0; + for (tpos = 0; tpos < tr[idx]->tlen; tpos++) { + statetype = tr[idx]->statetype[tpos]; /* just for clarity */ + rpos = tr[idx]->pos[tpos]; + k = tr[idx]->nodeidx[tpos]; + + if (statetype == STM) { + apos = matmap[k]; + msa->aseq[idx][apos] = Alphabet[(int) dsq[idx][rpos]]; + apos++; + } + else if (statetype == STI) { + if (matchonly) + msa->aseq[idx][apos] = '*'; /* insert compression option */ + else { + msa->aseq[idx][apos] = (char) tolower((int) Alphabet[(int) dsq[idx][rpos]]); + apos++; + } + } + else if ((statetype == STN || statetype == STC) && rpos > 0) { + if (matchonly) + 
msa->aseq[idx][apos] = '*'; /* insert compression option */ + else { + msa->aseq[idx][apos] = (char) tolower((int) Alphabet[(int) dsq[idx][rpos]]); + apos++; + } + } + else if (statetype == STE) + apos = matmap[mlen]+1; /* set position for C-term tail */ + } + + /* N-terminal extension is right-justified. + * Internal inserts are split in half, and C-term is right-justified. + * C-terminal extension remains left-justified. + */ + if (! matchonly) { + rightjustify(msa->aseq[idx], inserts[0]); + + for (k = 1; k < mlen; k++) + if (inserts[k] > 1) { + for (nins = 0, apos = matmap[k]+1; islower((int) (msa->aseq[idx][apos])); apos++) + nins++; + nins /= 2; /* split the insertion in half */ + rightjustify(msa->aseq[idx]+matmap[k]+1+nins, inserts[k]-nins); + } + } + + } + + /*********************************************** + * Build the rest of the MSA annotation. + ***********************************************/ + + msa->nseq = nseq; + msa->alen = alen; + msa->au = MallocOrDie(sizeof(char) * (strlen(RELEASE)+7)); + sprintf(msa->au, "HMMER %s", RELEASE); + /* copy sqinfo array and weights */ + for (idx = 0; idx < nseq; idx++) + { + msa->sqname[idx] = sre_strdup(sqinfo[idx].name, -1); + if (sqinfo[idx].flags & SQINFO_ACC) + MSASetSeqAccession(msa, idx, sqinfo[idx].acc); + if (sqinfo[idx].flags & SQINFO_DESC) + MSASetSeqAccession(msa, idx, sqinfo[idx].desc); + + if (sqinfo[idx].flags & SQINFO_SS) { + if (msa->ss == NULL) msa->ss = MallocOrDie(sizeof(char *) * nseq); + MakeAlignedString(msa->aseq[idx], alen, + sqinfo[idx].ss, &(msa->ss[idx])); + } + if (sqinfo[idx].flags & SQINFO_SA) { + if (msa->sa == NULL) msa->sa = MallocOrDie(sizeof(char *) * nseq); + MakeAlignedString(msa->aseq[idx], alen, + sqinfo[idx].sa, &(msa->sa[idx])); + } + msa->wgt[idx] = wgt[idx]; + } + + /* #=RF annotation: x for match column, . 
for insert column + */ + msa->rf = (char *) MallocOrDie (sizeof(char) * (alen+1)); + for (apos = 0; apos < alen; apos++) + msa->rf[apos] = '.'; + for (k = 1; k <= mlen; k++) + msa->rf[matmap[k]] = 'x'; + msa->rf[alen] = '\0'; + + /* Currently, we produce no consensus structure. + * #=CS, generated from HMM structural annotation, would go here. + */ + + free(inserts); + free(matmap); + return msa; +} + +/* Function: TransitionScoreLookup() + * + * Purpose: Convenience function used in PrintTrace() and TraceScore(); + * given state types and node indices for a transition, + * return the integer score for that transition. + */ +int +TransitionScoreLookup(struct plan7_s *hmm, char st1, int k1, + char st2, int k2) +{ + switch (st1) { + case STS: return 0; /* S never pays */ + case STN: + switch (st2) { + case STB: return hmm->xsc[XTN][MOVE]; + case STN: return hmm->xsc[XTN][LOOP]; + default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2)); + } + break; + case STB: + switch (st2) { + case STM: return hmm->bsc[k2]; + case STD: return Prob2Score(hmm->tbd1, 1.); + default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2)); + } + break; + case STM: + switch (st2) { + case STM: return hmm->tsc[k1][TMM]; + case STI: return hmm->tsc[k1][TMI]; + case STD: return hmm->tsc[k1][TMD]; + case STE: return hmm->esc[k1]; + default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2)); + } + break; + case STI: + switch (st2) { + case STM: return hmm->tsc[k1][TIM]; + case STI: return hmm->tsc[k1][TII]; + default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2)); + } + break; + case STD: + switch (st2) { + case STM: return hmm->tsc[k1][TDM]; + case STD: return hmm->tsc[k1][TDD]; + case STE: return 0; /* D_m->E has probability 1.0 by definition in Plan7 */ + default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2)); + } + break; + case STE: + switch (st2) { + case STC: return hmm->xsc[XTE][MOVE]; + case STJ: return 
hmm->xsc[XTE][LOOP]; + default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2)); + } + break; + case STJ: + switch (st2) { + case STB: return hmm->xsc[XTJ][MOVE]; + case STJ: return hmm->xsc[XTJ][LOOP]; + default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2)); + } + break; + case STC: + switch (st2) { + case STT: return hmm->xsc[XTC][MOVE]; + case STC: return hmm->xsc[XTC][LOOP]; + default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2)); + } + break; + case STT: return 0; /* T makes no transitions */ + default: Die("illegal state %s in traceback", Statetype(st1)); + } + /*NOTREACHED*/ + return 0; +} + + +/* Function: CreateFancyAli() + * Date: SRE, Mon Oct 27 06:49:44 1997 [Sanger Centre UK] + * + * Purpose: Output of an HMM/sequence alignment, using a + * traceback structure. Deliberately similar to + * the output of BLAST, to make it easier for + * people to adapt their Perl parsers (or what have + * you) from BLAST to HMMER. + * + * Args: tr - traceback structure that gives the alignment + * hmm - the model + * dsq - the sequence (digitized form) + * name- name of the sequence + * + * Return: allocated, filled fancy alignment structure. 
+ */ +struct fancyali_s * +CreateFancyAli(struct p7trace_s *tr, struct plan7_s *hmm, + char *dsq, char *name) +{ + struct fancyali_s *ali; /* alignment to create */ + int tpos; /* position in trace and alignment */ + int bestsym; /* index of best symbol at this pos */ + float mthresh; /* above this P(x), display uppercase */ + + /* Allocate and initialize the five lines of display + */ + ali = AllocFancyAli(); + ali->rfline = NULL; + ali->csline = NULL; + ali->model = MallocOrDie (sizeof(char) * (tr->tlen+1)); + ali->mline = MallocOrDie (sizeof(char) * (tr->tlen+1)); + ali->aseq = MallocOrDie (sizeof(char) * (tr->tlen+1)); + + memset(ali->model, ' ', tr->tlen); + memset(ali->mline, ' ', tr->tlen); + memset(ali->aseq, ' ', tr->tlen); + + if (hmm->flags & PLAN7_RF) + { + ali->rfline = (char *) MallocOrDie (sizeof(char) * (tr->tlen+1)); + memset(ali->rfline, ' ', tr->tlen); + } + if (hmm->flags & PLAN7_CS) + { + ali->csline = (char *) MallocOrDie (sizeof(char) * (tr->tlen+1)); + memset(ali->csline, ' ', tr->tlen); + } + + ali->query = Strdup(hmm->name); + ali->target = Strdup(name); + + if (Alphabet_type == hmmAMINO) mthresh = 0.5; + else mthresh = 0.9; + + /* Find first, last seq position + * HMM start/end positions currently not recorded, because there + * might be multiple HMM hits per sequence. 
+ */ + for (tpos = 0; tpos < tr->tlen; tpos++) + if (tr->pos[tpos] > 0) { + ali->sqfrom = tr->pos[tpos]; + break; + } + for (tpos = tr->tlen-1; tpos >= 0; tpos--) + if (tr->pos[tpos] > 0) { + ali->sqto = tr->pos[tpos]; + break; + } + + /* Fill in the five lines of display + */ + for (tpos = 0; tpos < tr->tlen; tpos++) { + switch (tr->statetype[tpos]) { + case STS: + case STT: + ali->model[tpos] = '*'; + break; + + case STN: + case STJ: + case STC: + ali->model[tpos] = '-'; + if (tr->pos[tpos] > 0) { + ali->aseq[tpos] = tolower(Alphabet[(int) dsq[tr->pos[tpos]]]); + } + break; + + case STB: + ali->model[tpos] = '>'; + break; + + case STE: + ali->model[tpos] = '<'; + break; + + case STM: + if (hmm->flags & PLAN7_RF) ali->rfline[tpos] = hmm->rf[tr->nodeidx[tpos]]; + if (hmm->flags & PLAN7_CS) ali->csline[tpos] = hmm->cs[tr->nodeidx[tpos]]; + bestsym = FMax(hmm->mat[tr->nodeidx[tpos]], Alphabet_size); + ali->model[tpos] = Alphabet[bestsym]; + if (hmm->mat[tr->nodeidx[tpos]][bestsym] < mthresh) + ali->model[tpos] = tolower(ali->model[tpos]); + if (dsq[tr->pos[tpos]] == bestsym) + { + ali->mline[tpos] = Alphabet[(int) dsq[tr->pos[tpos]]]; + if (hmm->mat[tr->nodeidx[tpos]][bestsym] < mthresh) + ali->mline[tpos] = tolower(ali->mline[tpos]); + } + else if (hmm->msc[(int) dsq[tr->pos[tpos]]] [tr->nodeidx[tpos]] > 0) + ali->mline[tpos] = '+'; + ali->aseq[tpos] = Alphabet[(int) dsq[tr->pos[tpos]]]; + break; + + case STD: + if (hmm->flags & PLAN7_RF) ali->rfline[tpos] = hmm->rf[tr->nodeidx[tpos]]; + if (hmm->flags & PLAN7_CS) ali->csline[tpos] = hmm->cs[tr->nodeidx[tpos]]; + bestsym = FMax(hmm->mat[tr->nodeidx[tpos]], Alphabet_size); + ali->model[tpos] = Alphabet[bestsym]; + if (hmm->mat[tr->nodeidx[tpos]][bestsym] < mthresh) + ali->model[tpos] = tolower(ali->model[tpos]); + ali->aseq[tpos] = '-'; + break; + + case STI: + ali->model[tpos] = '.'; + if (hmm->isc[(int) dsq[tr->pos[tpos]]] [tr->nodeidx[tpos]] > 0) + ali->mline[tpos] = '+'; + ali->aseq[tpos] = (char) tolower((int) 
Alphabet[(int) dsq[tr->pos[tpos]]]); + break; + + default: + Die("bogus statetype"); + } /* end switch over statetypes */ + } /* end loop over tpos */ + + ali->len = tpos; + if (hmm->flags & PLAN7_RF) ali->rfline[tpos] = '\0'; + if (hmm->flags & PLAN7_CS) ali->csline[tpos] = '\0'; + ali->model[tpos] = '\0'; + ali->mline[tpos] = '\0'; + ali->aseq[tpos] = '\0'; + return ali; +} + + +/* Function: PrintFancyAli() + * Date: SRE, Mon Oct 27 06:56:42 1997 [Sanger Centre UK] + * + * Purpose: Print an HMM/sequence alignment from a fancyali_s + * structure. Line length controlled by ALILENGTH in + * config.h (set to 50). + * + * Args: fp - where to print it (stdout or open FILE) + * ali - alignment to print + * + * Return: (void) + */ +void +PrintFancyAli(FILE *fp, struct fancyali_s *ali) +{ + char buffer[ALILENGTH+1]; /* output line buffer */ + int starti, endi; + int pos; + int i; + + buffer[ALILENGTH] = '\0'; + endi = ali->sqfrom - 1; + for (pos = 0; pos < ali->len; pos += ALILENGTH) + { + /* coords of target seq for this line */ + starti = endi + 1; + for (i = pos; ali->aseq[i] != '\0' && i < pos + ALILENGTH; i++) + if (!isgap(ali->aseq[i])) endi++; + + if (ali->csline != NULL) { + strncpy(buffer, ali->csline+pos, ALILENGTH); + fprintf(fp, " %16s %s\n", "CS", buffer); + } + if (ali->rfline != NULL) { + strncpy(buffer, ali->rfline+pos, ALILENGTH); + fprintf(fp, " %16s %s\n", "RF", buffer); + } + if (ali->model != NULL) { + strncpy(buffer, ali->model+pos, ALILENGTH); + fprintf(fp, " %16s %s\n", " ", buffer); + } + if (ali->mline != NULL) { + strncpy(buffer, ali->mline+pos, ALILENGTH); + fprintf(fp, " %16s %s\n", " ", buffer); + } + if (ali->aseq != NULL) { + strncpy(buffer, ali->aseq+pos, ALILENGTH); + if (endi >= starti) + fprintf(fp, " %10.10s %5d %s %-5d\n\n", ali->target, starti, buffer, endi); + else + fprintf(fp, " %10.10s %5s %s %-5s\n\n", ali->target, "-", buffer, "-"); + } + } + + /* Cleanup and return + */ + fflush(fp); + return; +} + + + +/* Function: 
TraceDecompose() + * Date: Sat Aug 30 11:18:40 1997 (Denver CO) + * + * Purpose: Decompose a long multi-hit trace into zero or more + * traces without N,C,J transitions: for consistent + * scoring and statistical evaluation of single domain + * hits. + * + * Args: otr - original trace structure + * ret_tr - RETURN: array of simpler traces + * ret_ntr- RETURN: number of traces. + * + * Return: (void) + * ret_tr alloc'ed here; free individuals with FreeTrace(). + */ +void +TraceDecompose(struct p7trace_s *otr, struct p7trace_s ***ret_tr, int *ret_ntr) +{ + struct p7trace_s **tr; /* array of new traces */ + int ntr; /* number of traces */ + int i,j; /* position counters in traces */ + int idx; /* index over ntr subtraces */ + + /* First pass: count begin states to get ntr. + */ + for (ntr = 0, i = 0; i < otr->tlen; i++) + if (otr->statetype[i] == STB) ntr++; + + /* Allocations. + */ + if (ntr == 0) { + *ret_ntr = 0; + *ret_tr = NULL; + return; + } + tr = (struct p7trace_s **) MallocOrDie (sizeof(struct p7trace_s *) * ntr); + + for (idx = 0, i = 0; i < otr->tlen; i++) /* i = position in old trace */ + if (otr->statetype[i] == STB) + { + for (j = i+1; j < otr->tlen; j++) /* j = tmp; get length of subtrace */ + if (otr->statetype[j] == STE) break; + /* trace = S-N-(B..E)-C-T : len + 4 : j-i+1 + 4*/ + P7AllocTrace(j-i+5, &(tr[idx])); + tr[idx]->tlen = j-i+5; + + tr[idx]->statetype[0] = STS; + tr[idx]->nodeidx[0] = 0; + tr[idx]->pos[0] = 0; + tr[idx]->statetype[1] = STN; + tr[idx]->nodeidx[1] = 0; + tr[idx]->pos[1] = 0; + j = 2; /* now j = position in new subtrace */ + while (1) /* copy subtrace */ + { + tr[idx]->statetype[j] = otr->statetype[i]; + tr[idx]->nodeidx[j] = otr->nodeidx[i]; + tr[idx]->pos[j] = otr->pos[i]; + if (otr->statetype[i] == STE) break; + i++; j++; + } + j++; + tr[idx]->statetype[j] = STC; + tr[idx]->nodeidx[j] = 0; + tr[idx]->pos[j] = 0; + j++; + tr[idx]->statetype[j] = STT; + tr[idx]->nodeidx[j] = 0; + tr[idx]->pos[j] = 0; + idx++; + } + + *ret_tr = 
tr; + *ret_ntr = ntr; + return; +} + + +/* Function: TraceDomainNumber() + * + * Purpose: Count how many times we traverse the + * model in a single Plan7 trace -- equivalent + * to counting the number of domains. + * + * (A weakness is that we might discard some of + * those domains because they have low scores + * below E or T threshold.) + */ +int +TraceDomainNumber(struct p7trace_s *tr) +{ + int i; + int ndom = 0; + + for (i = 0; i < tr->tlen; i++) + if (tr->statetype[i] == STB) ndom++; + return ndom; +} + + +/* Function: TraceSimpleBounds() + * + * Purpose: For a trace that contains only a single + * traverse of the model (i.e. something that's + * come from TraceDecompose(), or a global + * alignment), determine the bounds of + * the match on both the sequence [1..L] and the + * model [1..M]. + * + * Args: tr - trace to look at + * i1 - RETURN: start point in sequence [1..L] + * i2 - RETURN: end point in sequence [1..L] + * k1 - RETURN: start point in model [1..M] + * k2 - RETURN: end point in model [1..M] + */ +void +TraceSimpleBounds(struct p7trace_s *tr, int *ret_i1, int *ret_i2, + int *ret_k1, int *ret_k2) +{ + int i1, i2, k1, k2, tpos; + + i1 = k1 = i2 = k2 = -1; + + /* Look forwards to find start of match */ + for (tpos = 0; tpos < tr->tlen; tpos++) + { + if (k1 == -1 && (tr->statetype[tpos] == STM || tr->statetype[tpos] == STD)) + k1 = tr->nodeidx[tpos]; + if (tr->statetype[tpos] == STM) + { + i1 = tr->pos[tpos]; + break; + } + } + if (tpos == tr->tlen || i1 == -1 || k1 == -1) + Die("sanity check failed: didn't find a match state in trace"); + + /* Look backwards to find end of match */ + for (tpos = tr->tlen-1; tpos >= 0; tpos--) + { + if (k2 == -1 && (tr->statetype[tpos] == STM || tr->statetype[tpos] == STD)) + k2 = tr->nodeidx[tpos]; + if (tr->statetype[tpos] == STM) + { + i2 = tr->pos[tpos]; + break; + } + } + if (tpos == tr->tlen || i2 == -1 || k2 == -1) + Die("sanity check failed: didn't find a match state in trace"); + + *ret_k1 = k1; + *ret_i1 
= i1; + *ret_k2 = k2; + *ret_i2 = i2; +} + + +/* Function: MasterTraceFromMap() + * Date: SRE, Tue Jul 7 18:51:11 1998 [St. Louis] + * + * Purpose: Convert an alignment map (e.g. hmm->map) to + * a master trace. Used for mapping an alignment + * onto an HMM. Generally precedes a call to + * ImposeMasterTrace(). Compare P7ViterbiAlignAlignment(), + * which aligns an alignment to the model using a + * Viterbi algorithm to get a master trace. + * MasterTraceFromMap() only works if the alignment + * is exactly the one used to train the model. + * + * Args: map - the map (usually hmm->map is passed) 1..M + * M - length of map (model; usually hmm->M passed) + * alen - length of alignment that map refers to + * + * Returns: ptr to master trace + * Caller must free: P7FreeTrace(). + */ +struct p7trace_s * +MasterTraceFromMap(int *map, int M, int alen) +{ + struct p7trace_s *tr; /* RETURN: master trace */ + int tpos; /* position in trace */ + int apos; /* position in alignment, 1..alen */ + int k; /* position in model */ + + /* Allocate for the trace. + * S-N-B- ... - E-C-T : 6 states + alen is maximum trace, + * because each of alen columns is an N*, M*, I*, or C* metastate. + * No D* metastates possible. 
+ */ + P7AllocTrace(alen+6, &tr); + + /* Initialize the trace + */ + tpos = 0; + TraceSet(tr, tpos, STS, 0, 0); tpos++; + TraceSet(tr, tpos, STN, 0, 0); tpos++; + + /* Leading N's + */ + for (apos = 1; apos < map[1]; apos++) { + TraceSet(tr, tpos, STN, 0, apos); tpos++; + } /* now apos == map[1] */ + TraceSet(tr, tpos, STB, 0, 0); tpos++; + + for (k = 1; k < M; k++) + { + TraceSet(tr, tpos, STM, k, apos); tpos++; + apos++; + + for (; apos < map[k+1]; apos++) { + TraceSet(tr, tpos, STI, k, apos); tpos++; + } + } /* now apos == map[M] and k == M*/ + + TraceSet(tr, tpos, STM, M, apos); tpos++; + apos++; + + /* Trailing C's + */ + TraceSet(tr, tpos, STE, 0, 0); tpos++; + TraceSet(tr, tpos, STC, 0, 0); tpos++; + for (; apos <= alen; apos++) { + TraceSet(tr, tpos, STC, 0, apos); tpos++; + } + + /* Terminate and return + */ + TraceSet(tr, tpos, STT, 0, 0); tpos++; + tr->tlen = tpos; + return tr; +} + + + +/* Function: ImposeMasterTrace() + * Date: SRE, Sun Jul 5 14:27:16 1998 [St. Louis] + * + * Purpose: Goes with P7ViterbiAlignAlignment(), which gives us + * a "master trace" for a whole alignment. Now, given + * the alignment and the master trace, construct individual + * tracebacks for each sequence. Later we'll hand these + * (and presumably other traces) to P7Traces2Alignment(). + * + * It is possible to generate individual traces that + * are not consistent with Plan7 (e.g. D->I and I->D + * transitions may be present). P7Traces2Alignment() + * can handle such traces; other functions may not. + * See modelmaker.c:trace_doctor() if this is a problem. + * + * Akin to modelmaker.c:fake_tracebacks(). 
+ * + * Args: aseq - aligned seqs + * nseq - number of aligned seqs + * mtr - master traceback + * ret_tr- RETURN: array of individual tracebacks, one for each aseq + * + * Returns: (void) + */ +void +ImposeMasterTrace(char **aseq, int nseq, struct p7trace_s *mtr, struct p7trace_s ***ret_tr) +{ + struct p7trace_s **tr; + int idx; /* counter over sequences */ + int i; /* position in raw sequence (1..L) */ + int tpos; /* position in traceback */ + int mpos; /* position in master trace */ + + tr = (struct p7trace_s **) MallocOrDie (sizeof(struct p7trace_s *) * nseq); + + for (idx = 0; idx < nseq; idx++) + { + P7AllocTrace(mtr->tlen, &tr[idx]); /* we're guaranteed that individuals len < master len */ + + tpos = 0; + i = 1; + for (mpos = 0; mpos < mtr->tlen; mpos++) + { + switch (mtr->statetype[mpos]) + { + case STS: /* straight copies w/ no emission: S, B, D, E, T*/ + case STB: + case STD: + case STE: + case STT: + TraceSet(tr[idx], tpos, mtr->statetype[mpos], mtr->nodeidx[mpos], 0); + tpos++; + break; + + case STM: /* M* implies M or D */ + if (isgap(aseq[idx][mtr->pos[mpos]-1])) + TraceSet(tr[idx], tpos, STD, mtr->nodeidx[mpos], 0); + else { + TraceSet(tr[idx], tpos, STM, mtr->nodeidx[mpos], i); + i++; + } + tpos++; + break; + + case STI: /* I* implies I or nothing */ + if (!isgap(aseq[idx][mtr->pos[mpos]-1])) { + TraceSet(tr[idx], tpos, STI, mtr->nodeidx[mpos], i); + i++; + tpos++; + } + break; + + case STJ: /* N,J,C: first N* -> N. After that, N* -> N or nothing. */ + case STN: + case STC: + if (mtr->pos[mpos] == 0) { + TraceSet(tr[idx], tpos, mtr->statetype[mpos], 0, 0); + tpos++; + } else if (!isgap(aseq[idx][mtr->pos[mpos]-1])) { + TraceSet(tr[idx], tpos, mtr->statetype[mpos], 0, i); + i++; + tpos++; + } + break; + + case STBOGUS: + Die("never happens. 
Trust me."); + } + } + tr[idx]->tlen = tpos; + } + *ret_tr = tr; +} + + +/* Function: rightjustify() + * + * Purpose: Given a gap-containing string of length n, + * pull all the non-gap characters as far as + * possible to the right, leaving gaps on the + * left side. Used to rearrange the positions + * of insertions in HMMER alignments. + */ +static void +rightjustify(char *s, int n) +{ + int npos; + int opos; + + npos = n-1; + opos = n-1; + while (opos >= 0) { + if (isgap(s[opos])) opos--; + else s[npos--]=s[opos--]; + } + while (npos >= 0) + s[npos--] = '.'; +} + + diff --git a/forester/archive/RIO/others/hmmer/src/weetest.c b/forester/archive/RIO/others/hmmer/src/weetest.c new file mode 100644 index 0000000..406a41e --- /dev/null +++ b/forester/archive/RIO/others/hmmer/src/weetest.c @@ -0,0 +1,55 @@ +/* This is a throwaway wrapper program for doing quick + * and dirty tests on sequence databases. Archives of past + * versions are kept and logged in RCS. + * RCS $Id: weetest.c,v 1.1.1.1 2005/03/22 08:34:07 cmzmasek Exp $ + * + * Compile with: + * +cc -g -o weetest -I ~/lib/squid.linux -L/nfs/wol2/people/eddy/lib/squid.linux weetest.c alphabet.o camJul97.o core_algorithms.o histogram.o hmmio.o mathsupport.o masks.o misc.o modelmakers.o debug.o prior.o trace.o plan7.o states.o tophits.o -lsquid-debug -lm + * + * or, for optimized version: +cc -O2 -o weetest -I ~/lib/squid.linux -L/nfs/wol2/people/eddy/lib/squid.linux weetest.c alphabet.o camJul97.o core_algorithms.o histogram.o hmmio.o mathsupport.o masks.o misc.o modelmakers.o debug.o prior.o trace.o plan7.o states.o tophits.o -lsquid -lm + */ + +/* This test looks at histogram of protein lengths in Swissprot + */ +#include + +#include "structs.h" +#include "funcs.h" +#include "globals.h" +#include "squid.h" + +int +main(int argc, char **argv) +{ + char *file; + char *seq; + char *dsq; + int format; + SQFILE *sqfp; + SQINFO sqinfo; + int i,x; + + struct histogram_s *h; + + file = argv[1]; + if (! 
SeqfileFormat(file, &format, "BLASTDB")) + Die("SeqfileFormat()"); + if ((sqfp = SeqfileOpen(file, format, "BLASTDB")) == NULL) + Die("SeqfileOpen()"); + + h = AllocHistogram(0, 10000, 1000); + while (ReadSeq(sqfp, format, &seq, &sqinfo)) + AddToHistogram(h, (float) sqinfo.len); + + GaussianFitHistogram(h, 999999.); + PrintASCIIHistogram(stdout, h); + + printf("mean = %f\n", h->param[GAUSS_MEAN]); + printf("sd = %f\n", h->param[GAUSS_SD]); + + SeqfileClose(sqfp); + + return EXIT_SUCCESS; +} diff --git a/forester/archive/RIO/others/hmmer/testsuite/Exercises.sh b/forester/archive/RIO/others/hmmer/testsuite/Exercises.sh new file mode 100644 index 0000000..612ac99 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/Exercises.sh @@ -0,0 +1,17 @@ +#! /bin/sh + +# Various exercises to test the package. +# SRE, Fri Oct 23 10:38:44 1998 +# RCS $Id: Exercises.sh,v 1.1.1.1 2005/03/22 08:34:49 cmzmasek Exp $ + +# Test binary formats and interconversion. +# (tests for bug detected in 2.1, fixed in 2.1.1a.) +# +../binaries/hmmconvert -F fn3-bin ex1.tmp > /dev/null +../binaries/hmmconvert -F fn3-bin-swap ex2.tmp > /dev/null +diff ex1.tmp ex2.tmp > /dev/null +if (test $? != 0) then + echo FAILED: hmmconvert byteswap test +fi +rm ex1.tmp ex2.tmp + diff --git a/forester/archive/RIO/others/hmmer/testsuite/Makefile.in b/forester/archive/RIO/others/hmmer/testsuite/Makefile.in new file mode 100644 index 0000000..4bda272 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/Makefile.in @@ -0,0 +1,83 @@ +################################################################ +# Makefile for HMMER testsuite +# CVS $Id: Makefile.in,v 1.1.1.1 2005/03/22 08:34:49 cmzmasek Exp $ +########## +# HMMER - Biological sequence analysis with profile HMMs +# Copyright (C) 1992-1999 Washington University School of Medicine +# All Rights Reserved +# +# This source code is distributed under the terms of the +# GNU General Public License. See the files COPYING and LICENSE +# for details. 
+########### + +CC = @CC@ +CFLAGS = @CFLAGS@ +MDEFS = @MDEFS@ @DEFS@ + +# Configuration for optional pthreads multiprocessor support +# +PTHREAD_LIBS = @PTHREAD_LIBS@ +PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ + +SHELL = /bin/sh +MYLIBS = -lhmmer -lsquid +LIBS = @LIBS@ -lm + +SHIVA = alignalign_test\ + evd_test\ + masks_test\ + parsingviterbi_test\ + tophits_test\ + trace_test\ + viterbi_exercise\ + weeviterbi_test + +####### +## Targets defining how to make Shiva executables. +####### + +.c.o: + $(CC) $(CFLAGS) $(PTHREAD_CFLAGS) $(MDEFS) -I../squid -I../src -c $< + +all: $(SHIVA) + +$(SHIVA): @EXEC_DEPENDENCY@ + $(CC) $(CFLAGS) $(PTHREAD_CFLAGS) $(MDEFS) -o $@ -L../squid -L../src $@.o $(MYLIBS) $(PTHREAD_LIBS) $(LIBS) + +####### +## `make check` actually runs the tests. +####### + +check: $(SHIVA) + @echo + @echo Running compiled Shiva exercises: + @echo Warning: some tests may take several minutes to complete. + @for shivatest in $(SHIVA); do\ + if ./$$shivatest; then\ + echo $$shivatest: ok;\ + else\ + echo $$shivatest: FAILED;\ + fi;\ + done + @echo + @echo Running scripted Shiva exercises: + @echo Warning: This also may take several minutes. 
+ perl ./Optiontests.pl + sh ./Exercises.sh + +####### +## Miscellaneous +####### + +clean: + -rm -f *.o *~ Makefile.bak core $(SHIVA) TAGS gmon.out + +distclean: + make clean + -rm -f Makefile + +TAGS: + etags -t *.c *.h Makefile.in + + diff --git a/forester/archive/RIO/others/hmmer/testsuite/Optiontests.fa b/forester/archive/RIO/others/hmmer/testsuite/Optiontests.fa new file mode 100644 index 0000000..cdaa297 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/Optiontests.fa @@ -0,0 +1,16 @@ +>seq1 +ACDEFGHIKLMNPQRSTVWY +>seq2 +ACDEFGHIKLMNPQRSTVWY +>seq3 +ACDEFGHIKLMNPQRSTVWY +>seq4 +ACDEFGHIKLMNPQRSTVWY +>seq5 +ACDEFGHIKLMNPQRSTVWY +>seq6 +ACDEFGHIKLMNPQRSTVWY +>seq7 +ACDEFGHIKLMNPQRSTVWY +>seq8 +ACDEFGHIKLMNPQRSTVWY diff --git a/forester/archive/RIO/others/hmmer/testsuite/Optiontests.nfa b/forester/archive/RIO/others/hmmer/testsuite/Optiontests.nfa new file mode 100644 index 0000000..03d614d --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/Optiontests.nfa @@ -0,0 +1,12 @@ +>seq1 +AAACCCGGGTTT +>seq1 +AAACCCGGGTTT +>seq1 +AAACCCGGGTTT +>seq1 +AAACCCGGGTTT +>seq1 +AAACCCGGGTTT +>seq1 +AAACCCGGGTTT diff --git a/forester/archive/RIO/others/hmmer/testsuite/Optiontests.nslx b/forester/archive/RIO/others/hmmer/testsuite/Optiontests.nslx new file mode 100644 index 0000000..aa19616 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/Optiontests.nslx @@ -0,0 +1,9 @@ +# A simple DNA alignment for Optiontests.pl + +#=RF xxxxxx +seq1 AAACCCGGGTTT +seq1 AAACCCGGGTTT +seq1 AAACCCGGGTTT +seq1 AAACCCGGGTTT +seq1 AAACCCGGGTTT +seq1 AAACCCGGGTTT diff --git a/forester/archive/RIO/others/hmmer/testsuite/Optiontests.pam b/forester/archive/RIO/others/hmmer/testsuite/Optiontests.pam new file mode 100644 index 0000000..205f139 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/Optiontests.pam @@ -0,0 +1,31 @@ +# Matrix made by matblas from blosum62.iij +# * column uses minimum score +# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units 
+# Blocks Database = /data/blocks_5.0/blocks.dat +# Cluster Percentage: >= 62 +# Entropy = 0.6979, Expected = -0.5209 + A R N D C Q E G H I L K M F P S T W Y V B Z X * +A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 +R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 +N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 +D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4 +C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 +Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4 +E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 +G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4 +H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4 +I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4 +L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4 +K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4 +M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4 +F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4 +P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4 +S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4 +T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4 +W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4 +Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4 +V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 +B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 +Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 +X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 +* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 diff --git a/forester/archive/RIO/others/hmmer/testsuite/Optiontests.pl b/forester/archive/RIO/others/hmmer/testsuite/Optiontests.pl new file mode 100644 index 0000000..1b5fbb9 --- 
/dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/Optiontests.pl @@ -0,0 +1,116 @@ +#! /usr/local/bin/perl + +@tests = ( + "hmmbuild --informat selex -F Optiontests.hmm Optiontests.slx", # Make a protein HMM + "hmmbuild --informat selex -F Optiontests.nhmm Optiontests.nslx", # Make a DNA HMM + "hmmalign -h", + "hmmalign Optiontests.hmm Optiontests.fa", + "hmmalign -m Optiontests.hmm Optiontests.fa", + "hmmalign -o tmp Optiontests.hmm Optiontests.fa", + "hmmalign -q Optiontests.hmm Optiontests.fa", + "hmmalign --withali Optiontests.slx Optiontests.hmm Optiontests.fa", + "hmmalign --mapali Optiontests.slx Optiontests.hmm Optiontests.fa", + "hmmbuild -h", + "hmmbuild --informat selex tmp.hmm Optiontests.slx", + "hmmbuild --informat selex -F tmp.hmm Optiontests.slx", # Need -F to force + "hmmbuild --informat selex -n foo -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex -o tmp -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex -A tmp.hmm Optiontests.slx", + "hmmbuild --informat selex -f -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex -g -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex -s -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --fast -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --hand -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --null ../tutorial/amino.null -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --pam Optiontests.pam -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --prior ../tutorial/amino.pri -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --wblosum -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --wgsc -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --wme -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --wvoronoi -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --wnone -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --noeff -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --amino -F 
tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --nucleic -F tmp.hmm Optiontests.nslx", + "hmmbuild --informat selex --archpri 0.9 -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --binary -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --cfile tmp -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --gapmax 0.6 --fast -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --idlevel 0.5 -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --pamwgt 10 --pam Optiontests.pam -F tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --swentry 0.3 -F -s tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --swexit 0.3 -F -s tmp.hmm Optiontests.slx", + "hmmbuild --informat selex --verbose -F tmp.hmm Optiontests.slx", + "hmmcalibrate -h", + "hmmcalibrate Optiontests.hmm", + "hmmcalibrate --fixed 15 Optiontests.hmm", + "hmmcalibrate --mean 25 Optiontests.hmm", + "hmmcalibrate --histfile tmp --fixed 15 Optiontests.hmm", + "hmmcalibrate --num 4500 --fixed 15 Optiontests.hmm", + "hmmcalibrate --sd 50 --mean 25 Optiontests.hmm", + "hmmcalibrate --seed 666 --fixed 15 Optiontests.hmm", + "hmmconvert -h", + "hmmconvert Optiontests.hmm tmp2.hmm", + "hmmconvert -F Optiontests.hmm tmp2.hmm", + "hmmconvert -a -F Optiontests.hmm tmp2.hmm", + "hmmconvert -A Optiontests.hmm tmp2.hmm", # order sensitive. 
tmp2.hmm must be HMM + "hmmconvert -b -F Optiontests.hmm tmp2.hmm", + "hmmconvert -p -F Optiontests.hmm tmp2.hmm", + "hmmconvert -P -F Optiontests.hmm tmp2.hmm", + "hmmemit -h", + "hmmemit Optiontests.hmm", + "hmmemit -a Optiontests.hmm", + "hmmemit -n 6 Optiontests.hmm", + "hmmemit -o tmp Optiontests.hmm", + "hmmemit -q Optiontests.hmm", + "hmmemit --seed 666 Optiontests.hmm", + "hmmindex -h", + "hmmindex Optiontests.hmm", + "hmmfetch -h", + "hmmfetch Optiontests.hmm Optiontests", + "hmmpfam -h", + "hmmpfam -n Optiontests.nhmm Optiontests.nfa", + "hmmpfam -A 0 Optiontests.hmm Optiontests.fa", + "hmmpfam -E 1 Optiontests.hmm Optiontests.fa", + "hmmpfam -T 1 Optiontests.hmm Optiontests.fa", + "hmmpfam -Z 10 Optiontests.hmm Optiontests.fa", + "hmmpfam --domE 1 Optiontests.hmm Optiontests.fa", + "hmmpfam --domT 1 Optiontests.hmm Optiontests.fa", + "hmmpfam --forward Optiontests.hmm Optiontests.fa", + "hmmpfam --null2 Optiontests.hmm Optiontests.fa", + "hmmpfam --xnu Optiontests.hmm Optiontests.fa", + "hmmsearch -h", + "hmmsearch -A 0 Optiontests.hmm Optiontests.fa", + "hmmsearch -E 1 Optiontests.hmm Optiontests.fa", + "hmmsearch -T 1 Optiontests.hmm Optiontests.fa", + "hmmsearch -Z 10 Optiontests.hmm Optiontests.fa", + "hmmsearch --domE 1 Optiontests.hmm Optiontests.fa", + "hmmsearch --domT 1 Optiontests.hmm Optiontests.fa", + "hmmsearch --forward Optiontests.hmm Optiontests.fa", + "hmmsearch --null2 Optiontests.hmm Optiontests.fa", + "hmmsearch --xnu Optiontests.hmm Optiontests.fa", + ); + + +unlink "tmp.hmm"; +while ($testline = shift(@tests)) +{ + $status = system("../binaries/$testline 2>&1 > tmp.out"); + if ($status > 0) { + print "failure: $testline\n"; + $failed++; + } + $total++; +} + +$passed = $total - $failed; +printf "Option tests: %d. Passed: %d. 
Failed: %d\n", $total, $passed, $failed; + +unlink "tmp"; +unlink "tmp.out"; +unlink "tmp.hmm"; +unlink "tmp2.hmm"; +unlink "Optiontests.hmm"; +unlink "Optiontests.nhmm"; +unlink "Optiontests.hmm.ssi"; + diff --git a/forester/archive/RIO/others/hmmer/testsuite/Optiontests.pri b/forester/archive/RIO/others/hmmer/testsuite/Optiontests.pri new file mode 100644 index 0000000..a2f19d3 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/Optiontests.pri @@ -0,0 +1,59 @@ +# This file incorporates Blocks9.plib, the UCSC mixture +# Dirichlet prior created by Kimmen Sjolander. +# + +Dirichlet # Strategy (mixture Dirichlet) +Amino # type of prior (Amino or Nucleic) + +# Transitions +1 # Single component +1.0 # with probability = 1.0 +0.7939 0.0278 0.0135 # m->m, m->i, m->d alpha's +0.1551 0.1331 # i->m, i->i alpha's +0.9002 0.5630 # d->m, d->d alpha's + +# Match emissions +# +9 # 9 components + +0.178091 +0.270671 0.039848 0.017576 0.016415 0.014268 0.131916 0.012391 0.022599 0.020358 0.030727 0.015315 0.048298 0.053803 0.020662 0.023612 0.216147 0.147226 0.065438 0.003758 0.009621 +# S A T , C G P >< N V M , Q H R I K F L D W , E Y + +0.056591 +0.021465 0.0103 0.011741 0.010883 0.385651 0.016416 0.076196 0.035329 0.013921 0.093517 0.022034 0.028593 0.013086 0.023011 0.018866 0.029156 0.018153 0.0361 0.07177 0.419641 +# Y , F W , H ,>< L M , N Q I C V S R , T P A K D G E + +0.0960191 +0.561459 0.045448 0.438366 0.764167 0.087364 0.259114 0.21494 0.145928 0.762204 0.24732 0.118662 0.441564 0.174822 0.53084 0.465529 0.583402 0.445586 0.22705 0.02951 0.12109 +# Q E , K N R S H D T A >< M P Y G , V L I W C F + +0.0781233 +0.070143 0.01114 0.019479 0.094657 0.013162 0.048038 0.077 0.032939 0.576639 0.072293 0.02824 0.080372 0.037661 0.185037 0.506783 0.073732 0.071587 0.042532 0.011254 0.028723 +# K R , Q , H >< N E T M S , P W Y A L G V C I , D F + +0.0834977 +0.041103 0.014794 0.00561 0.010216 0.153602 0.007797 0.007175 0.299635 0.010849 0.999446 0.210189 0.006127 
0.013021 0.019798 0.014509 0.012049 0.035799 0.180085 0.012744 0.026466 +# L M , I , F V ><, W Y C T Q , A P H R , K S E N , D G + +0.0904123 +0.115607 0.037381 0.012414 0.018179 0.051778 0.017255 0.004911 0.796882 0.017074 0.285858 0.075811 0.014548 0.015092 0.011382 0.012696 0.027535 0.088333 0.94434 0.004373 0.016741 +# I V ,, L M >< C T A , F , Y S P W N , E Q K R D G H + +0.114468 +0.093461 0.004737 0.387252 0.347841 0.010822 0.105877 0.049776 0.014963 0.094276 0.027761 0.01004 0.187869 0.050018 0.110039 0.038668 0.119471 0.065802 0.02543 0.003215 0.018742 +# D , E N , Q H S >< K G P T A , R Y , M V L F W I C + +0.0682132 +0.452171 0.114613 0.06246 0.115702 0.284246 0.140204 0.100358 0.55023 0.143995 0.700649 0.27658 0.118569 0.09747 0.126673 0.143634 0.278983 0.358482 0.66175 0.061533 0.199373 +# M , V I L F T Y C A >< W S H Q R N K , P E G , D + +0.234585 +0.005193 0.004039 0.006722 0.006121 0.003468 0.016931 0.003647 0.002184 0.005019 0.00599 0.001473 0.004158 0.009055 0.00363 0.006583 0.003172 0.00369 0.002967 0.002772 0.002686 +# P G W , C H R D E >< N Q K F Y T L A M , S V I + + +## Insert emissions +1 # Single component +1.0 # with probability 1.0 +681 120 623 651 313 902 241 371 687 676 143 548 647 415 551 926 623 505 102 269 diff --git a/forester/archive/RIO/others/hmmer/testsuite/Optiontests.slx b/forester/archive/RIO/others/hmmer/testsuite/Optiontests.slx new file mode 100644 index 0000000..a821515 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/Optiontests.slx @@ -0,0 +1,15 @@ +# A simple amino acid test alignment for Optiontests.pl + +#=RF xxxxx +seq1 ACDEFGHIKLMNPQRSTVWY +seq2 ACDEFGHIKLMNPQRSTVWY +seq3 ACDEFGHIKLMNPQRSTVWY +seq4 ACDEFGHIKLMNPQRSTVWY +seq5 ACDEFGHIKLMNPQRSTVWY +seq6 ACDEFGHIKLMNPQRSTVWY +seq7 ACDEFGHIKLMNPQRSTVWY +seq8 ACDEFGHIKLMNPQRSTVWY + + + + diff --git a/forester/archive/RIO/others/hmmer/testsuite/README b/forester/archive/RIO/others/hmmer/testsuite/README new file mode 100644 index 0000000..4a4e2ac --- 
/dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/README @@ -0,0 +1,82 @@ +Shiva: HMMER testsuite + +##################################################################### +I. Compiled test drivers. +--------------------------------------------------------------------- + +- A test driver runs with no arguments, gives no output, + and returns EXIT_SUCCESS if the test passes. +- If the test fails, the test driver calls Die() to print a diagnostic on + stderr, and exit with EXIT_FAILURE. +- The -v option always activates rudimentary verbose output on stdout. + + +Current tests: +--------------- + +alignalign_test + Exercises P7ViterbiAlignAlignment() -- alignment of a fixed + multiple alignment to an HMM. Aligns fn3 seed alignment to + fn3 model, compares to results of aligning sequences individually; + if an excessive number of discrepancies are detected, test + fails. + Other files: fn3.seed, fn3.hmm. + +evd_test + Exercises code in histogram.c, especially EVD fitting + Default generates 1000 EVD samples; fits EVD; tests that fitted + parameters are reasonably close to real ones. + +masks_test + Exercises code in masks.c + Default runs XNU on a sequence and compares to a known result. + +parsingviterbi_test + Exercises P7ParsingViterbi() in core_algorithms.c + Runs Fn3 model against titin using both standard Viterbi and + ParsingViterbi; compares results for identity. + +tophits_test + Exercises tophits.c + Generates random scores in three tiers (good, middling, bad). + Uses RegisterHit() API; FullSort's them; tests that they + end up in the right number/order. + +trace_test + Exercises traceback code in core_algorithms.c + Runs a simple HMM against synthetic sequences designed to + exercise all possible arrangements of transitions, and + does a TraceVerify() to be sure resulting trace is internally + consistent. 
+ +viterbi_exercise + Exercises P7Viterbi and P7SmallViterbi in core_algorithms.c + Configures Fn3 model into various modes; generates 100 random + seqs from each configuration; does P7Viterbi and P7SmallViterbi + alignments, TraceVerify()'s them, checks them for identity. + +weeviterbi_test + Exercises P7WeeViterbi in core_algorithms.c + Runs RRM model against two subsequences of human U1A; + compares Viterbi trace to WeeViterbi trace. + +##################################################################### +II. Scripted test drivers. +--------------------------------------------------------------------- + +Optiontests.pl + Runs every documented option for every program, to be + sure they're really connected up. + + +##################################################################### +Obsolete: kept in RCS archive only for reproducibility of old results +--------------------------------------------------------------------- + +fitting_test.c Tests of EVD and Gaussian histogram fitting code. + 17 June 1997 + + + + + diff --git a/forester/archive/RIO/others/hmmer/testsuite/alignalign_test.c b/forester/archive/RIO/others/hmmer/testsuite/alignalign_test.c new file mode 100644 index 0000000..99679d2 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/alignalign_test.c @@ -0,0 +1,206 @@ +/* alignalign_test.c + * Sun Jul 5 13:42:41 1998 + * + * Test driver for P7ViterbiAlignAlignment(). + * + * The test is to + * 1) read an alignment and a corresponding HMM + * 2) align the alignment to the HMM to get a master trace + * 3) map the alignment to the HMM to get another master trace + * 4) Test that the two master traces are identical; if not, fail. + * This doesn't have to be true always, but it's true for the + * fn3 test example. 
+ * 5) Get imposed traces for each sequence + * 6) Viterbi align individual seqs to the model; + * compare the imposed trace with the Viterbi trace; + * 7) If an excessive number of individual traces differ from + * those imposed by master, fail. + * + * CVS $Id: alignalign_test.c,v 1.1.1.1 2005/03/22 08:34:49 cmzmasek Exp $ + */ + +#include + +#include "structs.h" +#include "funcs.h" +#include "globals.h" +#include "squid.h" + +static char banner[] = "\ +alignalign_test : testing of P7ViterbiAlignAlignment() code"; + +static char usage[] = "\ +Usage: alignalign_test [-options]\n\ + Available options are:\n\ + -h : help; display this usage info\n\ + -v : be verbose\n\ +"; + +static char experts[] = "\ + --ali : read alignment from \n\ + --hmm : read HMM from \n\ +\n"; + +static struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, + { "-v", TRUE, sqdARG_NONE }, + { "--ali", FALSE, sqdARG_STRING }, + { "--hmm", FALSE, sqdARG_STRING }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +int +main(int argc, char **argv) +{ + char *hmmfile; /* file to read HMM(s) from */ + HMMFILE *hmmfp; /* opened hmmfile for reading */ + struct plan7_s *hmm; /* HMM to search with */ + char *afile; /* file to read alignment from */ + int format; /* format determined for afile */ + MSAFILE *afp; /* afile, open for reading */ + MSA *msa; /* multiple sequence alignment from afile */ + char **rseq; /* raw, dealigned aseq */ + char *dsq; /* digitized target sequence */ + struct p7trace_s *mtr; /* master traceback from alignment */ + struct p7trace_s *maptr; /* master traceback from mapping */ + struct p7trace_s **tr; /* individual tracebacks imposed by mtr */ + struct p7trace_s **itr; /* individual trace from P7Viterbi() */ + int idx; /* counter for seqs */ + int ndiff; /* number of differing traces */ + int rlen; /* length of an unaligned sequence */ + + int be_verbose; + int be_standard; /* TRUE when running standard test */ + + char *optname; /* name of option found by 
Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + + /*********************************************** + * Parse command line + ***********************************************/ + + hmmfile = "fn3.hmm"; + afile = "fn3.seed"; + format = MSAFILE_STOCKHOLM; + be_verbose = FALSE; + be_standard = TRUE; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) { + if (strcmp(optname, "-v") == 0) be_verbose = TRUE; + else if (strcmp(optname, "--ali") == 0) { afile = optarg; be_standard = FALSE; } + else if (strcmp(optname, "--hmm") == 0) { hmmfile = optarg; be_standard = FALSE; } + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(0); + } + } + if (argc - optind != 0) + Die("Incorrect number of arguments.\n%s\n", usage); + + /*********************************************** + * Get one alignment from test file: must be Stockholm format. + ***********************************************/ + + if ((afp = MSAFileOpen(afile, format, NULL)) == NULL) + Die("Alignment file %s could not be opened for reading", afile); + if ((msa = MSAFileRead(afp)) == NULL) + Die("Didn't read an alignment from %s", afile); + MSAFileClose(afp); + + for (idx = 0; idx < msa->nseq; idx++) + s2upper(msa->aseq[idx]); + DealignAseqs(msa->aseq, msa->nseq, &rseq); + + /*********************************************** + * Open HMM file + * Read a single HMM from it. + ***********************************************/ + + if ((hmmfp = HMMFileOpen(hmmfile, NULL)) == NULL) + Die("Failed to open HMM file %s\n", hmmfile); + if (!HMMFileRead(hmmfp, &hmm)) + Die("Failed to read any HMMs from %s\n", hmmfile); + if (hmm == NULL) + Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile); + P7Logoddsify(hmm, TRUE); + + if (! 
(hmm->flags & PLAN7_MAP)) + Die("HMM in %s has no map", hmmfile); + if (GCGMultchecksum(msa->aseq, msa->nseq) != hmm->checksum) + Die("Checksum for alignment in %s does not match that in HMM (%d != %d)", + afile, GCGMultchecksum(msa->aseq, msa->nseq), hmm->checksum); + + /*********************************************** + * First test: + * mapped alignment should match re-aligned alignment: + * obtain and compare the two master traces + ***********************************************/ + + mtr = P7ViterbiAlignAlignment(msa, hmm); + maptr = MasterTraceFromMap(hmm->map, hmm->M, msa->alen); + if (! TraceVerify(mtr, hmm->M, msa->alen)) + Die("Trace verify on P7ViterbiAlignAlignment() result failed\n"); + if (! TraceVerify(maptr, hmm->M, msa->alen)) + Die("Trace verify on MasterTraceFromMap() result failed\n"); + if (! TraceCompare(mtr, maptr)) + Die("Master traces differ for alignment versus map\n"); + + /************************************************** + * Second test: + * seq traces implied by mapped alignment should generally match + * re-aligned individual sequences. + ***************************************************/ + + ImposeMasterTrace(msa->aseq, msa->nseq, mtr, &tr); + + itr = MallocOrDie(sizeof(struct p7trace_s *) * msa->nseq); + /* align individuals, compare traces */ + ndiff = 0; + for (idx = 0; idx < msa->nseq; idx++) + { + rlen = strlen(rseq[idx]); + dsq = DigitizeSequence(rseq[idx], rlen); + P7Viterbi(dsq, rlen, hmm, &(itr[idx])); + + if (! TraceCompare(itr[idx], tr[idx])) + ndiff++; + free(dsq); + } + + /* Determine success/failure. 
+ */ + if (ndiff > msa->nseq / 2) + Die("alignalign: Test FAILED; %d/%d differ\n", ndiff, msa->nseq); + + if (be_standard) { + if (ndiff != 12) + Die("alignalign: Test FAILED; %d traces differ, should be 12\n", ndiff); + if (msa->nseq != 109) + Die("alignalign: Test FAILED; %d seqs read, should be 109\n", msa->nseq); + } + + if (be_verbose) printf("alignalign: Test passed; %d/%d differ, as expected\n", + ndiff, msa->nseq); + + /* Cleanup. + */ + P7FreeTrace(mtr); + P7FreeTrace(maptr); + for (idx = 0; idx < msa->nseq; idx++) + { + P7FreeTrace(tr[idx]); + P7FreeTrace(itr[idx]); + } + free(tr); + free(itr); + Free2DArray((void **) rseq, msa->nseq); + MSAFree(msa); + FreePlan7(hmm); + SqdClean(); + + return EXIT_SUCCESS; +} diff --git a/forester/archive/RIO/others/hmmer/testsuite/evd_test.c b/forester/archive/RIO/others/hmmer/testsuite/evd_test.c new file mode 100644 index 0000000..5a2446e --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/evd_test.c @@ -0,0 +1,295 @@ +/* evd_test.c + * SRE, Wed Nov 12 11:17:27 1997 [St. Louis] + * + * Test driver for EVD distribution support in histogram.c + * Generates random EVD samples; fits them; checks fitted mu, lambda + * against parametric mu, lambda. If they differ badly, calls Die(). + * If OK, returns EXIT_SUCCESS. 
+ * + * RCS $Id: evd_test.c,v 1.1.1.1 2005/03/22 08:34:45 cmzmasek Exp $ + */ + + +#include <stdio.h> +#include <time.h> /* time(): default random seed */ +#include <math.h> /* fabs(): tolerance checks */ + +#include "structs.h" +#include "funcs.h" +#include "globals.h" +#include "squid.h" + +#ifdef MEMDEBUG +#include "dbmalloc.h" +#endif + +static char banner[] = "\ +evd_test : testing of EVD code in histogram.c"; + +static char usage[] = "\ +Usage: evd_test [-options]\n\ + Available options are:\n\ + -h : help; display this usage info\n\ + -c <x> : censor data below <x>\n\ + -e <n> : sample <n> times from EVD\n\ + -g <n> : add <n> Gaussian samples of \"noise\"\n\ + -n <n> : set number of trials to <n>\n\ + -s <n> : set random seed to <n>\n\ + -v : be verbose (default is to simply exit with status 1 or 0)\n\ +"; + +static char experts[] = "\ + --xmgr <file> : save graphical data to <file>\n\ + --hist : fit to histogram instead of raw samples\n\ + --loglog <file> : save log log regression line to <file>\n\ + --regress : do old-style linear regression fit, not ML\n\ + --mu <x> : set EVD mu to <x>\n\ + --lambda <x> : set EVD lambda to <x>\n\ + --mean <x> : set Gaussian mean to <x>\n\ + --sd <x> : set Gaussian std. dev.
to <x>\n\ +\n"; + +static struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, + { "-c", TRUE, sqdARG_FLOAT }, + { "-e", TRUE, sqdARG_INT }, + { "-g", TRUE, sqdARG_INT }, + { "-n", TRUE, sqdARG_INT }, + { "-s", TRUE, sqdARG_INT }, + { "-v", TRUE, sqdARG_NONE }, + { "--xmgr", FALSE, sqdARG_STRING}, + { "--hist", FALSE, sqdARG_NONE}, + { "--loglog", FALSE, sqdARG_STRING}, + { "--regress",FALSE, sqdARG_NONE}, + { "--mu", FALSE, sqdARG_FLOAT}, + { "--lambda", FALSE, sqdARG_FLOAT}, + { "--mean", FALSE, sqdARG_FLOAT}, + { "--sd", FALSE, sqdARG_FLOAT}, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +int +main(int argc, char **argv) +{ + struct histogram_s *h; /* histogram structure */ + int ntrials; /* number of different fits */ + int be_verbose; /* option: TRUE to show output */ + int seed; /* option: random number seed */ + int nevd; /* # of samples from EVD */ + float mu; /* EVD mu parameter */ + float lambda; /* EVD lambda parameter */ + int ngauss; /* # of samples from Gaussian */ + float mean; /* Gaussian "noise" mean */ + float sd; /* Gaussian "noise" std. dev. */ + float x; /* a random sample */ + int i, idx; /* i: sample draw counter; idx: samples kept in val[] */ + float *val; /* array of samples */ + float mlmu; /* estimate of mu */ + float mllambda; /* estimate of lambda */ + + char *xmgrfile; /* output file for XMGR graph data */ + char *logfile; /* output file for regression line */ + FILE *xmgrfp; /* open output file */ + FILE *logfp; /* open log log file */ + int do_ml; /* TRUE to do a max likelihood fit */ + int fit_hist; /* TRUE to fit histogram instead of samples */ + int censoring; /* TRUE to left-censor the data */ + float censorlevel; /* value to censor at */ + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + +#ifdef MEMDEBUG + unsigned long histid1, histid2, orig_size, current_size; + orig_size = malloc_inuse(&histid1); + fprintf(stderr, "[...
memory debugging is ON ...]\n"); +#endif + + /*********************************************** + * Parse command line + ***********************************************/ + be_verbose = FALSE; + seed = (int) time ((time_t *) NULL); + ntrials = 1; + nevd = 1000; + mu = -20.0; + lambda = 0.4; + ngauss = 0; + mean = 20.; + sd = 20.; + xmgrfile = NULL; + logfile = NULL; + xmgrfp = NULL; + logfp = NULL; + do_ml = TRUE; + censoring = FALSE; + censorlevel= 0.; + fit_hist = FALSE; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) { + if (strcmp(optname, "-e") == 0) { nevd = atoi(optarg); } + else if (strcmp(optname, "-c") == 0) { censoring = TRUE; + censorlevel= atof(optarg); } + else if (strcmp(optname, "-g") == 0) { ngauss = atoi(optarg); } + else if (strcmp(optname, "-n") == 0) { ntrials = atoi(optarg); } + else if (strcmp(optname, "-s") == 0) { seed = atoi(optarg); } + else if (strcmp(optname, "-v") == 0) { be_verbose = TRUE; } + else if (strcmp(optname, "--xmgr") == 0) { xmgrfile = optarg; } + else if (strcmp(optname, "--hist") == 0) { fit_hist = TRUE; } + else if (strcmp(optname, "--loglog") == 0) { logfile = optarg; } + else if (strcmp(optname, "--regress")== 0) { do_ml = FALSE; } + else if (strcmp(optname, "--mu") == 0) { mu = atof(optarg); } + else if (strcmp(optname, "--lambda") == 0) { lambda = atof(optarg); } + else if (strcmp(optname, "--mean") == 0) { mean = atof(optarg); } + else if (strcmp(optname, "--sd") == 0) { sd = atof(optarg); } + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(0); + } + } + if (argc - optind != 0) + Die("Incorrect number of arguments.\n%s\n", usage); + + sre_srandom(seed); + + /**************************************************************** + * Print options + ****************************************************************/ + + if (be_verbose) + { + puts("--------------------------------------------------------"); + printf("EVD samples = 
%d\n", nevd); + printf("mu, lambda = %f, %f\n", mu, lambda); + if (ngauss > 0) { + printf("Gaussian noise = %d\n", ngauss); + printf("mean, sd = %f, %f\n", mean, sd); + } + if (censoring) printf("pre-censoring = ON, at %f\n", censorlevel); + printf("total trials = %d\n", ntrials); + printf("random seed = %d\n", seed); + printf("fit method = %s\n", do_ml ? "ML" : "linear regression"); + printf("fit is to = %s\n", fit_hist ? "histogram" : "list"); + puts("--------------------------------------------------------"); + } + + if (xmgrfile != NULL) + if ((xmgrfp = fopen(xmgrfile, "w")) == NULL) + Die("Failed to open output file %s", xmgrfile); + if (logfile != NULL) + if ((logfp = fopen(logfile, "w")) == NULL) + Die("Failed to open output file %s", logfile); + + /* Generate random EVD "signal" (and Gaussian "noise") + * samples and put them in the histogram + */ + while (ntrials-- > 0) /* > 0: a negative -n runs no trials rather than counting down past zero */ + { + val = MallocOrDie(sizeof(float) * (nevd+ngauss)); /* val[] holds floats; room for every draw */ + h = AllocHistogram(-20, 20, 10); + + /* EVD signal */ + idx = 0; + for (i = 0; i < nevd; i++) + { + x = EVDrandom(mu, lambda); + if (! censoring || x > censorlevel) + { + AddToHistogram(h, x); + val[idx] = x; + idx++; + } + } + /* Gaussian noise */ + for (; i < nevd + ngauss; i++) + { + x = Gaussrandom(mean, sd); + if (!
censoring || x > censorlevel) + { + AddToHistogram(h, x); + val[idx] = x; + idx++; + } + } + + if (do_ml) + { + + if (censoring) + { + if (be_verbose) + printf("I have censored the data at %f: %d observed, %d censored\n", censorlevel, idx, (nevd+ngauss)-idx); + + EVDCensoredFit(val, NULL, idx, + (nevd+ngauss)-idx, censorlevel, + &mlmu, &mllambda); + ExtremeValueSetHistogram(h, (float) mlmu, (float) mllambda, + censorlevel, h->highscore, 1); + } + else + { + if (fit_hist) + { + ExtremeValueFitHistogram(h, TRUE, 20.); + } + else + { + EVDMaxLikelyFit(val, NULL, idx, &mlmu, &mllambda); + ExtremeValueSetHistogram(h, (float) mlmu, (float) mllambda, + h->lowscore, h->highscore, 2); + } + } + } + else + EVDBasicFit(h); + + /* Remember the fit now. ExtremeValueSetHistogram() stores the mu/lambda it + * is given in h->param[] (the ML branches above rely on that), so the + * "expected lines" call further down replaces the fitted values with the + * true ones; testing h->param[] after it would compare mu with itself. + */ + mlmu = h->param[EVD_MU]; + mllambda = h->param[EVD_LAMBDA]; + + if (be_verbose) { + printf("%f\tmu\n", h->param[EVD_MU]); + printf("%f\tlambda\n", h->param[EVD_LAMBDA]); + printf("%f\t%% error on mu\n", + fabs(100. * (h->param[EVD_MU] - mu) / mu)); + printf("%f\t%% error on lambda\n", + fabs(100. * (h->param[EVD_LAMBDA] - lambda) / lambda)); + printf("%f\tchi-squared P value\n", h->chip); + } + if (xmgrfp != NULL) PrintXMGRHistogram(xmgrfp, h); + /* if (xmgrfp != NULL) PrintXMGRDistribution(xmgrfp, h); */ + if (logfp != NULL) PrintXMGRRegressionLine(logfp, h); + + /* Generate the expected lines: sets 5,7 of xmgrfile (manually delete 4,6) + * set 3 of loglogfile (manually delete 2) + */ + ExtremeValueSetHistogram(h, mu, lambda, h->lowscore, h->highscore, 0); + if (xmgrfp != NULL) PrintXMGRHistogram(xmgrfp, h); + /* if (xmgrfp != NULL) PrintXMGRDistribution(xmgrfp, h); */ + if (logfp != NULL) PrintXMGRRegressionLine(logfp, h); + + /* Do the internal test, on the fitted values saved before the overwrite. + * Criterion: on a 1000 sample EVD of mu = -20 and lambda = 0.4 (the defaults), + * estimate mu to within +/- 2 and lambda to within +/- 0.05. + */ + if (fabs(mlmu - mu) > 2.)
+ Die("evd_test: tolerance to mu exceeded (%f)", + fabs(mlmu - mu)); + if (fabs(mllambda - lambda) > 0.05) + Die("evd_test: tolerance to lambda exceeded (%f)", + fabs(mllambda - lambda)); + + FreeHistogram(h); + free(val); + } + +#ifdef MEMDEBUG + current_size = malloc_inuse(&histid2); + if (current_size != orig_size) Die("evd_test failed memory test"); + else fprintf(stderr, "[No memory leaks.]\n"); +#endif + + if (xmgrfp != NULL) fclose(xmgrfp); + if (logfp != NULL) fclose(logfp); + return EXIT_SUCCESS; +} diff --git a/forester/archive/RIO/others/hmmer/testsuite/fitting_test.c b/forester/archive/RIO/others/hmmer/testsuite/fitting_test.c new file mode 100644 index 0000000..5eefdfb --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/fitting_test.c @@ -0,0 +1,75 @@ +/* fitting_test.c + * 17 June 1997 (see notebook) + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <math.h> +#include <assert.h> + +#include "structs.h" +#include "funcs.h" +#include "squid.h" + +#include "globals.h" + +int +main(int argc, char **argv) +{ + int n; /* number of histograms to generate */ + float p1, p2; + struct histogram_s *histog; + int i,j; + float x; + int seed; + int do_evd, set, fit_evd, show_hist; + + /* argv[1..7] are read unconditionally below, so insist on all seven */ + if (argc < 8) + Die("Too few arguments.\nUsage: fitting_test <p1> <p2> <n> <do_evd> <set> <fit_evd> <show_hist>\n"); + + p1 = atof(argv[1]); /* mu or mean */ + p2 = atof(argv[2]); /* lambda or sd */ + n = atoi(argv[3]); /* # of histograms */ + do_evd = atoi(argv[4]); /* 1 to sample EVD; 0 to sample Gaussian */ + set = atoi(argv[5]); /* 1 to set instead of fit the dist */ + fit_evd = atoi(argv[6]); /* 1 to fit EVD; 0 to fit Gaussian */ + show_hist = atoi(argv[7]); /* 1 to show histogram */ + + seed = (int) time ((time_t *) NULL); + sre_srandom(seed); + + for (j = 0; j < n; j++) + { + histog = AllocHistogram(-200, 200, 100); + for (i = 0; i < 2500; i++) + { + if (do_evd) x = EVDrandom(p1, p2); + else x = Gaussrandom(p1, p2); + + assert(x > -100.); + assert(x < 100.); + AddToHistogram(histog, x); + } + + if (set && fit_evd) + ExtremeValueSetHistogram(histog, p1, p2); /* NOTE(review): evd_test.c calls ExtremeValueSetHistogram() with six arguments and ExtremeValueFitHistogram() with three; the shorter calls in this file look stale -- confirm against funcs.h */ + else if (set
&& !fit_evd) + GaussianSetHistogram(histog, p1, p2); + else if (!set && fit_evd) + ExtremeValueFitHistogram(histog, 9999.); + else + GaussianFitHistogram(histog, 9999.); + + printf("%f\n", histog->chip); + + if (show_hist) + PrintASCIIHistogram(stdout, histog); + + FreeHistogram(histog); + } + + return 0; +} diff --git a/forester/archive/RIO/others/hmmer/testsuite/fn3-bin b/forester/archive/RIO/others/hmmer/testsuite/fn3-bin new file mode 100644 index 0000000..59163b3 Binary files /dev/null and b/forester/archive/RIO/others/hmmer/testsuite/fn3-bin differ diff --git a/forester/archive/RIO/others/hmmer/testsuite/fn3-bin-swap b/forester/archive/RIO/others/hmmer/testsuite/fn3-bin-swap new file mode 100644 index 0000000..8074209 Binary files /dev/null and b/forester/archive/RIO/others/hmmer/testsuite/fn3-bin-swap differ diff --git a/forester/archive/RIO/others/hmmer/testsuite/fn3.hmm b/forester/archive/RIO/others/hmmer/testsuite/fn3.hmm new file mode 100644 index 0000000..ed949cb --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/fn3.hmm @@ -0,0 +1,270 @@ +HMMER2.0 [2.1.4] +NAME fn3 +LENG 84 +ALPH Amino +RF no +CS no +MAP yes +COM ../binaries/hmmbuild -F fn3.hmm fn3.seed +NSEQ 109 +DATE Sat Apr 29 15:36:08 2000 +CKSUM 9857 +XT -8455 -4 -1000 -1000 -8455 -4 -8455 -4 +NULT -4 -8455 +NULE 595 -1558 85 338 -294 453 -1158 197 249 902 -1085 -142 -21 -313 45 531 201 384 -1998 -644 +HMM A C D E F G H I K L M N P Q R S T V W Y + m->m m->i m->d i->m i->i d->m d->d b->m m->e + -13 * -6769 + 1 -1712 -4227 -5498 -865 -4208 -2901 -1274 -566 -2467 395 -3420 -4836 3619 -1858 -4835 -1203 -1345 -131 -4660 -1520 1 + - -150 -501 232 46 -382 399 104 -628 211 -461 -722 274 395 44 95 358 118 -368 -296 -251 + - -142 -3413 -12964 -19 -6286 -701 -1378 -13 * + 2 -626 -5402 1665 -881 -5720 541 -3570 -5469 -3152 -906 -4492 -539 1858 -1555 -2021 1928 -595 -1313 -5587 -1487 3 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11922 -12964 
-894 -1115 -701 -1378 * * + 3 1982 -5408 -2052 103 -5729 -1682 -3568 -5480 246 -3106 -4497 -1099 2207 785 -341 -15 -969 -591 -5592 -1728 4 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11922 -12964 -894 -1115 -701 -1378 * * + 4 -1926 -4681 -2749 -6730 -4830 -6496 -5599 -144 -6364 -1674 -3977 -6224 3833 -6035 -1363 -5637 -2263 1172 -5463 -5108 5 + - -148 -501 232 42 -382 397 104 -620 209 -460 -713 274 394 44 98 358 116 -371 -296 -251 + - -124 -3599 -12990 -21 -6130 -701 -1378 * * + 5 -1312 -699 -1390 365 -156 656 -278 -5500 196 -1273 -1315 -1810 -383 1165 1059 988 1349 -1727 -330 -1786 7 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -14 -11948 -6745 -894 -1115 -701 -1378 * * + 6 -952 -5420 1492 -589 -2060 995 -241 -588 -1239 -1246 -4509 2702 -765 546 117 -1666 -1058 -5042 -5603 -1781 8 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11935 -12977 -894 -1115 -373 -2136 * * + 7 -1910 -4186 -1403 -683 554 -5930 -4807 398 -5685 2189 658 -5576 664 -5305 -5486 -5017 -4309 1814 -4661 -4323 9 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -22 -11948 -6043 -894 -1115 -701 -1378 * * + 8 -1843 -858 -1320 -329 -1890 -1639 2015 -1004 837 -3190 -4500 -528 -5007 1351 1283 933 1453 118 -5594 -4912 10 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11926 -12968 -894 -1115 -288 -2468 * * + 9 1390 190 -6595 -5959 1799 -625 -4670 917 -2544 -64 -175 -5444 -2246 -521 -2424 -1906 -2445 2071 -270 -4193 11 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 10 -1212 -976 -129 110 -5720 -2845 -164 383 351 -777 230 -1215 -2266 -366 198 1223 1864 -91 -385 -1640 12 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 
-12990 -894 -1115 -701 -1378 * * + 11 357 -5433 1583 1276 -488 484 -1040 -5505 -1949 -5449 -4522 1509 419 -407 -723 -10 -541 -211 -5616 -1543 13 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -54 -11948 -4764 -894 -1115 -701 -1378 * * + 12 -185 -872 -1228 -815 -5284 -4981 -3657 1656 -1247 39 -279 70 -732 313 214 -878 364 1866 -5347 -4747 14 + - -147 -503 232 43 -379 402 103 -625 210 -467 -723 275 395 42 93 360 117 -369 -297 -247 + - -243 -2693 -12936 -347 -2227 -156 -3286 * * + 13 -1722 -5433 -171 -632 -5754 984 -3592 -5504 -1911 -2306 -1336 1274 -5027 757 52 1088 2327 -5055 -461 -684 17 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 14 545 -5433 984 806 -5754 -286 -134 -5505 -1324 -3076 -4522 -291 1593 -1508 -1328 1511 638 -1810 257 -1616 18 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11948 -6839 -894 -1115 -701 -1378 * * + 15 -3956 -5429 1465 -1445 -5750 -533 -126 -5501 -1258 -5445 -4519 1773 -5022 -3128 -91 1187 2378 -1884 -5613 197 19 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11935 -12978 -894 -1115 -384 -2097 * * + 16 -620 -5433 -1185 95 -2048 -386 -1155 -5505 -255 -5449 -4522 345 -5027 -387 -680 2481 1173 -1235 -453 371 20 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 17 545 -4080 -6600 -5964 979 -2741 -1162 1755 -5559 1775 1268 -5448 -5852 -5182 -5359 -4887 -2301 1502 -4538 -1461 21 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 18 -945 -842 -1435 -275 -1139 -2672 314 -366 -905 -66 -427 -211 -2235 675 461 548 1698 1036 -558 -670 22 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 
-11948 -12990 -894 -1115 -701 -1378 * * + 19 -1902 -670 -6964 -6348 188 -2623 -5112 1707 -5963 1935 -237 -5851 -6235 -5596 -5776 -5302 -2390 2324 -4960 -4615 23 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 20 -657 -5433 -2278 72 -5754 -1781 1065 -5505 -429 -1282 -368 683 -5027 935 654 2126 1335 -5055 -468 -4933 24 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 21 -8231 -7330 -9169 -9336 78 -9016 -5428 -2450 -8949 -2065 -5190 -7783 -2409 -7649 -8320 -8300 -8080 -6572 6181 -3498 25 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 22 -1358 -5432 305 919 -2019 -2615 -276 -1003 511 -826 -1303 41 -2076 1069 -82 1302 1506 -1283 -5615 -1694 26 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 23 1017 -5398 -3827 -347 -5703 -777 618 -939 30 -1608 -4491 -1916 2739 -944 109 -308 -293 -499 474 -1759 27 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 24 7 -5362 -1347 -1387 -5651 -1257 -3617 -2388 -460 -727 -1300 -1005 3338 -1501 -3712 558 -1388 -154 -5565 -4903 28 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -649 -11948 -1466 -894 -1115 -701 -1378 * * + 25 -313 -4824 738 903 489 -1961 -319 -1743 600 -730 -3913 -512 1182 -1049 456 69 1061 -1200 -5008 -1017 29 + - -149 -513 256 52 -380 388 111 -628 204 -474 -734 271 397 57 90 357 118 -370 -308 -236 + - -2828 -458 -2930 -849 -1169 -4461 -67 * * + 26 -1035 -178 227 -2617 -4671 2074 534 -1574 -1120 -142 -607 230 1117 -92 -7 -60 -1463 -921 262 -829 34 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 
-249 + - -1 -11120 -12162 -894 -1115 -167 -3197 * * + 27 -857 -5371 794 -103 -5693 997 431 -981 -785 -3041 -4461 2028 -43 581 354 793 -605 -1682 -313 -1481 35 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11883 -12925 -894 -1115 -135 -3482 * * + 28 586 -5425 -81 -53 -5742 2298 -3595 -991 -1451 -2142 -1249 -623 -2318 -1468 -5 1095 -941 -277 -5610 -1606 36 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 29 -210 -5433 989 1029 -5754 -1116 353 -1014 171 -3117 -1178 -275 2115 -43 -830 -503 318 111 -5616 -1638 37 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 30 -1068 -4078 -359 -5958 1263 -1799 -4669 2510 -5554 276 -910 -2229 -375 -1944 -5354 -2555 -2191 1553 549 775 38 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 31 -980 -681 1466 40 -545 -5057 -3737 -672 -560 -87 -1269 -558 -5145 282 -90 219 2049 -46 -417 -1694 39 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 32 -1302 -5429 -440 -620 -1239 2220 1313 -2398 434 -3100 -1104 805 -5028 -433 316 596 -1027 -1916 -565 522 40 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 33 -8272 -7332 -8886 -9158 2064 -8716 -1014 -1559 -1656 -2163 -6667 -7396 -8586 -7513 -8120 -7948 -8139 -1736 -299 4507 41 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 34 -1292 -5430 -146 1338 -556 -2813 -206 -371 415 -197 -4519 165 -5028 278 1599 253 380 -304 -5614 -262 42 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 
+ - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 35 -2855 -5757 -8978 -8661 -6170 -8865 -8890 2503 -8646 1015 -1233 -8521 -8521 -8496 -8785 -8267 -6272 2826 -8027 -7630 43 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -85 -11948 -4130 -894 -1115 -701 -1378 * * + 36 -740 -868 121 1136 -1117 -4863 456 -330 599 -1234 -4412 -3505 -4956 1505 886 120 1536 -1114 -5511 -101 44 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11863 -12905 -894 -1115 -109 -3778 * * + 37 -932 1837 -6595 -5959 -91 -5798 -328 -291 -407 -8 -3280 -5444 -5848 -5178 -795 -2551 -988 502 2740 3658 45 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1993 -11948 -418 -894 -1115 -701 -1378 * * + 38 -4510 -5545 -3323 -676 -6286 -4728 2257 -5751 -2041 -5420 -4762 -420 -4944 -2780 3841 -989 -4331 -5444 -5302 -5022 46 + - -149 -489 232 45 -381 398 105 -627 210 -467 -711 277 393 48 95 359 117 -370 -295 -250 + - -1797 -3160 -736 -733 -1328 -10 -7215 * * + 39 -147 363 -2007 304 -918 -1103 -3478 -161 1080 -715 -4049 -1617 -2050 2003 -3601 557 -702 -321 2877 644 49 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -16 -11712 -6548 -894 -1115 -359 -2184 * * + 40 -855 -930 550 1092 -5698 -1653 286 -2500 552 -1710 226 346 1948 343 -1203 414 429 -794 -5560 -4877 50 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11889 -12931 -894 -1115 -2701 -241 * * + 41 181 -153 516 463 -5664 -1662 -1038 -48 1045 -254 -194 -3524 143 556 -1145 61 726 794 -5544 -4868 51 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -116 -11889 -3697 -894 -1115 -146 -3378 * * + 42 -3855 -5327 1599 16 -1927 1687 -1100 -5398 -1840 -1553 -4416 2295 -4922 -736 263 13 -364 -765 -5511 -1538 52 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 
-249 + - -1 -11837 -12879 -894 -1115 -87 -4089 * * + 43 -638 -5433 1340 850 -5754 1040 735 -2398 522 -1339 -1395 491 -5027 -79 -737 897 693 -1779 -5616 -4933 53 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 44 -972 -5410 -1362 621 -2330 1671 -968 -1642 -952 -1007 223 1495 819 1092 -3690 -29 -326 -652 -5600 -685 54 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -155 -11948 -3299 -894 -1115 -701 -1378 * * + 45 -2194 -5287 1087 1415 -5608 870 -907 -2395 349 -427 -4376 -492 291 1176 -317 355 -70 -2410 771 -4787 55 + - -148 -444 225 63 -389 400 108 -622 197 -475 -727 259 408 43 103 350 102 -361 -222 -237 + - -1290 -759 -12836 -1725 -520 -1218 -811 * * + 46 -157 337 519 798 -1912 -1225 -68 -653 114 -297 -291 -1697 725 -732 -1940 -255 36 -19 3575 -4817 66 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11858 -12900 -894 -1115 -104 -3844 * * + 47 -354 -5019 -821 -3545 401 -5079 1252 159 969 -224 -1342 1435 -694 1532 487 -1195 179 115 -703 -1756 67 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 48 -1345 -5433 -1224 1817 -5754 -2557 2299 -2518 -1322 -5449 -1196 460 -410 547 474 477 796 -85 1521 -203 68 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 49 -735 -922 -865 99 1424 -1378 -968 800 125 368 -298 -1091 -5130 1044 487 -1699 -652 820 -5380 495 69 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 50 -2548 -5416 476 -926 -786 -4939 -202 333 -743 -1027 220 1772 -491 302 391 547 1705 -327 -5604 -4926 70 + - -146 -501 231 42 -382 399 104 -628 211 -467 -722 274 394 47 94 361 116 -362 -296 -251 + - -54 -4771 -12990 -1430 
-669 -701 -1378 * * + 51 -125 -4491 -206 -447 -1977 -1822 -4130 1477 -2342 582 -1108 -162 -2226 -3990 -445 -1290 -16 2224 -4885 -4452 74 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 52 -392 -879 1247 -518 -1099 -606 -233 -5499 1 -2146 -4519 -1046 2266 -3134 -723 1223 288 -67 -5614 -4932 75 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 53 110 -5432 -786 -62 -5753 1386 -3592 -2570 485 -1099 -4521 1153 1199 -1503 1373 399 -3899 -2642 1101 -1717 76 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 54 -933 -5432 792 -584 -5753 -910 1029 -2570 -3173 -5448 -4522 1505 989 185 -764 1043 1675 261 -5616 -1615 77 + - -149 -502 231 43 -383 404 104 -628 208 -461 -722 276 392 47 94 357 121 -371 -296 -251 + - -62 -4587 -12990 -1650 -554 -701 -1378 * * + 55 528 -5430 -2070 1507 -5750 -4935 -27 -1606 194 -768 -4520 -3570 -147 1018 -439 182 1977 -290 -5614 -4932 82 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -531 -11948 -1701 -894 -1115 -701 -1378 * * + 56 -1241 -4929 424 -414 -37 -713 1149 -537 -799 -2452 153 1422 -1051 -431 507 353 1641 -1262 -5114 735 83 + - -149 -490 230 45 -378 398 103 -621 208 -466 -723 273 391 43 104 358 121 -372 -297 -252 + - -348 -2223 -12460 -101 -3887 -730 -1332 * * + 57 -1103 -5244 -3635 529 -972 -961 40 -2282 -714 -1986 -1144 223 -1119 761 812 1888 1499 -2270 -5430 -107 86 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11760 -12802 -894 -1115 -56 -4726 * * + 58 990 1359 -2412 -5961 1171 -2669 1032 -583 -5555 -220 684 -5445 -5849 -5178 -2256 -4883 -79 1469 -497 2841 87 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 
-1115 -701 -1378 * * + 59 -1740 -5388 -543 238 -2017 -4947 205 658 -59 -940 305 334 -2308 69 -686 -619 2386 690 -5584 -1963 88 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 60 -4313 -4136 -693 -6024 1490 -5866 -4738 1755 -5621 1797 -1007 -5512 -5911 -452 -2279 -2669 -4253 1969 -4597 -4258 89 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 61 -1206 -5433 -67 557 -1180 -1815 996 -5504 429 -3083 -279 290 -135 714 105 843 1912 -622 -475 -1615 90 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 62 -2512 -5433 1789 -331 -5754 2420 -115 -2442 -207 -5449 -1143 1986 -2076 -952 -2025 -549 -3899 -5055 -5616 -1656 91 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 63 -2664 -4761 -2431 -6696 -927 -6570 -5431 -1266 -6307 3192 -228 -6220 -6539 -2095 -6083 -5670 -2437 -1081 -5161 -4890 92 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 64 -1265 -5432 -786 696 -5753 -2574 -190 -709 1120 -437 -1279 2 -869 1117 1301 -183 955 122 -5616 -235 93 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 65 667 -5360 -3850 234 -5648 -4957 -3619 -5375 -1411 -5362 -1250 -1129 3641 -944 -3714 -1681 -282 -2598 -323 -1676 94 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 66 -891 -5431 642 -1327 -637 2698 1447 -5500 -1220 -3194 -4520 1005 -5028 -1534 -1185 -106 -2252 -2539 -453 719 95 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - 
-1 -11948 -12990 -894 -1115 -701 -1378 * * + 67 -341 -924 -3935 -309 -5457 -5000 -291 -1612 -499 -2284 675 -1099 -5091 -887 1180 235 2750 669 -5465 -4840 96 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 68 -728 -5432 601 1421 -1172 -1229 -74 -1011 998 -1077 -298 499 243 679 186 -154 551 -1728 -344 -4933 97 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 69 -8596 -7506 -8959 -9321 -422 -8843 -5022 -7480 -8878 -3146 -6880 -2244 -8697 -7593 -8237 -8093 -8445 -7638 -4268 4885 98 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 70 -1827 -5418 -2102 1225 -1065 -2710 630 -696 193 -993 -1227 1015 -5031 784 -68 445 1395 673 -5606 -199 99 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 71 -1785 -589 -6706 -6075 2739 -2734 -4797 1908 -5675 95 263 -5564 -5962 -5299 -5478 -5006 -4295 2092 -4657 643 100 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 72 -1787 144 -783 -139 -5753 -2709 -260 -1568 913 -5448 -373 39 -5027 1175 2142 801 910 -1336 1128 173 101 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 73 -1872 -5834 -9025 -8676 -1056 -8881 -8673 1755 -8630 359 -1237 -8541 -8490 -8319 -8674 -8265 -6337 3280 -7727 -7464 102 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 74 327 -924 -2319 -1494 -173 -4998 230 316 472 -1019 -4344 -1099 -5088 1874 1193 -1265 439 611 -5469 1668 103 + - -149 -500 233 43 -381 398 105 -626 210 -462 -721 275 394 45 96 359 117 
-369 -295 -250 + - -42 -5742 -6616 -86 -4119 -701 -1378 * * + 75 3012 -4665 -7117 -2715 -5019 -382 -5509 -2260 -6298 -1487 -4254 -5905 -6197 -5917 -6115 536 1246 -967 -5505 -5179 105 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11933 -12975 -894 -1115 -357 -2190 * * + 76 -902 -714 -6363 -469 -141 -5764 290 585 -447 281 -993 -5334 -5816 439 1001 -1825 -1361 2268 -4558 1019 106 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 77 -2376 -795 187 -2095 -5752 -2675 -955 -1144 135 -3116 -4521 2792 -767 -1566 -777 1498 1246 -2809 -5615 251 107 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -14 -11948 -6747 -894 -1115 -701 -1378 * * + 78 337 -737 928 157 -2106 1805 -240 -2485 -205 -5436 -4509 308 -2438 892 51 -133 409 -284 -5604 -4921 108 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11935 -12977 -894 -1115 -373 -2135 * * + 79 751 -879 627 278 -2228 932 413 207 367 -1700 -291 502 -2136 495 118 -1224 -556 59 -5609 -760 109 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -746 -11948 -1309 -894 -1115 -701 -1378 * * + 80 -1113 -4530 -1626 -702 -1338 3243 -3172 -4385 -2856 -2220 -3677 -1218 -4564 -2783 -3335 -609 475 -4087 -4819 473 110 + - -153 -500 231 50 -364 398 93 -624 206 -464 -733 283 383 46 94 356 116 -368 -250 -237 + - -6367 -18 -12245 -485 -1807 -3943 -97 * * + 81 -5380 -6068 -5956 -5195 -7389 3194 -752 -6800 -1418 -2482 -5840 -5064 -706 -4120 -1351 1839 -5345 -6386 -6418 -6291 116 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11248 -12290 -894 -1115 -270 -2549 * * + 82 55 -5343 -399 930 -1066 -48 1295 -2492 90 -1643 1044 -891 1912 1116 368 -1154 -891 -1051 -5526 -1841 117 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 
96 359 117 -369 -294 -249 + - -1 -11854 -12896 -894 -1115 -101 -3893 * * + 83 90 -5310 -3879 1738 8 -1225 -3636 -1523 -271 -615 -1216 -1875 1773 -1639 653 -366 -193 -4902 2379 453 118 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -1 -11948 -12990 -894 -1115 -701 -1378 * * + 84 -1223 -5446 -3828 -1467 -5768 -863 -152 -2326 -1293 -5461 -4537 -3588 -2253 -1611 -424 3277 371 -5069 -5628 -4949 119 + - * * * * * * * * * * * * * * * * * * * * + - * * * * * * * * 0 +// diff --git a/forester/archive/RIO/others/hmmer/testsuite/fn3.seed b/forester/archive/RIO/others/hmmer/testsuite/fn3.seed new file mode 100644 index 0000000..aef27c1 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/fn3.seed @@ -0,0 +1,332 @@ +# STOCKHOLM 1.0 + +7LES_DROME/1799-1891 P.SPP.RNFSVRVL..SPRELEVSWLPPEq...LRSESVYYTLHW...QQ +7LES_DROVI/1917-1997 S.YAPlPPLQLIEL..NAYGMTLAWPGT......PDALSSLTLEC...QS +APU_THETY/928-1009 A.PQPiTDLKAVS...GNGKVDLSWSVV.......DKAVSYNI.YR...S +APU_THETY/1165-1248 P.TAP.V.LQQPGI..ESSRVTLNWSPSA....DDVAIFGYEIYK...SS +AXO1_CHICK/602-692 PpGPP.GGVVVRDI..GDTTVQLSWSRGFd...NHSPIARYSIEAR...T +AXO1_CHICK/807-896 PkVAP.FRVTAKAV..LSSEMDVSWEPVEqg.dMTGVLLGYEIRY...WK +CAML_HUMAN/812-907 P.QAI.PELEGIEIl.NSSAVLVKWRPVDla.qVKGHLRGYNVTY...WR +CHI1_BACCI/465-542 P.SVP.GNARSTGV..TANSVTLAWNAST....DNVGVTGYNV.YN.... +CHIT_STRLI/142-219 P.SAP.GTPTASNI..TDTSVKLSWSAAT....DDKGVKNYDV.LR.... +CHIX_STROI/169-240 P.PAPpTGLRTGSV..TATSVALSWSPV.......TGATGYAV.YR.... 
+CONT_CHICK/799-884 PtEVP.TDVSVKVL..SSSEISVSWHHVT.....EKSVEGYQIRY...WA +CPSF_CHICK/630-716 P.DPP.QSVRVTSV..GEDWAVLSWEAPPf..dGGMPITGYLMER...KK +CPSF_CHICK/923-1008 P.GPP.QAVRVMEV..WGSNALLQWEPPKd..dGNAEISGYTVQK...AD +ECK_HUMAN/329-420 P.SAP.HYLTAVGM..GA.KVELRWTPPQd..sGGREDIVYSVTCEqcWP +ECK_HUMAN/436-519 Q.TEP.PKVRLEGR..STTSLSVSWSIPPp...QQSRVWKYEVTYR...K +EPH1_HUMAN/333-435 P.SAP.RNLSFSA...SGTQLSLRWEPPAd..tGGRQDVRYSVRCS..QC +EPH3_CHICK/333-429 P.SAP.QAV.ISSV..NETSLMLEWTPPRd..sGGREDLVYNIIC...KS +EPH3_CHICK/444-528 P.SAV.SIMHQVSR..TVDSITLSWSQPDq...PNGVILDYELQY...YE +ETK1_CHICK/325-421 P.SAP.RNV.ISNI..NETSVILDWSWPLd..tGGRKDVTFNIIC...KK +FAS2_SCHAM/530-616 P.SAV.LQVKMDVM..TATTVTFKFFGPGn..dGGLPTKNYAVQY...KQ +FAS2_SCHAM/642-735 T.SGT.ENEVVVSP..YPNRYELRWQVPAd...NGEPITHYSVKS...CP +FINC_BOVIN/577-660 T.SGP.VQVIITETpsQPNSHPIQWSAPE.....SSHISKYILRW...KP +FINC_BOVIN/689-768 P.VVA.TSESVTEI..TASSFVVSWVSA......SDTVSGFRVEY...EL +FINC_BOVIN/780-858 P.DAP.PDPTVDQV..DDTSIVVRWSRP......RAPITGYRIVY...SP +FINC_BOVIN/875-955 KvPPP.RDLQFVEV..TDVKITIMWTPP......ESPVTGYRVDV...IP +FINC_BOVIN/1142-1225 PlSPP.TNLHLEANp.DTGVLTVSWERST.....TPDITGYRITT...TP +FINC_BOVIN/1236-1316 V.PPP.TDLRFTNV..GPDTMRVTWAPPS.....SIELTNLLVRY...SP +FINC_BOVIN/1327-1406 L.DSP.SGIDFSDI..TANSFTVHWIAP......RATITGYRIRHH...P +FINC_BOVIN/1417-1499 S.DVP.RDLEVIAA..TPTSLLISWDAP......AVTVRYYRITY...GE +FINC_BOVIN/1511-1590 I.DKP.SQMQVTDV..QDNSISVRWLPS......SSPVTGYRVTT...AP +FINC_BOVIN/1601-1680 I.PAP.TNLKFTQV..TPTSLTAQWTAP......NVQLTGYRVRV...TP +FINC_BOVIN/1693-1771 V.SPP.RRARVTDA..TETTITISWRTK......TETITGFQVDA...IP +FINC_BOVIN/1782-1861 I.DAP.SNLRFLAT..TPNSLLVSWQPP......RARITGYIIKY...EK +FINC_CHICK/551-630 I.DRP.KGLTFTEV..DVDSIKIAWESP......QGQVTRYRVTY...SS +FINC_RAT/1266-1346 V.PQL.TDLSFVDI..TDSSIGLRWTPLN.....SSTIIGYRITV...VA +GUNB_CELFI/651-733 P.TTP.GTPVATGV..TTVGASLSWAASTd...AGSGVAGYEL.YR...V +IL7R_HUMAN/129-221 P.EAP.FDLSVIYRe.GANDFVVTFNTSHlq.kKYVKVLMHDVAYR..QE +ITB4_HUMAN/1127-1208 
L.GAP.QNPNAKAA..GSRKIHFNWLPP......SGKPMGYRVKY...WI +ITB4_HUMAN/1220-1310 P.SEP.GRLAFNVV..SSTVTQLSWAEPAe...TNGEITAYEVCY...GL +ITB4_HUMAN/1581-1665 P.DTP.TRLVFSAL..GPTSLRVSWQEPR....CERPLQGYSVEY...QL +ITB4_HUMAN/1694-1781 P.SAP.GPLVFTAL..SPDSLQLSWERPRr...PNGDIVGYLVTC...EM +KALM_CHICK/178-271 P.LKPrKELKFIEL..QSGDLEVKWSSKFn...ISIEPVIYVVQRR..WN +KALM_CHICK/544-642 L.AKP.ENLSASFIv.QEGNITGHFSWKIskavLHQPMTGFQVTW...AE +KMLC_CHICK/60-145 P.DPPaGTPCASDI..RSSSLTLSWYGSSy..dGGSAVQSYTVEI...WN +LAR_DROME/322-404 P.TAP.TDVQISEV..TATSVRLEWSYK.....GPEDLQYYVIQY...KP +LAR_DROME/417-503 E.SAP.RNVQVRTL..SSSTMVITWEPPEt...PNGQVTGYKV.Y...YT +LAR_DROME/515-598 P.SQP.SNFRATDI..GETAVTLQWTKPTh...SSENIVHYELYW...ND +LAR_DROME/709-800 P.GDP.QDVKATPL..NSTSIHVSWKPPLek.dRNGIIRGYHIHA...QE +LAR_DROME/909-995 PgGPP.SNITIRFQ..TPDVLCVTWDPPTre.hRNGIITRYDVQFH..KK +MPSF_CHICK/371-457 P.GAP.MDVKCHDA..NRDYVIVTWKPPNt..tSQNPVIGYFVDK...CE +MPSF_CHICK/499-585 P.GPP.TNVHASEI..SKTYVVLSWDPPVp...RGREPLTYFIEK...SM +MPSF_CHICK/600-684 P.SAP.GRVVATRN..TKTSVVVQWDKPK....HEENLYGYYIDY...SV +MPSF_CHICK/699-785 P.SYP.HGITLLNC..DGHSMTLGWKAPKy..sGGSPILGYYIDKR...E +MPSF_CHICK/801-887 P.GPA.YDLTVCEV..RNTSLVLLWKAPVy..eGKSPITGYLVDY...KE +NCA1_BOVIN/509-597 P.SSP.SIDQVEP...YSSTAQVQFDEPEa..tGGVPILKYKAEWR...A +NCA1_BOVIN/610-691 P.SAP.KLEGQMGE..DGNSIKVKLIKQDd...GGSPIRHYLVKYR...A +NGCA_CHICK/700-794 PeRNP.GGVHGEGN..ETGNLVITWEPLPpq.aWNAPWARYRVQWR...P +NRCA_CHICK/623-709 P.NPP.LDLELTGQ..LERSIELSWVPGEe...NNSPITNFVIEY...ED +NRCA_CHICK/726-810 ....P.SNVQGIGS..EPDNLVITWESLKgf.qSNGPGLQYKVSWR..QK +NRCA_CHICK/928-1014 P.SPP.SFLKITNP..TLDSLTLEWGSPTh...PNGVLTSYILKF...QP +NRG_DROME/717-799 ....P.DNVVGQGT..EPNNLVISWTPMPei.eHNAPNFHYYVSW...K. +NRG_DROME/815-905 PlDAP.TNFTMRQIt.SSTSGYMAWTPVSee.sVRGHFKGYKIQT...WT +NRG_DROME/917-1007 P.SPV.QGLDAYPL..GSSAFMLHWKKPLy...PNGKLTGYKIYY...EE +PHB_ALCFA/344-418 G.SAP.TGLAVTAT..TSTSVSLSWNAV.......ANASSYGV.YR.... 
+PTP1_DROME/123-205 P.DPP.SNLSVQVR..SGKNAIILWSPPT.....QGSYTAFKIKV...LG +PTP1_DROME/217-301 P.NTP.GKFIVWFR..NETTLLVLWQPPY....PAGIYTHYKVSI...EP +PTP1_DROME/312-394 P.LRP.LNVTFDRDfiTSNSFRVLWEAPK....GISEFDKYQVSV...AT +PTP1_DROME/405-485 P.LPV.RNLRSINDd.KTNTMIITWEADP.....ASTQDEYRIVYHe.LE +PTP1_DROME/583-661 P.NPP.RNMTIETV..RSNSVLVHWSPPE.....SGEFTEYSIRYR...T +PTP1_DROME/864-944 P.EPI.TQLHATNI..TDTEISLRWDLP......KGEYNDFDIAY...LT +PTP1_DROME/958-1044 P.GRV.ERFHPTDV..QPSEINFEWSLPSs..eANGVIRQFSIAY...TN +PTP6_DROME/236-321 V.PQV.SIDFAKAV..GANKIYLNWTVND....GNDPIQKFFITL...QE +PTP6_DROME/332-425 Y.DPI.FIPKVETTgsTASTITIGWNPPPp..dLIDYIQYYELIV...SE +PTP9_DROME/171-259 P.SKP.QNLTILDV..SANSITMSWHPPKn...QNGAIAGYHVFH...IH +PTPB_HUMAN/22-103 AePER.CNFTLAESkaSSHSVSIQWRIL.......GSPCNFSLIY...SS +PTPB_HUMAN/112-192 P.PAR.FGVSKEKT..TSTGLHVWWTPS......SGKVTSYEVQL...FD +PTPB_HUMAN/467-543 P.LAV.LQLRVKHA..NETSLSIMWQTP......VAEWEKYIISL...AD +PTPB_HUMAN/554-632 P.AQV.TDLHVANQg.MTSSLFTNWTQA......QGDVEFYQVLL...IH +PTPB_HUMAN/643-725 P.SSV.SGVTVNNSg.RNDYLSVSWLVA......PGDVDNYEVTL...SH +PTPB_HUMAN/731-808 P.DKV.QGVSVSNSa.RSDYLRVSWVHA......TGDFDHYEVTI...KN +PTPB_HUMAN/907-984 P.SAV.KNIHISPNg.ATDSLTVNWTPG......GGDVDSYTVSA...FR +PTPB_HUMAN/995-1074 P.ASV.QGVIADNAy.SSYSLIVSWQKA......AGVAERYDILL...LT +PTPB_HUMAN/1085-1162 P.AAV.TDLRITEN..STRHLSFRWTAS......EGELSWYNIFL...YN +PTPB_HUMAN/1173-1250 P.ASV.SHLRGSNRn.TTDSLWFNWSPA......SGDFDFYELIL...YN +PTPB_HUMAN/1261-1344 P.SPP.SLMSFADI..ANTSLAITWKGPP....DWTDYNDFELQW...LP +PTPB_HUMAN/1355-1434 P.DKI.QNLHCRPQ..NSTAIACSWIPP......DSDFDGYSIECR...K +PTPK_MOUSE/290-376 P.PRPiAPPQLLGV..GPTYLLIQLNANSi..iGDGPIILKEVEYR...M +PTPZ_HUMAN/312-401 S.SEP.ENVQADPE..NYTSLLVTWERPRv..vYDTMIEKFAVLY...QQ +SEK_MOUSE/441-525 P.SSI.ALVQAKEV..TRYSVALAWLEPDr...PNGVILEYEVKY...YE +TENA_CHICK/593-671 V.SPP.TELTVTNV..TDKTVNLEWKHE.......NLVNEYLVTY...VP +TENA_CHICK/682-767 L.PAP.EGLKFKSV..RETSVQVEWDPL......SISFDGWELVFRnmQK +TENA_CHICK/774-853 L.DAP.SQIEAKDV..TDTTALITWSKP......LAEIEGIELTY...GP 
+TENA_CHICK/864-945 L.DAP.RNLKRVSQ..TDNSITLEWKNS......HANIDNYRIKF...AP +TENA_CHICK/956-1033 L.DNP.KDLEVSDP..TETTLSLRWRRP......VAKFDRYRLTY...VS +TENA_CHICK/1045-1124 E.PEL.GNLSVSET..GWDGFQLTWTAA......DGAYENFVIQV...QQ +TENA_CHICK/1136-1215 H.PEV.GELTVSDI..TPESFNLSWTTT......NGDFDAFTIEI...ID +TENA_CHICK/1227-1306 E.PEV.DNLLVSDA..TPDGFRLSWTAD......DGVFDSFVLKIR..DT +TENA_CHICK/1317-1395 V.GSP.KGISFSDI..TENSATVSWTPP......RSRVDSYRVSY...VP +TENA_CHICK/1406-1483 L.DSP.SGLVVMNI..TDSEALATWQPA......IAAVDNYIVSY...SS +TENA_CHICK/1494-1571 L.DAP.KDLSATEV..QSETAVITWRPP......RAPVTDYLLTY...ES +TENA_HUMAN/1254-1334 E.VPDmGNLTVTEV..SWDALRLNWTTP......DGTYDQFTIQV...QE +TENA_HUMAN/1528-1607 L.PLL.ENLTISDI..NPYGFTVSWMAS......ENAFDSFLVTV...VD +TIE1_HUMAN/446-533 P.PVPlAAPRLLTK..QSRQLVVSPLVSFs...GDGPISTVRLHYR..PQ +TIE1_HUMAN/545-632 PlLQP.WLEGWHVE..GTDRLRVSWSLPLv..pGPLVGDGFLLRL...WD +TIE1_HUMAN/644-729 P.PAP.RHLHAQAL..SDSEIQLTWKHPEa...LPGPISKYVVEV...QV +TIE2_HUMAN/444-529 L.PKPlNAPNVIDT..GHNFAVINISSEPy..fGDGPIKSKKLLY...KP +TIE2_HUMAN/543-626 L.PPP.RGLNLLPK..SQTTLNLTWQPIFp...SSEDDFYVEVERR...S +TIE2_HUMAN/639-724 P.PQP.ENIKISNI..THSSAVISWTILD.....GYSISSITIRY...KV +UFO_HUMAN/327-411 L.GPP.ENISATR...NGSQAFVHWQEPRa..pLQGTLLGYRLAY...QG + +7LES_DROME/1799-1891 ELDGEnvqd..rrewEAHER...RLET....AG..THRLTGIKPGSGYSL +7LES_DROVI/1917-1997 LREQ............LQFN...VAGN....HT..QMRLAPLQPKTRYSC +APU_THETY/928-1009 TVKGG..........LYEKI...ASNV....TQi.TYTDTEVTNGLKYVY +APU_THETY/1165-1248 SETGPf.........IKIAT...VSDS....VY..NYVDTDVVNGNVYYY +AXO1_CHICK/602-692 LLSNKwkq.....mrTNPVN...IEGN....AE..TAQVVNLIPWMDYEF +AXO1_CHICK/807-896 DGDKEea.......aDRVRT...AGLV....T...SAHVTGLNPNTKYHV +CAML_HUMAN/812-907 EGSQRkhsk..rhihKDHVV...VPAN....TT..SVILSGLRPYSSYHL +CHI1_BACCI/465-542 .GAN............LATS...VTGT....T....ATISGLTAGTSYTF +CHIT_STRLI/142-219 .DGA............KVAT...VTGT....T....YTDNGLTKGTAYSY +CHIX_STROI/169-240 .DGV............KVAT...ASGT....S....ATVTGLTPDTAYAF +CONT_CHICK/799-884 
AHDKEa........aAQRVQ...VSNQ....EY..STKLENLKPNTRYHI +CPSF_CHICK/630-716 KGSMRw........mKLNFE...VFPD....T...TYESTKMIEGVFYEM +CPSF_CHICK/923-1008 TRTME..........WFTVL...EHSR....PT..RCTVSELVMGNEYRF +ECK_HUMAN/329-420 E.SGEcgp....ceaSVRYS...EPPHgl.tRT..SVTVSDLEPHMNYTF +ECK_HUMAN/436-519 KGDS............NSYN...VRRT....EGf.SVTLDDLAPDTTYLV +EPH1_HUMAN/333-435 QGTAQdggpcqpcgvGVHFSpgaRGLT....TP..AVHVNGLEPYANYTF +EPH3_CHICK/333-429 CGSGRgact...rcgDNVQF...APRQlgltEP..RIYISDLLAHTQYTF +EPH3_CHICK/444-528 KNLSE..........LNSTA...VKSP....TN..TVTVQNLKAGTIYVF +ETK1_CHICK/325-421 CGGSSkice...pcsDNVRF...LPRQtg.lTNt.TVTVVDLLAHTNYTF +FAS2_SCHAM/530-616 DSQGW..........EDALN...RTWP....VDs.PYILENLKPQTRYNF +FAS2_SCHAM/642-735 VEKYDtewrl.lpypCQEHK...LEGQ....AT..TFQLESLQPDTHYKV +FINC_BOVIN/577-660 KNSPDr.........WKEAT...IPGH....LN..SYTIKGLRPGVVYEG +FINC_BOVIN/689-768 SEEGDe.........PQYLD...LPST....AT..SVNIPDLLPGRKYTV +FINC_BOVIN/780-858 SVEGS..........STELN...LPET....AN..SVTLSDLQPGVQYNI +FINC_BOVIN/875-955 VNLPGe........hGQRLP...VSRN....T...FAEVTGLSPGVTYHF +FINC_BOVIN/1142-1225 TNGQQg........ySLEEV...VHAD....QS..SCTFENLSPGLEYNV +FINC_BOVIN/1236-1316 VKNEEd.........VAELS...ISPS....DN..AVVLTNLLPGTEYLV +FINC_BOVIN/1327-1406 ENMGGr.........PREDR...VPPS....RN..SITLTNLNPGTEYVV +FINC_BOVIN/1417-1499 TGGSSp.........VQEFT...VPGS....KS..TATISGLKPGVDYTI +FINC_BOVIN/1511-1590 KNGPGp.........SKTKT...VGPD....QT..EMTIEGLQPTVEYVV +FINC_BOVIN/1601-1680 KEKTGp.........MKEIN...LAPD....SS..SVVVSGLMVATKYEV +FINC_BOVIN/1693-1771 ANGQT..........PIQRT...IRPD....VR..SYTITGLQPGTDYKI +FINC_BOVIN/1782-1861 PGSPPr........eVVPRP...RPGV....T...EATITGLEPGTEYTI +FINC_CHICK/551-630 PEDG............IHEL...LPAPgg.eED..TAELHGLRPGSEYTI +FINC_RAT/1266-1346 AGEGIp.........IFEDF...VDSS....VG..YYTVTGLEPGIDYDI +GUNB_CELFI/651-733 QGTTQ..........TLVGT...TTAA....A....YILRDLTPGTAYSY +IL7R_HUMAN/129-221 KDENK..........WTHVN...LSST....KL..TLLQRKLQPAAMYEI +ITB4_HUMAN/1127-1208 QGDSEs.........EAHLL...DSKV....P...SVELTNLYPYCDYEM 
+ITB4_HUMAN/1220-1310 VNDDNrpi.....gpMKKVL...VDNP....KNr.MLLIENLRESQPYRY +ITB4_HUMAN/1581-1665 LNGGE..........LHRLN...IPNP....AQt.SVVVEDLLPNHSYVF +ITB4_HUMAN/1694-1781 AQGGGpa.......tAFRVD...GDSP....ES..RLTVPGLSENVPYKF +KALM_CHICK/178-271 QGIHPsed.....daTNWQT...VAQT....TDe.RVQLSDIRASRWYQF +KALM_CHICK/544-642 VTTESrqnslpnsiiSQSQI...LPAD....HY..VLTVPNLRPSMLYRL +KMLC_CHICK/60-145 SVDNK..........WTDLT...TCRS....T...SFNVQDLQADREYKF +LAR_DROME/322-404 KNANQ..........AFSEI...SGII....TM..YYVVRALSPYTEYEF +LAR_DROME/417-503 TNSNQpe......asWNSQM...VDNS....E...LTTVSDVTPHAIYTV +LAR_DROME/515-598 TYANQ..........AHHKR...ISNS....E...AYTLDGLYPDTLYYI +LAR_DROME/709-800 LRDEGkgf....lnePFKFD...VVDT....L...EFNVTGLQPDTKYSI +LAR_DROME/909-995 IDHGL..........GSERN...MTLR....K....AVFTNLEENTEYIF +MPSF_CHICK/371-457 VGLEN..........WVQCN...DAPV....KIc.KYPVTGLYEGRSYIF +MPSF_CHICK/499-585 VGSGS..........WQRVNaqvAVKS....P...RYAVFDLAEGKPYVF +MPSF_CHICK/600-684 VGSNQwe.......pANHKP...INYN....R....FVVHGLETGEQYIF +MPSF_CHICK/699-785 ANHKN..........WHEVNssvISRT....I....YTVEDLTEDAFYEF +MPSF_CHICK/801-887 VDTED..........WITAN...EKPT....SHr.YFKVTDLHQGHTYVF +NCA1_BOVIN/509-597 MGEEVw........hSKWYD...AKEA....SMegIVTIVGLKPETTYAV +NCA1_BOVIN/610-691 LSSEW..........KPEIR...LPSG....SD..HVMLKSLDWNAEYEV +NGCA_CHICK/700-794 LEEPGgggps.ggfpWAEST...VDAP....P....VVVGGLPPFSPFQI +NRCA_CHICK/623-709 GLHEPg........vWHYQT...EVPG....SH..TTVQLKLSPYVNYSF +NRCA_CHICK/726-810 DVDDE..........WTSVV...VANV....S...KYIVSGTPTFVPYEI +NRCA_CHICK/928-1014 INNTHel......gpLVEIR...IPAN....ES..SLILKNLNYSTRYKF +NRG_DROME/717-799 .RDIPaa......awENNNI...FDWR....QN..NIVIADQPTFVKYLI +NRG_DROME/815-905 ENEGEe........gLREIH...VKGD....TH..NALVTQFKPDSKNYA +NRG_DROME/917-1007 V.KESyvge..rreyDPHIT...DPRV....T...RMKMAGLKPNSKYRI +PHB_ALCFA/344-418 .NGS............KVGS...ATAT....A....YTDSGLIAGTTYSY +PTP1_DROME/123-205 LSEASss.......yNRTFQ...VNDN....TF..QHSVKELTPGATYQV +PTP1_DROME/217-301 PDANDsvl.....yvEKEGE...PPGP....A...QAAFKGLVPGRAYNI 
+PTP1_DROME/312-394 TRRQS..........TVPRS...NEPV....AF..SDFRDIAEPGKTFNV +PTP1_DROME/405-485 TFNGD..........TSTLT...TDRT....R....FTLESLLPGRNYSL +PTP1_DROME/583-661 DSEQQ..........WVRLP...SVRS....T...EADITDMTKGEKYTI +PTP1_DROME/864-944 A.DNL..........LAQNM...TTRN....E....ITISDLRPHRNYTF +PTP1_DROME/958-1044 INNLT..........DAGMQ...DFES....EEa.FGVIKNLKPGETYVF +PTP6_DROME/236-321 AGTPTft.......yHKDFI...NGSH....T...SYILDHFKPNTTYFL +PTP6_DROME/332-425 SGEVPkvi.....eeAIYQQ...NSRN....L...PYMFDKLKTATDYEF +PTP9_DROME/171-259 DNQTGve......ivKNSRN...SVET....LI..HFELQNLRPYTDYRV +PTPB_HUMAN/22-103 DTLGAa........lCPTFR...IDNT....TY..GCNLQDLQAGTIYNF +PTPB_HUMAN/112-192 ENNQKiq......gvQIQES...TSWN....E....YTFFNLTAGSKYNI +PTPB_HUMAN/467-543 R.DLL..........LIHKS...LSKD....AK..EFTFTDLVPGRKYMA +PTPB_HUMAN/554-632 ENVV...........IKNES...ISSE....TS..RYSFHSLKSGSLYSV +PTPB_HUMAN/643-725 DGKV...........VQSLV...IAKS....VR..ECSFSSLTPGRLYTV +PTPB_HUMAN/731-808 KNNF...........IQTKS...IPKS....EN..ECVFVQLVPGRLYSV +PTPB_HUMAN/907-984 H.SQK..........VDSQT...IPKH....VF..EHTFHRLEAGEQYQI +PTPB_HUMAN/995-1074 ENGIL..........LRNTS...EPAT....TK..QHKFEDLTPGKKYKI +PTPB_HUMAN/1085-1162 PDGNLq.........ERAQV...DPLV....Q...SFSFQNLLQGRMYKM +PTPB_HUMAN/1173-1250 PNGTKk.........ENWKD...KDLT....E....WRFQGLVPGRKYVL +PTPB_HUMAN/1261-1344 RDALTv.........FNPYN...NRKS....E...GRIVYGLRPGRSYQF +PTPB_HUMAN/1355-1434 MDTQEv.........EFSRK...LEKE....KS..LLNIMMLVPHKRYLV +PTPK_MOUSE/290-376 T.SGS..........WTETH...AVNA....P...TYKLWHLDPDTEYEI +PTPZ_HUMAN/312-401 LDGEDq........tKHEFL...TDGY....QDl.GAILNNLLPNMSYVL +SEK_MOUSE/441-525 KDQN...........ERSYR...IVRT....AAr.NTDIKGLNPLTSYVF +TENA_CHICK/593-671 TSSGGl.........DLQFT...VPGN....QT..SATIHELEPGVEYFI +TENA_CHICK/682-767 KDDNG..........DITSS...LKRP....ET..SYMQPGLAPGQQYNV +TENA_CHICK/774-853 KDVPGd.........RTTID...LSED....EN..QYSIGNLRPHTEYEV +TENA_CHICK/864-945 ISGGD..........HTELT...VPKGnq.aTT..RATLTGLRPGTEYGI +TENA_CHICK/956-1033 P.SGK..........KNEME...IPVD....ST..SFILRGLDAGTEYTI 
+TENA_CHICK/1045-1124 SDNPEe.........TWNIT...VPGG....QH..SVNVTGLKANTPYNV +TENA_CHICK/1136-1215 SNRLLe.........PMEFN...ISGN....SR..TAHISGLSPSTDFIV +TENA_CHICK/1227-1306 KRKSD..........PLELI...VPGH....ER..THDITGLKEGTEYEI +TENA_CHICK/1317-1395 ITGGT..........PNVVT...VDGS....KT..RTKLVKLVPGVDYNV +TENA_CHICK/1406-1483 EDEP...........EVTQM...VSGN....TV..EYDLNGLRPATEYTL +TENA_CHICK/1494-1571 I.DGR..........VKEVI...LDPE....TT..SYTLTELSPSTQYTV +TENA_HUMAN/1254-1334 ADQVEe.........AHNLT...VPGS....LR..SMEIPGLRAGTPYTV +TENA_HUMAN/1528-1607 SGKLLd.........PQEFT...LSGT....QR..KLELRGLITGIGYEV +TIE1_HUMAN/446-533 DSTMD..........WSTIV...VDPS....E...NVTLMNLRPKTGYSV +TIE1_HUMAN/545-632 GTRGQ..........ERREN...VSSP....QAr.TALLTGLTPGTHYQL +TIE1_HUMAN/644-729 AGGAGd.........PLWID...VDRP....EEt.STIIRGLNASTRYLF +TIE2_HUMAN/444-529 VNHYEa.........WQHIQ...VTNE....I....VTLNYLEPRTEYEL +TIE2_HUMAN/543-626 VQKSD..........QQNIK...VPGN....LT..SVLLNNLHPREQYVV +TIE2_HUMAN/639-724 QGKNE..........DQHVDv.kIKNA....TIi.QYQLKGLEPETAYQV +UFO_HUMAN/327-411 QDTPE..........VLMDI...GLRQ....EV..TLELQGDGSVSNLTV + +7LES_DROME/1799-1891 WVQ.AHATPTk....SNSS +7LES_DROVI/1917-1997 RLA.LAYAATp....GAPI +APU_THETY/928-1009 AVT.AVDNDGn...eSALS +APU_THETY/1165-1248 KVV.AVDTSYn....RTAS +AXO1_CHICK/602-692 RVL.ASNILGv....GEPS +AXO1_CHICK/807-896 SVR.AYNRAGa....GPPS +CAML_HUMAN/812-907 EVQ.AFNGRGs....GPAS +CHI1_BACCI/465-542 TIK.AKDAAGn...lSAAS +CHIT_STRLI/142-219 SVK.ARDTADq...tGPAS +CHIX_STROI/169-240 QVA.AVNGA.......GES +CONT_CHICK/799-884 DVS.AFNSAGy....GPPS +CPSF_CHICK/630-716 RVF.AVNAIGv....SQPS +CPSF_CHICK/923-1008 RVY.SENVCGt....SQEP +ECK_HUMAN/329-420 TVE.ARNGV........SG +ECK_HUMAN/436-519 QVQ.ALTQEGq....GAGS +EPH1_HUMAN/333-435 NVE.AQNGVSglgssGHAS +EPH3_CHICK/333-429 EIQ.AVNGVTd...qSPFS +EPH3_CHICK/444-528 QVR.ARTVAGy....GRYS +ETK1_CHICK/325-421 EID.AVNGVSd...lSTLS +FAS2_SCHAM/530-616 RFA.AQNEVGf....GPWS +FAS2_SCHAM/642-735 EVR.ATNAIGn....SVPG +FINC_BOVIN/577-660 QLI.SVQHY......GQRE +FINC_BOVIN/689-768 
NVY.EISEE.......GEQ +FINC_BOVIN/780-858 TIY.AVEEN.......QES +FINC_BOVIN/875-955 KVF.AVNQG.......RES +FINC_BOVIN/1142-1225 SVY.TVKDD.......KES +FINC_BOVIN/1236-1316 SVS.SVYEQ.......HES +FINC_BOVIN/1327-1406 SIV.ALNSK.......EES +FINC_BOVIN/1417-1499 TVY.AVTGRGd....SPAS +FINC_BOVIN/1511-1590 SVY.AQNQN.......GES +FINC_BOVIN/1601-1680 SVY.ALKDT.......LTS +FINC_BOVIN/1693-1771 HLY.TLNDN.......ARS +FINC_BOVIN/1782-1861 QVI.ALKNN.......QKS +FINC_CHICK/551-630 NIV.AIYDD.......MES +FINC_RAT/1266-1346 SVI.TLING.......GES +GUNB_CELFI/651-733 VVK.AKDVAGn...vSAAS +IL7R_HUMAN/129-221 KVR.SIPDHYfkgfwSEWS +ITB4_HUMAN/1127-1208 KVC.AYGAQGe....GPYS +ITB4_HUMAN/1220-1310 TVK.ARNGAGw....GPER +ITB4_HUMAN/1581-1665 RVR.AQSQEGw....GRER +ITB4_HUMAN/1694-1781 KVQ.ARTTEGf....GPER +KALM_CHICK/178-271 RVA.AVNVHGt...rGFTA +KALM_CHICK/544-642 EVQ.VLTTGGe....GPAT +KMLC_CHICK/60-145 RVR.AANVYGi....SEPS +LAR_DROME/322-404 YVI.AVNNIGr....GPPS +LAR_DROME/417-503 RVQ.AYTSMGa....GPMS +LAR_DROME/515-598 WLA.ARSQRGe....GATT +LAR_DROME/709-800 QVA.ALTRKGd....GDRS +LAR_DROME/909-995 RVR.AYTKQGa....GPFS +MPSF_CHICK/371-457 RVR.AVNSAGi....SRPS +MPSF_CHICK/499-585 RVL.SANKHGi....SDPS +MPSF_CHICK/600-684 RVK.AVNAVGf....SENS +MPSF_CHICK/699-785 KIA.AANVVGi....GHPS +MPSF_CHICK/801-887 KVR.AVNDAGv....GKSS +NCA1_BOVIN/509-597 RLA.ALNGKGl....GEIS +NCA1_BOVIN/610-691 YVV.AENQQ.......GKS +NGCA_CHICK/700-794 RVQ.AVNGAGk....GPEA +NRCA_CHICK/623-709 RVI.AVNEIGr....SQPS +NRCA_CHICK/726-810 KVQ.ALNDLGy...aPEPS +NRCA_CHICK/928-1014 YFN.AQTSV......GSGS +NRG_DROME/717-799 KVV.AINDR.......GES +NRG_DROME/815-905 RIL.AYNGRFn....GPPS +NRG_DROME/917-1007 SIT.ATTKMGe....GSEH +PHB_ALCFA/344-418 TVT.AVDPTAg...eSQPS +PTP1_DROME/123-205 QAY.TIYDG.......KES +PTP1_DROME/217-301 SVQ.TMSED.......EIS +PTP1_DROME/312-394 IVK.TVSGK.......VTS +PTP1_DROME/405-485 SVQ.AVSKK.......MES +PTP1_DROME/583-661 QVN.TVSFG.......VES +PTP1_DROME/864-944 TVV.VRSGTEss..vLRSS +PTP1_DROME/958-1044 KIQ.AKTAIGf....GPER +PTP6_DROME/236-321 RIV.GKNSIGn....GQPT 
+PTP6_DROME/332-425 RVR.ACSDLTkt..cGPWS +PTP9_DROME/171-259 IVK.AFTTKNe....GEPS +PTPB_HUMAN/22-103 KII.SLDEE........RT +PTPB_HUMAN/112-192 AIT.AVSGG.......KRS +PTPB_HUMAN/467-543 TVT.SISGD........LK +PTPB_HUMAN/554-632 VVT.TVSGG.......ISS +PTPB_HUMAN/643-725 TIT.TRSGKYe...nHSFS +PTPB_HUMAN/731-808 TVT.TKSGQ........YE +PTPB_HUMAN/907-984 MIA.SVSGS........LK +PTPB_HUMAN/995-1074 QIL.TVSGG.......LFS +PTPB_HUMAN/1085-1162 VIV.THSGE........LS +PTPB_HUMAN/1173-1250 WVV.THSGD........LS +PTPB_HUMAN/1261-1344 NVK.TVSGDSw....KTYS +PTPB_HUMAN/1355-1434 SIK.VQSAG.......MTS +PTPK_MOUSE/290-376 RVLlTRPGEGg...tGLPG +PTPZ_HUMAN/312-401 QIV.AICTNGl...yGKYS +SEK_MOUSE/441-525 HVR.ARTAAGy....GDFS +TENA_CHICK/593-671 RVF.AILKN.......KKS +TENA_CHICK/682-767 SLH.IVKNNTr...gPGLS +TENA_CHICK/774-853 TLI.SRRGD.......MES +TENA_CHICK/864-945 GVT.AVRQD.......RES +TENA_CHICK/956-1033 SLV.AEKGR.......HKS +TENA_CHICK/1045-1124 TLY.GVIRG.......YRT +TENA_CHICK/1136-1215 YLY.GISHG.......FRT +TENA_CHICK/1227-1306 ELY.GVSSG.......RRS +TENA_CHICK/1317-1395 NII.SVKGF.......EES +TENA_CHICK/1406-1483 RVH.AVKDA.......QKS +TENA_CHICK/1494-1571 KLQ.ALSRS.......MRS +TENA_HUMAN/1254-1334 TLH.GEVRG.......HST +TENA_HUMAN/1528-1607 MVS.GFTQG.......HQT +TIE1_HUMAN/446-533 RVQlSRPGEGg...eGAWG +TIE1_HUMAN/545-632 DVQ.LYHCTLl....GPAS +TIE1_HUMAN/644-729 RMR.ASI.QGl....GDWS +TIE2_HUMAN/444-529 CVQ.LVRRGEg....GEGH +TIE2_HUMAN/543-626 RAR..VNTKAq....GEWS +TIE2_HUMAN/639-724 DIF.AENNIGs....SNPA +UFO_HUMAN/327-411 CVA.AYTAAGd....GPWS +// diff --git a/forester/archive/RIO/others/hmmer/testsuite/masks_test.c b/forester/archive/RIO/others/hmmer/testsuite/masks_test.c new file mode 100644 index 0000000..9713ffd --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/masks_test.c @@ -0,0 +1,149 @@ +/* masks_test.c + * SRE, Tue Nov 18 11:10:20 1997 [St. 
Louis] + * + * Test driver for sequence masking routines in masks.c + * + * CVS $Id: masks_test.c,v 1.1.1.1 2005/03/22 08:34:46 cmzmasek Exp $ + */ + +#include + +#include "structs.h" +#include "funcs.h" +#include "globals.h" +#include "squid.h" + +static char banner[] = "\ +masks_test : testing of repeat masking code in masks.c"; + +static char usage[] = "\ +Usage: testdriver [-options]\n\ + Available options are:\n\ + -h : help; display this usage info\n\ + -v : verbose output\n\ +"; + +static char experts[] = "\ + --xnu : apply xnu to seqs in \n\ +\n"; + +static struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, + { "-v", TRUE, sqdARG_NONE }, + { "--xnu", FALSE, sqdARG_STRING }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +/* The test sequence and result from the XNU software distribution + */ +static char *test1 = "\ +ACDEFGHIKLMNPQRQRQRQRQRQRQRQRQRSTVWYACDEFGHIKLMNPQRQRQRQRQRQ\ +RQRQRQRSTVWYACDEFGHIKLMNPQRQRQRQRQRQRQRQRQRSTVWYACDEFGHIKLMN\ +PQRQRQRQRQRQRQRQRQRSTVWYACDEFGHIKLMNPQRQRQRQRQRQRQRQRQRSTVWY\ +ACDEFGHIKLMNPQRQRQRQRQRQRQRQRQRSTVWY"; + +static char *answer1 = "\ +ACDEFGHIKLMNPXXXXXXXXXXXXXXXXXXSTVWYACDEFGHIKLMNPXXXXXXXXXXX\ +XXXXXXXSTVWYACDEFGHIKLMNPXXXXXXXXXXXXXXXXXXSTVWYACDEFGHIKLMN\ +PXXXXXXXXXXXXXXXXXXSTVWYACDEFGHIKLMNPXXXXXXXXXXXXXXXXXXSTVWY\ +ACDEFGHIKLMNPXXXXXXXXXXXXXXXXXXSTVWY"; + +int +main(int argc, char **argv) +{ + char *seq; + char *dsq; + int len; + int i,j; + char *result; + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + int be_verbose; + char *xnufile; /* NULL, or file to run xnu on */ + + + /*********************************************** + * Parse command line + ***********************************************/ + + be_verbose = FALSE; + xnufile = NULL; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) { + if (strcmp(optname, "-v") == 0) { be_verbose = TRUE; } + else if 
(strcmp(optname, "--xnu") == 0) { xnufile = optarg; } + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(0); + } + } + if (argc - optind != 0) + Die("Incorrect number of arguments.\n%s\n", usage); + + SetAlphabet(hmmAMINO); + + /* XNU test + */ + seq = test1; + len = (int) strlen(seq); + dsq = DigitizeSequence(seq, len); + XNU(dsq, len); + result = MallocOrDie(sizeof(char) * (len+1)); + + for (i = 0; i < len; i++) + result[i] = Alphabet[(int) dsq[i+1]]; + result[len] = '\0'; + + if (be_verbose) + { + printf("XNU test:\n"); + for (i = 1; i <= len; i+=60) + { + for (j = i; j < i+60 && j <= len; j++) + putc(Alphabet[(int) dsq[j]], stdout); + putc('\n', stdout); + } + if (strcmp(answer1, result) == 0) + printf("-- OK; Identical to expected\n"); + } + + if (strcmp(answer1, result) != 0) + Die("XNU test failed."); + free(result); + free(dsq); + + /* On demand XNU test. + */ + if (xnufile != NULL) + { + int format; + SQFILE *sqfp; + SQINFO sqinfo; + int xnum; + + if ((sqfp = SeqfileOpen(xnufile, SQFILE_UNKNOWN, NULL)) == NULL) + Die("Failed to open sequence database file %s\n%s\n", xnufile, usage); + while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo)) + { + dsq = DigitizeSequence(seq, sqinfo.len); + xnum = XNU(dsq, sqinfo.len); + result = DedigitizeSequence(dsq, sqinfo.len); + + printf("%-20s\t%5d\n", sqinfo.name, xnum); + if (be_verbose) + WriteSeq(stdout, SQFILE_FASTA, result, &sqinfo); + + free(dsq); + FreeSequence(seq, &sqinfo); + free(result); + } + SeqfileClose(sqfp); + } + + return EXIT_SUCCESS; +} diff --git a/forester/archive/RIO/others/hmmer/testsuite/parsingviterbi_test.c b/forester/archive/RIO/others/hmmer/testsuite/parsingviterbi_test.c new file mode 100644 index 0000000..57eab1e --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/parsingviterbi_test.c @@ -0,0 +1,167 @@ +/* parsingviterbi_test.c + * Wed Mar 4 15:07:37 1998 + * cp trace_test.c ../src/testdriver.c; cd ../src; make testdriver + * 
+ * Test driver for P7ParsingViterbi(); alignment in linear memory. + * + * CVS $Id: parsingviterbi_test.c,v 1.1.1.1 2005/03/22 08:34:47 cmzmasek Exp $ + */ + +#include +#include +#include + +#include "structs.h" +#include "funcs.h" +#include "globals.h" +#include "squid.h" + +static char banner[] = "\ +parsingviterbi_test : testing of Plan7 linear memory alignment code"; + +static char usage[] = "\ +Usage: parsingviterbi_test [-options]\n\ + Available options are:\n\ + -h : help; display this usage info\n\ + -v : be verbose\n\ +"; + +static char experts[] = "\ +\n"; + +static struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, + { "-v", TRUE, sqdARG_NONE }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +int +main(int argc, char **argv) +{ + char *hmmfile; /* file to read HMM(s) from */ + HMMFILE *hmmfp; /* opened hmmfile for reading */ + char *seqfile; /* file to read target sequence(s) from */ + SQFILE *sqfp; /* opened seqfile for reading */ + char *seq; /* target sequence */ + SQINFO sqinfo; /* optional info for seq */ + char *dsq; /* digitized target sequence */ + struct plan7_s *hmm; /* HMM to search with */ + struct p7trace_s *tr1; /* traceback from P7Viterbi() */ + struct p7trace_s *tr2; /* traceback from P7ParsingViterbi() */ + int nseq; + float sc1, sc2; /* scores from Viterbi, ParsingViterbi() */ + + struct p7trace_s **tarr; /* array of decomposed Viterbi traces */ + int ntr; /* number of traces */ + int i1,i2,k1,k2; /* starts, stops in seq, model for Viterbi */ + int idx; /* index of a decomposed trace */ + + int be_verbose; + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + + /*********************************************** + * Parse command line + ***********************************************/ + + be_verbose = FALSE; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) { + if (strcmp(optname, "-v") 
== 0) be_verbose = TRUE; + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(0); + } + } + if (argc - optind != 0) + Die("Incorrect number of arguments.\n%s\n", usage); + + hmmfile = "fn3.hmm"; + seqfile = "titin.fa"; + + /*********************************************** + * Open test sequence file + ***********************************************/ + + if ((sqfp = SeqfileOpen(seqfile, SQFILE_UNKNOWN, "BLASTDB")) == NULL) + Die("Failed to open sequence database file %s\n%s\n", seqfile, usage); + + /*********************************************** + * Open HMM file + * Read a single HMM from it. (Config HMM, if necessary). + ***********************************************/ + + if ((hmmfp = HMMFileOpen(hmmfile, NULL)) == NULL) + Die("Failed to open HMM file %s\n%s", hmmfile, usage); + if (!HMMFileRead(hmmfp, &hmm)) + Die("Failed to read any HMMs from %s\n", hmmfile); + if (hmm == NULL) + Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile); + P7Logoddsify(hmm, TRUE); + + /*********************************************** + * Search HMM against each sequence, using both + * normal Viterbi and P7ParsingViterbi. + ***********************************************/ + + nseq = 0; + while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo)) + { + nseq++; + dsq = DigitizeSequence(seq, sqinfo.len); + + sc1 = P7Viterbi(dsq, sqinfo.len, hmm, &tr1); + sc2 = P7ParsingViterbi(dsq, sqinfo.len, hmm, &tr2); + + if (be_verbose) + { + printf("test sequence %d: %s %s\n", + nseq, sqinfo.name, + sqinfo.flags & SQINFO_DESC ? 
sqinfo.desc : ""); + for (idx = 0; idx < tr2->tlen; idx++) + printf("%1s %d\n", Statetype(tr2->statetype[idx]), tr2->pos[idx]); + } + + if (sc1 != sc2) + Die("Scores for the two Viterbi implementations are unequal (%d,%d)", sc1, sc2); + + TraceDecompose(tr1, &tarr, &ntr); + if (ntr == 0) + Die("ntr == 0 can't happen"); + if (ntr != (tr2->tlen/2) -1) + Die("# of domains for the two Viterbi implementations are unequal (%d, %d)", + ntr, (tr2->tlen/2) -1); + + for (idx = 0; idx < ntr; idx++) + { + TraceSimpleBounds(tarr[idx], &i1, &i2, &k1, &k2); + + if (i1 != tr2->pos[idx*2 + 1] + 1) + Die("Start positions %d and %d disagree for domain %d\n", + i1, tr2->pos[idx*2 + 1] + 1, idx); + if (i2 != tr2->pos[idx*2 + 2]) + Die("End positions %d and %d disagree for domain %d\n", + i2, tr2->pos[idx*2 + 2], idx); + } + + + for (idx = 0; idx < ntr; idx++) + P7FreeTrace(tarr[idx]); + free(tarr); + FreeSequence(seq, &sqinfo); + P7FreeTrace(tr1); + P7FreeTrace(tr2); + free(dsq); + } + + FreePlan7(hmm); + HMMFileClose(hmmfp); + SeqfileClose(sqfp); + + return EXIT_SUCCESS; +} diff --git a/forester/archive/RIO/others/hmmer/testsuite/titin.fa b/forester/archive/RIO/others/hmmer/testsuite/titin.fa new file mode 100644 index 0000000..0b0f1dd --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/titin.fa @@ -0,0 +1,386 @@ +>gi|2136280|pir||I38344 titin - human +MTTQAPTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVISTSTLPGVQISFSDGRAKLTIPAV +TKANSGRYSLKATNGSGQATSTAELLVKAETAPPNFVQRLQSMTVRQGSQVRLQVRVTGIPNPVVKFYRD +GAEIQSSLDFQISQEGDLYSLLIAEAYPEDSGTYSVNATNSVGRATSTAELLVQGEEEVPAKKTKTIVST +AQISESRQTRIEKKIEAHFDARSIATVEMVIDGAAGQQLPHKTPPRIPPKPKSRSPTPPSIAAKAQLARQ +QSPSPIRHSPSPVRHVRAPTPSPVRSVSPAARISTSPIRSVRSPLLMRKTQASTVATGPEVPPPWKQEGY +VASSSEAEMRETTLTTSTQIRTEERWEGRYGVQEQVTISGAAGAAASVSASASYAAEAVATGAKEVKQDA +DKSAAVATVVAAVDMARVREPVISAVEQTAQRTTTTAVHIQPAQEQVRKEAEKTAVTKVVVAADKAKEQE +LKSRTKEIITTKQEQMHVTHEQIRKETEKTFVPKVVISAAKAKEQETRISEEITKKQKQVTQEAIMKETR 
+KTVVPKVIVATPKVKEQDLVSRGREGITTKREQVQITQEKMRKEAEKTALSTIAVATAKAKEQETILRTR +ETMATRQEQIQVTHGKVDVGKKAEAVATVVAAVDQARVREPREPGHLEESYAQQTTLEYGYKERISAAKV +AEPPQRPASEPHVVPKAVKPRVIQAPSETHIKTTDQKGMHISSQIKKTTDLTTERLVHVDKRPRTASPHF +TVSKISVPKTEHGYEASIAGSAIATLQKELSATSSAQKITKSVKAPTVKPSETRVRAEPTPLPQFPFADT +PDTYKSEAGVEVKKEVGVSITGTTVREERFEVLHGREAKVTETARVPAPVEIPVTPPTLVSGLKNVTVIE +GESVTLECHISGYPSPTVTWYREDYQIESSIDFQITFQSGIARLMIREAFAEDSGRFTCSAVNEAGTVST +SCYLAVQVSEEFEKETTAVTEKFTTEEKRFVESRDVVMTDTSLTEEQAGPGEPAAPYFITKPVVQKLVEG +GSVVFGCQVGGNPKPHVYWKKSGVPLTTGYRYKVSYNKQTGECKLVISMTFADDAGEYTIVVRNKHGETS +ASASLLEEADYELLMKSQQEMLYQTQVTAFVQEPEVGETAPGFVYSEYEKEYEKEQALIRKKMAKDTVVV +RTYVEDQEFHISSFEERLIKEIEYRIIKTTLEELLEEDGEEKMAVDISESEAVESGFDLRIKNYRILEGM +GVTFHCKMSGYPLPKIAWYKDGKRIKHGERYQMDFLQDGRASLRIPVVLPEDEGIYTAFASNIKGNAICS +GKLYVEPAAPLGAPTYIPTLEPVSRIRSLSPRSVSRSPIRMSPARMSPARMSPARMSPARMSPGRRLEET +DESQLERLYKPVFVLKPVSFKCLEGANCRFDLKVVGRPMPETFWFHDGQQIVNDYTHKVVIKEDGTQSLI +IVPATPSDSGEWTVVAQNRAGRSSISVILTVEAVEHQVKPMFVEKLKNVNIKEGSRLEMKVRATGNPNPD +IVWLKNSDIIVPHKYPKIRIEGTKGEAALKIDSTVSQDSAWYTATAINKAGRDTTRCKVNVEVEFAEPEP +ERKLIIPRGTYRAKEIAAPELEPLHLRYGQEQWEEGDLYDKEKQQKPFFKKKLTSLRLKRFGPAHFECRL +TPISDPTMVVEWLHDGKPLEAANRLRMINEFGYCSLDYGVAYSRDSGIITCRATNKYGTDHTSATLIVKD +EKSLVEESQLPEGRKGLQRIEELERMAHEGALTGVTTDQKEKQKPDIVLYPEPVRVLEGETARFRCRVTG +YPQPKVNWYLNGQLIRKSKRFRVRYDGIHYLDIVDCKSYDTGEVKVTAENPEGVIEHKVKLEIQQREDFR +SVLRRAPEPRPEFHVHEPGKLQFEVQKVDRPVDTTETKEVVKLKRAERITHEKVPEESEELRSKFKRRTE +EGYYEAITAVELKSRKKDESYEELLRKTKDELLHWTKELTEEEKKALAEEGKITIPTFKPDKIELSPSME +APKIFERIQSQTVGQGSDAHFRVRVVGKPDPECEWYKNGVKIERSDRIYWYWPEDNVCELVIRDVTAEDS +ASIMVKAINIAGETSSHAFLLVQAKQLITFTQELQDVVAKEKDTMATFECETSEPFVKVKWYKDGMEVHE +GDKYRMHSDRKVHFLSILTIDTSDAEDYSCVLVEDENVKTTAKLIVEGAVVEFVKELQDIEVPESYSGEL +ECIVSPENIEGKWYHNDVELKSNGKYTITSRRGRQNLTVKDVTKEDQGEYSFVIDGKKTTCKLKMKPRPI +AILQGLSDQKVCEGDIVQLEVKVSLESVEGVWMKDGQEVQPSDRVHIVIDKQSHMLLIEDMTKEDAGNYS +FTIPALGLSTSGRVSVYSVDVITPLKDVNVIEGTKAVLECKVSVPDVTSVKWYLNDEQIKPDDRVQAIVK 
+GTKQRLVINRTHASDEGPYKLIVGRVETNCNLSVEKIKIIRGLRDLTCTETQNVVFEVELSHSGIDVLWN +FKDKEIKPSSKYKIEAHGKIYKLTVLNMMKDDEGKYTFYAGENMTSGKLTVAGGAISKPLTDQTVAESQE +AVFECEVANPDSKGEWLRDGKHLPLTNNIRSESDGHKRRLIIAATKLDDIGEYTYKVATSKTSAKLKVEA +VKIKKTLKNLTVTETQDAVFTVELTHPNVKGVQWIKNGVVLESNEKYAISVKGTIYSLRIKNCAIVDESV +YGFRLGRLGASARLHVETVKIIKKPKDVTALENATVAFEVSVSHDTVPVKWFHKSVEIKPSDKHRLVSER +KVHKLMLQNISPSDAGEYTAVVGQLECKAKLFVETLHITKTMKNIEVPETKTASFECEVSHFNVPSMWLK +NGVEIEMSEKFKIVVQGKLHQLIIMNTSTEDSAEYTFVCGNDQVSATLTVTPIMITSMLKDINAEEKDTI +TFEVTVNYEGISYKWLKNGVEIKSTDKCQMRTKKLTHSLNIRNVHFGDAADYTFVAGKATSTATLYVEAR +HIEFRKHIKDIKVLEKKRAMFECEVSEPDITVQWMKDDQELQITDRIKIQKEKYVHRLLIPSTRMSDAGK +YTVVAGGNVSTAKLFVEGRDVRIRSIKKEVQVIEKQRAVVEFEVNEDDVDAHWYKDGIEINFQVQERHKY +VVERRIHRMFISETRQSDAGEYTFVAGRNRSSVTLYVNAPEPPQVLQELQPVTVQSGKPARFCAMISGRP +QPKISWYKEEQLLSTGFKCKFLHDGQEYTLLLIEAFPEDAAVYTCEAKNDYGVATTSASLSVEVPEVVSP +DQEMPVYPPAIITPLQDTVTSEGQPARFQCRVSGTDLKVSWYSKDKKIKPSRFFRMTQFEDTYQLEIAEA +YPEDEGTYTFVANNAVGQVSSTANLSLEAPESILHERIEQEIEMEMKEFSSSFLSAEEEGLHSAELQLSK +INETLELLSESPVYPTKFDSEKEGTGPIFIKEVSNADISMGDVATLSVTVIGIPKPKIQWFFNGVLLTPS +ADYKFVFDGDDHSLIILFTKLEDEGEYTCMASNDYGKTICSAYLKINSKGEGHKDTETESAVAKSLEKLG +GPCPPHFLKELKPIRCAQGLPAIFEYTVVGEPAPTVTWFKENKQLCTSVYYTIIHNPNGSGTFIVNDPQR +EDSGLYICKAENMLGESTCAAELLVLLEDTDMTDTPCKAKSTPEAPEDFPQTPLKGPAVEALDSEQEIAT +FVKDTILKAALITEENQQLSYEHIAKANELSSQLPLGAQELQSILEQDKLTPESTREFLCINGSIHFQPL +KEPSPNLQLQIVQSQKTFSKEGILMPEEPETQAVLSDTEKIFPSAMSIEQINSLTVEPLKTLLAEPEGNY +PQSSIEPPMHSYLTSVAEEVLSLKEKTVSDTNREQRVTLQKQEAQSALILSQSLAEGHVESLQSPDVMIS +QVNYEPLVPSEHSCTEGGKILIESANPLENAGQDSAVRIEEGKSLRFPLALEEKQVLLKEEHSDNVVMPP +DQIIESKREPVAIKKVQEVQGRDLLSKESLLSGIPEEQRLNLKIQICRALQAAVASEQPGLFSEWLRNIE +KVEVEAVNITQEPRHIMCMYLVTSAKSVTEEVTIIIEDVDPQMANLKMELRDALCAIIYEEIDILTAEGP +RIQQGAKTSLQEEMDSFSGSQKVEPITEPEVESKYLISTEEVSYFNVQSRVKYLDATPVTKGVASAVVSD +EKQDESLKPSEEKEESSSESGTEEVATVKIQEAEGGLIKEDGPMIHTPLVDTVSEEGDIVHLTTSITNAK +EVNWYFENKLVPSDEKFKCLQDQNTYTLVIDKVNTEDHQGEYVCEALNDSGKTATSAKLTVVKRAAPVIK 
+RKIEPLEVALGHLAKFTCEIQSAPNVRFQWFKAGREIYESDKCSIRSSKYISSLEILRTQVVDCGEYTCK +ASNEYGSVSCTATLTVTVPGGEKKVRKLLPERKPEPKEEVVLKSVLRKRPEEEEPKVEPKKLEKVKKPAV +PEPPPPKPVEEVEVPTVTKRERKIPEPTKVPEIKPAIPLPAPEPKPKPEAEVKTIKPPPVEPEPTPIAAP +VTVPVVGKKAEAKAPKEEAAKPKGPIKGVPKKTPSPIEAERRKLRPGSGGEKPPDEAPFTYQLKAVPLKF +VKEIKDIILTESEFVGSSAIFECLVSPSTAITTWMKDGSNIRESPKHRFIADGKDRKLHIIDVQLSDAGE +YTCVLRLGNKEKTSTAKLVVEELPVRFVKTLEEEVTVVKGQPLYLSCELNKERDVVWRKDGKIVVEKPGR +IVPGVIGLMRALTINDADDTDAGTYTVTVENANNLECSSCVKVVEVIRDWLVKPIRDQHVKPKGTAIFAC +DIAKDTPNIKWFKGYDEIPAEPNDKTEILRDGNHLYLKIKNAMPEDIAEYAVEIEGKRYPAKLTLGEREV +ELLKPIEDVTIYEKESASFDAEISEADIPGQWKLKGELLRPSPTCEIKAEGGKRFLTLHKVKLDQAGEVL +YQALNAITTAILTVKEIELDFAVPLKDVTVPERRQARFECVLTREANVIWSKGPDIIKSSDKFDIIADGK +KHILVINDSQFDDEGVYTAEVEGKKTSARLFVTGIRLKFMSPLEDQTVKEGETATFVCELSHEKMHVVWF +KNDAKLHTSRTVLISSEGKTHKLEMKEVTLDDISQIKAQVKELSSTAQLKVLEADPYFTVKLHDKTAVEK +DEITLKCEVSKDVPVKWFKDGEEIVPSPKYSIKADGLRRILKIKKADLKDKGEYVCDCGTDKTKANVTVE +ARLIEVEKPLYGVEVFVGETAHFEIELSEPDVHGQWKLKGQPLTASPDCEIIEDGKKHILILHNCQLGMT +GEVSFQAANAKSAANLKVKELPLIFITPLSDVKVFEKDEAKFECEVSREPKTFRWLKGTQEITGDDRFEL +IKDGTKHSMVIKSAAFEDEAKYMFEAEDKHTSGKLIIEGIRLKFLTPLKDVTAKEKESAVFTVELSHDNI +RVKWFKNDQRLHTTRSVSMQDEGKTHSITFKDLSIDDTSQIRVEAMGMSSEAKLTVLEGDPYFTGKLQDY +TGVEKDEVILQCEISKADAPVKWFKDGKEIKPSKNAVIKTDGKKRMLILKKALKSDIGQYTCDCGTDKTS +GKLDIEDREIKLVRPLHSVEVMETETARFETEISEDDIHANWKLKGEALLQTPDCEIKEEGKIHSLVLHN +CRLDQTGGVDFQAANVKSSAHLRVKPRVIGLLRPLKDVTVTAGETATFDCELSYEDIPVEWYLKGKKLEP +SDKVVPRSEGKVHTLTLRDVKLEDAGEVQLTAKDFKTHANLFVKEPPVEFTKPLEDQTVEEGATAVLECE +VSRENAKVKWFKNGTEILKSKKYEIVADGRVRKLVIHDCTPEDIKTYTCDAKDFKTSCNLNVVPPHVEFL +RPLTDLQVREKEMARFECELSRENAKVKWFKDGAEIKKGKKYDIISKGAVRILVINKCLLDDEAEYSCEV +RTARTSGMLTVLEEEAVFTKNLANIEVSETDTIKLVCEVSKPGAEVIWYKGDEEIIETGRYEILTEGRKR +ILVIQNAHLEDAGNYNCRLPSSRTDGKVKVHELAAEFISKPQNLEILEGEKAEFVCSISKESFPVQWKRD +DKTLESGDKYDVIADGKKRVLVVKDATLQDMGTYVVMVGAARAAAHLTVIEKLRIVVPLKDTRVKEQQEV +VFNCEVNTEGAKAKWFRNEEAIFDSSKYIILQKDLVYTLRIRDAHLDDQANYNVSLTNHRGENVKSAANL 
+IVEEEDLRIVEPLKDIETMEKKSVTFWCKVNRLNVTLKWTKNGEEVPFDNRVSYRVDKYKHMLTIKDCGF +PDEGEYIVTAGQDKSVAELLIIEAPTEFVEHLEDQTVTEFDDAVFSCQLSREKANVKWYRNGREIKEGKK +YKFEKDGSIHRLIIKDCRLDDECEYACGVEDRKSRARLFVEEIPVEIIRPPQDILEAPGADVVFLAELNK +DKVEVQWLRNNMVVVQGDKHQMMSEGKIHRLQICDIKPRDQGEYRFIAKDKEARAKLELAAAPKIKTADQ +DLVVDVGKPLTMVVPYDAYPKAEAEWFKENEPLSTKTIDTTAEQTSFRILEAKKGDKGRYKIVLQNKHGK +AEGFINLKVIDVPGPVRNLEVTETFDGEVSLAWEEPLTDGGSKIIGYVVERRDIKRKTWVLATDRAESCE +FTVTGLQKGGVEYLFRVSARNRVGTGEPVETDNPVEARSKYDVPGPPLNVTITDVNRFGVSLTWEPPEYD +GGAEITNYVIELRDKTSIRWDTAMTVRAEDLSATVTDVVEGQEYSFRVRAQNRIGVGKPSAATPFVKVAD +PIERPSPPVNLTSSDQTQSSVQLKWEPPLKDGGSPILGYIIERCEEGKDNWIRCNMKLVPELTYKVTGLE +KGNKYLYRVSAENKAGVSDPSEILGPLTADDAFVEPTMDLSAFKDGLEVIVPNPITILVPSTGYPRPTAT +WCFGDKVLETGDRVKMKTLSAYAELVISPSERSDKGIYTLKLENRVKTISGEIDVNVIARPSAPKELKFG +DITKDSVHLTWEPPDDDGGSPLTGYVVEKREVSRKTWTKVMDFVTDLEFTVPDLVQGKEYLFKVCARNKC +GPGEPAYVDEPVNMSTPATVPDPPENVKWRDRTANSIFLTWDPPKNDGGSRIKGYIVERCPRGSDKWVAC +GEPVAETKMEVTGLEEGKWYAYRVKTLNRQGASKPSRPTEEIQAVDTQEAPEIFLDVKLLAGLTVKAGTK +IELPATVTGKPEPKITWTKADMILKQDKRITIENVPKKSTVTIVDSKRSDTGTYIIEAVNVCGRATAVVE +VNVLDKPGPPAAFDITDVTNESCLLTWNPPRDDGGSKITNYVVERRATDSEVWHKLSSTVKDTNFKATKL +IPNKEYIFRVAAENMYGAGEPVQASPITAKYQFDPPGPPTRLEPSDITKDAVTLTWCEPDDDGGSPITGY +WVERLDPDTDKWVRCNKMPVKDTTYRVKGLTNKKKYRFRVLAENLAGPGKPSKSTEPILIKDPIDPPWPP +GKPTVKDVGKTSVRLNWTKPEHDGGAKIESYVIEMLKTGTDEWVRVAEGVPTTQHLLPGLMEGQEYSFRV +RAVNKAGESEPSEPSDPVLCREKLYPPSPPRWLEVINITKNTADLKWTVPEKDGGSPITNYIVEKRDVRR +KGWQTVDTTVKDTKCTVTPLTEGSLYVFRVAAENAIGQSDYTEIEDSVLAKDTFTTPGPPYALAVVDVTK +RHVDLKWEPPKNDGGRPIQRYVIEKKERLGTRWVKAGKTAGPDCNFRVTDVIEGTEVQFQVRAENEAGVG +HPSEPTEILSIEDPTSPPSPPLDLHVTDAGRKHIAIAWKPPEKNGGSPIIGYHVEMCPVGTEKWMRVNSR +PIKDLKFKVEEGVVPDKEYVLRVRAVNAIGVSEPSEISENVVAKDPDCKPTIDLETHDIIVIEGEKLSIP +VPFRAVPVPTVSWHKDGKEVKASDRLTMKNDHISAHLEVPKSVRADAGIYTITLENKLGSATASINVKVI +GLPGPCKDIKASDITKSSCKLTWEPPEFDGGTPILHYVLERREAGRRTYIPVMSGENKLSWTVKDLIPNG +EYFFRVKAVNKVGGGEYIELKNPVIAQDPKQPPDPPVDVEVHNPTAEAMTITWKPPLYDGGSKIMGYIIE 
+KIAKGEERWKRCNEHLVPILTYTAKGLEEGKEYQFRVRAENAAGISEPSRATPPTKAVDPIDAPKVILRT +SLEVKRGDEIALDASISGSPYPTITWIKDENVIVPEEIKKRAAPLVRRRKGEVQEEEPFVLPLTQRLSID +NSKKGESQLRVRDSLRPDHGLYMIKVENDHGIAKAPCTVSVLDTPGPPINFVFEDIRKTSVLCKWEPPLD +DGGSEIINYTLEKKDKTKPDSEWIVVTSTLRHCKYSVTKLIEGKEYLFRVRAENRFGPGPPCVSKPLVAK +DPFGPPDAPDKPIVEDVTSNSMLVKWNEPKDNGSPILGYWLEKREVNSTHWSRVNKSLLNALKANVDGLL +EGLTYVFRVCAENAAGPGKFSPPSDPKTAHDPISPPGPPIPRVTDTSSTTIELEWEPPAFNGGGEIVGYF +VDKQLVGTNKWSRCTEKMIKVRQYTVKEIREGADYKLRVSAVNAAGEGPPGETQPVTVAEPQEPPAVELD +VSVKGGIQIMAGKTLRIPAVVTGRPVPTKVWTKEEGELDKDRVVIDNVGTKSELIIKDALRKDHGRYVIT +ATNSCGSKFAAARVEVFDVPGPVLDLKPVVTNRKMCLLNWSDPEDDGGSEITGFIIERKDAKMHTWRQPI +ETERSKCDITGLLEGQEYKFRVIAKNKFGCGPPVEIGPILAVDPLGPPTSPERLTYTERQRSTITLDWKE +PRSNGGSPIQGYIIEKRRHDKPDFERVNKRLCPTTSFLVENLDEHQMYEFRVKAVNEIGESEPSLPLNVV +IQDDEVPPTIKLRLSVRGDTIKVKAGEPVHIPADVTGLPMPKIEWSKNETVIEKPTDALQITKEEVSRSE +AKTELSIPKAVREDKGTYTVTASNRLGSVFRNVHVEVYDRPSPPRNLAVTDIKAESCYLTWDAPLDNGGS +EITHYVIDKRDASRKKAEWEEVTNTAVEKRYGIWKLIPNGQYEFRVRAVNKYGISDECKSDKVVIQDPYR +LPGPPGKPKVLARTKGSMLVSWTPPLDNGGSPITGYWLEKREEGSPYWSRVSRAPITKVGLKGVEFNVPR +LLEGVKYQFRAMAINAAGIGPPSEPSDPEVAGDPIFPPGPPSCPEVKDKTKSSISLGWKPPAKDGGSPIK +GYIVEMQEEGTTDWKRVNEPDKLITTCECVVPNLKELRKYRFRVKAVNEAGESEPSDTTGEIPATDIQEE +PEVFIDIGAQDCLVCKAGSQIRIPAVIKGRPTPKSSWEFDGKAKKAMKDGVHDIPEDAQLETAENSSVII +IPECKRSHTGKYSITAKNKAGQKTANCRVKVMDVPGPPKDLKVSDITRGSCRLSWKMPDDDGGDRIKGYV +IEKRTIDGKAWTKVNPDCGSTTFVVPDLLSEQQYFFRVRAENRFGIGPPVETIQRTTARDPIYPPDPPIK +LKIGLITKNTVHLSWKPPKNDGGSPVTHYIVECLAWDPTGTKKEAWRQCNKRDVEELQFTVEDLVEGGEY +EFRVKAVNAAGVSKPSATVGPCDCQRPDMPPSIDLKEFMEVEEGTNVNIVAKIKGVPFPTLTWFKAPPKK +PDNKEPVLYDTHVNKLVVDDTCTLVIPQSRRSDTGLYTITAVNNLGTASKEMRLNVLGRPGPPVGPIKFE +SVSADQMTLSWFPPKDDGGSKITNYVIEKREANRKTWVHVSSEPKECTYTIPKLLEGHEYVFRIMAQNKY +GIGEPLDSEPETARNLFSVPGAPDKPTVSSVTRNSMTVNWEEPEYDGGSPVTGYWLEMKDTTSKRWKRVN +RDPIKAMTLGVSYKVTGLIEGSDYQFRVYAINAAGVGPASLPSDPATARDPIAPPGPPFPKVTDWTKSSA +DLEWSPPLKDGGSKVTGYIVEYKEEGKEEWEKGKDKEVRGTKLVVTGLKEGAFYKFRVSAVNIAGIGEPG 
+EVTDVIEMKDRLVSPDLQLDASVRDRIVVHAGGVIRIIAYVSGKPPPTVTWNMNERTLPQEATIETTAIS +SSMVIKNCQRSHQGVYSLLAKNEAGERKKTIIVDVLDVPGPVGTPFLAHNLTNESCKLTWFSPEDDGGSP +ITNYVIEKRESDRRAWTPVTYTVTRQNATVQGLIQGKAYFFRIAAENSIGMGPFVETSEALVIREPITVP +ERPEDLEVKEVTKNTVTLTWNPPKYDGGSEIINYVLESRLIGTEKFHKVTNDNLLSRKYTVKGLKEGDTY +EYRVSAVNIVGQGKPSFCTKPITCKDELAPPTLHLDFRDKLTIRVGEAFALTGRYSGKPKPKVSWFKDEA +DVLEDDRTHIKTTPATLALEKIKAKRSDSGKYCVVVENSTGSRKGFCQVNVVDHPGPPVGPVSFDEVTKD +YMVISWKPPLDDGGSKITNYIIEKKEVGKDVWMPVTSASAKTTCKVSKLLEGKDYIFRIHAENLYGISDP +LVSDSMKAKDRFRVPDAPDQPIVTEVTKDSALVTWNKPHDGGKPITNYILEKRETMSKRWARVTKDPIHP +YTKFRVPDLLEGCQYEFRVSAENEIGIGDPSPPSKPVFAKDPIAKPSPPVNPEAIDTTCNSVDLTWQPPR +HDGGSKILGYIVEYQKVGDEEWRRANHTPESCPETKYKVTGLRDGQTYKFRVLAVNAAGESDPAHVPEPV +LVKDRLEPPELILDANMAREQHIKVGDTLRLSAIIKGVPFPKVTWKKEDRDAPTKARIDVTPVGSKLEIR +NAAHEDGGIYSLTVENPAGSKTVSVKVLVLDKPGPPRDLEVSEIRKDSCYLTWKEPLDDGGSVITNYVVE +RRDVASAQWSPLSATSKKKSHFAKHLNEGNQYLFRVAAENQYGRGPFVETPKPIKALDPLHPPGPPKDLH +HVDVDKTEVSLVWNKPDRDGGSPITGYLVEYQEEGTQDWIKFKTVTNLECVVTGLQQGKTYRFRVKAENI +VGLGLPDTTIPIECQEKLVPPSVELDVKLIEGLVVKAGTTVRFPAIIRGVPVPTAKWTTDGSEIKTDEHY +TVETDNFSSVLTIKNCLRRDTGEYQITVSNAAGSKTVAVHLTVLDVPGPPTGPINILDVTPEHMTISWQP +PKDDGGSPVINYIVEKQDTRKDTWGVVSSGSSKTKLKIPHLQKGCEYVFRVRAENKIGVGPPLDSTPTVA +KHKFSPPSPPGKPVVTDITENAATVSWTLPKSDGGSPITGYYMERREVTGKWVRVNKTPIADLKFRVTGL +YEGNTYEFRVFAENLAGLSKPSPSSDPIKACRPIKPPGPPINPKLKDKSRETADLVWTKPLSDGGSPILG +YVVECQKPGTAQWNRINKDELIRQCAFRVPGLIEGNEYRFRIKAANIVGEGEPRELAESVIAKDILHPPE +VELDVTCRDVITVRVGQTIRILARVKGRPEPDITWTKEGKVLVREKRVDLIQDLPRVELQIKEAVRADHG +KYIISAKNSSGHAQGSAIVNVLDRPGPCQNLKVTNVTKENCTISWENPLDNGGSEITNFIVEYRKPNQKG +WSIVASDVTKRLIKANLLANNEYYFRVCAENKVGVGPTIETKTPILAINPIDRPGEPENLHIADKGKTFV +YLKWRRPDYDGGSPNLSYHVERRLKGSDDWERVHKGSIKETHYMVDRCVENQIYEFRVQTKNEGGESDWV +KTEEVVVKEDLQKPVLDLKLSGVLTVKAGDTIRLEAGVRGKPFPEVAWTKDKDATDLTRSPRVKIDTRAD +SSKFSLTKAKRSDGGKYVVTATNTAGSFVAYATVNVLDKPGPVRNLKIVDVSSDRCTVCWDPPEDDGGCE +IQNYILEKCETKRMVWSTYSATVLTPGTTVTRLIEGNEYIFRVRAENKIGTGPPTESKPVIAKTKYDKPG 
+RPDPPEVTKVSKEEMTVVWNPPEYDGGKSITGYFLEKKEKHSTRWVPVNKSAIPERRMKVQNLLPDHEYQ +FRVKAENEIGIGEPSLPSRPVVAKDPIEPPGPPTNFRVVDTTKHSITLGWGKPVYDGGAPIIGYVVEMRP +KIADASPDEGWKRCNAAAQLVRKEFTVTSLDENQEYEFRVCAQNQVGIGRPAELKEAIKPKEILEPPEID +LDASMRKLVIVRAGCPIRLFAIVRGRPAPKVTWRKVGIDNVVRKGQVDLVDTMAFLVIPNSTRDDSGKYS +LTLVNPAGEKAVFVNVRVLDTPGPVSDLKVSDVTKTSCHVSWAPPENDGGSQVTHYIVEKREADRKTWST +VTPEVKKTSFHVTNLVPGNEYYFRVTAVNEYGPGVPTDVPKPVLASDPLSEPDPPRKLEATEMTKNSATL +AWLPPLRDGGAKIDGYIISYREEEQPADRWTEYSVVKDLSLVVTGLKEGKKYKFRVAARNAVGVSLPREA +EGVYEAKEQLLPPKILMPEQITIKAGKKLRIEAHVYGKPHPTCKWKKGEDEVVTSSHLAVHKADSSSILI +IKDVTRKDSGYYSLTAENSSGTDTQKIKVVVMDAPGPPQPPFDISDIDADACSLSWHIPLEDGGSNITNY +IVEKCDVSRGDWVTALASVTKTSCRVGKLIPGQEYIFRVRAENRFGISEPLTSPKMVAQFPFGVPSEPKN +ARVTKVNKDCIFVAWDRPDSDGGSPIIGYLIERKERNSLLWVKANDTLVRSTEYPCAGLVEGLEYSFRIY +ALNKAGSSPPSKPTEYVTARMPVDPPGKPEVIDVTKSTVSLIWARPKHDGGSKIIGYFVEACKLPGDKWV +RCNTAPHQIPQEEYTATGLEEKAQYQFRAIARTAVNISPPSEPSDPVTILAENVPPRIDLSVAMKSLLTV +KAGTNVCLDATVFGKPMPTVSWKKDGTLLKPAEGIKMAMQRNLCTLELFSVNRKDSGDYTITAENSSGSK +SATIKLKVLDKPGPPASVKINKMYSDRAMLSWEPPLEDGGSEITNYIVDKRETSRPNWAQVSATVPITSC +SVEKLIEGHEYQFRICAENKYGVGDPVFTEPAIAKNPYDPPGRCDPPVISNITKDHMTVSWKPPADDGGS +PITGYLLEKRETQAVNWTKVNRKPIIERTLKATGLQEGTEYEFRVTAINKAGPGKPSDASKAAYARDPQY +PPAPPAFPKVYDTTRSSVSLSWGKPAYDGGSPIIGYLVEVKRADSDNWVRCNLPQNLQKTRFEVTGLMED +TQYQFRVYAVNKIGYSDPSDVPDKHYPKDILIPPEGEHDADLRKTLILRAGVTMRLYVPVKGRPPPKITW +SKPNVNLRDRIGLDIKSTDFDTFLRCENVNKYDAGKYILTLENSCGKKEYTIVVKVLDTPGPPINVTVKE +ISKDSAYVTWEPPIIDGGSPIINYVVQKRDAERKSWSTVTTECSKTSFRVPNLEEGKSYFFRVFAENEYG +IGDPGETRDAVKASQTPGPVVDLKVRSVSKSSCSIGWKKPHSDGGSRIIGYVVDFLTEENKWQRVMKSLS +LQYSAKDLTEGKEYTFRVSAENENGEGTPSEITVVARDDVVAPDLDLKGLPDLCYLAKENSNFRLKIPIK +GKPAPSVSWKKGEDPLATDTRVSVESSAVNTTLIVYDCQKSDAGKYTITLKNVAGTKEGTISIKVVGKPG +IPTGPIKFDEVTAEAMTLKWAPPKDDGGSEITNYILEKRDSVNNKWVTCASAVQKTTFRVTRLHEGMEYT +FRVSAENKYGVGEGLKSEPIVARHPFDVPDAPPPPNIVDVRHDSVSLTWTDPKKTGGSPITGYHLEFKER +NSLLWKRANKTPIRMRDFKVTGLTEGLEYEFRVMAINLAGVGKPSLPSEPVVALDPIDPPGKPEVINITR 
+NSVTLIWTEPKYDGGHKLTGYIVEKRDLPSKSWMKANHVNVPECAFTVTDLVEGGKYEFRIRAKNTAGAI +SAPSESTETIICKDEYEAPTIVLDPTIKDGLTIKAGDTIVLNAISILGKPLPKSSWSKAGKDIRPSDITQ +ITSTPTSSMLTIKYATRKDAGEYTITATNPFGTKVEHVKVTVLDVPGPPGPVEISNVSAEKATLTWTPPL +EDGGSPIKSYILEKRETSRLLWTVVSEDIQSCRHVATKLIQGNEYIFRVSAVNHYGKGEPVQSEPVKMVD +RFGPPGPPEKPEVSNVTKNTATVSWKRPVDDGGSEITGYHVERREKKSLRWVRAIKTPVSDLRCKVTGLQ +EGSTYEFRVSAENRAGIGPPSEASDSVLMKDAAYPPGPPSNPHVTDTTKKSASLAWGKPHYDGGLEITGY +VVEHQKVGDEAWIKDTTGTALRITQFVVPDLQTKEKYNFRISAINDAGVGEPAVIPDVEIVEREMAPDFE +LDAELRRTLVVRAGLSIRIFVPIKGRPAPEVTWTKDNINLKNRANIENTESFTLLIIPECNRYDTGKFVM +TIENPAGKKSGFVNVRVLDTARPSPQLRPTDITKDSVTLHWDLPLIDGGSRITNYIVEKREATRKSYSTA +TTKCHKCTYKVTGLSEGCEYFFRVMAENEYGIGEPTETTEPVKASEAPSPPDSLNIMDITKSTVSLAWPK +PKHDGGSKITGYVIEAQRKGSDQWTHITTVKGLECVVRNLTEGEEYTFQVMAVNSAGRSAPRESRPVIVK +EQTMLPELDLRGIYQKLVIAKAGDNIKVEIPVLGRPKPTVTWKKGDQILKQTQRVNFETTATSTILNINE +CVRSDSGPYPLTARNIVGEVGDVITIQVHDIPGPPTGPIKFDEVSSDFVTFSWDPPENDGGVPISNYVVE +MRQTDSTTWVELATTVIRTTYKATRLTTGLEYQFRVKAQNRYGVGPGITSAWIVANYPFKVPGPPGTPQV +TAVTKDSMTISWHEPLSDGGSPILGYHVERKERNGILWQTVSKALVPGNIFKSSGLTDGIAYEFRVIAEN +MAGKSKPSKPSEPMLALDPIDPPGKPVPLNITRHTVTLKWAKPEYTGGFKITSYIVEKRDLPNGRWLKAN +FSNILENEFTVSGLTEDAAYEFRVIAKNAAGAISPPSEPSDAITCRDDVEAPKIKVDVKFKDTVILKAGE +AFRLEADVSGRPPPTMEWSKDGKELEGTAKLEIKIADFSTNLVNKDSTRRDSGAYTLTATNPGGFAKHIF +NVKVLDRPGPPEGPLAVTEVTSEKCVLSWFPPLDDGGAKIDHYIVQKRETSRLAWTNVASEVQVTKLKVT +KLLKGNEYIFRVMAVNKYGVGEPLESEPVLAVNPYGPPDPPKNPEVTTITKDSMVVCWGHPDSDGGSEII +NYIVERRDKAGQRWIKCNKKTLTDLRYKVSGLTEGHEYEFRIMAENAAGISAPSPTSPFYKACDTVFKPG +PPGNPRVLDTSRSSISIAWNKPIYDGGSEITGYMVEIALPEEDEWQIVTPPAGLKATSYTITGLTENQEY +KIRIYAMNSEGLGEPALVPGTPKAEDRMLPPEIELDADLRKVVTIRACCTLRLFVPIKGRPDPEVKWARD +HGESLDKASIESASSYTLLIVGNVNRFDSGKYILTVENSSGSKSAFVNVRVLDTPGPPQDLKVKEVTKTS +VTLTWDPPLLDGGSKIKNYIVEKRESTRKAYSTVATNCHKTSWKVDQLQEGCSYYFRVLAENEYGIGLPA +ETAESVKASERPLPPGKITLMDVTRNSVSLSWEKPEHDGGSRILGYIVEMQTKGSDKWATCATVKVTEAT +ITGLIQGEEYSFRVSAQNEKGISDPRQLSVPVIAKDLVIPPAFKLLFNTFTVLAGEDLKVDVPFIGRPTP 
+AVTWHKDNVPLKQTTRVNAESTENNSLLTIKDACREDVGHYVVKLTNSAGEAIETLNVIVLDKPGPPTGP +VKMDEVTADSITLSWGPPKYDGGSSINNYIVEKRDTSTTTWQIVSATVARTTIKACRLKTGCEYQFRIAA +ENRYGKSTYLNSEPTVAQYPFKVPGPPGTPVVTLSSRDSMEVQWNEPISDGGSRVIGYHLERKERNSILW +VKLNKTPIPQTKFKTTGLEEGVEYEFRVSAENIVGIGKPSKVSECYVARDPCDPPGRPEAIIVTRNSVTL +QWKKPTYDGGSKITGYIVEKKELPEGRWMKASFTNIIDTHFEVTGLVEDHRYEFRVIARNAAGVFSEPSE +STGAITARDEVDPPRISMDPKYKDTIVVHAGESFKVDADIYGKPIPTIQWIKGDQELSNTARLEIKSTDF +ATSLSVKDAVRVDSGNYILKAKNVAGERSVTVNVKVLDRPGPPEGPVVISGVTAEKCTLAWKPPLQDGGS +DIINYIVERRETSRLVWTVVDANVQTLSCKVTKLLEGNEYTFRIMAVNKYGVGEPLESEPVVAKNPFVVP +DAPKAPEVTTVTKDSMIVVWERPASDGGSEILGYVLEKRDKEGIRWTRCHKRLIGELRLRVTGLIENHDY +EFRVSAENAAGLSEPSPPSAYQKACDPIYKPGPPNNPKVIDITRSSVFLSWSKPIYDGGCEIQGYIVEKC +DVNVGEWTMCTPPTGINKTNIEVEKLLEKHEYNFRICAINKAGVGEHADVPGPIIVEEKLEAPDIDLDLE +LRKIINIRAGGSLRLFVPIKGRPTPEVKWGKVDGEIRDAAIIDVTSSFTSLVLDNVNRYDSGKYTLTLEN +SSGTKSAFVTVRVLDTPSPPVNLKVTEITKDSVSITWEPPLLDGGSKIKNYIVEKREATRKSYAAVVTNC +HKNSWKIDQLQEGCSYYFRVTAENEYGIGLPAQTADPIKVAEVPQPPGKITVDDVTRNSVSLSWTKPEHD +GGSKIIQYIVEMQAKHSEKWSECARVKSLQAVITNLTQGEEYLFRVVAVNEKGRSDPRSLAVPIVAKDLV +IEPDVKPAFSSYSVQVGQDLKMEVPISGRPKPTITWTKDGLPLKQTTRINVTDSLDLTTLSIKETHKDDG +GQYGITVANVVGQKTASIEIVTLDKPDPPKGPVKFDDVSAESITLSWNPPLYTGGCQITNYIVQKRDTTT +TVWDVVSATVARTTLKVTKLKTGTEYQFRIFAENRYGQSFALESDPIVAQYPYKEPGPPGTPFATAISKD +SMVIQWHEPVNNGGSPVIGYHLERKERNSILWTKVNKTIIHDTQFKAQNLEEGIEYEFRVYAENIVGVGK +ASKNSECYVARDPCDPPGTPEPIMVKRNEITLQWTKPVYDGGSMITGYIVEKRDLPDGRWMKASFTNVIE +TQFTVSGLTEDQRYEFRVIAKNAAGAISKPSDSTGPITAKDEVELPRISMDPKFRDTIVVNAGETFRLEA +DVHGKPLPTIEWLRGDKEIEESARCEIKNTDFKALLIVKDAIRIDGGQYILRASNVAGSKSFPVNVKVLD +RPGPPEGPVQVTGVTSEKCSLTWSPPLQDGGSDISHYVVEKRETSRLAWTVVASEVVTNSLKVTKLLEGN +EYVFRIMAVNKYGVGEPLESAPVLMKNPFVLPGPPKSLEVTNIAKDSMTVCWNRPDSDGGSEIIGYIVEK +RDRSGIRWIKCNKRRITDLRLRVTGLTEDHEYEFRVSAENAAGVGEPSPATVYYKACDPVFKPGPPTNAH +IVDTTKNSITLAWGKPIYDGGSEILGYVVEICKADEEEWQIVTPQTGLRVTRFEISKLTEHQEYKIRVCA +LNKVGLGEATSVPGTVKPEDKLEAPELDLDSELRKGIVVRAGGSARIHIPFKGRPMPEITWSREEGEFTD 
+KVQIEKGVNYTQLSIDNCDRNDAGKYILKLENSSGSKSAFVTVKVLDTPGPPQNLAVKEVRKDSAFLVWE +PPIIDGGAKVKNYVIDKRESTRKAYANVSSKCSKTSFKVENLTEGAIYYFRVMAENEFGVGVPVETVDAV +KAAEPPSPPGKVTLTDVSQTSASLMWEKPEHDGGSRVLGYVVEMQPKGTEKWSIVAESKVCNAVVTGLSS +GQEYQFRVKAYNEKGKSDPRVLGVPVIAKDLTIQPSLKLPFNTYSIQAGEDLKIEIPVIGRPRPNISWVK +DGEPLKQTTRVNVEETATSTVLHIKEGNKDDFGKYTVTATNSAGTATENLSVIVLEKPGPPVGPVRFDEV +SADFVVISWEPPAYTGGCQISNYIVEKRDTTTTTWHMVSATVARTTIKITKLKTGTEYQFRIFAENRYGK +SAPLDSKAVIVQYPFKEPGPPGTPFVTSISKDQMLVQWHEPVNDGGTKIIGYHLEQKEKNSILWVKLNKT +PIQDTKFKTTGLDEGLEYEFKVSAENIVGIGKPSKVSECFVARDPCDPPGRPEAIVITRNNVTLKWKKPA +YDGGSKITGYIVEKKDLPDGRWMKASFTNVLETEFTVSGLVEDQRYEFRVIARNAAGNFSEPSDSSGAIT +ARDEIDAPNASLDPKYKDVIVVHAGETFVLEADIRGKPIPDVVWSKDGKELEETAARMEIKSTIQKTTLV +VKDCIRTDGGQYILKLSNVGGTKSIPITVKVLDRPGSPEGPLKVTGVTAEKCYLAWNPPLQDGGANISHY +IIEKRETSRLSWTQVSTEVQALNYKVTKLLPGNEYIFRVMAVNKYGIGEPLESGPVTACNPYKPPGPPST +PEVSAITKDSMVVTWARPVDDGGTEIEGYILEKRDKEGVRWTKCNKKTLTDLRLRVTGLTEGHSYEFRVA +AENAAGVGEPSEPSVFYRACDALYPPGPPSNPKVTDTSRSSVSLAWSKPIYDGGAPVKGYVVEVKEAAAD +EWTTCTPPTGLQGKQFTVTKLKENTEYNFRICAINSEGVGEPATLPGSVVAQERIEPPEIELDADLRKVV +VLRASATLRLFVTIKGRPEPEVKWEKAEGILTDRAQIEVTSSFTMLVIDNVTRFDSGRYNLTLENNSGSK +TAFVNVRVLDSPSAPVNLTIREVKKDSVTLSWEPPLIDGGAKITNYIVEKRETTRKAYATITNNCTKTTF +RIENLQEGCSYYFRVLASNEYGIGLPAETTEPVKVSEPPLPPGRVTLVDVTRNTATIKWEKPESDGGSKI +TGYVVEMQTKGSEKWSTCTQVKTLEATISGLTAGEEYVFRVAAVNEKGRSDPRQLGVPVIARDIEIKPSV +ELPFHTFNVKAREQLKIDVPFKGRPQATVNWRKDGQTLKETTRVNVSSSKTVTSLSIKEASKEDVGTYEL +CVSNSAGSITVPITIIVLDRPGPPGPIRIDEVSCDSITISWNPPEYDGGCQISNYIVEKKETTSTTWHIV +SQAVARTSIKIVRLTTGSEYQFRVCAENRYGKSSYSESSAVVAEYPFSPPGPPGTPKVVHATKSTMLVTW +QVPVNDGGSRVIGYHLEYKERSSILWSKANKILIADTQVKVSGLDEGLMYEYRVYAENIAGIGKCSKSCE +PVPARDPCDPPGQPEVTNITRKSVSLKWSKPHYDGGAKITGYIVERRELPDGRWLKCNYTNIQETYFEVT +ELTEDQRYEFRVFARNAADSVSEPSESTGPIIVKDDVEPPRVMMDVKFRDVIVVKAGEVLKINADIAGRP +LPVISWAKDGIEIEERARTEIISTDNHTLLTVKDCIRRDTGQYVLTLKNVAGTRSVAVNCKVLDKPGPPA +GPLEINGLTAEKCSLSWGRPQEDGGADIDYYHRKKRETSHLAWTICEGELQMTSCKVTKLLKGNEYIFRV 
+TGVNKYGVGEPLESVAIKALDPFTVPSPPTSLEITSVTKESMTLCWSRPESDGGSEISGYIIERREKNSL +RWVRVNKKPVYDLRVKSTGLREGCEYEYRVYAENAAGLSLPSETSPLIRAEDPVFLPSPPSKPKIVDSGK +TTITIAWVKPLFDGGAPITGYTVEYKKSDDTDWKTSIQSLRGTEYTISGLTTGAEYVFRVKSVNKVGASD +PSDSSDPQIAKEREEEPLFDIDSEMRKTLIVKAGASFTMTVPFRGRPVPNVLWSKPDTDLRTRAYVDTTD +SRTSLTIENANRNDSGKYTLTIQNVLSAASLTLVVKVLDTPGPPTNITVQDVTKESAVLSWDVPENDGGA +PVKNYHIEKREASKKAWVSVTNNCNRLSYKVTNLQEGAIYYFRVSGENEFGVGIPAETKEGVKITEKPSP +PEKLGVTSISKDSVSLTWLKPEHDGGSRIVHYVVEALEKGQKNWVKCAVAKSTHHVVSGLRENSEYFFRV +FAENQAGLSDPRELLLPVLIKEQLEPPEIDMKNFPSHTVYVRAGSNLKVDIPISGKPLPKVTLSRDGVPL +KATMRFNTEITAENLTINLKESVTADAGRYEITAANSSGTTKAFINIVVLDRPGPPTGPVVISDITEESV +TLKWEPPKYDGGSQVTNYILLKRETSTAVWTEVSATVARTMMKVMKLTTGEEYQFRIKAENRFGISDHID +SACVTVKLPYTTPGPPSTPWVTNVTRESITVGWHEPVSNGGSAVVGYHLEMKDRNSILWQKANKLVIRTT +HFKVTTISAGLIYEFRVYAENAAGVGKPSHPSEPVLAIDACEPPRNVRITDISKNSVSLSWQQPAFDGGS +KITGYIVERRDLPDGRWTKASFTNVTETQFTISGLTQNSQYEFRVFARNAVGSISNPSEVVGPITCIDSY +GGPVIDLPLEYTEVVKYRAGTSVKLRAGISGKPAPTIEWYKDDKELQTNALVCVENTTDLASILIKDADR +LNSGCYELKLRNAMASASATIRVQILDKPGPPGGPIEFKTVTAEKITLLWRPPADDGGAKITHYIVEKRE +TSRVVWSMVSEHLEECIITTTKIIKGNEYIFRVRAVNKYGIGEPLESDSVVAKNAFVTPGPPGIPEVTKI +TKNSMTVVWSRPIADGGSDISGYFLEKRDKKSLGWFKVLKETIRDTRQKVTGLTENSDYQYRVCAVNAAG +QGPFSEPSEFYKAADPIDPPGPPAKIRIADSTKSSITLGWSKPVYDGGSAVTGYVVEIRQGEEEEWTTVS +TKGEVRTTEYVVSNLKPGVNYYFRVSAVNCAGQGEPIEMNEPVQAKDILEAPEIDLDVALRTSVIAKAGE +DVQVLIPFKGRPPPTVTWRKDEKNLGSDARYSIENTDSSSLLTIPQVTRNDTGKYILTIENGVGEPKSST +VSVKVLDTPAACQKLQVKHVSRGTVTLLWDPPLIDGGSPIINYVIEKRDATKRTWSVVSHKCSSTSFKLI +DLSEKTPFFFRVLAENEIGIGEPCETTEPVKAAEVPAPIRDLSMKDSTKTSVILSWTKPDFDGGSVITEY +VVERKGKGEQTWSHAGISKTCEIEVSQLKEQSVLEFRVFAKNEKGLSDPVTIGPITVKELIITPEVDLSD +IPGAQVTVRIGHNVHLELPYKGKPKPSISWLKDGLPLKESEFVRFSKTENKITLSIKNAKKEHGGKYTVI +LDNAVCRIAVPITVITLGPPSKPKGPIRFDEIKADSVILSWDVPEDNGGGEITCYSIEKRETSQTNWKMV +CSSVARTTFKVPNLVKDAEYQFRVRAENRYGVSQPLVSSIIVAKHQFRIPGPPGKPVIYNVTSDGMSLTW +DAPVYDGGSEVTGFHVEKKERNSILWQKVNTSPISGREYRATGLVEGLDYQFRVYAENSAGLSSPSDPSK 
+FTLAVSPVDPPGTPDYIDVTRETITLKWNPPLRDGGSKIVGYSIEKRQGNERWVRCNFTDVSECQYTVTG +LSPGDRYEFRIIARNAVGTISPPSQSSGIIMTRDENVPPIVEFGPEYFDGLIIKSGESLRIKALVQGRPV +PRVTWFKDGVEIEKRMNMEITNVLGSTSLFVRDATRDHRGVYTVEAKNASGSAKAEIKVKVQDTPGKVVG +PIRFTNITGEKMTLWWDAPLNDGCAPITHYIIEKRETSRLAWALIEDKCEAQSYTAIKLINGNEYQFRVS +AVNKFGVGRPLDSDPVVAQIQYTVPDAPGIPEPSNITGNSITLTWARPESDGGSEIQQYILERREKKSTR +WVKVISKRPISETRFKVTGLTEGNEYEFHVMAENAAGVGPASGISRLIKCREPVNPPGPPTVVKVTDTSK +TTVSLEWSKPVFDGGMEIIGYIIEMCKTDLGDWHKVNAEACVKTRYTVTDLQAGEEYKFRVSAINGAGKG +DSCEVTGTIKAVDRLTAPELDIDANFKQTHVVRAGASIRLFIAYQGRPTPTAVWSKPDSNLSLRADIHTT +DSFSTLTVENCNRNDAGKYTLTVENNSGSKSITFTVKVLDTPGPPGPITFKDVTRGSATLMWDAPLLDGG +ARIHHYVVEKREASRRSWQVISEKCTRQIFKVNDLAEGVPYYFRVSAVNEYGVGEPYEMPEPIVATEQPA +PPRRLDVVDTSKSSAVLAWLKPDHDGGSRITGYLLEMRQKGSDLWVEAGHTKQLTFTVERLVEKTEYEFR +VKAKNDAGYSEPREAFSSVIIKEPQIEPTADLTGITNQLITCKAGSPFTIDVPISGRPAPKVTWKLEEMR +LKETDRVSITTTKDRTTLTVKDSMRGDSGRYFLTLENTAGVKTFSVTVVVIGRPGPVTGPIEVSSVSAES +CVLSWGEPKDGGGTEITNYIVEKRESGTTAWQLVNSSVKRTQIKVTHLTKYMEYSFRVSSENRFGVSKPL +ESAPIIAEHPFVPPSAPTRPEVYHVSANAMSIRWEEPYHDGGSKIIGYWVEKKERNTILWVKENKVPCLE +CNYKVTGLVEGLEYQFRTYALNAAGVSKASEASRPIMAQNPVDAPGRPEVTDVTRSTVSLIWSAPAYDGG +SKVVGYIIERKPVSEVGDGRWLKCNYTIVSDNFFTVTALSEGDTYEFRVLAKNAAGVISKGSESTGPVTC +RDEYAPPKAELDARLHGDLVTIRAGSDLVLDAAVGGKPEPKIIWTKGDKELDLCEKVSLQYTGKRATAVI +KFCDRSDSGKYTLTVKNASGTKAVSVMVKVLDSPGPCGKLTVSRVTQEKCTLAWSLPQEDGGAEITHYIV +ERRETSRLNWVIVEGECPTLSYVVTRLIKNNEYIFRVRAVNKYGPGVPVESEPIVARNSFTIPSPPGIPE +EVGTGKEHIIIQWTKPESDGGNEISNYLVDKREKESLRWTRVNKDYVVYDTRLKVTSLMEGCDYQFRVTA +VNAAGNSEPSERSNFISCREPSYTPGPPSAPRVVDTTKHSISLAWTKPMYDGGTDIVGYVLEMQEKDTDQ +WYRVHTNATIRNTEFTVPDLKMGQKYSFRVAAVNVKGMSEYSESIAEIEPVERIEIPDLELADDLKKTVT +IRAGASLRLMVSVSGRPPPVITWSKQGIDLASRAIIDTTESYSLLIVDKVNRYDAGKYTIEAENQSGKKS +ATVLVKVYDTPGPCPSVKVKEVSRDSVTITWEIPTIDGGAPINNYIVEKREAAMRAFKTVTTKCSKTLYR +ISGLVEGTMHYFRVLPENIYGIGEPCETSDAVLVSEVPLVPAKLEVVDVTKSTVTLAWEKPLYDGGSRLT +GYVLEACKAGTERWMKVVTLKPTVLEHTVTSLNEGEQYLFRIRAQNEKGVSEPRETVTAVTVQDLRVLPT 
+IDLSTMPQKTIHVPAGRPVELVIPIAGRPPPAASWFFAGSKLRESERVTVETHTKVAKLTIRETTIRDTG +EYTLELKNVTGTTSETIKVIILDKPGPPTGPIKIDEIDATSITISWEPPELDGGAPLSGYVVEQRDAHRP +GWLPVSESVTRSTFKFTRLTEGNEYVFRVAATNRFGIGSYLQSEVIECRSSIRIPGPPETLQIFDVSRDG +MTLTWYPPEDDGGSQVTGYIVERKEVRADRWVRVNKVPVTMTRYRSTGLTEGLEYEHRVTAINARGSGKP +SRPSKPIVAMDPIAPPGKPQNPRVTDTTRTSVSLAWSVPEDEGGSKVTGYLIEMQKVDQHEWTKCNTTPT +KIREYTLTHLPQGAEYRFRVLACNAGGPGEPAEVPGTVKVTEMLEYPDYELDERYQEGIFVRQGGVIRLT +IPIKGKPFPICKWTKEGQDISKRAMIATSETHTELVIKEADRGDSGTYDLVLENKCGKKAVYIKVRVIGS +PNSPEGPLEYDDIQVRSVRVSWRPPADDGGADILGYILERREVPKAAWYTIDSRVRGTSLVVKGLKENVE +YHFRVSAENQFGISKPLKSEEPVTPKTPLNPPEPPSNPPEVLDVTKSSVSLSWSRPKDDGGSRVTGYYIE +RKETSTDKVVRHNKTQITTTMYTVTGLVPDAEYQFRIIAQNDVGLSETSPASEPVVCKDPFDKPSQPGEL +EILSISKDSVTLQWEKPECDGGKEILGYWVEYRQSGDSAWKKSNKERIKDKQFTIGGLLEATEYEFRVFA +ENETGLSRPRRTAMSIKTKLTSGEAPGIRKEMKDVTTKLGEAAQLSCQIVGRPLPDIKWYRFGKELIQSR +KYKMSSDGRTHTLTVMTEEQEDEGVYTCIATNEVGEVETSSKLLLQATPQFHPGYPLKEKYYGAVGSTLR +LHVMYIGRPVPAMTWFHGQKLLQNSENITIENTEHYTHLVMKNVQRKTHAGKYKVQLSNVFGTVDAILDV +EIQDKPDKPTGPIVIEALLKNSAVISWKPPADDGGSWITNYVVEKCEAKEGAEWQLVSSAISVTTCRIVN +LTENAGYYFRVSAQNTFGISDPLEVSSVVIIKSPFEKPGAPGKPTITAVTKDSCVVAWKPPASDGGAKIR +NYYLEKREKKQNKWISVTTEEIRETVFSVKNLIEGLEYEFRVKCENLGGESEWSEISEPITPKSDVPIQA +PHFKEELRNLNVRYQSNATLVCKVTGHPKPIVKWYRQGKEIIADGLKYRIQEFKGGYHQLIIASVTDDDA +TVYQVRATNQGGSVSGTASLEVEVPAKIHLPKTLEGMGAVHALRGEVVSIKIPFSGKPDPVITWQKGQDL +IDNNGHYQVIVTRSFTSLVFPNGVERKDAGFYVVCAKNRFGIDQKTVELDVADVPDPPRGVKVSDASRDS +VNLTWTEPASDGGSKITNYIVEKCATTAERWLRVGQARETRYTVINLFGKTSYQFRVIAENKFGLSKPSE +PSEPTITKEDKTRAMNYDEEVDETREVSMTKASHSSTKELYEKYMIAEDLGRGEFGIVHRCVETSSKKTY +MAKFVKVKGTDQVLVKKEISILNIARHRNILHLHESFESMEELVMIFEFISGLDIFERINTSAFELNERE +IVSYVHQVCEALQFLHSHNIGHFDIRPENIIYQTRRSSTIKIIEFGQARQLKPGDNFRLLFTAPEYYAPE +VHQHDVVSTATDMWSLGTLVYVLLSGINPFLAETNQQIIENIMNAEYTFDEEAFKEISIEAMDFVDRLLV +KERKSRMTASEALQHPWLKQKIERVSTKVIRTLKHRRYYHTLIKKDLNMVVSAARISCGGAIRSQKGVSV +AKVKVASIEIGPVSGQIMHAVGEEGGHVKYVCKIENYDQSTQVTWYFGVRQLENSEKYEITYEDGVAILY 
+VKDITKLDDGTYRCKVVNDYGEDSSYAELFVKGVREVYDYYCRRTMKKIKRRTDTMRLLERPPEFTLPLY +NKTAYVGENVRFGVTITVHPEPHVTWYKSGQKIKPGDNDKKYTFESDKGLYQLTINSVTTDDDAEYTVVA +RNKYGEDSCKAKLTVTLHPPPTDSTLRPMFKRLLANAECQEGQSVCFEIRVSGIPPPTLKWEKDGQPLSL +GPNIEIIHEGLDYYALHIRDTLPEDTGYYRVTATNTAGSTSCQAHLQVERLRYKKQEFKSKEEHERHVQK +QIDKTLRMAEILSGTESVPLTQVAKEALREAAVLYKPAVSTKTVKGEFRLEIEEKKEERKLRMPYDVPEP +RKYKQTTIEEDQRIKQFVPMSDMKWYKKIRDQYEMPGKLDRVVQKRPKRIRLSRWEQFYVMPLPRITDQY +RPKWRIPKLSQDDLEIVRPARRRTPSPDYDFYYRPRRRSLGDISDEELLLPIDDYLAMKRTEEERLRLEE +ELELGFSASPPSRSPPHFELSSLRYSSPQAHVKVEETRKNFRYSTYHIPTKAEASTSYAELRERHAQAAY +RQPKQRQRIMAEREDEELLRPVTTTQHLSEYKSELDFMSKEEKSRKKSRRQREVTEITEIEEEYEISKHA +QRESSSSASRLLRRRRSLSPTYIELMRPVSELIRSRPQPAEEYEDDTERRSPTPERTRPRSPSPVSSERS +LSRFERSARFDIFSRYESMKAALKTQKTSERKYEVLSQQPFTLDHAPRITLRMRSHRVPCGQNTRFILNV +QSKPTAEVKWYHNGVELQESSKIHYTNTSGVLTLEILDCHTDDSGTYRAVCTNYKGEASDYATLDVTGGD +YTTYASQRRDEEVPRSVFPELTRTEAYAVPSFKKTSEMEASSSVREVKSQMTETRESLSSYEHSASAEMK +SAALEEKSLEEKSTTRKIKTTLAARILTKPRSMTVYEGESARFSCDTDGEPVPTVTWLRKGQVLSTSARH +QVTTTKYKSTFEISSVQASDEGNYSVVVENSEGKQEAEFTLTIQKARVTEKAVTSPPRVKSPEPRVKSPE +AVKSPKRVKSPEPSHPKAVSPTETKPTPIEKVQHLPVSAPPKITQFLKAEASKEIAKLTCVVESSVLRAK +EVTWYKDGKKLKENGHFQFHYSADGTYELKINNLTESDQGEYVCEISGEGGTSKTNLQFMGQAFKSIHEK +VSKISETKKSDQKTTESTVTRKTEPKAPEPISSKPVIVTGLQDTTVSSDSVAKFAVKATGEPRPTAIWTK +DGKAITQGGKYKLSEDKGGFFLEIHKTDTSDSGLYTCTVKNSAGSVSSSCKLTIKAIKDTEAQKVSTQKT +SEITPQKKAVVQEEISQKALRSEEIKMSEAKSQEKLALKEEASKVLISEEVKKSAATSLEKSIVHEEITK +TSQASEEVRTHAEIKAFSTQMSINEGQRLVLKANIAGATDVKWVLNGVELTNSEEYRYGVSGSDQTLTIK +QASHRDEGILTCISKTKEGIVKCQYDLTLSKELSDAPAFISQPRSQNINEGQNVLFTREISGEPSPEIEW +FKNNLPISISSNVSISRSRNVYSLEIRNASVSDSGKYTIKAKNFRGQCSATASLMVLPLVEEPSREVVLR +TSGDTSLQGSFSSQSVQMSASKQEASFSSFSSSSASSMTEMKFASMSAQSMSSMQESFVEMSSSSFMGIS +NMTQLESSTSKMLKAGIRGIPPKIEALPSDISIDEGKVLTVACAFTGEPTPEVTWSCGGRKIHSQEQGRF +HIENTDDLTTLIIMDVQKQDGGLYTLSLGNEFGSDSATVNIHIRSI diff --git a/forester/archive/RIO/others/hmmer/testsuite/tophits_test.c b/forester/archive/RIO/others/hmmer/testsuite/tophits_test.c new file mode 
100644 index 0000000..cdf1cfa --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/tophits_test.c @@ -0,0 +1,170 @@ +/* tophits_test.c + * SRE, Tue Oct 28 08:03:10 1997 [Newton Institute, Cambridge UK] + * + * Test driver for tophits.c. Returns 0 if everything is OK. + * + * Options: + * -v Verbose; print stuff. + */ + +#include +#include + +#include "structs.h" +#include "funcs.h" +#include "globals.h" +#include "squid.h" + +static char banner[] = "\ +tophits_test : internal verification of tophits.c"; + +static char usage[] = "\ +Usage: tophits_test [-options]\n\ + Available options are:\n\ + -h : help; display this usage info\n\ + -s : set random seed to \n\ + -v : be verbose (default is to simply exit with status 1 or 0)\n\ +"; + +static char experts[] = "\ +\n"; + +static struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, + { "-s", TRUE, sqdARG_INT }, + { "-v", TRUE, sqdARG_NONE }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + +int +main(int argc, char **argv) +{ + struct tophit_s *hit; /* hit list */ + int i,j; /* counters */ + int nsamples; /* option: # of random "scores" */ + int be_verbose; /* option: TRUE to show output */ + int seed; /* option: random number seed */ + int paramH; /* option: H parameter */ + int paramA; /* option: A parameter */ + double *list; /* list of "scores" */ + double tmp; /* used for swapping */ + float score, score2; + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + + /*********************************************** + * Parse command line + ***********************************************/ + be_verbose = FALSE; + seed = (int) time ((time_t *) NULL); + paramH = 100; + paramA = 10; + nsamples = 1000; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) { + if (strcmp(optname, "-s") == 0) { seed = atoi(optarg); } + else if (strcmp(optname, "-v") == 0) { be_verbose = 
TRUE; } + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(0); + } + } + if (argc - optind != 0) + Die("Incorrect number of arguments.\n%s\n", usage); + + sre_srandom(seed); + if (be_verbose) + printf("%d\tSEED\n", seed); + + /*********************************************** + * Generate three tiers of numbers: + * paramA - really good scores, 1000-2000 + * paramH - good scores, 100-200 + * nsamples - paramH - paramA: bad scores, 10-20 + * then shuffle. + ***********************************************/ + + list = MallocOrDie (sizeof(double) * nsamples); + for (i = 0; i < paramA; i++) + list[i] = 1000. + 1000. * sre_random(); + for (; i < paramA + paramH; i++) + list[i] = 100. + 100. * sre_random(); + for (; i < nsamples; i++) + list[i] = 10. + 10. * sre_random(); + + for (i = 0; i < nsamples; i++) + { + j = CHOOSE(nsamples); + tmp = list[j]; + list[j] = list[i]; + list[i] = tmp; + } + + if (be_verbose) + for (i = 0; i < nsamples; i++) + printf("%8.2f\tTest set\n", list[i]); + + /*********************************************** + * Test of FullSortTophits(). + * Fill up a hit list with random numbers; + * FullSort it; + * check that all top H are >= 100 and sorted. 
+ ***********************************************/ + + hit = AllocTophits(100); + for (i = 0; i < nsamples; i++) + RegisterHit(hit, list[i], 0., (float) list[i], 0., 0., + NULL, NULL, NULL, /* name, acc, desc */ + 0,0,0, + 0,0,0, + 0,0, + NULL); + FullSortTophits(hit); + + if (be_verbose) + { + for (i = 0; i < hit->num; i++) + { + GetRankedHit(hit, i, NULL, &score, NULL, NULL, + NULL, NULL, NULL, /* name, acc, desc */ + NULL, NULL, NULL, + NULL, NULL, NULL, + NULL, NULL, + NULL); + printf("%8.2f FullSort()\n", score); + } + } + + for (i = 0; i < hit->num-1; i++) + { + GetRankedHit(hit, i, NULL, &score, NULL, NULL, + NULL, NULL, NULL, /* name, acc, desc */ + NULL, NULL, NULL, + NULL, NULL, NULL, + NULL, NULL, + NULL); + GetRankedHit(hit, i+1,NULL, &score2,NULL, NULL, + NULL, NULL, NULL, /* name, acc, desc */ + NULL, NULL, NULL, + NULL, NULL, NULL, + NULL, NULL, + NULL); + if (score < score2) + Die("FullSortTophits() fails test: order wrong"); + if (i < paramA && score < 1000.) + Die("FullSortTophits() fails test: lost a number"); + if (i < paramA + paramH && score < 100.) + Die("FullSortTophits() fails test: lost a number"); + } + + FreeTophits(hit); + free(list); + + if (be_verbose) printf("tophits_test is OK\n"); + return 0; +} diff --git a/forester/archive/RIO/others/hmmer/testsuite/trace_test.c b/forester/archive/RIO/others/hmmer/testsuite/trace_test.c new file mode 100644 index 0000000..98e5e97 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/trace_test.c @@ -0,0 +1,146 @@ +/* trace_test.c + * Mon Feb 2 07:57:47 1998 + * cp trace_test.c ../src/testdriver.c; cd ../src; make testdriver + * + * Test driver for Viterbi tracebacks. 
+ * + * RCS $Id: trace_test.c,v 1.1.1.1 2005/03/22 08:34:47 cmzmasek Exp $ + */ + + +#include +#include +#include + /* NOTE(review): the three #include directives above carry no header name in this copy; presumably the angle-bracket names were lost when the diff was rendered as text -- confirm against the original file */ +#include "structs.h" +#include "funcs.h" +#include "globals.h" +#include "squid.h" + +static char banner[] = "\ +trace_test : testing of Plan7 Viterbi traceback code"; + +static char usage[] = "\ +Usage: testdriver [-options]\n\ + Available options are:\n\ + -h : help; display this usage info\n\ + -v : be verbose\n\ +"; + +static char experts[] = "\ + --hmm : use HMM in file \n\ + --seq : use seq(s) in file \n\ + --small : run P7SmallViterbi()\n\ +\n"; + +static struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, + { "-v", TRUE, sqdARG_NONE }, + { "--hmm", FALSE, sqdARG_STRING }, + { "--seq", FALSE, sqdARG_STRING }, + { "--small", FALSE, sqdARG_NONE }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + /* main(): test driver for Viterbi tracebacks. Parses the options, opens the sequence file and the HMM file, reads one HMM, then for every sequence runs P7Viterbi() (or P7SmallViterbi() when --small is given) and Die()s if TraceVerify() rejects the resulting trace. Returns EXIT_SUCCESS when every sequence passes. */ +int +main(int argc, char **argv) +{ + char *hmmfile; /* file to read HMM(s) from */ + HMMFILE *hmmfp; /* opened hmmfile for reading */ + char *seqfile; /* file to read target sequence(s) from */ + SQFILE *sqfp; /* opened seqfile for reading */ + char *seq; /* target sequence */ + SQINFO sqinfo; /* optional info for seq */ + char *dsq; /* digitized target sequence */ + struct plan7_s *hmm; /* HMM to search with */ + struct p7trace_s *tr; /* traceback */ + int nseq; /* count of sequences read so far; used in messages */ + float sc; /* score returned by P7Viterbi() or P7SmallViterbi() */ + + int be_verbose; /* TRUE (-v): print score and trace for each sequence */ + int do_small; /* TRUE to invoke P7SmallViterbi */ + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + + /*********************************************** + * Parse command line + ***********************************************/ + + be_verbose = FALSE; + hmmfile = "trace_test.hmm"; + seqfile = "trace_test.seq"; + do_small = FALSE; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) { + if (strcmp(optname, "-v") == 0) be_verbose = TRUE; + else if (strcmp(optname, "--hmm") == 0) hmmfile = optarg; + else if
(strcmp(optname, "--seq") == 0) seqfile = optarg; + else if (strcmp(optname, "--small") == 0) do_small = TRUE; + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(0); + } + } + if (argc - optind != 0) /* any leftover (non-option) argument is an error */ + Die("Incorrect number of arguments.\n%s\n", usage); + + /*********************************************** + * Open test sequence file + ***********************************************/ + + if ((sqfp = SeqfileOpen(seqfile, SQFILE_UNKNOWN, "BLASTDB")) == NULL) + Die("Failed to open sequence database file %s\n%s\n", seqfile, usage); + + /*********************************************** + * Open HMM file + * Read a single HMM from it. (Config HMM, if necessary). + ***********************************************/ + + if ((hmmfp = HMMFileOpen(hmmfile, NULL)) == NULL) + Die("Failed to open HMM file %s\n%s", hmmfile, usage); + if (!HMMFileRead(hmmfp, &hmm)) + Die("Failed to read any HMMs from %s\n", hmmfile); + if (hmm == NULL) + Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile); + P7Logoddsify(hmm, TRUE); + + /*********************************************** + * Search HMM against each sequence + ***********************************************/ + + nseq = 0; + while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo)) + { + nseq++; + dsq = DigitizeSequence(seq, sqinfo.len); /* freed at the bottom of this loop */ + + if (do_small) sc = P7SmallViterbi(dsq, sqinfo.len, hmm, &tr); + else sc = P7Viterbi(dsq, sqinfo.len, hmm, &tr); + /* sc is only reported in verbose mode; the pass/fail check is TraceVerify() on tr */ + if (be_verbose) + { + printf("test sequence %d: score %.1f : %s %s\n", + nseq, sc, sqinfo.name, + sqinfo.flags & SQINFO_DESC ? sqinfo.desc : ""); + P7PrintTrace(stdout, tr, hmm, dsq); + } + + if (!
TraceVerify(tr, hmm->M, sqinfo.len)) + Die("Trace verify failed on seq #%d, %s\n", nseq, sqinfo.name); + /* per-sequence cleanup: seq/sqinfo, the trace, and the digitized copy */ + FreeSequence(seq, &sqinfo); + P7FreeTrace(tr); + free(dsq); + } + /* release the model and close both input files */ + FreePlan7(hmm); + HMMFileClose(hmmfp); + SeqfileClose(sqfp); + + return EXIT_SUCCESS; +} diff --git a/forester/archive/RIO/others/hmmer/testsuite/trace_test.hmm b/forester/archive/RIO/others/hmmer/testsuite/trace_test.hmm new file mode 100644 index 0000000..f52309a --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/trace_test.hmm @@ -0,0 +1,47 @@ +HMMER2.0 +NAME trace_test +DESC +LENG 10 +ALPH Amino +RF no +CS no +COM ./hmmbuild -F trace_test.hmm trace_test.slx +NSEQ 7 +DATE Mon Feb 2 09:14:31 1998 +XT -8455 -4 -1000 -1000 -8455 -4 -8455 -4 +NULT -4 -8455 +NULE 595 -1558 85 338 -294 453 -1158 197 249 902 -1085 -142 -21 -313 45 531 201 384 -1998 -644 +HMM A C D E F G H I K L M N P Q R S T V W Y + m->m m->i m->d i->m i->i d->m d->d b->m m->e + -585 * -1585 + 1 2806 -444 -1270 -1265 -1732 -744 -1182 -1034 -1212 -1545 -1010 -869 -1380 -1102 -1325 -156 -259 -640 -2039 -1721 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -111 -6123 -7165 -894 -1115 -701 -1378 -585 * + 2 -608 5196 -2368 -2362 -1635 -1253 -1702 -987 -2034 -1540 -1191 -1745 -1831 -1953 -1922 -934 -927 -770 -1855 -1692 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -115 -6127 -7169 -894 -1115 -701 -1378 * * + 3 -1130 -2063 3441 243 -2502 -1157 -662 -2514 -843 -2578 -2029 -96 -1658 -421 -1434 -970 -1239 -2138 -2388 -1915 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -120 -6132 -7174 -894 -1115 -701 -1378 * * + 4 -988 -1921 379 3052 -2309 -1196 -495 -2108 -350 -2194 -1612 -103 -1621 -210 -770 -862 -1042 -1799 -2207 -1742 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -126 -6138 -7180 -894 -1115 -701 -1378 * * + 5 -1602 -1340 -2512 -2455
3740 -2315 -379 -478 -2225 -175 -190 -1874 -2503 -1781 -2079 -1892 -1647 -683 210 1257 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -132 -6144 -7186 -894 -1115 -701 -1378 * * + 6 -759 -1104 -1314 -1488 -2461 3330 -1563 -2503 -1721 -2645 -2082 -1325 -1776 -1591 -1791 -959 -1094 -1922 -2199 -2280 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -140 -6152 -7194 -894 -1115 -701 -1378 * * + 7 -1192 -1547 -812 -781 -520 -1557 4586 -1881 -457 -1789 -1357 -866 -1913 -669 -600 -1231 -1258 -1678 -918 -90 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -148 -6160 -7202 -894 -1115 -701 -1378 * * + 8 -1016 -805 -2637 -2367 -591 -2470 -1853 3039 -2040 329 346 -2180 -2589 -1943 -2070 -1879 -1051 1184 -1698 -1261 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -158 -6170 -7212 -894 -1115 -701 -1378 * * + 9 -1028 -1686 -800 -442 -2171 -1521 -263 -1873 3103 -1859 -1251 -547 -1750 74 599 -1004 -972 -1620 -1812 -1528 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -170 -6182 -7224 -894 -1115 -701 -1378 * * + 10 -1369 -1199 -2542 -2295 -129 -2453 -1603 488 -1867 2607 742 -2168 -2539 -1724 -1835 -1992 -1384 130 -1284 -897 + - * * * * * * * * * * * * * * * * * * * * + - * * * * * * * * 0 +// diff --git a/forester/archive/RIO/others/hmmer/testsuite/trace_test.seq b/forester/archive/RIO/others/hmmer/testsuite/trace_test.seq new file mode 100644 index 0000000..5f5eba4 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/trace_test.seq @@ -0,0 +1,25 @@ +>seq1 Basic traceback. 
SNB M ECT +ACDEFGHIKL +>seq2 C-terminal tail; one CC transition +ACDEFGHIKLY +>seq3 C-terminal tail; three CC transitions +ACDEFGHIKLYYY +>seq4 N-terminal tail; one NN transition +YACDEFGHIKL +>seq5 N-terminal tail; three NN transitions +YYYACDEFGHIKL +>seq6 one JJ +ACDEFGHIKLYACDEFGHIKL +>seq7 three JJ +ACDEFGHIKLYYYACDEFGHIKL +>seq8 D->E wing unfolding +ACDEFGHIK +>seq9 B->D wing unfolding +CDEFGHIKL +>seq10 MD, DD, DM transitions +ACDEHIKL +>seq11 MI, II, IM transitions +ACDEFYYYGHIKL +>seq12 bogosity +Y + diff --git a/forester/archive/RIO/others/hmmer/testsuite/trace_test.slx b/forester/archive/RIO/others/hmmer/testsuite/trace_test.slx new file mode 100644 index 0000000..3e71587 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/trace_test.slx @@ -0,0 +1,7 @@ +seq1 ACDEFGHIKL +seq2 ACDEFGHIKL +seq3 ACDEFGHIKL +seq4 ACDEFGHIKL +seq5 ACDEFGHIKL +seq6 ACDEFGHIKL +seq7 ACDEFGHIKL diff --git a/forester/archive/RIO/others/hmmer/testsuite/viterbi_exercise.c b/forester/archive/RIO/others/hmmer/testsuite/viterbi_exercise.c new file mode 100644 index 0000000..01a9c1f --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/viterbi_exercise.c @@ -0,0 +1,166 @@ +/* viterbi_exercise.c + * SRE, Mon Mar 9 07:55:47 1998 [St. Louis] + * + * Exercise the various Viterbi algorithms, big and small. 
+ * + * RCS $Id: viterbi_exercise.c,v 1.1.1.1 2005/03/22 08:34:50 cmzmasek Exp $ + */ + + +#include +#include +#include +#include + /* NOTE(review): the four #include directives above carry no header name in this copy; presumably the angle-bracket names were lost when the diff was rendered as text -- confirm against the original file */ +#include "structs.h" +#include "funcs.h" +#include "globals.h" +#include "squid.h" + +static char banner[] = "\ +viterbi_exercise : testing of Plan7 Viterbi code"; + +static char usage[] = "\ +Usage: testdriver [-options]\n\ + Available options are:\n\ + -h : help; display this usage info\n\ + -v : be verbose\n\ +"; + +static char experts[] = "\ + --hmm : use HMM in file \n\ +\n"; + +static struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, + { "-v", TRUE, sqdARG_NONE }, + { "--hmm", FALSE, sqdARG_STRING }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + /* main(): exercises the Viterbi implementations. Reads one HMM (default file fn3.hmm, overridden by --hmm), then for each of five model configurations emits nseq sequences from the model and Die()s unless P7Viterbi() and P7SmallViterbi() agree on score and trace and both traces verify. Returns EXIT_SUCCESS when every check passes. */ +int +main(int argc, char **argv) +{ + char *hmmfile; /* file to read HMM(s) from */ + HMMFILE *hmmfp; /* opened hmmfile for reading */ + struct plan7_s *hmm; /* the HMM to search with */ + char *dsq; /* digitized target sequence */ + char *seq; /* text form of dsq; only built (and freed) in verbose mode */ + SQINFO sqinfo; /* name/len/flags filled in by hand for each emitted sequence */ + int L; /* length of dsq */ + struct p7trace_s *tr1; /* traceback */ + struct p7trace_s *tr2; /* another traceback */ + int nseq; /* sequences generated per configuration (set to 100 below) */ + float sc1, sc2; /* scores */ + int config; /* loop index over the five model configurations */ + int i; /* loop index over generated sequences */ + + int be_verbose; /* TRUE (-v): print scores, traces and the sequence */ + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + + + /*********************************************** + * Parse command line + ***********************************************/ + + be_verbose = FALSE; + hmmfile = "fn3.hmm"; + nseq = 100; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) { + if (strcmp(optname, "-v") == 0) be_verbose = TRUE; + else if (strcmp(optname, "--hmm") == 0) hmmfile = optarg; + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(0); + } + } + if (argc - optind != 0) /* any leftover (non-option) argument is an error */ + Die("Incorrect number of arguments.\n%s\n", usage); + + /*********************************************** + * Open HMM file + * Read
a single HMM from it. + ***********************************************/ + + if ((hmmfp = HMMFileOpen(hmmfile, NULL)) == NULL) + Die("Failed to open HMM file %s\n%s", hmmfile, usage); + if (!HMMFileRead(hmmfp, &hmm)) + Die("Failed to read any HMMs from %s\n", hmmfile); + if (hmm == NULL) + Die("HMM file %s corrupt or in incorrect format? Parse failed", hmmfile); + Plan7Renormalize(hmm); + + /*********************************************** + * We cycle through different model configurations. + * For each configuration, we repeat 100 times: + * - generate a sequence + * - score it by Viterbi and by SmallViterbi + * - make sure they give OK and identical results + ***********************************************/ + + for (config = 1; config <= 5; config++) + { + switch (config) { + case 1: Plan7NakedConfig(hmm); break; + case 2: Plan7GlobalConfig(hmm); break; + case 3: Plan7LSConfig(hmm); break; + case 4: Plan7FSConfig(hmm, 0.5, 0.5); break; + case 5: Plan7SWConfig(hmm, 0.5, 0.5); break; + default: Die("never happens"); + } + P7Logoddsify(hmm, TRUE); /* redone after every reconfiguration */ + + for (i = 0; i < nseq; i++) + { + EmitSequence(hmm, &dsq, &L, NULL); /* dsq is freed at the bottom of this loop */ + sprintf(sqinfo.name, "seq%d", i+1); + sqinfo.len = L; + sqinfo.flags = SQINFO_NAME | SQINFO_LEN; + + sc1 = P7Viterbi(dsq, L, hmm, &tr1); + sc2 = P7SmallViterbi(dsq, L, hmm, &tr2); + + if (be_verbose) + { + printf("Viterbi score: %.1f SmallViterbi: %.1f\n", sc1, sc2); + P7PrintTrace(stdout, tr1, hmm, dsq); + P7PrintTrace(stdout, tr2, hmm, dsq); + + seq = DedigitizeSequence(dsq, L); + WriteSeq(stdout, SQFILE_FASTA, seq, &sqinfo); + free(seq); + } + + if (sc1 != sc2) /* exact comparison, no tolerance */ + Die("Different scores from normal/small Viterbi"); + /* each score must also agree with P7TraceScore() of its own trace to within 0.1 */ + if (fabs(sc1 - P7TraceScore(hmm, dsq, tr1)) > 0.1) + Die("P7Viterbi score doesn't match its TraceScore"); + if (fabs(sc2 - P7TraceScore(hmm, dsq, tr2)) > 0.1) + Die("P7SmallViterbi score doesn't match its TraceScore"); + + if (! TraceVerify(tr1, hmm->M, L)) + Die("TraceVerify() failed for a P7Viterbi trace"); + if (!
TraceVerify(tr2, hmm->M, L)) + Die("TraceVerify() failed for a P7SmallViterbi trace"); + + if (tr1->tlen != tr2->tlen) + Die("Trace lengths differ for normal/small Viterbi"); + if (! TraceCompare(tr1, tr2)) + Die("Different traces from normal/small Viterbi"); + /* per-sequence cleanup: both traces and the emitted sequence */ + P7FreeTrace(tr1); + P7FreeTrace(tr2); + free(dsq); + } + } + /* NOTE(review): hmm and hmmfp are not released in this function (no FreePlan7() or HMMFileClose() call); harmless at process exit, but confirm it is intentional */ + return EXIT_SUCCESS; +} diff --git a/forester/archive/RIO/others/hmmer/testsuite/weeviterbi_test.c b/forester/archive/RIO/others/hmmer/testsuite/weeviterbi_test.c new file mode 100644 index 0000000..232ca9b --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/weeviterbi_test.c @@ -0,0 +1,150 @@ +/* weeviterbi_test.c + * Wed Mar 4 17:30:39 1998 + * + * Test driver for Myers/Miller/Hirschberg linear memory Viterbi tracebacks. + * + * RCS $Id: weeviterbi_test.c,v 1.1.1.1 2005/03/22 08:34:47 cmzmasek Exp $ + */ + +#include +#include +#include + /* NOTE(review): the three #include directives above carry no header name in this copy; presumably lost in extraction -- confirm against the original file */ +#include "structs.h" +#include "funcs.h" +#include "globals.h" +#include "squid.h" + +static char banner[] = "\ +weeviterbi_test : testing of Plan7 Myers/Miller/Hirschberg Viterbi traceback code"; + +static char usage[] = "\ +Usage: testdriver [-options]\n\ + Available options are:\n\ + -h : help; display this usage info\n\ + -v : be verbose\n\ +"; + +static char experts[] = "\ + --hmm : use HMM in file \n\ + --seq : use seq(s) in file \n\ +\n"; + +static struct opt_s OPTIONS[] = { + { "-h", TRUE, sqdARG_NONE }, + { "-v", TRUE, sqdARG_NONE }, + { "--hmm", FALSE, sqdARG_STRING }, + { "--seq", FALSE, sqdARG_STRING }, +}; +#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s)) + /* main(): test driver comparing P7WeeViterbi() against P7Viterbi() on every sequence of the sequence file, using a single HMM read from the HMM file; any disagreement ends the program through Die(). Returns EXIT_SUCCESS otherwise. */ +int +main(int argc, char **argv) +{ + char *hmmfile; /* file to read HMM(s) from */ + HMMFILE *hmmfp; /* opened hmmfile for reading */ + char *seqfile; /* file to read target sequence(s) from */ + SQFILE *sqfp; /* opened seqfile for reading */ + char *seq; /* target sequence */ + SQINFO sqinfo; /* optional info for seq */ + char *dsq; /* digitized target sequence */ + struct plan7_s *hmm; /* HMM to search with */ +
struct p7trace_s *t1; /* standard Viterbi traceback */ + struct p7trace_s *t2; /* WeeViterbi traceback */ + int nseq; /* count of sequences read so far; used in messages */ + float sc1,sc2; /* scores from Viterbi, WeeViterbi */ + + int be_verbose; /* TRUE (-v): print both traces for each sequence */ + + char *optname; /* name of option found by Getopt() */ + char *optarg; /* argument found by Getopt() */ + int optind; /* index in argv[] */ + /* Rest of main(): parse options, open the sequence and HMM files, then for every sequence run P7Viterbi() and P7WeeViterbi() and Die() unless both traces pass TraceVerify(), the two scores are exactly equal, and TraceCompare() returns nonzero for the pair of traces. */ + /*********************************************** + * Parse command line + ***********************************************/ + + be_verbose = FALSE; + hmmfile = "weeviterbi_test.hmm"; + seqfile = "weeviterbi_test.seq"; + + while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage, + &optind, &optname, &optarg)) { + if (strcmp(optname, "-v") == 0) be_verbose = TRUE; + else if (strcmp(optname, "--hmm") == 0) hmmfile = optarg; + else if (strcmp(optname, "--seq") == 0) seqfile = optarg; + else if (strcmp(optname, "-h") == 0) { + Banner(stdout, banner); + puts(usage); + puts(experts); + exit(0); + } + } + if (argc - optind != 0) /* any leftover (non-option) argument is an error */ + Die("Incorrect number of arguments.\n%s\n", usage); + + /*********************************************** + * Open test sequence file + ***********************************************/ + + if ((sqfp = SeqfileOpen(seqfile, SQFILE_UNKNOWN, "BLASTDB")) == NULL) + Die("Failed to open sequence database file %s\n%s\n", seqfile, usage); + + /*********************************************** + * Open HMM file + * Read a single HMM from it. (Config HMM, if necessary). + ***********************************************/ + + if ((hmmfp = HMMFileOpen(hmmfile, NULL)) == NULL) + Die("Failed to open HMM file %s\n%s", hmmfile, usage); + if (!HMMFileRead(hmmfp, &hmm)) + Die("Failed to read any HMMs from %s\n", hmmfile); + if (hmm == NULL) + Die("HMM file %s corrupt or in incorrect format?
Parse failed", hmmfile); + P7Logoddsify(hmm, TRUE); + + /*********************************************** + * Search HMM against each sequence + ***********************************************/ + + nseq = 0; + while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo)) + { + nseq++; + dsq = DigitizeSequence(seq, sqinfo.len); + + sc1 = P7Viterbi(dsq, sqinfo.len, hmm, &t1); + sc2 = P7WeeViterbi(dsq, sqinfo.len, hmm, &t2); + + if (be_verbose) + { + printf("test sequence %d: %s %s\n", + nseq, sqinfo.name, + sqinfo.flags & SQINFO_DESC ? sqinfo.desc : ""); + printf("** P7Viterbi trace:\n"); + P7PrintTrace(stdout, t1, hmm, dsq); + printf("** P7WeeViterbi trace:\n"); + P7PrintTrace(stdout, t2, hmm, dsq); + } + + if (! TraceVerify(t1, hmm->M, sqinfo.len)) + Die("Trace verify failed on Viterbi for seq #%d, %s\n", nseq, sqinfo.name); + if (! TraceVerify(t2, hmm->M, sqinfo.len)) + Die("Trace verify failed on WeeViterbi for seq #%d, %s\n", nseq, sqinfo.name); + if (sc1 != sc2) + Die("Scores for the two Viterbi implementations are unequal (%.1f,%.1f)", sc1, sc2); + if (!
TraceCompare(t1, t2)) + Die("WeeViterbi() trace is not identical to Viterbi() trace"); + /* per-sequence cleanup: seq/sqinfo, both traces, and the digitized copy */ + FreeSequence(seq, &sqinfo); + P7FreeTrace(t1); + P7FreeTrace(t2); + free(dsq); + } + /* release the model and close both input files */ + FreePlan7(hmm); + SeqfileClose(sqfp); + HMMFileClose(hmmfp); + + return EXIT_SUCCESS; +} diff --git a/forester/archive/RIO/others/hmmer/testsuite/weeviterbi_test.hmm b/forester/archive/RIO/others/hmmer/testsuite/weeviterbi_test.hmm new file mode 100644 index 0000000..ed56c05 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/weeviterbi_test.hmm @@ -0,0 +1,233 @@ +HMMER2.0 +NAME rrm +DESC +LENG 72 +ALPH Amino +RF no +CS no +COM hmmbuild weeviterbi_test.hmm /nfs/w4/Pfam/Seed/rrm.seed +NSEQ 70 +DATE Wed Mar 4 17:40:23 1998 +XT -8455 -4 -1000 -1000 -8455 -4 -8455 -4 +NULT -4 -8455 +NULE 595 -1558 85 338 -294 453 -1158 197 249 902 -1085 -142 -21 -313 45 531 201 384 -1998 -644 +HMM A C D E F G H I K L M N P Q R S T V W Y + m->m m->i m->d i->m i->i d->m d->d b->m m->e + -21 * -6129 + 1 -1234 -371 -8214 -7849 -5304 -8003 -7706 2384 -7769 2261 -681 -7660 -7694 -7521 -7816 -7346 -5543 1527 -6974 -6639 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -11 -11284 -12326 -894 -1115 -701 -1378 -21 * + 2 -3634 -3460 -5973 -5340 3521 -2129 -4036 -831 -2054 -1257 -2663 -4822 -5229 -4557 -4735 -1979 -1569 -1476 -3893 3439 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -11 -11284 -12326 -894 -1115 -701 -1378 * * + 3 -5570 838 -8268 -7958 -5637 -8152 -8243 2427 -7947 -461 -539 -7805 -7843 -7878 -8124 -7550 -5559 3130 -7481 -7000 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -11 -11284 -12326 -894 -1115 -701 -1378 * * + 4 -1146 -4797 -1564 -2630 -1480 2769 -2963 -1850 992 -4812 -3887 737 -4397 -120 793 -205 -1019 -4418 -4981 -1059 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -11 -11284 -12326 -894 -1115 -701 -1378
* * + 5 -5242 -7035 445 -3538 -7284 1773 -4583 -7166 -4676 -7046 -6312 3633 -1651 -1262 -849 -1278 -5287 -6650 -7228 -291 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -11 -11284 -12326 -894 -1115 -701 -1378 * * + 6 -6898 -6238 -9292 -8703 -410 -9176 -7772 820 -8535 3071 -753 -8917 -8033 -7171 -7955 -8614 -6722 5 -6136 -6414 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 278 394 45 96 359 117 -369 -294 -249 + - -33 -6025 -12326 -153 -3315 -701 -1378 * * + 7 -5 -5297 178 -2982 -5685 -2278 -528 -5452 -1615 -5394 -4488 1396 3136 -3022 -3659 780 976 -4981 -5565 -4854 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -11 -11284 -12327 -894 -1115 -701 -1378 * * + 8 -3329 -4799 -805 543 789 -4303 572 -4868 140 -1087 -3888 -603 1691 530 183 -162 293 -2124 2317 2037 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11284 -12327 -894 -1115 -701 -1378 * * + 9 -373 -4801 2182 1353 -1426 44 -407 -1928 -366 -4817 -3891 1263 -4395 -1080 -666 295 50 -1947 -4985 397 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 10 450 1883 -5953 -5317 -1256 -1301 -4027 1322 -1847 -283 1542 -4802 -5206 -1502 -4713 -4241 2143 1615 -3893 -3551 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 11 -1786 -4835 1027 -807 -5155 -1278 -2989 -4907 -410 -4850 -3924 957 -4421 -943 -250 670 3048 -4456 -5017 -4333 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 12 -3329 -4802 1324 2670 -5123 -4302 -2961 -4874 732 -2424 -3891 -457 -262 553 250 -694 -989 -4424 1772 -1014 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 
-1378 * * + 13 -325 -4802 1515 2286 -5123 -2017 868 -4874 260 -2865 -1087 -2938 -4395 2006 -810 492 -1754 -4424 -4985 -4302 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 14 -337 -4801 2075 1854 -5121 -723 -567 -1924 73 -634 -194 -1227 -4396 1588 -3049 -212 -414 -4422 -5 -4302 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 15 -6843 -6192 -9252 -8675 -481 -9132 -7773 1557 -8511 2856 467 -8869 -8024 -7180 -7953 -8566 -6676 459 -6154 -6421 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 16 5 -4654 -1525 936 444 -4347 -3013 -1809 2193 -441 -3760 -441 -4438 -2577 1775 -91 -3285 -1104 180 -259 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 17 -97 -4802 2341 1548 -5123 -2042 -2961 -4874 -347 -2479 -194 -5 -726 1566 807 -1858 42 -4424 -4985 -4302 + - -146 -501 232 42 -381 398 105 -627 210 -463 -721 275 393 44 95 361 116 -370 -295 -242 + - -45 -5457 -12327 -1928 -440 -701 -1378 * * + 18 358 -3435 -5945 -1175 1490 -5154 1309 1157 -1944 1759 -387 -4797 -5204 -4530 -1684 -4238 -376 166 -3893 1330 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 19 -2191 733 -7910 -7364 4360 -7323 -5649 -1557 -7016 -750 -407 -6877 -7039 -6263 -6681 -6482 -5572 -4211 -4950 -1019 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12328 -894 -1115 -701 -1378 * * + 20 -83 -4801 -3176 698 -5121 1566 -2961 -1977 942 -4817 -3890 -239 -4396 582 256 1807 -874 -1745 -4984 -1334 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 
-1115 -701 -1378 * * + 21 -1216 -4802 -289 1083 -1452 -655 -584 -4874 1345 -4818 -3891 964 1488 2130 -3049 -310 107 -2012 -4985 -1334 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 22 -45 1344 -1667 -843 2933 -2146 400 582 -4479 -1948 -2709 -506 -5117 -436 -1764 -4119 -3523 -96 215 2616 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 23 -556 -4294 -4426 -1796 -273 3377 -4149 -4100 -4273 -2279 -3695 -562 298 -4067 -4575 -1940 -3954 -3921 -4866 -77 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 24 -376 -4801 -143 1004 -1426 805 279 -1771 821 -1486 -3890 -527 2002 126 45 -287 -1679 -617 -4985 -4302 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 25 -3608 -178 -1585 -1970 660 -5154 -4024 2773 -894 -985 -386 -4796 -1707 -4528 -4707 -609 -1823 2145 -3893 -1100 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 26 -673 -173 -3429 1042 -4598 -2161 -3110 535 1570 9 283 -508 -4517 -255 382 -1924 313 1407 -4706 -4127 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 27 -1211 -4799 1518 768 -5119 -1218 -441 -945 -1312 -2414 -587 909 -4396 -1010 534 1815 78 -487 -4983 -128 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 28 1271 2236 -5933 -5299 810 -2278 -651 1901 -1970 -221 -2639 -1497 -5203 -4524 -629 -638 -1577 1521 -3894 -1008 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 
-1115 -701 -1378 * * + 29 -1909 -4796 153 441 -1513 -4304 -599 -1894 1709 25 -3886 689 -1498 243 1438 -189 -879 380 -126 -255 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12329 -894 -1115 -701 -1378 * * + 30 -1277 -3441 -5893 -1776 -1155 -5147 -513 1829 -1993 1189 1888 -1484 -703 -4503 -1652 -1974 -3546 2209 -3898 -3554 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -14 -11287 -12329 -894 -1115 -701 -1378 * * + 31 -1299 746 -5893 -1992 -1190 -5147 -524 1691 424 -60 2330 -4774 111 -4503 -132 248 -1571 1419 -3898 -19 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -14 -11287 -12329 -894 -1115 -701 -1378 * * + 32 -3370 -4477 -3387 50 -560 -1979 -449 -51 1375 -681 233 1068 701 -1040 1343 -1845 543 -480 -10 1246 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -240 -11287 -2797 -894 -1115 -701 -1378 * * + 33 -3122 -4595 3395 -593 -4916 -1399 589 -1433 360 -4611 -290 780 -1313 35 -1369 -1782 -3061 -1712 -4778 -4095 + - -151 -504 236 42 -380 396 122 -618 211 -468 -714 274 392 45 98 355 123 -373 -299 -248 + - -841 -2976 -1709 -1966 -426 -3668 -118 * * + 34 -452 -4116 -568 -735 -4435 -1350 -2280 -1270 1458 -4131 792 -2257 1620 415 1996 479 -765 -1327 -4300 -538 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -10529 -11571 -894 -1115 -1180 -840 * * + 35 272 -4448 -1054 1495 -1086 -283 -2616 -726 380 -1231 -3538 1286 -4050 1395 -988 154 68 50 -4633 -876 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -14 -10906 -11948 -894 -1115 -2229 -346 * * + 36 -3050 -4521 457 -2349 -4841 -1681 65 -1545 404 -2305 -3610 996 -1241 -714 -1055 -351 3167 -4143 -4705 -4022 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -14 -10981 -12024 -894 -1115 -2036 -403 
* * + 37 -943 -4583 277 -486 -4904 2690 -181 -1421 829 -2551 -758 866 -4177 -751 11 -804 -1361 -4205 -4766 -4084 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11049 -12091 -894 -1115 -2632 -254 * * + 38 -1544 -4606 -1206 -627 -1238 -1111 -220 -4677 1841 -1463 -537 -311 146 1310 2236 252 -1424 -1820 -4789 -1025 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11074 -12116 -894 -1115 -1795 -490 * * + 39 -871 902 -3255 -2704 -1212 -2110 605 -4156 -647 -1293 101 192 1442 -2552 91 2587 -171 -3858 -4584 -3996 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11128 -12170 -894 -1115 -1064 -938 * * + 40 -3251 -4717 -597 -2552 -1539 -1882 45 -4784 2499 -1083 -3807 -1125 -312 -892 2672 -1497 -649 -1932 -4902 -1040 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11202 -12244 -894 -1115 -158 -3269 * * + 41 -4425 -5751 -1160 -3492 -6118 3496 -552 -1896 -1318 -2596 -4883 -434 -258 -3375 -548 -4283 -4348 -5409 -5833 -5262 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11288 -12330 -894 -1115 -701 -1378 * * + 42 -3608 -96 -1795 -5308 3204 -5154 498 -1086 -989 -1857 1406 -4797 -5204 -807 -4709 -4238 -268 -366 187 3035 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11288 -12330 -894 -1115 -701 -1378 * * + 43 2573 2359 -7700 -8052 -7623 2634 -6965 -7447 -7655 -7712 -6731 -6019 -5985 -7072 -7238 -2014 -4755 -2203 -7845 -7842 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11288 -12330 -894 -1115 -701 -1378 * * + 44 -1896 -3552 -6072 -5447 4093 -5277 -4115 -1389 -5044 -1849 -2748 -4920 -5327 -4660 -4842 -2020 -787 -772 -3948 1996 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11288 
-12330 -894 -1115 -701 -1378 * * + 45 -2123 1258 -8228 -7927 -5768 -8106 -8270 1951 -7921 -982 -4434 -7761 -7830 -7926 -8131 -7503 -5516 3355 -7605 -7039 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -16 -11288 -12331 -894 -1115 -701 -1378 * * + 46 -1158 -4801 136 2359 -5122 -4302 -508 -644 437 -2559 -3890 628 -4395 -213 172 18 1464 -2067 -4985 -1086 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -16 -11289 -12331 -894 -1115 -701 -1378 * * + 47 -7925 -6836 -8294 -8655 4067 -8176 -4357 -6786 -8211 -6080 795 -6785 -8028 -6925 -7569 -7427 -7774 -6956 -3603 3066 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -16 -11289 -12331 -894 -1115 -701 -1378 * * + 48 -633 -4801 851 2019 -1639 -2148 879 -1118 1178 -2414 -3891 -481 -71 241 -1485 -232 744 -569 -4985 -4302 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -16 -11289 -12331 -894 -1115 -701 -1378 * * + 49 -3331 -4805 2054 434 -5126 -1882 -432 -4877 377 -4821 -3894 2009 -4398 -269 -1336 1291 1198 -1970 -4988 -4305 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -16 -11289 -12331 -894 -1115 -701 -1378 * * + 50 -638 -4800 -1786 1796 -5120 -1884 1628 -1952 812 -444 -621 -1191 1228 530 -672 8 -873 45 -4983 -276 + - -149 -500 232 43 -381 398 105 -627 210 -466 -721 277 393 45 95 359 119 -370 -295 -239 + - -38 -6076 -12331 -1893 -453 -701 -1378 * * + 51 243 -4801 1218 2315 -5122 -1551 -485 -1640 -795 -2479 -783 -420 -685 -1027 1035 415 -3268 -631 -23 -4302 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -16 -11289 -12332 -894 -1115 -701 -1378 * * + 52 415 694 2467 1155 -1401 -4334 -490 -1800 -2599 -4689 -637 -384 -1759 -12 -3098 1144 -834 -569 -4907 -271 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -17 
-11290 -12332 -894 -1115 -701 -1378 * * + 53 2846 -3442 -1698 -5254 -979 -5146 -4014 -750 -4864 -773 1875 -4771 -5197 -1456 -1779 -127 -329 428 -3898 -3555 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -17 -11290 -12332 -894 -1115 -701 -1378 * * + 54 581 -4801 1239 1462 -5122 -1606 -432 -367 1251 -1623 -3891 335 -4395 1283 -110 -3209 753 -1920 -4985 -4302 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -17 -11290 -12332 -894 -1115 -701 -1378 * * + 55 686 -4798 937 304 -1378 -4303 -437 -1924 2219 -1669 -621 828 -4396 -1012 742 0 -1608 -1126 -4982 -1015 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -17 -11290 -12332 -894 -1115 -701 -1378 * * + 56 3420 863 -7680 -7410 -5526 -6323 -6681 -57 -7168 -2455 -4425 -6591 -6708 -6875 -7058 -2256 -4981 -4 -6573 -6193 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -17 -11290 -12332 -894 -1115 -701 -1378 * * + 57 -2038 -3436 -5943 -5308 -1145 -5154 -4025 2255 423 1498 1203 -4797 -1707 -478 -1267 -2117 -3548 1450 -3893 -931 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -18 -11291 -12333 -894 -1115 -701 -1378 * * + 58 622 -4802 1764 1486 -5123 -4302 -2961 -1060 334 -4818 -3891 -420 -4396 1293 1148 487 -3268 -1087 -4985 -429 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -102 -11291 -4156 -894 -1115 -701 -1378 * * + 59 1265 -231 -1498 1351 -5045 -262 -355 -4796 922 -1073 -3813 778 -4318 877 -34 53 386 -2030 289 -4225 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -18 -11207 -12249 -894 -1115 -160 -3250 * * + 60 -684 813 -5723 -473 532 -2124 -3981 -2958 -121 2114 2840 -1421 -5174 -4409 -926 -4196 -1685 -376 -3915 497 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - 
-18 -11291 -12333 -894 -1115 -701 -1378 * * + 61 -1812 -4803 1626 -749 -515 -1133 -415 -4875 -1294 -4819 -3892 3181 -793 1470 -1377 -246 -3268 -4425 -4986 -193 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -18 -11291 -12333 -894 -1115 -701 -1378 * * + 62 -1812 -4808 -1465 33 -1509 2998 1583 -4879 122 -4823 -3897 972 -4400 -1078 -3055 -1613 -682 -4429 -4991 -1114 + - -149 -500 232 43 -378 398 105 -627 212 -466 -721 275 393 45 98 359 117 -367 -295 -250 + - -98 -4229 -12334 -49 -4901 -701 -1378 * * + 63 -676 -4701 -742 -1422 825 -589 -545 255 1702 -2571 812 -2986 -4424 796 418 -221 1302 -1179 -4912 1028 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -19 -11292 -12334 -894 -1115 -701 -1378 * * + 64 -3341 -4695 350 1378 -1551 -1973 -2998 477 1265 78 273 -1163 21 504 -1507 -1108 282 114 -19 473 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -19 -11292 -12334 -894 -1115 -701 -1378 * * + 65 -3605 -3444 -949 -2090 2356 -1177 -4010 1410 -1703 1341 -404 -1673 -747 -4487 -4679 -2139 -1048 1197 -3900 411 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -19 -11292 -12334 -894 -1115 -701 -1378 * * + 66 -655 -539 1179 279 -1324 1202 -2962 -1895 147 -682 1298 1427 -2056 608 756 -1119 -1893 -4419 -4982 140 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -19 -11292 -12335 -894 -1115 -701 -1378 * * + 67 -1814 -4814 166 -2636 -5135 2921 -568 -4885 -1333 -2415 -3903 1495 -4406 -312 -619 602 -1672 -4436 -4997 -4314 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -20 -11293 -12335 -894 -1115 -701 -1378 * * + 68 -3329 1217 -624 -797 -1594 -4303 1580 -4872 2069 -2414 -3890 617 -4396 283 2449 -560 -267 -2067 -4984 -1334 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -20 
-11293 -12335 -894 -1115 -701 -1378 * * + 69 108 566 -1460 747 -1608 -4306 -2965 -30 1407 -2607 -3878 346 1033 -336 863 -1038 745 617 -4975 -4296 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -20 -11293 -12335 -894 -1115 -701 -1378 * * + 70 -1318 -3465 -283 -172 -3423 -2053 -3974 1957 -4721 1761 1425 -4678 -1762 -4391 -1578 -1974 -1561 1341 -3918 -3570 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -20 -11293 -12336 -894 -1115 -701 -1378 * * + 71 -1165 -4790 -240 -275 -5105 -4306 1035 -2009 1665 -395 707 -1334 -218 -188 1891 -1077 -383 404 110 348 + - -149 -500 233 43 -381 398 106 -626 210 -464 -720 275 394 45 96 359 117 -369 -294 -249 + - -43 -6001 -12336 -150 -3342 -701 -1378 * * + 72 -1929 1218 -1535 -1647 -3990 -4677 -3410 1725 207 -1481 -3117 -3608 -810 -1118 -743 -1942 428 2687 -4325 -3869 + - * * * * * * * * * * * * * * * * * * * * + - * * * * * * * * 0 +// diff --git a/forester/archive/RIO/others/hmmer/testsuite/weeviterbi_test.seq b/forester/archive/RIO/others/hmmer/testsuite/weeviterbi_test.seq new file mode 100644 index 0000000..3a8750f --- /dev/null +++ b/forester/archive/RIO/others/hmmer/testsuite/weeviterbi_test.seq @@ -0,0 +1,10 @@ +>RU1A_HUMAN U1 SMALL NUCLEAR RIBONUCLEOPROTEIN A (U1 SNRNP A PROTEIN). +MAVPETRPNHTIYINNLNEKIKKDELKKSLYAIFSQFGQILDILVSRSLK +MRGQAFVIFKEVSSATNALRSMQGFPFYDKPMRIQYAKTDSDIIAKMKGT +FVERDRKREKRKPKSQETPATKKAVQGGGATPVVGAVQGPVPGMPPMTQA +PRIMHHMPGQPPYMPPPGMIPPPGLAPGQIPPGAMPPQQLMPGQMPPAQP +>RU1A_HUMAN U1 SMALL NUCLEAR RIBONUCLEOPROTEIN A (U1 SNRNP A PROTEIN). 
+TFVERDRKREKRKPKSQETPATKKAVQGGGATPVVGAVQGPVPGMPPMTQ +APRIMHHMPGQPPYMPPPGMIPPPGLAPGQIPPGAMPPQQLMPGQMPPAQ +PLSENPPNHILFLTNLPEETNELMLSMLFNQFPGFKEVRLVPGRHDIAFV +EFDNEVQAGAARDALQGFKITQNNAMKISFAKK diff --git a/forester/archive/RIO/others/hmmer/tutorial/7LES_DROME b/forester/archive/RIO/others/hmmer/tutorial/7LES_DROME new file mode 100644 index 0000000..885cccf --- /dev/null +++ b/forester/archive/RIO/others/hmmer/tutorial/7LES_DROME @@ -0,0 +1,138 @@ +ID 7LES_DROME STANDARD; PRT; 2554 AA. +AC P13368; +DT 01-JAN-1990 (REL. 13, CREATED) +DT 01-JAN-1990 (REL. 13, LAST SEQUENCE UPDATE) +DT 01-NOV-1995 (REL. 32, LAST ANNOTATION UPDATE) +DE SEVENLESS PROTEIN (EC 2.7.1.112). +GN SEV. +OS DROSOPHILA MELANOGASTER (FRUIT FLY). +OC EUKARYOTA; METAZOA; ARTHROPODA; INSECTA; DIPTERA. +RN [1] +RP SEQUENCE FROM N.A. +RC STRAIN=CANTON-S; +RX MEDLINE; 88282538. +RA BASLER K., HAFEN E.; +RL CELL 54:299-311(1988). +RN [2] +RP SEQUENCE FROM N.A. +RC STRAIN=OREGON-R; +RX MEDLINE; 88329706. +RA BOWTELL D.L.L., SIMON M.A., RUBIN G.M.; +RL GENES DEV. 2:620-634(1988). +RN [3] +RP IDENTIFICATION OF FN-III REPEATS. +RX MEDLINE; 90199889. +RA NORTON P.A., HYNES R.O., RESS D.J.G.; +RL CELL 61:15-16(1990). +CC -!- FUNCTION: RECEPTOR FOR AN EXTRACELLULAR SIGNAL REQUIRED TO +CC INSTRUCT A CELL TO DIFFERENTIATE INTO A R7 PHOTORECEPTOR. THE +CC LIGAND FOR SEV IS THE BOSS (BRIDE OF SEVENLESS) PROTEIN ON THE +CC SURFACE OF THE NEIGHBORING R8 CELL. +CC -!- CATALYTIC ACTIVITY: ATP + A PROTEIN TYROSINE = ADP + +CC PROTEIN TYROSINE PHOSPHATE. +CC -!- SUBUNIT: MAY FORM A COMPLEX WITH DRK AND SOS. +CC -!- SIMILARITY: BELONGS TO THE INSULIN RECEPTOR FAMILY OF TYROSINE- +CC PROTEIN KINASES. +CC -!- SIMILARITY: CONTAINS SEVEN FIBRONECTIN TYPE III-LIKE DOMAINS. +CC -!- CAUTION: UNCLEAR WHETHER THE POTENTIAL MEMBRANE SPANNING REGION +CC NEAR THE N-TERMINUS IS PRESENT AS A TRANSMEMBRANE DOMAIN IN THE +CC NATIVE PROTEIN OR SERVES AS A CLEAVED SIGNAL SEQUENCE. +DR EMBL; X13666; G8579; ALT_INIT. 
+DR EMBL; J03158; G158419; -. +DR PIR; A28912; TVFF7L. +DR FLYBASE; FBGN0003366; SEV. +DR PROSITE; PS00107; PROTEIN_KINASE_ATP. +DR PROSITE; PS00109; PROTEIN_KINASE_TYR. +DR PROSITE; PS00239; RECEPTOR_TYR_KIN_II. +DR PROSITE; PS50011; PROTEIN_KINASE_DOM. +KW TRANSFERASE; TYROSINE-PROTEIN KINASE; TRANSMEMBRANE; ATP-BINDING; +KW PHOSPHORYLATION; RECEPTOR; VISION; REPEAT. +FT DOMAIN 1 2123 EXTRACELLULAR (POTENTIAL). +FT TRANSMEM 102 122 POTENTIAL. +FT TRANSMEM 2124 2147 POTENTIAL. +FT DOMAIN 2148 2554 CYTOPLASMIC (POTENTIAL). +FT DOMAIN 311 431 FIBRONECTIN TYPE-III. +FT DOMAIN 436 528 FIBRONECTIN TYPE-III. +FT DOMAIN 822 921 FIBRONECTIN TYPE-III. +FT DOMAIN 1298 1392 FIBRONECTIN TYPE-III. +FT DOMAIN 1680 1794 FIBRONECTIN TYPE-III. +FT DOMAIN 1797 1897 FIBRONECTIN TYPE-III. +FT DOMAIN 1898 1988 FIBRONECTIN TYPE-III. +FT DOMAIN 2038 2046 POLY-ARG. +FT DOMAIN 2209 2485 PROTEIN KINASE. +FT NP_BIND 2215 2223 ATP (BY SIMILARITY). +FT BINDING 2242 2242 ATP (BY SIMILARITY). +FT MUTAGEN 2242 2242 K->M: INACTIVATES THE PROTEIN. +FT MOD_RES 2380 2380 PHOSPHORYLATION (AUTO-) (BY SIMILARITY). +FT CARBOHYD 30 30 POTENTIAL. +FT CARBOHYD 129 129 POTENTIAL. +FT CARBOHYD 481 481 POTENTIAL. +FT CARBOHYD 505 505 POTENTIAL. +FT CARBOHYD 617 617 POTENTIAL. +FT CARBOHYD 647 647 POTENTIAL. +FT CARBOHYD 966 966 POTENTIAL. +FT CARBOHYD 1228 1228 POTENTIAL. +FT CARBOHYD 1313 1313 POTENTIAL. +FT CARBOHYD 1353 1353 POTENTIAL. +FT CARBOHYD 1550 1550 POTENTIAL. +FT CARBOHYD 1557 1557 POTENTIAL. +FT CARBOHYD 1639 1639 POTENTIAL. +FT CARBOHYD 1725 1725 POTENTIAL. +FT CARBOHYD 1756 1756 POTENTIAL. +FT CARBOHYD 1804 1804 POTENTIAL. +FT CARBOHYD 1889 1889 POTENTIAL. +FT CARBOHYD 1947 1947 POTENTIAL. +FT CARBOHYD 2073 2073 POTENTIAL. +FT VARIANT 392 392 M -> V. +FT VARIANT 1668 1668 A -> V. +FT VARIANT 1703 1703 N -> H. +FT VARIANT 1730 1730 R -> K. +FT VARIANT 1731 1731 G -> E. +FT VARIANT 1741 1741 V -> M. +FT VARIANT 2271 2271 R -> C. +FT CONFLICT 1823 1823 E -> Q (IN REF. 2). 
+SQ SEQUENCE 2554 AA; 287107 MW; 1143D891 CRC32; + MTMFWQQNVD HQSDEQDKQA KGAAPTKRLN ISFNVKIAVN VNTKMTTTHI NQQAPGTSSS + SSNSQNASPS KIVVRQQSSS FDLRQQLARL GRQLASGQDG HGGISTILII NLLLLILLSI + CCDVCRSHNY TVHQSPEPVS KDQMRLLRPK LDSDVVEKVA IWHKHAAAAP PSIVEGIAIS + SRPQSTMAHH PDDRDRDRDP SEEQHGVDER MVLERVTRDC VQRCIVEEDL FLDEFGIQCE + KADNGEKCYK TRCTKGCAQW YRALKELESC QEACLSLQFY PYDMPCIGAC EMAQRDYWHL + QRLAISHLVE RTQPQLERAP RADGQSTPLT IRWAMHFPEH YLASRPFNIQ YQFVDHHGEE + LDLEQEDQDA SGETGSSAWF NLADYDCDEY YMCEILEALI PYTQYRFRFE LPFGENRDEV + LYSPATPAYQ TPPEGAPISA PVIEHLMGLD DSHLAVHWHP GRFTNGPIEG YRLRLSSSEG + NATSEQLVPA GRGSYIFSQL QAGTNYTLAL SMINKQGEGP VAKGFVQTHS ARNEKPAKDL + TESVLLVGRR AVMWQSLEPA GENSMIYQSQ EELADIAWSK REQQLWLLNV HGELRSLKFE + SGQMVSPAQQ LKLDLGNISS GRWVPRRLSF DWLHHRLYFA MESPERNQSS FQIISTDLLG + ESAQKVGESF DLPVEQLEVD ALNGWIFWRN EESLWRQDLH GRMIHRLLRI RQPGWFLVQP + QHFIIHLMLP QEGKFLEISY DGGFKHPLPL PPPSNGAGNG PASSHWQSFA LLGRSLLLPD + SGQLILVEQQ GQAASPSASW PLKNLPDCWA VILLVPESQP LTSAGGKPHS LKALLGAQAA + KISWKEPERN PYQSADAARS WSYELEVLDV ASQSAFSIRN IRGPIFGLQR LQPDNLYQLR + VRAINVDGEP GEWTEPLAAR TWPLGPHRLR WASRQGSVIH TNELGEGLEV QQEQLERLPG + PMTMVNESVG YYVTGDGLLH CINLVHSQWG CPISEPLQHV GSVTYDWRGG RVYWTDLARN + CVVRMDPWSG SRELLPVFEA NFLALDPRQG HLYYATSSQL SRHGSTPDEA VTYYRVNGLE + GSIASFVLDT QQDQLFWLVK GSGALRLYRA PLTAGGDSLQ MIQQIKGVFQ AVPDSLQLLR + PLGALLWLER SGRRARLVRL AAPLDVMELP TPDQASPASA LQLLDPQPLP PRDEGVIPMT + VLPDSVRLDD GHWDDFHVRW QPSTSGGNHS VSYRLLLEFG QRLQTLDLST PFARLTQLPQ + AQLQLKISIT PRTAWRSGDT TRVQLTTPPV APSQPRRLRV FVERLATALQ EANVSAVLRW + DAPEQGQEAP MQALEYHISC WVGSELHEEL RLNQSALEAR VEHLQPDQTY HFQVEARVAA + TGAAAGAASH ALHVAPEVQA VPRVLYANAE FIGELDLDTR NRRRLVHTAS PVEHLVGIEG + EQRLLWVNEH VELLTHVPGS APAKLARMRA EVLALAVDWI QRIVYWAELD ATAPQAAIIY + RLDLCNFEGK ILQGERVWST PRGRLLKDLV ALPQAQSLIW LEYEQGSPRN GSLRGRNLTD + GSELEWATVQ PLIRLHAGSL EPGSETLNLV DNQGKLCVYD VARQLCTASA LRAQLNLLGE + DSIAGQLAQD SGYLYAVKNW SIRAYGRRRQ QLEYTVELEP EEVRLLQAHN YQAYPPKNCL + LLPSSGGSLL KATDCEEQRC LLNLPMITAS EDCPLPIPGV 
RYQLNLTLAR GPGSEEHDHG + VEPLGQWLLG AGESLNLTDL LPFTRYRVSG ILSSFYQKKL ALPTLVLAPL ELLTASATPS + PPRNFSVRVL SPRELEVSWL PPEQLRSESV YYTLHWQQEL DGENVQDRRE WEAHERRLET + AGTHRLTGIK PGSGYSLWVQ AHATPTKSNS SERLHVRSFA ELPELQLLEL GPYSLSLTWA + GTPDPLGSLQ LECRSSAEQL RRNVAGNHTK MVVEPLQPRT RYQCRLLLGY AATPGAPLYH + GTAEVYETLG DAPSQPGKPQ LEHIAEEVFR VTWTAARGNG APIALYNLEA LQARSDIRRR + RRRRRRNSGG SLEQLPWAEE PVVVEDQWLD FCNTTELSCI VKSLHSSRLL LFRVRARSLE + HGWGPYSEES ERVAEPFVSP EKRGSLVLAI IAPAAIVSSC VLALVLVRKV QKRRLRAKKL + LQQSRPSIWS NLSTLQTQQQ LMAVRNRAFS TTLSDADIAL LPQINWSQLK LLRFLGSGAF + GEVYEGQLKT EDSEEPQRVA IKSLRKGASE FAELLQEAQL MSNFKHENIV RLVGICFDTE + SISLIMEHME AGDLLSYLRA ARATSTQEPQ PTAGLSLSEL LAMCIDVANG CSYLEDMHFV + HRDLACRNCL VTESTGSTDR RRTVKIGDFG LARDIYKSDY YRKEGEGLLP VRWMSPESLV + DGLFTTQSDV WAFGVLCWEI LTLGQQPYAA RNNFEVLAHV KEGGRLQQPP MCTEKLYSLL + LLCWRTDPWE RPSFRRCYNT LHAISTDLRR TQMASATADT VVSCSRPEFK VRFDGQPLEE + HREHNERPED ENLTLREVPL KDKQLYANEG VSRL +// diff --git a/forester/archive/RIO/others/hmmer/tutorial/Artemia.fa b/forester/archive/RIO/others/hmmer/tutorial/Artemia.fa new file mode 100644 index 0000000..339a71b --- /dev/null +++ b/forester/archive/RIO/others/hmmer/tutorial/Artemia.fa @@ -0,0 +1,48 @@ +>S13421 S13421 GLOBIN - BRINE SHRIMP +DKATIKRTWATVTDLPSFGRNVFLSVFAAK +PEYKNLFVEFRNIPASELASSERLLYHGGR +VLSSIDEAIAGIDTPDRAVKTLLALGERHI +SRGTVRRHFEAFSYAFIDELKQRGVESADL +AAWRRGWDNIVNVLEAGLLRRQIDLEVTGL +SCVDVANIQESWSKVSGDLKTTGSVVFQRM +INGHPEYQQLFRQFRDVDLDKLGESNSFVA +HVFRVVAAFDGIIHELDNNQFIVSTLKKLG +EQHIARGTDISHFQNFRVTLLEYLKENGMN +GAQKASWNKAFDAFEKYISMGLSSLKRVDP +ITGLSGLEKNAILSTWGKVRGNLQEVGKAT +FGKLFTAHPEYQQMFRFSQGMPLASLVESP +KFAAHTQRVVSALDQTLLALNRPSDFVYMI +KELGLDHINRGTDRSHFENYQVVFIEYLKE +TLGDSLDEFTVKSFNHVFEVIISFLNEGLR +QADIVDPVTHLTGRQKEMIKASWSKARTDL +RSLGQELFMRMFKAHPEYQTLFVNKGFADV +PLVSLREDERFISHMANVLGGFDTLLQNLD +ESSYFIYSLRNLGDAHIQRKAGTQHFRSFE +AILIPILQESQGLDAASVEAWKKFFDVSIG +VIAQGLKVATSEEADPVTGLYGKEIVALRQ +AFAAVTPRNVEIGKRVFAKLFAAHPEYKNL +FKKFEQYSVEELPSTDAFHYHISLVMNRFS 
+SIGKVIDDNVSFVYLLKKLGREHIKRGLSR +KQFDQFVELYIAEISSELSDTGRNGLEKVL +TFATGVIEQGLFQLGQVDSNTLTALEKQSI +QDIWSNLRSTGLQDLAVKIFTRLFSAHPEY +KLLFTGRFGNVDNINENAPFKAHLHRVLSA +FDIVISTLDDSEHLIRQLKDLGLFHTRLGM +TRSHFDNFATAFLSVAQDIAPNQLTVLGRE +SLNKGFKLMHGVIEEGLLQLERINPITGLS +AREVAVVKQTWNLVKPDLMGVGMRIFKSLF +EAFPAYQAVFPKFSDVPLDKLEDTPAVGKH +SISVTTKLDELIQTLDEPANLALLARQLGE +DHIVLRVNKPMFKSFGKVLVRLLENDLGQR +FSSFASRSWHKAYDVIVEYIEEGLQQSYKQ +DPVTGITDAEKALVQESWDLLKPDLLGLGR +KIFTKVFTKHPDYQILFTRTGFGDTPLTKL +DDNPAFGTHIIKVMRAFDHVIQILGKPKTL +MAYLRSVGADHIATNVERRHFQAFSNALIP +VMQHDLKAQLRPDAVAAWRKGLDRIIGIID +QGLIGLKEVNPQNAFSAYDIQAVQRTWALA +KPDLMGKGAMVFKQLFTDHGYQPLFSNLAQ +YEITGLEGSPELNTHARNVMAQLDTLVGSL +QNSIELGQSLAQLGKDHVPRKVNRVHFKDF +AEHFIPLMKADLGDEFTPLAESAWKRAFDV +MIATIEQGQEGSSHALSSFLTNPVA diff --git a/forester/archive/RIO/others/hmmer/tutorial/RU1A_HUMAN b/forester/archive/RIO/others/hmmer/tutorial/RU1A_HUMAN new file mode 100644 index 0000000..412e284 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/tutorial/RU1A_HUMAN @@ -0,0 +1,98 @@ +ID RU1A_HUMAN STANDARD; PRT; 282 AA. +AC P09012; +DT 01-NOV-1988 (REL. 09, CREATED) +DT 01-NOV-1988 (REL. 09, LAST SEQUENCE UPDATE) +DT 01-OCT-1996 (REL. 34, LAST ANNOTATION UPDATE) +DE U1 SMALL NUCLEAR RIBONUCLEOPROTEIN A (U1 SNRNP A PROTEIN). +GN SNRPA. +OS HOMO SAPIENS (HUMAN). +OC EUKARYOTA; METAZOA; CHORDATA; VERTEBRATA; TETRAPODA; MAMMALIA; +OC EUTHERIA; PRIMATES. +RN [1] +RP SEQUENCE FROM N.A. +RC TISSUE=LIVER; +RX MEDLINE; 91340152. +RA NELISSEN R.L.H., SILLEKENS P.T.G., BEIJER R.P., +RA GEURTS VAN KESSEL A.H.M., VAN VENROOIJ W.J.; +RL GENE 102:189-196(1991). +RN [2] +RP SEQUENCE FROM N.A. +RX MEDLINE; 88111575. +RA SILLEKENS P.T.G., HABETS W.J., BEIJER R.P., VAN VENROOIJ W.J.; +RL EMBO J. 6:3841-3848(1987). +RN [3] +RP X-RAY CRYSTALLOGRAPHY (2.8 ANGSTROMS) OF 1-95. +RX MEDLINE; 91061907. +RA NAGAI K., OUBRIDGE C., JESSEN T.-H., LI J., EVANS P.R.; +RL NATURE 348:515-520(1990). +RN [4] +RP X-RAY CRYSTALLOGRAPHY (1.92 ANGSTROMS). +RX MEDLINE; 95075454. 
+RA OUBRIDGE C., ITO N., EVANS P.R., TEO C.-H., NAGAI K.; +RL NATURE 372:432-438(1994). +RN [5] +RP STRUCTURE BY NMR OF 11-94. +RX MEDLINE; 91172834. +RA HOFFMAN D.W., QUERY C.C., GOLDEN B.L., WHITE S.W., KEENE J.D.; +RL PROC. NATL. ACAD. SCI. U.S.A. 88:2495-2499(1991). +RN [6] +RP STRUCTURE BY NMR OF 1-102. +RX MEDLINE; 94349935. +RA HOWE P.W.A., NAGAI K., NEUHAUS D., VARANI G.; +RL EMBO J. 13:3873-3881(1994). +RN [7] +RP STRUCTURE BY NMR OF 2-102. +RX MEDLINE; 96186818. +RA ALLAIN F.H.-T., GUBSER C.C., HOWE P.W.A., NAGAI K., NEUHAUS D., +RA VARANI G.; +RL NATURE 380:646-650(1996). +RN [8] +RP STRUCTURE BY NMR OF 1-117. +RX MEDLINE; 96180024. +RA AVIS J.M., ALLAIN F.H.-T., HOWE P.W.A., VARANI G., NAGAI K., +RA NEUHAUS D.; +RL J. MOL. BIOL. 257:398-411(1996). +RN [9] +RP MUTAGENESIS, AND DETAILED STUDIES OF RNA-BINDING. +RX MEDLINE; 92007796. +RA JESSEN T.-H., OUBRIDGE C., TEO C.H., PRITCHARD C., NAGAI K.; +RL EMBO J. 10:3447-3456(1991). +CC -!- FUNCTION: BINDS STEM LOOP II OF U1 SNRNA. IT IS THE FIRST SN-RNP +CC TO INTERACT WITH PRE-MRNA. THIS INTERACTION IS REQUIRED FOR THE +CC SUBSEQUENT BINDING OF U2 SN-RNP AND THE U4/U6/U5 TRI-SN-RNP. +CC -!- SUBUNIT: BELONGS TO THE SPLICEOSOME WHERE IT IS ASSOCIATED WITH +CC SN-RNP U1. +CC -!- SUBCELLULAR LOCATION: NUCLEAR. +CC -!- SIMILARITY: BELONGS TO THE U1 A/B" FAMILY. +CC -!- SIMILARITY: CONTAINS 2 RNA RECOGNITION MOTIFS (RNP). +DR EMBL; M60784; G340052; -. +DR EMBL; M60779; G340052; JOINED. +DR EMBL; M60780; G340052; JOINED. +DR EMBL; M60781; G340052; JOINED. +DR EMBL; M60782; G340052; JOINED. +DR EMBL; M60783; G340052; JOINED. +DR EMBL; X06347; G37541; -. +DR PIR; JQ1528; JQ1528. +DR PDB; 1NRC; 31-JAN-94. +DR MIM; 182285; -. +DR PROSITE; PS00030; RNP_1. +KW NUCLEAR PROTEIN; RNA-BINDING; RIBONUCLEOPROTEIN; REPEAT; +KW SPLICEOSOME; 3D-STRUCTURE. +FT DOMAIN 12 17 RNA-BINDING (RNP2) (BY SIMILARITY). +FT DOMAIN 52 59 RNA-BINDING (RNP1) (BY SIMILARITY). +FT DOMAIN 210 215 RNA-BINDING (RNP2) (BY SIMILARITY). 
+FT DOMAIN 244 251 RNA-BINDING (RNP1) (BY SIMILARITY). +FT REPEAT 1 89 +FT REPEAT 199 282 +FT MUTAGEN 11 11 T->V: ABOLISHES RNA-BINDING. +FT MUTAGEN 13 13 Y->F: SUBSTANTIALLY REDUCES RNA-BINDING. +FT MUTAGEN 15 15 N->V: ABOLISHES RNA-BINDING. +FT MUTAGEN 16 16 N->V: SUBSTANTIALLY REDUCES RNA-BINDING. +FT MUTAGEN 52 52 R->Q: ABOLISHES RNA-BINDING. +SQ SEQUENCE 282 AA; 31279 MW; 22427816 CRC32; + MAVPETRPNH TIYINNLNEK IKKDELKKSL YAIFSQFGQI LDILVSRSLK MRGQAFVIFK + EVSSATNALR SMQGFPFYDK PMRIQYAKTD SDIIAKMKGT FVERDRKREK RKPKSQETPA + TKKAVQGGGA TPVVGAVQGP VPGMPPMTQA PRIMHHMPGQ PPYMPPPGMI PPPGLAPGQI + PPGAMPPQQL MPGQMPPAQP LSENPPNHIL FLTNLPEETN ELMLSMLFNQ FPGFKEVRLV + PGRHDIAFVE FDNEVQAGAA RDALQGFKIT QNNAMKISFA KK +// diff --git a/forester/archive/RIO/others/hmmer/tutorial/amino.null b/forester/archive/RIO/others/hmmer/tutorial/amino.null new file mode 100644 index 0000000..f882173 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/tutorial/amino.null @@ -0,0 +1,30 @@ +# amino.null +# +# Example of a null model file for protein sequences. +# The values in this file are the HMMER 2 default +# settings. + +Amino + +0.075520 # A +0.016973 # C +0.053029 # D +0.063204 # E +0.040762 # F +0.068448 # G +0.022406 # H +0.057284 # I +0.059398 # K +0.093399 # L +0.023569 # M +0.045293 # N +0.049262 # P +0.040231 # Q +0.051573 # R +0.072214 # S +0.057454 # T +0.065252 # V +0.012513 # W +0.031985 # Y + +0.997151 # p1 diff --git a/forester/archive/RIO/others/hmmer/tutorial/amino.pri b/forester/archive/RIO/others/hmmer/tutorial/amino.pri new file mode 100644 index 0000000..77c1d8c --- /dev/null +++ b/forester/archive/RIO/others/hmmer/tutorial/amino.pri @@ -0,0 +1,70 @@ +# amino.pri +# +# This file incorporates Blocks9.plib, the UCSC mixture +# Dirichlet prior created by Kimmen Sjolander. +# The values in this file are the HMMER 2 default settings. 
+ +Dirichlet # Strategy (mixture Dirichlet) +Amino # type of prior (Amino or Nucleic) + +# Transitions +1 # Single component +1.0 # with probability = 1.0 +0.7939 0.0278 0.0135 # m->m, m->i, m->d alpha's +0.1551 0.1331 # i->m, i->i alpha's +0.9002 0.5630 # d->m, d->d alpha's + +# Match emissions +# +9 # 9 components + +# Component 1 +0.178091 +0.270671 0.039848 0.017576 0.016415 0.014268 0.131916 0.012391 0.022599 0.020358 0.030727 0.015315 0.048298 0.053803 0.020662 0.023612 0.216147 0.147226 0.065438 0.003758 0.009621 +# S A T , C G P >< N V M , Q H R I K F L D W , E Y + +# Component 2 +0.056591 +0.021465 0.0103 0.011741 0.010883 0.385651 0.016416 0.076196 0.035329 0.013921 0.093517 0.022034 0.028593 0.013086 0.023011 0.018866 0.029156 0.018153 0.0361 0.07177 0.419641 +# Y , F W , H ,>< L M , N Q I C V S R , T P A K D G E + +# Component 3 +0.0960191 +0.561459 0.045448 0.438366 0.764167 0.087364 0.259114 0.21494 0.145928 0.762204 0.24732 0.118662 0.441564 0.174822 0.53084 0.465529 0.583402 0.445586 0.22705 0.02951 0.12109 +# Q E , K N R S H D T A >< M P Y G , V L I W C F + +# Component 4 +0.0781233 +0.070143 0.01114 0.019479 0.094657 0.013162 0.048038 0.077 0.032939 0.576639 0.072293 0.02824 0.080372 0.037661 0.185037 0.506783 0.073732 0.071587 0.042532 0.011254 0.028723 +# K R , Q , H >< N E T M S , P W Y A L G V C I , D F + +# Component 5 +0.0834977 +0.041103 0.014794 0.00561 0.010216 0.153602 0.007797 0.007175 0.299635 0.010849 0.999446 0.210189 0.006127 0.013021 0.019798 0.014509 0.012049 0.035799 0.180085 0.012744 0.026466 +# L M , I , F V ><, W Y C T Q , A P H R , K S E N , D G + +# Component 6 +0.0904123 +0.115607 0.037381 0.012414 0.018179 0.051778 0.017255 0.004911 0.796882 0.017074 0.285858 0.075811 0.014548 0.015092 0.011382 0.012696 0.027535 0.088333 0.94434 0.004373 0.016741 +# I V ,, L M >< C T A , F , Y S P W N , E Q K R D G H + +# Component 7 +0.114468 +0.093461 0.004737 0.387252 0.347841 0.010822 0.105877 0.049776 0.014963 0.094276 0.027761 
0.01004 0.187869 0.050018 0.110039 0.038668 0.119471 0.065802 0.02543 0.003215 0.018742 +# D , E N , Q H S >< K G P T A , R Y , M V L F W I C + +# Component 8 +0.0682132 +0.452171 0.114613 0.06246 0.115702 0.284246 0.140204 0.100358 0.55023 0.143995 0.700649 0.27658 0.118569 0.09747 0.126673 0.143634 0.278983 0.358482 0.66175 0.061533 0.199373 +# M , V I L F T Y C A >< W S H Q R N K , P E G , D + +# Component 9 +0.234585 +0.005193 0.004039 0.006722 0.006121 0.003468 0.016931 0.003647 0.002184 0.005019 0.00599 0.001473 0.004158 0.009055 0.00363 0.006583 0.003172 0.00369 0.002967 0.002772 0.002686 +# P G W , C H R D E >< N Q K F Y T L A M , S V I + + +## Insert emissions +1 # Single component +1.0 # with probability 1.0 +681 120 623 651 313 902 241 371 687 676 143 548 647 415 551 926 623 505 102 269 diff --git a/forester/archive/RIO/others/hmmer/tutorial/fn3.slx b/forester/archive/RIO/others/hmmer/tutorial/fn3.slx new file mode 100644 index 0000000..631b108 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/tutorial/fn3.slx @@ -0,0 +1,351 @@ +# ID fn3 +# AC PF00041 +# DE Fibronectin type III domain +# AU Sonnhammer ELL +# AL HMM_simulated_annealing +# AM hmma -qR +# SE Swissprot_feature_table +# GA Bic_raw 18 hmmls 20 +# CC There is no clear separation between signal and noise. +# DR PROSITE; PDOC00214; +# DR SCOP; 1ttf; sf; +# RN [1] +# RA Bazan J.F. +# RL PNAS USA 87:6934-6938(1990). +# RN [2] +# RA Little E., Bork P., Doolittle R. +# RL J. Mol. Evol. 39:631-643(1994). +# RN [3] +# RA Kornblihtt A.R., et al. +# RL EMBO J. 4:1755-1759(1985). 
+# SQ 109 +7LES_DROME/1799-1891 P.SPP.RNFSVRVL..SPRELEVSWLPPEq...LRSESVYYTLHW...QQ +7LES_DROVI/1917-1997 S.YAPlPPLQLIEL..NAYGMTLAWPGT......PDALSSLTLEC...QS +APU_THETY/928-1009 A.PQPiTDLKAVS...GNGKVDLSWSVV.......DKAVSYNI.YR...S +APU_THETY/1165-1248 P.TAP.V.LQQPGI..ESSRVTLNWSPSA....DDVAIFGYEIYK...SS +AXO1_CHICK/602-692 PpGPP.GGVVVRDI..GDTTVQLSWSRGFd...NHSPIARYSIEAR...T +AXO1_CHICK/807-896 PkVAP.FRVTAKAV..LSSEMDVSWEPVEqg.dMTGVLLGYEIRY...WK +CAML_HUMAN/812-907 P.QAI.PELEGIEIl.NSSAVLVKWRPVDla.qVKGHLRGYNVTY...WR +CHI1_BACCI/465-542 P.SVP.GNARSTGV..TANSVTLAWNAST....DNVGVTGYNV.YN.... +CHIT_STRLI/142-219 P.SAP.GTPTASNI..TDTSVKLSWSAAT....DDKGVKNYDV.LR.... +CHIX_STROI/169-240 P.PAPpTGLRTGSV..TATSVALSWSPV.......TGATGYAV.YR.... +CONT_CHICK/799-884 PtEVP.TDVSVKVL..SSSEISVSWHHVT.....EKSVEGYQIRY...WA +CPSF_CHICK/630-716 P.DPP.QSVRVTSV..GEDWAVLSWEAPPf..dGGMPITGYLMER...KK +CPSF_CHICK/923-1008 P.GPP.QAVRVMEV..WGSNALLQWEPPKd..dGNAEISGYTVQK...AD +ECK_HUMAN/329-420 P.SAP.HYLTAVGM..GA.KVELRWTPPQd..sGGREDIVYSVTCEqcWP +ECK_HUMAN/436-519 Q.TEP.PKVRLEGR..STTSLSVSWSIPPp...QQSRVWKYEVTYR...K +EPH1_HUMAN/333-435 P.SAP.RNLSFSA...SGTQLSLRWEPPAd..tGGRQDVRYSVRCS..QC +EPH3_CHICK/333-429 P.SAP.QAV.ISSV..NETSLMLEWTPPRd..sGGREDLVYNIIC...KS +EPH3_CHICK/444-528 P.SAV.SIMHQVSR..TVDSITLSWSQPDq...PNGVILDYELQY...YE +ETK1_CHICK/325-421 P.SAP.RNV.ISNI..NETSVILDWSWPLd..tGGRKDVTFNIIC...KK +FAS2_SCHAM/530-616 P.SAV.LQVKMDVM..TATTVTFKFFGPGn..dGGLPTKNYAVQY...KQ +FAS2_SCHAM/642-735 T.SGT.ENEVVVSP..YPNRYELRWQVPAd...NGEPITHYSVKS...CP +FINC_BOVIN/577-660 T.SGP.VQVIITETpsQPNSHPIQWSAPE.....SSHISKYILRW...KP +FINC_BOVIN/689-768 P.VVA.TSESVTEI..TASSFVVSWVSA......SDTVSGFRVEY...EL +FINC_BOVIN/780-858 P.DAP.PDPTVDQV..DDTSIVVRWSRP......RAPITGYRIVY...SP +FINC_BOVIN/875-955 KvPPP.RDLQFVEV..TDVKITIMWTPP......ESPVTGYRVDV...IP +FINC_BOVIN/1142-1225 PlSPP.TNLHLEANp.DTGVLTVSWERST.....TPDITGYRITT...TP +FINC_BOVIN/1236-1316 V.PPP.TDLRFTNV..GPDTMRVTWAPPS.....SIELTNLLVRY...SP +FINC_BOVIN/1327-1406 
L.DSP.SGIDFSDI..TANSFTVHWIAP......RATITGYRIRHH...P +FINC_BOVIN/1417-1499 S.DVP.RDLEVIAA..TPTSLLISWDAP......AVTVRYYRITY...GE +FINC_BOVIN/1511-1590 I.DKP.SQMQVTDV..QDNSISVRWLPS......SSPVTGYRVTT...AP +FINC_BOVIN/1601-1680 I.PAP.TNLKFTQV..TPTSLTAQWTAP......NVQLTGYRVRV...TP +FINC_BOVIN/1693-1771 V.SPP.RRARVTDA..TETTITISWRTK......TETITGFQVDA...IP +FINC_BOVIN/1782-1861 I.DAP.SNLRFLAT..TPNSLLVSWQPP......RARITGYIIKY...EK +FINC_CHICK/551-630 I.DRP.KGLTFTEV..DVDSIKIAWESP......QGQVTRYRVTY...SS +FINC_RAT/1266-1346 V.PQL.TDLSFVDI..TDSSIGLRWTPLN.....SSTIIGYRITV...VA +GUNB_CELFI/651-733 P.TTP.GTPVATGV..TTVGASLSWAASTd...AGSGVAGYEL.YR...V +IL7R_HUMAN/129-221 P.EAP.FDLSVIYRe.GANDFVVTFNTSHlq.kKYVKVLMHDVAYR..QE +ITB4_HUMAN/1127-1208 L.GAP.QNPNAKAA..GSRKIHFNWLPP......SGKPMGYRVKY...WI +ITB4_HUMAN/1220-1310 P.SEP.GRLAFNVV..SSTVTQLSWAEPAe...TNGEITAYEVCY...GL +ITB4_HUMAN/1581-1665 P.DTP.TRLVFSAL..GPTSLRVSWQEPR....CERPLQGYSVEY...QL +ITB4_HUMAN/1694-1781 P.SAP.GPLVFTAL..SPDSLQLSWERPRr...PNGDIVGYLVTC...EM +KALM_CHICK/178-271 P.LKPrKELKFIEL..QSGDLEVKWSSKFn...ISIEPVIYVVQRR..WN +KALM_CHICK/544-642 L.AKP.ENLSASFIv.QEGNITGHFSWKIskavLHQPMTGFQVTW...AE +KMLC_CHICK/60-145 P.DPPaGTPCASDI..RSSSLTLSWYGSSy..dGGSAVQSYTVEI...WN +LAR_DROME/322-404 P.TAP.TDVQISEV..TATSVRLEWSYK.....GPEDLQYYVIQY...KP +LAR_DROME/417-503 E.SAP.RNVQVRTL..SSSTMVITWEPPEt...PNGQVTGYKV.Y...YT +LAR_DROME/515-598 P.SQP.SNFRATDI..GETAVTLQWTKPTh...SSENIVHYELYW...ND +LAR_DROME/709-800 P.GDP.QDVKATPL..NSTSIHVSWKPPLek.dRNGIIRGYHIHA...QE +LAR_DROME/909-995 PgGPP.SNITIRFQ..TPDVLCVTWDPPTre.hRNGIITRYDVQFH..KK +MPSF_CHICK/371-457 P.GAP.MDVKCHDA..NRDYVIVTWKPPNt..tSQNPVIGYFVDK...CE +MPSF_CHICK/499-585 P.GPP.TNVHASEI..SKTYVVLSWDPPVp...RGREPLTYFIEK...SM +MPSF_CHICK/600-684 P.SAP.GRVVATRN..TKTSVVVQWDKPK....HEENLYGYYIDY...SV +MPSF_CHICK/699-785 P.SYP.HGITLLNC..DGHSMTLGWKAPKy..sGGSPILGYYIDKR...E +MPSF_CHICK/801-887 P.GPA.YDLTVCEV..RNTSLVLLWKAPVy..eGKSPITGYLVDY...KE +NCA1_BOVIN/509-597 P.SSP.SIDQVEP...YSSTAQVQFDEPEa..tGGVPILKYKAEWR...A +NCA1_BOVIN/610-691 
P.SAP.KLEGQMGE..DGNSIKVKLIKQDd...GGSPIRHYLVKYR...A +NGCA_CHICK/700-794 PeRNP.GGVHGEGN..ETGNLVITWEPLPpq.aWNAPWARYRVQWR...P +NRCA_CHICK/623-709 P.NPP.LDLELTGQ..LERSIELSWVPGEe...NNSPITNFVIEY...ED +NRCA_CHICK/726-810 ....P.SNVQGIGS..EPDNLVITWESLKgf.qSNGPGLQYKVSWR..QK +NRCA_CHICK/928-1014 P.SPP.SFLKITNP..TLDSLTLEWGSPTh...PNGVLTSYILKF...QP +NRG_DROME/717-799 ....P.DNVVGQGT..EPNNLVISWTPMPei.eHNAPNFHYYVSW...K. +NRG_DROME/815-905 PlDAP.TNFTMRQIt.SSTSGYMAWTPVSee.sVRGHFKGYKIQT...WT +NRG_DROME/917-1007 P.SPV.QGLDAYPL..GSSAFMLHWKKPLy...PNGKLTGYKIYY...EE +PHB_ALCFA/344-418 G.SAP.TGLAVTAT..TSTSVSLSWNAV.......ANASSYGV.YR.... +PTP1_DROME/123-205 P.DPP.SNLSVQVR..SGKNAIILWSPPT.....QGSYTAFKIKV...LG +PTP1_DROME/217-301 P.NTP.GKFIVWFR..NETTLLVLWQPPY....PAGIYTHYKVSI...EP +PTP1_DROME/312-394 P.LRP.LNVTFDRDfiTSNSFRVLWEAPK....GISEFDKYQVSV...AT +PTP1_DROME/405-485 P.LPV.RNLRSINDd.KTNTMIITWEADP.....ASTQDEYRIVYHe.LE +PTP1_DROME/583-661 P.NPP.RNMTIETV..RSNSVLVHWSPPE.....SGEFTEYSIRYR...T +PTP1_DROME/864-944 P.EPI.TQLHATNI..TDTEISLRWDLP......KGEYNDFDIAY...LT +PTP1_DROME/958-1044 P.GRV.ERFHPTDV..QPSEINFEWSLPSs..eANGVIRQFSIAY...TN +PTP6_DROME/236-321 V.PQV.SIDFAKAV..GANKIYLNWTVND....GNDPIQKFFITL...QE +PTP6_DROME/332-425 Y.DPI.FIPKVETTgsTASTITIGWNPPPp..dLIDYIQYYELIV...SE +PTP9_DROME/171-259 P.SKP.QNLTILDV..SANSITMSWHPPKn...QNGAIAGYHVFH...IH +PTPB_HUMAN/22-103 AePER.CNFTLAESkaSSHSVSIQWRIL.......GSPCNFSLIY...SS +PTPB_HUMAN/112-192 P.PAR.FGVSKEKT..TSTGLHVWWTPS......SGKVTSYEVQL...FD +PTPB_HUMAN/467-543 P.LAV.LQLRVKHA..NETSLSIMWQTP......VAEWEKYIISL...AD +PTPB_HUMAN/554-632 P.AQV.TDLHVANQg.MTSSLFTNWTQA......QGDVEFYQVLL...IH +PTPB_HUMAN/643-725 P.SSV.SGVTVNNSg.RNDYLSVSWLVA......PGDVDNYEVTL...SH +PTPB_HUMAN/731-808 P.DKV.QGVSVSNSa.RSDYLRVSWVHA......TGDFDHYEVTI...KN +PTPB_HUMAN/907-984 P.SAV.KNIHISPNg.ATDSLTVNWTPG......GGDVDSYTVSA...FR +PTPB_HUMAN/995-1074 P.ASV.QGVIADNAy.SSYSLIVSWQKA......AGVAERYDILL...LT +PTPB_HUMAN/1085-1162 P.AAV.TDLRITEN..STRHLSFRWTAS......EGELSWYNIFL...YN +PTPB_HUMAN/1173-1250 
P.ASV.SHLRGSNRn.TTDSLWFNWSPA......SGDFDFYELIL...YN +PTPB_HUMAN/1261-1344 P.SPP.SLMSFADI..ANTSLAITWKGPP....DWTDYNDFELQW...LP +PTPB_HUMAN/1355-1434 P.DKI.QNLHCRPQ..NSTAIACSWIPP......DSDFDGYSIECR...K +PTPK_MOUSE/290-376 P.PRPiAPPQLLGV..GPTYLLIQLNANSi..iGDGPIILKEVEYR...M +PTPZ_HUMAN/312-401 S.SEP.ENVQADPE..NYTSLLVTWERPRv..vYDTMIEKFAVLY...QQ +SEK_MOUSE/441-525 P.SSI.ALVQAKEV..TRYSVALAWLEPDr...PNGVILEYEVKY...YE +TENA_CHICK/593-671 V.SPP.TELTVTNV..TDKTVNLEWKHE.......NLVNEYLVTY...VP +TENA_CHICK/682-767 L.PAP.EGLKFKSV..RETSVQVEWDPL......SISFDGWELVFRnmQK +TENA_CHICK/774-853 L.DAP.SQIEAKDV..TDTTALITWSKP......LAEIEGIELTY...GP +TENA_CHICK/864-945 L.DAP.RNLKRVSQ..TDNSITLEWKNS......HANIDNYRIKF...AP +TENA_CHICK/956-1033 L.DNP.KDLEVSDP..TETTLSLRWRRP......VAKFDRYRLTY...VS +TENA_CHICK/1045-1124 E.PEL.GNLSVSET..GWDGFQLTWTAA......DGAYENFVIQV...QQ +TENA_CHICK/1136-1215 H.PEV.GELTVSDI..TPESFNLSWTTT......NGDFDAFTIEI...ID +TENA_CHICK/1227-1306 E.PEV.DNLLVSDA..TPDGFRLSWTAD......DGVFDSFVLKIR..DT +TENA_CHICK/1317-1395 V.GSP.KGISFSDI..TENSATVSWTPP......RSRVDSYRVSY...VP +TENA_CHICK/1406-1483 L.DSP.SGLVVMNI..TDSEALATWQPA......IAAVDNYIVSY...SS +TENA_CHICK/1494-1571 L.DAP.KDLSATEV..QSETAVITWRPP......RAPVTDYLLTY...ES +TENA_HUMAN/1254-1334 E.VPDmGNLTVTEV..SWDALRLNWTTP......DGTYDQFTIQV...QE +TENA_HUMAN/1528-1607 L.PLL.ENLTISDI..NPYGFTVSWMAS......ENAFDSFLVTV...VD +TIE1_HUMAN/446-533 P.PVPlAAPRLLTK..QSRQLVVSPLVSFs...GDGPISTVRLHYR..PQ +TIE1_HUMAN/545-632 PlLQP.WLEGWHVE..GTDRLRVSWSLPLv..pGPLVGDGFLLRL...WD +TIE1_HUMAN/644-729 P.PAP.RHLHAQAL..SDSEIQLTWKHPEa...LPGPISKYVVEV...QV +TIE2_HUMAN/444-529 L.PKPlNAPNVIDT..GHNFAVINISSEPy..fGDGPIKSKKLLY...KP +TIE2_HUMAN/543-626 L.PPP.RGLNLLPK..SQTTLNLTWQPIFp...SSEDDFYVEVERR...S +TIE2_HUMAN/639-724 P.PQP.ENIKISNI..THSSAVISWTILD.....GYSISSITIRY...KV +UFO_HUMAN/327-411 L.GPP.ENISATR...NGSQAFVHWQEPRa..pLQGTLLGYRLAY...QG + +7LES_DROME/1799-1891 ELDGEnvqd..rrewEAHER...RLET....AG..THRLTGIKPGSGYSL +7LES_DROVI/1917-1997 LREQ............LQFN...VAGN....HT..QMRLAPLQPKTRYSC 
+APU_THETY/928-1009 TVKGG..........LYEKI...ASNV....TQi.TYTDTEVTNGLKYVY +APU_THETY/1165-1248 SETGPf.........IKIAT...VSDS....VY..NYVDTDVVNGNVYYY +AXO1_CHICK/602-692 LLSNKwkq.....mrTNPVN...IEGN....AE..TAQVVNLIPWMDYEF +AXO1_CHICK/807-896 DGDKEea.......aDRVRT...AGLV....T...SAHVTGLNPNTKYHV +CAML_HUMAN/812-907 EGSQRkhsk..rhihKDHVV...VPAN....TT..SVILSGLRPYSSYHL +CHI1_BACCI/465-542 .GAN............LATS...VTGT....T....ATISGLTAGTSYTF +CHIT_STRLI/142-219 .DGA............KVAT...VTGT....T....YTDNGLTKGTAYSY +CHIX_STROI/169-240 .DGV............KVAT...ASGT....S....ATVTGLTPDTAYAF +CONT_CHICK/799-884 AHDKEa........aAQRVQ...VSNQ....EY..STKLENLKPNTRYHI +CPSF_CHICK/630-716 KGSMRw........mKLNFE...VFPD....T...TYESTKMIEGVFYEM +CPSF_CHICK/923-1008 TRTME..........WFTVL...EHSR....PT..RCTVSELVMGNEYRF +ECK_HUMAN/329-420 E.SGEcgp....ceaSVRYS...EPPHgl.tRT..SVTVSDLEPHMNYTF +ECK_HUMAN/436-519 KGDS............NSYN...VRRT....EGf.SVTLDDLAPDTTYLV +EPH1_HUMAN/333-435 QGTAQdggpcqpcgvGVHFSpgaRGLT....TP..AVHVNGLEPYANYTF +EPH3_CHICK/333-429 CGSGRgact...rcgDNVQF...APRQlgltEP..RIYISDLLAHTQYTF +EPH3_CHICK/444-528 KNLSE..........LNSTA...VKSP....TN..TVTVQNLKAGTIYVF +ETK1_CHICK/325-421 CGGSSkice...pcsDNVRF...LPRQtg.lTNt.TVTVVDLLAHTNYTF +FAS2_SCHAM/530-616 DSQGW..........EDALN...RTWP....VDs.PYILENLKPQTRYNF +FAS2_SCHAM/642-735 VEKYDtewrl.lpypCQEHK...LEGQ....AT..TFQLESLQPDTHYKV +FINC_BOVIN/577-660 KNSPDr.........WKEAT...IPGH....LN..SYTIKGLRPGVVYEG +FINC_BOVIN/689-768 SEEGDe.........PQYLD...LPST....AT..SVNIPDLLPGRKYTV +FINC_BOVIN/780-858 SVEGS..........STELN...LPET....AN..SVTLSDLQPGVQYNI +FINC_BOVIN/875-955 VNLPGe........hGQRLP...VSRN....T...FAEVTGLSPGVTYHF +FINC_BOVIN/1142-1225 TNGQQg........ySLEEV...VHAD....QS..SCTFENLSPGLEYNV +FINC_BOVIN/1236-1316 VKNEEd.........VAELS...ISPS....DN..AVVLTNLLPGTEYLV +FINC_BOVIN/1327-1406 ENMGGr.........PREDR...VPPS....RN..SITLTNLNPGTEYVV +FINC_BOVIN/1417-1499 TGGSSp.........VQEFT...VPGS....KS..TATISGLKPGVDYTI +FINC_BOVIN/1511-1590 KNGPGp.........SKTKT...VGPD....QT..EMTIEGLQPTVEYVV 
+FINC_BOVIN/1601-1680 KEKTGp.........MKEIN...LAPD....SS..SVVVSGLMVATKYEV +FINC_BOVIN/1693-1771 ANGQT..........PIQRT...IRPD....VR..SYTITGLQPGTDYKI +FINC_BOVIN/1782-1861 PGSPPr........eVVPRP...RPGV....T...EATITGLEPGTEYTI +FINC_CHICK/551-630 PEDG............IHEL...LPAPgg.eED..TAELHGLRPGSEYTI +FINC_RAT/1266-1346 AGEGIp.........IFEDF...VDSS....VG..YYTVTGLEPGIDYDI +GUNB_CELFI/651-733 QGTTQ..........TLVGT...TTAA....A....YILRDLTPGTAYSY +IL7R_HUMAN/129-221 KDENK..........WTHVN...LSST....KL..TLLQRKLQPAAMYEI +ITB4_HUMAN/1127-1208 QGDSEs.........EAHLL...DSKV....P...SVELTNLYPYCDYEM +ITB4_HUMAN/1220-1310 VNDDNrpi.....gpMKKVL...VDNP....KNr.MLLIENLRESQPYRY +ITB4_HUMAN/1581-1665 LNGGE..........LHRLN...IPNP....AQt.SVVVEDLLPNHSYVF +ITB4_HUMAN/1694-1781 AQGGGpa.......tAFRVD...GDSP....ES..RLTVPGLSENVPYKF +KALM_CHICK/178-271 QGIHPsed.....daTNWQT...VAQT....TDe.RVQLSDIRASRWYQF +KALM_CHICK/544-642 VTTESrqnslpnsiiSQSQI...LPAD....HY..VLTVPNLRPSMLYRL +KMLC_CHICK/60-145 SVDNK..........WTDLT...TCRS....T...SFNVQDLQADREYKF +LAR_DROME/322-404 KNANQ..........AFSEI...SGII....TM..YYVVRALSPYTEYEF +LAR_DROME/417-503 TNSNQpe......asWNSQM...VDNS....E...LTTVSDVTPHAIYTV +LAR_DROME/515-598 TYANQ..........AHHKR...ISNS....E...AYTLDGLYPDTLYYI +LAR_DROME/709-800 LRDEGkgf....lnePFKFD...VVDT....L...EFNVTGLQPDTKYSI +LAR_DROME/909-995 IDHGL..........GSERN...MTLR....K....AVFTNLEENTEYIF +MPSF_CHICK/371-457 VGLEN..........WVQCN...DAPV....KIc.KYPVTGLYEGRSYIF +MPSF_CHICK/499-585 VGSGS..........WQRVNaqvAVKS....P...RYAVFDLAEGKPYVF +MPSF_CHICK/600-684 VGSNQwe.......pANHKP...INYN....R....FVVHGLETGEQYIF +MPSF_CHICK/699-785 ANHKN..........WHEVNssvISRT....I....YTVEDLTEDAFYEF +MPSF_CHICK/801-887 VDTED..........WITAN...EKPT....SHr.YFKVTDLHQGHTYVF +NCA1_BOVIN/509-597 MGEEVw........hSKWYD...AKEA....SMegIVTIVGLKPETTYAV +NCA1_BOVIN/610-691 LSSEW..........KPEIR...LPSG....SD..HVMLKSLDWNAEYEV +NGCA_CHICK/700-794 LEEPGgggps.ggfpWAEST...VDAP....P....VVVGGLPPFSPFQI +NRCA_CHICK/623-709 GLHEPg........vWHYQT...EVPG....SH..TTVQLKLSPYVNYSF 
+NRCA_CHICK/726-810 DVDDE..........WTSVV...VANV....S...KYIVSGTPTFVPYEI +NRCA_CHICK/928-1014 INNTHel......gpLVEIR...IPAN....ES..SLILKNLNYSTRYKF +NRG_DROME/717-799 .RDIPaa......awENNNI...FDWR....QN..NIVIADQPTFVKYLI +NRG_DROME/815-905 ENEGEe........gLREIH...VKGD....TH..NALVTQFKPDSKNYA +NRG_DROME/917-1007 V.KESyvge..rreyDPHIT...DPRV....T...RMKMAGLKPNSKYRI +PHB_ALCFA/344-418 .NGS............KVGS...ATAT....A....YTDSGLIAGTTYSY +PTP1_DROME/123-205 LSEASss.......yNRTFQ...VNDN....TF..QHSVKELTPGATYQV +PTP1_DROME/217-301 PDANDsvl.....yvEKEGE...PPGP....A...QAAFKGLVPGRAYNI +PTP1_DROME/312-394 TRRQS..........TVPRS...NEPV....AF..SDFRDIAEPGKTFNV +PTP1_DROME/405-485 TFNGD..........TSTLT...TDRT....R....FTLESLLPGRNYSL +PTP1_DROME/583-661 DSEQQ..........WVRLP...SVRS....T...EADITDMTKGEKYTI +PTP1_DROME/864-944 A.DNL..........LAQNM...TTRN....E....ITISDLRPHRNYTF +PTP1_DROME/958-1044 INNLT..........DAGMQ...DFES....EEa.FGVIKNLKPGETYVF +PTP6_DROME/236-321 AGTPTft.......yHKDFI...NGSH....T...SYILDHFKPNTTYFL +PTP6_DROME/332-425 SGEVPkvi.....eeAIYQQ...NSRN....L...PYMFDKLKTATDYEF +PTP9_DROME/171-259 DNQTGve......ivKNSRN...SVET....LI..HFELQNLRPYTDYRV +PTPB_HUMAN/22-103 DTLGAa........lCPTFR...IDNT....TY..GCNLQDLQAGTIYNF +PTPB_HUMAN/112-192 ENNQKiq......gvQIQES...TSWN....E....YTFFNLTAGSKYNI +PTPB_HUMAN/467-543 R.DLL..........LIHKS...LSKD....AK..EFTFTDLVPGRKYMA +PTPB_HUMAN/554-632 ENVV...........IKNES...ISSE....TS..RYSFHSLKSGSLYSV +PTPB_HUMAN/643-725 DGKV...........VQSLV...IAKS....VR..ECSFSSLTPGRLYTV +PTPB_HUMAN/731-808 KNNF...........IQTKS...IPKS....EN..ECVFVQLVPGRLYSV +PTPB_HUMAN/907-984 H.SQK..........VDSQT...IPKH....VF..EHTFHRLEAGEQYQI +PTPB_HUMAN/995-1074 ENGIL..........LRNTS...EPAT....TK..QHKFEDLTPGKKYKI +PTPB_HUMAN/1085-1162 PDGNLq.........ERAQV...DPLV....Q...SFSFQNLLQGRMYKM +PTPB_HUMAN/1173-1250 PNGTKk.........ENWKD...KDLT....E....WRFQGLVPGRKYVL +PTPB_HUMAN/1261-1344 RDALTv.........FNPYN...NRKS....E...GRIVYGLRPGRSYQF +PTPB_HUMAN/1355-1434 MDTQEv.........EFSRK...LEKE....KS..LLNIMMLVPHKRYLV 
+PTPK_MOUSE/290-376 T.SGS..........WTETH...AVNA....P...TYKLWHLDPDTEYEI +PTPZ_HUMAN/312-401 LDGEDq........tKHEFL...TDGY....QDl.GAILNNLLPNMSYVL +SEK_MOUSE/441-525 KDQN...........ERSYR...IVRT....AAr.NTDIKGLNPLTSYVF +TENA_CHICK/593-671 TSSGGl.........DLQFT...VPGN....QT..SATIHELEPGVEYFI +TENA_CHICK/682-767 KDDNG..........DITSS...LKRP....ET..SYMQPGLAPGQQYNV +TENA_CHICK/774-853 KDVPGd.........RTTID...LSED....EN..QYSIGNLRPHTEYEV +TENA_CHICK/864-945 ISGGD..........HTELT...VPKGnq.aTT..RATLTGLRPGTEYGI +TENA_CHICK/956-1033 P.SGK..........KNEME...IPVD....ST..SFILRGLDAGTEYTI +TENA_CHICK/1045-1124 SDNPEe.........TWNIT...VPGG....QH..SVNVTGLKANTPYNV +TENA_CHICK/1136-1215 SNRLLe.........PMEFN...ISGN....SR..TAHISGLSPSTDFIV +TENA_CHICK/1227-1306 KRKSD..........PLELI...VPGH....ER..THDITGLKEGTEYEI +TENA_CHICK/1317-1395 ITGGT..........PNVVT...VDGS....KT..RTKLVKLVPGVDYNV +TENA_CHICK/1406-1483 EDEP...........EVTQM...VSGN....TV..EYDLNGLRPATEYTL +TENA_CHICK/1494-1571 I.DGR..........VKEVI...LDPE....TT..SYTLTELSPSTQYTV +TENA_HUMAN/1254-1334 ADQVEe.........AHNLT...VPGS....LR..SMEIPGLRAGTPYTV +TENA_HUMAN/1528-1607 SGKLLd.........PQEFT...LSGT....QR..KLELRGLITGIGYEV +TIE1_HUMAN/446-533 DSTMD..........WSTIV...VDPS....E...NVTLMNLRPKTGYSV +TIE1_HUMAN/545-632 GTRGQ..........ERREN...VSSP....QAr.TALLTGLTPGTHYQL +TIE1_HUMAN/644-729 AGGAGd.........PLWID...VDRP....EEt.STIIRGLNASTRYLF +TIE2_HUMAN/444-529 VNHYEa.........WQHIQ...VTNE....I....VTLNYLEPRTEYEL +TIE2_HUMAN/543-626 VQKSD..........QQNIK...VPGN....LT..SVLLNNLHPREQYVV +TIE2_HUMAN/639-724 QGKNE..........DQHVDv.kIKNA....TIi.QYQLKGLEPETAYQV +UFO_HUMAN/327-411 QDTPE..........VLMDI...GLRQ....EV..TLELQGDGSVSNLTV + +7LES_DROME/1799-1891 WVQ.AHATPTk....SNSS +7LES_DROVI/1917-1997 RLA.LAYAATp....GAPI +APU_THETY/928-1009 AVT.AVDNDGn...eSALS +APU_THETY/1165-1248 KVV.AVDTSYn....RTAS +AXO1_CHICK/602-692 RVL.ASNILGv....GEPS +AXO1_CHICK/807-896 SVR.AYNRAGa....GPPS +CAML_HUMAN/812-907 EVQ.AFNGRGs....GPAS +CHI1_BACCI/465-542 TIK.AKDAAGn...lSAAS +CHIT_STRLI/142-219 
SVK.ARDTADq...tGPAS +CHIX_STROI/169-240 QVA.AVNGA.......GES +CONT_CHICK/799-884 DVS.AFNSAGy....GPPS +CPSF_CHICK/630-716 RVF.AVNAIGv....SQPS +CPSF_CHICK/923-1008 RVY.SENVCGt....SQEP +ECK_HUMAN/329-420 TVE.ARNGV........SG +ECK_HUMAN/436-519 QVQ.ALTQEGq....GAGS +EPH1_HUMAN/333-435 NVE.AQNGVSglgssGHAS +EPH3_CHICK/333-429 EIQ.AVNGVTd...qSPFS +EPH3_CHICK/444-528 QVR.ARTVAGy....GRYS +ETK1_CHICK/325-421 EID.AVNGVSd...lSTLS +FAS2_SCHAM/530-616 RFA.AQNEVGf....GPWS +FAS2_SCHAM/642-735 EVR.ATNAIGn....SVPG +FINC_BOVIN/577-660 QLI.SVQHY......GQRE +FINC_BOVIN/689-768 NVY.EISEE.......GEQ +FINC_BOVIN/780-858 TIY.AVEEN.......QES +FINC_BOVIN/875-955 KVF.AVNQG.......RES +FINC_BOVIN/1142-1225 SVY.TVKDD.......KES +FINC_BOVIN/1236-1316 SVS.SVYEQ.......HES +FINC_BOVIN/1327-1406 SIV.ALNSK.......EES +FINC_BOVIN/1417-1499 TVY.AVTGRGd....SPAS +FINC_BOVIN/1511-1590 SVY.AQNQN.......GES +FINC_BOVIN/1601-1680 SVY.ALKDT.......LTS +FINC_BOVIN/1693-1771 HLY.TLNDN.......ARS +FINC_BOVIN/1782-1861 QVI.ALKNN.......QKS +FINC_CHICK/551-630 NIV.AIYDD.......MES +FINC_RAT/1266-1346 SVI.TLING.......GES +GUNB_CELFI/651-733 VVK.AKDVAGn...vSAAS +IL7R_HUMAN/129-221 KVR.SIPDHYfkgfwSEWS +ITB4_HUMAN/1127-1208 KVC.AYGAQGe....GPYS +ITB4_HUMAN/1220-1310 TVK.ARNGAGw....GPER +ITB4_HUMAN/1581-1665 RVR.AQSQEGw....GRER +ITB4_HUMAN/1694-1781 KVQ.ARTTEGf....GPER +KALM_CHICK/178-271 RVA.AVNVHGt...rGFTA +KALM_CHICK/544-642 EVQ.VLTTGGe....GPAT +KMLC_CHICK/60-145 RVR.AANVYGi....SEPS +LAR_DROME/322-404 YVI.AVNNIGr....GPPS +LAR_DROME/417-503 RVQ.AYTSMGa....GPMS +LAR_DROME/515-598 WLA.ARSQRGe....GATT +LAR_DROME/709-800 QVA.ALTRKGd....GDRS +LAR_DROME/909-995 RVR.AYTKQGa....GPFS +MPSF_CHICK/371-457 RVR.AVNSAGi....SRPS +MPSF_CHICK/499-585 RVL.SANKHGi....SDPS +MPSF_CHICK/600-684 RVK.AVNAVGf....SENS +MPSF_CHICK/699-785 KIA.AANVVGi....GHPS +MPSF_CHICK/801-887 KVR.AVNDAGv....GKSS +NCA1_BOVIN/509-597 RLA.ALNGKGl....GEIS +NCA1_BOVIN/610-691 YVV.AENQQ.......GKS +NGCA_CHICK/700-794 RVQ.AVNGAGk....GPEA +NRCA_CHICK/623-709 RVI.AVNEIGr....SQPS 
+NRCA_CHICK/726-810 KVQ.ALNDLGy...aPEPS +NRCA_CHICK/928-1014 YFN.AQTSV......GSGS +NRG_DROME/717-799 KVV.AINDR.......GES +NRG_DROME/815-905 RIL.AYNGRFn....GPPS +NRG_DROME/917-1007 SIT.ATTKMGe....GSEH +PHB_ALCFA/344-418 TVT.AVDPTAg...eSQPS +PTP1_DROME/123-205 QAY.TIYDG.......KES +PTP1_DROME/217-301 SVQ.TMSED.......EIS +PTP1_DROME/312-394 IVK.TVSGK.......VTS +PTP1_DROME/405-485 SVQ.AVSKK.......MES +PTP1_DROME/583-661 QVN.TVSFG.......VES +PTP1_DROME/864-944 TVV.VRSGTEss..vLRSS +PTP1_DROME/958-1044 KIQ.AKTAIGf....GPER +PTP6_DROME/236-321 RIV.GKNSIGn....GQPT +PTP6_DROME/332-425 RVR.ACSDLTkt..cGPWS +PTP9_DROME/171-259 IVK.AFTTKNe....GEPS +PTPB_HUMAN/22-103 KII.SLDEE........RT +PTPB_HUMAN/112-192 AIT.AVSGG.......KRS +PTPB_HUMAN/467-543 TVT.SISGD........LK +PTPB_HUMAN/554-632 VVT.TVSGG.......ISS +PTPB_HUMAN/643-725 TIT.TRSGKYe...nHSFS +PTPB_HUMAN/731-808 TVT.TKSGQ........YE +PTPB_HUMAN/907-984 MIA.SVSGS........LK +PTPB_HUMAN/995-1074 QIL.TVSGG.......LFS +PTPB_HUMAN/1085-1162 VIV.THSGE........LS +PTPB_HUMAN/1173-1250 WVV.THSGD........LS +PTPB_HUMAN/1261-1344 NVK.TVSGDSw....KTYS +PTPB_HUMAN/1355-1434 SIK.VQSAG.......MTS +PTPK_MOUSE/290-376 RVLlTRPGEGg...tGLPG +PTPZ_HUMAN/312-401 QIV.AICTNGl...yGKYS +SEK_MOUSE/441-525 HVR.ARTAAGy....GDFS +TENA_CHICK/593-671 RVF.AILKN.......KKS +TENA_CHICK/682-767 SLH.IVKNNTr...gPGLS +TENA_CHICK/774-853 TLI.SRRGD.......MES +TENA_CHICK/864-945 GVT.AVRQD.......RES +TENA_CHICK/956-1033 SLV.AEKGR.......HKS +TENA_CHICK/1045-1124 TLY.GVIRG.......YRT +TENA_CHICK/1136-1215 YLY.GISHG.......FRT +TENA_CHICK/1227-1306 ELY.GVSSG.......RRS +TENA_CHICK/1317-1395 NII.SVKGF.......EES +TENA_CHICK/1406-1483 RVH.AVKDA.......QKS +TENA_CHICK/1494-1571 KLQ.ALSRS.......MRS +TENA_HUMAN/1254-1334 TLH.GEVRG.......HST +TENA_HUMAN/1528-1607 MVS.GFTQG.......HQT +TIE1_HUMAN/446-533 RVQlSRPGEGg...eGAWG +TIE1_HUMAN/545-632 DVQ.LYHCTLl....GPAS +TIE1_HUMAN/644-729 RMR.ASI.QGl....GDWS +TIE2_HUMAN/444-529 CVQ.LVRRGEg....GEGH +TIE2_HUMAN/543-626 RAR..VNTKAq....GEWS 
+TIE2_HUMAN/639-724 DIF.AENNIGs....SNPA +UFO_HUMAN/327-411 CVA.AYTAAGd....GPWS + diff --git a/forester/archive/RIO/others/hmmer/tutorial/globins50.msf b/forester/archive/RIO/others/hmmer/tutorial/globins50.msf new file mode 100644 index 0000000..2f04100 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/tutorial/globins50.msf @@ -0,0 +1,427 @@ +!!AA_MULTIPLE_ALIGNMENT 1.0 +PileUp of: *.pep + + Symbol comparison table: GenRunData:blosum62.cmp CompCheck: 6430 + + GapWeight: 12 + GapLengthWeight: 4 + + pileup.msf MSF: 308 Type: P August 16, 1999 09:09 Check: 9858 .. + + Name: lgb1_pea Len: 308 Check: 2200 Weight: 1.00 + Name: lgb1_vicfa Len: 308 Check: 214 Weight: 1.00 + Name: myg_escgi Len: 308 Check: 3961 Weight: 1.00 + Name: myg_horse Len: 308 Check: 5619 Weight: 1.00 + Name: myg_progu Len: 308 Check: 6401 Weight: 1.00 + Name: myg_saisc Len: 308 Check: 6606 Weight: 1.00 + Name: myg_lycpi Len: 308 Check: 6090 Weight: 1.00 + Name: myg_mouse Len: 308 Check: 6613 Weight: 1.00 + Name: myg_musan Len: 308 Check: 3942 Weight: 1.00 + Name: hba_ailme Len: 308 Check: 4558 Weight: 1.00 + Name: hba_prolo Len: 308 Check: 5054 Weight: 1.00 + Name: hba_pagla Len: 308 Check: 5383 Weight: 1.00 + Name: hba_macfa Len: 308 Check: 5135 Weight: 1.00 + Name: hba_macsi Len: 308 Check: 5198 Weight: 1.00 + Name: hba_ponpy Len: 308 Check: 5050 Weight: 1.00 + Name: hba2_galcr Len: 308 Check: 5609 Weight: 1.00 + Name: hba_mesau Len: 308 Check: 4702 Weight: 1.00 + Name: hba2_bosmu Len: 308 Check: 4241 Weight: 1.00 + Name: hba_erieu Len: 308 Check: 4680 Weight: 1.00 + Name: hba_frapo Len: 308 Check: 3549 Weight: 1.00 + Name: hba_phaco Len: 308 Check: 4440 Weight: 1.00 + Name: hba_trioc Len: 308 Check: 5465 Weight: 1.00 + Name: hba_ansse Len: 308 Check: 3300 Weight: 1.00 + Name: hba_colli Len: 308 Check: 3816 Weight: 1.00 + Name: hbad_chlme Len: 308 Check: 4571 Weight: 1.00 + Name: hbad_pasmo Len: 308 Check: 6777 Weight: 1.00 + Name: hbaz_horse Len: 308 Check: 7187 Weight: 1.00 + Name: 
hba4_salir Len: 308 Check: 7329 Weight: 1.00 + Name: hbb_ornan Len: 308 Check: 2667 Weight: 1.00 + Name: hbb_tacac Len: 308 Check: 4356 Weight: 1.00 + Name: hbe_ponpy Len: 308 Check: 3827 Weight: 1.00 + Name: hbb_speci Len: 308 Check: 1556 Weight: 1.00 + Name: hbb_speto Len: 308 Check: 2051 Weight: 1.00 + Name: hbb_equhe Len: 308 Check: 3414 Weight: 1.00 + Name: hbb_sunmu Len: 308 Check: 2927 Weight: 1.00 + Name: hbb_calar Len: 308 Check: 3836 Weight: 1.00 + Name: hbb_mansp Len: 308 Check: 4322 Weight: 1.00 + Name: hbb_ursma Len: 308 Check: 4428 Weight: 1.00 + Name: hbb_rabit Len: 308 Check: 4190 Weight: 1.00 + Name: hbb_tupgl Len: 308 Check: 4185 Weight: 1.00 + Name: hbb_triin Len: 308 Check: 1163 Weight: 1.00 + Name: hbb_colli Len: 308 Check: 3958 Weight: 1.00 + Name: hbb_larri Len: 308 Check: 3517 Weight: 1.00 + Name: hbb1_varex Len: 308 Check: 6009 Weight: 1.00 + Name: hbb2_xentr Len: 308 Check: 7617 Weight: 1.00 + Name: hbbl_ranca Len: 308 Check: 5606 Weight: 1.00 + Name: hbb2_tricr Len: 308 Check: 8767 Weight: 1.00 + Name: glb2_mormr Len: 308 Check: 6103 Weight: 1.00 + Name: glbz_chith Len: 308 Check: 8634 Weight: 1.00 + Name: hbf1_ureca Len: 308 Check: 9035 Weight: 1.00 + +// + + 1 50 + lgb1_pea ~~~~~~~~~G FTDKQEALVN SSSE.FKQNL PGYSILFYTI VLEKAPAAKG +lgb1_vicfa ~~~~~~~~~G FTEKQEALVN SSSQLFKQNP SNYSVLFYTI ILQKAPTAKA + myg_escgi ~~~~~~~~~V LSDAEWQLVL NIWAKVEADV AGHGQDILIR LFKGHPETLE + myg_horse ~~~~~~~~~G LSDGEWQQVL NVWGKVEADI AGHGQEVLIR LFTGHPETLE + myg_progu ~~~~~~~~~G LSDGEWQLVL NVWGKVEGDL SGHGQEVLIR LFKGHPETLE + myg_saisc ~~~~~~~~~G LSDGEWQLVL NIWGKVEADI PSHGQEVLIS LFKGHPETLE + myg_lycpi ~~~~~~~~~G LSDGEWQIVL NIWGKVETDL AGHGQEVLIR LFKNHPETLD + myg_mouse ~~~~~~~~~G LSDGEWQLVL NVWGKVEADL AGHGQEVLIG LFKTHPETLD + myg_musan ~~~~~~~~~~ ~~~VDWEKVN SVWSAVESDL TAIGQNILLR LFEQYPESQN + hba_ailme ~~~~~~~~~V LSPADKTNVK ATWDKIGGHA GEYGGEALER TFASFPTTKT + hba_prolo ~~~~~~~~~V LSPADKANIK ATWDKIGGHA GEYGGEALER TFASFPTTKT + hba_pagla ~~~~~~~~~V LSSADKNNIK ATWDKIGSHA 
GEYGAEALER TFISFPTTKT + hba_macfa ~~~~~~~~~V LSPADKTNVK AAWGKVGGHA GEYGAEALER MFLSFPTTKT + hba_macsi ~~~~~~~~~V LSPADKTNVK DAWGKVGGHA GEYGAEALER MFLSFPTTKT + hba_ponpy ~~~~~~~~~V LSPADKTNVK TAWGKVGAHA GDYGAEALER MFLSFPTTKT +hba2_galcr ~~~~~~~~~V LSPTDKSNVK AAWEKVGAHA GDYGAEALER MFLSFPTTKT + hba_mesau ~~~~~~~~~V LSAKDKTNIS EAWGKIGGHA GEYGAEALER MFFVYPTTKT +hba2_bosmu ~~~~~~~~~V LSAADKGNVK AAWGKVGGHA AEYGAEALER MFLSFPTTKT + hba_erieu ~~~~~~~~~V LSATDKANVK TFWGKLGGHG GEYGGEALDR MFQAHPTTKT + hba_frapo ~~~~~~~~~V LSAADKNNVK GIFGKISSHA EDYGAEALER MFITYPSTKT + hba_phaco ~~~~~~~~~V LSAADKNNVK GIFTKIAGHA EEYGAEALER MFITYPSTKT + hba_trioc ~~~~~~~~~V LSANDKTNVK TVFTKITGHA EDYGAETLER MFITYPPTKT + hba_ansse ~~~~~~~~~V LSAADKGNVK TVFGKIGGHA EEYGAETLQR MFQTFPQTKT + hba_colli ~~~~~~~~~V LSANDKSNVK AVFAKIGGQA GDLGGEALER LFITYPQTKT +hbad_chlme ~~~~~~~~~M LTADDKKLLT QLWEKVAGHQ EEFGSEALQR MFLTYPQTKT +hbad_pasmo ~~~~~~~~~M LTAEDKKLIQ QIWGKLGGAE EEIGADALWR MFHSYPSTKT +hbaz_horse ~~~~~~~~~S LTKAERTMVV SIWGKISMQA DAVGTEALQR LFSSYPQTKT +hba4_salir ~~~~~~~~~S LSAKDKANVK AIWGKILPKS DEIGEQALSR MLVVYPQTKA + hbb_ornan ~~~~~~~~VH LSGGEKSAVT NLWGKV..NI NELGGEALGR LLVVYPWTQR + hbb_tacac ~~~~~~~~VH LSGSEKTAVT NLWGHV..NV NELGGEALGR LLVVYPWTQR + hbe_ponpy ~~~~~~~~VH FTAEEKAAVT SLWSKM..NV EEAGGEALGR LLVVYPWTQR + hbb_speci ~~~~~~~~VH LSDGEKNAIS TAWGKV..HA AEVGAEALGR LLVVYPWTQR + hbb_speto ~~~~~~~~VH LTDGEKNAIS TAWGKV..NA AEIGAEALGR LLVVYPWTQR + hbb_equhe ~~~~~~~~VQ LSGEEKAAVL ALWDKV..NE EEVGGEALGR LLVVYPWTQR + hbb_sunmu ~~~~~~~~VH LSGEEKACVT GLWGKV..NE DEVGAEALGR LLVVYPWTQR + hbb_calar ~~~~~~~~VH LTGEEKSAVT ALWGKV..NV DEVGGEALGR LLVVYPWTQR + hbb_mansp ~~~~~~~~VH LTPEEKTAVT TLWGKV..NV DEVGGEALGR LLVVYPWTQR + hbb_ursma ~~~~~~~~VH LTGEEKSLVT GLWGKV..NV DEVGGEALGR LLVVYPWTQR + hbb_rabit ~~~~~~~~VH LSSEEKSAVT ALWGKV..NV EEVGGEALGR LLVVYPWTQR + hbb_tupgl ~~~~~~~~VH LSGEEKAAVT GLWGKV..DL EKVGGQSLGS LLIVYPWTQR + hbb_triin ~~~~~~~~VH LTPEEKALVI GLWAKV..NV KEYGGEALGR LLVVYPWTQR + hbb_colli ~~~~~~~~VH WSAEEKQLIT 
SIWGKV..NV ADCGAEALAR LLIVYPWTQR + hbb_larri ~~~~~~~~VH WSAEEKQLIT GLWGKV..NV ADCGAEALAR LLIVYPWTQR +hbb1_varex ~~~~~~~~VH WTAEEKQLIC SLWGKI..DV GLIGGETLAG LLVIYPWTQR +hbb2_xentr ~~~~~~~~VH WTAEEKATIA SVWGKV..DI EQDGHDALSR LLVVYPWTQR +hbbl_ranca ~~~~~~~~VH WTAEEKAVIN SVWQKV..DV EQDGHEALTR LFIVYPWTQR +hbb2_tricr ~~~~~~~~VH LTAEDRKEIA AILGKV..NV DSLGGQCLAR LIVVNPWSRR +glb2_mormr PIVDSGSVSP LSDAEKNKIR AAWDIVYKNY EKNGVDILVK FFTGTPAAQA +glbz_chith ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbf1_ureca ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + + 51 100 + lgb1_pea LFSFLKD... TAGVEDSPKL QAHAEQVFGL VRDSAAQLRT KGEVVLGNAT +lgb1_vicfa MFSFLKD... SAGVVDSPKL GAHAEKVFGM VRDSAVQLRA TGEVVLDGKD + myg_escgi KFDKFKHLKT EAEMKASEDL KKHGNTVLTA LGGILKKKGH ...HEAELKP + myg_horse KFDKFKHLKT EAEMKASEDL KKHGTVVLTA LGGILKKKGH ...HEAELKP + myg_progu KFDKFKHLKA EDEMRASEEL KKHGTTVLTA LGGILKKKGQ ...HAAELAP + myg_saisc KFDKFKHLKS EDEMKASEEL KKHGTTVLTA LGGILKKKGQ ...HEAELKP + myg_lycpi KFDKFKHLKT EDEMKGSEDL KKHGNTVLTA LGGILKKKGH ...HEAELKP + myg_mouse KFDKFKNLKS EEDMKGSEDL KKHGCTVLTA LGTILKKKGQ ...HAAEIQP + myg_musan HFPKFKN.KS LGELKDTADI KAQADTVLSA LGNIVKKKGS ...HSQPVKA + hba_ailme YFPHF.DLSP .....GSAQV KAHGKKVADA LTTAVGHLDD ...LPGALSA + hba_prolo YFPHF.DLSP .....GSAQV KAHGKKVADA LTLAVGHLDD ...LPGALSA + hba_pagla YFPHF.DLSH .....GSAQV KAHGKKVADA LTLAVGHLED ...LPNALSA + hba_macfa YFPHF.DLSH .....GSAQV KGHGKKVADA LTLAVGHVDD ...MPQALSA + hba_macsi YFPHF.DLSH .....GSAQV KGHGKKVADA LTLAVGHVDD ...MPQALSA + hba_ponpy YFPHF.DLSH .....GSAQV KDHGKKVADA LTNAVAHVDD ...MPNALSA +hba2_galcr YFPHF.DLSH .....GSTQV KGHGKKVADA LTNAVLHVDD ...MPSALSA + hba_mesau YFPHF.DVSH .....GSAQV KGHGKKVADA LTNAVGHLDD ...LPGALSA +hba2_bosmu YFPHF.DLSH .....GSAQV KGHGAKVAAA LTKAVGHLDD ...LPGALSE + hba_erieu YFPHF.DLNP .....GSAQV KGHGKKVADA LTTAVNNLDD ...VPGALSA + hba_frapo YFPHF.DLSH .....GSAQV KGHGKKVVAA LIEAANHIDD ...IAGTLSK + hba_phaco YFPHF.DLSH .....GSAQI KGHGKKVVAA LIEAVNHIDD ...ITGTLSK + hba_trioc 
YFPHF.DLHH .....GSAQI KAHGKKVVGA LIEAVNHIDD ...IAGALSK + hba_ansse YFPHF.DLQP .....GSAQI KAHGKKVAAA LVEAANHIDD ...IAGALSK + hba_colli YFPHF.DLSH .....GSAQI KGHGKKVAEA LVEAANHIDD ...IAGALSK +hbad_chlme YFPHF.DLHP .....GSEQV RGHGKKVAAA LGNAVKSLDN ...LSQALSE +hbad_pasmo YFPHF.DLSQ .....GSDQI RGHGKKVVAA LSNAIKNLDN ...LSQALSE +hbaz_horse YFPHF.DLHE .....GSPQL RAHGSKVAAA VGDAVKSIDN ...VAGALAK +hba4_salir YFSHWASVAP .....GSAPV KKHGITIMNQ IDDCVGHMDD ...LFGFLTK + hbb_ornan FFEAFGDLSS AGAVMGNPKV KAHGAKVLTS FGDALKNLDD ...LKGTFAK + hbb_tacac FFESFGDLSS ADAVMGNAKV KAHGAKVLTS FGDALKNLDN ...LKGTFAK + hbe_ponpy FFDSFGNLSS PSAILGNPKV KAHGKKVLTS FGDAIKNMDN ...LKTTFAK + hbb_speci FFDSFGDLSS ASAVMGNAKV KAHGKKVIDS FSNGLKHLDN ...LKGTFAS + hbb_speto FFDSFGDLSS ASAVMGNAKV KAHGKKVIDS FSNGLKHLDN ...LKGTFAS + hbb_equhe FFDSFGDLSN PAAVMGNPKV KAHGKKVLHS FGEGVHHLDN ...LKGTFAQ + hbb_sunmu FFDSFGDLSS ASAVMGNPKV KAHGKKVLHS LGEGVANLDN ...LKGTFAK + hbb_calar FFESFGDLST PDAVMNNPKV KAHGKKVLGA FSDGLTHLDN ...LKGTFAH + hbb_mansp FFDSFGDLSS PDAVMGNPKV KAHGKKVLGA FSDGLNHLDN ...LKGTFAQ + hbb_ursma FFDSFGDLSS ADAIMNNPKV KAHGKKVLNS FSDGLKNLDN ...LKGTFAK + hbb_rabit FFESFGDLSS ANAVMNNPKV KAHGKKVLAA FSEGLSHLDN ...LKGTFAK + hbb_tupgl FFDSFGDLSS PSAVMSNPKV KAHGKKVLTS FSDGLNHLDN ...LKGTFAK + hbb_triin FFEHFGDLSS ASAIMNNPKV KAHGEKVFTS FGDGLKHLED ...LKGAFAE + hbb_colli FFSSFGNLSS ATAISGNPNV KAHGKKVLTS FGDAVKNLDN ...IKGTFAQ + hbb_larri FFASFGNLSS PTAINGNPMV RAHGKKVLTS FGEAVKNLDN ...IKNTFAQ +hbb1_varex QFSHFGNLSS PTAIAGNPRV KAHGKKVLTS FGDAIKNLDN ...IKDTFAK +hbb2_xentr YFSSFGNLSN VSAVSGNVKV KAHGNKVLSA VGSAIQHLDD ...VKSHLKG +hbbl_ranca YFSTFGDLSS PAAIAGNPKV HAHGKKILGA IDNAIHNLDD ...VKGTLHD +hbb2_tricr YFHDFGDLSS CDAICRNPKV LAHGAKVMRS IVEATKHLDN ...LREYYAD +glb2_mormr FFPKFKGLTT ADALKKSSDV RWHAERIINA VNDAVKSMDD TEKMSMKLQE +glbz_chith ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbf1_ureca ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + + 101 150 + lgb1_pea LGAIHVQKGV TNP.HFVVVK EALLQTIKKA SGNNWSEELN 
TAWEVAYDGL +lgb1_vicfa .GSIHIQKGV LDP.HFVVVK EALLKTIKEA SGDKWSEELS AAWEVAYDGL + myg_escgi LAQSHATKHK IPIKYLEFIS DAIIHVLHSR HPGDFGADAQ AAMNKALELF + myg_horse LAQSHATKHK IPIKYLEFIS DAIIHVLHSK HPGNFGADAQ GAMTKALELF + myg_progu LAQSHATKHK IPVKYLEFIS EAIIQVLQSK HPGDFGADAQ GAMSKALELF + myg_saisc LAQSHATKHK IPVKYLELIS DAIVHVLQKK HPGDFGADAQ GAMKKALELF + myg_lycpi LAQSHATKHK IPVKYLEFIS DAIIQVLQNK HSGDFHADTE AAMKKALELF + myg_mouse LAQSHATKHK IPVKYLEFIS EIIIEVLKKR HSGDFGADAQ GAMSKALELF + myg_musan LAATHITTHK IPPHYFTKIT TIAVDVLSEM YPSEMNAQVQ AAFSGAFKII + hba_ailme LSDLHAHKLR VDPVNFKLLS HCLLVTLASH HPAEFTPAVH ASLDKFFSAV + hba_prolo LSDLHAYKLR VDPVNFKLLS HCLLVTLACH HPAEFTPAVH ASLDKFFTSV + hba_pagla LSDLHAYKLR VDPVNFKLLS HCLLVTLACH HPAEFTPAVH SALDKFFSAV + hba_macfa LSDLHAHKLR VDPVNFKLLS HCLLVTLAAH LPAEFTPAVH ASLDKFLASV + hba_macsi LSDLHAHKLR VDPVNFKLLS HCLLVTLAAH LPAEFTPAVH ASLDKFLASV + hba_ponpy LSDLHAHKLR VDPVNFKLLS HCLLVTLAAH LPAEFTPAVH ASLDKFLASV +hba2_galcr LSDLHAHKLR VDPVNFKLLR HCLLVTLACH HPAEFTPAVH ASLDKFMASV + hba_mesau LSDLHAHKLR VDPVNFKLLS HCLLVTLANH HPADFTPAVH ASLDKFFASV +hba2_bosmu LSDLHAHKLR VDPVNFKLLS HSLLVTLASH LPSDFTPAVH ASLDKFLANV + hba_erieu LSDLHAHKLR VDPVNFKLLS HCLLVTLALH HPADFTPAVH ASLDKFLATV + hba_frapo LSDLHAHKLR VDPVNFKLLG QCFLVVVAIH HPSALTPEVH ASLDKFLCAV + hba_phaco LSDLHAHKLR VDPVNFKLLG QCFLVVVAIH HPSALTPEVH ASLDKFLCAV + hba_trioc LSDLHAQKLR VDPVNFKLLG QCFLVVVAIH HPSVLTPEVH ASLDKFLCAV + hba_ansse LSDLHAQKLR VDPVNFKFLG HCFLVVLAIH HPSLLTPEVH ASMDKFLCAV + hba_colli LSDLHAQKLR VDPVNFKLLG HCFLVVVAVH FPSLLTPEVH ASLDKFVLAV +hbad_chlme LSNLHAYNLR VDPANFKLLA QCFQVVLATH LGKDYSPEMH AAFDKFLSAV +hbad_pasmo LSNLHAYNLR VDPVNFKFLS QCLQVSLATR LGKEYSPEVH SAVDKFMSAV +hbaz_horse LSELHAYILR VDPVNFKFLS HCLLVTLASR LPADFTADAH AAWDKFLSIV +hba4_salir LSELHATKLR VDPTNFKILA HNLIVVIAAY FPAEFTPEIH LSVDKFLQQL + hbb_ornan LSELHCDKLH VDPENFNRLG NVLIVVLARH FSKDFSPEVQ AAWQKLVSGV + hbb_tacac LSELHCDKLH VDPENFNRLG NVLVVVLARH FSKEFTPEAQ AAWQKLVSGV + hbe_ponpy LSELHCDKLH VDPENFKLLG NVMVIILATH 
FGKEFTPEVQ AAWQKLVSAV + hbb_speci LSELHCDKLH VDPENFKLLG NMIVIVMAHH LGKDFTPEAQ AAFQKVVAGV + hbb_speto LSELHCDKLH VDPENFKLLG NMIVIVMAHH LGKDFTPEAQ AAFQKVVAGV + hbb_equhe LSELHCDKLH VDPENFRLLG NVLVVVLARH FGKDFTPELQ ASYQKVVAGV + hbb_sunmu LSELHCDKLH VDPENFRLLG NVLVVVLASK FGKEFTPPVQ AAFQKVVAGV + hbb_calar LSELHCDKLH VDPENFRLLG NVLVCVLAHH FGKEFTPVVQ AAYQKVVAGV + hbb_mansp LSELHCDKLH VDPENFKLLG NVLVCVLAHH FGKEFTPQVQ AAYQKVVAGV + hbb_ursma LSELHCDKLH VDPENFKLLG NVLVCVLAHH FGKEFTPQVQ AAYQKVVAGV + hbb_rabit LSELHCDKLH VDPENFRLLG NVLVIVLSHH FGKEFTPQVQ AAYQKVVAGV + hbb_tupgl LSELHCDKLH VDPENFRLLG NVLVRVLACN FGPEFTPQVQ AAFQKVVAGV + hbb_triin LSELHCDKLH VDPENFRLLG NVLVCVLARH FGKEFSPEAQ AAYQKVVAGV + hbb_colli LSELHCDKLH VDPENFRLLG DILVIILAAH FGKDFTPECQ AAWQKLVRVV + hbb_larri LSELHCDKLH VDPENFRLLG DILIIVLAAH FAKDFTPDSQ AAWQKLVRVV +hbb1_varex LSELHCDKLH VDPTNFKLLG NVLVIVLADH HGKEFTPAHH AAYQKLVNVV +hbb2_xentr LSKSHAEDLH VDPENFKRLA DVLVIVLAAK LGSAFTPQVQ AVWEKLNATL +hbbl_ranca LSEEHANELH VDPENFRRLG EVLIVVLGAK LGKAFSPQVQ HVWEKFIAVL +hbb2_tricr LSVTHSLKFY VDPENFKLFS GIVIVCLALT LQTDFSCHKQ LAFEKLMKGV +glb2_mormr LSVKHAQSFY VDRQYFKVLA GII....... 
..ADTTAPGD AGFEKLMSMI +glbz_chith ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbf1_ureca ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + + 151 200 + lgb1_pea ATAIKKAMKT A~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +lgb1_vicfa ATAIKAA~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_escgi RKDIAAKYKE LGFQG~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_horse RNDIAAKYKE LGFQG~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_progu RNDIAAKYKE LGFQG~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_saisc RNDMAAKYKE LGFQG~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_lycpi RNDIAAKYKE LGFQG~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_mouse RNDIAAKYKE LGFQG~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_musan CSDIEKEYKA ANFQG~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_ailme STVLTSKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_prolo STVLTSKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_pagla STVLTSKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_macfa STVLTSKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_macsi STVLTSKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_ponpy STVLTSKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hba2_galcr STVLTSKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_mesau STVLTSKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hba2_bosmu STVLTSKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_erieu ATVLTSKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_frapo GNVLTAKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_phaco GTVLTAKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_trioc GNVLSAKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_ansse ATVLTAKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_colli GTVLTAKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbad_chlme AAVLAEKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbad_pasmo ASVLAEKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbaz_horse SSVLTEKYR~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hba4_salir ALALAEKYR~ 
~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_ornan AHALGHKYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_tacac SHALAHKYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbe_ponpy AIALAHKYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_speci ANALAHKYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_speto ANALSHKYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_equhe ANALAHKYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_sunmu ANALAHKYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_calar ANALAHKYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_mansp ANALAHKYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_ursma ANALAHKYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_rabit ANALAHKYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_tupgl ANALAHKYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_triin ANALAHKYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_colli AHALARKYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_larri AHALARKYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbb1_varex SHSLARRYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbb2_xentr VAALSHGYF~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbbl_ranca VDALSHSYH~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbb2_tricr SHALGHGY~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +glb2_mormr CILLSSAY~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +glbz_chith ~~~~~MKFII LALCVAAASA LSGDQIGLVQ STYGKVKGDS VGILYAVFKA +hbf1_ureca ~~~~~~~~~~ ~~~~GLTTAQ IKAIQDHWFL NIKGCLQAAA DSIFFKYLTA + + 201 250 + lgb1_pea ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +lgb1_vicfa ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_escgi ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_horse ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_progu ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_saisc ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_lycpi ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + 
myg_mouse ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_musan ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_ailme ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_prolo ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_pagla ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_macfa ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_macsi ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_ponpy ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hba2_galcr ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_mesau ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hba2_bosmu ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_erieu ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_frapo ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_phaco ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_trioc ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_ansse ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_colli ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbad_chlme ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbad_pasmo ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbaz_horse ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hba4_salir ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_ornan ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_tacac ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbe_ponpy ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_speci ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_speto ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_equhe ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_sunmu ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_calar ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_mansp ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ 
~~~~~~~~~~ + hbb_ursma ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_rabit ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_tupgl ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_triin ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_colli ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_larri ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbb1_varex ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbb2_xentr ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbbl_ranca ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbb2_tricr ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +glb2_mormr ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +glbz_chith DPTIQAAFPQ FVGKDLDAIK GGAEFSTHAG RIVGFLGGVI DDL.PNIGKH +hbf1_ureca YPGDLAFFHK FSSVPLYGLR SNPAYKAQTL TVINYLDKVV DALGGNAGAL + + 251 300 + lgb1_pea ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +lgb1_vicfa ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_escgi ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_horse ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_progu ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_saisc ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_lycpi ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_mouse ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + myg_musan ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_ailme ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_prolo ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_pagla ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_macfa ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_macsi ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_ponpy ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hba2_galcr ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_mesau ~~~~~~~~~~ ~~~~~~~~~~ 
~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hba2_bosmu ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_erieu ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_frapo ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_phaco ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_trioc ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_ansse ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hba_colli ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbad_chlme ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbad_pasmo ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbaz_horse ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hba4_salir ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_ornan ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_tacac ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbe_ponpy ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_speci ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_speto ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_equhe ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_sunmu ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_calar ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_mansp ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_ursma ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_rabit ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_tupgl ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_triin ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_colli ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ + hbb_larri ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbb1_varex ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbb2_xentr ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbbl_ranca ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +hbb2_tricr ~~~~~~~~~~ 
~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +glb2_mormr ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ ~~~~~~~~~~ +glbz_chith VDALVATHKP RGVTHAQFNN FRAAFIAYLK GHVDYTAAVE AAWGATFDAF +hbf1_ureca MKAKVPSHDA MGITPKHFGQ LLKLVGGVFQ EEFSADPTTV AAWGDAAGVL + + 301 + lgb1_pea ~~~~~~~~ +lgb1_vicfa ~~~~~~~~ + myg_escgi ~~~~~~~~ + myg_horse ~~~~~~~~ + myg_progu ~~~~~~~~ + myg_saisc ~~~~~~~~ + myg_lycpi ~~~~~~~~ + myg_mouse ~~~~~~~~ + myg_musan ~~~~~~~~ + hba_ailme ~~~~~~~~ + hba_prolo ~~~~~~~~ + hba_pagla ~~~~~~~~ + hba_macfa ~~~~~~~~ + hba_macsi ~~~~~~~~ + hba_ponpy ~~~~~~~~ +hba2_galcr ~~~~~~~~ + hba_mesau ~~~~~~~~ +hba2_bosmu ~~~~~~~~ + hba_erieu ~~~~~~~~ + hba_frapo ~~~~~~~~ + hba_phaco ~~~~~~~~ + hba_trioc ~~~~~~~~ + hba_ansse ~~~~~~~~ + hba_colli ~~~~~~~~ +hbad_chlme ~~~~~~~~ +hbad_pasmo ~~~~~~~~ +hbaz_horse ~~~~~~~~ +hba4_salir ~~~~~~~~ + hbb_ornan ~~~~~~~~ + hbb_tacac ~~~~~~~~ + hbe_ponpy ~~~~~~~~ + hbb_speci ~~~~~~~~ + hbb_speto ~~~~~~~~ + hbb_equhe ~~~~~~~~ + hbb_sunmu ~~~~~~~~ + hbb_calar ~~~~~~~~ + hbb_mansp ~~~~~~~~ + hbb_ursma ~~~~~~~~ + hbb_rabit ~~~~~~~~ + hbb_tupgl ~~~~~~~~ + hbb_triin ~~~~~~~~ + hbb_colli ~~~~~~~~ + hbb_larri ~~~~~~~~ +hbb1_varex ~~~~~~~~ +hbb2_xentr ~~~~~~~~ +hbbl_ranca ~~~~~~~~ +hbb2_tricr ~~~~~~~~ +glb2_mormr ~~~~~~~~ +glbz_chith FGAVFAKM +hbf1_ureca VAAMK~~~ + diff --git a/forester/archive/RIO/others/hmmer/tutorial/globins630.fa b/forester/archive/RIO/others/hmmer/tutorial/globins630.fa new file mode 100644 index 0000000..b936a34 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/tutorial/globins630.fa @@ -0,0 +1,2520 @@ +> BAHG_VITSP +MLDQQTINIIKATVPVLKEHGVTITTTFYKNLFAKHPEVRPLFDMGRQESLEQPKALAM +TVLAAAQNIENLPAILPAVKKIAVKHCQAGVAAAHYPIVGQELLGAIKEVLGDAATDDIL +DAWGKAYGVIADVfiqveadLYAQAVE +> GLB1_ANABR +PSVQGAAAQLTADVKKDLRDSWKVIGSDKKGNGVALMTTLFADNQETIGYFKRLGNVSQ +GMANDKLRGHSITLMYALQNFIDQLDNTDDLVCVVEKFAVNHITRKISAAEFGKINGPIK +KVLASKNFGDKYANAWAKLVAVVQAAL +> GLB1_ARTSX +ERVDPITGLSGLEKNAILDTWGKVRGNLQEVGKATFGKLFAAHPEYQQMFRFFQGVQLA 
+FLVQSPKFAAHTQRVVSALDQTLLALNRPSDQFVYMIKELGLDHINRGTDRSFVEYLKES +LGDSVDEFTVQSFGEVIVNFLNEGLRQA +> GLB1_CALSO +VSANDIKNVQDTWGKLYDQWDAVHAsKFYNKLFKDSEDISEAFVKAGTGSGIAMKRQAL +VFGAILQEFVANLNDPTALTLKIKGLCATHKTRGITNMELFAFALADLVAYMGTtISFTA +AQKASWTAVNDVILHQMSSYFATVA +> GLB1_CHITH +GPSGDQIAAAKASWNTVKNNQVDILYAVFKANPDIQTAFSQFAGKDLDSIKGTPDFSKH +AGRVVGLFSEVMDLLGNDANTPTILAKAKDFGKSHKSRASPAQLDNFRKSLVVYLKGATK +WDSAVESSWAPVLDFVFSTLKNEL +> GLB1_GLYDI +GLSAAQRQVIAATWKDIAGADNGAGVGKDCLIKFLSAHPQMAAVFGFSGASDPGVAALG +AKVLAQIGVAVSHLGDEGKMVAQMKAVGVRHKGYGNKHIKAQYFEPLGASLLSAMEHRIG +GKMNAAAKDAWAAAYADISGALISGLQS +> GLB1_LUMTE +ECLVTEGLKVKLQWASAFGHAHQRVAFGLELwkgILREHPEIKAPFSRVRGDNIYSPQF +GAHSQRVLSGLDITISMLDTPDmLAAQLAHLKVQHVERNLKPEFFDIFLKHLLHVLGDRL +GTHFDFGAWHDCVDQIIDGIKDI +> GLB1_MORMR +PIVDSGSVSPLSDAEKNKIRAAWDLVYKDYEKTGVDILVKFFTGTPAAQAFFPKFKGLT +TADDLKQSSDVRWHAERIINAVNDAVKSMDDTEKMSMKLKELSIKHAQSFYVDRQYFKVL +AGIIADTTAPGDAGFEKLMSMICILLSSAY +> GLB1_PARCH +GGTLAIQSHGDLTLAQKKIVRKTWHQLMRNKTSFVTDLFIRIFAYDPAAQNKFPQMAGM +SASQLRSSRQMQAHAIRVSSIMSEYIEELDSDILPELLATLARTHDLNKVGPAHYDLFAK +VLMEALQAELGSDFNQKTRDSWAKAFSIVQAVLLVKHG +> GLB1_PETMA +PIVDSGSVPALTAAEKATIRTAWAPVYAKYQSTGVDILIKFFTSNPAAQAFFPKFQGLT +SADQLKKSMDVRWHAERIINAVNDAVVAMDDTEKMSLKLRELSGKHAKSFQVDPQYFKVL +AAVIVDTVLPGDAGLEKLMSMICILLRSSY +> GLB1_PHESE +DCNTLKRFKVKHQWQQVFSGEhHRTEFSLHFWKEFLHDHPDLVSLFKRVQGENIYSPEF +QAHGIRVLAGLDSVIGVLDEDDTFTVQLAHLKAQHTERGTKPEYFDLFGTQLFDILGDKL +GTHFDQAAWRDCYAVIAAGIKP +> GLB1_SCAIN +PSVYDAAAQLTADVKKDLRDSWKVIGSDKKGNGVALMTTLFADNQETIGYFKRLGNVSQ +GMANDKLRGHSITLMYALQNFIDQLDNPDDLVCVVEKFAVNHITRKISAAEFGKINGPIK +KVLASKNFGDKYANAWAKLVAVVQAAL +> GLB1_TYLHE +TDCGILQRIKVKQQWAQVYSVGESRTDFAIDVFNNFFRTNPDRSLFNRVNGDNVYSPEF +KAHMVRVFAGFDILISVLDDKPVLDQALAHYAAFHKQFGTIPFKAFGQTMFQTIAEHIHG +ADIGAWRACYAEqIVTGITA +> GLB2_ANATR +PSVQDAAAQLTADVKKDLRDSWKVLGSDKKGDGMALMTTLFNDHQETIAYFKRMGDVSQ +GMANSKLRGHSITLMYALQNFIDQLDSTDDLICVVEKFAVNHITRKISGAEFGKINGPMK +KVLASKNFGDKYANAWAKLVGVVQAAL +> GLB2_CALSO +VSQADIAAVQTSWRRCYCSWDNEDGLKFYQTLFDSNSKIRHAFESAGATNDTEMEKQAN 
+LFGLMMTQFIDNLDDTTALNYKISGLMATHKTRNVVDPALFAIALNELVKFIGNQQPAWK +NVTAVILSQMKIALSSN +> GLB2_CHITH +APLSADEASLVRGSWAQVKHSEVDILYYIFKANPDIMAKFPQFAGKDLETLKGTGQFAT +HAGRIVGFVSEIVALMGNSANMPAMETLIKDMAANHKARGIPKAQFNEFRASLVSYLQSK +VSWNDSLGAAWTQGLDNVFNMMFSYL +> GLB2_LUMTE +KKQCGVLEGLKVKSEWGRAYGSGhDREAFSQAIWRATFAQVPESRSLFKRVHGDDTSHP +AFIAHAERVLGGLDIAISTLDQPATLKEELDHLQVQHEGRKIPDNYFDAFKTAILHVVAA +QLGRCYDREAWDACIDHIEDGIKGHH +> GLB2_MORMR +PIVDSGSVSPLSDAEKNKIRAAWDIVYKNYEKNGVDILVKFFTGTPAAQAFFPKFKGLT +TADALKKSSDVRWHAERIINAVNDAVKSMDDTEKMSMKLQELSVKHAQSFYVDRQYFKVL +AGIIADTTAPGDAGFEKLMSMICILLSSAY +> GLB2_TYLHE +SSDHCGPLQRLKVKQQWAKAYGVGHERVELgialwksMFAQDNDARDLFKRVHGEDVHS +PAFEAHMARVFNGLDRVISSLTDEPVLNAQLEHLRQQHIKLGITGHMFNLMRTGLAYVLP +AQLGRCFDKEAWAACWDEVIYPGIKHD +> GLB3_CHITH +MKFLILALCFAAASALSADQISTVQASFDKVKGDPVGILYAVFKADPSIMAKFTQFAGK +DLESIKGTAPFEIHANRIVGFFSKIIGELPNIEADVNTFVASHKPRGVTHDQLNNFRAGF +VSYMKAHTDFAGAEAAWGATLDTFFGMIFSKM +> GLB3_CHITP +LSADQISTVQASFDKVKGDPVGILYAVFKADPSIMAKFTQFAGKDLESIKGTAPFETHA +NRIVGFFSKIIGELPNIEADVNTFVASHKPRGVTHDQLNNFRAGFVSYMKAHTDFAGAEA +AWGATLDTFFGMIFSKM +> GLB3_LAMSP +YECGPLQRLKVKRQWAEAYGSGnDREEFGHFIWTHVFKDAPSARDLFKRVRGDNIHTPA +FRAHATRVLGGLDMCIALLDDEGVLNTQLAHLASQHSSRGVSAAQYDVVEHSVMMGVEHE +IGqNVFDKDAWQACLDVITGGIQGN +> GLB3_MORMR +PIVDSGSVSPLTAADKTKILAAWDLVYKNYEKNSVDILVKFFTGTPAAQAFFPKFKGLT +TADDLKKSSDVRWHAERIINAVNDAVKSMDDTEKMSMKLKELSNKHVKNFNVDRKYFKVL +AGVIADTVAPGDASFEKLMSIICILLNSAY +> GLB3_MYXGL +PITDHGQPPTLSEGDKKAIRESWPQIYKNFEQNSLAVLLEFLKKFPKAQDSFPKFSAKK +SHLEQDPAVKLQAEVIINAVNHTIGLMDKEAAMKKYLKDLSTKHSTEFQVNPDMFKELSA +VFVSTMGGKAAYEKLFSIIATLLRSTYDA +> GLB3_PETMA +PIVDSGSVAPLSAAEKTKIRSAWAPVYSNYETTGVDILVKFFTSTPAAQEFFPKFKGLT +TADQLKKSADVRWHAERIINAVNDAVVSMDDTEKMSMKLGDLSGKHAKSFQVDPQYFKVL +AAVIADTVAAGDAGFEKLMSMICILLRSAY +> GLB3_TYLHE +DDCCSAADRHEVLDNWKGIWSAEftgRRVAIGQAIFQELFALDPNAKGVFGRVNVDKPS +EADWKAHVIRVINGLDLAVNLLEDPKALQEELKHLARQHRERSGVKAVYFDEMEKALLKV +LPQVSSHFNSGAWDRCFTRIADVIKAELP +> GLB4_CHITH +MKLLILALCFAAASALTADQISTVQSSFAGVKGDAVGILYAVFKADPSIQAKFTQFAGK 
+DLDSIKGSADFSAHANKIVGFFSKIIGDLPNIDGDVTTFVASHTPRGVTHDQLNNFRAGF +VSYMKAHTDFAGAEAAWGATLDAFFGMVFAKM +> GLB4_GLYDI +GLSAAQRQVVASTWKDIAGSDNGAGVGKECFTKFLSAHHDIAAVFGFSGASDPGVADLG +AKVLAQIGVAVSHLGDEGKMVAEMKAVGVRHKGYGYKHIKAEYFEPLGASLLSAMEHRIG +GKMTAAAKDAWAAAYADISGALISGLQS +> GLB4_LUMTE +ADDEDCCSYEDRREIRHIWDDVWSSSftdRRVAIVRAVFDDLFKHYPTSKALFERVKID +EPESGEFKSHLVRVANGLDLLINLLDDTLVLQSHLGHLADQHIQRKGVTKEYFRGIGEAF +ARVLPQVLSCFNVDAWNRCFHRLVARIAKDLP +> GLB4_TYLHE +DTCCSIEDRREVQALWRSIWSAEDTGRRTLigrllfEELFEIDGATKGLFKRVNVDDTH +SPEEFAHVLRVVNGLDTLIGVLGDSDTLNSLIDHLAEQHKARAGFKTVYFKEFGKALNHV +LPEVASCFNPEAWNHCFDGLVDVISHRIDG +> GLB5_PETMA +PIVDTGSVAPLSAAEKTKIRSAWAPVYSTYETSGVDILVKFFTSTPAAQEFFPKFKGLT +TADQLKKSADVRWHAERIINAVNDAVASMDDTEKMSMKLRDLSGKHAKSFQVDPQYFKVL +AAVIADTVAAGDAGFEKLMSMICILLRSAY +> GLB6_CHITH +AVLTTEQADLVKKTWSTVKFNEVDILYAVFKAYPDIMAKFPQFAGKDLDSIKDSAAFAT +HATRIVSFLSEVISLAGSDANIPAIQNLAKELATSHKPRGVSKDQFTEFRTALFTYLKAH +INFDGPTETAWTLALDTTYAMLFSAMDS +> GLB7_ARTSX +ALTALEKQSIQDIWTILKAVGLEFLqvkmfGKLFADHPEYKAHFDNFLTAIFSVAedlv +pKLRAHLHRVIDAFDLVIFALGRESLRGSLKDLGIFHTGRDIVDPVEsltgFKLMVAVIE +EGLDTFRAVPEYSKGLEGrFGNVDNINENAPFR +> GLB7_CHITH +APLSADQASLVKSTWAQVRNSEVEILAAVFTAYPDIQARFPQFAGKDVASIKDTGAFAT +HAGRIVGFVSEIIALIGNESNAPAVQTLVGQLAASHKARGISQAQFNEFRAGLVSYVSSN +VAWNAAAESAWTAGLDNIFGLLFAAL +> GLB8_CHITH +AVTPMSADQLALFKSSWNTVKHNEVDILYAVFKANPDIQAKFPQFAGKDLDSIKDSADF +AVHSGRIVGFFSEVIGLIGNPENRPALKTLIDGLASSHKARGIEKAQFEEFRASLVDYLS +HHLDWNDTMKSTWDLALNNmFFYILHALEVAQ +> GLB9_CHITH +DPVSSDEANAIRASWAGVKHNEVDILAAVFSDHPDIQARFPQFAGKDLASIKDTGAFAT +HAGRIVGFISEIVALVGNESNAPAMATLINELSTSHHNRGITKGQFNEFRSSLVSYLSSH +ASWNDATADAWTHGLDNIFGMIFAHL +> GLBA_ANATR +VADAVAKVCGSEAIKGNLRRSWGVLMSADIEATGLTYLANLFTLRPDTKTYFTRLGDVQ +KGKANSKLRGHAITLTYALDWFVDSLDDPSRLKCVVEKFAVNHINRKISGDAFGSIIPEM +KETLKARMGSYSDDVGAAWVQAILGMQNAVLSAL +> GLBA_SCAIN +VADAVAKVCGSEAIKANLRRSWGVLSADIEATGLMLMSNLFTLRPDTKTYFTRLGDVQK +GKANSKLRGHAITLTYALNNFVDSLDDPSRLKCVVEKFAVNHINRKISGDAFGAIVEPMK +ETLKARMGNYYSDDVAGAWAALVGVVQAAL +> GLBB_ANATR 
+STVAELANAVVSNADQKDLLRLSWGVLSVDMEGTGLMLMANLFKTSSAARTKFARLGDV +SAGKDNSKLRGHSITLMYALQNFIDALDNVDRLKCVVEKFAVNHINRQISADEFGEIVGP +LRQTLKARMGSYFDEDTVSAWAALVAVVQASL +> GLBB_SCAIN +SKVAELANAVVSNADQKDLLRMSWGVLSVDMEGTGLMLMANLFKTSPSAKGKFARLGDV +SAGKDNSKLRGHSITLMYALQNFVDALDDVERLKCVVEKFAVNHINRQISADEFGEIVGP +LRQTLKARMGNYFDEDTVSAWASLVAVVQASL +> GLBC_CAUAR +GTLAIQAQGDLTLAQKKIVRKTWHQLMRNKTSFVTDVFIRIFAYDPSAQNKFPQMAGMS +ASQLRSSRQMQAHAIRVSSIMSEYVEELDSDILPELLATLARTHDLNKVGADHYNLFAKV +LMEALQAELGSAFNEKTRDAWAKAFSVVQAVLLVKHGN +> GLBC_CHITH +MKFFAVLALCIVGaiaSPLTADEASLVQSSWKAVSHNEVDILAAVFAAYPDIQAKFPQF +AGKDLASIKDTGAFATHATRIVSFLSEVIALSGNESNASAVNSLVSKLGDDHKARGVSAA +QFGEFRTALVAYLSNHVSWGDNVAAAWNKALDNTYAIVVPRL +> GLBD_CAUAR +GQATSFQSVGDLTPAEKDLIRSTWDQLMTHRTGFVADVFIRIFHNDPWAQRKFPQMAGL +SPAELRTSRQMHAHAIRVSALMTTYIDEMDTEVLPELLATLTRTHDKNHVGKKNYDLFGK +VLMEAIKAELGVGFTKQVHDAWAKTFAIVQGVLITKHAS +> GLBD_CHITH +MKFFAVLALCIVGaiaSPLTADEASLVQSSWKAVSHNEVDILAAVFAAYPDIQAKFPQF +AGKDLASIKDTGAFATHATRIVSFLSEVIALSGNASNAAAVEGLLNKLGSDHKARGVSAA +QFGEFRTALVSYLSNHVSWGDNVAAAWNKALDNTMAVAVAHL +> GLBE_CHITH +MKFFAVLALCIVGaiaSPLTADEASLVQSSWKAVSHNEVEILAAVFAAYPDIQNKFSQF +AGKDLASIKDTGAFATHATRIVSFLSEVIALSGNTSNAAAVNSLVSKLGDDHKARGVSAA +QFGEFRTALVAYLQANVSWGDNVAAAWNKALDNTFAIVVPRL +> GLBF_CHITH +MKFFAVLALCIVGaiaSPLTADEASLVQSSWKAVSHNEVEILAAVFAAYPDIQNKFSQF +AGKDLASIKDTGAFATHATRIVSFLSEVIALSGNDSNAAAVNSLVSKLGDDHKARGVSAA +QFGEFRTALVAYLQANVSWGDNVAAAWNKALDNTFAIVVPRL +> GLBH_CHITH +MKFFAVLALCVVGaiaSPLSADEAAIVKSSWDQVKHNEVDILAAVFAAYPDIQAKFPQF +AGKDLASIKDTAAFATHATRIVSFFTEVISLSGNQANLSAVYALVSKLGVDHKARGISAA +QFGEFRTALVSYLQAHVSWGDNVAAAWNHALDNTYAVALKSLE +> GLBI_CHITP +MKFFAVLALCIVGaiaSPLTADEASLVQSSWKAVSHNEVEILAAVFAAYPDIQNKFPQF +AGKDLASIKDTGAFATHATRIVSFLSEVIALSGNESNASAVNSLVSKLGDDHKARGVSAA +QFGEFRTALVAYLQANVSWGDNVAAAWNKALDNTFAIVVPRL +> GLBM_ANATR +STFGELANEVVNNSYHKDLLRLSWGVLSDDMEGTGLMLMANLFNMSPESRLKFGRLGHL +STGRDNSKLRGHSITLMYALKNFVDALDDVDRLKCVVEKFAVNHINRQISAEEFGKIVGP +FRAVLRIRMGDYFDEEIVAAWAALIAVVQAAL +> GLBT_CHITH 
+VATPAMPSMTDAQVAAVKGDWEKIKGSGVEILYFFLNKFPGNFPMFKKLGNDLAAAKGT +AEFKDQADKIIAFLQGVIEKLGSDMGGAKALLNQLGTSHKAMGITKDQFDQFRQALTELL +GNLGFGGNIGAWNATVDLMFHVIFNALDGTPV +> GLBX_CHITH +DPEWHTLDAHEVEQVQATWKAVSHDEVEILYTVFKAHPDIMAKFPKFAGKDLEAIKDTA +DFAVHASRIIGFFGEYVTLLGSSGNQAAIRTLLHDLGVFHKTRGITKAQFGEFRETMTAY +LKGHNKwnADISHSWDDAFDKAFSVIFEVLES +> GLBY_CHITP +MKVLAIFALCIIGALATPcDDFKIMQEAWNTMKNEEVEILYTVFKAYPDIQAKFPQFVG +KDLETIKGTAEFAVHATRIVSFMTEVISLLGNPDNLPAIMSLLSKLGKDHKGRGITVKQF +DEFHEAFHNFLHTHSVWNDNVDAAWHCNEKEIRKVINANLE +> GLBZ_CHITH +MKFIILALCVAAASALSGDQIGLVQSTYGKVKGDSVGILYAVFKADPTIQAAFPQFVGK +DLDAIKGGAEFSTHAGRIVGFLGGVIDDLPNIGKHVDALVATHKPRGVTHAQFNNFRAAF +IAYLKGHVDYTAAVEAAWGATFDAFFGAVFAKM +> GLB_APLJU +ALSAADAGLLAQSWAPVFANSDANGASFLVALFTQFPESANFFNDFKGKSLADIQASPK +LRDVSSRIFARLNEFVSNAADAGKMGSMLQQFATEHAGFGVGSAQFQNVRSMFPGFVASL +SAPAADAAWNSLFGLIISALQSAGK +> GLB_APLKU +SLSAAEADLVGKSWAPVYANKDADGANFLLSLFEKFPNNANYFADFKGKSIADIKASPK +LRDVSSRIFTRLNEFVNNAADAGKMSAMLSQFASEHVGFGVGSAQFENVRSMFPAFVASL +SAPPADDAWNKLFGLIVAALKAAGK +> GLB_APLLI +SLSAAEADLAGKSWAPVFANKNANGADFLVALFEKFPDSANFFADFKGKSVADIKASPK +LRDVSSRIFTRLNEFVNDAANAGKMSAMLSQFAKEHVGFGVGSAQFENVRSMFPGFVASV +AAPPAGADAWTKLFGLIIDALKAAGK +> GLB_BUSCA +GLDGAQKTALKESWKVLGADGPtmmKNGSLLFGLLFKTYPDTKKHFKHFDDATFAAMDT +TGVGKAHGVAVFSGLGSMICSIDDDDCVXGLAKKLSRNHLARGVSAADFKLLEAVFKXFL +DEATQRKATDAQKDADGALLTMLIKAHV +> GLB_CERRH +SLQPASKSALASSWKTLAKDAAtiqNNGATLFSLLFKQFPDTRNYFTHFGNMSDAEMKT +TGVGKAHSMAVFAGIGSMIDSMDDADCMNGLALKLSRNHIQRKIGASRFGEMRQVFPNFL +DEALGGGASGDVKGAWDALLAYLqdnkqAQAL +> GLB_DOLAU +ALSAAEAEVVAKSWGPVFANKDANGDNFLIALFEAYPDSPNFFADFKGKSIADIRASPK +LRNVSSRIVSRLNEFVSSAADAGKMAAMLDQFSKEHAGFGVGSQQFQNVSAMFPGFVASI +AAPPAGADAAWGKLFGLIIDAMKKAGK +> GLB_LAMFL +PIVDSGSVAPLSAAEKTKIRSAWAPVYSNYETSGVDILVKFFTSTPAAQEFFPKFKGMT +SADQLKKSADVRWHAERIINAVNDAVASMDDTEKMSMKLRDLSGKHAKSFQVDPQYFKVL +AAVIADTVAAGDAGFEKLMSMICILLRSAY +> GLB_TETPY +MNKPQTIYEKLGGENAMKAAVPLFYKKVLADERVKHFFKNTDMDHQTKQQTDFLTMLLG +GPNHYKGKNMTEAHKGMNLQNLHFDAIIENLAATLKELGVTDAVINEAAKVIEHTRKDML +GK +> GLB_TUBTU 
+ECDALQRFKVKHQWAEAFGTShHRLDFGLKLWNSIFRDAPEIRGLFKRVDGDNAYSAEF +EAHAERVLGGLDMTISLLDDQAAFDAQLAHLKSQHAERNIKADYYGVFVNELLAVLPDYL +GTKLDFKAWSECLGVITGAIHD +> GLP1_GLYDI +MHLTADQVAALKASWPEVSAGDGGAQLGLEMFTKYFHENPQMMFIFGYSGRTEALKHSS +KLQHHGKVIIDQIGKAVAEMDNAKQMAGTLHALGVRHKGFGDIRAEFFPALGMCLLDAME +EKVPGLNRTLWAAAYREISDACIAGLQS +> GLP2_GLYDI +MPLTADQVAALKASWPEVSAGDGGGQLGLELFTKYFHENPQMMFIFGYSGRTDALKHNA +KLQNHGKVIIDQIGKAVAEMDNAKQMAGTLHALGVRHKGFGDIRADFFPALGMCLLDAME +EKVPGLNRTLWAAAYREISDALVAGLES +> GLP3_GLYDI +MHLTADQVAALKASWPEVSAGDGGAQLGLEMFTRYFDENPQMMFVFGYSGRTSALKHNS +KLQNHGKIIVHQIGQAVSELDDGSKFEATLHKLGQEHKGFGDIKGEYFPALGDALLEAMN +SKVHGLDRTLWAAGYRVISDALIAGLES +> HBA1_BOSMU +VLSAADKGNVKAAWGKVGGHAAEYGAEALERMFLSFPTTKTYFPHFDLSQGSAQVKGHG +AKVAAALTKAVEHLDDLPGALSELSDLHAHKLRVDPVNFKLLSHSLLVTLASHLPSDFTP +AVHASLDKFLANVSTVLTSKYR +> HBA1_GALCR +VLSPTDKSIVKAAWEKVGAHAGDYGAEALERMFLSFPTTKTYFPQFDLSHGSAQVKGHG +KKVADALTNAVLHVDDMPSALSALSDLHAHKLTVDPVNFKLLSHCLLVTLACHLPAEFTP +AVHASLDKFMASVSTVLTSKYR +> HBA1_IGUIG +VLTEDDKNHIRAIWGHVDNNPEAFGVEALTRLFLAYPATKTYFAHFDLNPGSAQIKAHG +KKVVDALTQAVNNLDDIPDALAKLADLHAEKLRVDPVNFGLLGHCILVTIAAHNHGPLKA +DVALSMDKFLTKVAKTLVAHYR +> HBA1_LEMVA +VLSPADKNNVKSAWNAIGSHAGEHGAEALERMFLSFPPTKTYFPHFDLSHGSAQIKTHG +KKVADALTNAVNHIDDMPGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLASHHPAEFTP +AVHASLDKFFAAVSTVLTSKYR +> HBA1_NOTCO +SLSDKDKAAVKALWSKIGKSADAIGNDALSRMIVVYPQTKTYFSHWPSVTPGHPDIKAH +GKKVMGGLAIAVSKINDLKAGLSNLSQQHAYKLRVDPANFKILNHCILVVISTMFPKNFT +PQAHVSLNKFLSGVALALAQRYR +> HBA1_PLEWA +KLTAEDKHNVKAIWDHVKGHEEAIGAEALYRMFCCMPTTRIYFPAKDLSERSSYLHSHG +KKVVGALTNAVAHIDDIDTAFSKLSDKHAEELMVDPANFPKLAHNILVVLGIHLKPHFTY +SVHRSVDKFLSTVAYVLASKYR +> HBA1_SALIR +SLTAKDKSVVKAFWGKISGKADVVGAEALGRdkMLTAYPQTKTYFSHWADLSPGSGPVK +KHGGIIMGAIGKAVGLMDDLVGGMSALSDLHAFKLRVDPGNFKILSHNILVTLAIHFPSD +FTPEVHIAVDKFLAAVSAALADKYR +> HBA1_TACAC +VLTDAEKKEVTSLWGKASGHAEEYGAEALERLFLSFPTTKTYFSHMDLSKGSAQVKAHG +KRVADALTTAAGHFNDMDSALSALSDLHAHKLRVDPVNFKLLAHCFLVVLARHHPAEFTP +SAHAAMDKFLSRVATVLTSKYR +> HBA1_TADBR 
+VLSPEDKNNVKAAWSKVGGQAGDYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVGEALTTAVNHMDDLPGALSTLSDLHAYKLRVDPVNFKLLSHCLLVTLACHNPGEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA1_TORMA +VLSEGNKKAIKNLLQKIHSQTEVLGAEALARLFECHPQTKSYFPKFSGFSANDKRVKHH +GALVLKALVDTNKHLDDLPHHLNKLAEKHGKGLLVDPHNFKLFSDCIAVTLAAHLQEFSP +ETHCAVDKFLEEVTYQLSSLYR +> HBA1_TRICR +MKLSADDKHNVKAIWEHVKGHEEAIGAEALCRMFTSLPTTRTYFPTKDIKEGSSFLHSH +GKKVMGALSNAVAHIDDIDGALSKLSDKHAEELMVDPANFPKLAHNILVVLGIHLKPHLT +YSVHSSVDKFLATVGYVLASKYR +> HBA1_XENBO +LLSADDKKHIKAIMPSIAAHGDKFGGEALYRMFLVNPKTKTYFPTFDFHHNSKQISAHG +KKVVDALNEASNHLDNIAGSLSKLSDLHAYDLRVDPGNFPLLAHNILVVVAMNFPKQFDP +ATHKALDKFLATVSSVLTSKYR +> HBA1_XENLA +LLSADDKKHIKAIMPAIAAHGDKFGGEALYRMFIVNPKTKTYFPSFDFHHNSKQISAHG +KKVVDALNEASNHLDNIAGSMSKLSDLHAYDLRVDPGNFPLLAHNILVVVAMNFPKQFDP +ATHKALDKFLATVSTVLTSKYR +> HBA2_BOSMU +VLSAADKGNVKAAWGKVGGHAAEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +AKVAAALTKAVGHLDDLPGALSELSDLHAHKLRVDPVNFKLLSHSLLVTLASHLPSDFTP +AVHASLDKFLANVSTVLTSKYR +> HBA2_GALCR +VLSPTDKSNVKAAWEKVGAHAGDYGAEALERMFLSFPTTKTYFPHFDLSHGSTQVKGHG +KKVADALTNAVLHVDDMPSALSALSDLHAHKLRVDPVNFKLLRHCLLVTLACHHPAEFTP +AVHASLDKFMASVSTVLTSKYR +> HBA2_LEMVA +VLSPADKNNVKSAWKAIGSHAGEHGAEALERMFLSFPPTKTYFPHFDLSHGSAQIKTHG +KKVADALTNAVNHIDDMPGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLASHHPAEFTP +AVHASLDKFFAAVSTVLTSKYR +> HBA2_NOTCO +SLSTKDKETVKAFWSKVSGKSEDIGNDALSRMLVVYPQTKTYFSHWKELTPGSAPVRKH +GMTVMKGVGDAVSKIEDLTAGLMELSELHAFTLRVDPANFKISHNILVVFAIMFPKEFTA +EVHVSMDKFLAALARALSEKYR +> HBA2_PLEWA +NVKAVWEHVKGHEEVYGAEALYRAFLCDPQTQTYFAGKDLSENSAFLHSHGKKVMCALT +NAIAHIDDIDGCMSKLSDKHAHELMVDPGNFDILAHHILTVLAMFLSQLLTCANHRSVDK +FLSCVKNVLTSRYR +> HBA2_TACAC +VLTDAERKEVTSLWGKASGHAEDYGAEALERLFLSFPTTKTYFSHMDLSKGSAHVRAHG +KKVADALTTAVGHFNDMDGALSDLSDLHAHKLRVDPVNFKLLAHCFLVVLARHHPEEFTP +SAHAAMDKFLSRVATVLTSKYR +> HBA2_TORMA +VLSEGNKKIIKNLLQKIHSQTEVLGAEALARLFECHPQTKSYFPKFSGFSANDKRVKHH +GDLVLKALVDTNDHLDDLPHHLHKLAEKHGKDLLVDPHNFKLFSDCIAVTLAAHLQEKSP +ETHCAVDKFLEEVTYQLSSLYR +> HBA2_TRICR +VLSSQDKANVKAVWEHVKGHEEVYGAEALHRAFVCDPQTQTYFAGKDLKENSAYLHGHG 
+KKVMSALTNAVAHIDDIEGSMSKLSDKHAHELMVDPGNFDILAHHILTTMAMFMPQCLTS +ANHRSVDKFLSTVKHVLTSKYR +> HBA2_VAREX +VLTEDDKNHVKGLWAHVHDHIDEIAADALTRMFLAHPASKTYFAHFDLSPDNAQIKAHG +KKVANALNQAVAHLDDIKGTLSKLSELHAQQLRVDPVNFGFLRHCLEVSIAAHLHDHLKA +SVIVSLDKFLEEVCKDLVSKYR +> HBA2_XENBO +LLTADDKKHIKAILPSIAAHGDKFGGEALYRMFLINPKTKTYFPNFDFHHNSKQISAHG +KKVVDALNEAANHLDNIAGSMSKLSDLHAYDLRVDPGNFPLLAHNILVTVAMYFPQQFDP +HTHKALDKFLASVSSVLTSKYR +> HBA2_XENLA +LLSADDKKHIKAIMPSIAAHGDKFGGEASYRMFLVNPKTKTYFPSFDFHHNSKQITSHG +KKVVDALNEAANHLDNIAGSMSKLSDLHAYDLRVDPGNFPLLAHNLLVVVAMHFPKQFDP +ATHKALDKFLATVSTVLTSKYR +> HBA3_GORGO +VLSPADKTNVKAAWGKVGAHAGDYGAEALERMFLSFPTTKTYFPHFDLSHGSAXVKGHG +KKVAKALTXAVXHLDDMPNALSALSXLHAHKLRVXPVXFKLLNHCLLVTLAAXFPSXFTP +AVHASVDKFLASVSTVLTSKYR +> HBA3_PANTR +VLSPADKTNVKAAWGKVGAHAGXYGAEALERMFLSFPTTKTYFPHFDLSHGSAXVKGHG +KKVAKALSXAVXHLDDMPNALSALSXLHAHKLRVXPVXFKLLNHCLLVTLAAXFPSXFTP +AVHASVDKFLASVSTVLTSKYR +> HBA3_PLEWA +MVLSAEEKALVVGLCGKISGHCDALGGEALDRLFASFGQTRTYFSHFDLSPGSADVKRH +GGKVLSAIGEAAKHIDSMDQALSKLSDLHAYNLRVDPGNFQLLSHCIQAVLAAHFPADFT +PQCQAAWDKFLAAVSAVLTSKYR +> HBA3_RANCA +SLSASEKAAVLSIVGKIGSQGSALGSEALTRLFLSFPQTKTYFPHFDLTPGSADLNTHG +GKIINALAGAANHLDDLAGNLSSLSDLHAYNLRVDPGNFPLLAHIIQVVLATHFPGDFTA +EVQAAWDKFLALVSAVLTSKYR +> HBA3_XENLA +TLTDSDKAAVVALWGKIAPQANAIGAEALERLFLSYPQTKTYFSHFDLSHGSADLANHG +GKVVNALGEAAKHIDDLDAALSTLSDLHAYNLRVDPGNFKLLSHTIQVTLAIHFHKEFDA +ATQAAWDKFLAEVATVLTSKYR +> HBA3_XENTR +TLTDSEKAAVVALWSKIAPQASAIGAEALERLFLSYPQTKTYFSHFDVSHGSADLQNHG +GKVVNALGEAAKHLNDLDAALSTLSDLHAYNLRVDPGNFKLLSHTIQVTLAVHFQKEFDA +ATQAAWDKFLSEVATVLTSKYR +> HBA4_SALIR +SLSAKDKANVKAIWGKILPKSDEIGEQALSRMLVVYPQTKAYFSHWASVAPGSAPVKKH +GITIMNQIDDCVGHMDDLFGFLTKLSELHATKLRVDPTNFKILAHNLIVVIAAYFPAEFT +PEIHLSVDKFLQQLALALAEKYR +> HBA4_XENLA +TLTDSDKAAIVALWGKIAPQASAIGAEALERLFLSYPQTKTYFSHFDVSHGSADLSNHG +GKVVNALGEAAKHIDDLDSALSTLSDLHAYNLRIDPGNFKLLSHTIQVTLAIHFHKEFDA +ATQAAWDKFLAEVATVLTSKYR +> HBA5_XENLA +TFSSAEKAAIASLWGKVSGHTDEIGAEALERLFLSYPQTKTYFSHFDLSHGSKDLRSHG +GKVVKAIGNAATHIDDIPHALSALSDLHAFKLKVDPGNFKLLSHAIQVTLAIHFPAEFNA 
+DAQAAWDKFLAVVSAVLVSKYR +> HBAD_ACCGE +MLTAEDKKLIQAIWDKVQGHQEDFGAEALQRMFITYPTTKTYFPHFDLSPGSDQVRSHG +KKVVNALGNAVKSMDNLSQALSELSNLHAYNLRVDPVNFKLLSQCFQVVLAVHLGKEYTP +EVHSAFDKFLSAVAAVLAEKYR +> HBAD_AEGMO +MLTADDKKLIQATWDKVQGHQEDFGAEALQRMFITYPPTKTYFPHFDLSPGSDQVRGHG +KKVVNALGNAVKSMDNLSQALSELSNLHAYNLRVDPVNFKLLSQCFQVVLAVHLGKEYTP +EVHAAFDKFLSAVAAVLAEKYR +> HBAD_ANAPL +MLTAEDKKLITQLWEKVAGHQEEFGSEALQRMFLAYPQTKTYFPHFDLHPGSEQVRGHG +KKVAAALGNAVKSLDNLSQALSELSNLHAYNLRVDPVNFKLLAQCFQVVLAAHLGKDYSP +EMHAAFDKFMSAVAAVLAEKYR +> HBAD_ANSAN +MLTADDKKLLAQLWEKVAGHQDEFGNEALQRMFVTYPQTKTYFPHFDLHPGSEQVRSHG +KKVAAALGNAVKSLDNISQALSELSNLHAYNLRVDPANFKLLSQCFQVVLAVHLGKDYTP +EMHAAFDKFLSAVAAVLAEKYR +> HBAD_ANSIN +MLSADDKKIIAQLWEKVAGHQDEFGNEALQRMFVTYPQTKTYFPHFDVHPGSEQVRSHG +KKVAAALGNAVKSLDNISQALSELSNLHAYNLRVDPANFKLLSQCFQVVLAVHLGKDYTP +EMHAAFDKFLSAVAAVLAEKYR +> HBAD_APUAP +MLTAEDKKLIQQVWDKLQGCQEEVGAETLQRMFTTYPQTKTYFPHFDLSPGSDQIRGHG +KKVVAALGTAVKSLDNLSQALSELSNLHAYNLRVDPVNFKLLAQCLQVVLATHMTKDYTP +EIHAAFDKFLSAVAAVLAEKYR +> HBAD_BRACA +MLTADDKKILAQLWEKVAGHQDEFGNEALERMFVTYPQTKTYFPHFDLHPGSEQVRSHG +KKVAAALSNAVKSIDNLSQALSELSNLHAYNLRVDPANFKLLSQCFQVVLAVHLGKDYTP +EMHAAFDKFLSAVAAVLAEKYR +> HBAD_CAIMO +MLTAEDKKLIVQVWEKVAGHQEEFGSEALQRMFLAYPQTKTYFPHFDLHPGSEQVRGHG +KKVAAALGNAVKSLDNLSQALSELSNLHAYNLRVDPVNFKLLAQCFQVVLAAHLGKDYSP +EMHAAFDKFLSAVAAVLAEKYR +> HBAD_CHICK +MLTAEDKKLIQQAWEKAASHQEEFGAEALTRMFTTYPQTKTYFPHFDLSPGSDQVRGHG +KKVLGALGNAVKNVDNLSQAMAELSNLHAYNLRVDPVNFKLLSQCIQVVLAVHMGKDYTP +EVHAAFDKFLSAVSAVLAEKYR +> HBAD_CHLME +MLTADDKKLLTQLWEKVAGHQEEFGSEALQRMFLTYPQTKTYFPHFDLHPGSEQVRGHG +KKVAAALGNAVKSLDNLSQALSELSNLHAYNLRVDPANFKLLAQCFQVVLATHLGKDYSP +EMHAAFDKFLSAVAAVLAEKYR +> HBAD_CHRPI +MLNHDEKQLIKHAWEKVLGHQEDFGAEALERMFAVYPQTKTYFPHFDLHHDSEQIRHHG +KKVVTALGDAVRHMDNLSEALSELSNLHAYNLRVDPVNFKLLSHCFQVVLAVHLADEYTP +QVHVAYDKFLAAVSAVLAEKYR +> HBAD_GYPRU +MLTADDKKLIQTTWDKVQGHQEDFGAEALQRMFITYPQTKTYFPHFDLSPGSDQVRGHG +KKVVNALGNAVKSMDNLSQALSELSNLHAYNLRVDPVNFKLLSQCFQVVLAVHLGKEYTP +EVHSAFDKFLSAVAAVLAEKYR +> HBAD_PASMO 
+MLTAEDKKLIQQIWGKLGGAEEEIGADALWRMFHSYPSTKTYFPHFDLSQGSDQIRGHG +KKVVAALSNAIKNLDNLSQALSELSNLHAYNLRVDPVNFKFLSQCLQVSLATRLGKEYSP +EVHSAVDKFMSAVASVLAEKYR +> HBAD_PHACA +MLGAEETALVRGVWQKVESAKDEMGEETLTRMFLVYPKTKTYFPHFDLHHGSEQIRNHG +KKVVTALGNAIQNLDNLRQTLADLSNLHAYNLRVDPVNFKLLAQCFQVVLAVHLGQEYTP +EVHVAFDKFLTAVAAVLAEKYR +> HBAD_PHACO +MLNAEDKKLIQQAWEKAASHQQEFGAEALVRMFTAYPQTKTYFPHFDLSPGSDQIRGHG +KKVLGALSNAVKNVDNLSQAMSELSNLHAYNLRVDPVNFKLLSQCIEVVLAVHMGKDYTP +EVHAAFDKFLSAVSAVLAEKYR +> HBAD_PHRHI +MLSADEKQLILHAWEKVHTHQEDFGAEALERMFTVYPQTKTYFHHFDLHHGSEQIRRHG +KKVVVALENAVHHMDNLSAALCKLSDLHAYNLRVDPVNFKLLSHCFHVVLAGHLGEEYSP +QVHVAYDKFLAAVSDVLAEKYR +> HBAD_RHEAM +MLTADDKKLISQIWTKVAEHGGEFGGEALERMFITYPQTKTYFPHFDLHVGSEQVRGHG +KKVVNALSNAVKNLDNLSQALAELSNLHAYNLRVDPVNFKLLSQCFQVVLAVHLGKEYTP +EVHAAYDKFLSAVASVLAEKYR +> HBAD_SPHPU +VLTHEDCELLQQTWEKVLGHQEDFGAEALERMFITYPQTKTYFPHFDLHHGSEQIRNHG +RKVVNALGEAVKNMDHMSTASGELSNLHAYNLRVDPVNFKLLSECFEVVLAVHLKDQYTP +DVHRAYDKFLSAVGDMLAEKYR +> HBAD_STRCA +MLTADDKKLIQQIWEKVGSHLEDFGAEALERMFITYPQTKTYFPHFDLHPGSEQIRGHG +KKVANALGNAVKSLDNLSQALSELSNLHAYNLRVDPVNFKLLSQCFQVVLAVHMGKDYTP +EVHAAYDKFLTAVAAVLAEKYR +> HBAD_STUVU +VLTAEDKKLIQQTWGKLGGAEEEIGAEALWRMFHAYPPTKTYFPHFDLSQGSDQIRGHG +KKVVAALGNAIKNLDNLSQALSELSNLHAYNLRVDPVNFKFLSQCLQVTLATRLGKEYSP +EVHSAVDKFMSAVAAVLAEKYR +> HBAD_TURME +VLTGEDKKHVQHIWGLLSGAEEDLGAEVLYRMFQSYPPTKTYFPHFDVTQGSEQIRGHG +KKFMAALGNAVKNVDNLSQALSELSNLHAYNLRVDPVNFKFLSQCLQVALAARLGKEYSP +EVHSAVDKFMAAVAAVLAEKYR +> HBAM_RANCA +GLSDSEKSAVASLWEKIAPQTNKLGAESMERLFKNHPETKSFFSRFDISPGSQDLLTHG +GKIFGALGEAIKSLDNLQKYQDLHTNKLKLSSDHMKLLSAAIIEVFTAHFGGEVNQAAWN +KFLGEVGAILTSS +> HBAT_HORSE +ALAAADRATVRALWKKMGSNVGVYATEALERMFLGFPSTTTYFLHLDLSLGSTQVKAHG +QKVADALTLAVEHLEDLPRALSALRHRHVRELRVDPASFQLLGHCLLVTPARHFPGDFSP +TLHASLVKFLSHVISALASDCR +> HBAT_HUMAN +ALSAEDRALVRALWKKLGSNVGVYTTEALERTFLAFPATKTYFSHLDLSPGSSQVRAHG +QKVADALSLAVERLDDLPHALSALSHLHACQLRVDPASFQLLGHCLLVTLARHYPGDFSP +ALQASLDKFLSHVISALVSEYR +> HBAT_PAPAN +ALSAEDRALVRALWKKLGSNVGVYATEALERTFLAFPATKTYFSHLDLSPGSAQVRAHG 
+QKVADALSLAVERLDDLPRALSALSHLHACQLRVDPANFPAPGPLPAGDPRPALPRRLQP +GAAGVAGQVPEPRDLCAGFRVP +> HBAT_PONPY +ALSAEDRALVRALWKKLGSNVGVYTTEALERTFLAFPATKTYFSHLDLSPGSSQVRAHG +QKVADALSLAVERLDDLPHALSALSHLHACQLRVDPASFQLLGHCLLVTLARHYPGDFSP +ALQASLDKFLSHVISALASEYR +> HBAZ_CAPHI +SLTRTERTIILSLWSKISTQADVIGTETLERLFSCYPQAKTYFPHFDLHSGSAQLRAHG +SKVVAAVGDAVKSIDNVTSALSKLSELHAYVLRVDPVNFKFLSHCLLVTLASHFPADFTA +DAHAAWDKFLSIVSGVLTEKYR +> HBAZ_HORSE +SLTKAERTMVVSIWGKISMQADAVGTEALQRLFSSYPQTKTYFPHFDLHEGSPQLRAHG +SKVAAAVGDAVKSIDNVAGALAKLSELHAYILRVDPVNFKFLSHCLLVTLASRLPADFTA +DAHAAWDKFLSIVSSVLTEKYR +> HBAZ_HUMAN +SLTKTERTIIVSMWAKISTQADTIGTETLERLFLSHPQTKTYFPHFDLHPGSAQLRAHG +SKVVAAVGDAVKSIDDIGGALSKLSELHAYILRVDPVNFKLLSHCLLVTLAARFPADFTA +EAHAAWDKFLSVVSSVLTEKYR +> HBAZ_MOUSE +SLMKNERAIIMSMWEKMAAQAEPIGTETLERLFCSYPQTKTYFPHFDLHHGSQQLRAHG +FKIMTAVGDAVKSIDNLSSALTKLSELHAYILRVDPVNFKLLSHCLLVTMAARFPADFTP +EVHEAWDKFMSILSSILTEKYR +> HBAZ_PANTR +SLTKTEGTIIVSMWAKISTQADTIGTETLERLFLSHPQTKTYFPHFDLHPGSAQLRAHG +SKVVAAVGDAVKSIDNIGGALSKLSELHAYILRVDPVNFKLLSHCLLVTLAARFPADFTA +EAHAAWDKFLSVVSSVLTEKYR +> HBAZ_PIG +SLTKAERTIIGSMWTKISSQADTIGTETLERLFASYPQAKTYFPHFDLNPGSAQLRAHG +SKVLAAVGEAVKSIDNVSAALAKLSELHAYVLRVDPVNFKFLSHCLLVTLASHFPADLTA +EAHAAWDKFLTIVSGVLTEKYR +> HBA_ACCGE +VLSANDKTNVKNVFTKIGGHAEEYGAETLERMFTTYPPTKTYFPHFDLHHGSAQIKAHG +KKVVGALIEAVNHIDDIAGALSKLSDLHAQKLRVDPVNFKLLGQCFLVVVAIHHPSVLTP +EVHASLDKFLCAVGNVLTAKYR +> HBA_AEGMO +VLSANDKTNVKTVFTKITGHAEDYGAETLERMFITYPPTKTYFPHFDLHHGSAQIKAHG +KKVVGALIEAVNHIDDIAGALSKLSDLHAQKLRVDPVNFKLLGQCFLVVVAIHHPSVLTP +EVHASLDKFLCAVGNVLTAKYR +> HBA_AILFU +VLSPADKTNVKSTWDKLGGHAGEYGGEALERTFASFPTTKTYFPHFDLSPGSAQVKAHG +KKVADALTLAVGHLDDLPGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLACHHPAEFTP +AVHASLDKFFSAVSTVLTSKYR +> HBA_AILME +VLSPADKTNVKATWDKIGGHAGEYGGEALERTFASFPTTKTYFPHFDLSPGSAQVKAHG +KKVADALTTAVGHLDDLPGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLASHHPAEFTP +AVHASLDKFFSAVSTVLTSKYR +> HBA_ALCAA +VLSATDKSNVKAAWGKVGGNAPAYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKAHG +EKVANALTKAVGHLDDLPGTLSDLSDLHAHKLRVDPVNFKLLSHTLLVTLAAHLPSDFTP 
+AVHASLDKFLANVSTVLTSKYR +> HBA_ALLMI +VLSMEDKSNVKAIWGKASGHLEEYGAEALERMFCAYPQTKIYFPHFDMSHNSAQIRAHG +KKVFSALHEAVNHIDDLPGALCRLSELHAHSLRVDPVNFKFLAHCVLVVFAIHHPSALSP +EIHASLDKFLCAVSAVLTSKYR +> HBA_AMBME +FKLSGEDKANVKAVWDHVKGHEDAFGHEALGRMFTGIEQTHTYFPDKDLNEGSFALHSH +GKKVMGALSNAVAHIDDLEATLVKLSDKHAHDLMVDPAEFPRLAEDILVVLGFHLPAKFT +YAVQCSIDKFLHVTMRLCISKYR +> HBA_ANAPE +VLSAADKTNVKGVFSKIGGHAEEYGAETLERMFIAYPQTKTYFPHFDLSHGSAQIKAHG +KKVAAALVEAVNHIDDIAGALSKLSDLHAQKLRVDPVNFKFLGHCFLVVVAIHHPAALTP +EVHASLDKFLCAVGAVLTAKYR +> HBA_ANAPL +VLSAADKTNVKGVFSKIGGHAEEYGAETLERMFIAYPQTKTYFPHFDLSHGSAQIKAHG +KKVAAALVEAVNHVDDIAGALSKLSDLHAQKLRVDPVNFKFLGHCFLVVVAIHHPAALTP +EVHASLDKFMCAVGAVLTAKYR +> HBA_ANAPP +VLSAADKTNVKGVFSKIGGHAEEYGAETLERMFIAYPQTKTYFPHFDLSHGSAQIKAHG +KKVAAALVEAVNHIDDIAGALSKLSDLHAQKLRVDPVNFKFLGHCFLVVVAIHHPAALTP +EVHASLDKFMCAVGAVLTAKYR +> HBA_ANSAN +VLSAADKTNVKGVFSKIGGHAEEYGAETLERMFTAYPQTKTYFPHFDLQHGSAQIKAHG +KKVAAALVEAVNHIDDIAGALSKLSDLHAQKLRVDPVNFKFLGHCFLVVVAIHHPSALTP +EVHASLDKFLCAVGTVLTAKYR +> HBA_ANSIN +VLSAADKTNVKGVFSKISGHAEEYGAETLERMFTAYPQTKTYFPHFDLQHGSAQIKAHG +KKVVAALVEAVNHIDDIAGALSKLSDLHAQKLRVDPVNFKFLGHCFLVVVAIHHPSALTA +EVHASLDKFLCAVGTVLTAKYR +> HBA_ANSSE +VLSAADKGNVKTVFGKIGGHAEEYGAETLQRMFQTFPQTKTYFPHFDLQPGSAQIKAHG +KKVAAALVEAANHIDDIAGALSKLSDLHAQKLRVDPVNFKFLGHCFLVVLAIHHPSLLTP +EVHASMDKFLCAVATVLTAKYR +> HBA_ANTPA +VLSPADKTNVKAAWDKVGGHAGDYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVGDALGNAVAHMDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPGDFTP +AVHASLDKFLASVSTVLVSKYR +> HBA_APTFO +VLSADDKSNVKSIFSKLHTHACEYGAEPLERMFXTYPTTKTYFPHFDLSHGSAXVKAHG +KKVAXXIGKAIAXLXXIAGALSKLSXLHAXKLRVXPVXFKLLSHGLXVAXAKXLVRXFTP +GVTASLXKIHKSVSAAHQAKYR +> HBA_APUAP +VLSAADKTNVKGVFAKIGGQAEALGGEALARMFAAYPPTKTYFPHFDLSPGSAQVKAHG +KKVASALVEAANNIDDIAGALSKLSDLHAQKLRVDPVNFKLLGHCFLVVVAIHHPSVLTP +EVHASLDKFLCAVATVLTAKYR +> HBA_AQUCH +VLSANDKTNVKNVFTKISGHAEDYGAEALERMFTTYPPTKTYFPHFDLHHGSAQIKAHG +KKVVGALIEAVNHIDDMAGALSKLSDLHAQKLRVDPVNFKLLGQCFLVVVAIHHPSVLTP +EVHASLDKFLCAVGNVLTAKYR +> HBA_ARAAR 
+VLSGSDKTNVKGIFSKIGGQAEDYGAEALERMFATFPQTKTYFPHFDVSPGSAQVKAHG +KKVAAALVEAANHIDDIATALSKLSDLHAQKLRVDPVNFKLLGQCFLVVVAIHNPSALTP +EVHASLDKFLCAVGNVLTAKYR +> HBA_ATEGE +VLSPADKSNVKAAWGKVGGHAGDYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHHPADFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_BALAC +VLSPTDKSNVKATWAKIGNHGAEYGAEALERMFMNFPSTKTYFPHFDLGHDSAQVKGHG +KKVADALTKAVGHMDNLLDALSDLSDLHAHKLRVDPANFKLLSHCLLVTLALHLPAEFTP +SVHASLDKFLASVSTVLTSKYR +> HBA_BISBO +VLSAADKGNVKAAWGKVGGHAAEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +AKVAAALTKAVGHLDDLPGALSELSDLHAHKLRVDPVNFKLLSHSLLVTLASHLPNDFTP +AVHASLDKFLANVSTVLTSKYR +> HBA_BOSGA +VLSAADKGNVKAAWGKVGDHAAEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +AKVAAALTKAVGHLDDLPGALSELSDLHAHKLRVDPVNFKLLSHSLLVTLASHLPNDFTP +AVHASLDKFLANVSTVLTSKYR +> HBA_BOVIN +VLSAADKGNVKAAWGKVGGHAAEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +AKVAAALTKAVEHLDDLPGALSELSDLHAHKLRVDPVNFKLLSHSLLVTLASHLPSDFTP +AVHASLDKFLANVSTVLTSKYR +> HBA_BRACA +VLSAADKTNVKGVFSKIGGHADEYGAETLERMFVAYPQTKTYFPHFDLQHGSAQIKAHG +KKVAAALVEAVNHIDDIAGALSKLSDLHAQKLRVDPVNFKFLGHCFLVVVAIHHPSALTP +EVHASLDKFLCAVGTVLTAKYR +> HBA_BRATR +VLSAADKAHVKAFWTKIGGHAGEYGGEALERTFLSFPTTKTYFPHFDLSPGSAQVKAHG +KKVGDALTLAVGHLDDLPGALSDLSDLHAHKLRVDPVNFKLLGHCVLVTLALHHPDAFTP +AVHASLDKFITTVSTVLTSKYR +> HBA_CAICR +VLSEEDKSHVKAIWGKVAGHLEEYGAEALERMFCAYPQTKIYFPHFDMSHNSAQIRGHG +KKVFAALHDAVNHIDDLAGALCRLSDLHAHNLRVDPVNFKFLSQCILVVFGVHHPCSLTP +EVHASLDKFLCAVSAMLTSKYR +> HBA_CAIMO +VLSAADKTNVKGVFSKIGGHAEEYGAETLERMFIAYPQTKTYFPHFDLQHGSAQIKAHG +KKVAAALVEAVNHIDDIAGALSKLSDLHAQKLRVDPVNFKFLGHCFLVVVAIHHPAALTP +EVHASLDKFMCAVGAVLTAKYR +> HBA_CALAR +VLSPADKSNVKAAWGKVGSHAGDYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHHPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_CAMDR +VLSSKDKTNVKTAFGKIGGHAAEYGAEALERMFLGFPTTKTYFPHFDLSHGSAQVKAHG +KKVGDALTKAADHLDDLPSALSALSDLHAHKLRVDPVNFKLLSHCLLVTVAAHHPGDFTP +SVHASLDKFLANVSTVLTSKYR +> HBA_CANFA +VLSPADKTNIKSTWDKIGGHAGDYGGEALDRTFQSFPTTKTYFPHFDLSPGSAQVKAHG 
+KKVADALTTAVAHLDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPTEFTP +AVHASLDKFFAAVSTVLTSKYR +> HBA_CAPHI +VLSAADKSNVKAAWGKVGGNAGAYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +EKVAAALTKAVGHLDDLPGTLSDLSDLHAHKLRVDPVNFKLLSHSLLVTLACHLPNDFTP +AVHASLDKFLANVSTVLTSKYR +> HBA_CARAU +SLSDKDKAVVKALWAKIGSRADEIGAEALGRMLTVYPQTKTYFSHWSDLSPGSGPVKKH +GKTIMGAVGDAVSKIDDLVGALSALSELHAFKLRIDPANFKILAHNVIVVIGMLFPGDFT +PEVHMSVDKFFQNLALALSEKYR +> HBA_CATCL +SLSDKDKADVKIAWAKISPRADEIGAEALGRMLTVYPQTKTYFAHWADLSPGSGPVKHG +KKViMGAIGDAVTKFDDLLGGLASLSELHASKLRVDPSNFKILANCITVVIMFYLPGDFP +PEVHASVDKFFQNLALALGQKYR +> HBA_CAVPO +VLSAADKNNVKTTWDKIGGHAAEYVAEGLTRMFTSFPTTKTYFHHIDVSPGSGDIKAHG +KKVADALTTAVGHLDDLPTALSTLSDVHAHKLRVDPVNFKFLNHCLLVTLAAHLGADFTP +SIHASLDKFFASVSTVLTSKYR +> HBA_CEBAP +VLSPADKTNVKTAWGKVGGHAGDYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALSNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHHPADFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_CEBCA +VLSPADKTNVKTAWGKVGAHAGDYGADALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALSNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHHPADFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_CERAE +VLSPADKSNVKAAWGKVGGHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALTLAVGHVDDMPHALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_CERSI +VLSPTDKTNVKTAWGHVGAQAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKAHG +KKVGDALTQAVGHLDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLALHHPQDFTP +AVHASLDKFLSNVSTVLTSKYR +> HBA_CERTO +VLSPDDKKHVKAAWGKVGEHAGEYGAEALERMFLSFPTTKTYFPHFNLSHGSDQVKGHG +KKVADALTLAVGHVDDMPHALSKLSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_CHICK +VLSAADKNNVKGIFTKIAGHAEEYGAETLERMFTTYPPTKTYFPHFDLSHGSAQIKGHG +KKVVAALIEAANHIDDIAGTLSKLSDLHAHKLRVDPVNFKLLGQCFLVVVAIHHPAALTP +EVHASLDKFLCAVGTVLTAKYR +> HBA_CHLME +VLSAADKANVKGVFSKIGGHADDYGAETLERMFIAYPQTKTYFPHFDLHHGSAQIKAHG +KKVAAALVEAVNHIDDITGALSKLSDLHAQKLRVDPVNFKFLGHCFLVVVAIHHPAALTP +EVHASLDKFMCAVGAVLTAKYR +> HBA_CHRPI +VLNAGDKANVKAVWNKVAAHVEEYGAETLERMFTVYPQTKTYFPHFDLHHGSAQIRTHG +KKVLTALGEAVNHIDDLASALSKLSDIHAQTLRVDPVNFKFLNHCFLVVVAIHQPSVLTP 
+EVHVSLDKFLSAVGTVLTSKYR +> HBA_CICCI +VLSANDKSNVRGVFGKISAHADDYGAETLERMFTVHPTQKTYFPHFDLHRGSAQIKAHG +KKVAGALLEAVNHIDDIAGALSKLSDLHAQKLRVDPVNFKLLGQCFLVVVAVHHPSLLTP +EVHASLDKFLCTVSTVLTDKYR +> HBA_COLBA +VLSPADKTNVKTAWGKVGGHGGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALTLAAAHVDDMPSALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHHPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_COLLI +VLSANDKSNVKAVFAKIGGQAGDLGGEALERLFITYPQTKTYFPHFDLSHGSAQIKGHG +KKVAEALVEAANHIDDIAGALSKLSDLHAQKLRVDPVNFKLLGHCFLVVVAVHFPSLLTP +EVHASLDKFVLAVGTVLTAKYR +> HBA_COTJA +VLSAADKTNVKGIFAKIAGHAEEYGAEALDRMFTTYPQTKTYFPHFDVSHGSAQIKGHG +KKVAAALVEAANHIDDIAGTLSKLSDLHAQKLRVDPVNFKLLGQCFLVVVAIHHPAALTP +EVHASLDKFLCAVGTVLTAKYR +> HBA_CRIGA +VLSADDKANIKATWEKIGGHGAEYGAEALERMFASFPTTKTYFPHFDVSHGSAQVKSHG +KKVADALANAAHHLDDLPGALSALSDLHAHKLRVDPVNFKLLGHCLLVTLATHLQAGLTP +AAHASLDKFLASVSTVLTSKYR +> HBA_CROCR +VLSSADKANIKATWDKIGGHGGEYGAEALERTFLCFPTTKTYFPHFDLSHGSAQVKAHG +KKVADALALAAAHLDDLPSALSALSDLHAYKLRVDPVNFKLLSHCLLVTLAAHHPAEFTP +AVHSDLDKFLSSVSTVLTSKYR +> HBA_CRONI +VLSSDDKCNVKAVWSKVAGHLEEYGAEALERMFCAYPQTKIYFPHFDLSHGSAQIRAHG +KKVFAALHEAVNHIDDLPGALCRLSELHAHSLRVDPVNFKFLAQCVLVVVAIHHPGSLTP +EVHASLDKFLCAVSSVLTSKYR +> HBA_CTEGU +VLSAADKTNVKAAWDKIGGHGGEYGAEALERMFLSFPTTKTYFPHFDVSHGSAQVKAHG +KKVADALANAASHLDDLPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLACHHPAEFTP +AVHASLDKFLATVATVLTSKYR +> HBA_CYGMA +SLSDKDKAAVKALWTTISKSSDAIGNDALSRMIVVYPQTKTYFSHWPDVTPGSTHIRDH +GKKVMGGISLAVSKIDDLKTGLFELSEQHAFKLRVDPANFKILNHCILVVIATMFPKEFT +PEAHVSLDKFLSGVALALAERYR +> HBA_CYGOL +VLSAADKTNVKGVFSKIGGHADDYGAETLERMFIAYPQTKTYFPHFDLQHGSAQIKAHG +KKVAAALVEAVNHIDDIAGALSKLSDLHAQKLRVDPVNFKFLGHCFLVVVAIHHPSALTP +EVHASLDKFLCAVGAVLTAKYR +> HBA_CYNSP +VLSPADKTNVKAAWDKVGGNAGEYGAEALERMFLSFPTTKTYFPHFDLAHGSPQVKGHG +KKVGDALTNAVSHIDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLANHLPSDFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_CYPCA +SLSDKDKAAVKGLWAKISPKADDIGAEALGRMLTVYPQTKTYFAHWADLSPGSGPVKKH +GKVIMGAVGDAVSKIDDLVGGLAALSELHAFKLRVDPANFKILAHNVIVVIGMLYPGDFP +PEVHMSVDKFFQNLALALSEKYR +> HBA_DASNO 
+VLSAADKTHVKAFWGKVGGHAAEFGAEALERMFASFPPTKTYFSHMDLSHGSAQVKAHG +KKVADALTLAVGHLDDLPGALSTLSDLHAHKLRVDPVNFKFLSHCLLVTLACHLPDDFTP +AVHASMDKFMAGVSTVLVSKYR +> HBA_DASVI +VLSDADKTHVKAIWGKVGGHAGAYAAEALARTFLSFPTTKTYFPHFDLSPGSAQIQGHG +KKVADALSQAVAHLDDLPGTLSKLSDLHAHKLRVDPVNFKLLSHCLIVTLAAHLSKDLTP +EVHASMDKFFASVATVLTSKYR +> HBA_DIDMA +VLSANDKTNVKGAWSKVGGNSGAYMGEALYRTFLSFPTTKTYFPNYDFSAGSAQIKTQG +QKIADAVGLAVAHLDDMPTALSSLSDLHAHELKVDPVNFKFLCHNVLVTMAAHLGKDFTP +EIHASMDKFLASVSTVLTSKYR +> HBA_ECHTE +VLSAADKANVKAVWEKAGGNVGKYGGEALDRTFLSFPTTKTYFPHMDLTPGSADIMAHG +KKVADALTLAVGHMDDLPGALSKLSDLHAYKLRVDPVNFKLLSHCLLVTLACHLGGDFTP +AAHASLDKFLSSVSTVLTSKYR +> HBA_ELEEL +SLTAKSKSIVKAFWGKIGSRADDIGAEAFGRMLTVYPETKTYFASWSDLSPGSAAVKKH +GKTIMGGIAEAVGHIDDLTGGLASLSELHAFKLRVDPANFKILAHNLIVVLALFFPADFT +PEVHMAVDKFFQNVASALSEKYR +> HBA_ELEMA +VLSDKDKTNVKATWSKVGDHASDYVAEALERMFFSFPTTKTYFPHFDLSHGSGQVKGHG +KKVGEALTQAVGHLDDLPSALSALSDLHAHKLRVDPVNFKLLSHCLLVTLSSHQPTEFTP +EVHASLDKFLSNVSTVLTSKYR +> HBA_EQUAS +VLSAADKTNVKAAWSKVGGNAGEFGAEALERMFLGFPTTKTYFPHFDLSHGSAQVKAHG +KKVGDALTLAVGHLDDLPGALSNLSDLHAHKLRVDPVNFKLLSHCLLSTLAVHLPNDFTP +AVHASLDKFLSSVSTVLTSKYR +> HBA_EQUHE +VLSAADKTNVKAAWSKVGGHAGDFGAEALERMFLGFPTTKTYFPHFDLSHGSAQVKAHG +KKVGDALTLAVGHLDDLPGALSNLSDLHAHKLRVDPVNFKLLSHCLLSTLAVHLPNDFTP +AVHASLDKFLSTVSTVLTSKTR +> HBA_EQUZE +VLSAADKTNVKAAWSKVGGNAGEFGAEALERMFLGFPTTKTYFPHFDLSHGSAQVKAHG +KKVGDALTLAVGHLDDLPGALSNLSDLHAHKLRVDPVNFKLLSHCLLSTLAVHLPNDFTP +AVHASLDKFLSTVSTVLTSKYR +> HBA_ERIEU +VLSATDKANVKTFWGKLGGHGGEYGGEALDRMFQAHPTTKTYFPHFDLNPGSAQVKGHG +KKVADALTTAVNNLDDVPGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLALHHPADFTP +AVHASLDKFLATVATVLTSKYR +> HBA_EUDCR +VLSANDKSNVKGVFSKISSHAEEYGAETLERMFTTYPQTKTYFPHFDLHHGSAQVKAHG +KKVATALMEAANHIDDIAGALSKLSDLHAQKLRVDPVNFKLLGQCFLVVMAIHHPSALTP +EVHASLDKFLCAVGNVLTSKYR +> HBA_EUDSC +VLSAADKTNVKGIFAKIGGHGDDYGAETLDRMFTVYPQTKTYFPHFDVSHGSAQIKAHG +KKVVAALVEAVNHIDDIAGALSKLSDLHAHKLRVDPANFKLLGQCFLVVVGIHHASALTP +EVHASLDKFLCAVSTVLTAKYR +> HBA_FELCA +VLSAADKSNVKACWGKIGSHAGEYGAEALERTFCSFPTTKTYFPHFDLSHGSAQVKAHG 
+QKVADALTQAVAHMDDLPTAMSALSDLHAYKLRVDPVNFKFLSHCLLVTLACHHPAEFTP +AVHASLDKFFSAVSTVLTSKYR +> HBA_FRAPO +VLSAADKNNVKGIFGKISSHAEDYGAEALERMFITYPSTKTYFPHFDLSHGSAQVKGHG +KKVVAALIEAANHIDDIAGTLSKLSDLHAHKLRVDPVNFKLLGQCFLVVVAIHHPSALTP +EVHASLDKFLCAVGNVLTAKYR +> HBA_GORGO +VLSPADKTNVKAAWGKVGAHAGDYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_GYPRU +VLSANDKTNVKNVFTKITGHAEDYGAETLERMFTTYPPTKTYFPHFDLHHGSAQIKAHG +KKVVGALIEAVNHIDDIAGALSKLSDLHAQKLRVDPVNFKLLGQCFLVVVAIHHPSVLTP +EVHASLDKFLCAVGNVLTAKYR +> HBA_HETPO +STSTSTSDYSAADRAELAALSKVLAQNAEAFGAEALARMFTVYAATKSYFKDYKDFTAA +APSIKAHGAKVVTALAKACDHLDDLKTHLHKLATFHGSELKVDPANFQYLSYCLEVALAV +HLTEFSPETHCALDKFLTNVCHELSSRYR +> HBA_HIPAM +VLSANDKSNVKAAWGKVGNHAPEYGAEALERMFLSFPTTKTYFPHFDLSHGSSQVKAHG +KKVADALTKAVGHLDDLPGALSDLSDLHAHKLRVDPVNFKLLSHCLLVTLAAHHPSDFTP +AAHASLDKFLANVSTVLTSKYR +> HBA_HORSE +VLSAADKTNVKAAWSKVGGHAGEYGAEALERMFLGFPTTKTYFPHFDLSHGSAQVKAHG +KKVGDALTLAVGHLDDLPGALSNLSDLHAHKLRVDPVNFKLLSHCLLSTLAVHLPNDFTP +AVHASLDKFLSSVSTVLTSKYR +> HBA_HUMAN +VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_LAMGL +VLSSKDKANIKTAFGKIGGHAADYGAEALERMFLGFPTTKTYFPHFDLSHGSAQVKAHG +KKVGDALTKAADHLDDLPSALSALSDLHAHKLRVDPVNFKLLSHCLLVTVAAHHPGDFTP +AVDASLDKFLANVSTVLTSKYR +> HBA_LAMPA +VLSSKDKANIKTAFGKIGGHAADYGAEALERMFLGFPTTKTYFPHFDLSHGSAQVKAHG +KKVGDALTKAADHLDDLPSALSALSDLHAHKLRVDPVNFKLLSHCLLVTVAAHHPGDFTP +AVHASLDKFLANVSTVLTSKYR +> HBA_LAMVI +VLSSKDKANVKTAFGKIGGHAADYGAEALERMFLGFPTTKTYFPHFDLSHGSAQVKAHG +KKVGDALTKAADHLDDLPSALSALSDLHAHKLRVDPVNFKLLSHCLLVTVAAHHPGDFTP +AVHASLDKFLTNVSTVLTSKYR +> HBA_LARRI +VLSGSDKTNVKGVFGKIGGHAEEYGAETLERMFATYPQTKTYFPHFDLQHGSAQVKAHG +KKVAAALVEAANHIDDIAGALSKLSDLHAQKLRVDPVNFKLLGQCFLVVVAIHHPSVLTP +EVHASLDKFLCAVGNVLTAKYR +> HBA_LATCH +GLTAADKTLIKSIWGKVEKETEAIGVEALVRLFKCFPQSKVYFDHFTDLSPSSQKLHAH +AKVVLGALTKAVNHLDNITDTLHDISLVHAKKLLVDPVNFELLGHCLEVALAAHFATDFT 
+PEVHLAIDKFLYEVEKALFETYR +> HBA_LEMFU +VLSPADKTNVKTAWNAVGGQAGEHGAEALERMFLSFPTTKTYFPHFDLSHGSGQVKAHG +KKVADALTNAVSHLDDMPGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLASHHPAEFTP +AVHASLDKFFAAVSTVLTSKYR +> HBA_LEPPA +MRFSQDDEVLIKEAWGLLHQIPNAGGEALARMFSCYPGTKSYFPHFGhDFSANNEKVKH +HGKKVVDAIGQGVQHLHDLSSCLHTLSEKHARELMVDPCNFQYLIEAIMTTIAAHYGEKF +TPEINCAAEKCLGQIVHVLISLYR +> HBA_LEPWE +VLSPADKTNVKTTWDKIGGHAGEYGGEALERTFMAFPTTKTYFPHFDLSPGSAQVKTHG +KKVADALTTAVSHIDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPADFTP +AVHASLDKFFSAVSTVLTSKYR +> HBA_LIOMI +VLTAEDRRLLQASVGKLGCRLEDIGADALNRLLITFPQSKTYFSHFNLSPGSKDIIHQG +EKVGKALDSALKHLDDIRGTLSQLSDLHAYNLRVDPVNFQLLSKCIHVSLATHLRNEYSA +SVTLAWDKFLELVADVLSEKYR +> HBA_LORTA +VLSPADKTNVKTAWEKVGGHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKAHG +KKVADALTTAVSHVDDMPSALSALSDLHAHKLRVDPVNFKLLSHCLLVTLACHHPADFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_LOXAF +VLSDNDKTNVKATWSKVGDHASDYVAEALERMFFSFPTTKTYFPHFDLGHGSGQVKAHG +KKVGEALTQAVGHLDDLPSALSALSDLHAHKLRVDPVNFKLLSHCLLVTLSSHQPTEFTP +EVHASLDKFLSNVSTVLTSKYR +> HBA_LUTLU +VLSPADKTNVKSTWDKIGGHAGEYGGEALERTFVSFPTTKTYFPHFDLSHGSAQVKAHG +KKVADALTNAVAHMDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPAEFTP +AVHASLDKFFSAVSTVLTSKYR +> HBA_MACAS +VLSPADKTNVKAAWGKVGGHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALTLAVGHVDDMPHALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_MACCA +VLSAADKGNVKAAWDKVGGQAGEYGAEALERMFLSFPTTKTYFPHFDLAHGSAQVKGHG +KKVADALTNAVGHMDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLASHHPAEFTP +AIHASLDKFFASVSTVLTSKYR +> HBA_MACFA +VLSPADKTNVKAAWGKVGGHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALTLAVGHVDDMPQALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_MACGG +VLSPADKANVKAAWDKVGGQAGDYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKAHG +KKVGDALSNAAGHLDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLASHHAAEFTP +AVHASLDKFLASVGTVLTSKYR +> HBA_MACGI +VLSAADKGHVKAIWGKVGGHAGEYAAEGLERTFHSFPTTKTYFPHFDLSHGSAQIQAHG +KKIADALGQAVEHIDDLPGTLSKLSDLHAHKLRVDPVNFKLLSHCLLVTFAAHLGDAFTP +EVHASLDKFLAAVSTVLTSKYR +> HBA_MACMU 
+VLSPADKSNVKAAWGKVGGHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALTLAVGHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_MACNE +VLSPADKTNVKAAWGKVGGHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALTLAVDHVDDMPQALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP +AVHASLDKFLASVGTVLTSKYR +> HBA_MACSI +VLSPADKTNVKDAWGKVGGHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALTLAVGHVDDMPQALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_MACSP +VLSPADKTNVKAAWDKVGGHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALTLAVGHVDDMPHALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_MANSP +VLSPADKKNVKAAWDKVGGHAGEYGAEALERMFLSFPTTKTYFPHFNLSHGSDQVKGHG +KKVADALTLAVGHVDDMPQALSKLSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_MARFO +VLSPADKTNVKSTWDKIGGHAGEYGGEALERTFVSFPTTKTYFPHFDLSPGSAQVKAHG +KKVADALTLAVGHLDDLAGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLACHHPAEFTP +AVHASLDKFFSTVSTVLTSKYR +> HBA_MARMA +VLSPADKTNVKAAWEKIGGHGAAYGAEALERMFLSFPTTKTYFPHFDLSHGSAQIQGHG +KKVADALANAAAHVDDLPSALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHHPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_MEGLY +VLSAADKANVKAAFDKVGGQAGDYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKAHG +KKVGDALVNAVGHLDDLPGALSALSDLHAYKLRVDPVNFKLASNVLLVTLAVHVAAGFTP +AVHASLDKFLASVGTVLTSKYR +> HBA_MELCA +VLSPSDKANVKATWDKIGGHAGEYGGEALERTFASFPTTKTYFPHFDLSPGSAQVKAHG +KKVADALTNAVAHGDDLPMALSTLSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPAEFTP +AVHASLDKFFSTVSTVLTSKYR +> HBA_MELME +VLSPADKANIKATWDKIGGHAGEYGGEALERTFASFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALTNAVAHLDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPAEFTP +AVHASLDKFLSSVSTVLTSKYR +> HBA_MESAU +VLSAKDKTNISEAWGKIGGHAGEYGAEALERMFFVYPTTKTYFPHFDVSHGSAQVKGHG +KKVADALTNAVGHLDDLPGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLANHHPADFTP +AVHASLDKFFASVSTVLTSKYR +> HBA_MOUSE +VLSGEDKSNIKAAWGKIGGHGAEYGAEALERMFASFPTTKTYFPHFDVSHGSAQVKGHG +KKVADALASAAGHLDDLPGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLASHHPADFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_MUSLU +VLSPADKTNVKSTWDKIGGHAGEYGGEALERTFASFPTTKTYFPHFDLSHGSAQVKAHG 
+KKVADALTNAVAHMDDLPGAMSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPAEFTP +AVHASLDKFFSAVSTVLTSKYR +> HBA_MUSPF +VLSPADKTNVKSTWDKIGGHAGEYGGEALERTFASFPTTKTYFPHFDLSHGSAQVKAHG +KKVADALTNAVAHVDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPAEFTP +AVHASLDKFFSAVSTVLTSKYR +> HBA_MUSPU +VLSPADKTNVKSTWDKIGGHAGEYGGEALERTFASFPTTKTYFPHFDLSHGSAQVKAHG +KKVADALTNAVAHMDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPAEFTP +AVHASLDKFFSAVSTVLTSKYR +> HBA_MYOVE +VLSPADKTNIKAAWDKVGAHAGDYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVGDALGNAVAHMDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHLPGEFTP +AIHASLDKFLASVSTVLVSKYR +> HBA_NASNA +VLSPADKTNIKSTWEKIGSHASEYGGEALERTFASFPTTKTYFPHFDLSPGSAQVKAHG +KKVAEALTNAVAHLDDLPGALSTLSDLHAYKLRVDPVNFKFLSHCLLVTLASHHPAEFTP +AVHASLDKFFSSVSTVLTSKYR +> HBA_NYCCO +VLSPADKTNVKAAWEKVGSHAGDYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKAHG +KKVADALTNAVSHVDDMPSALSALSDLHAHKLRVDPVNFKLLSHCLLVTLACHHPADFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_ODORO +VLSPADKTNVKTTWDKLGGHAGEYGGEALERTFMSFPTTKTYFPHFDLSPGSAQVKAHG +KKVADALTTAVAHIDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPAEFTP +AVHASLDKFFSTVSTVLTSKYR +> HBA_ODOVI +VLSAAXKSXVKAAWGKVGGNAAPYGAXALXRMFLSFPTTKTYFPHFXLSHGSAXVKAHG +XKVAXALTKAVGHLXXLPGTLSXLSXLHAHKLRVXPVXFKLLSHSLLVTLATHLPXXFTP +AVHASLXKFLAXVSTVLTSKYR +> HBA_ONDZI +VLSGEDKNNIKTAWGKIGGHAAEYGAEALERMFVVYPTTKTYFPHFDVSHGSGQVKAHG +KKVADALTTAVGHLDDLPGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLANHIPADFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_ORNAN +MLTDAEKKEVTALWGKAAGHGEEYGAEALERLFQAFPTTKTYFSHFDLSHGSAQIKAHG +KKVADALSTAAGHFDDMDSALSALSDLHAHKLRVDPVNFKLLAHCILVVLARHCPGEFTP +SAHAAMDKFLSKVATVLTSKYR +> HBA_PAGBE +SLSDKDKAAVRALWSKIGKSADAIGNDALSRMIVVYPQTKTYFSHWPDVTPGSPHIKAH +GKKVMGGIALAVSKIDDLKTGLMELSEQHAYKLRVDPANFKILNHCILVVISTMFPKEFT +PEAHVSLDKFLSGVALALAERYR +> HBA_PAGLA +VLSSADKNNIKATWDKIGSHAGEYGAEALERTFISFPTTKTYFPHFDLSHGSAQVKAHG +KKVADALTLAVGHLEDLPNALSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPAEFTP +AVHSALDKFFSAVSTVLTSKYR +> HBA_PANLE +VLSSADKNNVKACWGKIGSHAGEYGAEALERTFCSFPTTKTYFPHFDLSHGSAQVQAHG +QKVADALTKAVVHINDLPNALSDLSDLHAYKLRVDPVNFKFLSHCLLVTLACHHPEEFTP 
+AVHASLDKFFSAVSTVLTSKYR +> HBA_PANPO +VLSSADKNNVKACWGKIGSHAGEYGAEALERTFCSFPTTKTYFPHFDLSHGSAQVQAHG +QKVADALTKAVAHINDLPNALSDLSDLHAYKLRVDPVNFKFLSHCLLVTLACHHPEEFTP +AVHASLDKFFSAVSTVLTSKYR +> HBA_PANPS +VLSSADKNNVKACWGKIGSHAGEYGAEALERTFCSFPTTKTYFPHFDLSHGSAQVQTHG +QKVADALTKAVAHINDLPNALSDLSDLHAYKLRVDPVNFKFLSHCLLVTLACHHPEEFTP +AVHASLDKFFSAVSTVLTSKYR +> HBA_PANTS +VLSSADKNNVKACWGKIGSHAGEYGAEALERTFCSFPTTKTYFPHFDLSHGSAQVQTHG +QKVADALTKAVAHINNLPNALSDLSDLHAYKLRVDPVNFKFLSHCLLVTLACHHPEEFTP +AVHASLDKFFSAVSTVLTSKYR +> HBA_PAPCY +VLSPDDKKHVKAAWGKVGEHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSDQVNKHG +KKVADALTLAVGHVDDMPQALSKLSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_PASMO +VLSPADKSNVKGVFAKIGGQAEEYGADALERMFATYPQTKTYFPHFDLGKGSAQVKGHG +KKVAAALVEAVNNIDDLAGALSKLSDLHAQKLRVDPVNFKLLGQCFLVVVATGNPALLTP +EVHAPLDKFLCAVGTVLTAKYR +> HBA_PHACA +VLSASDKTNVKGVFAKVGGSAEAYGAETLERMFTAYPQTKTYFPHFDLHHGSAQIKAHG +KKVAAALVEAANHIDDIAGALSKLSDLHAQKLRVDPVNFKLLGHCFLVVVAIHHPTLLTP +EVHASLDKFMCAVAKELTAKYR +> HBA_PHACO +VLSAADKNNVKGIFTKIAGHAEEYGAEALERMFITYPSTKTYFPHFDLSHGSAQIKGHG +KKVVAALIEAVNHIDDITGTLSKLSDLHAHKLRVDPVNFKLLGQCFLVVVAIHHPSALTP +EVHASLDKFLCAVGTVLTAKYR +> HBA_PHORU +VLSSHDKSNVKGLFGKVGGHLEEYCAETLARMFAAYPQTKTYFPHFDLQPGSAQVKAHG +KKVAGALAEAANHIDDIASALSKLSDLHQHKLRVDPVNFKLLAHCFLVVMAIHHPSLLTP +EVHASLDKFLCAVGTVLTAKYR +> HBA_PHOVI +VLSPADKTNVKATWDKIGGHAGEYGGEALERTFTAFPTTKTYFPHFDLSHGSAQVKAHG +KKVADALTTAVAHMDDLPGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLACHHPADFTP +AVHASLDKFFSAVSTVLTSKYR +> HBA_PHYCA +VLSPADKTNVKAAWAKVGNHAADFGAEALERMFMSFPSTKTYFSHFDLGHNSTQVKGHG +KKVADALTKAVGHLDTLPDALSDLSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPGDFTP +SVHASLDKFLASVSTVLTSKYR +> HBA_PIG +VLSAADKANVKAAWGKVGGQAGAHGAEALERMFLGFPTTKTYFPHFNLSHGSDQVKAHG +QKVADALTKAVGHLDDLPGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHHPDDFNP +SVHASLDKFLANVSTVLTSKYR +> HBA_PONPY +VLSPADKTNVKTAWGKVGAHAGDYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKDHG +KKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_PREEN 
+VLSPADKTNVKAAWGKVGGHGGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALTNAVAHVDDMPHALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_PROCR +VLSSADKANIKATWDKIGGHGGEYGAEALERTFLCFPTTKTYFPHFDLSHGSAQVKAHG +KKVADALAVAAAHLDDLPAALSALSDLHAYKLRVDPVNFKLLSHCLLVTLAAHHPAEFTP +AVHASLDKFLSSVSTVLTSKYR +> HBA_PROHA +VLSAADKNNVKGAWEKVGTHAGEYGAEALERMFLSFPTTKTYFPHFDLTHGSAQVKAHG +QKVGAALTKAVGHLDDLPNALSDLSDLHAHKLRVDPVNFKLLSHCLLVTLSRHLPeQEFT +PAVHASLDKFFSNVSTVLTSKYR +> HBA_PROLO +VLSPADKANIKATWDKIGGHAGEYGGEALERTFASFPTTKTYFPHFDLSPGSAQVKAHG +KKVADALTLAVGHLDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPAEFTP +AVHASLDKFFTSVSTVLTSKYR +> HBA_PSIKR +VLSGTDKTNVKSIFSKIGGQADDYGAEALERMFVTYPQTKTYFPHFDVSPGSAQVKAHG +KKVAGGLSEAANHIDDIATSLSKLSDLHAQKLRVDPVNFKLLGQCFLVVVAIHNPSALTP +EAHASLDKFLCAVGLVLTAKYR +> HBA_PTEAL +VLSSTDKSNVKAAWDKVGGHVGEYGAEALERMFLSFPTTKTYFPHFDLAHGSSQVKAHG +KKVGDALTNAVGHIDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLASHLPSDFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_PTEBR +VLSPADKTNVKATWDKIGGHAGEYGGEALERTFASFPTTKTYFPHFDLSPGSAQVKAHG +KKVADALTNAVAHMDDLPAALSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPAEFTP +AVHASLDKFFSTVSTVLTSKYR +> HBA_PTEPO +VLSSTDKSNVKAAWDKVGGNVGEYGAEALERMFLSFPTTKTYFPHFDLAHGSSQVKAHG +KKVGDALTNAVGHMDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLANHLPNDFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_RABIT +VLSPADKTNIKTAWEKIGSHGGEYGAEAVERMFLGFPTTKTYFPHFDFTHGSEQIKAHG +KKVSEALTKAVGHLDDLPGALSTLSDLHAHKLRVDPVNFKLLSHCLLVTLANHHPSEFTP +AVHASLDKFLANVSTVLTSKYR +> HBA_RANTA +VLSAADKSNVKAAWGKVGGNAPAYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKAHG +EKVANALTKAVGHLDDLPGTLSDLSDLHAHKLRVDPVNFKLLSHTLLVTLASHLPSDFTP +AVHASLDKFLANVSTVLTSKYR +> HBA_RAT +VLSADDKTNIKNCWGKIGGHGGEYGEEALQRMFAAFPTTKTYFSHIDVSPGSAQVKAHG +KKVADALAKAADHVEDLPGALSTLSDLHAHKLRVDPVNFKFLSHCLLVTLACHHPGDFTP +AMHASLDKFLASVSTVLTSKYR +> HBA_RHEAM +VLSGPDKTNVKNVFAKIGGHADAYGAETLERMFTTYPQTKTYFPHFDLHHGSAQIKTHG +KKVVSALIDAANNIDDIYGALSKLSDLHAQKLRVDPVNFKLLGQCFLVVVAIHHPSLLTP +EVHASLDKFLCAVGAVLTAKYR +> HBA_RHIUN +VLSPTDKTNVKTAWSHVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKAHG 
+KKVGDALTQAVGHLDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLALHNPQDFTP +AVHASLDKFLSNVSTVLTSKYR +> HBA_ROUAE +VLSSADKTNIKAAWDKVGGNAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVGDALTNAVGHLDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLANHLPSDFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_SAGFU +VLSPADKSNVKAAWGKVGGHAGDYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHG +KKVADALTVAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPADFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_SALSA +SLTARDKSVVNAFWGKIKGKADVVGAEALGRMLTAYPQTKTYFSHWADLSPGSAPVKKH +GGVIMGAIGNAVGLMDDLVGGMSGLSDLHAFKLRVDPGNFKILSHNILVTLAIHFPADFT +PEVHIAVDKFLAALSAALADKYR +> HBA_SPAEH +VLSPEDKNHVRSTWDKIGGHGAEYGAEALERMFTSFPTTKTYFPHFDVSHGSAQVKAHG +KKVADALANAAGHLDDLPGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLANHHPAEFTP +GVHASLDKFLASVSTVLTSKYR +> HBA_SPECI +VLSPADKKNVKDCWEKIGGHGAEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVQGHG +KKVADALANAAAHVDDLPGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHHPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_SPEPA +VLSPADKTNVKASWEKIGGHGAAYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVQGHG +KKVADALANAAAHVDDLPGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHHPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_SPETO +VLSPADKNNVKACWEKIGGHGAAYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVQGHG +KKVADALANAAAHVDDLPSALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHHPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_SPHPU +MLSASDKANVKAIWSKVCVHAEEYGAETLERMFTVYPSTKTYFPHFDLTHGSAQVKAHG +KKVVNAMGEAVNHLDDMAGALLKLSDLHAQKLRVDPVNFKLLAQCFLVVLGVHHPAALTP +EVHASLDKFLCAVGLVLTAKYR +> HBA_SQUAC +VLSAADKTAIKHLTGSLRTNAEAWGAESLARMFATTPSTKTYFSKFTDFSANGKRVKAH +GGKVLNAVADATDHLDNVAGHLDPLAVLHGTTLCVDPHNFPLLTQCILVTLAAHLTELKP +ETHCALDKFLCEVATALGSHYR +> HBA_STRCA +VLSGTDKTNVKGIFSKISSHAEEYGAETLERMFITYPQTKTYFPHFDLHHGSAQIKAHG +KKVANALIEAVNHIDDISGALSKLSDLHAQKLRVDPVNFKLLGQCFLVVVAIHHPSALTP +EVHASLDKFLCAVGAVLTAKYR +> HBA_STUVU +VLSASDKANVKAVFGKIGGQAEEFGAETLERMFATYPQTKTYFPHFDLGKGSAQVKGHG +KKVAAALVEAANHVDDIAGALSKLSDLHAQKLRVDPVNFKLLGQCFLVVVASHNPALLTP +EVHASLDKFLCAVGTVLTAKYR +> HBA_SUNMU +VLSANDKANVKAAWDKVGGQAANYGAEALERTFASFPTTKTYFPHYDLSPGSAQVKAHG +KKVADALTKAVGSMDDLPGALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHHPADFTP 
+AVHASLDKFLASVSTVLTSKYR +> HBA_TALEU +VLSGTDKSNIKAAWDKVGAHAGEYGAEALERTFTSFPTTKTYFPHFDLSHGSAQVKAHG +KKVADALTNAVGHLDDLPGAMSALSDLHAHKLRVDPVNFKLLSHCLLVTLACHHPNDFTP +AVHASLDKFLATVSTVLTSKYR +> HBA_TAPTE +VLSPTDKTNVKAAWSKVGSHAGEYGAEALERMFLGFPTTKTYFPHFDLSHGSAQVQAHG +KKVGDALTQAVGHLDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLALHHPDDFTP +AIHASLDKFLSNVSTVLTSKYR +> HBA_TARBA +VLSPADKTNVKAAWDKVGGHAGDYGAEALERMFLSFPTTKTYFPHFDLSHGSSQVKGHG +KKVADALTTAVGHIDNMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLACHHPADFTP +AVHASLDKFVASVSTVLTSKYR +> HBA_TARGR +MKLSAEDKHNVKTTWDHIKGHEEALGAEALFRMFTSLPATRTYFPAKDLSEGSSFLHSH +GKKVMGALSNAVAHIDDIDAALCKLSDKHAQDLMVDPANFPKLAHNILVVMGIHLKAHLT +YPVHCSVDKFLDVVGHVLTSKYR +> HBA_THEGE +VLSPDDKKHVKDAWGKVGEHAGQYGAEALERMFLSFPTTKTYFPHFDLSHGSDQVKKHG +KKVADALTLAVGHVDDMPQALSKLSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTP +AVHASLDKFLASVSTVLTSKYR +> HBA_THUTH +TTLSDKDKSTVKALWGKISKSADAIGADALGRMLAVYPQTKTYFSHWPDMSPGSGPVKA +HGKKVMGGVALAVTKIDDLTTGLGDLSELHAFKMRVDPSNFKILSHCILVVVAKMFPKEF +TPDAHVSLDKFLASVALALAERYR +> HBA_TRAST +LSAADKGHVKAAWGKVGSHAAEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGA +KVAAALTKAVDHLDDLPGALSDLSDLHAHKLRVDPVNFKLLSHSLLVTLASHLPGDFTPA +VHASLDKFLANVSTVLTSKYR +> HBA_TRIIN +VLSDEDKTNVKTFWGKIGTHTGEYGGEALERMFLSFPTTKTYFPHFDLSHGSGQIKAHG +KKVADALTRAVGHLEDLPGTLSELSDLHAHRLRVDPVNFKLLSHCLLVTLSSHLREDFTP +SVHASLDKFLSSVSTVLTSKYR +> HBA_TRIOC +VLSANDKTNVKTVFTKITGHAEDYGAETLERMFITYPPTKTYFPHFDLHHGSAQIKAHG +KKVVGALIEAVNHIDDIAGALSKLSDLHAQKLRVDPVNFKLLGQCFLVVVAIHHPSVLTP +EVHASLDKFLCAVGNVLSAKYR +> HBA_TUPGL +VLSPGDKSNIKAAWGKIGGQAPQYGAEALERMFLSFPTTKTYFPHFDMSHGSAQIQAHG +KKVADALSTAVGHLDDLPTALSALSDLHAHKLRVDPANFKLLSHCILVTLACHHPGDFTP +EIHASLDKFLANVSTVLTSKYR +> HBA_TURME +VLSAADKTNVKSAFSKIGGQADEYGAETLERMFATYPQTKTYFPHFDLGKGSAQVKAHG +KKVAAALVEAANAVDDIAGALSKLSDLHAQKLRVDPVNFKLLGQCFLVTVATHNPSLLTP +EVHASLDKFLCAVGTVLTAKYR +> HBA_TURTR +VLSPADKTNVKGTWSKIGNHSAEYGAEALERMFINFPSTKTYFSHFDLGHGSAQIKGHG +KKVADALTKAVGHIDNLPDALSELSDLHAHKLRVDPVNFKLLSHCLLVTLALHLPADFTP +SVHASLDKFLASVSTVLTSKYR +> HBA_URSMA 
+VLSPADKSNVKATWDKIGSHAGEYGGEALERTFASFPTTKTYFPHFDLSPGSAQVKAHG +KKVADALTTAAGHLDDLPGALSALSDLHAHKLRVDPVNFKFLSHCLLVTLASHHPAEFTP +AVHASLDKFFSAVSTVLTSKYR +> HBA_VIPAS +VLSEDDKNRVRTSVGKNPELPGEYGSETLTRMFAAHPTTKTYFPHFDLSSGSPNLKAHG +KKVIDALDNAVEGLDDAVATLSKLSDLHAQKLRVDPANFKILSQCLLSTLANHRNPEFGP +AVLASVDKFLCNVSEVLESKYR +> HBA_VULGR +VLSGSDKTNVKGVFAKIGGHAEDYGAETLERMFITYPQTKTYFPHFDLQHGSAQIKGHG +KKVVGALIEAANHIDDIAASLSKLSDLHAQKLRVDPVNFKLLGQCFLVVVAIHHPSVLTP +EVHASLDKFLCAVGNVLTAKYR +> HBA_VULVV +VLSPADKTNIKSTWDKIGGHAGDYGGEALDRTFQSFPTTKTYFPHFDLSPGSAQVKAHG +KKVADALTTAVAHLDDLPGALSALSDLHAYKLRVDPVNFKLLSHCLLVTLACHHPNEFTP +AVHASLDKFFTAVSTVLTSKYR +> HBA_XENTR +HLTADDKKHIKAIWPSVAAHGDKYGGEALHRMFMCAPKTKTYFPDFDFSEHSKHILAHG +KKVSDALNEACNHLDNIAGCLSKLSDLHAYDLRVDPGNFPLLAHQILVVVAIHFPKQFDP +ATHKALDKFLVSVSNVLTSKYR +> HBB0_MOUSE +VHFTAEEKAAITSIWDKVDLEKVGGETLGRLLIVYPWTQRFFDKFGNLSSAQAIMGNPR +IKAHGKKVLTSLGLAVKNMDNLKETFAHLSELHCDKLHVDPENFKLLGNMLVIVLSSYFG +KEFTAEAQAAWQKLVVGVATALSHKYH +> HBB1_CYGMA +VKWSKTELTIINDIFSHLDYDDIGPKALSRCLIVYPWTQRHFSGFGNLYNAEAIIGNAN +VAAHGIKVLHGLDRGLKNMDNIVDAYAELSTLHSEKLHVDPDNFKLLSDCITIVLAAKLG +KAFTAETQAAFQKFMAVVVSALGKQYH +> HBB1_IGUIG +VHWTAEEKQLITQVWGKIDVAQIGGETLACLLVVYPWTQRFFPDFGNLSNAAAICGNAK +VKAHGKKVLTSFGDAVKNLDNIKDTFAKLSELHCDKLHVDPVNFRLLGNVMITRLAAHFG +KDFTPACHAAFQKLTGAVAHALARRYH +> HBB1_MOUSE +VHLTDAEKAAVSCLWGKVNSDEVGGEALGRLLVVYPWTQRYFDSFGDLSSASAIMGNAK +VKAHGKKVITAFNDGLNHLDSLKGTFASLSELHCDKLHVDPENFRLLGNMIVIVLGHHLG +KDFTPAAQAAFQKVVAGVATALAHKYH +> HBB1_RAT +VHLTDAEKAAVNGLWGKVNPDDVGGEALGRLLVVYPWTQRYFDSFGDLSSASAIMGNPK +VKAHGKKVINAFNDGLKHLDNLKGTFAHLSELHCDKLHVDPENFRLLGNMIVIVLGHHLG +KEFTPCAQAAFQKVVAGVASALAHKYH +> HBB1_SALIR +VEWTDAEKSTISAVWGKVNIDEIGPLALARVLIVYPWTQRYFGSFGNVSTPAAIMGNPK +VAAHGKVVCGALDKAVKNMGNILATYKSLSETHANKLFVDPDNFRVLADVLTIVIAAKFG +ASFTPEIQATWQKFMKVVVAAMGSRYF +> HBB1_SPHPU +VHWTAEEKHLLGSLWAKVDVADIGGEALGRLLVVYPWTQRFFADFGNLSSATAICGNPR +VKAHGKKVFTMFGEALKHLDNLKETFASLSELHCDKLHVDTENFKLLGNLVIVVLAARLH +DSFTPAAQAAFHKLAYSVAHALARRYH +> HBB1_TAPTE 
+VELTGEEKAAVLALWDKVDEDKVGGEALGRLLVVYPWTQRFFDSFGDLSTAAAVMGNPK +VKAHGKKVLHSFGDGVHHLDDLKVTFAQLSELHCDKLHVDPENFRLLGNVLVVVLAQQFG +KAFTPELQAAYQKVVAGVANALAHKYH +> HBB1_TORMA +VSLTDEEIRLIQHIWSNVNVVEITAKALERVFYVYPWTTRLFTSFnhNFKASDKQVHDH +AVNVSNAISAAIGDLHDINKNFSALSTKHQKKLGVDTSNFMLLGQAFLVELAALEKDKFT +PQYHKAALKLFEVVTEALSCQYH +> HBB1_TRICR +TFTNDESQHIHDVCGKIPVDQVGAEALGRLILVNPWTRRYFKSFGDLSSAEAIQHNPKV +ASHGAKVMHSIAEAVKHLDDLKAYYADLSTIHCKKLYVDPANFKLFGGIVSIVTGMHLGT +DYTAQKQAAFEKFLHHVEAALATGYH +> HBB1_UROHA +VHWTAEEKALINAYWGKVDVGSVGGETLANLLVVYPWTQRFFEDFGNLSTPSAILNNPK +XXXXXXXVITSFGDALKNLDNVXXXXXKLSEYHCNKLHVDPVNFRLLGDVLITLSAANFG +KXXXXXXXXXXXXLVGVVAHALARRYH +> HBB1_VAREX +VHWTAEEKQLICSLWGKIDVGLIGGETLAGLLVIYPWTQRQFSHFGNLSSPTAIAGNPR +VKAHGKKVLTSFGDAIKNLDNIKDTFAKLSELHCDKLHVDPTNFKLLGNVLVIVLADHHG +KEFTPAHHAAYQKLVNVVSHSLARRYH +> HBB1_XENBO +GLTAHDRQLINSTWGKVCAKTIGKEALGRLLWTYPWTQRYFSSFGNLNSADAVFHNEAV +AAHGEKVVTSIGEAIKHMDDIKGYYAQLSKYHSETLHVDPCNFKRFGGCLSISLARQFHE +EYTPELHAAYEHLFDAIADALGKGYH +> HBB1_XENLA +GLTAHDRQLINSTWGKLCAKTIGQEALGRLLWTYPWTQRYFSSFGNLNSADAVFHNEAV +AAHGEKVVTSIGEAIKHMDDIKGYYAQLSKYHSETLHVDPLNFKRFGGCLSIALARHFHE +EYTPELHAAYEHLFDAIADALGKGYH +> HBB1_XENTR +VNLTAKERQLITGTWSKICAKTLGKQALGSMLYTYPWTQRYFSSFGNLSSIEAIFHNAA +VATHGEKVLTSIGEAIKHMDDIKGYYAQLSKYHSETLHVDPYNFKRFCSCTIISMAQTLQ +EDFTPELQAAFEKLFAAIADALGKGYH +> HBB2_CYGMA +VEWTNFERATIKDIFSKLEYDVVGPATLARCLVVYPWTQRYFGKFGNLYNAAAIAENAM +VSKHGTTIIHGLDQAVKNMDDIKNTYAELSVLHCDKLHVDPDNFQLLAECLTIVLAAQLG +KEFTGEVQAAFQKFMAVVVSSLGKQYH +> HBB2_MOUSE +VHLTDAEKSAVSCLWAKVNPDEVGGEALGRLLVVYPWTQRYFDSFGDLSSASAIMGNPK +VKAHGKKVITAFNEGLKNLDNLKGTFASLSELHCDKLHVDPENFRLLGNAIVIVLGHHLG +KDFTPAAQAAFQKVVAGVATALAHKYH +> HBB2_NAJNA +VHWSAEEKQLITSLWAKVDVPEVGAATLGKMMVMYPWTQRFFAHFGNLSGPSALCGNPQ +VRAHGKKVLTSFGEALKHLDNVKETFAKLSELHFDKLHVDPENFKLLGNVLIIVLAGHHG +KEFTPSTHASFQKLVNVVAHALARRYH +> HBB2_PANLE +GHLTPEEKSAVTALWSKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPPVQAAYQKVVAGVANALAHKYH +> HBB2_RAT 
+VHLTDAEKATVSGLWGKVNADNVGAEALGRLLVVYPWTQRYFSKFGDLSSASAIMGNPQ +VKAHGKKVINAFNDGLKHLDNLKGTFAHLSELHCDKLHVDPENFRLLGNMIVIVLGHHLG +KEFTPCAQAAFQKVVAGVASALAHKYH +> HBB2_SPHPU +VHWTAEEKQLVTSLWTKVNVDECGGEALGRLLIVYPWTQRFFSSFGNLSSSTAICGNPR +VKAHGKKVFTSFGEAVKNLDNIKATYAKLSELHCEKLHVDPQNFNLLGDIFIIVLAAHFG +KDFTPACQAAWQKLVRVVAHALAYHYH +> HBB2_TAPTE +VHLHGDEKAAVLALWDKVDEEKVGGEALGRLLVVYPWTQRFFDSFGDLSTAAAVMGNPK +VKAHGKKVLHSFGEGVHHLDDLKVTFAQLSELHCDKLHVDPENFRLLGNVLVVVLAQQFG +KAFTPELQAAYQKVVAGVASALAHKYH +> HBB2_TORMA +VSLTDEEKHLIQHIWSNVNVVEITAKALERVFYVYPWTTRLFTSFNHNFKASDKgVHDH +AVNVSKALSAAIGDLHNVNKNFSALSTKHQKKLGVDTSNFMLLGQAFLVELAAFEKDKFT +PQYHKAALKLFEVVTEALSCQYH +> HBB2_TRICR +VHLTAEDRKEIAAILGKVNVDSLGGQCLARLIVVNPWSRRYFHDFGDLSSCDAICRNPK +VLAHGAKVMRSIVEATKHLDNLREYYADLSVTHSLKFYVDPENFKLFSGIVIVCLALTLQ +TDFSCHKQLAFEKLMKGVSHALGHGY +> HBB2_XENBO +GLTAHEKQLITGSWGKINAKAIGKEALGRLLNTFPWTQRYFSSFGNLGSAEAIFHNEAV +AAHGEKVVTSVGEAIKHMDDIKGYYAELSKYHSETLHVDPNNFKRFGGCLSITLGHHFGE +EYTPELHAAYEHLFDAIADALGKGYH +> HBB2_XENLA +VHWTAEEKAAITSVWQKVNVEHDGHDALGRLLIVYPWTQRYFSNFGNLSNSAAVAGNAK +VQAHGKKVLSAVGNAISHIDSVKSSLQQLSKIHATELFVDPENFKRFGGVLVIVLGAKLG +TAFTPKVQAAWEKFIAVLVDGLSQGYN +> HBB2_XENTR +VHWTAEEKATIASVWGKVDIEQDGHDALSRLLVVYPWTQRYFSSFGNLSNVSAVSGNVK +VKAHGNKVLSAVGSAIQHLDDVKSHLKGLSKSHAEDLHVDPENFKRLADVLVIVLAAKLG +SAFTPQVQAVWEKLNATLVAALSHGYF +> HBB4_SALIR +VDWTDAERSAIVGLWGKISVDEIGPQALARLLIVSPWTQRHFSTFGNLSTPAAIMGNPA +VAKHGKTVMHGLDRAVQNLDDIKNTYATLSVMHSEKLHVDPDNFRLLADCITVCVAAKLG +pAVFSADTQEAFQKFLAVVVSALGRQYH +> HBBA_BOSJA +MLTAEEKAAVTAFWGKVHVDEVGGEALGRLLVVYPWTQRFFESFGDLSTADAVMNNPKV +KAHGKKVLDSFSDGMKHLDDLKGTFAALSELHCDKLHVDPENFKLLGNVLVVVLARNFGK +EFTPVLQADFQKVVAGVANALAHRYH +> HBBA_CAPHI +MLTAEEKAAVTGFWGKVKVDEVGAEALGRLLVVYPWTQRFFEHFGDLSSADAVMNNAKV +KAHGKKVLDSFSNGMKHLDDLKGTFAQLSELHCDKLHVDPENFKLLGNVLVVVLARHHGS +EFTPLLQAEFQKVVAGVANALAHRYH +> HBBC_CAPHI +PNKALITGFWSKVKVDEVGAEALGRLLVVYPWTQRFFEHFGDLSSADAVLGNAKVKAHG +KKVLDSFSNGVQHLDDLKGTFAELSELHCDKLHVDPENFRLLGNVLVIVLARHFGKEFTP +ELQAEFQKVVAGVASALAHRYH +> HBBC_RANCA 
+GGSDVSAFLAKVDKRAVGGEALARLLIVYPWTQRYFSTFGNLGSADAISHNSKVLAHGQ +RVLDSIEEGLKHPXXLKAYYAKLSERHSGELHVDPANFYRLGNVLITVMARHFHEEFTPE +LQCALHSSFCAVGEALAKGYH +> HBBC_SHEEP +PNKALITGFWSKVKVDEVGAEALGRLLVVYPWTQRFFEHFGDLSTADAVLGNAKVKAHG +KKVLDSFSNGVQHLDDLKGTFAQLSELHCDKLHVDPENFRLLGNVLVVVLARHFGKEFTP +ELQAEFQKVVAGVASALAHRYH +> HBBF_BOVIN +MLSAEEKAAVTSLFAKVKVDEVGGEALGRLLVVYPWTQRFFESFGDLSSADAILGNPKV +KAHGKKVLDSFCEGLKQLDDLKGAFASLSELHCDKLHVDPENFRLLGNVLVVVLARRFGS +EFSPELQASFQKVVTGVANALAHRYH +> HBBF_CAPHI +MLSAEEKASVLSLFAKVNVEEVGGEALGRLLVVYPWTQRFFEHFGDLSSADAILGNPKV +KAHGKKVLDTFSEGLKQLDDLKGAFASLSELHCDKLHVDPENFRLLGNVLVVVLARRFGG +EFTPELQANFQKVVTGVANALAHRYH +> HBBF_SHEEP +MLTAEEKASVISLFAKVNVEEVGGEALGRLLVVYPWTQRFFEHFGDLSSADAILGNPKV +KGHGKKVLNSFSEGLKQLDDLKGAFASLSELHCDKLHVDPENFRLLGNVLVVVLARRFGG +EFTPELQANFQKVVTGVANALAHRYH +> HBBL_RANCA +VHWTAEEKAVINSVWQKVDVEQDGHEALTRLFIVYPWTQRYFSTFGDLSSPAAIAGNPK +VHAHGKKILGAIDNAIHNLDDVKGTLHDLSEEHANELHVDPENFRRLGEVLIVVLGAKLG +KAFSPQVQHVWEKFIAVLVDALSHSYH +> HBBL_XENLA +VHLSADEKSAINAVWSKVNIENDGHDALTRLLVVFPWTQRYFSSFGNLSNVAAISGNAK +VRAHGKKVLSAVDESIHHLDDIKNFLSVLSTKHAEELHVDPENFKRLADVLVIVLAGKLG +AAFTPQVQAAWEKFSAGLVAALSHGYF +> HBBN_AMMLE +PXKALITGFWSKVKVXXVGAXALGRLLVVYPWTXRFFXHFGXLSSAXAVMXXAKVKAHG +KKVLXSFSXGLKHLXXLKGAFASLSXLHCXKLHVXPXXFRLLGXVLVVVLARHFGKXFXP +XLXAXFXKVVAGVASALAHRYH +> HBBZ_MOUSE +VHFTAEEKAAITSIWDKVDLEKVGGETLGRLLIVYPWTQRFFDKFGNLSSALAIMGNPR +IRAHGKKVLTSLGLGVKNMDNLKETFAHLSELHCDKLHVDPENFKLLGNMLVIVLSTHFA +KEFTPEVQAAWQKLVIGVANALSHKYH +> HBB_ACCGE +VQWAAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFASFGNLSSATAVLGNPM +VRAHGKKVLTSFGEAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIVVLAAHFG +KDFSPDCQAAWQKLVRAVAHALARKYH +> HBB_AEGMO +VHWTAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFASFGNLSSPTAIIGNPM +VRAHGKKVLTSFGEAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFG +KDFSPDCQAAWQKLVRAVAHALARKYH +> HBB_AILFU +VHLTGEEKAAVTGLWSKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSPDAVMGNPK +VKAHGKKVLNSFSEGLKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_AILME 
+VHLTGEEKAAVTGLWSKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSTPDAVMNNPK +VKAHGKKVLNSFSEGLKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_ALCAA +MLTAEEKAAVTAFWGKVKVDEVGGEALGRLLVVYPWTQRFFEHFGDLSTADAVMHNAKV +KEHGKRVLDAFSEGLKHLDDLKGAFAKLSELHCDKLHVDPENFRLLGNVLVVVLARHFGK +EFTPELQADYQKVVTGVANALAHRYH +> HBB_ALLMI +ASFDAHERKFIVDLWAKVDVAQCGADALSRMLIVYPWKRRYFEHFGKMCNAHDILHNSK +VQEHGKKVLASFGEAVKHLDNIKGHFANLSKLHCEKFHVDPENFKLLGDIIIIVLAAHHP +EDFSVECHAAFQKLVRQVAAALAAEYH +> HBB_ANAPL +VHWTAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFASFGNLSSPTAILGNPM +VRAHGKKVLTSFGDAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFT +KDFTPECQAAWQKLVRVVAHALARKYH +> HBB_ANAPP +VHWTAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFASFGNLSSPTAILGNPM +VRAHGKKVLTSFGDAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFP +KEFTPECQAAWQKLVRVVAHALARKYH +> HBB_ANSAN +VHWSAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFSSFGNLSSPTAILGNPM +VRAHGKKVLTSFGDAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFA +KEFTPECQAAWQKLVRVVAHALARKYH +> HBB_ANSIN +VHWSAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFSSFGNLSSPTAILGNPM +VRAHGKKVLTSFGDAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFA +KEFTPDCQAAWQKLVRVVAHALARKYH +> HBB_ANSSE +VHWSAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTERFFSSFGNLSSPTAIIGNPM +VRAHGKKVLTSFGEAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFS +KDFTPDCQAAWQKLVRVVAHALARKYH +> HBB_ANTPA +VHLTADEKSAVTGLWGKVNVEEVGGEALGRLLVVYPWTQRFFESFGDLSNAGAVMGNAK +VKAHGKKVLNAFSDGLKNLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLMIVLARHFG +KEFCPPVQAAFQKVSLGVATALGHKYH +> HBB_AOTTR +VHLTGEEKAAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSPDAVMNNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_APUAP +VQWTAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFASFGNLSSATAVIGNPM +VRAHGKKVLTSFGEAVKNLDSIKSTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFS +KDFTPEAQQAWAKLVRAVAHALARKYH +> HBB_AQUCH +VHWTAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFASFGNLSSPTAIIGNPM +VRAHGKKVLTSFGEAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFT +KDFSPDCQAAWQKLVRAVAHALARKYH +> HBB_ARAAR 
+VHWTAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFASFGNLSSPTAILGNPM +VRAHGKKVLTSFGEAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFG +KDFTPECQAALQKLVRVVAHALARKYH +> HBB_ATEGE +VHLTGEEKAAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMSNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPQLQAAYQKVVAGVANALAHKYH +> HBB_BALAC +VHLTAEEKSAVTALWAKVNVEEVGGEALGRLLVVYPWTQRFFEAFGDLSTADAVMKNPK +VKAHGKKVLASFSDGLKHLDDLKGTFATLSELHCDKLHVDPENFRLLGNVLVIVLARHFG +KEFTPELQAAYQKVVAGVANALAHKYH +> HBB_BISBO +MLTAEEKAAVTAFWGKVHVDEVGGEALGRLLVVYPWTQRFFESFGDLSSADAVMNNAKV +KAHGKKVLDSFSNGMKHLDDLKGTFAALSELHCDKLHVDPENFKLLGNVLVVVLARHFGK +EFTPVLQADFQKVVTGVANALAHRYH +> HBB_BOSGA +MLTAEEKAAVTAFWGKVHVDEVGGEALGRLLVVYPWTQRFFESFGDLSTADAVMNNPKV +KAHGKKVLDSFSNGMKHLDDLKGTFAALSELHCDKLHVDPENFKLLGNVLVVVLARHFGK +EFTPVLQADFQKVVAGVANALAHRYH +> HBB_BOSMU +MLTAEEKAAVTAFWGKVKVDEVGGEALGRLLVVYPWTQRFFESFGDLSSADAVMNNPKV +KAHGKKVLDSFSNGMKHLDDLKGTFAALSELHCDKLHVDPENFKLLGNVLVVVLARHFGK +EFTPVLQADFQKVVVGVANALAHRYH +> HBB_BOVIN +MLTAEEKAAVTAFWGKVKVDEVGGEALGRLLVVYPWTQRFFESFGDLSTADAVMNNPKV +KAHGKKVLDSFSNGMKHLDDLKGTFAALSELHCDKLHVDPENFKLLGNVLVVVLARNFGK +EFTPVLQADFQKVVAGVANALAHRYH +> HBB_BRACA +VHWTAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFSSFGNLSSPTAILGNPM +VRAHGKKVLTSFGDAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFA +KDFTPDCQAAWQKLVRVVAHALARKYH +> HBB_BRATR +VHLADDEKAAVSALWNKVHVEEFGGEALGRLLVVYPWTSRFFESFGDLSSADAVFSNAK +VKAHGKKVLTSFGEGLKHLDDLKGTYAHLSELHCDKLHVDPENFKLLGNVLVIVLARHFG +KEFTPQLQASYQKVTTGVSTALAHKYH +> HBB_CAICR +SPFSAHEEKLIVDLWAKVDVASCGGDALSRMLIIYPWKRRYFEHFGKLSTDQDVLHNEK +IREHGKKVLASFGEAVKHLDNIKGHFAHLSKLHFEKFHVDCENFKLLGDIIIVVLGMHHP +KDFTLQTHAAFQKLVRHVAAALSAEYH +> HBB_CAIMO +VHWTAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFASFGNLSSPTAILGNPM +VRAHGKKVLTSFGDAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFT +KDFTPDCQAAWQKLVRVVAHALARKYH +> HBB_CALAR +VHLTGEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMNNPK +VKAHGKKVLGAFSDGLTHLDNLKGTFAHLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPVVQAAYQKVVAGVANALAHKYH +> HBB_CAMDR 
+VHLSGDEKNAVHGLWSKVKVDEVGGEALGRLLVVYPWTRRFFESFGDLSTADAVMNNPK +VKAHGSKVLNSFGDGLNHLDNLKGTYAKLSELHCDKLHVDPENFRLLGNVLVVVLARHFG +KEFTPDLQAAYQKVVAGVANALAHRYH +> HBB_CANFA +VHLTAEEKSLVSGLWGKVNVDEVGGEALGRLLIVYPWTQRFFDSFGDLSTPDAVMSNAK +VKAHGKKVLNSFSDGLKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_CARAU +VEWTDAERSAIIGLWGKLNPDELGPQALARCLIVYPWTQRYFATFGNLSSPAAIMGNPK +VAAHGRTVMGGLERAIKNMDNIKATYAPLSVMHSEKLHVDPDNFRLLADCITVCAAMKFG +pSGFNADVQEAWQKFLSVVVSALCRQYH +> HBB_CAVPO +VHLTAAEKSAILDLWGKVNVGEIGAEALGRLLVVYPWTQRFFEKFGDLSSASAIMSNAH +VKSHGAKVLASFSEGLKHLQDLKGTFAKLSELHCDKLHVDPENFRLLGNMIVIALAHHHP +SEFTPCTQAAFQKVTAGVANALAHKYH +> HBB_CEBAL +VHLTAEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSTPDAVMNNPK +VKAHGKKVLGAFSDGLTHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVATALAHKYH +> HBB_CEBAP +VHLTAEEKSAVTTLWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSTPDAVMNNPK +VKAHGKKVLGAFSDGLTHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVATALAHKYH +> HBB_CERAE +VHLTPEEKTAVTTLWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSSPDAVMGNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFAQLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_CERSI +VELTAEEKAAVLALWDKVKEDEVGGEALGRLLVVYPWTQRFFDSFGDLSTPAAVMGNAK +VKAHGKKVLHSFGDGVHHLDNLKGTFAALSELHCDKLHVDPENFRLLGNVLVVVLAKHFG +KQFTPELQAAYQKVVAGVANALAHKYH +> HBB_CERTO +VHLTPEEKVAVTTLWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSNPDAVMGNPK +VKAHGKKVLGAFSDGLNHLDNLKGTFAQLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_CHICK +VHWTAEEKQLITGLWGKVNVAECGAEALARLLIVYPWTQRFFASFGNLSSPTAILGNPM +VRAHGKKVLTSFGDAVKNLDNIKNTFSQLSELHCDKLHVDPENFRLLGDILIIVLAAHFS +KDFTPECQAAWQKLVRVVAHALARKYH +> HBB_CHLME +VHWTAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFASFGNLSSPTAISGNPM +VRAHGKKVLTSFGDAVKNLDNIKNTFSQLSELHCDKLHVDPENFRLLGDILIIVLAAHFT +KDFTPDCQAAWQKLVRVVAHALARKYH +> HBB_CHRPI +VHWTADEKQLITSLWGKVNVEECGSEALARLLIVYPWTQRFFSTFGNLSNAEAILHNPH +VHAHGKKVLTSFGEAVKNLDHIKQTFATLSKLHCEKLHVDPENFKLLGNVLIIVLASHFT +KEFTPACQAAWQKLVSAVAHALALGYH +> HBB_CICCI 
+VHWTAEEKQLITGLWGKVNVDECGAEALARLLIVYPWTQRFFASFGNLATASAITGNAM +VHAHGKKVLTSFGEAVKNLDNIKNTFAQLSELHCDKLHVDPENFKLLGDILIIVLAAHFG +KDFTPDCQAAWKKLVRVVAHALARKYH +> HBB_COLBA +VHLTPDEKNAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSTADAVMGNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFAQLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_COLLI +VHWSAEEKQLITSIWGKVNVADCGAEALARLLIVYPWTQRFFSSFGNLSSATAISGNPN +VKAHGKKVLTSFGDAVKNLDNIKGTFAQLSELHCDKLHVDPENFRLLGDILVIILAAHFG +KDFTPECQAAWQKLVRVVAHALARKYH +> HBB_COLPO +VHLTPDEKAAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSSPDAVMGNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFAQLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_CROCR +GFLTAEEKSLVNDLWSKVNVDEVGGEALGRLLVVYPWTQRFFQSFGDLSSADAIMGNSK +VKAHGKKVLNSFSDGLKHIDDLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +NEFTPPVQAAYQKVVAGVANALAHKYH +> HBB_CRONI +ASFDPHEKQLIGDLWHKVDVAHCGGEALSRMLIVYPWKRRYFENFGDISNAQAIMHNEK +VQAHGKKVLASFGEAVCHLDGIRAHFANLSKLHCEKLHVDPENFKLLGDIIIIVLAAHYP +KDFGLECHAAYQKLVRQVAAALAAEYH +> HBB_CTEGU +VHLSAEEKAAVTGLWGKVNVEEVGGEALGRLLVVYPWTQRFFESFGDLSSAAAVMGNPK +VKAHGKKVLTSFSEGLSHLDNLKGTFAKLSELHCDKLHVDPENFRLLGNMIVITLAHHYG +PEFGPQTQAAFQKVVAGVANALAHKYH +> HBB_CYNSP +VHLSGEEKSAVTSLWGKVKVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSASAVMGNAK +VKAHGKKVLDSFSEGLQHLDSLKGTFAKLSELHCDKLHVDPENFRLLGNVLVVVLARHFG +KEFTPQLQAAYQKVVAGVATALAHKYH +> HBB_CYPCA +VEWTDAERSAIIALWGKLNPDELGPEALARCLIVYPWTQRFFASYGNLSSPAAIMGNPK +VAAHGRTVEGGLMRAIKDMDNIKATYAPLSVMHSEKLHVDPDNFRLLADCITVCAAMKFG +pSGFSPNVQEAWQKFLSVVVNALKRQYH +> HBB_DASNO +VNLTSDEKTAVLALWNKVXVXXHGGXALGRLLVVYPWTQRFFESFGDLSTPAAVFANAK +VKAHGKKVLTSFGEGMNHLDNLKGTFAKLSELHCDKLHVDPENFRLLGNMLVVVMARHFG +KEFDHWMHACFQRVVAGVANALAHKYH +> HBB_DIDMA +VHLTSEEKNCITTIWSKVQVDQTGGEALGRMLVVYPWTTRFFGSFGDLSSPGAVMSNSK +VQAHGAKVLTSFGEAVKHLDNLKGTYAKLSELHCDKLHVDPENFKMLGNIIVICLAEHFG +KDFTPECQVAWQKLVAGVAHALAHKYH +> HBB_ECHTE +VHMTDAEKKLVTTMWGKLDVDAAGAETLGRVLVVYPWTQRFFGHFGDLSSACAVMDNPK +VQAHGKKVLHSLGDGLNHLDDLKHFYAALSELHCDKLHVDPENFRLLGNVLVCVMSRHFG +AEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_ELEEL 
+VELTEAQRGAIVNLWGHLSPDEIGPQALARLLIVYPWTQRYFASFGNISSAAAIMGNPK +VAAHGKVVVGALDKAVKNLNNIKGTYAALSTIHSEKLHVDPDNFRLLAESFTVSVAMKLG +pSGFNAETQHALAKFLAEVVSALGKQYH +> HBB_ELEMA +VNLTAAEKTQVTNLWGKVNVKELGGEALSRLLVVYPWTRRFFEHFGDLSTADAVLHNAK +VLAHGEKVLTSFGEGLKHLDNLKGTFADLSELHCDKLHVDPENFRLLGNVLVIVLARHFG +KEFTPDVQAAYEKVVAGVANALAHKYH +> HBB_EQUHE +VQLSGEEKAAVLALWDKVNEEEVGGEALGRLLVVYPWTQRFFDSFGDLSNPAAVMGNPK +VKAHGKKVLHSFGEGVHHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVVVLARHFG +KDFTPELQASYQKVVAGVANALAHKYH +> HBB_ERIEU +VHLTAEEKALVTGLWGKVKVEEFGGEALGRLLVVYPWTQRFFDSFGDLSSADAVMGNPK +VKAHGAKVLQSMGDGIKNLDNLKGTFSKLSELHCDKLHVDPENFRLLGNVLVCVLARHFG +KDFTPAAQAAFQKVVAGVANALAAKYH +> HBB_EUDCR +VHWSAEEKQLITGLWGKVNVAQCGGEALARLLIVYPWTQRFFSSFGNLSSPSAILGNPM +VRAHGKKVLTSFGDAVKNMDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFA +KDFTPECQAAWEKLVRVVAHALARKYH +> HBB_FELCA +GFLTAEEKGLVNGLWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSSADAIMSNAK +VKAHGKKVLNSFSDGLKNIDDLKGAFAKLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +HDFNPQVQAAFQKVVAGVANALAHKYH +> HBB_FRAPO +VHWTAEEKQLITGLWGKVNVAECGAEALARLLIVYPWTQRFFASFGNLSSPTAILGNPM +VRAHGKKVLTSFGDAVKNLDNIKNTFSQLSELHCDKLHVDPENFRLLGDILIIVLAAHFS +KDFTPDCQAAWQKLVRVVAHALARKYH +> HBB_GALCR +VHLTPDEKNAVCALWGKVNVEEVGGEALGRLLVVYPWTQRFFDSFGDLSSPSAVMGNPK +VKAHGKKVLSAFSDGLQHLDNLCGTFAKLSELHCDKLHVNPENFRLLGNVLVCVLAHHFG +KDFTPEVQAAYEKVVAGVATALAHKYH +> HBB_GORGO +VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPPVQAAYQKVVAGVANALAHKYH +> HBB_HETPO +VHWSEVELHEITTTWKSIDKHSLGAKALARMFIVYPWTTRYFGNLKEFTACSYGVKEHA +KKVTGALGVAVTHLGDVKSQFTDLSKKHAEELHVDVESFKLLAKCFVVELGILLKDKFAP +QTQAIWEKYFGVVVDAISKEYH +> HBB_HIPAM +VHLTAEEKDAVLGLWGKVNVQEVGGEALGRLLVVYPWTQRFFESFGDLSSADAVMNNPK +VKAHGKKVLDSFADGLKHLDNLKGTFAALSELHCDQLHVDPENFRLLGNELVVVLARTFG +KEFTPELQAAYQKVVAGVANALAHRYH +> HBB_HORSE +VQLSGEEKAAVLALWDKVNEEEVGGEALGRLLVVYPWTQRFFDSFGDLSNPGAVMGNPK +VKAHGKKVLHSFGEGVHHLDNLKGTFAALSELHCDKLHVDPENFRLLGNVLVVVLARHFG +KDFTPELQASYQKVVAGVANALAHKYH +> HBB_HUMAN 
+VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPPVQAAYQKVVAGVANALAHKYH +> HBB_HYLLA +VHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_LAMGL +VNLSGDEKNAVHGLWSKVKVDEVGGEALGRLLVVYPWTRRFFESFGDLSTADAVMNNPK +VKAHGSKVLNSFGDGLSHLDNLKGTYAKLSELHCDKLHVDPENFRLLGNVLVVVLARHFG +KEFTPDLQAAYQKVVAGVANALAHRYH +> HBB_LARRI +VHWSAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFASFGNLSSPTAINGNPM +VRAHGKKVLTSFGEAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFA +KDFTPDSQAAWQKLVRVVAHALARKYH +> HBB_LATCH +VHWTETERATIETVYQKLHLDEVGREALTRLFIVYPWTTRYFKSFGDLSSSKAIASNPK +VTEHGLKVMNKLTEAIHNLDHIKDLFHKLSEKHFHELHVDPQNFKLLSKCLIIVLATKLG +KQLTPDVQATWEKLLSVVVAALSREYH +> HBB_LEMCA +TFLTPEENGHVTSLWGKVNVEKVGGEALGRLLVVYPWTQRFFESFGDLSSPDAIMGNPK +VKAHGKKVLSAFSEGLHHLDNLKGTFAQLSELHCVALHVDPENFKLLGNVLVIVLAHHFG +NDFSPQTQAAFQKVVTGVANALAHKYH +> HBB_LEMFU +TLLSAEENAHVTSLWGKVDVEKVGGEALGRLLVVYPWTQRFFESFGDLSSPSAVMGNPK +VKAHGKKVLSAFSEGLHHLDNLKGTFAQLSELHCDKLHVDPQNFTLLGNVLVVVLAEHFG +NAFSPAVQAAFQKVVAGVANALAHKYH +> HBB_LEMVA +TFLTPEENNHVTSLWGKVNVEKVGGEALGRLLVVYPWTQRFFESFGDLSSPDAIMGNPK +VKAHGKKVLTAFSEGLHHLDNLKGTFAQLSELHCDKLHVDPQNFKLLGNVLVIVLAHHFG +NDFSPQTQAAFQKVVTGVANALAHKYH +> HBB_LEPEU +VHLSGEEKSAVTALWGKVNVEEVGGETLGRLLVVYPWTQRFFESFGDLSTASAVMGNPK +VKAHGKKVLAAFSEGLSHLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVIVLSHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_LEPPA +VHWEDAEKQYIVSVFSKIDVDHVGANTLERVLIVFPWTKRYFNSFGDLSSPGAIKHNNK +VSAHGRKVLAAIIECTRHFGNIKGHLANLSHLHSEKLHVDPHNFRVLGQCLRIELAAALG +fKEFTPERNAYFQKFMDVISHSLGREYH +> HBB_LEPWE +VHLTAEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSPNAIMSNPK +VKAHGKKVLNSFSDGLKNLDNLKGTFAKLSELHCDQLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_LIOMI +VHWTAEEKSAITAIWGKVDVAAIGGEALCRLLIVYPWTQRFFTSFGNLSNAAAIQSNAQ +VKAHGKKVFTAFGDAVKNPEGVKDTFAKLSELHCDKLHVDPVNFKLLGQILITVLAAHFG +KDFTPNVQAAYQKLVSVVAHALAHQYH +> HBB_LORTA 
+VHLTGEEKSAVTGLWGKVNVEDVGGEALGRLLVVYPWTQRFFESFGDLSSPSAVMGNPK +VKAHGKKVLSAFSDGLNHLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVVVLAHHFG +KDFTPQVQSAYQKVVAGVANALAHKYH +> HBB_LOXAF +VNLTAAEKTQVTNLWGKVNVKELGGEALSRLLVVYPWTRRFFEHFGDLSTAEAVLHNAK +VLAHGEKVLTSFGEGLKHLDNLKGTFADLSELHCDKLHVDPENFRLLGNVLVIVLARHFG +KEFTPDVQAAYEKVVAGVANALAHKYH +> HBB_LUTLU +VHLTGEEKAAVTSLWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSPDAVMGNPK +VKAHGKKVLNSFSEGLKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_MACCA +VHLTGEEKSTVSALWGKVNVEEIGGEALGRLLVVYPWTQRFFDSFGDLSSPSAVFGNAK +VKSHGKKVLDSFSNGMQHLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVVVLARNFG +KEFTPQVQAAYQKVVAGVATALAHKYH +> HBB_MACFU +VHLTPEEKNAVTTLWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSSPDAVMGNPK +VKAHGKKVLGAFSDGLNHLDNLKGTFAQLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_MACGG +VHLTGEEKAAVTGLWGKVNVEEVGGEALGRLLVVYPWTQRFFDSFGDLSSPSAVMGNPK +VKAHGKKVLNSFSDGLKNLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVCVLARHFG +KEFTPQVQAAYQKVVAGVATALAHKYH +> HBB_MACGI +VHLTAEEKNAITSLWGKVAIEQTGGEALGRLLIVYPWTSRFFDHFGDLSNAKAVMANPK +VLAHGAKVLVAFGDAIKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNIIVICLAEHFG +KEFTIDTQVAWQKLVAGVANALAHKYH +> HBB_MACMU +VHLTPEEKNAVTTLWGKVNVDEVGGEALGRLLLVYPWTQRFFESFGDLSSPDAVMGNPK +VKAHGKKVLGAFSDGLNHLDNLKGTFAQLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_MACRU +VHLTAEEKNAITSLWGKVAIEQTGGEALGRLLIVYPWTSRFFDHFGDLSNAKAVMGNPK +VLAHGAKVLVAFGDAIKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNIIVICLAEHFG +KEFTIDTQVAWQKLVAGVANALAHKYH +> HBB_MANSP +VHLTPEEKTAVTTLWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSPDAVMGNPK +VKAHGKKVLGAFSDGLNHLDNLKGTFAQLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_MARMA +VHLSDGEKNAISTAWGKVNAADIGAEALGRLLVVYPWTQRFFDSFGDLSSASAVMGNAK +VKAHGKKVIDSFSNGLKHLDNLKGTFASLSELHCDKLHVDPENFKLLGNMIVIVMAHHLG +KDFTPEAQAAFQKVVAGVANALAHKYH +> HBB_MEGLY +VHLTNEEKTAVIGLWGKVNVEEVGGEALGRLLVVYPWTQRFFESFGDLSSPSAIMGNPK +VKAHGKKVLNSFSEGLKNLDNLKGTFAKLSELHCDKLHVDPENFRLLGYILLCVLARHFG +KEFTPQVQAAYQKVVAGVATALAHKYH +> HBB_MELCA 
+VHLTAEEKAAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSPDAVMGNPK +VKAHGKKVLNSFSEGLKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_MELME +VHLTAEEKSAVTSLWGKVNVDEVGGEALGRLLVVYPWTQRYFDSFGDLSTPDAVMGNPK +VKAHGKKVLNSFSEGLKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_MESAU +VHLTDAEKALVTGLWGKVNADAVGAEALGRLLVVYPWTQRFFEHFGDLSSASAVMNNPQ +VKAHGKKVIHSFADGLKHLDNLKGAFSSLSELHCDKLHVDPENFKLLGNMIIIVLSHDLG +KDFTPSAQSAFHKVVAGVANALAHKYH +> HBB_MESBR +VHLTDAEKNLVSGLWGKVNADAVGAEALGRLLVVTPWTQRFFEHFGDLSSASAVMNNPQ +VKAHGKKVIHSFADGLKHLDNLKGAFSSLSELHCDKLHVDPENFKLLGNMIIIVLSHDLG +KDFTPSAQSAFHKVVAGVANALAHKYH +> HBB_MICXA +VHLTDAEKAAISGLWGKVXAXAAGAXALGRLLVVYPWTXRFFXHFGXLSSASAVMGNAQ +VKAHGKKVIHAFADGLKHLDXLKGTFASLSXLHCXKLHVXPXXFRLLGXMIVIVLAHHLG +KDFTPSAXAAFXKVVAGVASALAHKYH +> HBB_MUSLU +VHLTAEEKAAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSPDAVMGNPK +VKAHGKKVLNSFSEGLKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVATALAHKYH +> HBB_MUSPF +VHLTGEEKAAVTALWGKVNVDEVGGETLGRLLVVYPWTQRFFDSFGDLSSPDAVMSNPK +VKAHGKKVLNSFSEGLKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_MYOVE +VHLTADEKAAVSGLWGKVNVDEVGGEALGRLLVVYPWTQRFFTSFGDLSNAAAVMGNSK +VKAHGKKVLNSFGEGLKNVDNLKGTFASLSELHCDKLHVDPENFRLLGNVLVIVLARHFG +KEFTPQVQGAFQKLALGVATALAHKYH +> HBB_NASNA +VHLTGEEKTAVTNLWAKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSSPDAIMGNPK +VKAHGKKVLNSFSEGLKNLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_NOTCO +VNWSDSERAIITDIFSHMDYDDIGPKALSRCLIVYPWTQRHFSGFGNLYNAEAILGNAN +VAAHGIKVLHGLDRGVKNMDKIVDAYAELSMLHSEKLHVDPDNFKLLSDCITIVVAAKMG +SAFTPEIQCAFQKFLAVVVSALGKQYH +> HBB_NYCCO +VHLTGEEKSAVTALWGKVNVDDVGGEALGRLLVVYPWTQRFFESFGDLSSPSAVMGNPK +VKAHGKKVLSAFSDGLNHLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVVVLAHHFG +KDFTPQVQSAYQKVVAGVANALAHKYH +> HBB_ODORO +VHLTADEKAAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSPDAVMGNPK +VKAHGKKVLNSFSDGLKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_ODOVI 
+MLTAEEKAAVTGFWGKVNVDVVGAEALGRLLVVYPWTQRFFEHFGDLSSAGAVMGNPKV +KAHGKRVLDAFSEGLKHLDDLKGAFAELSELHCNKLHVDPENFRLLGNVLVVVLARNFGG +EFTPLVQADFQKVVAGVANALAHRYH +> HBB_ONDZI +VHLTDAEKAAISGLWGKVNADGVGAEALGRLLVVYPWTQRFFEHFGDLSSSSAVMGNAK +VKSHGKKVITAFADGLKHLDNLKGTFSALSELHCDKLHVDPENFKLLGNMIVIVLSHDLG +KDFTPDAQSAFQKVVTGVATALGHKYH +> HBB_ORNAN +VHLSGGEKSAVTNLWGKVNINELGGEALGRLLVVYPWTQRFFEAFGDLSSAGAVMGNPK +VKAHGAKVLTSFGDALKNLDDLKGTFAKLSELHCDKLHVDPENFNRLGNVLIVVLARHFS +KDFSPEVQAAWQKLVSGVAHALGHKYH +> HBB_OVIMU +MLTAEEKAAVTGFWGKVKVDEVGAEALGRLLVVYPWTQRFFEHFGDLSSADAVMNNAKV +KAHGKKVLXSFSNGMKHLDDLKGTFAQLSELHCDKLHVXPXXFRXXXXXXXXXXXXHHGS +EFTPVLQAXFQKVVAGVANALAHRYH +> HBB_PAGBE +VEWTDKERSIISDIFSHMDYDDIGPKALSRCLIVYPWTQRHFSGFGNLYNAEAIIGNAN +VAAHGIKVLHGLDRGVKNMDNIAATYADLSTLHSEKLHVDPDNFKLLSDCITIVLAAKMG +HAFTAETQGAFQKFLAVVVSALGKQYH +> HBB_PAGLA +GFLTAEEKGLVNGLWGKVNVDEVGGEALGRLLVVYPWTQRFFQSFGDLSSADAIMHNSK +VKAHGKKVLNSFSDGLKHVDDLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVASALAHRYH +> HBB_PANPO +SFLSAEEKNLVSGLWGKVNVDEVGGEALGRLLVVYPWTQRFFQSFGDLSSADAIMSNAK +VKAHGKKVLNSFSDGLKNIDDLKGAFAKLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +HEFNPQVQAAFQKVVAGVASALAHRYH +> HBB_PANTS +SFLSAEEKGLVNGLWSKVNVDEVGGEALGRLLVVYPWTQRFFQSFGDLSSADAIMSNAK +VKAHGKKVLNSFSDGLKNIDDLKGAFAKLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +HEFNPQVQAAFQKVVAGVASALAHRYH +> HBB_PAPCY +VHLTPEEKNAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSPAAVMGNPK +VKAHGKKVLGAFSDGLNHLDNLKGTFAQLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_PASMO +VQWTAEEKQLITGLWGKVNVAECGGEALARLLIVYPWTQRFFASFGNLSSPTAVLGNPK +VQAHGKKVLTSFGEAVKNLDSIKNTFSQLSELHCDKLHVDPENFRLLGDILVVVLAAHFG +KDFTPDCQAAWQKLVRVVAHALARKYH +> HBB_PHACA +VHWTAEEKQLITGLWGKVNVAECGAEALARLLIVYPWTQRFFASFGNLSSATAITGNPM +VRAHGKKVLTSFGEAVKNLDNIKATFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFA +KDFTPECQAAWQKLVGAVAHALARKYH +> HBB_PHACO +VHWSAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFASFGNLSSPTAILGNPM +VRAHGKKVLTSFGDAVKNLDNIKNTFSQLSELHCDKLHVDPENFRLLGDILIIVLAAHFS +KDFTPECQAAWQKLVRVVAHALARKYH +> HBB_PHORU 
+VHWSAEEKQLITSLWGKVNVADCGAEALARLLIVYPWTQRFFASFGNLSSPTAILGNPM +VRAHGKKVLTSFGEAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFA +KDFTPECQAAWQKLVRVVAHALARKYH +> HBB_PHOVI +VHLTGEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSADAIMGNPK +VKAHGKKVLNSFSDGLKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_PHYCA +VHLTGEEKSGLTALWAKVNVEEIGGEALGRLLVVYPWTQRFFEHFGDLSTADAVMKNPK +VKKHGQKVLASFGEGLKHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVVVLARHFG +KEFTPELQTAYQKVVAGVANALAHKYH +> HBB_PIG +VHLSAEEKEAVLGLWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSNADAVMGNPK +VKAHGKKVLQSFSDGLKHLDNLKGTFAKLSELHCDQLHVDPENFRLLGNVIVVVLARRLG +HDFNPDVQAAFQKVVAGVANALAHKYH +> HBB_POTTR +VHLSSEEKGLITSLWGKIDIEQTGGEALGRLLIVYPWTSRFFDHFGDLSSAKAVLGNAK +VLAHGAKVLVSFGDAIKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNVLVICLAEHFG +KDFTIDAQVAWQKLVAGVANALAHKYH +> HBB_PREEN +VHLTPEEKAAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSSPDAVMGNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_PROCR +GFLTAEEKSLVNDLWSKVNVDEVGGEALGRLLVVYPWTQRFFQSFGDLSSADAIMGNGK +VKAHGKKVLNSFSDGLKHIDDLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +NEFTPPVQAAYQKVVAGVANALAHKYH +> HBB_PROHA +VHLTDAEKAAVTGLWGKVKVDEYGGEALGRLLVVYPWTQRFFEHFGDLSNADAIMHNPK +VLAHGKKVLSSFGDGLNHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVVVLARHFH +EEFTPDVQAAFQKVVTGVANALAHKYH +> HBB_PROLO +VHLTADEKTAVTTLWGKVNVEEVGGEALGRLLVVYPWTQRFFESFGDLSSADAIMGNPK +VKAHGKKVLNSFSEGLKNLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPPVQAAYQKVVAGVANALAHKYH +> HBB_PSIKR +VHWSAEEKQLITGLWGKVNVAECGAEALARLLIVYPWTQRFFTSFGNLSSASAVLGNPN +VRAHGKKVLTSFGEAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAGHFG +KDFTPDCQAAWQKLVRAVAHALARKYH +> HBB_PTEAL +VHLSGEEKAAVTGLWGKVKVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSASAVMGNPK +VKAHGKKVLDSFSEGLQHLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVCVLARHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_PTEBR +VHLTGEEKAAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSPDAVMGNPK +VKAHGKKVLNSFSEGLKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_PTEPO 
+VHLSGEEKAAVTGLWGKVKVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSAPAVMGNPK +VKAHGKKVLDSFSEGLQHLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVCVLARHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_RABIT +VHLSSEEKSAVTALWGKVNVEEVGGEALGRLLVVYPWTQRFFESFGDLSSANAVMNNPK +VKAHGKKVLAAFSEGLSHLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVIVLSHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_RANES +GSDLVSGFWGKVDAHKIGGEALARLLVVYPWTQRYFTTFGNLGSADAICHNAKVLAHGE +KVLAAIGEGLKHPENLKAHYAKLSEYHSNKLHVDPANFRLLGNVFITVLARHFQHEFTPE +LQHALEAHFCAVGDALAKAYH +> HBB_RANTA +MLTSEEKAAVTGFWGKVKVDEVGAEALGRLLVVYPWTQRFFEHFGDLSSADAIMHNDKV +KAHGKRVLDAFSDGLKHLDDLKGAFAKLSELHCDKLHVDPENFRLLGNVLVVVLARHFGK +DFTPVLQADYQKVVTGVANALAHRYH +> HBB_RHEAM +VQWTAEEKQLITGLWGKVNVADCGAEALARLLIVYPWTQRFFASFGNLSSPTAILGNPM +VRAHGKKVLTSFGDAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFA +KDFTPECQAAWQKLVRVVAHALARKYH +> HBB_RHIUN +VDLTAEEKAAVLALWGKVNEDEVGGEALGRLLVVYPWTQRFFDSFGDLSTPAAVLGNAK +VKAHGKKVLHSFGDGVHNLDNLKGTYAALSELHCDKLHVDPENFRLLGNVLVVVLAQHFG +QEFTPELQAAYQKVVAGVANALAHKYH +> HBB_ROUAE +VHLSGEEKAAVTALWGKVKVEEVGGEALGRLLVVYPWTQRFFDSFGDLSSASAVMSNPK +VKAHGKKVLDSFSEGLQHLDSLKGTFAKLSELHCDKLHVDPENFRLLGNVLVCVLARHFG +KEFTPQVQAAYQKVVAGVATALAHKYH +> HBB_SAGFU +VHLTGEEKSAVTTLWGKVNVEEVGGEALGRLLVVYPWTQRFFESFGDLSSPDAVMGNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFAQLSELHCNKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_SAGMY +VHLTGEEKSAVTTLWGKVNVEEVGGEALGRLLVVYPWTQRFFDSFGDLSSPDAVMNNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_SAGNI +VHLTGEEKSAVTTLWGKVNVEEVGGEALGRLLVVYPWTQRFFESFGDLSSPDAVMNNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_SAISC +VHLTGDEKAAVTALWGKVNVEDVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMNNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_SHEEP +MLTAEEKAAVTGFWGKVKVDEVGAEALGRLLVVYPWTQRFFEHFGDLSNADAVMNNPKV +KAHGKKVLDSFSNGMKHLDDLKGTFAQLSELHCDKLHVDPENFRLLGNVLVVVLARHHGN +EFTPVLQADFQKVVAGVANALAHKYH +> HBB_SPAEH 
+VHLTDAEKAAVSGLWSKVNVDEIGGEALGRLLVVYPWTQRFFDSFGDLSSPSAVMSNPK +VKAHGKKVLNSFSEGLKHLDNLKGTFSSLSELHCDKLHVDPENFKLLGNVIVVVLAHHLG +KDFTPAAQAAFQKVVAGVATALAHKYH +> HBB_SPECI +VHLSDGEKNAISTAWGKVHAAEVGAEALGRLLVVYPWTQRFFDSFGDLSSASAVMGNAK +VKAHGKKVIDSFSNGLKHLDNLKGTFASLSELHCDKLHVDPENFKLLGNMIVIVMAHHLG +KDFTPEAQAAFQKVVAGVANALAHKYH +> HBB_SPETO +VHLTDGEKNAISTAWGKVNAAEIGAEALGRLLVVYPWTQRFFDSFGDLSSASAVMGNAK +VKAHGKKVIDSFSNGLKHLDNLKGTFASLSELHCDKLHVDPENFKLLGNMIVIVMAHHLG +KDFTPEAQAAFQKVVAGVANALSHKYH +> HBB_SQUAC +VHWTGEEKALVNAVWTKTDHQAVVAKALERLFVVYPWTKTYFVKFNGKFHASDSTVQTH +AGKVVSALTVAYNHIDDVKPHFVELSKKHYEELHVDPENFKLLANCLEVELGHALHKEFT +PEVQAAWSKFSNVVVDALSKGYH +> HBB_STRCA +VQWSAEEKQLISGLWGKVNVADCGAEALARLLIVYPWTQRFFASFGNLSSPTAILGNPM +VRAHGKKVLTSFGDAVKNLDNIKNTFAQLSELHCDKLHVDPENFRLLGDILIIVLAAHFT +KEFTPECQAAWQKLVRVVAHALARKYH +> HBB_STUVU +VQWTAEEKQLITGLWGKVNVAECGAEALARLLIVYPWTQRFFASFGNLSSPTAVLGNPK +VQAHGKKVLTSFGDAVKNLDSIKNTFSQLSELHCDKLHVDPENFRLLGDILVVVLAAHFG +KDFTPDCQAAWQKLVRVVAHALARKYH +> HBB_SUNMU +VHLSGEEKACVTGLWGKVNEDEVGAEALGRLLVVYPWTQRFFDSFGDLSSASAVMGNPK +VKAHGKKVLHSLGEGVANLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVVVLASKFG +KEFTPPVQAAFQKVVAGVANALAHKYH +> HBB_TACAC +VHLSGSEKTAVTNLWGHVNVNELGGEALGRLLVVYPWTQRFFESFGDLSSADAVMGNAK +VKAHGAKVLTSFGDALKNLDNLKGTFAKLSELHCDKLHVDPENFNRLGNVLVVVLARHFS +KEFTPEAQAAWQKLVSGVSHALAHKYH +> HBB_TADBR +VHLSGEEKGAVTALWGKVNQEEVGGEALGRLLVVYPWTQRFFDSFGDLSSASAVMGNAK +VKAHGKKVLNSFSDGLKNLDNLKGAFAKLSELHCDKLHVDPENFKLLGNVLVVVLARTFG +KEFTPPVQSAFQKVAAGVATALAHKYH +> HBB_TALEU +VHLSGEEKGLVTGMWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSASAIMGNAK +VKAHGKKVANSITDGVKNLDNLKGTYAKLSELHCDKLHVDPENFRLLGNVLVCVLARNLG +KEFTPQAQAAFQKVVLGVATALAHKYH +> HBB_TARBA +VHLTADEKAAVTALWGKVDVEDVGGEALGRLLVVYPWTQRFFDSFGDLSTPAAVMGNAK +VKAHGKKVLNAFSEGMAHLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVATALAHKYH +> HBB_TARSY +VHLTAEEKAAVTALWGKVDVEDVGGEALGRLLVVYPWTQRFFDSFGDLSTPAAVMSNAK +VKAHGKKVLNAFSDGMAHLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVATALAHKYH +> HBB_THEGE 
+VHLTPEEKNAVTTLWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSPAAVMGNPK +VKAHGKKVLGAFSDGLNHLDNLKGTFAQLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_THUTH +VEWTQQERSIIAGFIANLNYEDIGPKALARCLIVYPWTQRYFGAYGDLSTPDAIKGNAK +IAAHGVKVLHGLDRAVKNMDNINEAYSELSVLHSDKLHVDPDNFRILGDCLTVVIAANLG +DAFTVETQCAFQKFLAVVVFALGRKYH +> HBB_TRAST +MLTAEEKAAVTAFWGKVKVDEVGGEALGRLLVVYPWTQRFFESFGDLSTADAVMNNPKV +KAHGKKVLDSFSNGMKHLDDLKGTFAALSELHCDKLHVDPENFKLLGNVLVVVLARHFGK +EFTPELQADYQKVVTGVANALAHRYH +> HBB_TRIIN +VHLTPEEKALVIGLWAKVNVKEYGGEALGRLLVVYPWTQRFFEHFGDLSSASAIMNNPK +VKAHGEKVFTSFGDGLKHLEDLKGAFAELSELHCDKLHVDPENFRLLGNVLVCVLARHFG +KEFSPEAQAAYQKVVAGVANALAHKYH +> HBB_TUPGL +VHLSGEEKAAVTGLWGKVDLEKVGGQSLGSLLIVYPWTQRFFDSFGDLSSPSAVMSNPK +VKAHGKKVLTSFSDGLNHLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVRVLACNFG +PEFTPQVQAAFQKVVAGVANALAHKYH +> HBB_TURME +VQWTAEEKQLITGLWGKVNVAECGGEALARLLIVYPWTQRFFASFGNLSSPTAVLGNPK +VQAHGKKVLTSFGEAVKNLDSIKGTFAQLSELHCDKLHVDPENFRLLGDILVVVLAAHFG +KDFTPDCQAAWQKLVRVVAHALARKYH +> HBB_TURTR +VHLTGEEKSAVTALWGKVNVEEVGGEALGRLLVVYPWTQRFFESFGDLSTADAVMKNPN +VKKHGQKVLASFGEGLKHLDDLKGTFAALSELHCDKLHVDPENFRLLGNVLVVVLARHFG +KEFTPELQSAYQKVVAGVATALAHKYH +> HBB_URSMA +VHLTGEEKSLVTGLWGKVNVDEVGGEALGRLLVVYPWTQRFFDSFGDLSSADAIMNNPK +VKAHGKKVLNSFSDGLKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBB_VULGR +VHWSAEEKQLITGLWGKVNVAECGAEALARLLIVYPWTQRFFASFGNLSSPTAIIGNPM +VRAHGKKVLTSFGEAVKNLDNIKNTFAQLSELHCEKLHVDPENFRLLGDILIIVLAAHFA +KDFTPDCQAAWQKLVRAVAHALARKYH +> HBB_VULVV +VHLTAEEKSLVTGLWGKVNVDEVGGEALGRLLIVYPWTQRFFDSFGDLSTPDAVMGNAK +VKAHGKKVLNSFSDGLKNLDNLKGTFAKLSELHCDKLHVDPENFKLLGNVLVCVLAHHFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBD_AOTTR +VHLTGDEKSAVAALWGKVNVEEVGGEALGRLLVVYPWTQRFFESFGALSSPDAVMGNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVCVLARNFG +KEFTPLLQAAFQKVVAGVATALAHKYH +> HBD_ATEGE +VHLTPEEKAAVAALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPAAVMGNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVCVLARNFG +KEFTPQVQAAFQKVVAGVATALAHKYH +> HBD_COLPO 
+VHLTPEEKTVVSALWGKVNVDAVGGEALGRLLVVYPWTQRFFESFGDLSSPAAVMGNPK +VKAHGKKVLGAFSDGLAHLDSLKGTFSQLSELHCDKLHVDPENFRLLGNVLVCVLAHNFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBD_HUMAN +VHLTPEEKTAVNALWGKVNVDAVGGEALGRLLVVYPWTQRFFESFGDLSSPDAVMGNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFSQLSELHCDKLHVDPENFRLLGNVLVCVLARNFG +KEFTPQMQAAYQKVVAGVANALAHKYH +> HBD_PANTR +VHLTPEEKTAVNALWGKVNVDAVGGEALGRLLVVYPWTQRFFESFGDLSSPDAVMGNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFSQLSELHCDKLHVDPENFRLLGNVLVCVLARNFG +KEFTPQVQAAYQKVVAGVANALAHKYH +> HBD_SAGMY +VHLTGDEKSAVAALWSKVNVDEVGGEALGRLLVVYPWTQRFFESFGALSSPDAVMGNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVCVLARNFG +KEFTPRVQAAFQKVVAGVATALAHKYH +> HBD_SAISC +VHLTGDEKSAVAALWSKVNVDEVGGEALGRLLVVYPWTQRFFESFGALSSADAVMGNPK +VKAHGKKVLGAFSDGLAHLDNLKGTFAQLSELHCDKLHVDPENFRLLGNVLVCVLARNFG +KEFTPQVQAAFQKVVAGVATALAHKYH +> HBD_TARSY +VHLTADEKAAVTALWSKVNVEDVGGEALGRLLVVYPWTQRFFDSFGDLSTPAAVMSNAK +VKAHGKKVLNAFSDGMAHLDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLVCVLAHHFG +KQFTPQLQAAYQKVVAGVAAALAHKYH +> HBE1_CAPHI +VHFTAEEKAAITGLWGKVNVEEAGGEALGRLLVVYPWTQRFFDSFGNLSSASAIMGNPK +VKAHGKKVLTSFGEAIKNLDNLKGAFAKLSELHCDKLHVDPENFRLLGNVIVIILATHFG +REFTPDVQAAWQKLVSGVATALAHKYH +> HBE2_BOVIN +VHFTTEENVAVASLWAKVNVEVVGGESLARLLIVCPWTQRFFDSFGNLYSESAIMGNPK +VKVYGRKVLNSFGNAIKHMDDLKGTFADLSELHCDKLHVDPENFRLLGNMILIVLATHFS +KEFTPQMQAAWQKLTNAVANALTHKYH +> HBE2_CAPHI +VHFTTEEKAAVASLWAKVNVEVVGGESLARLLIVYPWTQRFFDSFGNLCSESAIMGNPK +VKAHGRKVLNSFGNAIKHMDDLKGTFADLSELHCDKLHVDPPNFRLLGNMILIVLATHFS +KEFTPQMQAAWQKLTNAVANALAHKYH +> HBE4_BOVIN +VHFTTEEKAAVASLWAKVNVEVVGGESLARLLIVYPWTQRFFDSFGNLYSESAIMGNPK +VKAHGRKVLNSFGNAIEHMDDLKGTFADLSELHCDKLHVDPENFRLLGNMILIVLATHFS +KEFTPQMQASWQKLTNAVANALAHKYH +> HBE_CAIMO +VHWSAEEKQLITGLWGKVNVEECGAEALARLLIVYPWTQRFFSSFGNLSSPTAIIGNPK +VPPHGRKFFTSFGEPVKNLDNIKNTYAKLSELHCEKLQVEPENFRLLGDILIIVLASHFA +RDFTPACQFPWQKLVSVVAHALPRKYH +> HBE_CHICK +VHWSAEEKQLITSVWSKVNVEECGAEALARLLIVYPWTQRFFASFGNLSSPTAIMGNPR +VRAHGKKVLSSFGEAVKNLDNIKNTYAKLSELHCDKLHVDPENFRLLGDILIIVLASHFA +RDFTPACQFAWQKLVNVVAHALARKYH +> HBE_DIDMA 
+VHFTPEDKTNITSVWTKVDVEDVGGESLARLLVVYPWTQRFFDSFGNLSSASAVMGNPK +VKAHGKKVLTSFGEGVKNMDNLKGTFAKLSELHCDKLHVDPENFRLLGNVLIIVLASRFG +KEFTPEVQASWQKLVSGVSSALGHKYH +> HBE_GALCR +VHFTAEEKAIIMSLWGKVNIEEAGGEALGRLLVVYPWTQRFFETFGNLSSASAIMGNPK +VKAHGKKVLTSFGEAVKNMDNLKGAFAKLSELHCDKLHVDPENFKLLGNVMVIILATHFG +KEFTPDVQAAWQKLVSGVATALAHKYH +> HBE_HUMAN +VHFTAEEKAAVTSLWSKMNVEEAGGEALGRLLVVYPWTQRFFDSFGNLSSPSAILGNPK +VKAHGKKVLTSFGDAIKNMDNLKPAFAKLSELHCDKLHVDPENFKLLGNVMVIILATHFG +KEFTPEVQAAWQKLVSAVAIALAHKYH +> HBE_LEMFU +VHFTAEEKSTILSLWGKVNVEEAGGEALGRLLVVYPWTQRFFDNFGNLSSASAILGNPK +VKAHGKKVLTSFGEAVKNMDNLKGAFAKLSELHCDKLHVDPENFKLLGNVMVIILATHFG +KEFTPDVQAAWQKLVSGVATALAHKYH +> HBE_MOUSE +VNFTAEEKTLINGLWSKVNVEEVGGEALGRLLVVYPWTQRFFDSFGNLSSASAIMGNPR +VKAHGKKVLTAFGESIKNLDNLKSALAKLSELHCDKLHVDPENFKLLGNVLVIVLASHFG +NEFTAEMQAAWQKLVAGVATALSHKYH +> HBE_PIG +VHFTAEEKAVITGLWSRVNVEETGGEAVGRLLVVYPWTQRFFDSFGNMSSPSAIMGNPK +VKAHGKKVLTSFGDAVKNMDNLKGTFAKLSELHCDKLHVDPENFRLLGNMIVIILASHFG +REFTPEVQAAWQKLVAGVATALAHKYH +> HBE_PONPY +VHFTAEEKAAVTSLWSKMNVEEAGGEALGRLLVVYPWTQRFFDSFGNLSSPSAILGNPK +VKAHGKKVLTSFGDAIKNMDNLKTTFAKLSELHCDKLHVDPENFKLLGNVMVIILATHFG +KEFTPEVQAAWQKLVSAVAIALAHKYH +> HBE_RABIT +VHFTPEEKCIISKQWGQVNIDETGGEALGRLLVVYPWTQRFFDNFGNLSSSSAIMGNPK +VKAHGKKVLTSFGDAIKNMDNLKGAFAKLSELHCDKLHVDPENFKLLGNVLLIVLATHFG +KEFTPEVQAAWQKLVSGVAIALAHKYH +> HBE_TARSY +VHLTAEEKSSVTSLWGKMNVDEAGGEALGRLLVVYPWTQRFFDNFGNLSSSSAIMGNPK +VKAHGKKVLTSFGDAIKNMDNLKGAFAKLSELHCDKLHVDPENFRLLGNVLVIILVTHFG +KDFTPEVQVAWQKLVSGVATALAHKYH +> HBF1_URECA +GLTTAQIKAIQDHWFLnIKGCLQAAADSIFFKYLTAYPGDLAFFHKFSSVPLYGLRSNP +AYKAQTLTVINYLDKVVDALGGNAGALMKAKVPSHDAMGITPKHFGQLLKLVGGVFQEEF +SADPTTVAAWGDAAGVLVAAMK +> HBG1_PONPY +GHFTEEDKATITSLWGKVNVEDAGGETLGRLLVVYPWTQRFFDSFGNLSSASAIMGNPK +VKAHGKKVLTSLGDAIKNLDDLKGTFAQLSELHCDKLHVDPENFRLLGNVLVTVLAIHFG +KEFTPEVQASWQKMVTGVASALSSRYH +> HBG2_PONPY +SNFTAEDKAAITSLWGKLNVEDAGGETLGRLLLVYPWTQRFFDSFGSLSSPSAIMGNPK +VKAHGVKVLTSLGGAVKNLDDLKGTFGQLSELHCDKLHVDPENFRLLGNVLVTVLAILHG +KEFTPEVQASWQKMVAAVASALASRYH +> HBG_ATEGE 
+SNFTAEDKAAITSLWGKVNVEDAGGETLGRLLVVYPWTQRFFDSFGSLSSPSAIMGNPK +VKAHGVKVLTSLGEAIKNLDDLKGTFGQLSELHCDKLHVDPENFRLLGNVLVTVLAILHG +KEFTPEVQASWQKMVAGVASALASRYH +> HBG_CHEME +VHFTVEEKAVITSLWGKVNVEEAGGEALGRLLVVYPWTQRFFDNFGNLSSASAIMGNPK +VKAHGKKVLTSLGEAIKNMDDLKGTFAHLSELHCDRLHVDPENFKLLGNELVIVLAKHFG +KEFTPQVQAAWQKMVAGVAIALAHKYH +> HBG_GALCR +VHFTAEEKAIITSLWGKVNVEEDGGEALGRLLVVYPWTQRFFDTFGNLSSASAIMGNPK +VKAHGKKVLSSLGEAIKNMDDLKGTFSHLSELHCDRLHVDPENFRLLGNVLVIVLAKHFG +KEFTPQIQAASQKMVAGVATALAHKYH +> HBG_GORGO +GHFTEEDKATITSLWGKVNVEDAGGETLGRLLVVYPWTQRFFDSFGNLSSASAIMGNPK +VKAHGKKVLTSLGGAIKHLDDLKGTFAQLSELHCDKLHVDPENFRLLGNVLVTVLAIHFG +KEFTPEVQASWQKMVTAVASALSSRYH +> HBG_HUMAN +GHFTEEDKATITSLWGKVNVEDAGGETLGRLLVVYPWTQRFFDSFGNLSSASAIMGNPK +VKAHGKKVLTSLGDAIKHLDDLKGTFAQLSELHCDKLHVDPENFKLLGNVLVTVLAIHFG +KEFTPEVQASWQKMVTAVASALSSRYH +> HBG_HYLLA +GHFTEEDKATITSLWGKVNVEDAGGETLGRLLVVYPWTQRFFDSFGNLSSASAIMGNPK +VKAHGKKVLTSLGGAIKNLDDLKGTFAQLSELHCDKLHVDPENFRLLGNVLVTVLAIHFG +KEFTPEVQASWQKMVAGVASALSSRYH +> HBG_LEMFU +VHFTAEEKAVITSLWGKVNVEEAGGEALGRLLVVYPWTQRFFDNFGNLSSASAIMGNPK +VKAHGKKVLTSLGDAIKNMDDLKGTFAHLSELHCDRLHVDPENFKLLGNELVIVLAKYFG +KEFTPQVQAAWQKMVAGVAIALAHKYH +> HBG_MACMU +GHFTEEDKATITSLWGKVNVEDAGGETLGRLLVVYPWTQRFFDSFGNLSSASAIMGNPK +VKAHGKKVLTSLGDAIKNLDDLKGTFAQLSELHCDKLHVDPENFRLLGNVLVTVLAIHFG +KEFTPEVQASWQKMVAGVASALSSRYH +> HBG_MACNE +GHFTEEDKATITSLWGKVNVEDAGGETLGRLLVVYPWTQRFFDSFGNLSSASAIMGNPK +VKAHGKKVLTSLGDAIKNLDDLKGTFAQLSELHCDKLHVDPENFRLLGNVLVTVLAIRFG +KEFTPEVQASWQKMVAGVASALSSRYH +> HBG_RABIT +VHFTAEEKAAITSTWKLVDVEDAGAEALGRLLVVYPWTQRFFDSFGNLSSSSAIMGNPK +VKAHGKKVLTAFGDAVKNVDDLKNTFAHLSELHCDRLHVDPENFKLLGNVLVIVLAKYFG +KEFTPQVQSAWQKLVAGVATALAHKYH +> HBG_TARSY +VHFTAEEKAIITSLWAKVNVEETGGEALGRLLVVYPWTQRFFDNFGNLSSASAIMGNPK +VKAHGKKVLSSLGEAVTHMDDLKDAFAHLSRLHCDELHVDPENFRVTpGKRAVIVLAHHF +GREFTPQVQAAWKKLMSAVAIAMGHKYH +> HBP1_CASGL +ALTEKQEALLKQSWEVLKQNIPAHSLRLFALIIEAAPESKYVFSFLKDSNEIPENNPKL +KAHAAVIFKTICESATELRQKGHAVWDNNTLKRLGSIHLKNKITDPHFEVMKGALLGTIK +EAIKENWSDEMGQAWTEAYNQLVATIKAEMKE +> HBP2_CASGL 
+MSTLEGRGFTEEQEALVVKSWSAMKPNAGELGLKFFLKIFEIAPSAQKLFSFLKDSNVP +LERNPKLKSHAMSVFLMTCESAVQLRKAGKVTVRESSLKKLGASHFKHGVADEHFEVTKF +ALLETIKEAVPETWSPEMKNAWGEAYDKLVAAIKLEMKPSS +> HBPI_CAIMO +TLTQAEKAAVITIWTKVATQADAIGAESLERLFSSYPQTKTYFPHFDLSQGSTQLRGHG +SKVMNAIGEAVKNIDDIRGALAKLSELHAYILRVDPVNFKLLCHCILCSVAARYPSDFTP +EVHAAWDKFLSSVSSVLTEKYR +> HBPI_CHICK +ALTQAEKAAVTTIWAKVATQIESIGLESLERLFASYPQTKTYFPHFDVSQGSVQLRGHG +SKVLNAIGEAVKNIDDIRGALAKLSELHAYILRVDPVNFKLLSHCILCSVAARYPSDFTP +EVHAEWDKFLSSISSVLTEKYR +> HBPL_PARAD +SSSEVNKVFTEEQEALVVKAWAVMKKNSAELGLQFFLKIFEIAPSAKNLFSYLKDSPVP +LEQNPKLKPHATTVFVMTCESAVQLRKAGKATVKESDLKRIGAIHFKTGVVNEHFEVTRF +ALLETIKEAVPEMWSPEMKNAWGVAYDQLVAAIKFEMKPSST +> HBPL_TRETO +MSSSEVDKVFTEEQEALVVKSWAVMKKNSAELGLKFFLKIFEIAPSAKNLFSYLKDSPI +PLEQNPKLKPHAMTVFVMTCESAVQLRKAGKVTVRESNLKRLGAIHFKNGVVNEHFETRF +ALLETIKEAVPEMWSPEMKNAWGEAYDQLVAAIKSEMKPSST +> HBRH_CHICK +VHWSAEEKQLITSVWSKVNVEECGAEALARLLIVYPWTQRFFDNFGNLSSPTAIIGNPK +VRAHGKKVLSSFGEAVKNLDNIKNTYAKLSELHCEKLHVDPENFRLLGNILIIVLAAHFT +KDFTPTCQAVWQKLVSVVAHALAYKYH +> HBT_PIG +VHFTAEEKSVITGLWGKVNVEETGGEAVGRLLVVYPWTQRFFDSFGNMSSPSAIMGNPK +VKAHGKKVLTSFGDAVKNMDNLKGTFAKLSELHCDKLHVDPENFRLLGNMIVIILASHFG +GEFTPEVQAAWQKLVAGVATALAHKYH +> LGB1_LUPLU +GVLTDVQVALVKSSFEEFNANIPKNTHRFFTLVLEIAPGAKDLFSFLKGSSEVPQNNPD +LQAHAGKVFKLTYEAAIQLQVNGAVASDATLKSLGSVHVSKGVVDAHFPVVKEAILKTIK +EVVGDKWSEELNTAWTIAYDELAIIIKKEMKDAA +> LGB1_MEDSA +MSFTDKQEALVNSSWEAFKQNLPRYSVFFYTVVLEKAPAAKGLFSFLKNSAEVQDSPQL +QAHAEKVFGLVRDSAVQLRATGGVVLGDATLGAIHVRKGVVDPHFVVVKEALLKTIKEAA +GDKWSEELNTAWEVAYDALATAIKKAMS +> LGB1_MEDTR +MSFTDKQEALVNSSYEAFKQNLSGYSVFFYTVILEKAPAAKGLFSFLKDSAGVQDSPQL +QAHAEKVFGLVRDSASQLRATGGVVLGDAALGAIHIQKGVVDPHFVVVKEALLKTIKEAA +GDKWSEELSTAWEVAYDALATEIKKAMS +> LGB1_PEA +GFTDKQEALVNSSSEFKQNLPGYSILFYTIVLEKAPAAKGLFSFLKDTAGVEDSPKLQA +HAEQVFGLVRDSAAQLRTKGEVVLGNATLGAIHVQKGVTNPHFVVVKEALLQTIKKASGN +NWSEELNTAWEVAYDGLATAIKKAMKTA +> LGB1_SOYBN +GAFTEKQEALVSSSFEAFKANIPQYSVVFYNSILEKAPAAKDLFSFLANGVDPTNPKLT +GHAEKLFALVRDSAGQLKTNGTVVADAALVSIHAQKAVTDPQFVVVKEALLKTIKEAVGG 
+NWSDELSSAWEVAYDELAAAIKKA +> LGB1_VICFA +GFTEKQEALVNSSSQLFKQNPSNYSVLFYTIILQKAPTAKAMFSFLKDSAGVVDSPKLG +AHAEKVFGMVRDSAVQLRATGEVVLDGKDGSIHIQKGVLDPHFVVVKEALLKTIKEASGD +KWSEELSAAWEVAYDGLATAIKAA +> LGB2_LUPLU +GALTESQAALVKSSWEEFNANIPKHTHRFFILVLEIAPAAKDLFSFLKGTSEVPQNNPE +LQAHAGKVFKLVYEAAIQLQVTGVVVTDATLKNLGSVHVSKGVADAHFPVVKEAILKTIK +EVVGAKWSEELNSAWTIAYDELAIVIKKEMNDAA +> LGB2_MEDTR +MGFTEKQEALVNSSWELFKQNPGNSVLFYTIILEKAPAAKGMFSFLKDTAGVQDSPKLQ +SHAEKVFGMVRDSAVQLRATGGVVLGDATLGAIHIQKGVVDPHFVVVKEALLKTIKEVSG +DKWSEELSTAWEVAYDALAAAIKKAMG +> LGB2_SESRO +GFTEKQEALVNASYEAFKQNLPGNSVLFYSFILEKAPAAKGMFSFLKDSDGVPQNNPSL +QAHAEKVFGLVRDSAAQLRATGVVVLADASLGSVHVQKGVLDPHFVVVKEALLKTLKEAA +GATWSDEVSNAWEVAYDGLSAAIKKAMS +> LGB2_SOYBN +GAFTEKQEALVSSSFEAFKANIPQYSVVFYTSILEKAPAAKDLFSFLSNGVDPSNPKLT +GHAEKLFGLVRDSAGQLKANGTVVADAALGSIHAQKAITDPQFVVVKEALLKTIKEAVGD +KWSDELSSAWEVAYDELAAAIKKAF +> LGB3_MEDSA +MGFTDKQEALVNSSWESFKQNPGNSVLFYTIILEKAPAAKGMFSFLKDSAGVQDSPKLQ +SHAEKVFGMVRDSAAQLRATGGVVLGDATLGAIHIQKGVVDPHFAVVKEALLKTIKEVSG +DKWSEELNTAWEVAYDALATAIKKAMV +> LGB3_SESRO +GFTEKQEALVNASYEAFKQNLPGNSVLFYSFILEKAPAAKGMFSFLKDFDEVPQNNPSL +QAHAEKVFGLVRDSAAQLRATGVVVLADASLGSVHVQKGVLDPHFVVVKEALLKTLKEAG +GATWSDEVSNAWEVAYDELSAAIKKAMS +> LGB3_SOYBN +GAFTDKQEALVSSSFEAFKTNIPQYSVVFYTSILEKAPVAKDLFSFLANGVDPTNPKLT +GHAEKLFGLVRDSAGQLKASGTVVIDAALGSIHAQKAITDPQFVVVKEALLKTIKEAVGD +KWSDELSSAWEVAYDELAAAIKKAF +> LGB4_MEDSA +MGFTADQEALVNSSWESFKQNLPGYSVFFYTTILEKAPAAKGMFSFLKDSAGVQDSPQL +QAHAEKVFGMVRDSAVQLRATGEVVLGDATLGSIHIQKGVVDPHFVVVKEALLKTIKEAV +GDKWSEELSTSWEVAYDGLASAIKKAMS +> LGBA_PHAVU +GAFTEKQEALVNSSWEAFKGNIPQYSVVFYTSILEKAPAAKNLFSFLANGVDPTNPKLT +AHAESLFGLVRDSAAQLRANGAVVADAALGSIHSQKGVSNDQFLVVKEALLKTLKQAVGD +KWTDQLSTALELAYDELAAAIKKAYA +> LGBA_SOYBN +VAFTEKQDALVSSSFEAFKANIPQYSVVFYTSILEKAPAAKDLFSFLANGVDPTNPKLT +GHAEKLFALVRDSAGQLKASGTVVADAALGSVHAQKAVTDPQFVVVKEALLKTIKAAVGD +KWSDELSRAWEVAYDELAAAIKKA +> LGB_PSOTE +MGGFTEKQEALVNSSYEAFKANVPQYSVVFYTSILEKAPAAKDLFPFLANGVDPTNPKL +IGHAEKLFGLVHDSAAQLRAKGAVVADAALGSLHAQKGVTDPQFVVVKEALLKTVKEAVG 
+DKWSDELSNAWEVAYNELAAALKKAF +> MYG_ALLMI +MELSDQEWKHVLDIWTKVESKLPEHGHEVIIRLLQEHPETQERFEKFKHMKTADEMKSS +EKMKQHGNTVFTALGNILKQKGNHAEVLKPLAKSHALEHKIPVKYLEFISEIIVKVIAEK +YPADFGADSQAAMRKALELFRNDMASKYKEFGYQG +> MYG_AOTTR +GLSDGEWQLVLNVWGKVEADVPSHGQEVLISLFKGHPETLEKFDKFKHLKSEDEMKASE +ELKKHGVTVLTALGGILKKKGHHEAELKPLAQSHATKHKIPVKYLEFISDAIVHVLQKKH +PGDFGADAQGAMKKALELFRNDMAAKYKELGFQG +> MYG_APTFO +GLNDQEWQQVLTMWGKVESDLAGHGHAVLMRLFKSHPETMDRFDKFRGLKTPDEMRGSE +DMKKHGVTVLTLGQILKKKGHHEAELKPLSQTHATKHKVPVKYLEFISEAIMKVIAQKHA +SNFGADAQEAMKKALELFRNDMASKYKEFGFQG +> MYG_BALAC +VLSDAEWHLVLNIWAKVEADVAGHGQDILIRLFKGHPETLEKFDKFKHLKTEAEMKASE +DLKKHGNTVLTALGGILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISDAIIHVLHSRH +PAEFGADAQAAMNKALELFRKDIAAKYKELGFQG +> MYG_BALPH +VLTDAEWHLVLNIWAKVEADVAGHGQDILISLFKGHPETLEKFDKFKHLKTEAEMKASE +DLKKHGNTVLTALGGILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISDAIIHVLHSRH +PADFGADAQAAMNKALELFRKDIAAKYKELGFQG +> MYG_BOVIN +GLSDGEWQLVLNAWGKVEADVAGHGQEVLIRLFTGHPETLEKFDKFKHLKTEAEMKASE +DLKKHGNTVLTALGGILKKKGHHEAEVKHLAESHANKHKIPVKYLEFISDAIIHVLHAKH +PSDFGADAQAAMSKALELFRNDMAAQYKVLGFHG +> MYG_CALJA +GLSDGEWQLVLNVWGKVEADIPSHGQEVLISLFKGHPETLEKFDKFKHLKSEDEMKASE +ELKKHGVTVLTALGGILKKKGHHEAELKPLAQSHATKHKIPVKYLEFISDAIVHVLQKKH +PGDFGADAQGAMKKALELFRNDMAAKYKELGFQG +> MYG_CANFA +GLSDGEWQIVLNIWGKVETDLAGHGQEVLIRLFKNHPETLDKFDKFKHLKTEDEMKGSE +DLKKHGNTVLTALGGILKKKGHHEAELKPLAQSHATKHKIPVKYLEFISDAIIQVLQSKH +SGDFHADTEAAMKKALELFRNDIAAKYKELGFQG +> MYG_CASFI +GLSDGEWQLVLHVWGKVEADLAGHGQEVLIRLFKGHPETLEKFNKFKHIKSEDEMKASE +DLKKHGVTVLTALGGVLKKKGHHEAEIKPLAQSHATKHKIPIKYLEFISEAIIHVLQSKH +PGXFGADAXGAMNKALELFRKDIAAKYKELGFQG +> MYG_CEBAP +GLSDGEWQLVLNVWGKVEADIPSHGQEVLISLFKGHPETLEKFDKFKHLKSEDEMKASE +ELKKHGATVLTALGGILKKKGQHEAELKPLAQSHATKHKIPVKYLEFISDAIVHVLQKKH +PGDFGADAQGAMKKALELFRNDMAAKYKELGFQG +> MYG_CEREL +GLSDGEWQLVLNAWGKVEADVAGHGQEVLIRLFTGHPETLEKFDKFKHLKTEAEMKASE +DLKKHGNTVLTALGGILKKKGHHEAEVKHLAESHANKHKIPVKYLEFISDAIIHVLHAKH +PSNFGADAQGAMSKALELFRNDMAAQYKVLGFQG +> MYG_CHEMY +GLSDDEWNHVLGIWAKVEPDLTAHGQEVIIRLFQLHPETQERFAKFKNLTTIDALKSSE 
+EVKKHGTTVLTALGRILKQKNNHEQELKPLAESHATKHKIPVKYLEFICEIIVKVIAEKH +PSDFGADSQAAMKKALELFRNDMASKYKEFGFLG +> MYG_CHICK +GLSDQEWQQVLTIWGKVEADIAGHGHEVLMRLFHDHPETLDRFDKFKGLKTPNEMKGSE +DLKKHGATVLTQLGKILKQKGQHESDLKPLAQTHATKHKIPVKYLEFISEVIIKVIAEKH +AADFGADSQAAMKKALELFRNDMASKYKEFGFQG +> MYG_CTEGU +GLSDGEWQLVLNAWGKVETDIGGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASE +DLKKHGTTVLTALGNILKKKGQHEAELAPLAQSHATKHKIPVKYLEFISEAIIQVLESKH +PGDFGADAQGAMSKALELFRNDIAAKYKELGFQG +> MYG_CYPCA +HDAELVLKCWGGVEADFEGTGGEVLTRLFKQHPETQKLFPKFVGIASNELAGNAAVKAH +GATVLKKLGELLKARGDHAAILKPLATTHANTHKIALNNFRLITEVLVKVMAEKAGLDAG +GQSALRRVMDVVIGDIDTYYKEIGFAG +> MYG_DIDMA +GLSDGEWQLVLNAWGKVEADIPGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASE +DLKKHGATVLTALGNILKKKGNHEAELKPLAQSHATKHKISVQFLEFISEAIIQVIQSKH +PGDFGGDAQAAMGKALELFRNDMAAKYKELGFQG +> MYG_ELEMA +GLSDGEWELVLKTWGKVEADIPGHGEFVLVRLFTGHPETLEKFDKFKHLKTEGEMKASE +DLKKQGVTVLTALGGILKKKGHHEAEIQPLAQSHATKHKIPIKYLEFISDAIIHVLQSKH +PAEFGADAQGAMKKALELFRNDIAAKYKELGFQG +> MYG_ERIEU +GLSDGEWQLVLNVWGKVEADIPGHGQEVLIRLFKDHPETLEKFDKFKHLKSEDEMKSSE +DLKKHGTTVLTALGGILKKKGQHEAQLAPLAQSHANKHKIPVKYLEFISEAIIQVLKSKH +AGDFGADAQGAMSKALELFRNDIAAKYKELGFQG +> MYG_ESCGI +VLSDAEWQLVLNIWAKVEADVAGHGQDILIRLFKGHPETLEKFDKFKHLKTEAEMKASE +DLKKHGNTVLTALGGILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISDAIIHVLHSRH +PGDFGADAQAAMNKALELFRKDIAAKYKELGFQG +> MYG_GALAU +ADWDKVNSVWSAMEANITAVGQNILLRLFEQYPESQSYFPKLKNKSLGELKDTADIKAQ +ADTVLKALGNIVKKKGNHSQPVKALAATHITTHKIPPHYFTKITTIAVGVLSEMYPSEMN +AQAQEAFSGAFKSICSDIEKEYKAANFQG +> MYG_GALCR +GLSDGEWQLVLKIWGKVEADLAGHGQDVLIRLFTAHPETLEKFDKFKNLKTADEMKASE +DLKKHGVTVLTALGGILKKKGQHEAEIKPLAQSHATKHKIPVKYLEFISEAIIHVLQNKH +SGDFGTDVQGAMSKALELFRNDIAAKYKELGFQG +> MYG_GALJA +AXWDKVNSVWSAVEQNITAIGQNILLRLFEQYPESEDYFPKLKNKSLGELKDTADIKAQ +ADTVLRALGNIVKKKGDHSQPVKALAATHITTHKIPPHYFTKITTIAVGVLSEMYPSEMN +AQAQAAFSGAFKNICSDIEKEYKAANFQG +> MYG_GLOME +GLSDGEWQLVLNVWGKVEADLAGHGQDILIRLFKGHPETLEKFDKFKHLKTEADMKASE +DLKKHGNTVLTALGAILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRH +PAEFGADAQGAMNKALELFRKDIAAKYKELGFHG +> MYG_GORBE 
+GLSDGEWQLVLNVWGKVEADISGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASE +DLKKHGATVLTALGGILKKKGHHEAEIKPLAQSHATKHKIPVKYLEFISECIIQVLQSKH +PGDFGADAQGAMNKALELFRKDMASNYKELGFQG +> MYG_GRAGE +GLSDDEWHHVLGIWAKVEPDLSAHGQEVIIRLFQVHPETQERFAKFKNLKTIDELRSSE +EVKKHGTTVLTALGRILKLKNNHEPELKPLAESHATKHKIPVKYLEFICEIIVKVIAEKH +PSDFGADSQAAMRKALELFRNDMASKYKEFGFQG +> MYG_HALGR +GLSDGEWHLVLNVWGKVETDLAGHGQEVLIRLFKSHPETLEKFDKFKHLKSEDDMRRSE +DLRKHGNTVLTALGGILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSKH +PAEFGADAQAAMKKALELFRNDIAAKYKELGFHG +> MYG_HETPO +TEWEHVNKVWAVVEPDIPAVGLAILLRLFKEHKETKDLFPKFKEIPVQQLGNNEDLRKH +GVTVLRALGNILKQKGKHSTNVKELADTHINKHKIPPKNFVLITNIAVKVLTEMYPSDMT +GPMQESFSKVFTVICSDLETLYKEANFQG +> MYG_HORSE +GLSDGEWQQVLNVWGKVEADIAGHGQEVLIRLFTGHPETLEKFDKFKHLKTEAEMKASE +DLKKHGTVVLTALGGILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISDAIIHVLHSKH +PGNFGADAQGAMTKALELFRNDIAAKYKELGFQG +> MYG_HUMAN +GLSDGEWQLVLNVWGKVEADIPGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASE +DLKKHGATVLTALGGILKKKGHHEAEIKPLAQSHATKHKIPVKYLEFISECIIQVLQSKH +PGDFGADAQGAMNKALELFRKDMASNYKELGFQG +> MYG_HYLAG +GLSDGEWQLVLNVWGKVEADIPSHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASE +DLKKHGATVLTALGGILKKKGHHEAEIKPLAQSHATKHKIPVKYLEFISECIIQVLQSKH +PGDFGADAQGAMNKALELFRKDMASNYKELGFQG +> MYG_INIGE +GLSDGEWQLVLNIWGKVEADLAGHGQDVLIRLFKGHPETLEKFDKFKHLKTEAEMKASE +DLKKHGNTVLTALGGILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRH +PGDFGADAQAAMNKALELFRKDIAAKYKELGFHG +> MYG_KOGSI +VLSEGEWQLVLHVWAKVEADIAGHGQDILIRLFKHHPETLEKFDRFKHLKSEAEMKASE +DLKKHGVTVLTALGAILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRH +PADFGADAQGAMSKALELFRKDIAAKYKELGYQG +> MYG_LAGLA +GLSDGEWQLVLNIWGKVEADIPSHGQEVLISLFKGHPETLEKFDKFKHLKSEDEMKASE +ELKKHGVTVLTALGGILKKKGQHEAELKPLAQSHATKHKIPVKYLEFISDAIIHALQKKH +PGDFGADAQGAMKKALELFRNDMAAKYKELGFQG +> MYG_LAGMA +GLSDGEWQLVLNVWGKVEADLGGHGQEVLIRLFKGHPETLEKFDKFKHLKAEDEMRASE +DLKKHGTTVLTALGGILKKRGQHAAELAPLAQSHATKHKIPVKYLEFISEAIIQVLQSKH +PGDFGADAQAAMSKALELFRNDIAAKYKELGFQG +> MYG_LEPMU +GLSDGEWQLVLNVWGKVEADVGGHGQEVLIRLFTGHPETLEKFDKFKHLKTADEMKASE 
+DLKKHGTTVLTALGGILKKKGQHEAELKPLAQSHATKHKIPIKYLEFISDAIVHVLHSKH +PAEFGADAQAAMKKALELFRNDIAAKYKELGFQG +> MYG_LOXAF +GLSDGEWELVLKTWGKVEADIPGHGEFVLVRLFTGHPETLEKFDKFKHLKTEGEMKASE +DLKKQGVTVLTALGGILKKKGHHEAEIQPLAQSHATKHKIPIKYLEFISDAIIHVLQSKH +PAEFGADAQAAMKKALELFRNDIAAKYKELGFQG +> MYG_LUTLU +GLSDGEWQLVLNVWGKVEADLAGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKGSE +DLKKHGNTVLTALGGILKKKGKHEAELKPLAQSHATKHKIPIKYLEFISEAIIQVLQSKH +PGXFGADAQGAMKRALELFRNDIAAKYKELGFQG +> MYG_LYCPI +GLSDGEWQIVLNIWGKVETDLAGHGQEVLIRLFKNHPETLDKFDKFKHLKTEDEMKGSE +DLKKHGNTVLTALGGILKKKGHHEAELKPLAQSHATKHKIPVKYLEFISDAIIQVLQNKH +SGDFHADTEAAMKKALELFRNDIAAKYKELGFQG +> MYG_MACFA +GLSDGEWQLVLNVWGKVEADIPSHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASE +DLKKHGVTVLTALGGILKKKGHHEAEIKPLAQSHATKHKIPVKYLELISESIIQVLQSKH +PGDFGADAQGAMNKALELFRNDMAAKYKELGFQG +> MYG_MACRU +GLSDGEWQLVLNIWGKVETDEGGHGKDVLIRLFKGHPETLEKFDKFKHLKSEDEMKASE +DLKKHGITVLTALGNILKKKGHHEAELKPLAQSHATKHKIPVQFLEFISDAIIQVIQSKH +AGNFGADAQAAMKKALELFRHDMAAKYKEFGFQG +> MYG_MEGNO +VLSDAEWQLVLNIWAKVEADVAGHGQDILIRLFKGHPETLEKFDKFKHLKTEAEMKASE +DLKKHGNTVLTALGGILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISDAIIHVLHSRH +PADFGADAQAAMNKALELFRKDIAAKYKELGFQG +> MYG_MELME +GLSDGEWQLVLNVWGKVEADLAGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKGSE +DLKKHGNTVLTALGGILKKKGHQEAELKPLAQSHATKHKIPVKYLEFISDAIAQVLQSKH +PGNFAAEAQGAMKKALELFRNDIAAKYKELGFQG +> MYG_MESCA +GLSEAEWQLVLHVWAKVEADLSGHGQEILIRLFKGHPETLEKFDKFKHLKSEAEMKASE +DLKKHGHTVLTALGGILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISDAIIHVLHSKH +PSDFGADAQGAMTKALELFRKDIAAKYKELGFHG +> MYG_MOUSE +GLSDGEWQLVLNVWGKVEADLAGHGQEVLIGLFKTHPETLDKFDKFKNLKSEEDMKGSE +DLKKHGCTVLTALGTILKKKGQHAAEIQPLAQSHATKHKIPVKYLEFISEIIIEVLKKRH +SGDFGADAQGAMSKALELFRNDIAAKYKELGFQG +> MYG_MUSAN +VDWEKVNSVWSAVESDLTAIGQNILLRLFEQYPESQNHFPKFKNKSLGELKDTADIKAQ +ADTVLSALGNIVKKKGSHSQPVKALAATHITTHKIPPHYFTKITTIAVDVLSEMYPSEMN +AQVQAAFSGAFKIICSDIEKEYKAANFQG +> MYG_NYCCO +GLSDGEWQSVLNVWGKVEADLAGHGQEILIRLFTAHPETLEKFDKFKNLKTPDEMKASE +DLKKHGVTVLTALGGILKKKGQHEAEIKPLAQSHATKHKIPVKYLEFISGAIIHVLQSKH +PGDFGADAQGAMSKALELFRNDIAAKYKELGFQG +> MYG_OCHPR 
+GLSDGEWQLVLNVWGKVEADLAGHGQEVLIRLFKNHPETLEKFDKFKNLKSEDEMKGSD +DLKKHGNTVLSALGGILKKKGQHEAELKPLAQSHATKHKIPVKYLEFISEAIIQVLQSKH +PGDFGADAQGAMSKALELFRNDMAAKYKELGFQG +> MYG_ORCOR +GLSDGEWQLVLNVWGKVEADLAGHGQDILIRLFKGHPETLEKFDKFKHLKTEADMKASE +DLKKHGNTVLTALGAILKKKGHHDAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRH +PAEFGADAQGAMNKALELFRKDIAAKYKELGFHG +> MYG_ORNAN +GLSDGEWQLVLKVWGKVEGDLPGHGQEVLIRLFKTHPETLEKFDKFKGLKTEDEMKASA +DLKKHGGTVLTALGNILKKKGQHEAELKPLAQSHATKHKISIKFLEYISEAIIHVLQSKH +SADFGADAQAAMGKALELFRNDMAAKYKEFGFQG +> MYG_ORYAF +GLSDAEWQLVLNVWGKVEADIPGHGQDVLIRLFKGHPETLEKFDRFKHLKTEDEMKASE +DLKKHGTTVLTALGGILKKKGQHEAEIQPLAQSHATKHKIPVKYLEFISEAIIQVIQSKH +SGDFGADAQGAMSKALELFRNDIAAKYKELGFQG +> MYG_PANTR +GLSDGEWQLVLNVWGKVEADIPGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASE +DLKKHGATVLTALGGILKKKGHHEAEIKPLAQSHATKHKIPVKYLEFISECIIQVLHSKH +PGDFGADAQGAMNKALELFRKDMASNYKELGFQG +> MYG_PAPAN +GLSDGEWQLVLNVWGKVEADIPSHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASE +DLKKHGATVLTALGGILKKKGHHEAEIKPLAQSHATKHKIPVKYLELISESIIQVLQSKH +PGDFGADAQGAMNKALELFRNDMAAKYKELGFQG +> MYG_PERPO +GLSDGEWQSVLNVWGKVEADLAGHGQEILIRLFTAHPETLEKFDKFKNLKTPDEMKASE +DLKKHGVTVLTALGGILKKKGHHEAEIKPLAQSHATKHKIPVKYLEFISEAIIHVLQSKH +PGDFGADAQGAMNKALELFRNDIAAKYKELGFQG +> MYG_PHOPH +GLSEGEWQLVLNVWGKVEADLAGHGQDVLIRLFKGHPETLEKFDKFKHLKTEAEMKASE +DLKKHGNTVLTALGGILKKKGHHDAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRH +PAEFGADAQGAMNKALELFRKDIATKYKELGFHG +> MYG_PHYCA +VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRFKHLKTEAEMKASE +DLKKHGVTVLTALGAILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRH +PGDFGADAQGAMNKALELFRKDIAAKYKELGYQG +> MYG_PIG +GLSDGEWQLVLNVWGKVEADVAGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASE +DLKKHGNTVLTALGGILKKKGHHEAELTPLAQSHATKHKIPVKYLEFISEAIIQVLQSKH +PGDFGADAQGAMSKALELFRNDMAAKYKELGFQG +> MYG_PONPY +GLSDGEWQLVLNVWGKVEADIPSHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASE +DLKKHGATVLTALGGILKKKGHHEAEIKPLAQSHATKHKIPVKYLEFISESIIQVLQSKH +PGDFGADAQGAMNKALELFRKDMASNYKELGFQG +> MYG_PROGU +GLSDGEWQLVLNVWGKVEGDLSGHGQEVLIRLFKGHPETLEKFDKFKHLKAEDEMRASE 
+ELKKHGTTVLTALGGILKKKGQHAAELAPLAQSHATKHKIPVKYLEFISEAIIQVLQSKH +PGDFGADAQGAMSKALELFRNDIAAKYKELGFQG +> MYG_RABIT +GLSDAEWQLVLNVWGKVEADLAGHGQEVLIRLFHTHPETLEKFDKFKHLKSEDEMKASE +DLKKHGNTVLTALGAILKKKGHHEAEIKPLAQSHATKHKIPVKYLEFISEAIIHVLHSKH +PGDFGADAQAAMSKALELFRNDIAAQYKELGFQG +> MYG_ROUAE +GLSDGEWQLVLNVWGKVEADIPGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASE +DLKKHGATVLTALGGILKKKGQHEAQLKPLAQSHATKHKIPVKYLEFISEVIIQVLQSKH +PGDFGADAQGAMGKALELFRNDIAAKYKELGFQG +> MYG_SAISC +GLSDGEWQLVLNIWGKVEADIPSHGQEVLISLFKGHPETLEKFDKFKHLKSEDEMKASE +ELKKHGTTVLTALGGILKKKGQHEAELKPLAQSHATKHKIPVKYLELISDAIVHVLQKKH +PGDFGADAQGAMKKALELFRNDMAAKYKELGFQG +> MYG_SHEEP +GLSDGEWQLVLNAWGKVEADVAGHGQEVLIRLFTGHPETLEKFDKFKHLKTEAEMKASE +DLKKHGNTVLTALGGILKKKGHHEAEVKHLAESHANKHKIPVKYLEFISDAIIHVLHAKH +PSNFGADAQGAMSKALELFRNDMAAEYKVLGFQG +> MYG_SPAEH +GLSDGEWQLVLNVWGKVEGDLAGHGQEVLIKLFKNHPETLEKFDKFKHLKSEDEMKGSE +DLKKHGNTVLTALGGILKKKGQHAAEIQPLAQSHATKHKIPIKYLEFISEAIIQVLQSKH +PGDFGADAQGAMSKALELFRNDIAAKYKELGFQG +> MYG_TACAC +GLSDGEWQLVLKVWGKVETDITGHGQDVLIRLFKTHPETLEKFDKFKHLKTEDEMKASA +DLKKHGGVVLTALGSILKKKGQHEAELKPLAQSHATKHKISIKFLEFISEAIIHVLQSKH +SADFGADAQAAMGKALELFRNDMATKYKEFGFQG +> MYG_THUAL +ADFDAVLKCWGPVEADYTTMGGLVLTRLFKEHPETQKLFPKFAGIAQADIAGNAAISAH +GATVLKKLGELLKAKGSHAAILKPLANSHATKHKIPINNFKLISEVLVKVMHEKAGLDAG +GQTALRNVMGIIIADLEANYKELGFSG +> MYG_TUPGL +GLSDGEWQLVLNVWGKVEADVAGHGQEVLIRLFKGHPETLEKFDKFKHLKTEDEMKASE +DLKKHGNTVLSALGGILKKKGQHEAEIKPLAQSHATKHKIPVKYLEFISEAIIQVLQSKH +PGDFGADAQAAMSKALELFRNDIAAKYKELGFQG +> MYG_TURTR +GLSDGEWQLVLNVWGKVEADLAGHGQDVLIRLFKGHPETLEKFDKFKHLKTEADMKASE +DLKKHGNTVLTALGAILKKKGHHDAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRH +PAEFGADAQGAMNKALELFRKDIAAKYKELGFHG +> MYG_VARVA +GLSDEEWKKVVDIWGKVEPDLPSHGQEVIIRMFQNHPETQDRFAKFKNLKTLDEMKNSE +DLKKHGTTVLTALGRILKQKGHHEAEIAPLAQTHANTHKIPIKYLEFICEVIVGVIAEKH +SADFGADSQEAMRKALELFRNDMASRYKELGFQG +> MYG_VULCH +GLSDGEWQLVLNIWGKVETDLAGHGQEVLIRLFKNHPETLDKFDKFKHLKTEDEMKGSE +DLKKHGNTVLTALGGILKKKGHHEAELKPLAQSHATKHKIPVKYLEFISDAIIQVLQSKH +SGDFHADTEAAMKKALELFRNDIAAKYKELGFQG +> MYG_ZALCA 
+GLSDGEWQLVLNIWGKVEADLVGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKRSE +DLKKHGKTVLTALGGILKKKGHHDAELKPLAQSHATKHKIPIKYLEFISEAIIHVLQSKH +PGDFGADTHAAMKKALELFRNDIAAKYRELGFQG +> MYG_ZIPCA +GLSEAEWQLVLHVWAKVEADLSGHGQEILIRLFKGHPETLEKFDKFKHLKSEAEMKASE +DLKKHGHTVLTALGGILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISDAIIHVLHSRH +PSDFGADAQAAMTKALELFRKDIAAKYKELGFHG diff --git a/forester/archive/RIO/others/hmmer/tutorial/nucleic.null b/forester/archive/RIO/others/hmmer/tutorial/nucleic.null new file mode 100644 index 0000000..654e5e3 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/tutorial/nucleic.null @@ -0,0 +1,15 @@ +# nucleic.null +# +# Example of a null model file for DNA/RNA sequences. +# The values in this file are the HMMER 2 default +# settings. + +Nucleic + +0.25 # A +0.25 # C +0.25 # G +0.25 # T + +0.999001 # p1 + diff --git a/forester/archive/RIO/others/hmmer/tutorial/nucleic.pri b/forester/archive/RIO/others/hmmer/tutorial/nucleic.pri new file mode 100644 index 0000000..c1aec46 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/tutorial/nucleic.pri @@ -0,0 +1,27 @@ +# nucleic.pri +# +# Example of a prior file for DNA/RNA models. +# The values in this file are the HMMER 2 default settings. + +Dirichlet # Strategy (mixture Dirichlet) +Nucleic # type of prior (Amino or Nucleic) + +# Transitions +1 # Single component +1.0 # with probability = 1.0 +0.7939 0.0278 0.0135 # m->m, m->i, m->d alpha's +0.1551 0.1331 # i->m, i->i alpha's +0.9002 0.5630 # d->m, d->d alpha's + +# Match emissions +# The use of 1.0 for alpha's here makes a simple Laplace "plus-one" prior. 
+# +1 # single component +1.0 # with probability = 1.0 +1.0 1.0 1.0 1.0 + +# Insert emissions +# +1 # Single component +1.0 # with probability 1.0 +1.0 1.0 1.0 1.0 diff --git a/forester/archive/RIO/others/hmmer/tutorial/pkinase.slx b/forester/archive/RIO/others/hmmer/tutorial/pkinase.slx new file mode 100644 index 0000000..9f9b39c --- /dev/null +++ b/forester/archive/RIO/others/hmmer/tutorial/pkinase.slx @@ -0,0 +1,712 @@ +# ID pkinase +# AC PF00069 +# DE Eukaryotic protein kinase domain +# AU Sonnhammer ELL +# AL Clustalw +# AM hmma -qR +# SE Unknown +# DR PROSITE; PDOC00100; +# DR PROSITE; PDOC00212; +# DR PROSITE; PDOC00213; +# DR PROSITE; PDOC00629; +# GA Bic_raw 25 hmmfs 20 +# NC hmmfs 17.60 YPKA_YERPS/2-292 +# CC hmmfs breaks up some members too much, e.g. KS61_MOUSE, which +# CC hmmls does not. However, hmmls inserts too many unwelcome insertions. +# ** hmmls fails badly: NINL_DROME gets the whole myosin domain inserted. +# ** Remaking SEED from more seqs only made things worse. +# DR SCOP; 1apm; sf; +# DR URL; http://www.sdsc.edu/Kinases/pk_home.html; +# RN [1] +# RM 92065863 +# RA S.K. Hanks, A.M. Quinn; +# RL Methods Enzymol. 200: 38-62 (1991) +# RN [2] +# RM 95285959 +# RA S.K. Hanks, T. Hunter; +# RL FASEB J. 9: 576-596 (1995) +# RN [3] +# RM 97172697 +# RA T. Hunter, G.D. Plowman; +# RL Trends Biochem. Sci. 22: 18-22 (1997) +# SQ 67 +7LES_DROME/2209-2481 LKLLRFLGSGAFGEVYEGQLKTED....SEEPQRVAIKSLRKG....... +ABL1_CAEEL/296-547 IIMHNKLGGGQYGDVYEGYWKR........HDCTIAVKALK......... +ARK1_BOVIN/191-453 FSVHRIIGRGGFGEVYGCRKA........DTGKMYAMKCLD......... +AVR2_HUMAN/192-479 LQLLEVKARGRFGCVWKAQLL..........NEYVAVKIFP......... +BFR2_HUMAN/367-643 LTLGKPLGEGCFGQVVMAEAVGIDK.DKPKEAVTVAVKMLKDDAT..... +BYR1_SCHPO/66-320 LEVVRHLGEGNGGAVSLVKHR..........NIFMARKTVYVG......S +BYR2_SCHPO/394-658 WIRGALIGSGSFGQVYLGMNAS........SGELMAVKQVILD....... 
+CC15_YEAST/25-272 YHLKQVIGRGSYGVVYKAINK........HTDQVVAIKEVVYE......N +CC21_MEDSA/1-284 GENVEKIGEGTYGVVYKARDR........VTNETIALKKIR........L +CC5_YEAST/82-337 YHRGHFLGEGGFARCFQIKDD.........SGEIFAAKTVA......... +CDPK_SOYBN/34-292 YEVGRKLGQGQFGTTFECTRR........ASGGKFACKSIP......... +CDR1_SCHPO/12-258 WRLGKTLGTGSTSCVRLAKHA........KTGDLAAIKIIP......... +CHK1_SCHPO/10-272 YHIGREIGTGAFASVRLCYDD.........NAKIYAVKFVN........K +CLK1_MOUSE/160-476 YEIVDTLGEGAFGKVVECIDHK.......VGGRRVAVKIVKN.......V +CTK1_YEAST/183-469 YLRIMQVGEGTYGKVYKAKNTN........TEKLVALKKLRLQ....... +ERK1_CANAL/68-371 YQILEIVGEGAYGIVCSAIHK........PSQQKVAIKKIEP.......F +ERK3_HUMAN/20-312 FVDFQPLGFGVNGLVLSAVDS........RACRKVAVKKIALS......D +FUSE_DROME/4-254 YAVSSLVGQGSFGCVYKATRK........DDSKVVAIKVIS......... +HR25_YEAST/9-273 FRIGRKIGSGSFGDIYHGTNL........ISGEEVAIKLES......... +JAK1_HUMAN/571-833 LVQGEHLGRGTRTHIYSGTLMDYKD..DEGTSEEKKIKVIL......... +JAK1_HUMAN/864-1137 LKRIRDLGEGHFGKVELCRYDPED.....NTGEQVAVKSLK......... +KAB7_YEAST/1096-1354 FVSLQKMGEGAYGKVNLCIHK........KNRYIVVIKMIFK.......E +KAKT_MLVAT/171-429 FEYLKLLGKGTFGKVILVKEK........ATGRYYAMKILK......... +KC21_CHICK/39-324 YQLVRKLGRGKYSEVFEAINIT........NNEKVVVKILK......... +KCC4_MOUSE/42-296 FEVESELGRGATSIVYRCKQK........GTQKPYALKVL.......... +KCR8_YEAST/316-590 GRCQEVLGKGAFGVVRICQKKNVSSQDGNKSEKLYAVKEFKR........ +KG3A_RAT/119-403 YTDIKVIGNGSFGVVYQARLA........ETRELVAIKKVLQ........ +KGP1_DROME/457-717 LEVVSTLGIGGFGRVELVKAHH......QDRVDIFALKCLK......... +KI28_YEAST/7-290 YTKEKKVGEGTYAVVYLGCQHS........TGRKIAIKEIK......... +KI82_YEAST/324-602 FEKIRLLGQGDVGKVYLVRER........DTNQIFALKVLN......... +KIN1_SCHPO/125-395 YVLGKTIGAGSMGKVKDAHHL........KTGEQFAIKIVTRLHPDITKA +KIR1_HUMAN/208-495 ITLLECVGKGRYGEVWRGSWQ..........GENVAVKIFS......... +KKIA_HUMAN/5-288 YEKIGKIGEGSYGVVFKCRNR........DTGQIVAIKKFLE........ 
+KKL6_YEAST/192-508 WKKVRPIGSGNFSTVLLYELMDQS....NPKLKQVAVKRLKYPEELSNVE +KMIL_AVIMH/82-339 VLLSTRIGSGSFGTVYKGKWHG........DVAVKILKVVDP.......T +KML2_CHICK/1453-1708 YNIEERLGSGKFGQVFRLVEK........KTGKVWAGKFFK......... +KMOS_CERAE/60-338 VCLLQRLGAGGFGSVYKATYH..........GVPVAIKQVN......... +KPBH_RAT/24-291 YDPKDIIGRGVSSVVRRCVHRA......TGDEFAVKIMEVS......... +KPIM_HUMAN/38-290 YQVGPLLGSGGFGSVYSGIRV........SDNLPVAIKHVE......... +KPK2_PLAFK/111-364 YVLNKKIGKGSFSTAYIGTNI........LYGNRVVVKEVD......... +KPRO_MAIZE/534-812 RKFKVELGRGESGTVYKGVLE.........DDRHVAVKKLEN........ +KR1_HSV11/191-478 FTIHGALTPGSEGCVFDSSHP.........DYPQRVIVKA.......... +KR1_PRVKA/53-332 FEVLQPLQSGSEGRVFVARRP.........GEADTVVLKV.......... +KS61_MOUSE/407-664 YVVKETIGVGSYSVCKRCVHK........ATNMEYAVKVID......... +KYK1_DICDI/1289-1559 LEFGQTIGKGFFGEVKRGYWR..........ETDVAIKIIY......... +KYK2_DICDI/108-364 IQFIQKVGEGAFSEVWEGWWK..........GIHVAIKKLKIIG.....D +MAK_RAT/4-284 YTTMRQLGDGTYGSVLMGKSN........ESGELVAIKRMK......... +MEK1_YEAST/162-444 EITNRIVGNGTFGHVLITHNSKERDEDVCYHPENYAVKIIK......... +MET_HUMAN/1078-1337 VHFNEVIGRGHFGCVYHGTLLDND.....GKKIHCAVKSLN......... +MKK1_YEAST/221-488 IETLGILGEGAGGSVSKCKLK........NGSKIFALKVIN......... +MLK1_HUMAN/3-262 LTLEEIIGIGGFGKVYRAFWI..........GDEVAVKAARHD......P +NINL_DROME/16-282 FEIYEEIAQGVNAKVFRAKELD........NDRIVALKIQHYD......E +NPR1_YEAST/438-742 IKTGADLGAGAGGSVKLAQRIS........DNKIFAVKEFR........T +PHY_CERPU/1004-1282 IQITGSLGSGSSATVEKAVWL..........GTPVAKKTFYG........ +PKD1_DICDI/36-291 FNFYGSLGSGSFGTAKLCRHR........GSGLFFCSKTLR......... +PKN1_MYXXA/59-321 FRLVRRLGRGGMGAVYLGEHVS........IGSRVAVKVLH......... +RAN1_SCHPO/18-295 LRFVSIIGAGAYGVVYKAEDIY........DGTLYAVKALC......... +RYK_HUMAN/327-593 ITLKDVLQEGTFGRIFHGILIDEKD...PNKEKQAFVKTVKD.......Q +SGV1_YEAST/60-366 YREDEKLGQGTFGEVYKGIHL........ETQRQVAMKKIIVS......V +SPK1_YEAST/198-466 SIIDEVVGQGAFATVKKAIER........TTGKTFAVKIIS......... +ST20_YEAST/620-871 YANLVKIGQGASGGVYTAYEIG........TNVSVAIKQMNLE....... 
+STE7_YEAST/191-466 LVQLGKIGAGNSGTVVKALHVP........DSKIVAKKTIP........V +SYK_PIG/364-619 TLEDKELGSGNFGTVKKGYYQMK......KVVKTVAVKILKN........ +TOP_DROME/938-1194 LRKGGVLGMGAFGRVYKGVWVPEG....ENVKIPVAIKELLKSTG..... +TRKA_HUMAN/504-775 IVLKWELGEGAFGKVFLAECHNLLP...EQDKMLVAVKALK......... +TTK_HUMAN/509-775 YSILKQIGSGGSSKVFQVLNE.........KKQIYAIKYVN........L +WEE1_HUMAN/299-569 FHELEKIGSGEFGSVFKCVKR........LDGCIYAIKRS.......... + +7LES_DROME/2209-2481 ASEFAE............LLQEAQLMSNFK......HENIVRLVGICF.. +ABL1_CAEEL/296-547 EDAMPLH..........EFLAEAAIMKDLH......HKNLVRLLGVCT.. +ARK1_BOVIN/191-453 KKRIKMKQGE......TLALNERIMLSLVSTG...DCPFIVCMSYAFH.. +AVR2_HUMAN/192-479 IQDKQS..........WQNEYEVYSLPGMK......HENILQFIGAEKRG +BFR2_HUMAN/367-643 EKDLSD............LVSEMEMMKMIG.....KHKNIINLLGACTQ. +BYR1_SCHPO/66-320 DSKLQK...........QILRELGVLHHCR......SPYIVGFYGAFQ.. +BYR2_SCHPO/394-658 SVSESKDRHAKLL...DALAGEIALLQELS......HEHIVQYLGSNL.. +CC15_YEAST/25-272 DEELN............DIMAEISLLKNLN......HNNIVKYHGFIR.. +CC21_MEDSA/1-284 EQEDEG.....VP...STAIREISLLKEMQ......HRNIVRLQDVVH.. +CC5_YEAST/82-337 KASIK...SEKTR...KKLLSEIQIHKSMS......HPNIVQFIDCFE.. +CDPK_SOYBN/34-292 KRKLLCKEDYED......VWREIQIMHHLSE.....HANVVRIEGTYE.. +CDR1_SCHPO/12-258 IRYAS.............IGMEILMMRLLR......HPNILRLYDVWT.. +CHK1_SCHPO/10-272 KHATSCMNAGVWA...RRMASEIQLHKLCNG.....HKNIIHFYNTAE.. +CLK1_MOUSE/160-476 DRYCEA............AQSEIQVLEHLNTTDP..HSTFRCVQMLEWF. +CTK1_YEAST/183-469 GEREG......FP...ITSIREIKLLQSFD......HPNVSTIKEIMVE. +ERK1_CANAL/68-371 ERSMLCLR..........TLRELKLLKHFN......HENIISILAIQRPI +ERK3_HUMAN/20-312 ARSMKH............ALREIKIIRRLD......HDNIVKVYEVLGPK +FUSE_DROME/4-254 KRGRATKELKN.......LRRECDIQARLK......HPHVIEMIESFE.. +HR25_YEAST/9-273 IRSRHP...........QLDYESRVYRYLS......GGVGIPFIRWFGR. +JAK1_HUMAN/571-833 KVLDPSHRDIS.....LAFFEAASMMRQVS......HKHIVYLYGVCV.. +JAK1_HUMAN/864-1137 PESGGN........HIADLKKEIEILRNLY......HENIVKYKGICTED +KAB7_YEAST/1096-1354 RILVDTWVRDRKL...GTIPSEIQIMATLNKK...PHENILRLLDFFE.. 
+KAKT_MLVAT/171-429 KEVIVAKDEVAH......TLTENRVLQNSR......HPFLTALKYSFQ.. +KC21_CHICK/39-324 PVKKKK............IKREIKILENLRG.....GPNIITLADIVKDP +KCC4_MOUSE/42-296 KKTVD....KKI......VRTEIGVLLRLS......HPNIIKLKEIFE.. +KCR8_YEAST/316-590 RTSESAEKYSKR......LTSEFCISSSLH......HTNIVTTLDLFQD. +KG3A_RAT/119-403 DKRFK..............NRELQIMRKLD......HCNIVRLRYFFYSS +KGP1_DROME/457-717 KRHIVDTKQEE......HIFSERHIMLSSR......SPFICRLYRTFR.. +KI28_YEAST/7-290 TSEFKDGLDMS.......AIREVKYLQEMQ......HPNVIELIDIFM.. +KI82_YEAST/324-602 KHEMIK...RKKI...KRVLTEQEILATSD......HPFIVTLYHSFQ.. +KIN1_SCHPO/125-395 KAAASAEATKAAQ...SEKNKEIRTVREAALSTLLRHPYICEARDVYI.. +KIR1_HUMAN/208-495 SRDEKS............WFRETELYNTVMLR....HENILGFIASDMTS +KKIA_HUMAN/5-288 SEDDP.....VIK...KIALREIRMLKQLK......HPNLVNLLEVFR.. +KKL6_YEAST/192-508 QINTSLRYKETLSRLENSLTRELQVLKSLN......HPCIVKLLGINNPI +KMIL_AVIMH/82-339 PEQFQA............FRNEVAVLRKTR......HVNILLFMGYMT.. +KML2_CHICK/1453-1708 AYSAK...EKEN......IRDEISIMNCLH......HPKLVQCVDAFE.. +KMOS_CERAE/60-338 KCTKNRLASRR.......SFWAELNVARLR......HDNIVRVVAASTRT +KPBH_RAT/24-291 AERLSLEQLEEVR...DATRREMHILRQVAG.....HPHIITLIDSYE.. +KPIM_HUMAN/38-290 KDRISDWGELPNG...TRVPMEVVLLKKVSSG....FSGVIRLLDWFE.. +KPK2_PLAFK/111-364 KSKVK....ESN......VYTEIEVLRKVM......HKYIIKLISAYE.. +KPRO_MAIZE/534-812 VRQGKE...........VFQAELSVIGRIN......HMNLVRIWGFCS.. +KR1_HSV11/191-478 GWYTS.............TSHEARLLRRLD......HPAILPLLDLHV.. +KR1_PRVKA/53-332 GQKPS.............TLMEGMLLQRLS......HDNVMRMKQMLA.. +KS61_MOUSE/407-664 KSKRD.............PSEEIEILLRYGQ.....HPNIITLKDVYD.. +KYK1_DICDI/1289-1559 RDQFKT...KSSL...VMFQNEVGILSKLR......HPNVVQFLGACTAG +KYK2_DICDI/108-364 EEQFKER...........FIREVQNLKKGN......HQNIVMFIGACY.. +MAK_RAT/4-284 RKFYSWDECMN........LREVKSLKKLN......HANVIKLKEVIR.. +MEK1_YEAST/162-444 LKPNK.............FDKEARILLRLD......HPNIIKVYHTFCD. +MET_HUMAN/1078-1337 RITDIGEVS........QFLTEGIIMKDFS......HPNVLSLLGICLR. 
+MKK1_YEAST/221-488 TLNTDPEYQKQ.......IFRELQFNRSFQ......SEYIVRYYGMFTDD +MLK1_HUMAN/3-262 DEDISQTIEN........VRQEAKLFAMLK......HPNIIALRGVCL.. +NINL_DROME/16-282 EHQVS.............IEEEYRTLRDYCD.....HPNLPEFYGVYKLS +NPR1_YEAST/438-742 KFENES..KRDYV...KKITSEYCIGTTLN......HPNIIETIEIVY.. +PHY_CERPU/1004-1282 RNNED.............FKREVEILAELC......HPNITSMFCSPL.. +PKD1_DICDI/36-291 RETIVHEKHKEH......VNNEINIMLNIS......HPYIVKTYSTFN.. +PKN1_MYXXA/59-321 AHLTMYPELVQR......FHAEARAVNLIG......HENIVSIFDMDA.. +RAN1_SCHPO/18-295 KDGLNEKQKK.......LQARELALHARVSS.....HPYIITLHRVLE.. +RYK_HUMAN/327-593 ASEIQVT..........MMLTESCKLRGLH......HRNLLPITHVCIE. +SGV1_YEAST/60-366 EKDLFP..........ITAQREITILKRLN......HKNIIKLIEMVYDH +SPK1_YEAST/198-466 KRKVIGNMDG........VTRELEVLQKLN......HPRIVRLKGFYE.. +ST20_YEAST/620-871 KQPKKE...........LIINEILVMKGSK......HPNIVNFIDSYV.. +STE7_YEAST/191-466 EQNNS.....TII...NQLVRELSIVKNVKP.....HENIITFYGAYYN. +SYK_PIG/364-619 EANDPALKD........ELLAEANVMQQLD......NPYIVRMIGICE.. +TOP_DROME/938-1194 AESSEE............FLREAYIMASEE......HVNLLKLLAVCM.. +TRKA_HUMAN/504-775 EASESAR.........QDFQREAELLTMLQ......HQHIVRFFGVCTE. +TTK_HUMAN/509-775 EEADNQTLDS........YRNEIAYLNKLQQH....SDKIIRLYDYEI.. +WEE1_HUMAN/299-569 KKPLAGSVDEQN......ALREVYAHAVLGQ.....HSHVVRYFSAWA.. + +7LES_DROME/2209-2481 .................DTESISLIMEHMEAG......DLLSYLRAARAT +ABL1_CAEEL/296-547 .................HEAPFYIITEFMCNG......NLLEYLRRTDKS +ARK1_BOVIN/191-453 .................TPDKLSFILDLMNGG......DLHYHLSQHG.. +AVR2_HUMAN/192-479 ...............TSVDVDLWLITAFHEKG......SLSDFLKANV.. +BFR2_HUMAN/367-643 ..................DGPLYVIVEYASKG......NLREYLRARRPP +BYR1_SCHPO/66-320 .................YKNNISLCMEYMDCG......SLDAILREGG.. +BYR2_SCHPO/394-658 .................NSDHLNIFLEYVPGG......SVAGLLTMYG.. +CC15_YEAST/25-272 .................KSYELYILLEYCANG......SLRRLISRSS.. +CC21_MEDSA/1-284 .................SDKRLYLVFEYLDL.......DLKKHMDSSPE. +CC5_YEAST/82-337 .................DDSNVYILLEICPNG......SLMELLKRRK.. 
+CDPK_SOYBN/34-292 .................DSTAVHLVMELCEGG......ELFDRIVQKG.. +CDR1_SCHPO/12-258 .................DHQHMYLALEYVPDG......ELFHYIRKHG.. +CHK1_SCHPO/10-272 .................NPQWRWVVLEFAQGG......DLFDKIEPDVG. +CLK1_MOUSE/160-476 ................EHRGHICIVFELLGLS.......TYDFIKENS.. +CTK1_YEAST/183-469 .................SQKTVYMIFEYADN.......DLSGLLLNKEV. +ERK1_CANAL/68-371 N..............YESFNEIYLIQELMET.......DLHRVIRTQN.. +ERK3_HUMAN/20-312 G........TDLQGELFKFSVAYIVQEYMET.......DLARLLEQGT.. +FUSE_DROME/4-254 .................SKTDLFVVTEFALM.......DLHRYLSYNG.. +HR25_YEAST/9-273 .................EGEYNAMVIDLLGP.......SLEDLFNYCHR. +JAK1_HUMAN/571-833 .................RDVENIMVEEFVEGG......PLDLFMHRKSD. +JAK1_HUMAN/864-1137 .................GGNGIKLIMEFLPSG......SLKEYLPKNKN. +KAB7_YEAST/1096-1354 .................DDDYYYIETPVHGETGC...IDLFDLIEFKT.. +KAKT_MLVAT/171-429 .................THDRLCFVMEYANGG......ELFFHLSRER.. +KC21_CHICK/39-324 .................VSRTPALVFEHVNNT......DFKQLYQTLT.. +KCC4_MOUSE/42-296 .................TPTEISLVLELVTGG......ELFDRIVEKG.. +KCR8_YEAST/316-590 .................AKGEYCEVMEYCAGG......DLFTLVVAAG.. +KG3A_RAT/119-403 G.............EKKDELYLNLVLEYVPET....VYRVARHFTKAK.. +KGP1_DROME/457-717 .................DEKYVYMLLEACMGG......EIWTMLRDRG.. +KI28_YEAST/7-290 .................AYDNLNLVLEFLPT.......DLEVVIKDKS.. +KI82_YEAST/324-602 .................TKDYLYLCMEYCMGG......EFFRALQTRKS. +KIN1_SCHPO/125-395 .................TNSHYYMVFEFVDGG......QMLDYIISHG.. +KIR1_HUMAN/208-495 R...............HSSTQLWLITHYHEMG......SLYDYLQLTT.. +KKIA_HUMAN/5-288 .................RKRRLHLVFEYCDHT......VLHELDRYQR.. +KKL6_YEAST/192-508 FVTSKKPLCDLIIKTPRALPPCDMIMSYCPAG......DLLAAVMARNG. +KMIL_AVIMH/82-339 ..................KDNLAIVTQWCEGS......SLYKHLHVQET. +KML2_CHICK/1453-1708 .................EKANIVMVLEMVSGG......ELFERIIDED.. +KMOS_CERAE/60-338 PAG..............SNSLGTIIMEFGGNV......TLHQVIYGAASH +KPBH_RAT/24-291 .................SSSFMFLVFDLMRKG......ELFDYLTEKV.. 
+KPIM_HUMAN/38-290 .................RPDSFVLILERPEPV.....QDLFDFITERG.. +KPK2_PLAFK/111-364 .................QEGFVYLVLEYLKGG......ELFEYLNNNG.. +KPRO_MAIZE/534-812 .................EGSHRLLVSEYVENG......SLANILFSEGG. +KR1_HSV11/191-478 .................VSGVTCLVLPKYQA.......DLYTYLSRRLN. +KR1_PRVKA/53-332 .................RGPATCLVLPHFRC.......DLYSYLTMRD.. +KS61_MOUSE/407-664 .................DGKHVYLVTELMRGG......ELLDKILRQK.. +KYK1_DICDI/1289-1559 .................GEDHHCIVTEWMGGG......SLRQFLTDHFN. +KYK2_DICDI/108-364 ...................KPACIITEYMAGG......SLYNILHNPNS. +MAK_RAT/4-284 .................ENDHLYFIFEYMKEN......LYQLMKDRNK.. +MEK1_YEAST/162-444 .................RNNHLYIFQDLIPGG......DLFSYLAKGDCL +MET_HUMAN/1078-1337 .................SEGSPLVVLPYMKHG......DLRNFIRNETH. +MKK1_YEAST/221-488 .................ENSSIYIAMEYMGGRSL...DAIYKNLLERGG. +MLK1_HUMAN/3-262 .................KEPNLCLVMEFARGG......PLNRVLSGKR.. +NINL_DROME/16-282 KPN..............GPDEIWFVMEYCAGGTA...VDMVNKLLKLDR. +NPR1_YEAST/438-742 .................ENDRILQVMEYCEY.......DLFAIVMSNK.. +PHY_CERPU/1004-1282 .................YRRKCSIIMELMDG.......DLLALMQRRLDR +PKD1_DICDI/36-291 .................TPTKIHFIMEYAGKK......DLFHHLRANK.. +PKN1_MYXXA/59-321 .................TPPRPYLIMEFLDG.......APLSAWVGTP.. +RAN1_SCHPO/18-295 .................TEDAIYVVLQYCPNG......DLFTYITEKKVY +RYK_HUMAN/327-593 .................EGEKPMVILPYMNWG......NLKLFLRQCKLV +SGV1_YEAST/60-366 SPDITN......AASSNLHKSFYMILPYMVA.......DLSGVLHNPR.. +SPK1_YEAST/198-466 .................DTESYYMVMEFVSGG......DLMDFVAAHG.. +ST20_YEAST/620-871 .................LKGDLWVIMEYMEGG......SLTDVVTHCI.. +STE7_YEAST/191-466 ...............QHINNEIIILMEYSDCGSLDKILSVYKRFVQRGTV +SYK_PIG/364-619 ..................AESWMLVMEMAELG......PLNKYLQQNR.. +TOP_DROME/938-1194 ..................SSQMMLITQLMPLG......CLLDYVRNNRD. +TRKA_HUMAN/504-775 ..................GRPLLMVFEYMRHG......DLNRFLRSHGPD +TTK_HUMAN/509-775 .................TDQYIYMVMECGNI.......DLNSWLKKKK.. 
+WEE1_HUMAN/299-569 .................EDDHMLIQNEYCNGG......SLADAISENYRI + +7LES_DROME/2209-2481 STQEP.....QPTAGLSLSELLAMCIDVANGCSYLEDMH..........F +ABL1_CAEEL/296-547 ..............LLPPIILVQMASQIASGMSYLEARH..........F +ARK1_BOVIN/191-453 ..............VFSEADMRFYAAEIILGLEHMHNRF..........V +AVR2_HUMAN/192-479 ...............VSWNELCHIAETMARGLAYLHEDIPGLKDGHKPAI +BFR2_HUMAN/367-643 GMEYSYDINRVPEEQMTFKDLVSCTYQLARGMEYLASQK..........C +BYR1_SCHPO/66-320 ..............PIPLDILGKIINSMVKGLIYLYNVLH.........I +BYR2_SCHPO/394-658 ..............SFEETLVKNFIKQTLKGLEYLHSRG..........I +CC15_YEAST/25-272 .T............GLSENESKTYVTQTLLGLKYLHGEG..........V +CC21_MEDSA/1-284 .............FIKDPRQVKMFLYQMLCGIAYCHSHR..........V +CC5_YEAST/82-337 ..............VLTEPEVRFFTTQICGAIKYMHSRR..........V +CDPK_SOYBN/34-292 ..............HYSERQAARLIKTIVEVVEACHSLG..........V +CDR1_SCHPO/12-258 ..............PLSEREAAHYLSQILDAVAHCHRFR..........F +CHK1_SCHPO/10-272 ..............ID.EDVAQFYFAQLMEGISFMHSKG..........V +CLK1_MOUSE/160-476 ............FLPFRMDHIRKMAYQICKSVNFLHSNK..........L +CTK1_YEAST/183-469 ..............QISHSQCKHLFKQLLLGMEYLHDNK..........I +ERK1_CANAL/68-371 ...............LSDDHIQYFIYQTLRALKAMHSAN..........V +ERK3_HUMAN/20-312 ...............LAEEHAKLFMYQLLRGLKYIHSAN..........V +FUSE_DROME/4-254 ..............AMGEEPARRVTGHLVSALYYLHSNR..........I +HR25_YEAST/9-273 ..............RFSFKTVIMLALQMFCRIQYIHGRS..........F +JAK1_HUMAN/571-833 ..............VLTTPWKFKVAKQLASALSYLEDKD..........L +JAK1_HUMAN/864-1137 ..............KINLKQQLKYAVQICKGMDYLGSRQ..........Y +KAB7_YEAST/1096-1354 ..............NMTEFEAKLIFKQVVAGIKHLHDQG..........I +KAKT_MLVAT/171-429 ..............VFSEDRARFYGAEIVSALDYLHSEKN.........V +KC21_CHICK/39-324 .................DYDIRFYMYEILKALDYCHSMG..........I +KCC4_MOUSE/42-296 ..............YYSERDARDAVKQILEAVAYLHENG..........I +KCR8_YEAST/316-590 ..............KLEYMEADCFFKQLIRGVVYMHEMG..........V +KG3A_RAT/119-403 .............LIIPIIYVKVYMYQLFRSLAYIHSQG..........V +KGP1_DROME/457-717 
..............SFEDNAAQFIIGCVLQAFEYLHARG..........I +KI28_YEAST/7-290 .............ILFTPADIKAWMLMTLRGVYHCHRNF..........I +KI82_YEAST/324-602 .K............CIAEEDAKFYASEVVAALEYLHLLG..........F +KIN1_SCHPO/125-395 ..............KLKEKQARKFERQIGSALSYLHQNS..........V +KIR1_HUMAN/208-495 ...............LDTVSCLRIVLSIASGLAHLHIEIFGTQGK..PAI +KKIA_HUMAN/5-288 ..............GVPEHLVKSITWQTLQAVNFCHKHN..........C +KKL6_YEAST/192-508 ..............RLEAWLIQRIFTEVVLAVKYLHENS..........I +KMIL_AVIMH/82-339 ..............KFQMFQLIDIARQTAQGMDYLHAKN..........I +KML2_CHICK/1453-1708 .F............ELTERECIKYMRQISEGVEYIHKQG..........I +KMOS_CERAE/60-338 PEGDAGEPHCSTGGPLTLGKCLKYSLDVVNGLLFLHSQS..........I +KPBH_RAT/24-291 ..............ALSEKETRSIMRSLLEAVNFLHVNN..........I +KPIM_HUMAN/38-290 ..............ALQEELARSFFWQVLEAVRHCHNCG..........V +KPK2_PLAFK/111-364 ..............PYTEQVAKKAMKRVLIALEALHSNG..........V +KPRO_MAIZE/534-812 ............NILLDWEGRFNIALGVAKGLAYLHHECL.......EWV +KR1_HSV11/191-478 ..............PLGRPQIAAVSRQLLSAVDYIHRQG..........I +KR1_PRVKA/53-332 G.............PLDMRDAGCVIRAVLRGLAYLHGMR..........I +KS61_MOUSE/407-664 ..............FFSEREASFVLHTISKTVEYLHSQG..........V +KYK1_DICDI/1289-1559 ............LLEQNPHIRLKLALDIAKGMNYLHGWTP.......P.I +KYK2_DICDI/108-364 ..S.......TPKVKYSFPLVLKMATDMALGLLHLHSIT..........I +MAK_RAT/4-284 ..............LFPESVIRNIMYQILQGLAFIHKHG..........F +MEK1_YEAST/162-444 T.............SMSETESLLIVFQILQALNYLHDQD..........I +MET_HUMAN/1078-1337 ..............NPTVKDLIGFGLQVAKGMKYLASKK..........F +MKK1_YEAST/221-488 ..............RISEKVLGKIAEAVLRGLSYLHEKK..........V +MLK1_HUMAN/3-262 ...............IPPDILVNWAVQIARGMNYLHDEAI.......VPI +NINL_DROME/16-282 ..............RMREEHIAYIIRETCRAAIELNRNH..........V +NPR1_YEAST/438-742 ...............MSYEEICCCFKQILTGVQYLHSIG..........L +PHY_CERPU/1004-1282 NED.......HDSPPFSILEVVDIILQTSEGMNYLHEKG..........I +PKD1_DICDI/36-291 ..............CFTEQTTKLIVAEIVLAIEYLHAEN..........I +PKN1_MYXXA/59-321 
...............LAAGAVVSVLSQVCDALQAAHARG..........I +RAN1_SCHPO/18-295 ..............QGNSHLIKTVFLQLISAVEHCHSVG..........I +RYK_HUMAN/327-593 EAN........NPQAISQQDLVHMAIQIACGMSYLARRE..........V +SGV1_YEAST/60-366 .............INLEMCDIKNMMLQILEGLNYIHCAK..........F +SPK1_YEAST/198-466 ..............AVGEDAGREISRQILTAIKYIHSMG..........I +ST20_YEAST/620-871 ..............LT.EGQIGAVCRETLSGLEFLHSKG..........V +STE7_YEAST/191-466 SSK.......K..TWFNELTISKIAYGVLNGLDHLYRQYK.........I +SYK_PIG/364-619 ..............HVKDKNIIELVHQVSMGMKYLEECN..........F +TOP_DROME/938-1194 ..............KIGSKALLNWSTQIAKGMSYLEEKR..........L +TRKA_HUMAN/504-775 AKLLAGGED.VAPGPLGLGQLLAVASQVAAGMVYLAGLH..........F +TTK_HUMAN/509-775 ..............SIDPWERKSYWKNMLEAVHTIHQHG..........I +WEE1_HUMAN/299-569 MS............YFKEAELKDLLLQVGRGLRYIHSMS..........L + +7LES_DROME/2209-2481 VHRDLACRNCLVTESTGSTD............RRRTVKIGDFGLARDIYK +ABL1_CAEEL/296-547 IHRDLAARNCLVSEH...................NIVKIADFGLARFMKE +ARK1_BOVIN/191-453 VYRDLKPANILLDEH...................GHVRISDLGLACDFS. +AVR2_HUMAN/192-479 SHRDIKSKNVLLKNN...................LTACIADFGLALKFEA +BFR2_HUMAN/367-643 IHRDLAARNVLVTEN...................NVMKIADFGLARDINN +BYR1_SCHPO/66-320 IHRDLKPSNVVVNSR...................GEIKLCDFGVSGELVN +BYR2_SCHPO/394-658 VHRDIKGANILVDNK...................GKIKISDFGISKKLEL +CC15_YEAST/25-272 IHRDIKAANILLSAD...................NTVKLADFGVSTIVN. +CC21_MEDSA/1-284 LHRDLKPQNLLIDRR..................TNSLKLADFGLARAFG. +CC5_YEAST/82-337 IHRDLKLGNIFFDSN...................YNLKIGDFGLAAVLAN +CDPK_SOYBN/34-292 MHRDLKPENFLFDTID.....E...........DAKLKATDFGLSVFYK. +CDR1_SCHPO/12-258 RHRDLKLENILIKVN..................EQQIKIADFGMATVEP. +CHK1_SCHPO/10-272 AHRDLKPENILLDYN...................GNLKISDFGFASLFSY +CLK1_MOUSE/160-476 THTDLKPENILFVKSDYTEAYNPKMKRDERTIVNPDIKVVDFGSATYDD. +CTK1_YEAST/183-469 LHRDVKGSNILIDNQ...................GNLKITDFGLARKMN. 
+ERK1_CANAL/68-371 LHRDLKPSNLLLNSN...................CDLKICDFGLARSIAS +ERK3_HUMAN/20-312 LHRDLKPANIFISTED..................LVLKIGDFGLARIVDQ +FUSE_DROME/4-254 LHRDLKPQNVLLDKN...................MHAKLCDFGLARNMT. +HR25_YEAST/9-273 IHRDIKPDNFLMGVGRR................GSTVHVIDFGLSKKYRD +JAK1_HUMAN/571-833 VHGNVCTKNLLLAREG..................IDSECGPFIKLSDPG. +JAK1_HUMAN/864-1137 VHRDLAARNVLVESE...................HQVKIGDFGLTKAIET +KAB7_YEAST/1096-1354 VHRDIKDENVIVDSK...................GFVKIIDFGSAAYVK. +KAKT_MLVAT/171-429 VYRDLKLENLMLDKD...................GHIKITDFGLCKEGIK +KC21_CHICK/39-324 MHRDVKPHNVMIDHEH..................RKLRLIDWGLAEFYHP +KCC4_MOUSE/42-296 VHRDLKPENLLYATP.A....P...........DAPLKIADFGLSKIV.. +KCR8_YEAST/316-590 CHRDLKPENLLLTHD...................GVLKITDFGNSECFKM +KG3A_RAT/119-403 CHRDIKPQNLLVDPDT..................AVLKLCDFGSAKQLVR +KGP1_DROME/457-717 IYRDLKPENLMLDER...................GYVKIVDFGFAKQIG. +KI28_YEAST/7-290 LHRDLKPNNLLFSPD...................GQIKVADFGLARAIP. +KI82_YEAST/324-602 IYRDLKPENILLHQS...................GHVMLSDFDLSIQATG +KIN1_SCHPO/125-395 VHRDLKIENILISKT...................GDIKIIDFGLSNLYR. +KIR1_HUMAN/208-495 AHRDLKSKNILVKKN...................GQCCIADLGLAVMHSQ +KKIA_HUMAN/5-288 IHRDVKPENILITKH...................SVIKLCDFGFARLLTG +KKL6_YEAST/192-508 IHRDLKLENILLKYSFDDINSFRDSPIY...CKQNFIELADFGLCKKIE. +KMIL_AVIMH/82-339 IHRDMKSNNIFLHGG...................LTVKIGDFGLATVKSR +KML2_CHICK/1453-1708 VHLDLKPENIMCVNKTG.................TSIKLIDFGLARRLE. +KMOS_CERAE/60-338 VHLDLKPANILISEQ...................DVCKISDFGCSEKLED +KPBH_RAT/24-291 VHRDLKPENILLDDN...................MQIRLSDFGFSCHLE. +KPIM_HUMAN/38-290 LHRDIKDENILIDLN..................RGELKLIDFGSGALLK. +KPK2_PLAFK/111-364 VHRDLKMENLMLENPN.....D...........PSSLKIIDFGLASFLN. +KPRO_MAIZE/534-812 IHCDVKPENILLDQA...................FEPKITDFGLVKLLNR +KR1_HSV11/191-478 IHRDIKTENIFINTP...................EDICLGDFGAACFVQG +KR1_PRVKA/53-332 MHRDVKAENIFLEDV...................DTVCLGDLGAARCN.. 
+KS61_MOUSE/407-664 VHRDLKPSNILYVDESG....N...........PECLRICDFGFAKQLR. +KYK1_DICDI/1289-1559 LHRDLSSRNILLDHNIDPKNPVVSS......RQDIKCKISDFGLSRLKKE +KYK2_DICDI/108-364 VHRDLTSQNILLDEL...................GNIKISDFGLSAEKSR +MAK_RAT/4-284 FHRDMKPENLLCMGP...................ELVKIADFGLARELR. +MEK1_YEAST/162-444 VHRDLKLDNILLCTP.E....P...........CTRIVLADFGIAKDLN. +MET_HUMAN/1078-1337 VHRDLAARNCMLDEK...................FTVKVADFGLARDMYD +MKK1_YEAST/221-488 IHRDIKPQNILLNEN...................GQVKLCDFGVSGEAV. +MLK1_HUMAN/3-262 IHRDLKSSNILILQKVENGDLS...........NKILKITDFGLAREWH. +NINL_DROME/16-282 LHRDIRGDNILLTKN...................GRVKLCDFGLSRQVDS +NPR1_YEAST/438-742 AHRDLKLDNCVINEK...................GIVKLIDFGAAVVFSY +PHY_CERPU/1004-1282 IHRDLKSMNILVKSVKVTKSEIG..........YVHVKVADFGLSKTKDS +PKD1_DICDI/36-291 IYRDLKPENILIDEK...................GHIKLTDFGFSKKTVG +PKN1_MYXXA/59-321 VHRDLKPDNIFLVRRNGN...............APFVKVLDFGIAKLADA +RAN1_SCHPO/18-295 YHRDLKPENIMVGNDG..................NTVYLADFGLATTEPY +RYK_HUMAN/327-593 IHKDLAARNCVIDDT...................LQVKITDNALSRDLFP +SGV1_YEAST/60-366 MHRDIKTANILIDHN...................GVLKLADFGLARLYYG +SPK1_YEAST/198-466 SHRDLKPDNILIEQD......D...........PVLVKITDFGLAKVQG. +ST20_YEAST/620-871 LHRDIKSDNILLSME...................GDIKLTDFGFCAQIN. +STE7_YEAST/191-466 IHRDIKPSNVLINSK...................GQIKLCDFGVSKKLIN +SYK_PIG/364-619 VHRDLAARNVLLVTQ...................HYAKISDFGLSKALRA +TOP_DROME/938-1194 VHRDLAARNVLVQTP...................SLVKITDFGLAKLLSS +TRKA_HUMAN/504-775 VHRDLATRNCLVGQG...................LVVKIGDFGMSRDIYS +TTK_HUMAN/509-775 VHSDLKPANFLIVDG....................MLKLIDFGIANQMQP +WEE1_HUMAN/299-569 VHMDIKPSNIFISRTSI....P...........NAASEEGDEDDWASN.. + +7LES_DROME/2209-2481 S.................DYYRKEGEGLLPVRWMSPES............ +ABL1_CAEEL/296-547 D.................TYTAHAGAKFPIKWT.APEG............ +ARK1_BOVIN/191-453 ...................KKKPHASVGTHGYM.APEV............ +AVR2_HUMAN/192-479 G................KSAGDTHGQVGTRRYM.APEV............ 
+BFR2_HUMAN/367-643 .................IDYYKKTTNGRLPVKWMAPEA............ +BYR1_SCHPO/66-320 ....................SVAQTFVGTSTYM.SPER............ +BYR2_SCHPO/394-658 N............STSTKTGGARPSFQGSSFWM.APEV............ +CC15_YEAST/25-272 ....................SSALTLAGTLNWM.APEI............ +CC21_MEDSA/1-284 .................IPVRTFTHEVVTLWYR.APEIL........... +CC5_YEAST/82-337 ..................ESERKYTICGTPNYI.APEV............ +CDPK_SOYBN/34-292 ..................PGESFCDVVGSPYYV.APEV............ +CDR1_SCHPO/12-258 ..................NDSCLENYCGSLHYL.APEI............ +CHK1_SCHPO/10-272 K................GKSRLLNSPVGSPPYA.APEI............ +CLK1_MOUSE/160-476 ....................EHHSTLVSTRHYR.APEV............ +CTK1_YEAST/183-469 ..................SRADYTNRVITLWYR.PPEL............ +ERK1_CANAL/68-371 Q...............EDNYGFMTEYVATRWYR.APEI............ +ERK3_HUMAN/20-312 HYS...............HKGYLSEGLVTKWYR.SPRL............ +FUSE_DROME/4-254 .................LGTHVLTSIKGTPLYM.APEL............ +HR25_YEAST/9-273 FN...........THRHIPYRENKSLTGTARYA.SVNT............ +JAK1_HUMAN/571-833 ................IPITVLSRQECIERIPWIAPEC............ +JAK1_HUMAN/864-1137 D................KEYYTVKDDRDSPVFWYAPEC............ +KAB7_YEAST/1096-1354 ...................SGPFDVFVGTIDYA.APEV............ +KAKT_MLVAT/171-429 ..................DGATMKTFCGTPEYL.APEV............ +KC21_CHICK/39-324 ...................GQEYNVRVASRYFK.GPEL............ +KCC4_MOUSE/42-296 .................EHQVLMKTVCGTPGYC.APEI............ +KCR8_YEAST/316-590 AWEK..............NIHLSGGVCGSSPYI.APEEY........... +KG3A_RAT/119-403 ...................GEPNVSYICSRYYR.APEL............ +KGP1_DROME/457-717 ..................TSSKTWTFCGTPEYV.APEI............ +KI28_YEAST/7-290 .................APHEILTSNVVTRWYR.APEL............ +KI82_YEAST/324-602 SKKPTMKDSTYLDTKICSDGFRTNSFVGTEEYL.APEV............ +KIN1_SCHPO/125-395 ..................RQSRLRTFCGSLYFA.APEL............ +KIR1_HUMAN/208-495 STN..............QLDVGNNPRVGTKRYM.APEV............ 
+KKIA_HUMAN/5-288 ..................PSDYYTDYVATRWYR.SPEL............ +KKL6_YEAST/192-508 ..................NNEMCTARCGSEDYV.SPEI............ +KMIL_AVIMH/82-339 WS................GSQQVEQPTGSILWM.APEVIR.......... +KML2_CHICK/1453-1708 ..................SAGSLKVLFGTPEFV.APEV............ +KMOS_CERAE/60-338 ...............LLCFQTPLYPLGGTYTHR.APEL............ +KPBH_RAT/24-291 ..................PGEKLRELCGTPGYL.APEI............ +KPIM_HUMAN/38-290 ...................DTVYTDFDGTRVYS.PPEW............ +KPK2_PLAFK/111-364 ...................SPSMNMRCGSPGYV.APEI............ +KPRO_MAIZE/534-812 G................GSTQNVSHVRGTLGYI.APEW............ +KR1_HSV11/191-478 S................RSSPFPYGIAGTIDTN.APEV............ +KR1_PRVKA/53-332 .................VAAPNFYGLAGTIETN.APEV............ +KS61_MOUSE/407-664 .................AENGLLMTPCYTANFV.APEV............ +KYK1_DICDI/1289-1559 ..................QASQMTQSVGCIPYM.APEV............ +KYK2_DICDI/108-364 E................GSMTMTNGGICNPRWR.PPEL............ +MAK_RAT/4-284 ..................SQPPYTDYVSTRWYR.APEVL........... +MEK1_YEAST/162-444 .................SNKERMHTVVGTPEYC.APEVGFRANRKAYQSF +MET_HUMAN/1078-1337 K...............EYYSVHNKTGAKLPVKWMALES............ +MKK1_YEAST/221-488 ...................NSLATTFTGTSFYM.APER............ +MLK1_HUMAN/3-262 ...................RTTKMSAAGTYAWM.APEV............ +NINL_DROME/16-282 ..................TLGKRGTCIGSPCWM.APEVVS.......... +NPR1_YEAST/438-742 P..............FSKNLVEASGIVGSDPYL.APEVC........... +PHY_CERPU/1004-1282 S................TRYSNQTWNRGTNRWM.APEVINLG........ +PKD1_DICDI/36-291 ...................GKNTSSVCGTFDYM.APEI............ +PKN1_MYXXA/59-321 H................MPQTHAGIIVGTPEYM.APEQ............ +RAN1_SCHPO/18-295 SSDFGCG..SLFYMSPECQREVKKLSSLSDMLPVTPEP............ +RYK_HUMAN/327-593 .................MDYHCLGDNENRPVRWMALES............ +SGV1_YEAST/60-366 CPPNLKYPG......GAGSGAKYTSVVVTRWYR.APELV........... +SPK1_YEAST/198-466 ..................NGSFMKTFCGTLAYV.APEVIR.......... 
+ST20_YEAST/620-871 .................ELNLKRTTMVGTPYWM.APEV............ +STE7_YEAST/191-466 ....................SIADTFVGTSTYM.SPER............ +SYK_PIG/364-619 D................ENYYKAQTHGKWPVKWYAPEC............ +TOP_DROME/938-1194 D.................SNEYKAAGGKMPIKWLALEC............ +TRKA_HUMAN/504-775 .................TDYYRVGGRTMLPIRWMPPES............ +TTK_HUMAN/509-775 ................DTTSVVKDSQVGTVNYM.PPEAIKDMS....... +WEE1_HUMAN/299-569 ..................KVMFKIGDLGHVTRISSPQV............ + +7LES_DROME/2209-2481 .....LVD......GLFTTQSDVWAFGVLCWEILTLG............. +ABL1_CAEEL/296-547 .....LAF......NTFSSKSDVWAFGVLLWEIATYG............. +ARK1_BOVIN/191-453 .....LQKG.....VAYDSSADWFSLGCMLFKLLRG.............. +AVR2_HUMAN/192-479 .....LEGAIN.FQRDAFLRIDMYAMGLVLWELASRCTAADG........ +BFR2_HUMAN/367-643 .....LFD......RVYTHQSDVWSFGVLMWEIFTLG............. +BYR1_SCHPO/66-320 .....IRG......GKYTVKSDIWSLGISIIELATQ.............. +BYR2_SCHPO/394-658 .....VKQ......TMHTEKTDIWSLGCLVIEMLTS.............. +CC15_YEAST/25-272 .....LGN......RGASTLSDIWSLGATVVEMLTK.............. +CC21_MEDSA/1-284 .....LGS......RHYSTPVDVWSVGCIFAEMANRRP............ +CC5_YEAST/82-337 .....LMG....KHSGHSFEVDIWSLGVMLYALLIG.............. +CDPK_SOYBN/34-292 .....LR.......KLYGPESDVWSAGVILYILLS............... +CDR1_SCHPO/12-258 .....VSHK.....PYRGAPADVWSCGVILYSLLSN.............. +CHK1_SCHPO/10-272 ......TQ......QYDGSKVDVWSCGIILFALLLG.............. +CLK1_MOUSE/160-476 .....ILA......LGWSQPCDVWSIGCILIEYYLGFTVFPTHD...... +CTK1_YEAST/183-469 .....LLG.....TTNYGTEVDMWGCGCLLVELFNKTAIFQ......... +ERK1_CANAL/68-371 .....MLT.....FQEYTTAIDVWSVGCILAEMLSGRPLFPGRDYHNQLW +ERK3_HUMAN/20-312 .....LLSP.....NNYTKAIDMWAAGCILAEMLTG.............. +FUSE_DROME/4-254 .....LAD......EPYDHHADMWSLGCIAYESMAG.............. +HR25_YEAST/9-273 .....HLG......IEQSRRDDLESLGYVLIYFCKG.............. +JAK1_HUMAN/571-833 .....VEDS.....KNLSVAADKWSFGTTLWEICYNG............. +JAK1_HUMAN/864-1137 .....LMQ......SKFYIASDVWSFGVTLHELLTYCD............ 
+KAB7_YEAST/1096-1354 .....LGGN.....PYEGQPQDIWAIGILLYTVVFK.............. +KAKT_MLVAT/171-429 .....LED......NDYGRAVDWWGLGVVMYEMMCG.............. +KC21_CHICK/39-324 .....LVDY.....QMYDYSLDMWSLGCMLASMIFRKEP........... +KCC4_MOUSE/42-296 .....LRG......CAYGPEVDMWSVGIITYILL................ +KCR8_YEAST/316-590 .....IKE......EFDPRPVDIWACGVIYMAMRTG.............. +KG3A_RAT/119-403 .....IFG.....ATDYTSSIDVWSAGCVLAELLLGQPIFP......... +KGP1_DROME/457-717 .....ILN......KGHDRAVDYWALGILIHELLNG.............. +KI28_YEAST/7-290 .....LFG.....AKHYTSAIDIWSVGVIFAELMLRIPYLP......... +KI82_YEAST/324-602 .....IRG......NGHTAAVDWWTLGILIYEMLFG.............. +KIN1_SCHPO/125-395 .....LNAQ.....PYIGPEVDVWSFGIVLYVLVCG.............. +KIR1_HUMAN/208-495 .....LDETIQVDCFDSYKRVDIWAFGLVLWEVARRMVSNG......... +KKIA_HUMAN/5-288 .....LVG.....DTQYGPPVDVWAIGCVFAELLSGVPLWP......... +KKL6_YEAST/192-508 .....LMG.....VPYDGHLSDTWALGVILYSLFED.............. +KMIL_AVIMH/82-339 .....MQDS.....NPFSFQSDVYSYGIVLYELMTG.............. +KML2_CHICK/1453-1708 .....INY......EPIGYETDMWSIGVICYILVSG.............. +KMOS_CERAE/60-338 .....LKG......EGVTPKADIYSFAITLWQMTTK.............. +KPBH_RAT/24-291 .....LKCSMDETHPGYGKEVDLWACGVILFTLLAG.............. +KPIM_HUMAN/38-290 .....IRYH.....RYHGRSAAVWSLGILLYDMVCG.............. +KPK2_PLAFK/111-364 .....LKC......ASYGTKVDIFSLGVILFNIL................ +KPRO_MAIZE/534-812 .....VSS......LPITAKVDVYSYGVVLLELLTG.............. +KR1_HSV11/191-478 .....LAG......DPYTTTVDIWSAGLVIFETAVHNASLFSAPRGP... +KR1_PRVKA/53-332 .....LAR......DRYDTKVDVWGAGVVLFETLAYPKTITGG.DEP... +KS61_MOUSE/407-664 .....LKR......QGYDEGCDIWSLGILLYTMLAG.............. +KYK1_DICDI/1289-1559 .....FKG......DSNSEKSDVYSYGMVLFELLTS.............. +KYK2_DICDI/108-364 .....TKN.....LGHYSEKVDVYCFSLVVWEILTG.............. +MAK_RAT/4-284 .....LRS......SVYSSPIDVWAVGSIMAELYTFRPLFPG........ +MEK1_YEAST/162-444 SRAATLEQ......RGYDSKCDLWSLGVITHIMLTG.............. +MET_HUMAN/1078-1337 .....LQT......QKFTTKSDVWSFGVVLWELMTRG............. 
+MKK1_YEAST/221-488 .....IQG......QPYSVTSDVWSLGLTILEVANG.............. +MLK1_HUMAN/3-262 .....IRA......SMFSKGSDVWSYGVLLWELLTG.............. +NINL_DROME/16-282 ....AMESR....EPDITVRADVWALGITTIELADG.............. +NPR1_YEAST/438-742 .....IFA......KYDPRPVDIWSSAIIFACMILKKFPWKIPKLRDNSF +PHY_CERPU/1004-1282 YESTEGEISFDGKVPKYPLKSDVYSFGMVCYEVLTG.............. +PKD1_DICDI/36-291 .....LNSS.....NGHGKPVDWWALGVVVYELVTG.............. +PKN1_MYXXA/59-321 .....SLG......RGVDGRADLYALGVIAYQLLTG.............. +RAN1_SCHPO/18-295 .....IESQ...SSSFATAPNDVWALGIILINLCCK.............. +RYK_HUMAN/327-593 .....LVN......NEFSSASDVWAFGVTLWELMTLG............. +SGV1_YEAST/60-366 .....LGD......KQYTTAVDIWGVGCVFAEFFEKKP............ +SPK1_YEAST/198-466 GKDTSVSPDEYEERNEYSSLVDMWSMGCLVYVILTG.............. +ST20_YEAST/620-871 .....VSR......KEYGPKVDIWSLGIMIIEMIEG.............. +STE7_YEAST/191-466 .....IQG......NVYSIKGDVWSLGLMIIELVTG.............. +SYK_PIG/364-619 .....INY......YKFSSKSDVWSFGVLMWEAFSYG............. +TOP_DROME/938-1194 .....IRN......RVFTSKSDVWAFGVTIWELLTFG............. +TRKA_HUMAN/504-775 .....ILY......RKFTTESDVWSFGVVLWEIFTYG............. +TTK_HUMAN/509-775 .....SSRENGKSKSKISPKSDVWSLGCILYYMTYG.............. +WEE1_HUMAN/299-569 .....EEG......DSRFLANEVLQENYTHLPKA....DIFAL....... + +7LES_DROME/2209-2481 .....................QQPYAAR.......NNFEVLAHVKEG... +ABL1_CAEEL/296-547 .....................MAPYPG........VELSNVYGLLENG.. +ARK1_BOVIN/191-453 .....................HSPFRQ........HKTKDKHEIDRMT.. +AVR2_HUMAN/192-479 ................PVDEYMLPFEEEIGQHPSLEDMQEVVVHKKKR.. +BFR2_HUMAN/367-643 .....................GSPYPG........IPVEELFKLLKEG.. +BYR1_SCHPO/66-320 .....................ELPWSFSNI.DDSIGILDLLHCIVQE... +BYR2_SCHPO/394-658 .....................KHPYPN.......CDQMQAIFRIGEN... +CC15_YEAST/25-272 .....................NPPYHN........LTDANIYYAVEN... +CC21_MEDSA/1-284 ...LSPGDSEIDELFKIFRILGTPNED......TWPGVTSLPDFKSTFPR +CC5_YEAST/82-337 .....................KPPFQAR.......DVNTIYERIKCR... 
+CDPK_SOYBN/34-292 ....GVPP...........FWAESEP.........GIFRQILLGKLD... +CDR1_SCHPO/12-258 .....................KLPFGG........QNTDVIYNKIRHG.. +CHK1_SCHPO/10-272 .....................NTPWDEA......ISNTGDYLLYKKQCE. +CLK1_MOUSE/160-476 ..........SREHLAMMERILGPLPKHMIQKTRKRRYFHHDRLDWDEHS +CTK1_YEAST/183-469 ......GSNELEQIESIFKIMGTPTINSWP.TLYDMPWFFMIMPQQTTKY +ERK1_CANAL/68-371 LIMEVLGTPNMEDYYNIKSKRAREYIRSLP.FCKKIPFSELFANTNNN.. +ERK3_HUMAN/20-312 ...RMLFAGAHELEQMQLILETIPVIRE....EDKDELLRVMPSFVSSTW +FUSE_DROME/4-254 .....................QPPFCA........SSILHLVKMIKH... +HR25_YEAST/9-273 .....................SLPWQG....LKATTKKQKYDRIMEKK.. +JAK1_HUMAN/571-833 .....................EIPLKD........KTLIEKERFYES... +JAK1_HUMAN/864-1137 ...................SDSSPMALFLKMIGPTHGQMTVTRLVNTLK. +KAB7_YEAST/1096-1354 .....................ENPFY..........NIDEILEGDLK... +KAKT_MLVAT/171-429 .....................RLPFYN........QDHEKLFELILM... +KC21_CHICK/39-324 ..FFHGHDNYDQLVRIAKVLGTEDLYDYID.KYNIELDPRFNDILGRHSR +KCC4_MOUSE/42-296 ......CG.............FEPFYD........ERGDQFMFRRILNC. +KCR8_YEAST/316-590 .....................RQLWSSAE...KDDPFYMNYLKGRKEK.. +KG3A_RAT/119-403 ......GDSGVDQLVEIIKVLGTPTREQIR.EMNPNYTEFKFPQIKAHP. +KGP1_DROME/457-717 .....................TPPFSAP.......DPMQTYNLILKG... +KI28_YEAST/7-290 ......GQNDVDQMEVTFRALGTPTDRDWP.EVSSFMTYNKLQIYPPPS. +KI82_YEAST/324-602 .....................CTPFKG........DNSNETFSNILT... +KIN1_SCHPO/125-395 .....................KVPFDD........QNMSALHAKIKK... +KIR1_HUMAN/208-495 ................IVEDYKPPFYDVVPNDPSFEDMRKVVCVDQQ... +KKIA_HUMAN/5-288 ....GKSDVDQLYLIRKTLGDLIPRHQ......QVFSTNQYFSGVKIPDP +KKL6_YEAST/192-508 .....................RLPFDPP.....PNASARQRSRATSHR.. +KMIL_AVIMH/82-339 .....................ELPYSHIN...NRDQIIFMVGRGYASP.. +KML2_CHICK/1453-1708 .....................LSPFMG....DNDNETLANVTSATWD... +KMOS_CERAE/60-338 .....................QAPYSGE.....RQHILYAVVAYDLRP.. +KPBH_RAT/24-291 ......SP...............PFWHR....RQILMLRMIMEGQYQ... 
+KPIM_HUMAN/38-290 .....................DIPFEH..........DEEIIRGQVF... +KPK2_PLAFK/111-364 ......CG.............YPPFRG........NNVKEIFKKNMR.C. +KPRO_MAIZE/534-812 ............TRVSELVGGTDEVHSMLR.KLVRMLSAKLEGEEQSWID +KR1_HSV11/191-478 ..KRGPCDSQITRIIRQAQVHVDEFSPHPE.SRLTSRYRSRAAGNNRPP. +KR1_PRVKA/53-332 .AINGEMH..LIDLIRALGVHPEEFP..PD.TRLRSEFVRYAGTHRQP.. +KS61_MOUSE/407-664 ..YTPFAN............GPSDTP........EEILTRIGSGKFT... +KYK1_DICDI/1289-1559 .....................DEPQQD.......MKPMKMAHLAAYES.. +KYK2_DICDI/108-364 .....................EIPFSD.......LDGSQRSAQVAYAG.. +MAK_RAT/4-284 .......TSEVDEIFKICQVLGTPKKSD.....WPEGYQLASSMNFRFPQ +MEK1_YEAST/162-444 .....................ISPFYG........DGSERSIIQNAKIG. +MET_HUMAN/1078-1337 .....................APPYPD.......VNTFDITVYLLQG... +MKK1_YEAST/221-488 .....................KFPCSSEKM.AANIAPFELLMWILTFTP. +MLK1_HUMAN/3-262 .....................EVPFRG.......IDGLRVAYGVAMN... +NINL_DROME/16-282 .....................KPPFAD.......MHPTRAMFQIIRNP.. +NPR1_YEAST/438-742 KLFCSGRD.CDSLSSLVTRTPDPPSYDESH.STEKKKPESSSNNVSDPNN +PHY_CERPU/1004-1282 .....................DVPFPE.......EKNPNNVKRMVLEG.. +PKD1_DICDI/36-291 .....................KLPFS.........NSKESLLNRKAD... +PKN1_MYXXA/59-321 .....................RLPFND.......EGLAAQLVAHQLRPP. +RAN1_SCHPO/18-295 .....................RNPWKR......ACSQTDGTYRSYVHN.. +RYK_HUMAN/327-593 .....................QTPYVD.......IDPFEMAAYLKDG... +SGV1_YEAST/60-366 ...ILQGKTDIDQGHVIFKLLGTPTEEDWA.VARYLPGAELTTTNYKP.. +SPK1_YEAST/198-466 .....................HLPFSG....STQDQLYKQIGRGSYH... +ST20_YEAST/620-871 .....................EPPYLN.......ETPLRALYLIATNG.. +STE7_YEAST/191-466 .....................EFPLGG......HNDTPDGILDLLQRIVN +SYK_PIG/364-619 .....................QKPYRG........MKGSEVSAMLEKG.. +TOP_DROME/938-1194 .....................QRPHEN........IPAKDIPDLIEVG.. +TRKA_HUMAN/504-775 .....................KQPWYQ.......LSNTEAIDCITQG... +TTK_HUMAN/509-775 .....................KTPFQQ......IINQISKLHAIIDPN.. 
+WEE1_HUMAN/299-569 .ALTVVCA...........AGAEPLP.........RNGDQWHEIRQG... + +7LES_DROME/2209-2481 ..........GRLQQPPMCTEK....LYSLLLLCWRTDPWERP.....SF +ABL1_CAEEL/296-547 ..........FRMDGPQGCPPS....VYRLMLQCWNWSPSDRP.....RF +ARK1_BOVIN/191-453 ........LTMAVELPDSFSPE....LRSLLEGLLQRDVNRRLGCLGRGA +AVR2_HUMAN/192-479 .........PVLRDYWQKHAGMA..MLCETIEECWDHDAEARL.....SA +BFR2_HUMAN/367-643 ..........HRMDKPANCTNE....LYMMMRDCWHAVPSQRP.....TF +BYR1_SCHPO/66-320 ..........EPPRLPSSFPED....LRLFVDACLHKDPTLRAS.....P +BYR2_SCHPO/394-658 ..........ILPEFPSNISSS....AIDFLEKTFAIDCNLRP.....TA +CC15_YEAST/25-272 ..........DTYYPPSSFSEP....LKDFLSKCFVKNMYKRP.....TA +CC21_MEDSA/1-284 W......PSKDLATVVPNLEPA....GLDLLNSMLCLDPTKRI.....TA +CC5_YEAST/82-337 .........DFSFPRDKPISDE....GKILIRDILSLDPIERP.....SL +CDPK_SOYBN/34-292 ..........FHSEPWPSISDS....AKDLIRKMLDQNPKTRL.....TA +CDR1_SCHPO/12-258 ...........AYDLPSSISSA....AQDLLHRMLDVNPSTRI.....TI +CHK1_SCHPO/10-272 .........RPSYHPWNLLSPG....AYSIITGMLRSDPFKRY.....SV +CLK1_MOUSE/160-476 SAGRYVSRRCKPLKEFMLSQDAEHEFLFDLVGKILEYDPAKRI.....TL +CTK1_YEAST/183-469 V......NNFSEKFKSVLPSSK....CLQLAINLLCYDQTKRF.....SA +ERK1_CANAL/68-371 .......TSTSNTGGRTNINPL....ALDLLEKLLIFNPAKRI.....TV +ERK3_HUMAN/20-312 E......VKRPLRKLLPEVNSE....AIDFLEKILTFNPMDRL.....TA +FUSE_DROME/4-254 ..........EDVKWPSTLTCE....CRSFLQGLLEKDPGLRI.....SW +HR25_YEAST/9-273 ..........LNVSVETLCSGLP.LEFQEYMAYCKNLKFDEKP......D +JAK1_HUMAN/571-833 ..........RCRPVTPSCKE.....LADLMTRCMNYDPNQRP.....FF +JAK1_HUMAN/864-1137 ........EGKRLPCPPNCPDE....VYQLMRKCWEFQPSNRT.....SF +KAB7_YEAST/1096-1354 ............FNNAEEVSED....CIELIKSILNRCVPKRP.....TI +KAKT_MLVAT/171-429 ..........EEIRFPRTLGPE....AKSLLSGLLKKDPTQRLGGGSEDA +KC21_CHICK/39-324 KR....WERFVHSENQHLVSPE....ALDFLDKLLRYDHQSRL.....TA +KCC4_MOUSE/42-296 .......EYYFISPWWDEVSLN....AKDLVKKLIVLDPKKRL.....TT +KCR8_YEAST/316-590 .........GGYEPIESLKRAR....CRNVIYSMLDPVPYRRI.....NG +KG3A_RAT/119-403 ..........WTKVFKSRTPPE....AIALCSSLLEYTPSSRL.....SP +KGP1_DROME/457-717 
.........IDMIAFPKHISRW....AVQLIKRLCRDVPSERLGYQTGGI +KI28_YEAST/7-290 ........RDELRKRFIAASEY....ALDFMCGMLTMNPQKRW.....TA +KI82_YEAST/324-602 ........KDVKFPHDKEVSKN....CKDLIKKLLNKNEAKRLGSKS.GA +KIN1_SCHPO/125-395 ..........GTVEYPSYLSSD....CKGLLSRMLVTDPLKRA.....TL +KIR1_HUMAN/208-495 ..........RPNIPNRWFSDPTLTSLAKLMKECWYQNPSARL.....TA +KKIA_HUMAN/5-288 E......DMEPLELKFPNISYP....ALGLLKGCLHMDPTERL.....TC +KKL6_YEAST/192-508 ........IARFDWRWYRLSDY....KTNVG.KQIVENTLTRKN.QRWSI +KMIL_AVIMH/82-339 ..........DLSKLYKNCPKA....MKRLVADCLKKVREERP.....LF +KML2_CHICK/1453-1708 ..........FDDEAFDEISDD....AKDFISNLLKKDMKSRL.....NC +KMOS_CERAE/60-338 ........SLSAAVFQDSLPGQ...RLGDVIRRCWRPSAAQRP.....SA +KPBH_RAT/24-291 ..........FSSPEWDDRSNT....VKDLIAKLLQVDPNARL.....TA +KPIM_HUMAN/38-290 ..............FRQRVSSE....CQHLIRWCLALRPSDRP.....TF +KPK2_PLAFK/111-364 .......HISFNTKHWINKSES....VKEIILWMCCKNPDDRC.....TA +KPRO_MAIZE/534-812 GY....LDSKLNRPVNYVQART....LIKLAVSCLEEDRSKRP.....TM +KR1_HSV11/191-478 .......YTRPAWTRYYKMDID....VEYLVCKALTFDGALRP.....SA +KR1_PRVKA/53-332 .......YTQYARVARLGLPET....GAFLIYKMLTFDPVRRP.....SA +KS61_MOUSE/407-664 ..........LSGGNWNTVSET....AKDLVSKMLHVDPHQRL.....TA +KYK1_DICDI/1289-1559 ..........YRPPIPLTTSSK....WKEILTQCWDSNPDSRP.....TF +KYK2_DICDI/108-364 ..........LRPPIPEYCDPE....LKLLLTQCWEADPNDRP.....PF +MAK_RAT/4-284 C......IPINLKTLIPNASSE....AIQLMTEMLNWDPKKRP.....TA +MEK1_YEAST/162-444 .......KLNFKLKQWDIVSDN....AKSFVKDLLQTDVVKRL.....NS +MET_HUMAN/1078-1337 ..........RRLLQPEYCPDP....LYEVMLKCWHPKAEMRP.....SF +MKK1_YEAST/221-488 .......ELKDEPESNIIWSPS....FKSFIDYCLKKDSRERP.....SP +MLK1_HUMAN/3-262 .........KLALPIPSTCPEP....FAKLMEDCWNPDPHSRP.....SF +NINL_DROME/16-282 .........PPTLMRPTNWSKQ....INDFISESLEKNAENRP.....MM +NPR1_YEAST/438-742 .......VNIGPQRLLHSLPEE....TQHIVGRMIDLAPACRG.....NI +PHY_CERPU/1004-1282 ..........VRPDLPAHCPIE....LKALITDCWNQDPLKRP......S +PKD1_DICDI/36-291 .........FQLIFQNSYLSDE....IKDFIFQLLSVDPSKRLG..TFDS +PKN1_MYXXA/59-321 
.........PPPSSVYPAVSAA....LEHVILRALAKKPEDRY....ASI +RAN1_SCHPO/18-295 .........PSTLLSILPISRE....LNSLLNRIFDRNPKTRI.....TL +RYK_HUMAN/327-593 ..........YRIAQPINCPDE....LFAVMACCWALDPEERP.....KF +SGV1_YEAST/60-366 .........TLRERFGKYLSET....GLDFLGQLLALDPYKRL.....TA +SPK1_YEAST/198-466 ..........EGPLKDFRISEE....ARDFIDSLLQVDPNNRS.....TA +ST20_YEAST/620-871 .........TPKLKEPENLSSS....LKKFLDWCLCVEPEDRA.....SA +STE7_YEAST/191-466 .......EPSPRLPKDRIYSKE....MTDFVNRCCIKNERERS.....SI +SYK_PIG/364-619 ..........ERMGCPPGCPRE....MYELMTLCWTYDVENRP.....GF +TOP_DROME/938-1194 ..........LKLEQPEICSLD....IYCTLLSCWHLDAAMRP.....TF +TRKA_HUMAN/504-775 ..........RELERPRACPPE....VYAIMRGCWQREPQQRH.....SI +TTK_HUMAN/509-775 ..........HEIEFPDIPEKD....LQDVLKCCLKRDPKQRI.....SI +WEE1_HUMAN/299-569 .........RLPRIPQVLSQE.....FTELLKVMIHPDPERRP.....SA + +7LES_DROME/2209-2481 RRCYNTL... +ABL1_CAEEL/296-547 RDIHFNL... +ARK1_BOVIN/191-453 QEVKESPFF. +AVR2_HUMAN/192-479 GCVGERI... +BFR2_HUMAN/367-643 KQLVEDL... +BYR1_SCHPO/66-320 QQLCAMPYF. +BYR2_SCHPO/394-658 SELLSHPFV. +CC15_YEAST/25-272 DQLLKHVWI. +CC21_MEDSA/1-284 RSAVEHEYF. +CC5_YEAST/82-337 TEIMDYVWF. +CDPK_SOYBN/34-292 HEVLRHPWI. +CDR1_SCHPO/12-258 PEFFSHPFL. +CHK1_SCHPO/10-272 KHVVQHPWL. +CLK1_MOUSE/160-476 KEALKHPFF. +CTK1_YEAST/183-469 TEALQSDYF. +ERK1_CANAL/68-371 EDALKHPYL. +ERK3_HUMAN/20-312 EMGLQHPYM. +FUSE_DROME/4-254 TQLLCHPFV. +HR25_YEAST/9-273 YLFLARLF.. +JAK1_HUMAN/571-833 RAIMRDI... +JAK1_HUMAN/864-1137 QNLIEGF... +KAB7_YEAST/1096-1354 DDINNDKWL. +KAKT_MLVAT/171-429 KEIMQHRFF. +KC21_CHICK/39-324 REAMEHPYF. +KCC4_MOUSE/42-296 FQALQHPWV. +KCR8_YEAST/316-590 KQILNSEWG. +KG3A_RAT/119-403 LEACAHSFF. +KGP1_DROME/457-717 QDIKKHKWF. +KI28_YEAST/7-290 VQCLESDYF. +KI82_YEAST/324-602 ADIKRHPFF. +KIN1_SCHPO/125-395 EEVLNHPWM. +KIR1_HUMAN/208-495 LRIKKTL... +KKIA_HUMAN/5-288 EQLLHHPYF. +KKL6_YEAST/192-508 NEIYESPFV. +KMIL_AVIMH/82-339 PQILSSI... +KML2_CHICK/1453-1708 TQCLQHPWL. +KMOS_CERAE/60-338 RPLLVDL... +KPBH_RAT/24-291 EQALQHPFF. 
+KPIM_HUMAN/38-290 EEIQNHPWM. +KPK2_PLAFK/111-364 LQALGHQWF. +KPRO_MAIZE/534-812 EHAVQTL... +KR1_HSV11/191-478 AELLCLPLF. +KR1_PRVKA/53-332 DEILNFGMW. +KS61_MOUSE/407-664 KQVLQHPWI. +KYK1_DICDI/1289-1559 KQIIVHL... +KYK2_DICDI/108-364 TYIVNKL... +MAK_RAT/4-284 SQALKHPYF. +MEK1_YEAST/162-444 KQGLKHIWI. +MET_HUMAN/1078-1337 SELVSRI... +MKK1_YEAST/221-488 RQMINHPWI. +MLK1_HUMAN/3-262 TNILDQL... +NINL_DROME/16-282 VEMVEHPFL. +NPR1_YEAST/438-742 EEIMEDPWI. +PHY_CERPU/1004-1282 FAVICQKL.. +PKD1_DICDI/36-291 CSIRNHKWF. +PKN1_MYXXA/59-321 AAFRNALQVA +RAN1_SCHPO/18-295 PELSTLV... +RYK_HUMAN/327-593 QQLVQCL... +SGV1_YEAST/60-366 MSAKHHPWF. +SPK1_YEAST/198-466 AKALNHPWI. +ST20_YEAST/620-871 TELLHDEYI. +STE7_YEAST/191-466 HELLHHDLI. +SYK_PIG/364-619 VAVELRL... +TOP_DROME/938-1194 KQLTTVF... +TRKA_HUMAN/504-775 KDVHARL... +TTK_HUMAN/509-775 PELLAHPYV. +WEE1_HUMAN/299-569 MALVKHSVL. + diff --git a/forester/archive/RIO/others/hmmer/tutorial/rrm.hmm b/forester/archive/RIO/others/hmmer/tutorial/rrm.hmm new file mode 100644 index 0000000..509d6d6 --- /dev/null +++ b/forester/archive/RIO/others/hmmer/tutorial/rrm.hmm @@ -0,0 +1,237 @@ +HMMER2.0 +NAME rrm +DESC +LENG 72 +ALPH Amino +RF no +CS no +MAP yes +COM ../src/hmmbuild -F rrm.hmm rrm.slx +COM ../src/hmmcalibrate rrm.hmm +NSEQ 70 +DATE Wed Jul 8 08:13:25 1998 +CKSUM 2768 +XT -8455 -4 -1000 -1000 -8455 -4 -8455 -4 +NULT -4 -8455 +NULE 595 -1558 85 338 -294 453 -1158 197 249 902 -1085 -142 -21 -313 45 531 201 384 -1998 -644 +EVD -53.840649 0.214434 +HMM A C D E F G H I K L M N P Q R S T V W Y + m->m m->i m->d i->m i->i d->m d->d b->m m->e + -21 * -6129 + 1 -1234 -371 -8214 -7849 -5304 -8003 -7706 2384 -7769 2261 -681 -7660 -7694 -7521 -7816 -7346 -5543 1527 -6974 -6639 1 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -11 -11284 -12326 -894 -1115 -701 -1378 -21 * + 2 -3634 -3460 -5973 -5340 3521 -2129 -4036 -831 -2054 -1257 -2663 -4822 -5229 -4557 -4735 -1979 -1569 -1476 -3893 3439 2 + 
- -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -11 -11284 -12326 -894 -1115 -701 -1378 * * + 3 -5570 838 -8268 -7958 -5637 -8152 -8243 2427 -7947 -461 -539 -7805 -7843 -7878 -8124 -7550 -5559 3130 -7481 -7000 3 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -11 -11284 -12326 -894 -1115 -701 -1378 * * + 4 -1146 -4797 -1564 -2630 -1480 2769 -2963 -1850 992 -4812 -3887 737 -4397 -120 793 -205 -1019 -4418 -4981 -1059 4 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -11 -11284 -12326 -894 -1115 -701 -1378 * * + 5 -5242 -7035 445 -3538 -7284 1773 -4583 -7166 -4676 -7046 -6312 3633 -1651 -1262 -849 -1278 -5287 -6650 -7228 -291 5 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -11 -11284 -12326 -894 -1115 -701 -1378 * * + 6 -6898 -6238 -9292 -8703 -410 -9176 -7772 820 -8535 3071 -753 -8917 -8033 -7171 -7955 -8614 -6722 5 -6136 -6414 6 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 278 394 45 96 359 117 -369 -294 -249 + - -33 -6025 -12326 -153 -3315 -701 -1378 * * + 7 -5 -5297 178 -2982 -5685 -2278 -528 -5452 -1615 -5394 -4488 1396 3136 -3022 -3659 780 976 -4981 -5565 -4854 8 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -11 -11284 -12327 -894 -1115 -701 -1378 * * + 8 -3329 -4799 -805 543 789 -4303 572 -4868 140 -1087 -3888 -603 1691 530 183 -162 293 -2124 2317 2037 9 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11284 -12327 -894 -1115 -701 -1378 * * + 9 -373 -4801 2182 1353 -1426 44 -407 -1928 -366 -4817 -3891 1263 -4395 -1080 -666 295 50 -1947 -4985 397 10 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 10 450 1883 -5953 -5317 -1256 -1301 -4027 1322 -1847 -283 1542 -4802 -5206 -1502 -4713 -4241 
2143 1615 -3893 -3551 11 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 11 -1786 -4835 1027 -807 -5155 -1278 -2989 -4907 -410 -4850 -3924 957 -4421 -943 -250 670 3048 -4456 -5017 -4333 12 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 12 -3329 -4802 1324 2670 -5123 -4302 -2961 -4874 732 -2424 -3891 -457 -262 553 250 -694 -989 -4424 1772 -1014 13 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 13 -325 -4802 1515 2286 -5123 -2017 868 -4874 260 -2865 -1087 -2938 -4395 2006 -810 492 -1754 -4424 -4985 -4302 14 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 14 -337 -4801 2075 1854 -5121 -723 -567 -1924 73 -634 -194 -1227 -4396 1588 -3049 -212 -414 -4422 -5 -4302 15 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 15 -6843 -6192 -9252 -8675 -481 -9132 -7773 1557 -8511 2856 467 -8869 -8024 -7180 -7953 -8566 -6676 459 -6154 -6421 16 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 16 5 -4654 -1525 936 444 -4347 -3013 -1809 2193 -441 -3760 -441 -4438 -2577 1775 -91 -3285 -1104 180 -259 17 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 17 -97 -4802 2341 1548 -5123 -2042 -2961 -4874 -347 -2479 -194 -5 -726 1566 807 -1858 42 -4424 -4985 -4302 18 + - -146 -501 232 42 -381 398 105 -627 210 -463 -721 275 393 44 95 361 116 -370 -295 -242 + - -45 -5457 -12327 -1928 -440 -701 -1378 * * + 18 358 -3435 -5945 -1175 1490 -5154 1309 1157 -1944 1759 -387 
-4797 -5204 -4530 -1684 -4238 -376 166 -3893 1330 23 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12327 -894 -1115 -701 -1378 * * + 19 -2191 733 -7910 -7364 4360 -7323 -5649 -1557 -7016 -750 -407 -6877 -7039 -6263 -6681 -6482 -5572 -4211 -4950 -1019 24 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -12 -11285 -12328 -894 -1115 -701 -1378 * * + 20 -83 -4801 -3176 698 -5121 1566 -2961 -1977 942 -4817 -3890 -239 -4396 582 256 1807 -874 -1745 -4984 -1334 25 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 21 -1216 -4802 -289 1083 -1452 -655 -584 -4874 1345 -4818 -3891 964 1488 2130 -3049 -310 107 -2012 -4985 -1334 26 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 22 -45 1344 -1667 -843 2933 -2146 400 582 -4479 -1948 -2709 -506 -5117 -436 -1764 -4119 -3523 -96 215 2616 27 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 23 -556 -4294 -4426 -1796 -273 3377 -4149 -4100 -4273 -2279 -3695 -562 298 -4067 -4575 -1940 -3954 -3921 -4866 -77 28 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 24 -376 -4801 -143 1004 -1426 805 279 -1771 821 -1486 -3890 -527 2002 126 45 -287 -1679 -617 -4985 -4302 29 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 25 -3608 -178 -1585 -1970 660 -5154 -4024 2773 -894 -985 -386 -4796 -1707 -4528 -4707 -609 -1823 2145 -3893 -1100 30 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 26 -673 -173 -3429 1042 
-4598 -2161 -3110 535 1570 9 283 -508 -4517 -255 382 -1924 313 1407 -4706 -4127 31 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 27 -1211 -4799 1518 768 -5119 -1218 -441 -945 -1312 -2414 -587 909 -4396 -1010 534 1815 78 -487 -4983 -128 32 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 28 1271 2236 -5933 -5299 810 -2278 -651 1901 -1970 -221 -2639 -1497 -5203 -4524 -629 -638 -1577 1521 -3894 -1008 33 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12328 -894 -1115 -701 -1378 * * + 29 -1909 -4796 153 441 -1513 -4304 -599 -1894 1709 25 -3886 689 -1498 243 1438 -189 -879 380 -126 -255 34 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -13 -11286 -12329 -894 -1115 -701 -1378 * * + 30 -1277 -3441 -5893 -1776 -1155 -5147 -513 1829 -1993 1189 1888 -1484 -703 -4503 -1652 -1974 -3546 2209 -3898 -3554 35 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -14 -11287 -12329 -894 -1115 -701 -1378 * * + 31 -1299 746 -5893 -1992 -1190 -5147 -524 1691 424 -60 2330 -4774 111 -4503 -132 248 -1571 1419 -3898 -19 36 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -14 -11287 -12329 -894 -1115 -701 -1378 * * + 32 -3370 -4477 -3387 50 -560 -1979 -449 -51 1375 -681 233 1068 701 -1040 1343 -1845 543 -480 -10 1246 37 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -240 -11287 -2797 -894 -1115 -701 -1378 * * + 33 -3122 -4595 3395 -593 -4916 -1399 589 -1433 360 -4611 -290 780 -1313 35 -1369 -1782 -3061 -1712 -4778 -4095 38 + - -151 -504 236 42 -380 396 122 -618 211 -468 -714 274 392 45 98 355 123 -373 -299 -248 + - -841 -2976 -1709 -1966 -426 -3668 -118 * * + 34 -452 -4116 
-568 -735 -4435 -1350 -2280 -1270 1458 -4131 792 -2257 1620 415 1996 479 -765 -1327 -4300 -538 48 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -10529 -11571 -894 -1115 -1180 -840 * * + 35 272 -4448 -1054 1495 -1086 -283 -2616 -726 380 -1231 -3538 1286 -4050 1395 -988 154 68 50 -4633 -876 49 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -14 -10906 -11948 -894 -1115 -2229 -346 * * + 36 -3050 -4521 457 -2349 -4841 -1681 65 -1545 404 -2305 -3610 996 -1241 -714 -1055 -351 3167 -4143 -4705 -4022 50 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -14 -10981 -12024 -894 -1115 -2036 -403 * * + 37 -943 -4583 277 -486 -4904 2690 -181 -1421 829 -2551 -758 866 -4177 -751 11 -804 -1361 -4205 -4766 -4084 51 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11049 -12091 -894 -1115 -2632 -254 * * + 38 -1544 -4606 -1206 -627 -1238 -1111 -220 -4677 1841 -1463 -537 -311 146 1310 2236 252 -1424 -1820 -4789 -1025 52 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11074 -12116 -894 -1115 -1795 -490 * * + 39 -871 902 -3255 -2704 -1212 -2110 605 -4156 -647 -1293 101 192 1442 -2552 91 2587 -171 -3858 -4584 -3996 53 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11128 -12170 -894 -1115 -1064 -938 * * + 40 -3251 -4717 -597 -2552 -1539 -1882 45 -4784 2499 -1083 -3807 -1125 -312 -892 2672 -1497 -649 -1932 -4902 -1040 54 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11202 -12244 -894 -1115 -158 -3269 * * + 41 -4425 -5751 -1160 -3492 -6118 3496 -552 -1896 -1318 -2596 -4883 -434 -258 -3375 -548 -4283 -4348 -5409 -5833 -5262 55 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11288 -12330 -894 -1115 -701 
-1378 * * + 42 -3608 -96 -1795 -5308 3204 -5154 498 -1086 -989 -1857 1406 -4797 -5204 -807 -4709 -4238 -268 -366 187 3035 56 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11288 -12330 -894 -1115 -701 -1378 * * + 43 2573 2359 -7700 -8052 -7623 2634 -6965 -7447 -7655 -7712 -6731 -6019 -5985 -7072 -7238 -2014 -4755 -2203 -7845 -7842 57 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11288 -12330 -894 -1115 -701 -1378 * * + 44 -1896 -3552 -6072 -5447 4093 -5277 -4115 -1389 -5044 -1849 -2748 -4920 -5327 -4660 -4842 -2020 -787 -772 -3948 1996 58 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -15 -11288 -12330 -894 -1115 -701 -1378 * * + 45 -2123 1258 -8228 -7927 -5768 -8106 -8270 1951 -7921 -982 -4434 -7761 -7830 -7926 -8131 -7503 -5516 3355 -7605 -7039 59 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -16 -11288 -12331 -894 -1115 -701 -1378 * * + 46 -1158 -4801 136 2359 -5122 -4302 -508 -644 437 -2559 -3890 628 -4395 -213 172 18 1464 -2067 -4985 -1086 60 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -16 -11289 -12331 -894 -1115 -701 -1378 * * + 47 -7925 -6836 -8294 -8655 4067 -8176 -4357 -6786 -8211 -6080 795 -6785 -8028 -6925 -7569 -7427 -7774 -6956 -3603 3066 61 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -16 -11289 -12331 -894 -1115 -701 -1378 * * + 48 -633 -4801 851 2019 -1639 -2148 879 -1118 1178 -2414 -3891 -481 -71 241 -1485 -232 744 -569 -4985 -4302 62 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -16 -11289 -12331 -894 -1115 -701 -1378 * * + 49 -3331 -4805 2054 434 -5126 -1882 -432 -4877 377 -4821 -3894 2009 -4398 -269 -1336 1291 1198 -1970 -4988 -4305 63 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 
359 117 -369 -294 -249 + - -16 -11289 -12331 -894 -1115 -701 -1378 * * + 50 -638 -4800 -1786 1796 -5120 -1884 1628 -1952 812 -444 -621 -1191 1228 530 -672 8 -873 45 -4983 -276 64 + - -149 -500 232 43 -381 398 105 -627 210 -466 -721 277 393 45 95 359 119 -370 -295 -239 + - -38 -6076 -12331 -1893 -453 -701 -1378 * * + 51 243 -4801 1218 2315 -5122 -1551 -485 -1640 -795 -2479 -783 -420 -685 -1027 1035 415 -3268 -631 -23 -4302 69 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -16 -11289 -12332 -894 -1115 -701 -1378 * * + 52 415 694 2467 1155 -1401 -4334 -490 -1800 -2599 -4689 -637 -384 -1759 -12 -3098 1144 -834 -569 -4907 -271 70 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -17 -11290 -12332 -894 -1115 -701 -1378 * * + 53 2846 -3442 -1698 -5254 -979 -5146 -4014 -750 -4864 -773 1875 -4771 -5197 -1456 -1779 -127 -329 428 -3898 -3555 71 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -17 -11290 -12332 -894 -1115 -701 -1378 * * + 54 581 -4801 1239 1462 -5122 -1606 -432 -367 1251 -1623 -3891 335 -4395 1283 -110 -3209 753 -1920 -4985 -4302 72 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -17 -11290 -12332 -894 -1115 -701 -1378 * * + 55 686 -4798 937 304 -1378 -4303 -437 -1924 2219 -1669 -621 828 -4396 -1012 742 0 -1608 -1126 -4982 -1015 73 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -17 -11290 -12332 -894 -1115 -701 -1378 * * + 56 3420 863 -7680 -7410 -5526 -6323 -6681 -57 -7168 -2455 -4425 -6591 -6708 -6875 -7058 -2256 -4981 -4 -6573 -6193 74 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -17 -11290 -12332 -894 -1115 -701 -1378 * * + 57 -2038 -3436 -5943 -5308 -1145 -5154 -4025 2255 423 1498 1203 -4797 -1707 -478 -1267 -2117 -3548 1450 -3893 -931 75 + - -149 -500 233 43 -381 399 106 -626 210 
-466 -720 275 394 45 96 359 117 -369 -294 -249 + - -18 -11291 -12333 -894 -1115 -701 -1378 * * + 58 622 -4802 1764 1486 -5123 -4302 -2961 -1060 334 -4818 -3891 -420 -4396 1293 1148 487 -3268 -1087 -4985 -429 76 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -102 -11291 -4156 -894 -1115 -701 -1378 * * + 59 1265 -231 -1498 1351 -5045 -262 -355 -4796 922 -1073 -3813 778 -4318 877 -34 53 386 -2030 289 -4225 77 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -18 -11207 -12249 -894 -1115 -160 -3250 * * + 60 -684 813 -5723 -473 532 -2124 -3981 -2958 -121 2114 2840 -1421 -5174 -4409 -926 -4196 -1685 -376 -3915 497 78 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -18 -11291 -12333 -894 -1115 -701 -1378 * * + 61 -1812 -4803 1626 -749 -515 -1133 -415 -4875 -1294 -4819 -3892 3181 -793 1470 -1377 -246 -3268 -4425 -4986 -193 79 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -18 -11291 -12333 -894 -1115 -701 -1378 * * + 62 -1812 -4808 -1465 33 -1509 2998 1583 -4879 122 -4823 -3897 972 -4400 -1078 -3055 -1613 -682 -4429 -4991 -1114 80 + - -149 -500 232 43 -378 398 105 -627 212 -466 -721 275 393 45 98 359 117 -367 -295 -250 + - -98 -4229 -12334 -49 -4901 -701 -1378 * * + 63 -676 -4701 -742 -1422 825 -589 -545 255 1702 -2571 812 -2986 -4424 796 418 -221 1302 -1179 -4912 1028 82 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -19 -11292 -12334 -894 -1115 -701 -1378 * * + 64 -3341 -4695 350 1378 -1551 -1973 -2998 477 1265 78 273 -1163 21 504 -1507 -1108 282 114 -19 473 83 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -19 -11292 -12334 -894 -1115 -701 -1378 * * + 65 -3605 -3444 -949 -2090 2356 -1177 -4010 1410 -1703 1341 -404 -1673 -747 -4487 -4679 -2139 -1048 1197 -3900 411 84 + - -149 -500 233 43 -381 399 
106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -19 -11292 -12334 -894 -1115 -701 -1378 * * + 66 -655 -539 1179 279 -1324 1202 -2962 -1895 147 -682 1298 1427 -2056 608 756 -1119 -1893 -4419 -4982 140 85 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -19 -11292 -12335 -894 -1115 -701 -1378 * * + 67 -1814 -4814 166 -2636 -5135 2921 -568 -4885 -1333 -2415 -3903 1495 -4406 -312 -619 602 -1672 -4436 -4997 -4314 86 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -20 -11293 -12335 -894 -1115 -701 -1378 * * + 68 -3329 1217 -624 -797 -1594 -4303 1580 -4872 2069 -2414 -3890 617 -4396 283 2449 -560 -267 -2067 -4984 -1334 87 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -20 -11293 -12335 -894 -1115 -701 -1378 * * + 69 108 566 -1460 747 -1608 -4306 -2965 -30 1407 -2607 -3878 346 1033 -336 863 -1038 745 617 -4975 -4296 88 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -20 -11293 -12335 -894 -1115 -701 -1378 * * + 70 -1318 -3465 -283 -172 -3423 -2053 -3974 1957 -4721 1761 1425 -4678 -1762 -4391 -1578 -1974 -1561 1341 -3918 -3570 89 + - -149 -500 233 43 -381 399 106 -626 210 -466 -720 275 394 45 96 359 117 -369 -294 -249 + - -20 -11293 -12336 -894 -1115 -701 -1378 * * + 71 -1165 -4790 -240 -275 -5105 -4306 1035 -2009 1665 -395 707 -1334 -218 -188 1891 -1077 -383 404 110 348 90 + - -149 -500 233 43 -381 398 106 -626 210 -464 -720 275 394 45 96 359 117 -369 -294 -249 + - -43 -6001 -12336 -150 -3342 -701 -1378 * * + 72 -1929 1218 -1535 -1647 -3990 -4677 -3410 1725 207 -1481 -3117 -3608 -810 -1118 -743 -1942 428 2687 -4325 -3869 92 + - * * * * * * * * * * * * * * * * * * * * + - * * * * * * * * 0 +// diff --git a/forester/archive/RIO/others/hmmer/tutorial/rrm.slx b/forester/archive/RIO/others/hmmer/tutorial/rrm.slx new file mode 100644 index 0000000..ea84b12 --- /dev/null +++ 
b/forester/archive/RIO/others/hmmer/tutorial/rrm.slx @@ -0,0 +1,167 @@ +#=ID rrm +#=AC PF00076 +#=DE RNA recognition motif. (aka RRM, RBD, or RNP domain) +# AU Sean Eddy +# GA HMM_iterative_training +# GA Bic_raw 25 hmmls 10 +# CC There is no separation between signal and noise. +# AL HMM_simulated_annealing +# AM hmma -qR +# SE Published_alignment +# RN [1] +# RM 94119674 +# RA Birney E., Kumar S., Krainer A.R. +# RL NAR 21:5803-5816(1993). +# DR PROSITE; PDOC00030; +# DR SCOP; 1SXL; sf; +# CC -!- The RRM motif is probably diagnostic of an RNA binding protein. +# CC -!- RRMs are found in a variety of RNA binding proteins, including +# CC various hnRNP proteins, proteins implicated in regulation +# CC of alternative splicing, and protein components +# CC of snRNPs. The motif also appears in a few single stranded +# CC DNA binding proteins. +# CC -!- The RRM structure consists of four strands +# CC and two helices arranged in an alpha/beta sandwich. +# SQ 70 +CABA_MOUSE/77-148 MFVGGL.SWDTSKKDLKD....YFTKFGEVVDCTIKMD.........PNT +CABA_MOUSE/161-232 IFVGGL.NPEATEEKIRE....YFGQFGEIEAIELPID.........PKL +CST2_HUMAN/18-89 VFVGNI.PYEATEEQLKD....IFSEVGPVVSFRLVYD.........RET +ELAV_DROME/250-322 LYVSGL.PKTMTQQELEA....IFAPFGAIITSRILQNa........GND +ELAV_DROME/404-475 IFIYNL.APETEEAALWQ....LFGPFGAVQSVKIVKD.........PTT +EWS_HUMAN/363-442 IYVQGL.NDSVTLDDLAD....FFKQCGVVKMNKRTGQpmih.iyldKET +GBP2_YEAST/124-193 IFVRNL.TFDCTPEDLKE....LFGTVGEVVEADIIT...........SK +GBP2_YEAST/221-291 VFIINL.PYSMNWQSLKD....MFKECGHVLRADVELD..........FN +GBP2_YEAST/351-421 IYCSNL.PFSTARSDLFD....LFGPIGKINNAELKPQ..........EN +GR10_BRANA/8-79 CFVGGL.AWATGDAELER....TFSQFGEVIDSKIIND.........RET +HUD_HUMAN/48-119 LIVNYL.PQNMTQEEFRS....LFGSIGEIESCKLVRD.........KIT +IF4B_HUMAN/98-169 AFLGNL.PYDVTEESIKE....FFRGLNISAVRLPREP.........SNP +MSSP_HUMAN/31-102 LYIRGL.PPHTTDQDLVK....LCQPYGKIVSTKAILD.........KTT +NAM8_YEAST/165-237 IFVGDL.APNVTESQLFE....LFiNRYASTSHAKIVHD........QVT +NOP3_YEAST/127-190 
LFVRPF.PLDVQESELNE....IFGPFGPMKEVKILN............. +NOP3_YEAST/202-270 ITMKNL.PEGCSWQDLKD....LARENSLETTFSSVN............T +NOP4_YEAST/28-98 LFVRSI.PQDVTDEQLAD....FFSNFAPIKHAVVVKD..........TN +NOP4_YEAST/292-367 VFVRNV.PYDATEESLAP....HFSKFGSVKYALPVID.........KST +NSR1_YEAST/170-241 IFVGRL.SWSIDDEWLKK....EFEHIGGVIGARVIYE.........RGT +NSR1_YEAST/269-340 LFLGNL.SFNADRDAIFE....LFAKHGEVVSVRIPTH.........PET +NUCL_CHICK/283-353 LFVKNL.TPTKDYEELRT....AIKEFFGKKNLQVSEV..........RI +NUCL_CHICK/373-440 LFVKNL.PYRVTEDEMKN....VFENALEVRLVLNKE............. +PABP_DROME/4-75 LYVGDL.PQDVNESGLFD....KFSSAGPVLSIRVCRD.........VIT +PABP_DROME/92-162 VFIKNL.DRAIDNKAIYD....TFSAFGNILSCKVATD..........EK +PABP_DROME/183-254 VYVKNF.TEDFDDEKLKE....FFEPYGKITSYKVMSK..........ED +PABP_SCHPO/249-319 VYIKNL.DTEITEQEFSD....LFGQFGEITSLSLVKD..........QN +PES4_YEAST/93-164 LFIGDL.HETVTEETLKG....IFKKYPSFVSAKVCLD.........SVT +PES4_YEAST/305-374 IFIKNL.PTITTRDDILN....FFSEVGPIKSIYLSN...........AT +PSF_HUMAN/373-443 LSVRNL.SPYVSNELLEE....AFSQFGPIERAVVIVD..........DR +PUB1_YEAST/76-146 LYVGNL.DKAITEDILKQ....YFQVGGPIANIKIMID..........KN +PUB1_YEAST/163-234 LFVGDL.NVNVDDETLRN....AFKDFPSYLSGHVMWD.........MQT +PUB1_YEAST/342-407 AYIGNI.PHFATEADLIP....LFQNFGFILDFKHYPE............ +RB97_DROME/34-105 LFIGGL.APYTTEENLKL....FYGQWGKVVDVVVMRD.........AAT +RN15_YEAST/20-91 VYLGSI.PYDQTEEQILD....LCSNVGPVINLKMMFD.........PQT +RNP1_YEAST/37-109 LYVGNL.PKNCRKQDLRD....LFEPNYGKITINMLKKk........PLK +RO28_NICSY/99-170 LFVGNL.PYDIDSEGLAQ....LFQQAGVVEIAEVIYN.........RET +RO33_NICSY/116-187 LYVGNL.PFSMTSSQLSE....IFAEAGTVANVEIVYD.........RVT +RO33_NICSY/219-290 LYVANL.SWALTSQGLRD....AFADQPGFMSAKVIYD.........RSS +ROA1_BOVIN/106-177 IFVGGI.KEDTEEHHLRD....YFEQYGKIEVIEIMTD.........RGS +ROC_HUMAN/18-82 VFIGNLnTLVVKKSDVEA....IFSKYGKIVGCSVHK............. 
+ROG_HUMAN/10-81 LFIGGL.NTETNEKALEA....VFGKYGRIVEVLLMKD.........RET +RT19_ARATH/33-104 LYIGGL.SPGTDEHSLKD....AFSSFNGVTEARVMTN.........KVT +RU17_DROME/104-175 LFIARI.NYDTSESKLRR....EFEFYGPIKKIVLIHD.........QES +RU1A_HUMAN/12-84 IYINNL.NEKIKKDELKkslyAIFSQFGQILDILVSRS............ +RU1A_HUMAN/210-276 LFLTNL.PEETNELMLSM....LFNQFPGFKEVRLVPG............ +RU1A_YEAST/229-293 LLIQNL.PSGTTEQLLSQ....ILGNEALVEIRLVSV............. +RU2B_HUMAN/9-81 IYINNM.NDKIKKEELKRslyaLFSQFGHVVDIVALK............T +RU2B_HUMAN/153-220 LFLNNL.PEETNEMMLSM....LFNQFPGFKEVRLVPG............ +SC35_CHICK/16-87 LKVDNL.TYRTSPDTLRR....VFEKYGRVGDVYIPRD.........RYT +SP33_HUMAN/17-85 IYVGNL.PPDIRTKDIED....VFYKYGAIRDIDLKNR............ +SP33_HUMAN/122-186 VVVSGL.PPSGSWQDLKD....HMREAGDVCYADVYRD............ +SQD_DROME/58-128 LFVGGL.SWETTEKELRD....HFGKYGEIESINVKTD.........PQT +SQD_DROME/138-208 IFVGGL.TTEISDEEIKT....YFGQFGNIVEVEMPLD.........KQK +SR55_DROME/5-68 VYVGGL.PYGVRERDLER....FFKGYGRTRDILIKN............. +SSB1_YEAST/39-114 IFIGNV.AHECTEDDLKQ....LFvEEFGDEVSVEIPIKeh.....tDGH +SSB1_YEAST/188-268 LYINNV.PFKATKEEVAE....FFGTDADSISLPMRKMrdqhtgrifTSD +SXLF_DROME/127-198 LIVNYL.PQDMTDRELYA....LFRAIGPINTCRIMRD.........YKT +SXLF_DROME/213-285 LYVTNL.PRTITDDQLDT....IFGKYGSIVQKNILRD.........KLT +TIA1_HUMAN/9-78 LYVGNL.SRDVTEALILQ....LFSQIGPCKNCKMIMD...........T +TIA1_HUMAN/97-168 VFVGDL.SPQITTEDIKA....AFAPFGRISDARVVKD.........MAT +TIA1_HUMAN/205-270 VYCGGV.TSGLTEQLMRQ....TFSPFGQIMEIRVFPD............ +TRA2_DROME/99-170 IGVFGL.NTNTSQHKVRE....LFNKYGPIERIQMVID.........AQT +U2AF_HUMAN/261-332 LFIGGL.PNYLNDDQVKE....LLTSFGPLKAFNLVKD.........SAT +U2AF_SCHPO/312-383 IYISNL.PLNLGEDQVVE....LLKPFGDLLSFQLIKN.........IAD +WHI3_YEAST/540-615 LYVGNL.PSDATEQELRQ....LFSGQEGFRRLSFRNKnt......tSNG +X16_HUMAN/12-78 VYVGNL.GNNGNKTELER....AFGYYGPLRSVWVARN............ +YHC4_YEAST/348-415 IFVGQL.DKETTREELNR....RFSTHGKIQDINLIFK............ 
+YHH5_YEAST/315-384 ILVKNL.PSDTTQEEVLD....YFSTIGPIKSVFISE...........KQ +YIS1_YEAST/66-136 IFVGNI.TPDVTPEQIED....HFKDCGQIKRITLLYD.........RNT +YIS5_YEAST/33-104 IYIGNL.NRELTEGDILT....VFSEYGVPVDVILSRD.........ENT + +CABA_MOUSE/77-148 GRSRGFGFILFKDS....SSVEKVLDQKEH.RLDGRVIDP.K +CABA_MOUSE/161-232 NKRRGFVFITFKEE....DPVKKVLEKKFH.TVSGSKCEI.K +CST2_HUMAN/18-89 GKPKGYGFCEYQDQ....ETALSAMRNLNG.REFSGRALR.V +ELAV_DROME/250-322 TQTKGVGFIRFDKR....EEATRAIIALNG.TTPSSCTDP.I +ELAV_DROME/404-475 NQCKGYGFVSMTNY....DEAAMAIRALNG.YTMGNRVLQ.V +EWS_HUMAN/363-442 GKPKGDATVSYEDP....PTAKAAVEWFDG.KDFQGSKLK.V +GBP2_YEAST/124-193 GHHRGMGTVEFTKN....ESVQDAISKFDG.ALFMDRKLM.V +GBP2_YEAST/221-291 GFSRGFGSVIYPTE....DEMIRAIDTFNG.MEVEGRVLE.V +GBP2_YEAST/351-421 GQPTGVAVVEYENL....VDADFCIQKLNN.YNYGGCSLQ.I +GR10_BRANA/8-79 GRSRGFGFVTFKDE....KSMKDAIDEMNG.KELDGRTIT.V +HUD_HUMAN/48-119 GQSLGYGFVNYIDP....KDAEKAINTLNG.LRLQTKTIK.V +IF4B_HUMAN/98-169 ERLKGFGYAEFEDL....DSLLSALSLNEE.SLGNRRIRV.D +MSSP_HUMAN/31-102 NKCKGYGFVDFDSP....AAAQKAVSALKA.SGVQAQKAK.Q +NAM8_YEAST/165-237 GMSKGYGFVKFTNS....DEQQLALSEMQG.VFLNGRAIK.V +NOP3_YEAST/127-190 ....GFAFVEFEEA....ESAAKAIEEVHG.KSFANQPLE.V +NOP3_YEAST/202-270 RDFDGTGALEFPSE....EILVEALERLNN.IEFRGSVIT.V +NOP4_YEAST/28-98 KRSRGFGFVSFAVE....DDTKEALAKARK.TKFNGHILR.V +NOP4_YEAST/292-367 GLAKGTAFVAFKDQytynECIKNAPAAGST.SLLIGDDVM.P +NSR1_YEAST/170-241 DRSRGYGYVDFENK....SYAEKAIQEMQG.KEIDGRPIN.C +NSR1_YEAST/269-340 EQPKGFGYVQFSNM....EDAKKALDALQG.EYIDNRPVR.L +NUCL_CHICK/283-353 GSSKRFGYVDFLSA....EDMDKALQ.LNG.KKLMGLEIKlE +NUCL_CHICK/373-440 GSSKGMAYIEFKTE....AEAEKALEEKQG.TEVDGRAMV.I +PABP_DROME/4-75 RRSLGYAYVNFQQP....ADAERALDTMNF.DLVRNKPIR.I +PABP_DROME/92-162 GNSKGYGFVHFETE....EAANTSIDKVNG.MLLNGKKVY.V +PABP_DROME/183-254 GKSKGFGFVAFETT....EAAEAAVQALNGkDMGEGKSLY.V +PABP_SCHPO/249-319 DKPRGFGFVNYANH....ECAQKAVDELND.KEYKGKKLY.V +PES4_YEAST/93-164 KKSLGHGYLNFEDK....EEAEKAMEELNY.TKVNGKEIR.I +PES4_YEAST/305-374 KVKYLWAFVTYKNS....SDSEKAIKRYNN.FYFRGKKLL.V +PSF_HUMAN/373-443 
GRSTGKGIVEFASK....PAARKAFERCSE.GVFLLTTTP.R +PUB1_YEAST/76-146 NKNVNYAFVEYHQS....HDANIALQTLNG.KQIENNIVK.I +PUB1_YEAST/163-234 GSSRGYGFVSFTSQ....DDAQNAMDSMQG.QDLNGRPLR.I +PUB1_YEAST/342-407 ...KGCCFIKYDTH....EQAAVCIVALAN.FPFQGRNLR.T +RB97_DROME/34-105 KRSRGFGFITYTKS....LMVDRAQENRPH.IIDGKTVEA.K +RN15_YEAST/20-91 GRSKGYAFIEFRDL....ESSASAVRNLNG.YQLGSRFLK.C +RNP1_YEAST/37-109 KPLKRFAFIEFQEG....VNLKKVKEKMNG.KIFMNEKIV.I +RO28_NICSY/99-170 DRSRGFGFVTMSTV....EEADKAVELYSQ.YDLNGRLLT.V +RO33_NICSY/116-187 DRSRGFAFVTMGSV....EEAKEAIRLFDG.SQVGGRTVK.V +RO33_NICSY/219-290 GRSRGFGFITFSSA....EAMNSALDTMNE.VELEGRPLR.L +ROA1_BOVIN/106-177 GKKRGFAFVTFDDH....DSVDKIVIQKYH.TVNGHNCEV.R +ROC_HUMAN/18-82 ....GFAFVQYVNE....RNARAAVAGEDG.RMIAGQVLD.I +ROG_HUMAN/10-81 NKSRGFAFVTFESP....ADAKDAARDMNG.KSLDGKAIK.V +RT19_ARATH/33-104 GRSRGYGFVNFISE....DSANSAISAMNG.QELNGFNIS.V +RU17_DROME/104-175 GKPKGYAFIEYEHE....RDMHAAYKHADG.KKIDSKRVL.V +RU1A_HUMAN/12-84 LKMRGQAFVIFKEV....SSATNALRSMQG.FPFYDKPMR.I +RU1A_HUMAN/210-276 ..RHDIAFVEFDNE....VQAGAARDALQG.FKITQNNAM.K +RU1A_YEAST/229-293 ...RNLAFVEYETV....ADATKIKNQLGS.TYKLQNNDV.T +RU2B_HUMAN/9-81 MKMRGQAFVIFKEL....GSSTNALRQLQG.FPFYGKPMR.I +RU2B_HUMAN/153-220 ..RHDIAFVEFEND....GQAGAARDALQGfKITPSHAMK.I +SC35_CHICK/16-87 KESRGFAFVRFHDK....RDAEDAMDAMDG.AVLDGRELR.V +SP33_HUMAN/17-85 RGGPPFAFVEFEDP....RDAEDAVYGRDG.YDYDGYRLR.V +SP33_HUMAN/122-186 ....GTGVVEFVRK....EDMTYAVRKLDN.TKFRSHEGE.T +SQD_DROME/58-128 GRSRGFAFIVFTNT....EAIDKVSA.ADE.HIINSKKVD.P +SQD_DROME/138-208 SQRKGFCFITFDSE....QVVTDLLK.TPK.QKIAGKEVD.V +SR55_DROME/5-68 ....GYGFVEFEDY....RDADDAVYELNG.KELLGERVV.V +SSB1_YEAST/39-114 IPASKHALVKFPTK....IDFDNIKENYDT.KVVKDREIH.I +SSB1_YEAST/188-268 SANRGMAFVTFSGE....NVDIEAKAEEFK.GKVFGDREL.T +SXLF_DROME/127-198 GYSFGYAFVDFTSE....MDSQRAIKVLNG.ITVRNKRLK.V +SXLF_DROME/213-285 GRPRGVAFVRYNKR....EEAQEAISALNNvIPEGGSQPL.S +TIA1_HUMAN/9-78 AGNDPYCFVEFHEH....RHAAAALAAMNG.RKIMGKEVK.V +TIA1_HUMAN/97-168 GKSKGYGFVSFFNK....WDAENAIQQMGG.QWLGGRQIR.T +TIA1_HUMAN/205-270 
...KGYSFVRFNSH....ESAAHAIVSVNG.TTIEGHVVK.C +TRA2_DROME/99-170 QRSRGFCFIYFEKL....SDARAAKDSCSG.IEVDGRRIR.V +U2AF_HUMAN/261-332 GLSKGYAFCEYVDI....NVTDQAIAGLNG.MQLGDKKLL.V +U2AF_SCHPO/312-383 GSSKGFCFCEFKNP....SDAEVAISGLDG.KDTYGNKLH.A +WHI3_YEAST/540-615 HSHGPMCFVEFDDV....SFATRALAELYGrQLPRSTVSS.K +X16_HUMAN/12-78 ..PPGFAFVEFEDP....RDAADAVRELDG.RTLCGCRVR.V +YHC4_YEAST/348-415 .PTNIFAFIKYETE....EAAAAALESENH.AIFLNKTMH.V +YHH5_YEAST/315-384 ANTPHKAFVTYKNE....EESKKAQKCLNK.TIFKNHTIW.V +YIS1_YEAST/66-136 GTPKGYGYIEFESP....AYREKALQ.LNG.GELKGKKIA.V +YIS5_YEAST/33-104 GESQGFAYLKYEDQ....RSTILAVDNLNG.FKIGGRALK.I + diff --git a/forester/archive/RIO/others/phylip_mod/IMPORTANT_NOTICE b/forester/archive/RIO/others/phylip_mod/IMPORTANT_NOTICE new file mode 100644 index 0000000..6d2a698 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/IMPORTANT_NOTICE @@ -0,0 +1,48 @@ +RIO - Phylogenomic Protein Function Analysis +---------------------------------------------------------------- + + +RIO contains modified versions of programs written by others: + +1. TREE-PUZZLE + (Strimmer, K., and A. von Haeseler. 1996. Quartet puzzling: A quartet maximum + likelihood method for reconstructing tree topologies. Mol. Biol. Evol. 13: 964-969.) + + +2. PHYLIP + (Felsenstein, J. 1993. PHYLIP (Phylogeny Inference Package) version 3.5c. + Distributed by the author. + Department of Genetics, University of Washington, Seattle.) + + +Please note: +------------ + +1. RIO uses modifications of these programs, the original versions were + written by others: + + TREE-PUZZLE: Heiko A. Schmidt, Korbinian Strimmer, Martin Vingron, Arndt von Haeseler + + PHYLIP: Joseph Felsenstein, see also http://evolution.genetics.washington.edu/phylip/credits.html + + +2. The programs in the RIO distribution have been modified specifically + to work within RIO and cannot be used for any other purpose. + + +3. I am responsible for any accidentally introduced errors. + + +4. 
The original can be downloaded from the following sites: + TREE-PUZZLE: http://www.tree-puzzle.de/ + PHYLIP: http://evolution.genetics.washington.edu/phylip.html + + +RIO also contains hmmer (version 2.2g). +hmmer can be downloaded at: http://hmmer.wustl.edu/ + + + +Christian Zmasek, 07/28/2006 + + diff --git a/forester/archive/RIO/others/phylip_mod/src/CHANGES b/forester/archive/RIO/others/phylip_mod/src/CHANGES new file mode 100644 index 0000000..3c03411 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/CHANGES @@ -0,0 +1,29 @@ +Based on Phylip 3.65 + +phylip.h +-------- + +#define MAXNCH 20 -> #define MAXNCH 26 +#define nmlngth 10 -> #define nmlngth 26 + + + +seq.h +----- + +#define MAXNCH 20 -> #define MAXNCH 26 + + + +protdist.c +---------- + +#define nmlngth 10 -> #define nmlngth 26 + + + +Makefile +-------- + +Commented out instructions for programs I currently don't need. + diff --git a/forester/archive/RIO/others/phylip_mod/src/Makefile b/forester/archive/RIO/others/phylip_mod/src/Makefile new file mode 100644 index 0000000..9aed148 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/Makefile @@ -0,0 +1,451 @@ +# Modified by Christian Zmasek. Use at your own risk. +# +# Generic Linux/Unix Makefile for PHYLIP 3.6. +# +# You should not need to change anything, though if you want you could +# change the first (noncomment) statement to some directory location +# that might be more useful. +# The compressed tar archive phylip.tar.Z when uncompressed and extracted +# puts the source code into a directory ./src, and also makes two other +# directories ./exe and ./doc for the final executables and the documentation +# files. Only change the EXEDIR settings if you want something different +# from that structure. +# If it causes trouble in compiling, the CFLAGS statement below may also need +# to be changed. 
+# +# To use the PHYLIP v3.6 Makefile, type +# make install to compile the whole package and install +# the executables in $(EXEDIR), and then +# remove the object files to save space +# make all to compile the whole package but not install it +# or remove the object files +# make put to move the executables into $(EXEDIR) +# make clean to remove all object files and executables from the +# current directory +# make dnaml to compile and link one program, (in this example, +# DnaML) and leave the executable and object files +# in the current directory (where the source code is). +# You will have to move the executable in to the +# executables directory (e.g. "mv dnaml ../exe") +# Note that the program name should be lower case. +# +# ---------------------------------------------------------------------------- +# (Starting here is the section where you may want to change things) +# ---------------------------------------------------------------------------- +# +# these are the statements we have been talking about: +# one of the reasons for changing them would be to put the executables +# on a different file system. +# The default configuration is to have within the overall PHYLIP +# directory three subdirectories: "src" for source code, "exe" for the +# executables, and "doc" for the documentation files. +# +# the following specifies the directory where the executables will be placed +EXEDIR = ../exe +# +# ---------------------------------------------------------------------------- +# +# In the following statements (the ones that set CFLAGS, DFLAGS, LIBS +# and DLIBS, CC and DC) you should make sure each is set properly. +# Usually this will simply involve making sure that the proper statement +# has no "#" as its first character and that all other possibilities +# have "#" for their first character. 
+# +# ---------------------------------------------------------------------------- +# +# This is the CFLAGS statement: +# +# if these statements say "-g" and that causes trouble, remove the "-g" +# if the "-lX11" does not work (X compiling not working), you may want to +# remove that switch or fix it. +# +# Here are some possible CFLAGS statements: +# +# +#A minimal one +#CFLAGS = +# +# A basic one for debugging +#CFLAGS = -g +# +# An optimized one for gcc +CFLAGS = -O3 -fomit-frame-pointer +# +# For some serious debugging using Gnu gcc +# +#CFLAGS=-g -Wall -Wmain -Wmissing-prototypes -Wreturn-type -Wstrict-prototypes -Wunused -Werror -Wredundant-decls -Waggregate-return -Wcast-align -Wcomment +# +# For Digital Alpha systems with Compaq Tru64 Unix +# (however, be aware that this may cause floating-point problems in programs +# like Dnaml owing to not using IEEE floating point standards). +#CFLAGS = -fast +# +# ---------------------------------------------------------------------------- +# +# and here are some possible DFLAGS statements: +# +# A minimal one +#DFLAGS = -DX $(CFLAGS) +# +# A basic one for debugging +#DFLAGS = -g -DX +# +# +# For Gnu C++ for runs +#DFLAGS = -I/usr/X11R6/include -O3 -DX -fomit-frame-pointer +# +# For Digital Alpha systems with Compaq Tru64 Unix +#DFLAGS = -DX -fast +# +# for Linux with X Windows development packages installed +# or for MacOS X with X Windows installed +DFLAGS = $(CFLAGS) -DX -I/usr/X11R6/include +# +# ---------------------------------------------------------------------------- +# +# These are the libraries for the CC and DC compiles, respectively +# +LIBS = -lm +# +# if the Xlib library for the X windowing system is somewhere +# unexpected, you may have to change the path /usr/X11R6/lib in this one +# +# For gcc for Linux with X windows development packages installed +# or for MacOS X with X windows installed +DLIBS= -L/usr/X11R6/lib/ -lX11 -lXaw -lXt +# +# 
---------------------------------------------------------------------------- +# +# The next two assignments are the invocations of the compiler for the +# ordinary compiles and the tree-drawing programs, CC and DC +# +# This one specifies the "cc" C compiler +CC = cc $(CFLAGS) +# +# To use GCC instead, if it is not the compiler that "cc" invokes +#CC = gcc $(CFLAGS) +# +# This one specifies the "cc" C compiler for the Draw programs +DC = cc $(DFLAGS) +# +# To use GCC instead, if it is not the compiler that "cc" invokes +#DC = gcc $(DFLAGS) +# +# ---------------------------------------------------------------------------- +# (After this point there should not be any reason to change anything) +# ---------------------------------------------------------------------------- +# +# +# the list of programs +# +#PROGS = clique consense contml contrast dnacomp dnadist \ +# dnainvar dnaml dnamlk dnamove dnapars dnapenny \ +# dolmove dollop dolpenny factor fitch gendist kitsch \ +# mix move neighbor pars penny proml promlk protdist \ +# protpars restdist restml retree seqboot treedist \ +# drawgram drawtree + +PROGS = consense fitch neighbor proml promlk protdist protpars seqboot + +# +# general commands +# + +# +# The first uses a symbol you are unlikely to type. It is the one that +# is executed if you just type "make". It tells you how to use the +# Makefile. +# +a1b2c3d4: + @echo "" + @echo " To use the PHYLIP v3.6 Makefile, type" + @echo " make all to compile the whole package but not install it" + @echo " or remove the object files" + @echo " " + +introduce: + @echo "Building PHYLIP based on version 3.6 - Modified by Christian Zmasek. Use at your own risk." + +all: introduce $(PROGS) + @echo "Finished compiling." + @echo "" + +#install: all put clean +# @echo "Done." 
+# @echo "" + +#put: +# @echo "Installing PHYLIP v3.6 binaries in $(EXEDIR)" +# @mkdir -p $(EXEDIR) +# @cp $(PROGS) $(EXEDIR) +# @echo "Installing font files in $(EXEDIR)" +# @cp font* $(EXEDIR) +# @echo "Finished installation." +# @echo "" + +#clean: +# @echo "Removing object files to save space" +# @rm -f *.o +# @echo "Finished removing object files. Now will remove" +# @echo "executable files from the current directory, but not from the" +# @echo "executables directory. (If some are not here, the makefile" +# @echo "will terminate with an error message but this is not a problem)" +# @echo "" +# @echo "Removing executables from this directory" +# @rm -f $(PROGS) +# @echo "Finished cleanup." +# @echo "" + +# +# compile the shared stuff +# + +phylip.o: phylip.c phylip.h + $(CC) -c phylip.c + +seq.o: seq.c phylip.h seq.h + $(CC) -c seq.c + +#disc.o: disc.c phylip.h disc.h +# $(CC) -c disc.c + +#discrete.o: discrete.c discrete.h phylip.h +# $(CC) -c discrete.c + +#dollo.o: dollo.c phylip.h dollo.h +# $(CC) -c dollo.c + +#wagner.o: wagner.c phylip.h wagner.h +# $(CC) -c wagner.c + +dist.o: dist.c phylip.h dist.h + $(CC) -c dist.c + +#cont.o: cont.c cont.h phylip.h +# $(CC) -c cont.c + +#moves.o: moves.c phylip.h moves.h +# $(CC) -c moves.c + +# +# compile the individual programs +# + +#clique.o: clique.c disc.h phylip.h +# $(CC) -c clique.c + +#clique: clique.o clique.c disc.o disc.c phylip.o phylip.c disc.h phylip.h +# $(CC) clique.o disc.o phylip.o $(LIBS) -o clique + +cons.o: cons.c cons.h phylip.h + $(CC) -c cons.c + +consense.o: consense.c cons.h phylip.h + $(CC) -c consense.c + +consense: consense.o consense.c phylip.o phylip.c cons.o cons.c cons.h phylip.h + $(CC) consense.o cons.o phylip.o $(LIBS) -o consense + +#contml.o: contml.c cont.h phylip.h +# $(CC) -c contml.c + +#contml: contml.o contml.c cont.o cont.c phylip.o phylip.c cont.h phylip.h +# $(CC) contml.o cont.o phylip.o $(LIBS) -o contml + +#contrast.o: contrast.c cont.h phylip.h +# $(CC) -c contrast.c 
+ +#contrast: contrast.o contrast.c cont.o cont.c phylip.o phylip.c cont.h phylip.h +# $(CC) contrast.o cont.o phylip.o $(LIBS) -o contrast + +#dnacomp.o: dnacomp.c seq.h phylip.h +# $(CC) -c dnacomp.c + +#dnacomp: dnacomp.o seq.o phylip.o dnacomp.c seq.c phylip.c seq.h phylip.h +# $(CC) dnacomp.o seq.o phylip.o $(LIBS) -o dnacomp + +#dnadist.o: dnadist.c seq.h phylip.h +# $(CC) -c dnadist.c + +#dnadist: dnadist.o seq.o phylip.o dnadist.c seq.c phylip.c seq.h phylip.h +# $(CC) dnadist.o seq.o phylip.o $(LIBS) -o dnadist + +#dnainvar.o: dnainvar.c seq.h phylip.h +# $(CC) -c dnainvar.c + +#dnainvar: dnainvar.o seq.o phylip.o dnainvar.c seq.c phylip.c seq.h phylip.h +# $(CC) dnainvar.o seq.o phylip.o $(LIBS) -o dnainvar + +#dnaml.o: dnaml.c seq.h phylip.h +# $(CC) -c dnaml.c + +#dnaml: dnaml.o seq.o phylip.o dnaml.c seq.c phylip.c seq.h phylip.h +# $(CC) dnaml.o seq.o phylip.o $(LIBS) -o dnaml + +#dnamlk.o: dnamlk.c seq.h phylip.h +# $(CC) -c dnamlk.c + +#dnamlk: dnamlk.o seq.o phylip.o dnamlk.c seq.c phylip.c +# $(CC) dnamlk.o seq.o phylip.o $(LIBS) -o dnamlk + +#dnamove.o: dnamove.c seq.h moves.h phylip.h +# $(CC) -c dnamove.c + +#dnamove: dnamove.o seq.o moves.o phylip.o dnamove.c seq.c phylip.c seq.h phylip.h +# $(CC) dnamove.o seq.o moves.o phylip.o $(LIBS) -o dnamove + +#dnapenny.o: dnapenny.c seq.h phylip.h +# $(CC) -c dnapenny.c + +#dnapenny: dnapenny.o seq.o phylip.o dnapenny.c seq.c phylip.c seq.h phylip.h +# $(CC) dnapenny.o seq.o phylip.o $(LIBS) -o dnapenny + +#dnapars.o: dnapars.c seq.h phylip.h +# $(CC) -c dnapars.c + +#dnapars: dnapars.o seq.o phylip.o dnapars.c seq.c phylip.c seq.h phylip.h +# $(CC) dnapars.o seq.o phylip.o $(LIBS) -o dnapars + +#dolmove.o: dolmove.c disc.h moves.h dollo.h phylip.h +# $(CC) -c dolmove.c + +#dolmove: dolmove.o disc.o moves.o dollo.o phylip.o dolmove.c disc.c moves.c dollo.c phylip.c disc.h moves.h dollo.h phylip.h +# $(CC) dolmove.o disc.o moves.o dollo.o phylip.o $(LIBS) -o dolmove + +#dollop.o: dollop.c disc.h 
dollo.h phylip.h +# $(CC) -c dollop.c + +#dollop: dollop.o disc.o dollo.o phylip.o dollop.c disc.c dollo.c phylip.c disc.h dollo.h phylip.h +# $(CC) dollop.o disc.o dollo.o phylip.o $(LIBS) -o dollop + +#dolpenny.o: dolpenny.c disc.h dollo.h phylip.h +# $(CC) -c dolpenny.c + +#dolpenny: dolpenny.o disc.o dollo.o phylip.o dolpenny.c disc.c dollo.c phylip.c disc.h dollo.h phylip.h +# $(CC) dolpenny.o disc.o dollo.o phylip.o $(LIBS) -o dolpenny + +#draw.o: draw.c draw.h phylip.h +# $(DC) -c draw.c + +#draw2.o: draw2.c draw.h phylip.h +# $(DC) -c draw2.c + +#drawgram.o: drawgram.c draw.h phylip.h +# $(DC) -c drawgram.c + +#drawgram: drawgram.o draw.o draw2.o phylip.o drawgram.c draw.c draw2.c draw.h phylip.h +# $(DC) $(DLIBS) draw.o draw2.o drawgram.o phylip.o $(LIBS) -o drawgram + +#drawtree.o: drawtree.c draw.h phylip.h +# $(DC) -c drawtree.c + +#drawtree: drawtree.o draw.o draw2.o phylip.o drawtree.c draw.c draw2.c draw.h phylip.h +# $(DC) $(DLIBS) draw.o draw2.o drawtree.o phylip.o $(LIBS) -o drawtree + +#factor.o: factor.c phylip.h +# $(CC) -c factor.c + +#factor: factor.o phylip.o factor.c phylip.c phylip.h +# $(CC) factor.o phylip.o $(LIBS) -o factor + +fitch.o: fitch.c dist.h phylip.h + $(CC) -c fitch.c + +fitch: fitch.o dist.o phylip.o fitch.c dist.c phylip.c dist.h phylip.h + $(CC) fitch.o dist.o phylip.o $(LIBS) -o fitch + +#gendist.o: gendist.c phylip.h +# $(CC) -c gendist.c + +#gendist: gendist.o phylip.o gendist.c phylip.c phylip.h +# $(CC) gendist.o phylip.o $(LIBS) -o gendist + +#kitsch.o: kitsch.c dist.h phylip.h +# $(CC) -c kitsch.c + +#kitsch: kitsch.o dist.o phylip.o kitsch.c dist.c phylip.c dist.h phylip.h +# $(CC) kitsch.o dist.o phylip.o $(LIBS) -o kitsch + +#mix.o: mix.c disc.h wagner.h phylip.h +# $(CC) -c mix.c + +#mix: mix.o disc.o wagner.o phylip.o mix.c disc.c wagner.c phylip.c disc.h wagner.h phylip.h +# $(CC) mix.o disc.o wagner.o phylip.o $(LIBS) -o mix + +#move.o: move.c disc.h moves.h wagner.h phylip.h +# $(CC) -c move.c + +#move: 
move.o disc.o moves.o wagner.o phylip.o move.c disc.c moves.c wagner.c phylip.c disc.h moves.h wagner.h phylip.h +# $(CC) move.o disc.o moves.o wagner.o phylip.o $(LIBS) -o move + +neighbor.o: neighbor.c dist.h phylip.h + $(CC) -c neighbor.c + +neighbor: neighbor.o dist.o phylip.o neighbor.c dist.c phylip.c dist.h phylip.h + $(CC) neighbor.o dist.o phylip.o $(LIBS) -o neighbor + +#pars.o: pars.c discrete.h phylip.h +# $(CC) -c pars.c + +#pars: pars.o pars.c discrete.o discrete.c phylip.o phylip.c discrete.h phylip.h +# $(CC) pars.o discrete.o phylip.o $(LIBS) -o pars + +#penny.o: penny.c disc.h wagner.h phylip.h +# $(CC) -c penny.c + +#penny: penny.o disc.o wagner.o phylip.o penny.c disc.c wagner.c disc.h wagner.h phylip.h +# $(CC) penny.o disc.o wagner.o phylip.o $(LIBS) -o penny + +proml.o: proml.c seq.h phylip.h + $(CC) -c proml.c + +proml: proml.o seq.o phylip.o proml.c seq.c phylip.c seq.h phylip.h + $(CC) proml.o seq.o phylip.o $(LIBS) -o proml + +promlk.o: promlk.c seq.h phylip.h + $(CC) -c promlk.c + +promlk: promlk.o seq.o phylip.o promlk.c seq.c phylip.c + $(CC) promlk.o seq.o phylip.o $(LIBS) -o promlk + +protdist.o: protdist.c seq.h phylip.h + $(CC) -c protdist.c + +protdist: protdist.o seq.o phylip.o protdist.c seq.c phylip.c seq.h phylip.h + $(CC) protdist.o seq.o phylip.o $(LIBS) -o protdist + +protpars.o: protpars.c seq.h phylip.h + $(CC) -c protpars.c + +protpars: protpars.o seq.o phylip.o protpars.c seq.c phylip.c seq.h phylip.h + $(CC) protpars.o seq.o phylip.o $(LIBS) -o protpars + +#restdist.o: restdist.c seq.h phylip.h +# $(CC) -c restdist.c + +#restdist: restdist.o seq.o phylip.o restdist.c seq.c phylip.c seq.h phylip.h +# $(CC) restdist.o seq.o phylip.o $(LIBS) -o restdist + +#restml.o: restml.c seq.h phylip.h +# $(CC) -c restml.c + +#restml: restml.o seq.o phylip.o restml.c seq.c phylip.c seq.h phylip.h +# $(CC) restml.o seq.o phylip.o $(LIBS) -o restml + +#retree.o: retree.c moves.h phylip.h +# $(CC) -c retree.c + +#retree: retree.o 
moves.o phylip.o retree.c moves.c phylip.c moves.h phylip.h +# $(CC) retree.o moves.o phylip.o $(LIBS) -o retree + +seqboot.o: seqboot.c phylip.h + $(CC) -c seqboot.c + +seqboot: seqboot.o seq.o phylip.o seqboot.c seq.c phylip.c seq.h phylip.h + $(CC) seqboot.o seq.o phylip.o $(LIBS) -o seqboot + +#treedist.o: treedist.c cons.h phylip.h +# $(CC) -c treedist.c + +#treedist: treedist.o phylip.o cons.o +# $(CC) treedist.o cons.o phylip.o $(LIBS) -o treedist diff --git a/forester/archive/RIO/others/phylip_mod/src/cons.c b/forester/archive/RIO/others/phylip_mod/src/cons.c new file mode 100644 index 0000000..2fa7c4d --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/cons.c @@ -0,0 +1,1457 @@ +#include "phylip.h" +#include "cons.h" + +int tree_pairing; + +Char outfilename[FNMLNGTH], intreename[FNMLNGTH], intree2name[FNMLNGTH], outtreename[FNMLNGTH]; +node *root; + +long numopts, outgrno, col, setsz; +long maxgrp; /* max. no. of groups in all trees found */ + +boolean trout, firsttree, noroot, outgropt, didreroot, prntsets, + progress, treeprint, goteof, strict, mr=false, mre=false, + ml=false; /* initialized all false for Treedist */ +pointarray nodep; +pointarray treenode; +group_type **grouping, **grping2, **group2;/* to store groups found */ +double *lengths, *lengths2; +long **order, **order2, lasti; +group_type *fullset; +node *grbg; +long tipy; + +double **timesseen, **tmseen2, **times2 ; +double trweight, ntrees, mlfrac; + +/* prototypes */ +void censor(void); +boolean compatible(long, long); +void elimboth(long); +void enternohash(group_type*, long*); +void enterpartition (group_type*, long*); +void reorient(node* n); + +/* begin hash table code */ + +#define NUM_BUCKETS 100 + +typedef struct namenode { + struct namenode *next; + plotstring naym; + int hitCount; +} namenode; + +typedef namenode **hashtype; + +hashtype hashp; + +long namesGetBucket(plotstring); +void namesAdd(plotstring); +boolean namesSearch(plotstring); +void namesDelete(plotstring); 
+void namesClearTable(void); +void namesCheckTable(void); +void missingnameRecurs(node *p); + +/** + * namesGetBucket - return the bucket for a given name + */ +long namesGetBucket(plotstring searchname) { + long i; + long sum = 0; + + for (i = 0; (i < MAXNCH) && (searchname[i] != '\0'); i++) { + sum += searchname[i]; + } + return (sum % NUM_BUCKETS); +} + + +/** + * namesAdd - add a name to the hash table + * + * The argument is added at the head of the appropriate linked list. No + * checking is done for duplicates. The caller can call + * namesSearch to check for an existing name prior to calling + * namesAdd. + */ +void namesAdd(plotstring addname) { + long bucket = namesGetBucket(addname); + namenode *hp, *temp; + + temp = hashp[bucket]; + hashp[bucket] = (namenode *)Malloc(sizeof(namenode)); + hp = hashp[bucket]; + strcpy(hp->naym, addname); + hp->next = temp; + hp->hitCount = 0; +} + +/** + * namesSearch - search for a name in the hash table + * + * Return true if the name is found, else false. + */ +boolean namesSearch(plotstring searchname) { + long i = namesGetBucket(searchname); + namenode *p; + + p = hashp[i]; + if (p == NULL) { + return false; + } + do { + if (strcmp(searchname,p->naym) == 0) { + p->hitCount++; + return true; + } + p = p->next; + } while (p != NULL); + + return false; +} + +/** + * Go through hash table and check that the hit count on all entries is one. + * If it is zero, then a species was missed, if it is two, then there is a + * duplicate species. 
+ */
+
+void namesCheckTable(void) {
+  namenode *p;
+  long i;
+
+  for (i=0; i< NUM_BUCKETS; i++) {
+    p = hashp[i];
+    while (p != NULL){
+      if(p->hitCount >1){
+        printf("\n\nERROR in user tree: duplicate name found: ");
+        puts(p->naym);
+        printf("\n\n");
+        exxit(-1);
+      } else if(p->hitCount == 0){
+        printf("\n\nERROR in user tree: name %s not found\n\n\n",
+               p->naym);
+        exxit(-1);
+      }
+      /* reset the count so the table can be checked again later */
+      p->hitCount = 0;
+      p = p->next;
+    }
+  }
+}
+
+/**
+ * namesClearTable - empty names out of the table and
+ * return allocated memory
+ *
+ * Every entry of every bucket is freed and the bucket head is set to NULL.
+ * The bucket array itself (hashp) is not freed here.
+ */
+void namesClearTable(void) {
+  long i;
+  namenode *p, *temp;
+
+  for (i=0; i< NUM_BUCKETS; i++) {
+    p = hashp[i];
+    if (p != NULL) {
+      do {
+        temp = p;
+        p = p->next;
+        free(temp);
+      } while (p != NULL);
+      hashp[i] = NULL;
+    }
+  }
+}
+/* end hash table code */
+
+/**
+ * initconsnode - per-node initialisation step, selected by whichinit
+ *
+ *   bottom     - take a node with gnu(), give it index nodei, mark it as
+ *                not being a tip, clear its name and store it in nodep
+ *   nonbottom  - take a node with gnu() and give it index nodei
+ *   tip        - count one more tip in *ntips, take a node, store it in
+ *                nodep, set it up with setupnode(), copy its name from str
+ *                and, while firsttree and prntsets are both set, write the
+ *                tip number and the first len characters of str to outfile
+ *   length     - read a branch length with processlength() and store
+ *                valyew / divisor in (*p)->v
+ *   treewt     - unless at end of line, read a tree weight into trweight
+ *                and require that it is followed by ']' and then ';'
+ *   unittrwt   - set trweight to 1.0 and look at what follows in intree
+ *   hsnolength - store -1 in (*p)->v to signal a missing length
+ *
+ * Any other value of whichinit does nothing.  The parameters q and
+ * treenode are not used by this function.
+ */
+void initconsnode(node **p, node **grbg, node *q, long len, long nodei,
+                  long *ntips, long *parens, initops whichinit,
+                  pointarray treenode, pointarray nodep, Char *str,
+                  Char *ch, FILE *intree)
+{
+  /* initializes a node */
+  long i;
+  char c;
+  boolean minusread;
+  double valyew, divisor, fracchange;
+
+  switch (whichinit) {
+  case bottom:
+    gnu(grbg, p);
+    (*p)->index = nodei;
+    (*p)->tip = false;
+    /* clear the whole name field; the bound is the same MAXNCH that the
+     * tip case uses when it copies a name into this field */
+    for (i=0; i<MAXNCH; i++)
+      (*p)->nayme[i] = '\0';
+    nodep[(*p)->index - 1] = (*p);
+    (*p)->v = 0;
+    break;
+  case nonbottom:
+    gnu(grbg, p);
+    (*p)->index = nodei;
+    (*p)->v = 0;
+    break;
+  case tip:
+    (*ntips)++;
+    gnu(grbg, p);
+    nodep[(*ntips) - 1] = *p;
+    setupnode(*p, *ntips);
+    (*p)->tip = true;
+    strncpy ((*p)->nayme, str, MAXNCH);
+    if (firsttree && prntsets) {
+      fprintf(outfile, " %ld. ", *ntips);
+      for (i = 0; i < len; i++)
+        putc(str[i], outfile);
+      putc('\n', outfile);
+      /* blank line after every tenth species */
+      if ((*ntips > 0) && (((*ntips) % 10) == 0))
+        putc('\n', outfile);
+    }
+    (*p)->v = 0;
+    break;
+  case length:
+    processlength(&valyew, &divisor, ch, &minusread, intree, parens);
+    fracchange = 1.0;
+    (*p)->v = valyew / divisor / fracchange;
+    break;
+  case treewt:
+    if (!eoln(intree)) {
+      fscanf(intree, "%lf", &trweight);
+      getch(ch, parens, intree);
+      if (*ch != ']') {
+        printf("\n\nERROR: Missing right square bracket\n\n");
+        exxit(-1);
+      } else {
+        getch(ch, parens, intree);
+        if (*ch != ';') {
+          printf("\n\nERROR: Missing semicolon after square brackets\n\n");
+          exxit(-1);
+        }
+      }
+    }
+    break;
+  case unittrwt:
+    /* This comes not only when setting trweight but also at the end of
+     * any tree. The following code saves the current position in a
+     * file and reads to a new line. If there is a new line then we're
+     * at the end of tree, otherwise warn the user. This function should
+     * really leave the file alone, so once we're done with 'intree'
+     * we seek the position back so that it doesn't look like we did
+     * anything */
+    trweight = 1.0 ;
+    i = ftell (intree);
+    c = ' ';
+    while (c == ' ') {
+      if (eoff(intree)) {
+        fseek(intree,i,SEEK_SET);
+        return;
+      }
+      c = gettc(intree);
+    }
+    fseek(intree,i,SEEK_SET);
+    if ( c != '\n' && c!= '\r')
+      printf("WARNING: Tree weight set to 1.0\n");
+    if ( c == '\r' ) {
+      /* Read the next character and push it back unless it is a line
+       * feed.  This has to be an assignment: with a comparison
+       * (c == gettc(...)) the value tested against '\n' is only 0 or 1,
+       * so the test is always true and '\r' is pushed back no matter
+       * which character was actually read. */
+      if ( (c = gettc(intree)) != '\n')
+        ungetc(c, intree);
+    }
+    break;
+  case hsnolength:
+    (*p)->v = -1; /* signal value that a length is missing */
+    break;
+  default: /* cases hslength, iter, hsnolength */
+    break; /* should there be an error message here?*/
+  }
+} /* initconsnode */
+
+
+void censor(void)
+{
+  /* delete groups that are too rare to be in the consensus tree */
+  long i;
+
+  i = 1;
+  do {
+    if (timesseen[i-1])
+      if (!(mre || (mr && (2*(*timesseen[i-1]) > ntrees))
+                || (ml && ((*timesseen[i-1]) > mlfrac*ntrees))
+                || (strict && ((*timesseen[i-1]) == ntrees))))
{ + free(grouping[i - 1]); + free(timesseen[i - 1]); + grouping[i - 1] = NULL; + timesseen[i - 1] = NULL; + } + i++; + } while (i < maxgrp); +} /* censor */ + + +void compress(long *n) +{ + /* push all the nonempty subsets to the front end of their array */ + long i, j; + + i = 1; + j = 1; + do { + while (grouping[i - 1] != NULL) + i++; + if (j <= i) + j = i + 1; + while ((grouping[j - 1] == NULL) && (j < maxgrp)) + j++; + if (j < maxgrp) { + grouping[i - 1] = (group_type *)Malloc(setsz * sizeof(group_type)); + timesseen[i - 1] = (double *)Malloc(sizeof(double)); + memcpy(grouping[i - 1], grouping[j - 1], setsz * sizeof(group_type)); + *timesseen[i - 1] = *timesseen[j - 1]; + free(grouping[j - 1]); + free(timesseen[j - 1]); + grouping[j - 1] = NULL; + timesseen[j - 1] = NULL; + } + } while (j != maxgrp); + (*n) = i - 1; +} /* compress */ + + +void sort(long n) +{ + /* Shell sort keeping grouping, timesseen in same order */ + long gap, i, j; + group_type *stemp; + double rtemp; + + gap = n / 2; + stemp = (group_type *)Malloc(setsz * sizeof(group_type)); + while (gap > 0) { + for (i = gap + 1; i <= n; i++) { + j = i - gap; + while (j > 0) { + if (*timesseen[j - 1] < *timesseen[j + gap - 1]) { + memcpy(stemp, grouping[j - 1], setsz * sizeof(group_type)); + memcpy(grouping[j - 1], grouping[j + gap - 1], setsz * sizeof(group_type)); + memcpy(grouping[j + gap - 1], stemp, setsz * sizeof(group_type)); + rtemp = *timesseen[j - 1]; + *timesseen[j - 1] = *timesseen[j + gap - 1]; + *timesseen[j + gap - 1] = rtemp; + } + j -= gap; + } + } + gap /= 2; + } + free(stemp); +} /* sort */ + + +boolean compatible(long i, long j) +{ + /* are groups i and j compatible? 
*/ + boolean comp; + long k; + + comp = true; + for (k = 0; k < setsz; k++) + if ((grouping[i][k] & grouping[j][k]) != 0) + comp = false; + if (!comp) { + comp = true; + for (k = 0; k < setsz; k++) + if ((grouping[i][k] & ~grouping[j][k]) != 0) + comp = false; + if (!comp) { + comp = true; + for (k = 0; k < setsz; k++) + if ((grouping[j][k] & ~grouping[i][k]) != 0) + comp = false; + if (!comp) { + comp = noroot; + if (comp) { + for (k = 0; k < setsz; k++) + if ((fullset[k] & ~grouping[i][k] & ~grouping[j][k]) != 0) + comp = false; + } + } + } + } + return comp; +} /* compatible */ + + +void eliminate(long *n, long *n2) +{ + /* eliminate groups incompatible with preceding ones */ + long i, j, k; + boolean comp; + + for (i = 2; i <= (*n); i++) { + comp = true; + for (j = 0; comp && (j <= i - 2); j++) { + if ((timesseen[j] != NULL) && *timesseen[j] > 0) { + comp = compatible(i-1,j); + if (!comp) { + (*n2)++; + times2[(*n2) - 1] = (double *)Malloc(sizeof(double)); + group2[(*n2) - 1] = (group_type *)Malloc(setsz * sizeof(group_type)); + *times2[(*n2) - 1] = *timesseen[i - 1]; + memcpy(group2[(*n2) - 1], grouping[i - 1], setsz * sizeof(group_type)); + *timesseen[i - 1] = 0.0; + for (k = 0; k < setsz; k++) + grouping[i - 1][k] = 0; + } + } + } + if (*timesseen[i - 1] == 0.0) { + free(grouping[i - 1]); + free(timesseen[i - 1]); + timesseen[i - 1] = NULL; + grouping[i - 1] = NULL; + } + } +} /* eliminate */ + + +void printset(long n) +{ + /* print out the n sets of species */ + long i, j, k, size; + boolean noneprinted; + + fprintf(outfile, "\nSet (species in order) "); + for (i = 1; i <= spp - 25; i++) + putc(' ', outfile); + fprintf(outfile, " How many times out of %7.2f\n\n", ntrees); + noneprinted = true; + for (i = 0; i < n; i++) { + if ((timesseen[i] != NULL) && (*timesseen[i] > 0)) { + size = 0; + k = 0; + for (j = 1; j <= spp; j++) { + if (j == ((k+1)*SETBITS+1)) k++; + if (((1L << (j - 1 - k*SETBITS)) & grouping[i][k]) != 0) + size++; + } + if (size != 1 && 
!(noroot && size >= (spp-1))) { + noneprinted = false; + k = 0; + for (j = 1; j <= spp; j++) { + if (j == ((k+1)*SETBITS+1)) k++; + if (((1L << (j - 1 - k*SETBITS)) & grouping[i][k]) != 0) + putc('*', outfile); + else + putc('.', outfile); + if (j % 10 == 0) + putc(' ', outfile); + } + for (j = 1; j <= 23 - spp; j++) + putc(' ', outfile); + fprintf(outfile, " %5.2f\n", *timesseen[i]); + } + } + } + if (noneprinted) + fprintf(outfile, " NONE\n"); +} /* printset */ + + +void bigsubset(group_type *st, long n) +{ + /* Find a maximal subset of st among the n groupings, + to be the set at the base of the tree. */ + long i, j; + group_type *su; + boolean max, same; + + su = (group_type *)Malloc(setsz * sizeof(group_type)); + for (i = 0; i < setsz; i++) + su[i] = 0; + for (i = 0; i < n; i++) { + max = true; + for (j = 0; j < setsz; j++) + if ((grouping[i][j] & ~st[j]) != 0) + max = false; + if (max) { + same = true; + for (j = 0; j < setsz; j++) + if (grouping[i][j] != st[j]) + same = false; + max = !same; + } + if (max) { + for (j = 0; j < setsz; j ++) + if ((su[j] & ~grouping[i][j]) != 0) + max = false; + if (max) { + same = true; + for (j = 0; j < setsz; j ++) + if (su[j] != grouping[i][j]) + same = false; + max = !same; + } + if (max) + memcpy(su, grouping[i], setsz * sizeof(group_type)); + } + } + memcpy(st, su, setsz * sizeof(group_type)); + free(su); +} /* bigsubset */ + + +void recontraverse(node **p, group_type *st, long n, long *nextnode) +{ + /* traverse to add next node to consensus tree */ + long i, j = 0, k = 0, l = 0; + + boolean found, same = 0, zero, zero2; + group_type *tempset, *st2; + node *q, *r; + + for (i = 1; i <= spp; i++) { /* count species in set */ + if (i == ((l+1)*SETBITS+1)) l++; + if (((1L << (i - 1 - l*SETBITS)) & st[l]) != 0) { + k++; /* k is the number of species in the set */ + j = i; /* j is set to last species in the set */ + } + } + if (k == 1) { /* if only 1, set up that tip */ + *p = nodep[j - 1]; + (*p)->tip = true; + (*p)->index = 
j; + return; + } + gnu(&grbg, p); /* otherwise make interior node */ + (*p)->tip = false; + (*p)->index = *nextnode; + nodep[*nextnode - 1] = *p; + (*nextnode)++; + (*p)->deltav = 0.0; + for (i = 0; i < n; i++) { /* go through all sets */ + same = true; /* to find one which is this one */ + for (j = 0; j < setsz; j++) + if (grouping[i][j] != st[j]) + same = false; + if (same) + (*p)->deltav = *timesseen[i]; + } + tempset = (group_type *)Malloc(setsz * sizeof(group_type)); + memcpy(tempset, st, setsz * sizeof(group_type)); + q = *p; + st2 = (group_type *)Malloc(setsz * sizeof(group_type)); + memcpy(st2, st, setsz * sizeof(group_type)); + zero = true; /* having made two copies of the set ... */ + for (j = 0; j < setsz; j++) /* see if they are empty */ + if (tempset[j] != 0) + zero = false; + if (!zero) + bigsubset(tempset, n); /* find biggest set within it */ + zero = zero2 = false; /* ... tempset is that subset */ + while (!zero && !zero2) { + zero = zero2 = true; + for (j = 0; j < setsz; j++) { + if (st2[j] != 0) + zero = false; + if (tempset[j] != 0) + zero2 = false; + } + if (!zero && !zero2) { + gnu(&grbg, &q->next); + q->next->index = q->index; + q = q->next; + q->tip = false; + r = *p; + recontraverse(&q->back, tempset, n, nextnode); /* put it on tree */ + *p = r; + q->back->back = q; + for (j = 0; j < setsz; j++) + st2[j] &= ~tempset[j]; /* remove that subset from the set */ + memcpy(tempset, st2, setsz * sizeof(group_type)); /* that becomes set */ + found = false; + i = 1; + while (!found && i <= n) { + if (grouping[i - 1] != 0) { + same = true; + for (j = 0; j < setsz; j++) + if (grouping[i - 1][j] != tempset[j]) + same = false; + } + if ((grouping[i - 1] != 0) && same) + found = true; + else + i++; + } + zero = true; + for (j = 0; j < setsz; j++) + if (tempset[j] != 0) + zero = false; + if (!zero && !found) + bigsubset(tempset, n); + } + } + q->next = *p; + free(tempset); + free(st2); +} /* recontraverse */ + + +void reconstruct(long n) +{ + /* reconstruct 
tree from the subsets */ + long nextnode; + group_type *s; + + nextnode = spp + 1; + s = (group_type *)Malloc(setsz * sizeof(group_type)); + memcpy(s, fullset, setsz * sizeof(group_type)); + recontraverse(&root, s, n, &nextnode); + free(s); +} /* reconstruct */ + + +void coordinates(node *p, long *tipy) +{ + /* establishes coordinates of nodes */ + node *q, *first, *last; + long maxx; + + if (p->tip) { + p->xcoord = 0; + p->ycoord = *tipy; + p->ymin = *tipy; + p->ymax = *tipy; + (*tipy) += down; + return; + } + q = p->next; + maxx = 0; + while (q != p) { + coordinates(q->back, tipy); + if (!q->back->tip) { + if (q->back->xcoord > maxx) + maxx = q->back->xcoord; + } + q = q->next; + } + first = p->next->back; + q = p; + while (q->next != p) + q = q->next; + last = q->back; + p->xcoord = maxx + OVER; + p->ycoord = (long)((first->ycoord + last->ycoord) / 2); + p->ymin = first->ymin; + p->ymax = last->ymax; +} /* coordinates */ + + +void drawline(long i) +{ + /* draws one row of the tree diagram by moving up tree */ + node *p, *q; + long n, j; + boolean extra, done, trif; + node *r, *first = NULL, *last = NULL; + boolean found; + + p = root; + q = root; + fprintf(outfile, " "); + extra = false; + trif = false; + do { + if (!p->tip) { + found = false; + r = p->next; + while (r != p && !found) { + if (i >= r->back->ymin && i <= r->back->ymax) { + q = r->back; + found = true; + } else + r = r->next; + } + first = p->next->back; + r = p; + while (r->next != p) + r = r->next; + last = r->back; + } + done = (p->tip || p == q); + n = p->xcoord - q->xcoord; + if (extra) { + n--; + extra = false; + } + if (q->ycoord == i && !done) { + if (trif) + putc('-', outfile); + else + putc('+', outfile); + trif = false; + if (!q->tip) { + for (j = 1; j <= n - 7; j++) + putc('-', outfile); + if (noroot && (root->next->next->next == root) && + (((root->next->back == q) && root->next->next->back->tip) + || ((root->next->next->back == q) && root->next->back->tip))) + fprintf(outfile, 
"------|"); + else { + if (!strict) { /* write number of times seen */ + if (q->deltav >= 100) + fprintf(outfile, "%5.1f-|", (double)q->deltav); + else if (q->deltav >= 10) + fprintf(outfile, "-%4.1f-|", (double)q->deltav); + else + fprintf(outfile, "--%3.1f-|", (double)q->deltav); + } else + fprintf(outfile, "------|"); + } + extra = true; + trif = true; + } else { + for (j = 1; j < n; j++) + putc('-', outfile); + } + } else if (!p->tip && last->ycoord > i && first->ycoord < i && + (i != p->ycoord || p == root)) { + putc('|', outfile); + for (j = 1; j < n; j++) + putc(' ', outfile); + } else { + for (j = 1; j <= n; j++) + putc(' ', outfile); + if (trif) + trif = false; + } + if (q != p) + p = q; + } while (!done); + if (p->ycoord == i && p->tip) { + for (j = 0; (j < MAXNCH) && (p->nayme[j] != '\0'); j++) + putc(p->nayme[j], outfile); + } + putc('\n', outfile); +} /* drawline */ + + +void printree() +{ + /* prints out diagram of the tree */ + long i; + long tipy; + + if (treeprint) { + fprintf(outfile, "\nCONSENSUS TREE:\n"); + if (mr || mre || ml) { + if (noroot) { + fprintf(outfile, "the numbers on the branches indicate the number\n"); + fprintf(outfile, "of times the partition of the species into the two sets\n"); + fprintf(outfile, "which are separated by that branch occurred\n"); + } else { + fprintf(outfile, "the numbers forks indicate the number\n"); + fprintf(outfile, "of times the group consisting of the species\n"); + fprintf(outfile, "which are to the right of that fork occurred\n"); + } + fprintf(outfile, "among the trees, out of %6.2f trees\n", ntrees); + if (ntrees <= 1.001) + fprintf(outfile, "(trees had fractional weights)\n"); + } + tipy = 1; + coordinates(root, &tipy); + putc('\n', outfile); + for (i = 1; i <= tipy - down; i++) + drawline(i); + putc('\n', outfile); + } + if (noroot) { + fprintf(outfile, "\n remember:"); + if (didreroot) + fprintf(outfile, " (though rerooted by outgroup)"); + fprintf(outfile, " this is an unrooted tree!\n"); + } + 
putc('\n', outfile); +} /* printree */ + + +void enternohash(group_type *s, long *n) +{ + /* if set s is already there, enter it into groupings in the next slot + (without hash-coding). n is number of sets stored there and is updated */ + long i, j; + boolean found; + + found = false; + for (i = 0; i < (*n); i++) { /* go through looking whether it is there */ + found = true; + for (j = 0; j < setsz; j++) { /* check both parts of partition */ + found = found && (grouping[i][j] == s[j]); + found = found && (group2[i][j] == (fullset[j] & (~s[j]))); + } + if (found) + break; + } + if (!found) { /* if not, add it to the slot after the end, + which must be empty */ + grouping[i] = (group_type *)Malloc(setsz * sizeof(group_type)); + timesseen[i] = (double *)Malloc(sizeof(double)); + group2[i] = (group_type *)Malloc(setsz * sizeof(group_type)); + for (j = 0; j < setsz; j++) + grouping[i][j] = s[j]; + *timesseen[i] = 1; + (*n)++; + } +} /* enternohash */ + + +void enterpartition (group_type *s1, long *n) +{ + /* try to put this partition in list of partitions. If implied by others, + don't bother. If others implied by it, replace them. 
If this one + vacuous because only one element in s1, forget it */ + long i, j; + boolean found; + +/* this stuff all to be rewritten but left here so pieces can be used */ + found = false; + for (i = 0; i < (*n); i++) { /* go through looking whether it is there */ + found = true; + for (j = 0; j < setsz; j++) { /* check both parts of partition */ + found = found && (grouping[i][j] == s1[j]); + found = found && (group2[i][j] == (fullset[j] & (~s1[j]))); + } + if (found) + break; + } + if (!found) { /* if not, add it to the slot after the end, + which must be empty */ + grouping[i] = (group_type *)Malloc(setsz * sizeof(group_type)); + timesseen[i] = (double *)Malloc(sizeof(double)); + group2[i] = (group_type *)Malloc(setsz * sizeof(group_type)); + for (j = 0; j < setsz; j++) + grouping[i][j] = s1[j]; + *timesseen[i] = 1; + (*n)++; + } +} /* enterpartition */ + + +void elimboth(long n) +{ + /* for Adams case: eliminate pairs of groups incompatible with each other */ + long i, j; + boolean comp; + + for (i = 0; i < n-1; i++) { + for (j = i+1; j < n; j++) { + comp = compatible(i,j); + if (!comp) { + *timesseen[i] = 0.0; + *timesseen[j] = 0.0; + } + } + if (*timesseen[i] == 0.0) { + free(grouping[i]); + free(timesseen[i]); + timesseen[i] = NULL; + grouping[i] = NULL; + } + } + if (*timesseen[n-1] == 0.0) { + free(grouping[n-1]); + free(timesseen[n-1]); + timesseen[n-1] = NULL; + grouping[n-1] = NULL; + } +} /* elimboth */ + + +void consensus(pattern_elm ***pattern_array, long trees_in) +{ + long i, n, n2, tipy; + + group2 = (group_type **) Malloc(maxgrp*sizeof(group_type *)); + for (i = 0; i < maxgrp; i++) + group2[i] = NULL; + times2 = (double **)Malloc(maxgrp*sizeof(double *)); + for (i = 0; i < maxgrp; i++) + times2[i] = NULL; + n2 = 0; + censor(); /* drop groups that are too rare */ + compress(&n); /* push everybody to front of array */ + if (!strict) { /* drop those incompatible, if any */ + sort(n); + eliminate(&n, &n2); + compress(&n); + } + reconstruct(n); + 
tipy = 1; + coordinates(root, &tipy); + if (prntsets) { + fprintf(outfile, "\nSets included in the consensus tree\n"); + printset(n); + for (i = 0; i < n2; i++) { + if (!grouping[i]) { + grouping[i] = (group_type *)Malloc(setsz * sizeof(group_type)); + timesseen[i] = (double *)Malloc(sizeof(double)); + } + memcpy(grouping[i], group2[i], setsz * sizeof(group_type)); + *timesseen[i] = *times2[i]; + } + n = n2; + fprintf(outfile, "\n\nSets NOT included in consensus tree:"); + if (n2 == 0) + fprintf(outfile, " NONE\n"); + else { + putc('\n', outfile); + printset(n); + } + } + putc('\n', outfile); + if (strict) + fprintf(outfile, "\nStrict consensus tree\n"); + if (mre) + fprintf(outfile, "\nExtended majority rule consensus tree\n"); + if (ml) { + fprintf(outfile, "\nM consensus tree (l = %4.2f)\n", mlfrac); + fprintf(outfile, " l\n"); + } + if (mr) + fprintf(outfile, "\nMajority rule consensus tree\n"); + printree(); + free(nayme); + for (i = 0; i < maxgrp; i++) + free(grouping[i]); + free(grouping); + for (i = 0; i < maxgrp; i++) + free(order[i]); + free(order); + for (i = 0; i < maxgrp; i++) + if (timesseen[i] != NULL) + free(timesseen[i]); + free(timesseen); +} /* consensus */ + + +void rehash() +{ + group_type *s; + long i, j, k; + double temp, ss, smult; + boolean done; + + smult = (sqrt(5.0) - 1) / 2; + s = (group_type *)Malloc(setsz * sizeof(group_type)); + for (i = 0; i < maxgrp/2; i++) { + k = *order[i]; + memcpy(s, grouping[k], setsz * sizeof(group_type)); + ss = 0.0; + for (j = 0; j < setsz; j++) + ss += s[j] /* pow(2, SETBITS*j)*/; + temp = ss * smult; + j = (long)(maxgrp * (temp - floor(temp))); + done = false; + while (!done) { + if (!grping2[j]) { + grping2[j] = (group_type *)Malloc(setsz * sizeof(group_type)); + order2[i] = (long *)Malloc(sizeof(long)); + tmseen2[j] = (double *)Malloc(sizeof(double)); + memcpy(grping2[j], grouping[k], setsz * sizeof(group_type)); + *tmseen2[j] = *timesseen[k]; + *order2[i] = j; + grouping[k] = NULL; + timesseen[k] = 
NULL; + order[i] = NULL; + done = true; + } else { + j++; + if (j >= maxgrp) j -= maxgrp; + } + } + } + free(s); +} /* rehash */ + + +void enternodeset(node* r) +{ /* enter a set of species into the hash table */ + long i, j, start; + double ss, n; + boolean done, same; + double times ; + group_type *s; + + s = r->nodeset; + same = true; + for (i = 0; i < setsz; i++) + if (s[i] != fullset[i]) + same = false; + if (same) + return; + times = trweight; + ss = 0.0; /* compute the hashcode for the set */ + n = ((sqrt(5.0) - 1.0) / 2.0); /* use an irrational multiplier */ + for (i = 0; i < setsz; i++) + ss += s[i] * n; + i = (long)(maxgrp * (ss - floor(ss))) + 1; /* use fractional part of code */ + start = i; + done = false; /* go through seeing if it is there */ + while (!done) { + if (grouping[i - 1]) { /* ... i.e. if group is absent, or */ + same = false; /* (will be false if timesseen = 0) */ + if (!(timesseen[i-1] == 0)) { /* ... if timesseen = 0 */ + same = true; + for (j = 0; j < setsz; j++) { + if (s[j] != grouping[i - 1][j]) + same = false; + } + } + } + if (grouping[i - 1] && same) { /* if it is there, increment timesseen */ + *timesseen[i - 1] += times; + lengths[i - 1] = nodep[r->index - 1]->v; + done = true; + } else if (!grouping[i - 1]) { /* if not there and slot empty ... */ + grouping[i - 1] = (group_type *)Malloc(setsz * sizeof(group_type)); + lasti++; + order[lasti] = (long *)Malloc(sizeof(long)); + timesseen[i - 1] = (double *)Malloc(sizeof(double)); + memcpy(grouping[i - 1], s, setsz * sizeof(group_type)); + *timesseen[i - 1] = times; + *order[lasti] = i - 1; + done = true; + lengths[i - 1] = nodep[r->index -1]->v; + } else { /* otherwise look to put it in next slot ... 
*/ + i++; + if (i > maxgrp) i -= maxgrp; + } + if (!done && i == start) { /* if no place to put it, expand hash table */ + maxgrp = maxgrp*2; + tmseen2 = (double **)Malloc(maxgrp*sizeof(double *)); + for (j = 0; j < maxgrp; j++) + tmseen2[j] = NULL; + grping2 = (group_type **)Malloc(maxgrp*sizeof(group_type *)); + for (j = 0; j < maxgrp; j++) + grping2[j] = NULL; + order2 = (long **)Malloc(maxgrp*sizeof(long *)); + for (j = 0; j < maxgrp; j++) + order2[j] = NULL; + lengths2 = (double *)Malloc(maxgrp*sizeof(double)); + for (j = 0; j < maxgrp; j++) + lengths2[j] = 0.0; + memcpy(lengths2,lengths,maxgrp*sizeof(double) / 2); + rehash(); + free(lengths); + free(timesseen); + free(grouping); + free(order); + timesseen = tmseen2; + grouping = grping2; + lengths = lengths2; + order = order2; + done = true; + lasti = maxgrp/2 - 1; + enternodeset(r); + } + } +} /* enternodeset */ + + +void accumulate(node *r) +{ + node *q; + long i; + + if (r->tip) { + if (!r->nodeset) + r->nodeset = (group_type *)Malloc(setsz * sizeof(group_type)); + for (i = 0; i < setsz; i++) + r->nodeset[i] = 0L; + i = (r->index-1) / (long)SETBITS; + r->nodeset[i] = 1L << (r->index - 1 - i*SETBITS); + } + else { + q = r->next; + while (q != r) { + accumulate(q->back); + q = q->next; + } + q = r->next; + if (!r->nodeset) + r->nodeset = (group_type *)Malloc(setsz * sizeof(group_type)); + for (i = 0; i < setsz; i++) + r->nodeset[i] = 0; + while (q != r) { + for (i = 0; i < setsz; i++) + r->nodeset[i] |= q->back->nodeset[i]; + q = q->next; + } + } + if ((!r->tip && (r->next->next != r)) || r->tip) + enternodeset(r); +} /* accumulate */ + + +void dupname2(Char *name, node *p, node *this) +{ + /* search for a duplicate name recursively */ + node *q; + + if (p->tip) { + if (p != this) { + if (namesSearch(p->nayme)) { + printf("\n\nERROR in user tree: duplicate name found: "); + puts(p->nayme); + printf("\n\n"); + exxit(-1); + } else { + namesAdd(p->nayme); + } + } + } else { + q = p; + while (p->next != q) { + 
dupname2(name, p->next->back, this); + p = p->next; + } + } +} /* dupname2 */ + + +void dupname(node *p) +{ + /* search for a duplicate name in tree */ + node *q; + + if (p->tip) { + if (namesSearch(p->nayme)) { + printf("\n\nERROR in user tree: duplicate name found: "); + puts(p->nayme); + printf("\n\n"); + exxit(-1); + } else { + namesAdd(p->nayme); + } + } else { + q = p; + while (p->next != q) { + dupname(p->next->back); + p = p->next; + } + } +} /* dupname */ + + +void missingnameRecurs(node *p) +{ + /* search for missing names in first tree */ + node *q; + + if (p->tip) { + if (!namesSearch(p->nayme)) { + printf("\n\nERROR in user tree: name %s not found in first tree\n\n\n", + p->nayme); + exxit(-1); + } + } else { + q = p; + while (p->next != q) { + missingnameRecurs(p->next->back); + p = p->next; + } + } +} /* missingnameRecurs */ + +/** + * wrapper for recursive missingname function + */ +void missingname(node *p){ + missingnameRecurs(p); + namesCheckTable(); +} /* missingname */ + +void gdispose(node *p) +{ + /* go through tree throwing away nodes */ + node *q, *r; + + if (p->tip) { + chuck(&grbg, p); + return; + } + q = p->next; + while (q != p) { + gdispose(q->back); + r = q; + q = q->next; + chuck(&grbg, r); + } + chuck(&grbg, q); +} /* gdispose */ + + +void initreenode(node *p) +{ + /* traverse tree and assign species names to tip nodes */ + node *q; + + if (p->tip) { + memcpy(nayme[p->index - 1], p->nayme, MAXNCH); + } else { + q = p->next; + while (q && q != p) { + initreenode(q->back); + q = q->next; + } + } +} /* initreenode */ + + +void reroot(node *outgroup, long *nextnode) +{ + /* reorients tree, putting outgroup in desired position. 
*/ + long i; + boolean nroot; + node *p, *q; + double newv; + + nroot = false; + p = root->next; + while (p != root) { + if ((outgroup->back == p) && (root->next->next->next == root)) { + nroot = true; + p = root; + } else + p = p->next; + } + if (nroot && root->next->next->next == root) { + root->next->next->back->v += root->next->back->v; + root->next->back->v = 0; + } + if (nroot) return; + + p = root; + i = 0; + while (p->next != root) { + p = p->next; + i++; + } + if (i == 2) { + newv = root->next->back->v + root->next->next->back->v; + root->next->back->back = p->back; + p->back->back = root->next->back; + q = root->next; + p->back->v = newv; + q->back->v = newv; + } else { + p->next = root->next; + nodep[root->index-1] = root->next; + gnu(&grbg, &root->next); + q = root->next; + gnu(&grbg, &q->next); + p = q->next; + p->next = root; + q->tip = false; + p->tip = false; + nodep[*nextnode] = root; + (*nextnode)++; + root->index = *nextnode; + root->next->index = root->index; + root->next->next->index = root->index; + } + newv = outgroup->v; + q->back = outgroup; + p->back = outgroup->back; + outgroup->back->back = p; + outgroup->back = q; + outgroup->v = 0; + outgroup->back->v = 0; + root->v = 0; + p->v = newv; + p->back->v = newv; + reorient(root); +} /* reroot */ + + +void reorient(node* n) { + node* p; + + if ( n->tip ) return; + if ( nodep[n->index - 1] != n ) { + nodep[n->index - 1] = n; + if ( n->back ) + n->v = n->back->v; + } + + for ( p = n->next ; p != n ; p = p->next) + reorient(p->back); +} + + +void store_pattern (pattern_elm ***pattern_array, + double *timesseen_changes, int trees_in_file) +{ + /* put a tree's groups into a pattern array. + Don't forget that when not Adams, grouping[] is not compressed. . . */ + long i, total_groups=0, j=0, k; + + /* First, find out how many groups exist in the given tree. 
*/ + for (i = 0 ; i < maxgrp ; i++) + if ((grouping[i] != NULL) && + (*timesseen[i] > timesseen_changes[i])) + /* If this is group exists and is present in the current tree, */ + total_groups++ ; + + /* Then allocate a space to store the bit patterns. . . */ + for (i = 0 ; i < setsz ; i++) { + pattern_array[i][trees_in_file] + = (pattern_elm *) Malloc(sizeof(pattern_elm)) ; + pattern_array[i][trees_in_file]->apattern = + (group_type *) Malloc (total_groups * sizeof (group_type)) ; + pattern_array[i][trees_in_file]->length = + (double *) Malloc (maxgrp * sizeof (double)) ; + for ( j = 0 ; j < maxgrp ; j++ ) { + pattern_array[i][trees_in_file]->length[j] = -1; + } + pattern_array[i][trees_in_file]->patternsize = (long *)Malloc(sizeof(long)); + } + j = 0; + /* Then go through groupings again, and copy in each element + appropriately. */ + for (i = 0 ; i < maxgrp ; i++) + if (grouping[i] != NULL) { + if (*timesseen[i] > timesseen_changes[i]) { + for (k = 0 ; k < setsz ; k++) + pattern_array[k][trees_in_file]->apattern[j] = grouping[i][k] ; + pattern_array[0][trees_in_file]->length[j] = lengths[i]; + j++ ; + timesseen_changes[i] = *timesseen[i] ; + } + } + *pattern_array[0][trees_in_file]->patternsize = total_groups; +} /* store_pattern */ + + +boolean samename(naym name1, plotstring name2) +{ + return !(strncmp(name1, name2, MAXNCH)); +} /* samename */ + + +void reordertips() +{ + /* matchs tip nodes to species names first read in */ + long i, j; + boolean done; + node *p, *q, *r; + for (i = 0; i < spp; i++) { + j = 0; + done = false; + do { + if (samename(nayme[i], nodep[j]->nayme)) { + done = true; + if (i != j) { + p = nodep[i]; + q = nodep[j]; + r = p->back; + p->back->back = q; + q->back->back = p; + p->back = q->back; + q->back = r; + memcpy(q->nayme, p->nayme, MAXNCH); + memcpy(p->nayme, nayme[i], MAXNCH); + } + } + j++; + } while (j < spp && !done); + } +} /* reordertips */ + + +void read_groups (pattern_elm ****pattern_array,double *timesseen_changes, + long 
trees_in_1, long total_trees, FILE *intree) +{ + /* read the trees. Accumulate sets. */ + int i, j, k; + boolean haslengths, initial; + long nextnode, trees_read = 0; + + /* set up the groupings array and the timesseen array */ + grouping = (group_type **) Malloc(maxgrp*sizeof(group_type *)); + lengths = (double *) Malloc(maxgrp*sizeof(double)); + for (i = 0; i < maxgrp; i++) + grouping[i] = NULL; + order = (long **) Malloc(maxgrp*sizeof(long *)); + for (i = 0; i < maxgrp; i++) + order[i] = NULL; + timesseen = (double **)Malloc(maxgrp*sizeof(double *)); + for (i = 0; i < maxgrp; i++) + timesseen[i] = NULL; + + firsttree = true; + grbg = NULL; + initial = true; + while (!eoff(intree)) { /* go till end of input tree file */ + for (i = 0; i < maxgrp; i++) { + lengths[i] = -1; + } + goteof = false; + nextnode = 0; + haslengths = true; + allocate_nodep(&nodep, &intree, &spp); + if (firsttree) + nayme = (naym *)Malloc(spp*sizeof(naym)); + treeread(intree, &root, treenode, &goteof, &firsttree, nodep, + &nextnode, &haslengths, &grbg, initconsnode,true,-1); + if (!initial) { + reordertips(); + missingname(root); + } else { + initial = false; + hashp = (hashtype)Malloc(sizeof(namenode) * NUM_BUCKETS); + for (i=0;i= 1); + if (!done1) { + printf("ERROR: Bad outgroup number: %ld\n", outgrno); + printf(" Must be greater than zero\n"); + } + countup(&loopcount2, 10); + } while (done1 != true); + } + break; + + case 'R': + noroot = !noroot; + break; + + case 'T': + initterminal(&ibmpc, &ansi); + break; + + case '1': + prntsets = !prntsets; + break; + + case '2': + progress = !progress; + break; + + case '3': + treeprint = !treeprint; + break; + + case '4': + trout = !trout; + break; + + } + } else + printf("Not a possible option!\n"); + } + countup(&loopcount, 100); + } while (!done); + if (ml) { + do { + printf("\nFraction (l) of times a branch must appear\n"); + scanf("%lf%*[^\n]", &mlfrac); + getchar(); + } while ((mlfrac < 0.5) || (mlfrac > 1.0)); + } +} /* getoptions */ + + 
+void count_siblings(node **p)
+{ /* Follows the ->next links starting at (*p)->next for at most 1000
+   * steps, stopping when the walk comes back to *p or meets a NULL link.
+   * Nothing is counted, stored or returned, so the call has no effect
+   * that the caller can observe. */
+  node *tmp_node;
+  int i;
+
+  if (!(*p)) {
+    /* *p is NULL, nothing to walk (original note: "This is a leaf,") */
+    return;
+  } else {
+    tmp_node = (*p)->next;
+  }
+
+  for (i = 0 ; i < 1000; i++) {
+    if (tmp_node == (*p)) {
+      /* When we've gone through all the siblings, */
+      break;
+    } else if (tmp_node) {
+      tmp_node = tmp_node->next;
+    } else {
+      /* NULL link: the chain is not closed.  Should this be executed? */
+      return ;
+    }
+  }
+} /* count_siblings */
+
+
+void treeout(node *p)
+{
+  /* write out file with representation of final tree
+   *
+   * Recursively writes the subtree at p to outtree in parenthesised
+   * form.  A tip is written as its name with blanks replaced by '_';
+   * any other node is written as '(' child ',' child ... ')' where the
+   * children are reached through q->back for each q on p's ->next ring.
+   * After the node itself: the root is followed by ";\n"; any other
+   * node is followed by ':' and a number unless strict is set.  That
+   * number is ntrees for a tip and p->deltav otherwise.
+   * col tracks the output column; a newline is emitted after a comma
+   * once col exceeds 60. */
+  long i, n = 0;
+  Char c;
+  node *q;
+  double x;
+
+  count_siblings (&p);   /* has no observable effect, see count_siblings */
+
+  if (p->tip) {
+    /* If we're at a node which is a leaf, figure out how long the
+       name is and print it out.  n ends up as the 1-based position of
+       the last non-NUL character within the first MAXNCH characters. */
+    for (i = 1; i <= MAXNCH; i++) {
+      if (p->nayme[i - 1] != '\0')
+        n = i;
+    }
+    for (i = 0; i < n; i++) {
+      c = p->nayme[i];
+      if (c == ' ')
+        c = '_';
+      putc(c, outtree);
+    }
+    col += n;
+  } else {
+    /* If we're at a furcation, print out the proper formatting, loop
+       through all the children, calling the procedure recursively. */
+    putc('(', outtree);
+    col++;
+    q = p->next;
+    while (q != p) {
+      /* This should terminate when we've gone through all the
+         siblings, */
+      treeout(q->back);
+      q = q->next;
+      if (q == p)   /* last child written: no separator after it */
+        break;
+      putc(',', outtree);
+      col++;
+      if (col > 60) {
+        putc('\n', outtree);
+        col = 0;
+      }
+    }
+    putc(')', outtree);
+    col++;
+  }
+
+  if (p->tip)
+    x = ntrees;
+  else
+    x = (double)p->deltav;
+
+  if (p == root) {
+    /* When we're all done with this tree, */
+    fprintf(outtree, ";\n");
+    return;
+  }
+
+  /* Figure out how many characters the branch length requires:
+   * NOTE(review): the amounts added to col below (4, 3, 2, 3) are smaller
+   * than the number of characters each fprintf writes; confirm whether
+   * col is only meant to be approximate. */
+  else {
+    if (!strict) {
+      if (x >= 100.0) {
+        fprintf(outtree, ":%5.1f", x);
+        col += 4;
+      } else if (x >= 10.0) {
+        fprintf(outtree, ":%4.1f", x);
+        col += 3;
+      } else if (x >= 0.99) {
+        fprintf(outtree, ":%3.1f", x);
+        col += 2;
+      } else {
+        fprintf(outtree, ":%4.2f", x);
+        col += 3;
+      }
+    }
+  }
+} /* treeout */
+
+
+int main(int argc, Char *argv[])
+{
+  /* Local variables added by Dan F.
+   *
+   * Program entry point.  Opens the input tree file and the output file,
+   * runs getoptions(), reads the trees with read_groups(), rebuilds nodep
+   * with one fresh tip node per species, calls consensus(), optionally
+   * writes the resulting tree with treeout(), then frees the nodes and
+   * closes the files.  Always returns 0.
+   *
+   * pattern_array is passed to read_groups by address without being
+   * initialised here, and timesseen_changes is passed while still NULL. */
+  pattern_elm ***pattern_array;
+  double *timesseen_changes = NULL;
+  long trees_in = 0;
+  long i, j;
+  node *p, *q;
+
+#ifdef MAC
+  argc = 1; /* macsetup("Consense", ""); */
+  argv[0] = "Consense";
+#endif
+  init(argc, argv);
+  openfile(&intree, INTREE, "input tree file", "r", argv[0], intreename);
+  openfile(&outfile, OUTFILE, "output file", "w", argv[0], outfilename);
+
+  /* Initialize option-based variables, then ask for changes regarding
+     their values. */
+  getoptions();
+
+  ntrees = 0.0;
+  maxgrp = 32767; /* initial size of set hash table */
+  lasti = -1;
+
+  if (trout)
+    openfile(&outtree, OUTTREE, "output tree file", "w", argv[0], outtreename);
+  if (prntsets)
+    fprintf(outfile, "Species in order: \n\n");
+
+  /* presumably the number of trees in the input, judged from the
+     semicolons -- confirm against countsemic() */
+  trees_in = countsemic(&intree);
+
+  /* Read the tree file and put together grouping, order, and timesseen */
+  read_groups (&pattern_array, timesseen_changes, trees_in, trees_in, intree);
+  /* Compute the consensus tree.  nodep is first replaced by a new array of
+     2*(1+spp) entries: the first spp are freshly allocated tip nodes that
+     carry the species names from nayme, the rest start out NULL.
+     NOTE(review): the previous value of nodep is overwritten here without
+     being freed; check whether read_groups() releases it. */
+  putc('\n', outfile);
+  nodep = (pointarray)Malloc(2*(1+spp)*sizeof(node *));
+  for (i = 0; i < spp; i++) {
+    nodep[i] = (node *)Malloc(sizeof(node));
+    for (j = 0; j < MAXNCH; j++)
+      nodep[i]->nayme[j] = '\0';
+    strncpy(nodep[i]->nayme, nayme[i], MAXNCH);
+  }
+  for (i = spp; i < 2*(1+spp); i++)
+    nodep[i] = NULL;
+  consensus(pattern_array, trees_in);
+  printf("\n");
+  if (trout) {
+    treeout(root);
+    if (progress)
+      printf("Consensus tree written to file \"%s\"\n\n", outtreename);
+  }
+  if (progress)
+    printf("Output written to file \"%s\"\n\n", outfilename);
+  for (i = 0; i < spp; i++)   /* tip nodes */
+    free(nodep[i]);
+  for (i = spp; i < 2*(1 + spp); i++) {
+    if (nodep[i] != NULL) {
+      /* free the other members of the ->next ring first, then nodep[i]
+         itself; this relies on the ring leading back to nodep[i] */
+      p = nodep[i]->next;
+      do {
+        q = p->next;
+        free(p);
+        p = q;
+      } while (p != nodep[i]);
+      free(p);
+    }
+  }
+  free(nodep);
+  /* NOTE(review): outtree is only opened above when trout is set, but it
+     is closed unconditionally; presumably FClose() tolerates a stream
+     that was never opened -- confirm. */
+  FClose(outtree);
+  FClose(intree);
+  FClose(outfile);
+
+#ifdef MAC
+  fixmacfile(outfilename);
+  fixmacfile(outtreename);
+#endif
+printf("Done.\n\n");
+
+#ifdef WIN32
+  phyRestoreConsoleAttributes();
+#endif
+
+return 0;
+} /* main */
+
diff --git a/forester/archive/RIO/others/phylip_mod/src/dist.c b/forester/archive/RIO/others/phylip_mod/src/dist.c new file mode 100644 index 0000000..3500c12 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/dist.c @@ -0,0 +1,526 @@ +#include "phylip.h" +#include "dist.h" + +/* version 3.6. (c) Copyright 1993-2004 by the University of Washington. + Written by Joseph Felsenstein, Akiko Fuseki, Sean Lamont, and Andrew Keeffe. + Permission is granted to copy and use this program provided no fee is + charged for it and provided that this copyright notice is not removed. */ + +void alloctree(pointptr *treenode, long nonodes) +{ + /* allocate treenode dynamically */ + /* used in fitch, kitsch & neighbor */ /* slots 0..spp-1 each get a single node; every slot from spp up to nonodes-1 gets three nodes linked into a ring through next, and the slot points at the last node allocated */ + long i, j; + node *p, *q; + + *treenode = (pointptr)Malloc(nonodes*sizeof(node *)); + for (i = 0; i < spp; i++) + (*treenode)[i] = (node *)Malloc(sizeof(node)); + for (i = spp; i < nonodes; i++) { + q = NULL; + for (j = 1; j <= 3; j++) { + p = (node *)Malloc(sizeof(node)); + p->next = q; + q = p; + } + p->next->next->next = p; /* close the chain of three into a ring */ + (*treenode)[i] = p; + } +} /* alloctree */ + + +void freetree(pointptr *treenode, long nonodes) +{ /* release what alloctree set up: the single node of each of the first spp slots, the three ring nodes of every later slot, then the pointer array itself */ + long i, j; + node *p, *q; + + for (i = 0; i < spp; i++) + free((*treenode)[i]); + for (i = spp; i < nonodes; i++) { + p = (*treenode)[i]; + for (j = 1; j <= 3; j++) { + q = p; + p = p->next; /* step on before the node just visited is freed */ + free(q); + } + } + free(*treenode); +} /* freetree */ + + +void allocd(long nonodes, pointptr treenode) +{ + /* used in fitch & kitsch */ /* give the node in each of the first spp slots, and each of the three ring nodes in the later slots, a d vector of nonodes doubles */ + long i, j; + node *p; + + for (i = 0; i < (spp); i++) { + treenode[i]->d = (vector)Malloc(nonodes*sizeof(double)); + } + for (i = spp; i < nonodes; i++) { + p = treenode[i]; + for (j = 1; j <= 3; j++) { + p->d = (vector)Malloc(nonodes*sizeof(double)); + p = p->next; + } + } +} + + +void freed(long nonodes, pointptr treenode) +{ + /* used in fitch */ /* free the d vector of every node that allocd visits */ + long i, j; + node *p; + + for (i = 0; i < (spp); i++) { + free(treenode[i]->d); + } + for (i = spp; i < nonodes; i++) { + p = treenode[i]; + for (j = 1; j
<= 3; j++) { + free(p->d); + p = p->next; + } + } +} + + +void allocw(long nonodes, pointptr treenode) +{ + /* used in fitch & kitsch */ /* same traversal as allocd, but allocates the w vector of nonodes doubles on each node */ + long i, j; + node *p; + + for (i = 0; i < (spp); i++) { + treenode[i]->w = (vector)Malloc(nonodes*sizeof(double)); + } + for (i = spp; i < nonodes; i++) { + p = treenode[i]; + for (j = 1; j <= 3; j++) { + p->w = (vector)Malloc(nonodes*sizeof(double)); + p = p->next; + } + } +} + + +void freew(long nonodes, pointptr treenode) +{ + /* used in fitch */ /* free the w vector of every node that allocw visits */ + long i, j; + node *p; + + for (i = 0; i < (spp); i++) { + free(treenode[i]->w); + } + for (i = spp; i < nonodes; i++) { + p = treenode[i]; + for (j = 1; j <= 3; j++) { + free(p->w); + p = p->next; + } + } +} + + +void setuptree(tree *a, long nonodes) +{ + /* initialize a tree */ + /* used in fitch, kitsch, & neighbor */ /* resets back, tip, iter, index, t, sametime and v on the node each slot points at; for slots past spp the remaining ring members get the same reset except that their v is left untouched */ + long i=0; + node *p; + + for (i = 1; i <= nonodes; i++) { + a->nodep[i - 1]->back = NULL; + a->nodep[i - 1]->tip = (i <= spp); + a->nodep[i - 1]->iter = true; + a->nodep[i - 1]->index = i; + a->nodep[i - 1]->t = 0.0; + a->nodep[i - 1]->sametime = false; + a->nodep[i - 1]->v = 0.0; + if (i > spp) { + p = a->nodep[i-1]->next; + while (p != a->nodep[i-1]) { + p->back = NULL; + p->tip = false; + p->iter = true; + p->index = i; + p->t = 0.0; + p->sametime = false; + p = p->next; + } + } + } + a->likelihood = -1.0; + a->start = a->nodep[0]; + a->root = NULL; +} /* setuptree */ + + +void inputdata(boolean replicates, boolean printdata, boolean lower, + boolean upper, vector *x, intvector *reps) +{ + /* read in distance matrix */ + /* used in fitch & neighbor */ /* fills x (distances) and reps (replicate counts, 1 when replicates is false) from infile; with lower or upper set only one triangle is read and each value read is mirrored into the other triangle; when printdata is set the matrix is echoed to outfile, 4 values per row with replicates and 6 without */ + long i=0, j=0, k=0, columns=0; + boolean skipit=false, skipother=false; + + if (replicates) + columns = 4; + else + columns = 6; + if (printdata) { + fprintf(outfile, "\nName Distances"); + if (replicates) + fprintf(outfile, " (replicates)"); + fprintf(outfile, "\n---- ---------"); + if (replicates) + fprintf(outfile, "-------------"); + fprintf(outfile, "\n\n"); + } + for (i = 0; i < spp; i++) { + x[i][i] =
0.0; + scan_eoln(infile); + initname(i); + for (j = 0; j < spp; j++) { /* skipit: cell (i,j) is absent from a triangular file; skipother: its mirror (j,i) is absent, so it is filled from this cell */ + skipit = ((lower && j + 1 >= i + 1) || (upper && j + 1 <= i + 1)); + skipother = ((lower && i + 1 >= j + 1) || (upper && i + 1 <= j + 1)); + if (!skipit) { + if (eoln(infile)) + scan_eoln(infile); + if (fscanf(infile, "%lf", &x[i][j]) != 1) { + printf("The infile is of the wrong type\n"); + exxit(-1); + } + if (replicates) { + if (eoln(infile)) + scan_eoln(infile); /* NOTE(review): the result of the next fscanf is not checked, unlike the distance read above; a failed read leaves reps[i][j] as it was */ + fscanf(infile, "%ld", &reps[i][j]); + } else + reps[i][j] = 1; + } + if (!skipit && skipother) { + x[j][i] = x[i][j]; + reps[j][i] = reps[i][j]; + } + if ((i == j) && (fabs(x[i][j]) > 0.000000001)) { + printf("\nERROR: diagonal element of row %ld of distance matrix ", i+1); + printf("is not zero.\n"); + printf(" Is it a distance matrix?\n\n"); + exxit(-1); + } + if ((j < i) && (fabs(x[i][j]-x[j][i]) > 0.000000001)) { + printf("ERROR: distance matrix is not symmetric:\n"); + printf(" (%ld,%ld) element and (%ld,%ld) element are unequal.\n", + i+1, j+1, j+1, i+1); + printf(" They are %10.6f and %10.6f, respectively.\n", + x[i][j], x[j][i]); + printf(" Is it a distance matrix?\n\n"); + exxit(-1); + } + } + } + scan_eoln(infile); + if (!printdata) + return; + for (i = 0; i < spp; i++) { + for (j = 0; j < nmlngth; j++) + putc(nayme[i][j], outfile); + putc(' ', outfile); + for (j = 1; j <= spp; j++) { + fprintf(outfile, "%10.5f", x[i][j - 1]); + if (replicates) + fprintf(outfile, " (%3ld)", reps[i][j - 1]); + if (j % columns == 0 && j < spp) { /* wrap and indent past the name column */ + putc('\n', outfile); + for (k = 1; k <= nmlngth + 1; k++) + putc(' ', outfile); + } + } + putc('\n', outfile); + } + putc('\n', outfile); +} /* inputdata */ + + +void coordinates(node *p, double lengthsum, long *tipy, double *tipmax, + node *start, boolean njoin) +{ + /* establishes coordinates of nodes */ /* a tip takes xcoord from over * lengthsum and the current *tipy row, then *tipy advances by down and *tipmax keeps the largest lengthsum seen; an interior node is placed only after the subtrees reachable through its ring have been visited */ + node *q, *first, *last; + + if (p->tip) { + p->xcoord = (long)(over * lengthsum + 0.5); + p->ycoord = *tipy; + p->ymin = *tipy; + p->ymax = *tipy; + (*tipy) += down; + if (lengthsum >
*tipmax) + *tipmax = lengthsum; + return; + } + q = p->next; + do { + if (q->back) + coordinates(q->back, lengthsum + q->v, tipy,tipmax, start, njoin); + q = q->next; + } while ((p == start || p != q) && (p != start || p->next != q)); /* for start the loop runs until q is back at p->next, otherwise until q is back at p */ + first = p->next->back; + q = p; + while (q->next != p && q->next->back) /* is this right ? */ + q = q->next; + last = q->back; + p->xcoord = (long)(over * lengthsum + 0.5); + if (p == start && p->back) + p->ycoord = p->next->next->back->ycoord; + else + p->ycoord = (first->ycoord + last->ycoord) / 2; + p->ymin = first->ymin; + p->ymax = last->ymax; +} /* coordinates */ + + +void drawline(long i, double scale, node *start, boolean rooted) +{ + /* draws one row of the tree diagram by moving up tree */ /* i is the row being printed; from start the walk moves at each interior node to the child whose [ymin, ymax] range contains i, writing '-', '+', '!' or blanks to outfile as it goes */ + node *p, *q; + long n=0, j=0; + boolean extra=false, trif=false; + node *r, *first =NULL, *last =NULL; /* first and last are assigned only while p is an interior node and are read only under !p->tip */ + boolean done=false; + + p = start; + q = start; + extra = false; + trif = false; + if (i == (long)p->ycoord && p == start) { /* display the root */ + if (rooted) { + if (p->index - spp >= 10) + fprintf(outfile, "-"); + else + fprintf(outfile, "--"); + } + else { + if (p->index - spp >= 10) + fprintf(outfile, " "); + else + fprintf(outfile, " "); + } + if (p->index - spp >= 10) + fprintf(outfile, "%2ld", p->index - spp); + else + fprintf(outfile, "%ld", p->index - spp); + extra = true; + trif = true; + } else + fprintf(outfile, " "); + do { + if (!p->tip) { /* internal nodes */ + r = p->next; + /* r->back here is going to the same node. */ + do { + if (!r->back) { + r = r->next; + continue; + } + if (i >= r->back->ymin && i <= r->back->ymax) { + q = r->back; + break; + } + r = r->next; + } while (!((p != start && r == p) || (p == start && r == p->next))); + first = p->next->back; + r = p; + while (r->next != p) + r = r->next; + last = r->back; + if (!rooted && (p == start)) + last = p->back; + } /* end internal node case...
*/ + /* draw the line: */ + done = (p->tip || p == q); + n = (long)(scale * (q->xcoord - p->xcoord) + 0.5); + if (!q->tip) { /* keep enough room for the node number that is printed at q */ + if ((n < 3) && (q->index - spp >= 10)) + n = 3; + if ((n < 2) && (q->index - spp < 10)) + n = 2; + } + if (extra) { + n--; + extra = false; + } + if ((long)q->ycoord == i && !done) { + if (p->ycoord != q->ycoord) + putc('+', outfile); + if (trif) { + n++; + trif = false; + } + if (!q->tip) { + for (j = 1; j <= n - 2; j++) + putc('-', outfile); + if (q->index - spp >= 10) + fprintf(outfile, "%2ld", q->index - spp); + else + fprintf(outfile, "-%ld", q->index - spp); + extra = true; + } else { + for (j = 1; j < n; j++) + putc('-', outfile); + } + } else if (!p->tip) { + if ((long)last->ycoord > i && (long)first->ycoord < i + && i != (long)p->ycoord) { + putc('!', outfile); + for (j = 1; j < n; j++) + putc(' ', outfile); + } else { + for (j = 1; j <= n; j++) + putc(' ', outfile); + trif = false; + } + } + if (q != p) + p = q; + } while (!done); + if ((long)p->ycoord == i && p->tip) { + for (j = 0; j < nmlngth; j++) + putc(nayme[p->index - 1][j], outfile); + } + putc('\n', outfile); +} /* drawline */ + + +void printree(node *start, boolean treeprint, + boolean njoin, boolean rooted) +{ + /* prints out diagram of the tree */ + /* used in fitch & neighbor */ /* does nothing when treeprint is false; otherwise lays the tree out with coordinates and calls drawline once per row from 1 to tipy - down, with scale = 1 / (long)(tipmax + 1) */ + long i; + long tipy; + double scale,tipmax; + + if (!treeprint) + return; + putc('\n', outfile); + tipy = 1; + tipmax = 0.0; + coordinates(start, 0.0, &tipy, &tipmax, start, njoin); + scale = 1.0 / (long)(tipmax + 1.000); + for (i = 1; i <= (tipy - down); i++) + drawline(i, scale, start, rooted); + putc('\n', outfile); +} /* printree */ + + +void treeoutr(node *p, long *col, tree *curtree) +{ + /* write out file with representation of final tree. + * Rooted case. Used in kitsch and neighbor.
*/ /* a tip prints its name up to the last non-blank character with blanks turned into underscores; a fork prints exactly two subtrees; *col tracks the output column */ + long i, n, w; + Char c; + double x; + + if (p->tip) { + n = 0; + for (i = 1; i <= nmlngth; i++) { + if (nayme[p->index - 1][i - 1] != ' ') + n = i; + } + for (i = 0; i < n; i++) { + c = nayme[p->index - 1][i]; + if (c == ' ') + c = '_'; + putc(c, outtree); + } + (*col) += n; + } else { + putc('(', outtree); + (*col)++; + treeoutr(p->next->back,col,curtree); + putc(',', outtree); + (*col)++; + if ((*col) > 55) { + putc('\n', outtree); + (*col) = 0; + } + treeoutr(p->next->next->back,col,curtree); + putc(')', outtree); + (*col)++; + } + x = p->v; /* w widens the printed field with the magnitude of x; the factor is approximately log10(e), so the product is about log10 of |x| */ + if (x > 0.0) + w = (long)(0.43429448222 * log(x)); + else if (x == 0.0) + w = 0; + else + w = (long)(0.43429448222 * log(-x)) + 1; + if (w < 0) + w = 0; + if (p == curtree->root) + fprintf(outtree, ";\n"); + else { + fprintf(outtree, ":%*.5f", (int)(w + 7), x); + (*col) += w + 8; + } +} /* treeoutr */ + + +void treeout(node *p, long *col, double m, boolean njoin, node *start) +{ + /* write out file with representation of final tree */ + /* used in fitch & neighbor */ /* like treeoutr, but the tree ends at start instead of a root, m is the factor applied to log(x) when sizing the branch-length field, and with njoin set the subtree behind start is written as a third child */ + long i=0, n=0, w=0; + Char c; + double x=0.0; + + if (p->tip) { + n = 0; + for (i = 1; i <= nmlngth; i++) { + if (nayme[p->index - 1][i - 1] != ' ') + n = i; + } + for (i = 0; i < n; i++) { + c = nayme[p->index - 1][i]; + if (c == ' ') + c = '_'; + putc(c, outtree); + } + *col += n; + } else { + putc('(', outtree); + (*col)++; + treeout(p->next->back, col, m, njoin, start); + putc(',', outtree); + (*col)++; + if (*col > 55) { + putc('\n', outtree); + *col = 0; + } + treeout(p->next->next->back, col, m, njoin, start); + if (p == start && njoin) { /* NOTE(review): *col is not advanced for this extra comma */ + putc(',', outtree); + treeout(p->back, col, m, njoin, start); + } + putc(')', outtree); + (*col)++; + } + x = p->v; + if (x > 0.0) + w = (long)(m * log(x)); + else if (x == 0.0) + w = 0; + else + w = (long)(m * log(-x)) + 1; + if (w < 0) + w = 0; + if (p == start) + fprintf(outtree, ";\n"); + else { + fprintf(outtree, ":%*.5f", (int) w + 7, x); + *col += w + 8; + } +} /* treeout */ + diff --git
a/forester/archive/RIO/others/phylip_mod/src/dist.h b/forester/archive/RIO/others/phylip_mod/src/dist.h new file mode 100644 index 0000000..574fd1f --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/dist.h @@ -0,0 +1,35 @@ + +/* version 3.6. (c) Copyright 1993-2000 by the University of Washington. + Written by Joseph Felsenstein, Akiko Fuseki, Sean Lamont, and Andrew Keeffe. + Permission is granted to copy and use this program provided no fee is + charged for it and provided that this copyright notice is not removed. */ + +/* + dist.h: included in fitch, kitsch, & neighbor +*/ + +#define over 60 + + +typedef long *intvector; + +typedef node **pointptr; + +#ifndef OLDC +/*function prototypes*/ +void alloctree(pointptr *, long); +void freetree(pointptr *, long); +void allocd(long, pointptr); +void freed(long, pointptr); +void allocw(long, pointptr); +void freew(long, pointptr); +void setuptree(tree *, long); +void inputdata(boolean, boolean, boolean, boolean, vector *, intvector *); +void coordinates(node *, double, long *, double *, node *, boolean); +void drawline(long, double, node *, boolean); +void printree(node *, boolean, boolean, boolean); +void treeoutr(node *, long *, tree *); +void treeout(node *, long *, double, boolean, node *); +/*function prototypes*/ +#endif + diff --git a/forester/archive/RIO/others/phylip_mod/src/fitch.c b/forester/archive/RIO/others/phylip_mod/src/fitch.c new file mode 100644 index 0000000..c55d050 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/fitch.c @@ -0,0 +1,1176 @@ + +#include "phylip.h" +#include "dist.h" + +/* version 3.6. (c) Copyright 1993-2004 by the University of Washington. + Written by Joseph Felsenstein, Akiko Fuseki, Sean Lamont, and Andrew Keeffe. + Permission is granted to copy and use this program provided no fee is + charged for it and provided that this copyright notice is not removed. 
*/ + +#define zsmoothings 10 /* number of zero-branch correction iterations */ +#define epsilonf 0.000001 /* a very small but not too small number */ +#define delta 0.0001 /* a not quite so small number */ +#define MAXNUMTREES 100000000 /* a number bigger than conceivable numtrees */ + + +#ifndef OLDC +/* function prototypes */ +void getoptions(void); +void allocrest(void); +void doinit(void); +void inputoptions(void); +void fitch_getinput(void); +void secondtraverse(node *, double , long *, double *); +void firsttraverse(node *, long *, double *); +double evaluate(tree *); +void nudists(node *, node *); +void makedists(node *); + +void makebigv(node *); +void correctv(node *); +void alter(node *, node *); +void nuview(node *); +void update(node *); +void smooth(node *); +void filltraverse(node *, node *, boolean); +void fillin(node *, node *, boolean); +void insert_(node *, node *, boolean); +void copynode(node *, node *); + +void copy_(tree *, tree *); +void setuptipf(long, tree *); +void buildnewtip(long , tree *, long); +void buildsimpletree(tree *, long); +void addtraverse(node *, node *, boolean, long *, boolean *); +void re_move(node **, node **); +void rearrange(node *, long *, long *, boolean *); +void describe(node *); +void summarize(long); +void nodeinit(node *); +void initrav(node *); +void treevaluate(void); +void maketree(void); +void globrearrange(long* numtrees,boolean* succeeded); +/* function prototypes */ +#endif + + + +Char infilename[FNMLNGTH], outfilename[FNMLNGTH], intreename[FNMLNGTH], outtreename[FNMLNGTH]; +long nonodes2, outgrno, nums, col, datasets, ith, njumble, jumb=0; +long inseed; +vector *x; +intvector *reps; +boolean minev, global, jumble, lengths, usertree, lower, upper, negallowed, + outgropt, replicates, trout, printdata, progress, treeprint, + mulsets, firstset; +double power; +double trweight; /* to make treeread happy */ +boolean goteof, haslengths; /* ditto ... */ +boolean first; /* ditto ... 
*/ +node *addwhere; + +longer seed; +long *enterorder; +tree curtree, priortree, bestree, bestree2; +Char ch; +char *progname; + + + +void getoptions() +{ + /* interactively set options */ + long inseed0=0, loopcount; + Char ch; + boolean done=false; + + putchar('\n'); + minev = false; + global = false; + jumble = false; + njumble = 1; + lengths = false; + lower = false; + negallowed = false; + outgrno = 1; + outgropt = false; + power = 2.0; + replicates = false; + trout = true; + upper = false; + usertree = false; + printdata = false; + progress = true; + treeprint = true; + loopcount = 0; + do { + cleerhome(); + printf("\nFitch-Margoliash method version %s\n\n",VERSION); + printf("Settings for this run:\n"); + printf(" D Method (F-M, Minimum Evolution)? %s\n", + (minev ? "Minimum Evolution" : "Fitch-Margoliash")); + printf(" U Search for best tree? %s\n", + (usertree ? "No, use user trees in input file" : "Yes")); + if (usertree) { + printf(" N Use lengths from user trees? %s\n", + (lengths ? "Yes" : "No")); + } + printf(" P Power?%9.5f\n",power); + printf(" - Negative branch lengths allowed? %s\n", + negallowed ? "Yes" : "No"); + printf(" O Outgroup root?"); + if (outgropt) + printf(" Yes, at species number%3ld\n", outgrno); + else + printf(" No, use as outgroup species%3ld\n", outgrno); + printf(" L Lower-triangular data matrix?"); + if (lower) + printf(" Yes\n"); + else + printf(" No\n"); + printf(" R Upper-triangular data matrix?"); + if (upper) + printf(" Yes\n"); + else + printf(" No\n"); + printf(" S Subreplicates?"); + if (replicates) + printf(" Yes\n"); + else + printf(" No\n"); + if (!usertree) { + printf(" G Global rearrangements?"); + if (global) + printf(" Yes\n"); + else + printf(" No\n"); + printf(" J Randomize input order of species?"); + if (jumble) + printf(" Yes (seed =%8ld,%3ld times)\n", inseed0, njumble); + else + printf(" No. 
Use input order\n"); + } + printf(" M Analyze multiple data sets?"); + if (mulsets) + printf(" Yes, %2ld sets\n", datasets); + else + printf(" No\n"); + printf(" 0 Terminal type (IBM PC, ANSI, none)?"); + if (ibmpc) + printf(" IBM PC\n"); + if (ansi) + printf(" ANSI\n"); + if (!(ibmpc || ansi)) + printf(" (none)\n"); + printf(" 1 Print out the data at start of run"); + if (printdata) + printf(" Yes\n"); + else + printf(" No\n"); + printf(" 2 Print indications of progress of run"); + if (progress) + printf(" Yes\n"); + else + printf(" No\n"); + printf(" 3 Print out tree"); + if (treeprint) + printf(" Yes\n"); + else + printf(" No\n"); + printf(" 4 Write out trees onto tree file?"); + if (trout) + printf(" Yes\n"); + else + printf(" No\n"); + printf( + "\n Y to accept these or type the letter for one to change\n"); +#ifdef WIN32 + phyFillScreenColor(); +#endif + scanf("%c%*[^\n]", &ch); + getchar(); + uppercase(&ch); + done = (ch == 'Y'); + if (!done) { + if (strchr("DJOUNPG-LRSM01234",ch) != NULL) { + switch (ch) { + + case 'D': + minev = !minev; + if (minev && (!negallowed)) + negallowed = true; + break; + + case '-': + negallowed = !negallowed; + break; + + case 'G': + global = !global; + break; + + case 'J': + jumble = !jumble; + if (jumble) + initjumble(&inseed, &inseed0, seed, &njumble); + else njumble = 1; + break; + + case 'L': + lower = !lower; + break; + + case 'N': + lengths = !lengths; + break; + + case 'O': + outgropt = !outgropt; + if (outgropt) + initoutgroup(&outgrno, spp); + break; + + case 'P': + initpower(&power); + break; + + case 'R': + upper = !upper; + break; + + case 'S': + replicates = !replicates; + break; + + case 'U': + usertree = !usertree; + break; + + case 'M': + mulsets = !mulsets; + if (mulsets) + initdatasets(&datasets); + jumble = true; + if (jumble) + initseed(&inseed, &inseed0, seed); + break; + + case '0': + initterminal(&ibmpc, &ansi); + break; + + case '1': + printdata = !printdata; + break; + + case '2': + progress = 
!progress; + break; + + case '3': + treeprint = !treeprint; + break; + + case '4': + trout = !trout; + break; + } + } else + printf("Not a possible option!\n"); + } + countup(&loopcount, 100); + } while (!done); + if (lower && upper) { + printf("ERROR: Data matrix cannot be both uppeR and Lower triangular\n"); + exxit(-1); + } +} /* getoptions */ + + +void allocrest() +{ + long i; + + x = (vector *)Malloc(spp*sizeof(vector)); + reps = (intvector *)Malloc(spp*sizeof(intvector)); + for (i=0;i 1) { + alloctree(&bestree2.nodep, nonodes2); + allocd(nonodes2, bestree2.nodep); + allocw(nonodes2, bestree2.nodep); + } + } + allocrest(); +} /* doinit */ + + +void inputoptions() +{ + /* print options information */ + if (!firstset) + samenumsp2(ith); + fprintf(outfile, "\nFitch-Margoliash method version %s\n\n",VERSION); + if (minev) + fprintf(outfile, "Minimum evolution method option\n\n"); + fprintf(outfile, " __ __ 2\n"); + fprintf(outfile, " \\ \\ (Obs - Exp)\n"); + fprintf(outfile, "Sum of squares = /_ /_ ------------\n"); + fprintf(outfile, " "); + if (power == (long)power) + fprintf(outfile, "%2ld\n", (long)power); + else + fprintf(outfile, "%4.1f\n", power); + fprintf(outfile, " i j Obs\n\n"); + fprintf(outfile, "Negative branch lengths "); + if (!negallowed) + fprintf(outfile, "not "); + fprintf(outfile, "allowed\n\n"); + if (global) + fprintf(outfile, "global optimization\n\n"); +} /* inputoptions */ + + +void fitch_getinput() +{ + /* reads the input data */ + inputoptions(); +} /* fitch_getinput */ + + +void secondtraverse(node *q, double y, long *nx, double *sum) +{ + /* from each of those places go back to all others */ + /* nx comes from firsttraverse */ + /* sum comes from evaluate via firsttraverse */ + double z=0.0, TEMP=0.0; + + z = y + q->v; + if (q->tip) { + TEMP = q->d[(*nx) - 1] - z; + *sum += q->w[(*nx) - 1] * (TEMP * TEMP); + } else { + secondtraverse(q->next->back, z, nx, sum); + secondtraverse(q->next->next->back, z, nx,sum); + } +} /* secondtraverse 
*/ + + +void firsttraverse(node *p, long *nx, double *sum) +{ + /* go through tree calculating branch lengths */ + if (minev && (p != curtree.start)) + *sum += p->v; + if (p->tip) { + if (!minev) { + *nx = p->index; + secondtraverse(p->back, 0.0, nx, sum); + } + } else { + firsttraverse(p->next->back, nx,sum); + firsttraverse(p->next->next->back, nx,sum); + } +} /* firsttraverse */ + + +double evaluate(tree *t) +{ + double sum=0.0; + long nx=0; + /* evaluate likelihood of a tree */ + firsttraverse(t->start->back ,&nx, &sum); + firsttraverse(t->start, &nx, &sum); + if ((!minev) && replicates && (lower || upper)) + sum /= 2; + t->likelihood = -sum; + return (-sum); +} /* evaluate */ + + +void nudists(node *x, node *y) +{ + /* compute distance between an interior node and tips */ + long nq=0, nr=0, nx=0, ny=0; + double dil=0, djl=0, wil=0, wjl=0, vi=0, vj=0; + node *qprime, *rprime; + + qprime = x->next; + rprime = qprime->next->back; + qprime = qprime->back; + ny = y->index; + dil = qprime->d[ny - 1]; + djl = rprime->d[ny - 1]; + wil = qprime->w[ny - 1]; + wjl = rprime->w[ny - 1]; + vi = qprime->v; + vj = rprime->v; + x->w[ny - 1] = wil + wjl; + if (wil + wjl <= 0.0) + x->d[ny - 1] = 0.0; + else + x->d[ny - 1] = ((dil - vi) * wil + (djl - vj) * wjl) / (wil + wjl); + nx = x->index; + nq = qprime->index; + nr = rprime->index; + dil = y->d[nq - 1]; + djl = y->d[nr - 1]; + wil = y->w[nq - 1]; + wjl = y->w[nr - 1]; + y->w[nx - 1] = wil + wjl; + if (wil + wjl <= 0.0) + y->d[nx - 1] = 0.0; + else + y->d[nx - 1] = ((dil - vi) * wil + (djl - vj) * wjl) / (wil + wjl); +} /* nudists */ + + +void makedists(node *p) +{ + /* compute distances among three neighbors of a node */ + long i=0, nr=0, ns=0; + node *q, *r, *s; + + r = p->back; + nr = r->index; + for (i = 1; i <= 3; i++) { + q = p->next; + s = q->back; + ns = s->index; + if (s->w[nr - 1] + r->w[ns - 1] <= 0.0) + p->dist = 0.0; + else + p->dist = (s->w[nr - 1] * s->d[nr - 1] + r->w[ns - 1] * r->d[ns - 1]) / + (s->w[nr - 1] 
+ r->w[ns - 1]); + p = q; + r = s; + nr = ns; + } +} /* makedists */ + + +void makebigv(node *p) +{ + /* make new branch length */ + long i=0; + node *temp, *q, *r; + + q = p->next; + r = q->next; + for (i = 1; i <= 3; i++) { + if (p->iter) { + p->v = (p->dist + r->dist - q->dist) / 2.0; + p->back->v = p->v; + } + temp = p; + p = q; + q = r; + r = temp; + } +} /* makebigv */ + + +void correctv(node *p) +{ + /* iterate branch lengths if some are to be zero */ + node *q, *r, *temp; + long i=0, j=0, n=0, nq=0, nr=0, ntemp=0; + double wq=0.0, wr=0.0; + + q = p->next; + r = q->next; + n = p->back->index; + nq = q->back->index; + nr = r->back->index; + for (i = 1; i <= zsmoothings; i++) { + for (j = 1; j <= 3; j++) { + if (p->iter) { + wr = r->back->w[n - 1] + p->back->w[nr - 1]; + wq = q->back->w[n - 1] + p->back->w[nq - 1]; + if (wr + wq <= 0.0 && !negallowed) + p->v = 0.0; + else + p->v = ((p->dist - q->v) * wq + (r->dist - r->v) * wr) / (wr + wq); + if (p->v < 0 && !negallowed) + p->v = 0.0; + p->back->v = p->v; + } + temp = p; + p = q; + q = r; + r = temp; + ntemp = n; + n = nq; + nq = nr; + nr = ntemp; + } + } +} /* correctv */ + + +void alter(node *x, node *y) +{ + /* traverse updating these views */ + nudists(x, y); + if (!y->tip) { + alter(x, y->next->back); + alter(x, y->next->next->back); + } +} /* alter */ + + +void nuview(node *p) +{ + /* renew information about subtrees */ + long i=0; + node *q, *r, *pprime, *temp; + + q = p->next; + r = q->next; + for (i = 1; i <= 3; i++) { + temp = p; + pprime = p->back; + alter(p, pprime); + p = q; + q = r; + r = temp; + } +} /* nuview */ + + +void update(node *p) +{ + /* update branch lengths around a node */ + + if (p->tip) + return; + makedists(p); + if (p->iter || p->next->iter || p->next->next->iter) { + makebigv(p); + correctv(p); + } + nuview(p); +} /* update */ + + +void smooth(node *p) +{ + /* go through tree getting new branch lengths and views */ + if (p->tip) + return; + update(p); + smooth(p->next->back); + 
smooth(p->next->next->back); +} /* smooth */ + + +void filltraverse(node *pb, node *qb, boolean contin) +{ + if (qb->tip) + return; + if (contin) { + filltraverse(pb, qb->next->back,contin); + filltraverse(pb, qb->next->next->back,contin); + nudists(qb, pb); + return; + } + if (!qb->next->back->tip) + nudists(qb->next->back, pb); + if (!qb->next->next->back->tip) + nudists(qb->next->next->back, pb); +} /* filltraverse */ + + +void fillin(node *pa, node *qa, boolean contin) +{ + if (!pa->tip) { + fillin(pa->next->back, qa, contin); + fillin(pa->next->next->back, qa, contin); + } + filltraverse(pa, qa, contin); +} /* fillin */ + + +void insert_(node *p, node *q, boolean contin_) +{ + /* put p and q together and iterate info. on resulting tree */ + double x=0.0, oldlike; + hookup(p->next->next, q->back); + hookup(p->next, q); + x = q->v / 2.0; + p->v = 0.0; + p->back->v = 0.0; + p->next->v = x; + p->next->back->v = x; + p->next->next->back->v = x; + p->next->next->v = x; + fillin(p->back, p, contin_); + evaluate(&curtree); + do { + oldlike = curtree.likelihood; + smooth(p); + smooth(p->back); + evaluate(&curtree); + } while (fabs(curtree.likelihood - oldlike) > delta); +} /* insert_ */ + + +void copynode(node *c, node *d) +{ + /* make a copy of a node */ + + memcpy(d->d, c->d, nonodes2*sizeof(double)); + memcpy(d->w, c->w, nonodes2*sizeof(double)); + d->v = c->v; + d->iter = c->iter; + d->dist = c->dist; + d->xcoord = c->xcoord; + d->ycoord = c->ycoord; + d->ymin = c->ymin; + d->ymax = c->ymax; +} /* copynode */ + + +void copy_(tree *a, tree *b) +{ + /* make copy of a tree a to tree b */ + long i, j=0; + node *p, *q; + + for (i = 0; i < spp; i++) { + copynode(a->nodep[i], b->nodep[i]); + if (a->nodep[i]->back) { + if (a->nodep[i]->back == a->nodep[a->nodep[i]->back->index - 1]) + b->nodep[i]->back = b->nodep[a->nodep[i]->back->index - 1]; + else if (a->nodep[i]->back + == a->nodep[a->nodep[i]->back->index - 1]->next) + b->nodep[i]->back = 
b->nodep[a->nodep[i]->back->index - 1]->next; + else + b->nodep[i]->back + = b->nodep[a->nodep[i]->back->index - 1]->next->next; + } + else b->nodep[i]->back = NULL; + } + for (i = spp; i < nonodes2; i++) { + p = a->nodep[i]; + q = b->nodep[i]; + for (j = 1; j <= 3; j++) { + copynode(p, q); + if (p->back) { + if (p->back == a->nodep[p->back->index - 1]) + q->back = b->nodep[p->back->index - 1]; + else if (p->back == a->nodep[p->back->index - 1]->next) + q->back = b->nodep[p->back->index - 1]->next; + else + q->back = b->nodep[p->back->index - 1]->next->next; + } + else + q->back = NULL; + p = p->next; + q = q->next; + } + } + b->likelihood = a->likelihood; + b->start = a->start; +} /* copy_ */ + + +void setuptipf(long m, tree *t) +{ + /* initialize branch lengths and views in a tip */ + long i=0; + intvector n=(long *)Malloc(spp * sizeof(long)); + node *WITH; + + WITH = t->nodep[m - 1]; + memcpy(WITH->d, x[m - 1], (nonodes2 * sizeof(double))); + memcpy(n, reps[m - 1], (spp * sizeof(long))); + for (i = 0; i < spp; i++) { + if (i + 1 != m && n[i] > 0) { + if (WITH->d[i] < epsilonf) + WITH->d[i] = epsilonf; + WITH->w[i] = n[i] / exp(power * log(WITH->d[i])); + } else { + WITH->w[i] = 0.0; + WITH->d[i] = 0.0; + } + } + for (i = spp; i < nonodes2; i++) { + WITH->w[i] = 1.0; + WITH->d[i] = 0.0; + } + WITH->index = m; + if (WITH->iter) WITH->v = 0.0; + free(n); +} /* setuptipf */ + + +void buildnewtip(long m, tree *t, long nextsp) +{ + /* initialize and hook up a new tip */ + node *p; + setuptipf(m, t); + p = t->nodep[nextsp + spp - 3]; + hookup(t->nodep[m - 1], p); +} /* buildnewtip */ + + +void buildsimpletree(tree *t, long nextsp) +{ + /* make and initialize a three-species tree */ + curtree.start=curtree.nodep[enterorder[0] - 1]; + setuptipf(enterorder[0], t); + setuptipf(enterorder[1], t); + hookup(t->nodep[enterorder[0] - 1], t->nodep[enterorder[1] - 1]); + buildnewtip(enterorder[2], t, nextsp); + insert_(t->nodep[enterorder[2] - 1]->back, t->nodep[enterorder[0] - 
1], + false); +} /* buildsimpletree */ + + +void addtraverse(node *p, node *q, boolean contin, long *numtrees, + boolean *succeeded) +{ + /* traverse through a tree, finding best place to add p */ + insert_(p, q, true); + (*numtrees)++; + if (evaluate(&curtree) > (bestree.likelihood + + epsilonf * fabs(bestree.likelihood))){ + copy_(&curtree, &bestree); + addwhere = q; + (*succeeded)=true; + } + copy_(&priortree, &curtree); + if (!q->tip && contin) { + addtraverse(p, q->next->back, contin,numtrees,succeeded); + addtraverse(p, q->next->next->back, contin,numtrees,succeeded); + } +} /* addtraverse */ + + +void re_move(node **p, node **q) +{ + /* re_move p and record in q where it was */ + *q = (*p)->next->back; + hookup(*q, (*p)->next->next->back); + (*p)->next->back = NULL; + (*p)->next->next->back = NULL; + update(*q); + update((*q)->back); +} /* re_move */ + + +void globrearrange(long* numtrees,boolean* succeeded) +{ + /* does global rearrangements */ + tree globtree; + tree oldtree; + int i,j,k,num_sibs,num_sibs2; + node *where,*sib_ptr,*sib_ptr2; + double oldbestyet = curtree.likelihood; + int success = false; + + alloctree(&globtree.nodep,nonodes2); + alloctree(&oldtree.nodep,nonodes2); + setuptree(&globtree,nonodes2); + setuptree(&oldtree,nonodes2); + allocd(nonodes2, globtree.nodep); + allocd(nonodes2, oldtree.nodep); + allocw(nonodes2, globtree.nodep); + allocw(nonodes2, oldtree.nodep); + copy_(&curtree,&globtree); + copy_(&curtree,&oldtree); + for ( i = spp ; i < nonodes2 ; i++ ) { + num_sibs = count_sibs(curtree.nodep[i]); + sib_ptr = curtree.nodep[i]; + if ( (i - spp) % (( nonodes2 / 72 ) + 1 ) == 0 ) + putchar('.'); + fflush(stdout); + for ( j = 0 ; j <= num_sibs ; j++ ) { + re_move(&sib_ptr,&where); + copy_(&curtree,&priortree); + + if (where->tip) { + copy_(&oldtree,&curtree); + copy_(&oldtree,&bestree); + sib_ptr=sib_ptr->next; + continue; + } + else num_sibs2 = count_sibs(where); + sib_ptr2 = where; + for ( k = 0 ; k < num_sibs2 ; k++ ) { + addwhere 
= NULL; + addtraverse(sib_ptr,sib_ptr2->back,true,numtrees,succeeded); + if ( addwhere && where != addwhere && where->back != addwhere + && bestree.likelihood > globtree.likelihood) { + copy_(&bestree,&globtree); + success = true; + } + sib_ptr2 = sib_ptr2->next; + } + copy_(&oldtree,&curtree); + copy_(&oldtree,&bestree); + sib_ptr = sib_ptr->next; + } + } + copy_(&globtree,&curtree); + copy_(&globtree,&bestree); + if (success && globtree.likelihood > oldbestyet) { + *succeeded = true; + } + else { + *succeeded = false; + } + freed(nonodes2, globtree.nodep); + freed(nonodes2, oldtree.nodep); + freew(nonodes2, globtree.nodep); + freew(nonodes2, oldtree.nodep); + freetree(&globtree.nodep,nonodes2); + freetree(&oldtree.nodep,nonodes2); +} + + +void rearrange(node *p, long *numtrees, long *nextsp, boolean *succeeded) +{ + node *q, *r; + if (!p->tip && !p->back->tip) { + r = p->next->next; + re_move(&r, &q); + copy_(&curtree, &priortree); + addtraverse(r, q->next->back, false, numtrees,succeeded); + addtraverse(r, q->next->next->back, false, numtrees,succeeded); + copy_(&bestree, &curtree); + if (global && ((*nextsp) == spp)) { + putchar('.'); + fflush(stdout); + } + } + if (!p->tip) { + rearrange(p->next->back, numtrees,nextsp,succeeded); + rearrange(p->next->next->back, numtrees,nextsp,succeeded); + } +} /* rearrange */ + + +void describe(node *p) +{ + /* print out information for one branch */ + long i=0; + node *q; + + q = p->back; + fprintf(outfile, "%4ld ", q->index - spp); + if (p->tip) { + for (i = 0; i < nmlngth; i++) + putc(nayme[p->index - 1][i], outfile); + } else + fprintf(outfile, "%4ld ", p->index - spp); + fprintf(outfile, "%15.5f\n", q->v); + if (!p->tip) { + describe(p->next->back); + describe(p->next->next->back); + } +} /* describe */ + + +void summarize(long numtrees) +{ + /* print out branch lengths etc. 
*/ + long i, j, totalnum; + + fprintf(outfile, "\nremember:"); + if (outgropt) + fprintf(outfile, " (although rooted by outgroup)"); + fprintf(outfile, " this is an unrooted tree!\n\n"); + if (!minev) + fprintf(outfile, "Sum of squares = %11.5f\n\n", -curtree.likelihood); + else + fprintf(outfile, "Sum of branch lengths = %11.5f\n\n", -curtree.likelihood); + if ((power == 2.0) && !minev) { + totalnum = 0; + for (i = 1; i <= nums; i++) { + for (j = 1; j <= nums; j++) { + if (i != j) + totalnum += reps[i - 1][j - 1]; + } + } + fprintf(outfile, "Average percent standard deviation = "); + fprintf(outfile, "%11.5f\n\n", + 100 * sqrt(-curtree.likelihood / (totalnum - 2))); + } + fprintf(outfile, "Between And Length\n"); + fprintf(outfile, "------- --- ------\n"); + describe(curtree.start->next->back); + describe(curtree.start->next->next->back); + describe(curtree.start->back); + fprintf(outfile, "\n\n"); + if (trout) { + col = 0; + treeout(curtree.start, &col, 0.43429445222, true, + curtree.start); + } +} /* summarize */ + + +void nodeinit(node *p) +{ + /* initialize a node */ + long i, j; + + for (i = 1; i <= 3; i++) { + for (j = 0; j < nonodes2; j++) { + p->w[j] = 1.0; + p->d[j] = 0.0; + } + p = p->next; + } + if ((!lengths) || p->iter) + p->v = 1.0; + if ((!lengths) || p->back->iter) + p->back->v = 1.0; +} /* nodeinit */ + + +void initrav(node *p) +{ + /* traverse to initialize */ + if (p->tip) + return; + nodeinit(p); + initrav(p->next->back); + initrav(p->next->next->back); +} /* initrav */ + +void treevaluate() +{ + /* evaluate user-defined tree, iterating branch lengths */ + long i; + double oldlike; + + for (i = 1; i <= spp; i++) + setuptipf(i, &curtree); + unroot(&curtree,nonodes2); + + initrav(curtree.start); + if (curtree.start->back != NULL) { + initrav(curtree.start->back); + evaluate(&curtree); + do { + oldlike = curtree.likelihood; + smooth(curtree.start); + evaluate(&curtree); + } while (fabs(curtree.likelihood - oldlike) > delta); + } + 
evaluate(&curtree); +} /* treevaluate */ + + +void maketree() +{ + /* contruct the tree */ + long nextsp,numtrees; + boolean succeeded=false; + long i, j, which; + + if (usertree) { + inputdata(replicates, printdata, lower, upper, x, reps); + setuptree(&curtree, nonodes2); + for (which = 1; which <= spp; which++) + setuptipf(which, &curtree); + if (eoln(infile)) + scan_eoln(infile); + openfile(&intree,INTREE,"input tree file","r",progname,intreename); + numtrees = countsemic(&intree); + if (numtrees > MAXNUMTREES) { + printf("\nERROR: number of input trees is read incorrectly from %s\n", + intreename); + exxit(-1); + } + if (treeprint) { + fprintf(outfile, "User-defined tree"); + if (numtrees > 1) + putc('s', outfile); + fprintf(outfile, ":\n\n"); + } + first = true; + which = 1; + while (which <= numtrees) { + treeread2 (intree, &curtree.start, curtree.nodep, + lengths, &trweight, &goteof, &haslengths, &spp,false,nonodes2); + nums = spp; + curtree.start = curtree.nodep[outgrno - 1]->back; + treevaluate(); + printree(curtree.start, treeprint, false, false); + summarize(numtrees); + clear_connections(&curtree,nonodes2); + which++; + } + FClose(intree); + } else { + if (jumb == 1) { + inputdata(replicates, printdata, lower, upper, x, reps); + setuptree(&curtree, nonodes2); + setuptree(&priortree, nonodes2); + setuptree(&bestree, nonodes2); + if (njumble > 1) setuptree(&bestree2, nonodes2); + } + for (i = 1; i <= spp; i++) + enterorder[i - 1] = i; + if (jumble) + randumize(seed, enterorder); + nextsp = 3; + buildsimpletree(&curtree, nextsp); + curtree.start = curtree.nodep[enterorder[0] - 1]->back; + if (jumb == 1) numtrees = 1; + nextsp = 4; + if (progress) { + printf("Adding species:\n"); + writename(0, 3, enterorder); +#ifdef WIN32 + phyFillScreenColor(); +#endif + } + while (nextsp <= spp) { + nums = nextsp; + buildnewtip(enterorder[nextsp - 1], &curtree, nextsp); + copy_(&curtree, &priortree); + bestree.likelihood = -99999.0; + curtree.start = 
curtree.nodep[enterorder[0] - 1]->back; + addtraverse(curtree.nodep[enterorder[nextsp - 1] - 1]->back, + curtree.start, true, &numtrees,&succeeded); + copy_(&bestree, &curtree); + if (progress) { + writename(nextsp - 1, 1, enterorder); +#ifdef WIN32 + phyFillScreenColor(); +#endif + } + if (global && nextsp == spp) { + if (progress) { + printf("Doing global rearrangements\n"); + printf(" !"); + for (j = spp; j < nonodes2; j++) + if ( (j - spp) % (( nonodes2 / 72 ) + 1 ) == 0 ) + putchar('-'); + printf("!\n"); + printf(" "); + } + } + succeeded = true; + while (succeeded) { + succeeded = false; + curtree.start = curtree.nodep[enterorder[0] - 1]->back; + if (nextsp == spp && global) + globrearrange (&numtrees,&succeeded); + else{ + rearrange(curtree.start,&numtrees,&nextsp,&succeeded); + } + if (global && ((nextsp) == spp) && progress) + printf("\n "); + } + if (global && nextsp == spp) { + putc('\n', outfile); + if (progress) + putchar('\n'); + } + if (njumble > 1) { + if (jumb == 1 && nextsp == spp) + copy_(&bestree, &bestree2); + else if (nextsp == spp) { + if (bestree2.likelihood < bestree.likelihood) + copy_(&bestree, &bestree2); + } + } + if (nextsp == spp && jumb == njumble) { + if (njumble > 1) copy_(&bestree2, &curtree); + curtree.start = curtree.nodep[outgrno - 1]->back; + printree(curtree.start, treeprint, true, false); + summarize(numtrees); + } + nextsp++; + } + } + if (jumb == njumble && progress) { + printf("\nOutput written to file \"%s\"\n\n", outfilename); + if (trout) { + printf("Tree also written onto file \"%s\"\n", outtreename); + putchar('\n'); + } + } +} /* maketree */ + + +int main(int argc, Char *argv[]) +{ + int i; +#ifdef MAC + argc = 1; /* macsetup("Fitch",""); */ + argv[0]="Fitch"; +#endif + init(argc,argv); + progname = argv[0]; + openfile(&infile,INFILE,"input file","r",argv[0],infilename); + openfile(&outfile,OUTFILE,"output file","w",argv[0],outfilename); + + ibmpc = IBMCRT; + ansi = ANSICRT; + mulsets = false; + datasets = 1; + 
firstset = true; + doinit(); + if (trout) + openfile(&outtree,OUTTREE,"output tree file","w",argv[0],outtreename); + for (i=0;i 1) { + fprintf(outfile, "Data set # %ld:\n\n",ith); + if (progress) + printf("\nData set # %ld:\n\n",ith); + } + fitch_getinput(); + for (jumb = 1; jumb <= njumble; jumb++) + maketree(); + firstset = false; + if (eoln(infile) && (ith < datasets)) + scan_eoln(infile); + } + if (trout) + FClose(outtree); + FClose(outfile); + FClose(infile); +#ifdef MAC + fixmacfile(outfilename); + fixmacfile(outtreename); +#endif + printf("Done.\n\n"); +#ifdef WIN32 + phyRestoreConsoleAttributes(); +#endif + return 0; +} diff --git a/forester/archive/RIO/others/phylip_mod/src/neighbor.c b/forester/archive/RIO/others/phylip_mod/src/neighbor.c new file mode 100644 index 0000000..62150ce --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/neighbor.c @@ -0,0 +1,602 @@ + +#include "phylip.h" +#include "dist.h" + +/* version 3.6. (c) Copyright 1993-2004 by the University of Washington. + Written by Mary Kuhner, Jon Yamato, Joseph Felsenstein, Akiko Fuseki, + Sean Lamont, and Andrew Keeffe. + Permission is granted to copy and use this program provided no fee is + charged for it and provided that this copyright notice is not removed. 
*/ + + +#ifndef OLDC +/* function prototypes */ +void getoptions(void); +void allocrest(void); +void doinit(void); +void inputoptions(void); +void getinput(void); +void describe(node *, double); +void summarize(void); +void nodelabel(boolean); +void jointree(void); +void maketree(void); +void freerest(void); +/* function prototypes */ +#endif + + +Char infilename[FNMLNGTH], outfilename[FNMLNGTH], outtreename[FNMLNGTH]; +long nonodes2, outgrno, col, datasets, ith; +long inseed; +vector *x; +intvector *reps; +boolean jumble, lower, upper, outgropt, replicates, trout, + printdata, progress, treeprint, mulsets, njoin; +tree curtree; +longer seed; +long *enterorder; +Char progname[20]; + +/* variables for maketree, propagated globally for C version: */ +node **cluster; + + +void getoptions() +{ + /* interactively set options */ + long inseed0 = 0, loopcount; + Char ch; + + fprintf(outfile, "\nNeighbor-Joining/UPGMA method version %s\n\n",VERSION); + putchar('\n'); + jumble = false; + lower = false; + outgrno = 1; + outgropt = false; + replicates = false; + trout = true; + upper = false; + printdata = false; + progress = true; + treeprint = true; + njoin = true; + loopcount = 0; + for(;;) { + cleerhome(); + printf("\nNeighbor-Joining/UPGMA method version %s\n\n",VERSION); + printf("Settings for this run:\n"); + printf(" N Neighbor-joining or UPGMA tree? %s\n", + (njoin ? "Neighbor-joining" : "UPGMA")); + if (njoin) { + printf(" O Outgroup root?"); + if (outgropt) + printf(" Yes, at species number%3ld\n", outgrno); + else + printf(" No, use as outgroup species%3ld\n", outgrno); + } + printf(" L Lower-triangular data matrix? %s\n", + (lower ? "Yes" : "No")); + printf(" R Upper-triangular data matrix? %s\n", + (upper ? "Yes" : "No")); + printf(" S Subreplicates? %s\n", + (replicates ? "Yes" : "No")); + printf(" J Randomize input order of species?"); + if (jumble) + printf(" Yes (random number seed =%8ld)\n", inseed0); + else + printf(" No. 
Use input order\n"); + printf(" M Analyze multiple data sets?"); + if (mulsets) + printf(" Yes, %2ld sets\n", datasets); + else + printf(" No\n"); + printf(" 0 Terminal type (IBM PC, ANSI, none)? %s\n", + (ibmpc ? "IBM PC" : ansi ? "ANSI" : "(none)")); + printf(" 1 Print out the data at start of run %s\n", + (printdata ? "Yes" : "No")); + printf(" 2 Print indications of progress of run %s\n", + (progress ? "Yes" : "No")); + printf(" 3 Print out tree %s\n", + (treeprint ? "Yes" : "No")); + printf(" 4 Write out trees onto tree file? %s\n", + (trout ? "Yes" : "No")); + printf("\n\n Y to accept these or type the letter for one to change\n"); +#ifdef WIN32 + phyFillScreenColor(); +#endif + scanf("%c%*[^\n]", &ch); + getchar(); + if (ch == '\n') + ch = ' '; + uppercase(&ch); + if (ch == 'Y') + break; + if (strchr("NJOULRSM01234",ch) != NULL){ + switch (ch) { + + case 'J': + jumble = !jumble; + if (jumble) + initseed(&inseed, &inseed0, seed); + break; + + case 'L': + lower = !lower; + break; + + case 'O': + outgropt = !outgropt; + if (outgropt) + initoutgroup(&outgrno, spp); + else + outgrno = 1; + break; + + case 'R': + upper = !upper; + break; + + case 'S': + replicates = !replicates; + break; + + case 'N': + njoin = !njoin; + break; + + case 'M': + mulsets = !mulsets; + if (mulsets) + initdatasets(&datasets); + jumble = true; + if (jumble) + initseed(&inseed, &inseed0, seed); + break; + + case '0': + initterminal(&ibmpc, &ansi); + break; + + case '1': + printdata = !printdata; + break; + + case '2': + progress = !progress; + break; + + case '3': + treeprint = !treeprint; + break; + + case '4': + trout = !trout; + break; + } + } else + printf("Not a possible option!\n"); + countup(&loopcount, 100); + } +} /* getoptions */ + + +void allocrest() +{ + long i; + + x = (vector *)Malloc(spp*sizeof(vector)); + for (i = 0; i < spp; i++) + x[i] = (vector)Malloc(spp*sizeof(double)); + reps = (intvector *)Malloc(spp*sizeof(intvector)); + for (i = 0; i < spp; i++) + reps[i] = 
(intvector)Malloc(spp*sizeof(long)); + nayme = (naym *)Malloc(spp*sizeof(naym)); + enterorder = (long *)Malloc(spp*sizeof(long)); + cluster = (node **)Malloc(spp*sizeof(node *)); +} /* allocrest */ + + +void freerest() +{ + long i; + + for (i = 0; i < spp; i++) + free(x[i]); + free(x); + for (i = 0; i < spp; i++) + free(reps[i]); + free(reps); + free(nayme); + free(enterorder); + free(cluster); +} /* freerest */ + + +void doinit() +{ + /* initializes variables */ + node *p; + + inputnumbers2(&spp, &nonodes2, 2); + nonodes2 += (njoin ? 0 : 1); + getoptions(); + alloctree(&curtree.nodep, nonodes2+1); + p = curtree.nodep[nonodes2]->next->next; + curtree.nodep[nonodes2]->next = curtree.nodep[nonodes2]; + free(p); + allocrest(); + +} /* doinit */ + + +void inputoptions() +{ + /* read options information */ + + if (ith != 1) + samenumsp2(ith); + putc('\n', outfile); + if (njoin) + fprintf(outfile, " Neighbor-joining method\n"); + else + fprintf(outfile, " UPGMA method\n"); + fprintf(outfile, "\n Negative branch lengths allowed\n\n"); +} /* inputoptions */ + + +void describe(node *p, double height) +{ + /* print out information for one branch */ + long i; + node *q; + + q = p->back; + if (njoin) + fprintf(outfile, "%4ld ", q->index - spp); + else + fprintf(outfile, "%4ld ", q->index - spp); + if (p->tip) { + for (i = 0; i < nmlngth; i++) + putc(nayme[p->index - 1][i], outfile); + putc(' ', outfile); + } else { + if (njoin) + fprintf(outfile, "%4ld ", p->index - spp); + else { + fprintf(outfile, "%4ld ", p->index - spp); + } + } + if (njoin) + fprintf(outfile, "%12.5f\n", q->v); + else + fprintf(outfile, "%10.5f %10.5f\n", q->v, q->v+height); + if (!p->tip) { + describe(p->next->back, height+q->v); + describe(p->next->next->back, height+q->v); + } +} /* describe */ + + +void summarize() +{ + /* print out branch lengths etc. 
*/ + putc('\n', outfile); + if (njoin) { + fprintf(outfile, "remember:"); + if (outgropt) + fprintf(outfile, " (although rooted by outgroup)"); + fprintf(outfile, " this is an unrooted tree!\n"); + } + if (njoin) { + fprintf(outfile, "\nBetween And Length\n"); + fprintf(outfile, "------- --- ------\n"); + } else { + fprintf(outfile, "From To Length Height\n"); + fprintf(outfile, "---- -- ------ ------\n"); + } + describe(curtree.start->next->back, 0.0); + describe(curtree.start->next->next->back, 0.0); + if (njoin) + describe(curtree.start->back, 0.0); + fprintf(outfile, "\n\n"); +} /* summarize */ + + +void nodelabel(boolean isnode) +{ + if (isnode) + printf("node"); + else + printf("species"); +} /* nodelabel */ + + +void jointree() +{ + /* calculate the tree */ + long nc, nextnode, mini=0, minj=0, i, j, ia, ja, ii, jj, nude, iter; + double fotu2, total, tmin, dio, djo, bi, bj, bk, dmin=0, da; + long el[3]; + vector av; + intvector oc; + + double *R; /* added in revisions by Y. Ina */ + R = (double *)Malloc(spp * sizeof(double)); + + for (i = 0; i <= spp - 2; i++) { + for (j = i + 1; j < spp; j++) { + da = (x[i][j] + x[j][i]) / 2.0; + x[i][j] = da; + x[j][i] = da; + } + } + /* First initialization */ + fotu2 = spp - 2.0; + nextnode = spp + 1; + av = (vector)Malloc(spp*sizeof(double)); + oc = (intvector)Malloc(spp*sizeof(long)); + for (i = 0; i < spp; i++) { + av[i] = 0.0; + oc[i] = 1; + } + /* Enter the main cycle */ + if (njoin) + iter = spp - 3; + else + iter = spp - 1; + for (nc = 1; nc <= iter; nc++) { + for (j = 2; j <= spp; j++) { + for (i = 0; i <= j - 2; i++) + x[j - 1][i] = x[i][j - 1]; + } + tmin = 99999.0; + /* Compute sij and minimize */ + if (njoin) { /* many revisions by Y. Ina from here ... 
*/ + for (i = 0; i < spp; i++) + R[i] = 0.0; + for (ja = 2; ja <= spp; ja++) { + jj = enterorder[ja - 1]; + if (cluster[jj - 1] != NULL) { + for (ia = 0; ia <= ja - 2; ia++) { + ii = enterorder[ia]; + if (cluster[ii - 1] != NULL) { + R[ii - 1] += x[ii - 1][jj - 1]; + R[jj - 1] += x[ii - 1][jj - 1]; + } + } + } + } + } /* ... to here */ + for (ja = 2; ja <= spp; ja++) { + jj = enterorder[ja - 1]; + if (cluster[jj - 1] != NULL) { + for (ia = 0; ia <= ja - 2; ia++) { + ii = enterorder[ia]; + if (cluster[ii - 1] != NULL) { + if (njoin) { + total = fotu2 * x[ii - 1][jj - 1] - R[ii - 1] - R[jj - 1]; + /* this statement part of revisions by Y. Ina */ + } else + total = x[ii - 1][jj - 1]; + if (total < tmin) { + tmin = total; + mini = ii; + minj = jj; + } + } + } + } + } + /* compute lengths and print */ + if (njoin) { + dio = 0.0; + djo = 0.0; + for (i = 0; i < spp; i++) { + dio += x[i][mini - 1]; + djo += x[i][minj - 1]; + } + dmin = x[mini - 1][minj - 1]; + dio = (dio - dmin) / fotu2; + djo = (djo - dmin) / fotu2; + bi = (dmin + dio - djo) * 0.5; + bj = dmin - bi; + bi -= av[mini - 1]; + bj -= av[minj - 1]; + } else { + bi = x[mini - 1][minj - 1] / 2.0 - av[mini - 1]; + bj = x[mini - 1][minj - 1] / 2.0 - av[minj - 1]; + av[mini - 1] += bi; + } + if (progress) { + printf("Cycle %3ld: ", iter - nc + 1); + if (njoin) + nodelabel((boolean)(av[mini - 1] > 0.0)); + else + nodelabel((boolean)(oc[mini - 1] > 1.0)); + printf(" %ld (%10.5f) joins ", mini, bi); + if (njoin) + nodelabel((boolean)(av[minj - 1] > 0.0)); + else + nodelabel((boolean)(oc[minj - 1] > 1.0)); + printf(" %ld (%10.5f)\n", minj, bj); +#ifdef WIN32 + phyFillScreenColor(); +#endif + } + hookup(curtree.nodep[nextnode - 1]->next, cluster[mini - 1]); + hookup(curtree.nodep[nextnode - 1]->next->next, cluster[minj - 1]); + cluster[mini - 1]->v = bi; + cluster[minj - 1]->v = bj; + cluster[mini - 1]->back->v = bi; + cluster[minj - 1]->back->v = bj; + cluster[mini - 1] = curtree.nodep[nextnode - 1]; + cluster[minj - 1] 
= NULL; + nextnode++; + if (njoin) + av[mini - 1] = dmin * 0.5; + /* re-initialization */ + fotu2 -= 1.0; + for (j = 0; j < spp; j++) { + if (cluster[j] != NULL) { + if (njoin) { + da = (x[mini - 1][j] + x[minj - 1][j]) * 0.5; + if (mini - j - 1 < 0) + x[mini - 1][j] = da; + if (mini - j - 1 > 0) + x[j][mini - 1] = da; + } else { + da = x[mini - 1][j] * oc[mini - 1] + x[minj - 1][j] * oc[minj - 1]; + da /= oc[mini - 1] + oc[minj - 1]; + x[mini - 1][j] = da; + x[j][mini - 1] = da; + } + } + } + for (j = 0; j < spp; j++) { + x[minj - 1][j] = 0.0; + x[j][minj - 1] = 0.0; + } + oc[mini - 1] += oc[minj - 1]; + } + /* the last cycle */ + nude = 1; + for (i = 1; i <= spp; i++) { + if (cluster[i - 1] != NULL) { + el[nude - 1] = i; + nude++; + } + } + if (!njoin) { + curtree.start = cluster[el[0] - 1]; + curtree.start->back = NULL; + free(av); + free(oc); + return; + } + bi = (x[el[0] - 1][el[1] - 1] + x[el[0] - 1][el[2] - 1] - x[el[1] - 1] + [el[2] - 1]) * 0.5; + bj = x[el[0] - 1][el[1] - 1] - bi; + bk = x[el[0] - 1][el[2] - 1] - bi; + bi -= av[el[0] - 1]; + bj -= av[el[1] - 1]; + bk -= av[el[2] - 1]; + if (progress) { + printf("last cycle:\n"); + putchar(' '); + nodelabel((boolean)(av[el[0] - 1] > 0.0)); + printf(" %ld (%10.5f) joins ", el[0], bi); + nodelabel((boolean)(av[el[1] - 1] > 0.0)); + printf(" %ld (%10.5f) joins ", el[1], bj); + nodelabel((boolean)(av[el[2] - 1] > 0.0)); + printf(" %ld (%10.5f)\n", el[2], bk); +#ifdef WIN32 + phyFillScreenColor(); +#endif + } + hookup(curtree.nodep[nextnode - 1], cluster[el[0] - 1]); + hookup(curtree.nodep[nextnode - 1]->next, cluster[el[1] - 1]); + hookup(curtree.nodep[nextnode - 1]->next->next, cluster[el[2] - 1]); + cluster[el[0] - 1]->v = bi; + cluster[el[1] - 1]->v = bj; + cluster[el[2] - 1]->v = bk; + cluster[el[0] - 1]->back->v = bi; + cluster[el[1] - 1]->back->v = bj; + cluster[el[2] - 1]->back->v = bk; + curtree.start = cluster[el[0] - 1]->back; + free(av); + free(oc); + free(R); +} /* jointree */ + + +void maketree() 
+{ + /* construct the tree */ + long i ; + + inputdata(replicates, printdata, lower, upper, x, reps); + if (njoin && (spp < 3)) { + printf("\nERROR: Neighbor-Joining runs must have at least 3 species\n\n"); + exxit(-1); + } + if (progress) + putchar('\n'); + if (ith == 1) + setuptree(&curtree, nonodes2 + 1); + for (i = 1; i <= spp; i++) + enterorder[i - 1] = i; + if (jumble) + randumize(seed, enterorder); + for (i = 0; i < spp; i++) + cluster[i] = curtree.nodep[i]; + jointree(); + if (njoin) + curtree.start = curtree.nodep[outgrno - 1]->back; + printree(curtree.start, treeprint, njoin, (boolean)(!njoin)); + if (treeprint) + summarize(); + if (trout) { + col = 0; + if (njoin) + treeout(curtree.start, &col, 0.43429448222, njoin, curtree.start); + else + curtree.root = curtree.start, + treeoutr(curtree.start,&col,&curtree); + } + if (progress) { + printf("\nOutput written on file \"%s\"\n\n", outfilename); + if (trout) + printf("Tree written on file \"%s\"\n\n", outtreename); + } +} /* maketree */ + + +int main(int argc, Char *argv[]) +{ /* main program */ +#ifdef MAC + argc = 1; /* macsetup("Neighbor",""); */ + argv[0] = "Neighbor"; +#endif + init(argc, argv); + openfile(&infile,INFILE,"input file", "r",argv[0],infilename); + openfile(&outfile,OUTFILE,"output file", "w",argv[0],outfilename); + ibmpc = IBMCRT; + ansi = ANSICRT; + mulsets = false; + datasets = 1; + doinit(); + if (trout) + openfile(&outtree,OUTTREE,"output tree file", "w",argv[0],outtreename); + ith = 1; + while (ith <= datasets) { + if (datasets > 1) { + fprintf(outfile, "Data set # %ld:\n",ith); + if (progress) + printf("Data set # %ld:\n",ith); + } + inputoptions(); + maketree(); + if (eoln(infile) && (ith < datasets)) + scan_eoln(infile); + ith++; + } + FClose(infile); + FClose(outfile); + FClose(outtree); + freerest(); +#ifdef MAC + fixmacfile(outfilename); + fixmacfile(outtreename); +#endif + printf("Done.\n\n"); +#ifdef WIN32 + phyRestoreConsoleAttributes(); +#endif + return 0; +} + + + + + diff 
--git a/forester/archive/RIO/others/phylip_mod/src/phylip.c b/forester/archive/RIO/others/phylip_mod/src/phylip.c new file mode 100644 index 0000000..2c87e74 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/phylip.c @@ -0,0 +1,2750 @@ + +/* version 3.6. (c) Copyright 1993-2004 by the University of Washington. + Written by Joseph Felsenstein, Akiko Fuseki, Sean Lamont, Andrew Keeffe, + and Dan Fineman. + Permission is granted to copy and use this program provided no fee is + charged for it and provided that this copyright notice is not removed. */ + +#ifdef OSX_CARBON +#include +#endif + +#include +#include +#ifdef WIN32 +#include +/* for console code (clear screen, text color settings) */ +CONSOLE_SCREEN_BUFFER_INFO savecsbi; +HANDLE hConsoleOutput; + +void phyClearScreen(); +void phySaveConsoleAttributes(); +void phySetConsoleAttributes(); +void phyRestoreConsoleAttributes(); +void phyFillScreenColor(); +#endif + +#include "phylip.h" + +#ifndef OLDC +static void crash_handler(int signum); + +#endif +#if defined(OSX_CARBON) && defined(__MWERKS__) +boolean fixedpath = false; +#endif +FILE *infile, *outfile, *intree, *intree2, *outtree, *weightfile, *catfile, *ancfile, *mixfile, *factfile; +long spp, words, bits; +boolean ibmpc, ansi, tranvsp; +naym *nayme; /* names of species */ + +static void crash_handler(int sig_num) +{ /* when we crash, lets print out something usefull */ + printf("ERROR: "); + switch(sig_num) { +#ifdef SIGSEGV + case SIGSEGV: + puts("This program has caused a Segmentation fault."); + break; +#endif /* SIGSEGV */ +#ifdef SIGFPE + case SIGFPE: + puts("This program has caused a Floating Point Exception"); + break; +#endif /* SIGFPE */ +#ifdef SIGILL + case SIGILL: + puts("This program has attempted an illegal instruction"); + break; +#endif /* SIGILL */ +#ifdef SIGPIPE + case SIGPIPE: + puts("This program tried to write to a broken pipe"); + break; +#endif /* SIGPIPE */ +#ifdef SIGBUS + case SIGBUS: + puts("This program had a bus 
error"); + break; +#endif /* SIGBUS */ + } + if (sig_num == SIGSEGV) { + puts( + " This may have been caused by an incorrectly formatted input file"); + puts( + " or input tree file. You should check those files carefully."); + puts(" If this seems to be a bug, please mail joe@gs.washington.edu"); + } + else { + puts(" Most likely, you have encountered a bug in the program."); + puts(" Since this seems to be a bug, please mail joe@gs.washington.edu"); + } + puts(" with the name of the program, your computer system type,"); + puts(" a full description of the problem, and with the input data file."); + puts(" (which should be in the body of the message, not as an Attachment)."); + +#ifdef WIN32 + puts ("Press Enter or Return to close program."); + puts(" You may have to press Enter or Return twice."); + getchar (); + getchar (); + phyRestoreConsoleAttributes(); +#endif + abort(); +} + + +void init(int argc, char** argv) +{ /* initialization routine for all programs + * anything done at the beginig for every program should be done here */ + + /* set up signal handler for + * segfault,floating point exception, illeagal instruction, bad pipe, bus error + * there are more signals that can cause a crash, but these are the most common + * even these aren't found on all machines. 
*/ +#ifdef SIGSEGV + signal(SIGSEGV, crash_handler); +#endif /* SIGSEGV */ +#ifdef SIGFPE + signal(SIGFPE, crash_handler); +#endif /* SIGFPE */ +#ifdef SIGILL + signal(SIGILL, crash_handler); +#endif /* SIGILL */ +#ifdef SIGPIPE + signal(SIGPIPE, crash_handler); +#endif /* SIGPIPE */ +#ifdef SIGBUS + signal(SIGBUS, crash_handler); +#endif /* SIGBUS */ + +#ifdef WIN32 + phySetConsoleAttributes(); + phyClearScreen(); +#endif + +} + +void scan_eoln(FILE *f) +{ /* eat everything to the end of line or eof*/ + char ch; + + while (!eoff(f) && !eoln(f)) + gettc(f); + if (!eoff(f)) + ch = gettc(f); +} + + +boolean eoff(FILE *f) +{ /* check for end of file */ + int ch; + + if (feof(f)) + return true; + ch = getc(f); + if (ch == EOF) { + ungetc(ch, f); + return true; + } + ungetc(ch, f); + return false; +} /*eoff*/ + + +boolean eoln(FILE *f) +{ /* check for end of line or eof*/ + register int ch; + + ch = getc(f); + if (ch == EOF) + return true; + ungetc(ch, f); + return ((ch == '\n') || (ch == '\r')); +} /*eoln*/ + + +int filexists(char *filename) +{ /* check whether file already exists */ + FILE *fp; + fp =fopen(filename,"rb"); + if (fp) { + fclose(fp); + return 1; + } else + return 0; +} /*filexists*/ + + +const char* get_command_name (const char *vektor) +{ /* returns the name of the program from vektor without the whole path */ + char *last_slash; + + /* Point to the last slash... 
*/ + last_slash = strrchr (vektor, DELIMITER); + + if (last_slash) + /* If there was a last slash, return the character after it */ + return last_slash + 1; + else + /* If not, return the vector */ + return vektor; + +} /*get_command_name*/ + + +void getstryng(char *fname) +{ /* read in a file name from stdin and take off newline if any */ + + fname = fgets(fname, 100, stdin); + if (strchr(fname, '\n') != NULL) + *strchr(fname, '\n') = '\0'; +} /* getstryng */ + + +void countup(long *loopcount, long maxcount) +{ /* count how many times this loop has tried to read data, bail out + if exceeds maxcount */ + + (*loopcount)++; + if ((*loopcount) >= maxcount) { + printf("\nERROR: Made %ld attempts to read input in loop. Aborting run.\n", + *loopcount); + exxit(-1); + } +} /* countup */ + + +void openfile(FILE **fp,const char *filename,const char *filedesc, + const char *mode,const char *application, char *perm) +{ /* open a file, testing whether it exists etc. */ + FILE *of; + char file[FNMLNGTH]; + char filemode[3]; + char input[FNMLNGTH]; + char ch; + const char *progname_without_path; + long loopcount, loopcount2; +#if defined(OSX_CARBON) && defined(__MWERKS__) + ProcessSerialNumber myProcess; + FSRef bundleLocation; + unsigned char bundlePath[FNMLNGTH]; + + if(!fixedpath){ + /* change path to the bundle location instead of root directory */ + GetCurrentProcess(&myProcess); + GetProcessBundleLocation(&myProcess, &bundleLocation); + FSRefMakePath(&bundleLocation, bundlePath, FNMLNGTH); + chdir((const char*)bundlePath); + chdir(".."); /* get out of the .app directory */ + + fixedpath = true; + } +#endif + + progname_without_path = get_command_name(application); + + strcpy(file,filename); + strcpy(filemode,mode); + strcat(filemode,"b"); + loopcount = 0; + while (1){ + if (filemode[0] == 'w' && filexists(file)){ + printf("\n%s: the file \"%s\" that you wanted to\n", + progname_without_path, file); + printf(" use as %s already exists.\n", filedesc); + printf(" Do you want 
to Replace it, Append to it,\n"); + printf(" write to a new File, or Quit?\n"); + loopcount2 = 0; + do { + printf(" (please type R, A, F, or Q) \n"); +#ifdef WIN32 + phyFillScreenColor(); +#endif + fgets(input, sizeof(input), stdin); + ch = input[0]; + uppercase(&ch); + countup(&loopcount2, 10); + } while (ch != 'A' && ch != 'R' && ch != 'F' && ch != 'Q'); + if (ch == 'Q') + exxit(-1); + if (ch == 'A') { + strcpy(filemode,"ab"); + continue; + } + else if (ch == 'F') { + file[0] = '\0'; + loopcount2 = 0; + while (file[0] =='\0') { + printf("Please enter a new file name> "); + getstryng(file); + countup(&loopcount2, 10); + } + strcpy(filemode,"wb"); + continue; + } + } + of = fopen(file,filemode); + if (of) + break; + else { + switch (filemode[0]){ + + case 'r': + printf("%s: can't find %s \"%s\"\n", progname_without_path, + filedesc, file); + file[0] = '\0'; + loopcount2 = 0; + while (file[0] =='\0'){ + printf("Please enter a new file name> "); + countup(&loopcount2, 10); + getstryng(file);} + break; + + case 'w': + case 'a': + printf("%s: can't write %s \"%s\"\n", progname_without_path, + filedesc, file); + file[0] = '\0'; + loopcount2 = 0; + while (file[0] =='\0'){ + printf("Please enter a new file name> "); + countup(&loopcount2, 10); + getstryng(file);} + continue; + default: + printf("There is some error in the call of openfile. Unknown mode.\n"); + exxit(-1); + } + } + countup(&loopcount, 20); + } + *fp = of; + if (perm != NULL) + strcpy(perm,file); +} /* openfile */ + + +void cleerhome() +{ /* home cursor and clear screen, if possible */ +#ifdef WIN32 + if(ibmpc || ansi){ + phyClearScreen(); + } else { + printf("\n\n"); + } +#else + printf("%s", ((ibmpc || ansi) ? 
("\033[2J\033[H") : "\n\n")); +#endif +} /* cleerhome */ + + +double randum(longer seed) +{ /* random number generator -- slow but machine independent + This is a multiplicative congruential 32-bit generator + x(t+1) = 1664525 * x(t) mod 2^32, one that passes the + Coveyou-Macpherson and Lehmer tests, see Knuth ACP vol. 2 + We here implement it representing each integer in base-64 + notation -- i.e. as an array of 6 six-bit chunks */ + long i, j, k, sum; + longer mult, newseed; + double x; + + mult[0] = 13; /* these four statements set the multiplier */ + mult[1] = 24; /* -- they are its "digits" in a base-64 */ + mult[2] = 22; /* notation: 1664525 = 13*64^3+24*64^2 */ + mult[3] = 6; /* +22*64+6 */ + for (i = 0; i <= 5; i++) + newseed[i] = 0; + for (i = 0; i <= 5; i++) { + sum = newseed[i]; + k = i; + if (i > 3) + k = 3; + for (j = 0; j <= k; j++) + sum += mult[j] * seed[i - j]; + newseed[i] = sum; + for (j = i; j <= 4; j++) { + newseed[j + 1] += newseed[j] / 64; + newseed[j] &= 63; + } + } + memcpy(seed, newseed, sizeof(longer)); + seed[5] &= 3; + x = 0.0; + for (i = 0; i <= 5; i++) + x = x / 64.0 + seed[i]; + x /= 4.0; + return x; +} /* randum */ + + +void randumize(longer seed, long *enterorder) +{ /* randomize input order of species */ + long i, j, k; + + for (i = 0; i < spp; i++) { + j = (long)(randum(seed) * (i+1)); + k = enterorder[j]; + enterorder[j] = enterorder[i]; + enterorder[i] = k; + } +} /* randumize */ + + +double normrand(longer seed) +{/* standardized Normal random variate */ + double x; + + x = randum(seed)+randum(seed)+randum(seed)+randum(seed) + + randum(seed)+randum(seed)+randum(seed)+randum(seed) + + randum(seed)+randum(seed)+randum(seed)+randum(seed)-6.0; + return(x); +} /* normrand */ + + +long readlong(const char *prompt) +{ /* read a long */ + long res, loopcount; + char string[100]; + + loopcount = 0; + do { + printf("%s",prompt); + getstryng(string); + if (sscanf(string,"%ld",&res) == 1) + break; + countup(&loopcount, 10); + } while 
(1); + return res; +} /* readlong */ + + +void uppercase(Char *ch) +{ /* convert ch to upper case */ + *ch = (islower (*ch) ? toupper(*ch) : (*ch)); +} /* uppercase */ + + +void initseed(long *inseed, long *inseed0, longer seed) +{ /* input random number seed */ + long i, loopcount; + + loopcount = 0; + do { + printf("Random number seed (must be odd)?\n"); + scanf("%ld%*[^\n]", inseed); + getchar(); + countup(&loopcount, 10); + } while (((*inseed) < 0) || ((*inseed) & 1) == 0); + *inseed0 = *inseed; + for (i = 0; i <= 5; i++) + seed[i] = 0; + i = 0; + do { + seed[i] = *inseed & 63; + *inseed /= 64; + i++; + } while (*inseed != 0); +} /*initseed*/ + + +void initjumble(long *inseed, long *inseed0, longer seed, long *njumble) +{ /* input number of jumblings for jumble option */ + long loopcount; + + initseed(inseed, inseed0, seed); + loopcount = 0; + do { + printf("Number of times to jumble?\n"); + scanf("%ld%*[^\n]", njumble); + getchar(); + countup(&loopcount, 10); + } while ((*njumble) < 1); +} /*initjumble*/ + + +void initoutgroup(long *outgrno, long spp) +{ /* input outgroup number */ + long loopcount; + boolean done; + + loopcount = 0; + do { + printf("Type number of the outgroup:\n"); + scanf("%ld%*[^\n]", outgrno); + getchar(); + done = (*outgrno >= 1 && *outgrno <= spp); + if (!done) { + printf("BAD OUTGROUP NUMBER: %ld\n", *outgrno); + printf(" Must be in range 1 - %ld\n", spp); + } + countup(&loopcount, 10); + } while (done != true); +} /*initoutgroup*/ + + +void initthreshold(double *threshold) +{ /* input threshold for threshold parsimony option */ + long loopcount; + boolean done; + + loopcount = 0; + do { + printf("What will be the threshold value?\n"); + scanf("%lf%*[^\n]", threshold); + getchar(); + done = (*threshold >= 1.0); + if (!done) + printf("BAD THRESHOLD VALUE: it must be greater than 1\n"); + else + *threshold = (long)(*threshold * 10.0 + 0.5) / 10.0; + countup(&loopcount, 10); + } while (done != true); +} /*initthreshold*/ + + +void 
initcatn(long *categs) +{ /* initialize category number for rate categories */ + long loopcount; + + loopcount = 0; + *categs = 0; + do { + printf("Number of categories (1-%d)?\n", maxcategs); + scanf("%ld%*[^\n]", categs); + getchar(); + countup(&loopcount, 10); + } while (*categs > maxcategs || *categs < 1); +} /*initcatn*/ + + +void initcategs(long categs, double *rate) +{ /* initialize category rates for HMM rates */ + long i, loopcount, scanned; + char line[100], rest[100]; + boolean done; + + loopcount = 0; + for (;;){ + printf("Rate for each category? (use a space to separate)\n"); + getstryng(line); + done = true; + for (i = 0; i < categs; i++){ + scanned = sscanf(line,"%lf %[^\n]", &rate[i],rest); + if ((scanned < 2 && i < (categs - 1)) || + (scanned < 1 && i == (categs - 1))){ + printf("Please enter exactly %ld values.\n",categs); + done = false; + break; + } + strcpy(line,rest); + } + if (done) + break; + countup(&loopcount, 100); + } +} /*initcategs*/ + + +void initprobcat(long categs, double *probsum, double *probcat) +{ /* input probabilities of rate categores for HMM rates */ + long i, loopcount, scanned; + boolean done; + char line[100], rest[100]; + + loopcount = 0; + do { + printf("Probability for each category?"); + printf(" (use a space to separate)\n"); + getstryng(line); + done = true; + for (i = 0; i < categs; i++){ + scanned = sscanf(line,"%lf %[^\n]",&probcat[i],rest); + if ((scanned < 2 && i < (categs - 1)) || + (scanned < 1 && i == (categs - 1))){ + done = false; + printf("Please enter exactly %ld values.\n",categs); + break;} + strcpy(line,rest); + } + if (!done) + continue; + *probsum = 0.0; + for (i = 0; i < categs; i++) + *probsum += probcat[i]; + if (fabs(1.0 - (*probsum)) > 0.001) { + done = false; + printf("Probabilities must add up to"); + printf(" 1.0, plus or minus 0.001.\n"); + } + countup(&loopcount, 100); + } while (!done); +} /*initprobcat*/ + + +void lgr(long m, double b, raterootarray lgroot) +{ /* For use by initgammacat. 
Get roots of m-th Generalized Laguerre + polynomial, given roots of (m-1)-th, these are to be + stored in lgroot[m][] */ + long i; + double upper, lower, x, y; + boolean dwn; /* is function declining in this interval? */ + + if (m == 1) { + lgroot[1][1] = 1.0+b; + } else { + dwn = true; + for (i=1; i<=m; i++) { + if (i < m) { + if (i == 1) + lower = 0.0; + else + lower = lgroot[m-1][i-1]; + upper = lgroot[m-1][i]; + } else { /* i == m, must search above */ + lower = lgroot[m-1][i-1]; + x = lgroot[m-1][m-1]; + do { + x = 2.0*x; + y = glaguerre(m, b,x); + } while ((dwn && (y > 0.0)) || ((!dwn) && (y < 0.0))); + upper = x; + } + while (upper-lower > 0.000000001) { + x = (upper+lower)/2.0; + if (glaguerre(m, b, x) > 0.0) { + if (dwn) + lower = x; + else + upper = x; + } else { + if (dwn) + upper = x; + else + lower = x; + } + } + lgroot[m][i] = (lower+upper)/2.0; + dwn = !dwn; /* switch for next one */ + } + } +} /* lgr */ + + +double logfac (long n) +{ /* log(n!) values were calculated with Mathematica + with a precision of 30 digits */ + long i; + double x; + + switch (n) + { + case 0: + return 0.; + case 1: + return 0.; + case 2: + return 0.693147180559945309417232121458; + case 3: + return 1.791759469228055000812477358381; + case 4: + return 3.1780538303479456196469416013; + case 5: + return 4.78749174278204599424770093452; + case 6: + return 6.5792512120101009950601782929; + case 7: + return 8.52516136106541430016553103635; + case 8: + return 10.60460290274525022841722740072; + case 9: + return 12.80182748008146961120771787457; + case 10: + return 15.10441257307551529522570932925; + case 11: + return 17.50230784587388583928765290722; + case 12: + return 19.98721449566188614951736238706; + default: + x = 19.98721449566188614951736238706; + for (i = 13; i <= n; i++) + x += log(i); + return x; + } +} + + +double glaguerre(long m, double b, double x) +{ /* Generalized Laguerre polynomial computed recursively. 
double hermite(long n, double x)
{ /* calculates hermite polynomial with degree n and parameter x,
     using the recurrence H_(i+1)(x) = 2x H_i(x) - 2i H_(i-1)(x)
     started from H_0 = 1 and H_1 = 2x.
     n is expected to be non-negative; a negative n is not meaningful
     and gives 2x, as it always did. */
  /* seems to be unprecise for n>13 -> root finder does not converge*/
  double h1 = 1.;        /* H_(i-1), starts as H_0 */
  double h2 = 2. * x;    /* H_i, starts as H_1 */
  double xx = 2. * x;    /* result; already H_1 if the loop does not run */
  long i;

  /* degree 0 is the constant 1; without this test the H_1 value 2x
     would be returned for it */
  if (n == 0)
    return h1;
  for (i = 1; i < n; i++) {
    xx = 2. * x * h2 - 2. * (i) * h1;
    h1 = h2;
    h2 = xx;
  }
  return xx;
} /* hermite */
void hermite_weight(long n, double * hroot, double * weights)
{
  /* fill weights[0..n-1] with the quadrature weights that belong to the
     n roots in hroot[0..n-1], using the formula from Abramowitz and
     Stegun chapter 25.4.46 p.890.  Every weight is one common factor
     divided by the square of hermite(n-1, root). */
  const double common = exp(0.6931471805599 * ( n-1.) + logfac(n)) / (n*n);
  long k = 0;

  while (k < n) {
    const double lower = hermite(n-1, hroot[k]);

    weights[k] = common / (lower*lower);
    k++;
  }
} /* hermite_weight */
<= 1.0); + *lambda = 1.0 / *lambda; +} /*initlambda*/ + + +void initfreqs(double *freqa, double *freqc, double *freqg, double *freqt) +{ /* input frequencies of the four bases */ + char input[100]; + long scanned, loopcount; + + printf("Base frequencies for A, C, G, T/U (use blanks to separate)?\n"); + loopcount = 0; + do { + getstryng(input); + scanned = sscanf(input,"%lf%lf%lf%lf%*[^\n]", freqa, freqc, freqg, freqt); + if (scanned == 4) + break; + else + printf("Please enter exactly 4 values.\n"); + countup(&loopcount, 100); + } while (1); +} /* initfreqs */ + + +void initratio(double *ttratio) +{ /* input transition/transversion ratio */ + long loopcount; + + loopcount = 0; + do { + printf("Transition/transversion ratio?\n"); + scanf("%lf%*[^\n]", ttratio); + getchar(); + countup(&loopcount, 10); + } while (*ttratio < 0.0); +} /* initratio */ + + +void initpower(double *power) +{ + printf("New power?\n"); + scanf("%lf%*[^\n]", power); + getchar(); +} /*initpower*/ + + +void initdatasets(long *datasets) +{ + /* handle multi-data set option */ + long loopcount; + boolean done; + + loopcount = 0; + do { + printf("How many data sets?\n"); + scanf("%ld%*[^\n]", datasets); + getchar(); + done = (*datasets > 1); + if (!done) + printf("Bad data sets number: it must be greater than 1\n"); + countup(&loopcount, 10); + } while (!done); +} /* initdatasets */ + + +void justweights(long *datasets) +{ + /* handle multi-data set option by weights */ + long loopcount; + boolean done; + + loopcount = 0; + do { + printf("How many sets of weights?\n"); + scanf("%ld%*[^\n]", datasets); + getchar(); + done = (*datasets >= 1); + if (!done) + printf("BAD NUMBER: it must be greater than 1\n"); + countup(&loopcount, 10); + } while (!done); +} /* justweights */ + + +void initterminal(boolean *ibmpc, boolean *ansi) +{ + /* handle terminal option */ + if (*ibmpc) { + *ibmpc = false; + *ansi = true; + } else if (*ansi) + *ansi = false; + else + *ibmpc = true; +} /*initterminal*/ + + +void 
void newline(FILE *filename, long i, long j, long k)
{
  /* start a new line in filename and indent it by k spaces, but only
     when i is greater than 1 and i - 1 is a multiple of j; otherwise
     write nothing.  j must not be zero. */
  long pad;

  if ((i - 1) % j == 0 && i > 1) {
    putc('\n', filename);
    for (pad = k; pad > 0; pad--)
      putc(' ', filename);
  }
} /* newline */
} + fprintf(outfile, "\n%4ld Populations\n", *spp); + *nonodes = *spp * 2 - n; +} /* inputnumbers2 */ + + +void inputnumbers3(long *spp, long *chars) +{ + /* input the numbers of species and of characters */ + + if (fscanf(infile, "%ld%ld", spp, chars) != 2 || *spp <= 0 || *chars <= 0) { + printf( + "ERROR: Unable to read the number of species or characters in data set\n"); + printf( + "The input file is incorrect (perhaps it was not saved text only).\n"); + exxit(-1); + } +} /* inputnumbers3 */ + + +void samenumsp(long *chars, long ith) +{ + /* check if spp is same as the first set in other data sets */ + long cursp, curchs; + + if (eoln(infile)) + scan_eoln(infile); + fscanf(infile, "%ld%ld", &cursp, &curchs); + if (cursp != spp) { + printf( + "\n\nERROR: Inconsistent number of species in data set %ld\n\n", ith); + exxit(-1); + } + *chars = curchs; +} /* samenumsp */ + + +void samenumsp2(long ith) +{ + /* check if spp is same as the first set in other data sets */ + long cursp; + + if (eoln(infile)) + scan_eoln(infile); + if (fscanf(infile, "%ld", &cursp) != 1) { + printf("\n\nERROR: Unable to read number of species in data set %ld\n", + ith); + printf( + "The input file is incorrect (perhaps it was not saved text only).\n"); + exxit(-1); + } + if (cursp != spp) { + printf( + "\n\nERROR: Inconsistent number of species in data set %ld\n\n", ith); + exxit(-1); + } +} /* samenumsp2 */ + + +void readoptions(long *extranum, const char *options) +{ /* read option characters from input file */ + Char ch; + + while (!(eoln(infile))) { + ch = gettc(infile); + uppercase(&ch); + if (strchr(options, ch) != NULL) + (* extranum)++; + else if (!(ch == ' ' || ch == '\t')) { + printf("BAD OPTION CHARACTER: %c\n", ch); + exxit(-1); + } + } + scan_eoln(infile); +} /* readoptions */ + + +void matchoptions(Char *ch, const char *options) +{ /* match option characters to those in auxiliary options line */ + + *ch = gettc(infile); + uppercase(ch); + if (strchr(options, *ch) == NULL) { + 
printf("ERROR: Incorrect auxiliary options line"); + printf(" which starts with %c\n", *ch); + exxit(-1); + } +} /* matchoptions */ + + +void inputweightsold(long chars, steptr weight, boolean *weights) +{ + Char ch; + int i; + + for (i = 1; i < nmlngth ; i++) + getc(infile); + + for (i = 0; i < chars; i++) { + do { + if (eoln(infile)) + scan_eoln(infile); + ch = gettc(infile); + if (ch == '\n') + ch = ' '; + } while (ch == ' '); + weight[i] = 1; + if (isdigit(ch)) + weight[i] = ch - '0'; + else if (isalpha(ch)) { + uppercase(&ch); + weight[i] = ch - 'A' + 10; + } else { + printf("\n\nERROR: Bad weight character: %c\n\n", ch); + exxit(-1); + } + } + scan_eoln(infile); + *weights = true; +} /*inputweightsold*/ + + +void inputweights(long chars, steptr weight, boolean *weights) +{ + /* input the character weights, 0-9 and A-Z for weights 0 - 35 */ + Char ch; + long i; + + for (i = 0; i < chars; i++) { + do { + if (eoln(weightfile)) + scan_eoln(weightfile); + ch = gettc(weightfile); + if (ch == '\n') + ch = ' '; + } while (ch == ' '); + weight[i] = 1; + if (isdigit(ch)) + weight[i] = ch - '0'; + else if (isalpha(ch)) { + uppercase(&ch); + weight[i] = ch - 'A' + 10; + } else { + printf("\n\nERROR: Bad weight character: %c\n\n", ch); + exxit(-1); + } + } + scan_eoln(weightfile); + *weights = true; +} /* inputweights */ + + +void inputweights2(long a, long b, long *weightsum, + steptr weight, boolean *weights, const char *prog) +{ + /* input the character weights, 0 or 1 */ + Char ch; + long i; + + *weightsum = 0; + for (i = a; i < b; i++) { + do { + if (eoln(weightfile)) + scan_eoln(weightfile); + ch = gettc(weightfile); + } while (ch == ' '); + weight[i] = 1; + if (ch == '0' || ch == '1') + weight[i] = ch - '0'; + else { + printf("\n\nERROR: Bad weight character: %c -- ", ch); + printf("weights in %s must be 0 or 1\n", prog); + exxit(-1); + } + *weightsum += weight[i]; + } + *weights = true; + scan_eoln(weightfile); +} /* inputweights2 */ + + +void printweights(FILE 
*filename, long inc, long chars, + steptr weight, const char *letters) +{ + /* print out the weights of sites */ + long i, j; + boolean letterweights; + + letterweights = false; + for (i = 0; i < chars; i++) + if (weight[i] > 9) + letterweights = true; + fprintf(filename, "\n %s are weighted as follows:",letters); + if (letterweights) + fprintf(filename, " (A = 10, B = 11, etc.)\n"); + else + putc('\n', filename); + for (i = 0; i < chars; i++) { + if (i % 60 == 0) { + putc('\n', filename); + for (j = 1; j <= nmlngth + 3; j++) + putc(' ', filename); + } + if (weight[i+inc] < 10) + fprintf(filename, "%ld", weight[i + inc]); + else + fprintf(filename, "%c", 'A'-10+(int)weight[i + inc]); + if ((i+1) % 5 == 0 && (i+1) % 60 != 0) + putc(' ', filename); + } + fprintf(filename, "\n\n"); +} /* printweights */ + + +void inputcategs(long a, long b, steptr category, long categs,const char *prog) +{ + /* input the categories, 1-9 */ + Char ch; + long i; + + for (i = a; i < b; i++) { + do { + if (eoln(catfile)) + scan_eoln(catfile); + ch = gettc(catfile); + } while (ch == ' '); + if ((ch >= '1') && (ch <= ('0'+categs))) + category[i] = ch - '0'; + else { + printf("\n\nERROR: Bad category character: %c", ch); + printf(" -- categories in %s are currently 1-%ld\n", prog, categs); + exxit(-1); + } + } + scan_eoln(catfile); +} /* inputcategs */ + + +void printcategs(FILE *filename, long chars, steptr category, + const char *letters) +{ + /* print out the sitewise categories */ + long i, j; + + fprintf(filename, "\n %s are:\n",letters); + for (i = 0; i < chars; i++) { + if (i % 60 == 0) { + putc('\n', filename); + for (j = 1; j <= nmlngth + 3; j++) + putc(' ', filename); + } + fprintf(filename, "%ld", category[i]); + if ((i+1) % 10 == 0 && (i+1) % 60 != 0) + putc(' ', filename); + } + fprintf(filename, "\n\n"); +} /* printcategs */ + + +void inputfactors(long chars, Char *factor, boolean *factors) +{ + /* reads the factor symbols */ + long i; + + for (i = 0; i < (chars); i++) { + if 
(eoln(factfile)) + scan_eoln(factfile); + factor[i] = gettc(factfile); + if (factor[i] == '\n') + factor[i] = ' '; + } + scan_eoln(factfile); + *factors = true; +} /* inputfactors */ + + +void printfactors(FILE *filename, long chars, Char *factor, const char *letters) +{ + /* print out list of factor symbols */ + long i; + + fprintf(filename, "Factors%s:\n\n", letters); + for (i = 1; i <= nmlngth - 5; i++) + putc(' ', filename); + for (i = 1; i <= (chars); i++) { + newline(filename, i, 55, nmlngth + 3); + putc(factor[i - 1], filename); + if (i % 5 == 0) + putc(' ', filename); + } + putc('\n', filename); +} /* printfactors */ + + +void headings(long chars, const char *letters1, const char *letters2) +{ + long i, j; + + putc('\n', outfile); + j = nmlngth + (chars + (chars - 1) / 10) / 2 - 5; + if (j < nmlngth - 1) + j = nmlngth - 1; + if (j > 37) + j = 37; + fprintf(outfile, "Name"); + for (i = 1; i <= j; i++) + putc(' ', outfile); + fprintf(outfile, "%s\n", letters1); + fprintf(outfile, "----"); + for (i = 1; i <= j; i++) + putc(' ', outfile); + fprintf(outfile, "%s\n\n", letters2); +} /* headings */ + + +void initname(long i) +{ + /* read in species name */ + long j; + + for (j = 0; j < nmlngth; j++) { + if (eoff(infile) | eoln(infile)){ + printf("\n\nERROR: end-of-line or end-of-file"); + printf(" in the middle of species name for species %ld\n\n", i+1); + exxit(-1); + } + nayme[i][j] = gettc(infile); + if ((nayme[i][j] == '(') || (nayme[i][j] == ')') || (nayme[i][j] == ':') + || (nayme[i][j] == ',') || (nayme[i][j] == ';') || (nayme[i][j] == '[') + || (nayme[i][j] == ']')) { + printf("\nERROR: Species name may not contain characters ( ) : ; , [ ] \n"); + printf(" In name of species number %ld there is character %c\n\n", + i+1, nayme[i][j]); + exxit(-1); + } + } +} /* initname */ + + +void findtree(boolean *found,long *pos,long nextree,long *place,bestelm *bestrees) +{ + /* finds tree given by array place in array bestrees by binary search */ + /* used by dnacomp, 
dnapars, dollop, mix, & protpars */ + long i, lower, upper; + boolean below, done; + + below = false; + lower = 1; + upper = nextree - 1; + (*found) = false; + while (!(*found) && lower <= upper) { + (*pos) = (lower + upper) / 2; + i = 3; + done = false; + while (!done) { + done = (i > spp); + if (!done) + done = (place[i - 1] != bestrees[(*pos) - 1].btree[i - 1]); + if (!done) + i++; + } + (*found) = (i > spp); + if (*found) + break; + below = (place[i - 1] < bestrees[(*pos )- 1].btree[i - 1]); + if (below) + upper = (*pos) - 1; + else + lower = (*pos) + 1; + } + if (!(*found) && !below) + (*pos)++; +} /* findtree */ + + +void addtree(long pos,long *nextree,boolean collapse,long *place,bestelm *bestrees) +{ + /* puts tree from array place in its proper position in array bestrees */ + /* used by dnacomp, dnapars, dollop, mix, & protpars */ + long i; + + for (i = *nextree - 1; i >= pos; i--){ + memcpy(bestrees[i].btree, bestrees[i - 1].btree, spp * sizeof(long)); + bestrees[i].gloreange = bestrees[i - 1].gloreange; + bestrees[i - 1].gloreange = false; + bestrees[i].locreange = bestrees[i - 1].locreange; + bestrees[i - 1].locreange = false; + bestrees[i].collapse = bestrees[i - 1].collapse; + } + for (i = 0; i < spp; i++) + bestrees[pos - 1].btree[i] = place[i]; + bestrees[pos - 1].collapse = collapse; + (*nextree)++; +} /* addtree */ + + +long findunrearranged(bestelm *bestrees, long nextree, boolean glob) +{ + /* finds bestree with either global or local field false */ + long i; + + if (glob) { + for (i = 0; i < nextree - 1; i++) + if (!bestrees[i].gloreange) + return i; + } else { + for (i = 0; i < nextree - 1; i++) + if (!bestrees[i].locreange) + return i; + } + return -1; +} /* findunrearranged */ + + +boolean torearrange(bestelm *bestrees, long nextree) +{ /* sees if any best tree is yet to be rearranged */ + + if (findunrearranged(bestrees, nextree, true) >= 0) + return true; + else if (findunrearranged(bestrees, nextree, false) >= 0) + return true; + else + 
return false; +} /* torearrange */ + + +void reducebestrees(bestelm *bestrees, long *nextree) +{ + /* finds best trees with collapsible branches and deletes them */ + long i, j; + + i = 0; + j = *nextree - 2; + do { + while (!bestrees[i].collapse && i < *nextree - 1) i++; + while (bestrees[j].collapse && j >= 0) j--; + if (i < j) { + memcpy(bestrees[i].btree, bestrees[j].btree, spp * sizeof(long)); + bestrees[i].gloreange = bestrees[j].gloreange; + bestrees[i].locreange = bestrees[j].locreange; + bestrees[i].collapse = false; + bestrees[j].collapse = true; + } + } while (i < j); + *nextree = i + 1; +} /* reducebestrees */ + + +void shellsort(double *a, long *b, long n) +{ /* Shell sort keeping a, b in same order */ + /* used by dnapenny, dolpenny, & penny */ + long gap, i, j, itemp; + double rtemp; + + gap = n / 2; + while (gap > 0) { + for (i = gap + 1; i <= n; i++) { + j = i - gap; + while (j > 0) { + if (a[j - 1] > a[j + gap - 1]) { + rtemp = a[j - 1]; + a[j - 1] = a[j + gap - 1]; + a[j + gap - 1] = rtemp; + itemp = b[j - 1]; + b[j - 1] = b[j + gap - 1]; + b[j + gap - 1] = itemp; + } + j -= gap; + } + } + gap /= 2; + } +} /* shellsort */ + + +void getch(Char *c, long *parens, FILE *treefile) +{ /* get next nonblank character */ + + do { + if (eoln(treefile)) + scan_eoln(treefile); + (*c) = gettc(treefile); + + if ((*c) == '\n' || (*c) == '\t') + (*c) = ' '; + } while ( *c == ' ' && !eoff(treefile) ); + if ((*c) == '(') + (*parens)++; + if ((*c) == ')') + (*parens)--; +} /* getch */ + + +void getch2(Char *c, long *parens) +{ /* get next nonblank character */ + do { + if (eoln(intree)) + scan_eoln(intree); + *c = gettc(intree); + if (*c == '\n' || *c == '\t') + *c = ' '; + } while (!(*c != ' ' || eoff(intree))); + if (*c == '(') + (*parens)++; + if (*c == ')') + (*parens)--; +} /* getch2 */ + + +void findch(Char c, Char *ch, long which) +{ /* scan forward until find character c */ + boolean done; + long dummy_parens; + done = false; + while (!done) { + if (c == 
',') { + if (*ch == '(' || *ch == ')' || *ch == ';') { + printf( + "\n\nERROR in user tree %ld: unmatched parenthesis or missing comma\n\n", + which); + exxit(-1); + } else if (*ch == ',') + done = true; + } else if (c == ')') { + if (*ch == '(' || *ch == ',' || *ch == ';') { + printf("\n\nERROR in user tree %ld: ", which); + printf("unmatched parenthesis or non-bifurcated node\n\n"); + exxit(-1); + } else { + if (*ch == ')') + done = true; + } + } else if (c == ';') { + if (*ch != ';') { + printf("\n\nERROR in user tree %ld: ", which); + printf("unmatched parenthesis or missing semicolon\n\n"); + exxit(-1); + } else + done = true; + } + if (*ch != ')' && done) + continue; + getch(ch, &dummy_parens, intree); + } +} /* findch */ + + +void findch2(Char c, long *lparens, long *rparens, Char *ch) +{ /* skip forward in user tree until find character c */ + boolean done; + long dummy_parens; + done = false; + while (!done) { + if (c == ',') { + if (*ch == '(' || *ch == ')' || *ch == ':' || *ch == ';') { + printf("\n\nERROR in user tree: "); + printf("unmatched parenthesis, missing comma"); + printf(" or non-trifurcated base\n\n"); + exxit(-1); + } else if (*ch == ',') + done = true; + } else if (c == ')') { + if (*ch == '(' || *ch == ',' || *ch == ':' || *ch == ';') { + printf( + "\n\nERROR in user tree: unmatched parenthesis or non-bifurcated node\n\n"); + exxit(-1); + } else if (*ch == ')') { + (*rparens)++; + if ((*lparens) > 0 && (*lparens) == (*rparens)) { + if ((*lparens) == spp - 2) { + getch(ch, &dummy_parens, intree); + if (*ch != ';') { + printf( "\n\nERROR in user tree: "); + printf("unmatched parenthesis or missing semicolon\n\n"); + exxit(-1); + } + } + } + done = true; + } + } + if (*ch != ')' && done) + continue; + if (*ch == ')') + getch(ch, &dummy_parens, intree); + } +} /* findch2 */ + + +void processlength(double *valyew, double *divisor, Char *ch, + boolean *minusread, FILE *treefile, long *parens) +{ /* read a branch length from a treefile */ + long 
digit, ordzero; + boolean pointread; + + ordzero = '0'; + *minusread = false; + pointread = false; + *valyew = 0.0; + *divisor = 1.0; + getch(ch, parens, treefile); + digit = (long)(*ch - ordzero); + while ( ((digit <= 9) && (digit >= 0)) || *ch == '.' || *ch == '-') { + if (*ch == '.' ) + pointread = true; + else if (*ch == '-' ) + *minusread = true; + else { + *valyew = *valyew * 10.0 + digit; + if (pointread) + *divisor *= 10.0; + } + getch(ch, parens, treefile); + digit = (long)(*ch - ordzero); + } + if (*minusread) + *valyew = -(*valyew); +} /* processlength */ + + +void writename(long start, long n, long *enterorder) +{ /* write species name and number in entry order */ + long i, j; + + for (i = start; i < start+n; i++) { + printf(" %3ld. ", i+1); + for (j = 0; j < nmlngth; j++) + putchar(nayme[enterorder[i] - 1][j]); + putchar('\n'); + fflush(stdout); + } +} /* writename */ + + +void memerror() +{ + printf("Error allocating memory\n"); + exxit(-1); +} /* memerror */ + + +void odd_malloc(long x) +{ /* error message if attempt to malloc too little or too much memory */ + printf ("ERROR: a function asked for an inappropriate amount of memory:"); + printf (" %ld bytes\n", x); + printf (" This can mean one of two things:\n"); + printf (" 1. The input file is incorrect"); + printf (" (perhaps it was not saved as Text Only),\n"); + printf (" 2. 
There is a bug in the program.\n"); + printf (" Please check your input file carefully.\n"); + printf (" If it seems to be a bug, please mail joe@gs.washington.edu\n"); + printf (" with the name of the program, your computer system type,\n"); + printf (" a full description of the problem, and with the input data file.\n"); + printf (" (which should be in the body of the message, not as an Attachment).\n"); + + /* abort() can be used to crash */ + + exxit(-1); +} + + +MALLOCRETURN *mymalloc(long x) +{ /* wrapper for malloc, allowing error message if too little, too much */ + MALLOCRETURN *new_block; + + if ((x <= 0) || + (x > TOO_MUCH_MEMORY)) + odd_malloc(x); + + new_block = (MALLOCRETURN *)calloc(1,x); + + if (!new_block) { + memerror(); + return (MALLOCRETURN *) new_block; + } else + return (MALLOCRETURN *) new_block; +} /* mymalloc */ + + +void gnu(node **grbg, node **p) +{ /* this and the following are do-it-yourself garbage collectors. + Make a new node or pull one off the garbage list */ + + if (*grbg != NULL) { + *p = *grbg; + *grbg = (*grbg)->next; + } else + *p = (node *)Malloc(sizeof(node)); + + (*p)->back = NULL; + (*p)->next = NULL; + (*p)->tip = false; + (*p)->times_in_tree = 0.0; + (*p)->r = 0.0; + (*p)->theta = 0.0; + (*p)->x = NULL; + (*p)->protx = NULL; /* for the sake of proml */ +} /* gnu */ + + +void chuck(node **grbg, node *p) +{ /* collect garbage on p -- put it on front of garbage list */ + p->back = NULL; + p->next = *grbg; + *grbg = p; +} /* chuck */ + + +void zeronumnuc(node *p, long endsite) +{ + long i,j; + + for (i = 0; i < endsite; i++) + for (j = (long)A; j <= (long)O; j++) + p->numnuc[i][j] = 0; +} /* zeronumnuc */ + + +void zerodiscnumnuc(node *p, long endsite) +{ + long i,j; + + for (i = 0; i < endsite; i++) + for (j = (long)zero; j <= (long)seven; j++) + p->discnumnuc[i][j] = 0; +} /* zerodiscnumnuc */ + + +void allocnontip(node *p, long *zeros, long endsite) +{ /* allocate an interior node */ + /* used by dnacomp, dnapars, & 
dnapenny */ + + p->numsteps = (steptr)Malloc(endsite*sizeof(long)); + p->oldnumsteps = (steptr)Malloc(endsite*sizeof(long)); + p->base = (baseptr)Malloc(endsite*sizeof(long)); + p->oldbase = (baseptr)Malloc(endsite*sizeof(long)); + p->numnuc = (nucarray *)Malloc(endsite*sizeof(nucarray)); + memcpy(p->base, zeros, endsite*sizeof(long)); + memcpy(p->numsteps, zeros, endsite*sizeof(long)); + memcpy(p->oldbase, zeros, endsite*sizeof(long)); + memcpy(p->oldnumsteps, zeros, endsite*sizeof(long)); + zeronumnuc(p, endsite); +} /* allocnontip */ + + +void allocdiscnontip(node *p, long *zeros, unsigned char *zeros2, long endsite) +{ /* allocate an interior node */ + /* used by pars */ + + p->numsteps = (steptr)Malloc(endsite*sizeof(long)); + p->oldnumsteps = (steptr)Malloc(endsite*sizeof(long)); + p->discbase = (discbaseptr)Malloc(endsite*sizeof(unsigned char)); + p->olddiscbase = (discbaseptr)Malloc(endsite*sizeof(unsigned char)); + p->discnumnuc = (discnucarray *)Malloc(endsite*sizeof(discnucarray)); + memcpy(p->discbase, zeros2, endsite*sizeof(unsigned char)); + memcpy(p->numsteps, zeros, endsite*sizeof(long)); + memcpy(p->olddiscbase, zeros2, endsite*sizeof(unsigned char)); + memcpy(p->oldnumsteps, zeros, endsite*sizeof(long)); + zerodiscnumnuc(p, endsite); +} /* allocdiscnontip */ + + +void allocnode(node **anode, long *zeros, long endsite) +{ /* allocate a node */ + /* used by dnacomp, dnapars, & dnapenny */ + + *anode = (node *)Malloc(sizeof(node)); + allocnontip(*anode, zeros, endsite); +} /* allocnode */ + + +void allocdiscnode(node **anode, long *zeros, unsigned char *zeros2, + long endsite) +{ /* allocate a node */ + /* used by pars */ + + *anode = (node *)Malloc(sizeof(node)); + allocdiscnontip(*anode, zeros, zeros2, endsite); +} /* allocdiscnontip */ + + +void gnutreenode(node **grbg, node **p, long i, long endsite, long *zeros) +{ /* this and the following are do-it-yourself garbage collectors. 
+ Make a new node or pull one off the garbage list */ + + if (*grbg != NULL) { + *p = *grbg; + *grbg = (*grbg)->next; + memcpy((*p)->numsteps, zeros, endsite*sizeof(long)); + memcpy((*p)->oldnumsteps, zeros, endsite*sizeof(long)); + memcpy((*p)->base, zeros, endsite*sizeof(long)); + memcpy((*p)->oldbase, zeros, endsite*sizeof(long)); + zeronumnuc(*p, endsite); + } else + allocnode(p, zeros, endsite); + (*p)->back = NULL; + (*p)->next = NULL; + (*p)->tip = false; + (*p)->visited = false; + (*p)->index = i; + (*p)->numdesc = 0; + (*p)->sumsteps = 0.0; +} /* gnutreenode */ + + +void gnudisctreenode(node **grbg, node **p, long i, + long endsite, long *zeros, unsigned char *zeros2) +{ /* this and the following are do-it-yourself garbage collectors. + Make a new node or pull one off the garbage list */ + + if (*grbg != NULL) { + *p = *grbg; + *grbg = (*grbg)->next; + memcpy((*p)->numsteps, zeros, endsite*sizeof(long)); + memcpy((*p)->oldnumsteps, zeros, endsite*sizeof(long)); + memcpy((*p)->discbase, zeros2, endsite*sizeof(unsigned char)); + memcpy((*p)->olddiscbase, zeros2, endsite*sizeof(unsigned char)); + zerodiscnumnuc(*p, endsite); + } else + allocdiscnode(p, zeros, zeros2, endsite); + (*p)->back = NULL; + (*p)->next = NULL; + (*p)->tip = false; + (*p)->visited = false; + (*p)->index = i; + (*p)->numdesc = 0; + (*p)->sumsteps = 0.0; +} /* gnudisctreenode */ + + +void chucktreenode(node **grbg, node *p) +{ /* collect garbage on p -- put it on front of garbage list */ + + p->back = NULL; + p->next = *grbg; + *grbg = p; +} /* chucktreenode */ + + +void setupnode(node *p, long i) +{ /* initialization of node pointers, variables */ + + p->next = NULL; + p->back = NULL; + p->times_in_tree = (double) i * 1.0; + p->index = i; + p->tip = false; +} /* setupnode */ + + +long count_sibs (node *p) +{ /* Count the number of nodes in a ring, return the total number of */ + /* nodes excluding the one passed into the function (siblings) */ + node *q; + long return_int = 0; + + if 
(p->tip) { + printf ("Error: the function count_sibs called on a tip. This is a bug.\n"); + exxit (-1); + } + + q = p->next; + while (q != p) { + if (q == NULL) { + printf ("Error: a loop of nodes was not closed.\n"); + exxit (-1); + } else { + return_int++; + q = q->next; + } + } + + return return_int; +} /* count_sibs */ + + +void inittrav (node *p) +{ /* traverse to set pointers uninitialized on inserting */ + long i, num_sibs; + node *sib_ptr; + + if (p == NULL) + return; + if (p->tip) + return; + num_sibs = count_sibs (p); + sib_ptr = p; + for (i=0; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + sib_ptr->initialized = false; + inittrav(sib_ptr->back); + } +} /* inittrav */ + + +void commentskipper(FILE ***intree, long *bracket) +{ /* skip over comment bracket contents in reading tree */ + char c; + + c = gettc(**intree); + + while (c != ']') { + + if(feof(**intree)) { + printf("\n\nERROR: Unmatched comment brackets\n\n"); + exxit(-1); + } + + if(c == '[') { + (*bracket)++; + commentskipper(intree, bracket); + } + c = gettc(**intree); + } + (*bracket)--; +} /* commentskipper */ + + +long countcomma(FILE **treefile, long *comma) +{ + /* Modified by Dan F. 11/10/96 */ + + /* The next line inserted so this function leaves the file pointing + to where it found it, not just re-winding it. */ + long orig_position = ftell (*treefile); + + Char c; + long lparen = 0; + long bracket = 0; + (*comma) = 0; + + + for (;;){ + c = getc(*treefile); + if (feof(*treefile)) + break; + if (c == ';') + break; + if (c == ',') + (*comma)++; + if (c == '(') + lparen++; + if (c == '[') { + bracket++; + commentskipper(&treefile, &bracket); + } + } + + /* Don't just rewind, */ + /* rewind (*treefile); */ + /* Re-set to where it pointed when the function was called */ + + fseek (*treefile, orig_position, SEEK_SET); + + return lparen + (*comma); +} /*countcomma*/ +/* countcomma rewritten so it passes back both lparen+comma to allocate nodep + and a pointer to the comma variable. 
This allows the tree to know how many + species exist, and the tips to be placed in the front of the nodep array */ + + +long countsemic(FILE **treefile) +{ /* Used to determine the number of user trees. Return + either a: the number of semicolons in the file outside comments + or b: the first integer in the file */ + Char c; + long return_val, semic = 0; + long bracket = 0; + + /* Eat all whitespace */ + c = gettc(*treefile); + while ((c == ' ') || + (c == '\t') || + (c == '\n')) { + c = gettc(*treefile); + } + + /* Then figure out if the first non-white character is a digit; if + so, return it */ + if (isdigit (c)) { + ungetc(c, *treefile); + fscanf((*treefile), "%ld", &return_val); + } else { + + /* Loop past all characters, count the number of semicolons + outside of comments */ + for (;;){ + c = fgetc(*treefile); + if (feof(*treefile)) + break; + if (c == ';') + semic++; + if (c == '[') { + bracket++; + commentskipper(&treefile, &bracket); + } + } + return_val = semic; + } + + rewind (*treefile); + return return_val; +} /* countsemic */ + + +void hookup(node *p, node *q) +{ /* hook together two nodes */ + + p->back = q; + q->back = p; +} /* hookup */ + + +void link_trees(long local_nextnum, long nodenum, long local_nodenum, + pointarray nodep) +{ + if(local_nextnum == 0) + hookup(nodep[nodenum],nodep[local_nodenum]); + else if(local_nextnum == 1) + hookup(nodep[nodenum], nodep[local_nodenum]->next); + else if(local_nextnum == 2) + hookup(nodep[nodenum],nodep[local_nodenum]->next->next); + else + printf("Error in Link_Trees()"); +} /* link_trees() */ + + +void allocate_nodep(pointarray *nodep, FILE **treefile, long *precalc_tips) +{ /* pre-compute space and allocate memory for nodep */ + + long numnodes; /* returns number commas & ( */ + long numcom = 0; /* returns number commas */ + + numnodes = countcomma(treefile, &numcom) + 1; + *nodep = (pointarray)Malloc(2*numnodes*sizeof(node *)); + + (*precalc_tips) = numcom + 1; /* this will be used in placing the + 
tip nodes in the front region of + nodep. Used for species check? */ +} /* allocate_nodep -plc */ + + +void malloc_pheno (node *p, long endsite, long rcategs) +{ /* Allocate the phenotype arrays; used by dnaml */ + long i; + + p->x = (phenotype)Malloc(endsite*sizeof(ratelike)); + p->underflows = Malloc(endsite * sizeof(double)); + for (i = 0; i < endsite; i++) + p->x[i] = (ratelike)Malloc(rcategs*sizeof(sitelike)); +} /* malloc_pheno */ + + +void malloc_ppheno (node *p,long endsite, long rcategs) +{ + /* Allocate the phenotype arrays; used by proml */ + long i; + + p->protx = (pphenotype)Malloc(endsite*sizeof(pratelike)); + p->underflows = Malloc(endsite*sizeof(double)); + + for (i = 0; i < endsite; i++) + p->protx[i] = (pratelike)Malloc(rcategs*sizeof(psitelike)); +} /* malloc_ppheno */ + + +long take_name_from_tree (Char *ch, Char *str, FILE *treefile) +{ + /* This loop takes in the name from the tree. + Return the length of the name string. */ + + long name_length = 0; + + do { + if ((*ch) == '_') + (*ch) = ' '; + str[name_length++] = (*ch); + if (eoln(treefile)) + scan_eoln(treefile); + (*ch) = gettc(treefile); + if (*ch == '\n') + *ch = ' '; + } while ((*ch) != ':' && (*ch) != ',' && (*ch) != ')' && + (*ch) != '[' && (*ch) != ';' && name_length <= MAXNCH); + return name_length; +} /* take_name_from_tree */ + + +void match_names_to_data (Char *str, pointarray treenode, node **p, long spp) +{ + /* This loop matches names taken from treefile to indexed names in + the data file */ + + boolean found; + long i, n; + + n = 1; + do { + found = true; + for (i = 0; i < nmlngth; i++) { + found = (found && ((str[i] == nayme[n - 1][i]) || + (((nayme[n - 1][i] == '_') && (str[i] == ' ')) || + ((nayme[n - 1][i] == ' ') && (str[i] == '\0'))))); + } + + if (found) + *p = treenode[n - 1]; + else + n++; + + } while (!(n > spp || found)); + + if (n > spp) { + printf("\n\nERROR: Cannot find species: "); + for (i = 0; (str[i] != '\0') && (i < MAXNCH); i++) + putchar(str[i]); + 
printf(" in data file\n\n"); + exxit(-1); + } +} /* match_names_to_data */ + + +void addelement(node **p, node *q, Char *ch, long *parens, FILE *treefile, + pointarray treenode, boolean *goteof, boolean *first, pointarray nodep, + long *nextnode, long *ntips, boolean *haslengths, node **grbg, + initptr initnode,boolean unifok, long maxnodes) +{ + /* Recursive procedure adds nodes to user-defined tree + This is the main (new) tree-reading procedure */ + + node *pfirst; + long i, len = 0, nodei = 0; + boolean notlast; + Char str[MAXNCH]; + node *r; + long furs = 0; + + if ((*ch) == '(') { + (*nextnode)++; /* get ready to use new interior node */ + nodei = *nextnode; /* do what needs to be done at bottom */ + if ( maxnodes != -1 && nodei > maxnodes) { + printf("ERROR in input tree file: Attempting to allocate too\n"); + printf("many nodes. This is usually caused by a unifurcation."); + printf("To use this tree with this program use Retree to read\n"); + printf("and write this tree.\n"); + exxit(-1); + } + (*initnode)(p, grbg, q, len, nodei, ntips, + parens, bottom, treenode, nodep, str, ch, treefile); + pfirst = (*p); + notlast = true; + while (notlast) { /* loop through immediate descendants */ + furs++; + (*initnode)(&(*p)->next, grbg, q, + len, nodei, ntips, parens, nonbottom, treenode, + nodep, str, ch, treefile); + /* ... 
doing what is done before each */ + r = (*p)->next; + getch(ch, parens, treefile); /* look for next character */ + + /* handle blank names */ + if((*ch) == ',' || (*ch) == ':'){ + ungetc((*ch), treefile); + *ch = 0; + } else if((*ch)==')'){ + ungetc((*ch), treefile); + (*parens)++; + *ch = 0; + } + + addelement(&(*p)->next->back, (*p)->next, ch, parens, treefile, + treenode, goteof, first, nodep, nextnode, ntips, + haslengths, grbg, initnode,unifok,maxnodes); + + (*initnode)(&r, grbg, q, len, nodei, ntips, + parens, hslength, treenode, nodep, str, ch, treefile); + /* do what is done after each about length */ + pfirst->numdesc++; /* increment number of descendants */ + *p = r; /* make r point back to p */ + + if ((*ch) == ')') { + notlast = false; + do { + getch(ch, parens, treefile); + } while ((*ch) != ',' && (*ch) != ')' && + (*ch) != '[' && (*ch) != ';' && (*ch) != ':'); + } + } + if ( furs <= 1 && !unifok ) { + printf("ERROR in input tree file: A Unifurcation was detetected.\n"); + printf("To use this tree with this program use retree to read and"); + printf(" write this tree\n"); + exxit(-1); + } + + (*p)->next = pfirst; + (*p) = pfirst; + + } else if ((*ch) != ')') { /* if it's a species name */ + for (i = 0; i < MAXNCH; i++) /* fill string with nulls */ + str[i] = '\0'; + + len = take_name_from_tree (ch, str, treefile); /* get the name */ + + if ((*ch) == ')') + (*parens)--; /* decrement count of open parentheses */ + (*initnode)(p, grbg, q, len, nodei, ntips, + parens, tip, treenode, nodep, str, ch, treefile); + /* do what needs to be done at a tip */ + } else + getch(ch, parens, treefile); + if (q != NULL) + hookup(q, (*p)); /* now hook up */ + (*initnode)(p, grbg, q, len, nodei, ntips, + parens, iter, treenode, nodep, str, ch, treefile); + /* do what needs to be done to variable iter */ + if ((*ch) == ':') + (*initnode)(p, grbg, q, len, nodei, ntips, + parens, length, treenode, nodep, str, ch, treefile); + /* do what needs to be done with length */ + 
else if ((*ch) != ';' && (*ch) != '[') + (*initnode)(p, grbg, q, len, nodei, ntips, + parens, hsnolength, treenode, nodep, str, ch, treefile); + /* ... or what needs to be done when no length */ + if ((*ch) == '[') + (*initnode)(p, grbg, q, len, nodei, ntips, + parens, treewt, treenode, nodep, str, ch, treefile); + /* ... for processing a tree weight */ + else if ((*ch) == ';') /* ... and at end of tree */ + (*initnode)(p, grbg, q, len, nodei, ntips, + parens, unittrwt, treenode, nodep, str, ch, treefile); +} /* addelement */ + + +void treeread (FILE *treefile, node **root, pointarray treenode, + boolean *goteof, boolean *first, pointarray nodep, + long *nextnode, boolean *haslengths, node **grbg, initptr initnode, + boolean unifok,long maxnodes) +{ + /* read in user-defined tree and set it up */ + char ch; + long parens = 0; + long ntips = 0; + + (*goteof) = false; + (*nextnode) = spp; + + /* eat blank lines */ + while (eoln(treefile) && !eoff(treefile)) + scan_eoln(treefile); + + if (eoff(treefile)) { + (*goteof) = true; + return; + } + + getch(&ch, &parens, treefile); + + while (ch != '(') { + /* Eat everything in the file (i.e. 
digits, tabs) until you + encounter an open-paren */ + getch(&ch, &parens, treefile); + } + (*haslengths) = true; + addelement(root, NULL, &ch, &parens, treefile, + treenode, goteof, first, nodep, nextnode, &ntips, + haslengths, grbg, initnode,unifok,maxnodes); + + /* Eat blank lines and end of current line*/ + do { + scan_eoln(treefile); + } + while (eoln(treefile) && !eoff(treefile)); + + (*first) = false; + if (parens != 0) { + printf("\n\nERROR in tree file: unmatched parentheses\n\n"); + exxit(-1); + } +} /* treeread */ + + +void addelement2(node *q, Char *ch, long *parens, FILE *treefile, + pointarray treenode, boolean lngths, double *trweight, boolean *goteof, + long *nextnode, long *ntips, long no_species, boolean *haslengths, + boolean unifok,long maxnodes) +{ + /* recursive procedure adds nodes to user-defined tree + -- old-style bifurcating-only version */ + node *pfirst = NULL, *p; + long i, len, current_loop_index; + boolean notlast, minusread; + Char str[MAXNCH]; + double valyew, divisor; + long furs = 0; + + if ((*ch) == '(') { + + current_loop_index = (*nextnode) + spp; + (*nextnode)++; + + if ( maxnodes != -1 && current_loop_index > maxnodes) { + printf("ERROR in intree file: Attempting to allocate too many nodes\n"); + printf("This is usually caused by a unifurcation. 
To use this\n"); + printf("intree with this program use retree to read and write\n"); + printf("this tree.\n"); + exxit(-1); + } + /* This is an assignment of an interior node */ + p = treenode[current_loop_index]; + pfirst = p; + notlast = true; + while (notlast) { + furs++; + /* This while loop goes through a circle (triad for + bifurcations) of nodes */ + p = p->next; + /* added to ensure that non base nodes in loops have indices */ + p->index = current_loop_index + 1; + + getch(ch, parens, treefile); + + addelement2(p, ch, parens, treefile, treenode, lngths, trweight, + goteof, nextnode, ntips, no_species, haslengths,unifok,maxnodes); + + if ((*ch) == ')') { + notlast = false; + do { + getch(ch, parens, treefile); + } while ((*ch) != ',' && (*ch) != ')' && + (*ch) != '[' && (*ch) != ';' && (*ch) != ':'); + } + } + if ( furs <= 1 && !unifok ) { + printf("ERROR in intree file: A Unifurcation was detetected.\n"); + printf("To use this intree with this program use retree to read and"); + printf(" write this tree\n"); + exxit(-1); + } + + } else if ((*ch) != ')') { + for (i = 0; i < MAXNCH; i++) + str[i] = '\0'; + len = take_name_from_tree (ch, str, treefile); + match_names_to_data (str, treenode, &p, spp); + pfirst = p; + if ((*ch) == ')') + (*parens)--; + (*ntips)++; + strncpy (p->nayme, str, len); + } else + getch(ch, parens, treefile); + + if ((*ch) == '[') { /* getting tree weight from last comment field */ + if (!eoln(treefile)) { + fscanf(treefile, "%lf", trweight); + getch(ch, parens, treefile); + if (*ch != ']') { + printf("\n\nERROR: Missing right square bracket\n\n"); + exxit(-1); + } + else { + getch(ch, parens, treefile); + if (*ch != ';') { + printf("\n\nERROR: Missing semicolon after square brackets\n\n"); + exxit(-1); + } + } + } + } + else if ((*ch) == ';') { + (*trweight) = 1.0 ; + if (!eoln(treefile)) + printf("WARNING: tree weight set to 1.0\n"); + } + else + (*haslengths) = ((*haslengths) && q == NULL); + + if (q != NULL) + hookup(q, pfirst); + 
+ if ((*ch) == ':') { + processlength(&valyew, &divisor, ch, + &minusread, treefile, parens); + if (q != NULL) { + if (!minusread) + q->oldlen = valyew / divisor; + else + q->oldlen = 0.0; + if (lngths) { + q->v = valyew / divisor; + q->back->v = q->v; + q->iter = false; + q->back->iter = false; + q->back->iter = false; + } + } + } + +} /* addelement2 */ + + +void treeread2 (FILE *treefile, node **root, pointarray treenode, + boolean lngths, double *trweight, boolean *goteof, + boolean *haslengths, long *no_species,boolean unifok,long maxnodes) +{ + /* read in user-defined tree and set it up + -- old-style bifurcating-only version */ + char ch; + long parens = 0; + long ntips = 0; + long nextnode; + + (*goteof) = false; + nextnode = 0; + + /* Eats all blank lines at start of file */ + while (eoln(treefile) && !eoff(treefile)) + scan_eoln(treefile); + + if (eoff(treefile)) { + (*goteof) = true; + return; + } + + getch(&ch, &parens, treefile); + + while (ch != '(') { + /* Eat everything in the file (i.e. 
digits, tabs) until you + encounter an open-paren */ + getch(&ch, &parens, treefile); + } + + addelement2(NULL, &ch, &parens, treefile, treenode, lngths, trweight, + goteof, &nextnode, &ntips, (*no_species), haslengths,unifok,maxnodes); + (*root) = treenode[*no_species]; + + /*eat blank lines */ + while (eoln(treefile) && !eoff(treefile)) + scan_eoln(treefile); + + (*root)->oldlen = 0.0; + + if (parens != 0) { + printf("\n\nERROR in tree file: unmatched parentheses\n\n"); + exxit(-1); + } +} /* treeread2 */ + + +void exxit(int exitcode) +{ +#ifdef WIN32 + if (exitcode == 0) +#endif + exit (exitcode); +#ifdef WIN32 + else { + puts ("Hit Enter or Return to close program."); + puts(" You may have to hit Enter or Return twice."); + getchar (); + getchar (); + phyRestoreConsoleAttributes(); + exit (exitcode); + } +#endif +} /* exxit */ + + +char gettc(FILE* file) +{ /* catch eof's so that other functions not expecting an eof + * won't have to worry about it */ + int ch; + + ch=getc(file); + + if (ch == EOF ) { + puts("Unexpected End of File"); + exxit(-1); + } + + if ( ch == '\r' ) { + ch = getc(file); + if ( ch != '\n' ) + ungetc(ch,file); + ch = '\n'; + } + return ch; +} /* gettc */ + +void unroot(tree *t, long nonodes) +{ + /* used by fitch, restml and contml */ + if (t->start->back == NULL) { + if (t->start->next->back->tip) + t->start = t->start->next->next->back; + else t->start = t->start->next->back; + } + if (t->start->next->back == NULL) { + if (t->start->back->tip) + t->start = t->start->next->next->back; + else t->start = t->start->back; + } + if (t->start->next->next->back == NULL) { + if (t->start->back->tip) + t->start = t->start->next->back; + else t->start = t->start->back; + } + + + unroot_r(t->start,t->nodep,nonodes); + unroot_r(t->start->back, t->nodep, nonodes); +} + +void unroot_here(node* root, node** nodep, long nonodes) +{ + node* tmpnode; + double newl; + /* used by unroot */ + /* assumes bifurcation this is ok in the programs that use it */ + 
+ + newl = root->next->oldlen + root->next->next->oldlen; + root->next->back->oldlen = newl; + root->next->next->back->oldlen = newl; + + newl = root->next->v + root->next->next->v; + root->next->back->v = newl; + root->next->next->back->v = newl; + + root->next->back->back=root->next->next->back; + root->next->next->back->back = root->next->back; + + while ( root->index != nonodes ) { + tmpnode = nodep[ root->index ]; + nodep[root->index] = root; + root->index++; + root->next->index++; + root->next->next->index++; + nodep[root->index - 2] = tmpnode; + tmpnode->index--; + tmpnode->next->index--; + tmpnode->next->next->index--; + } +} + +void unroot_r(node* p, node** nodep, long nonodes) +{ + /* used by unroot */ + node *q; + + if ( p->tip) return; + + q = p->next; + while ( q != p ) { + if (q->back == NULL) + unroot_here(q,nodep,nonodes); + else unroot_r(q->back,nodep,nonodes); + q = q->next; + } +} + +void clear_connections(tree *t, long nonodes) +{ + long i; + for ( i = 0 ; i < nonodes ; i++) { + if ( i > spp) { + t->nodep[i]->next->back = NULL; + t->nodep[i]->next->v = 0; + t->nodep[i]->next->next->back = NULL; + t->nodep[i]->next->next->v = 0; + } + t->nodep[i]->back = NULL; + t->nodep[i]->v = 0; + } +} + +#ifdef WIN32 +void phySaveConsoleAttributes() +{ + GetConsoleScreenBufferInfo( hConsoleOutput, &savecsbi ); +} /* PhySaveConsoleAttributes */ + + +void phySetConsoleAttributes() +{ + hConsoleOutput = GetStdHandle(STD_OUTPUT_HANDLE); + + phySaveConsoleAttributes(); + + SetConsoleTextAttribute(hConsoleOutput, + BACKGROUND_GREEN | BACKGROUND_BLUE | BACKGROUND_INTENSITY); +} /* phySetConsoleAttributes */ + + +void phyRestoreConsoleAttributes() +{ + COORD coordScreen = { 0, 0 }; + DWORD cCharsWritten; + DWORD dwConSize; + + dwConSize = savecsbi.dwSize.X * savecsbi.dwSize.Y; + + SetConsoleTextAttribute(hConsoleOutput, savecsbi.wAttributes); + + FillConsoleOutputAttribute( hConsoleOutput, savecsbi.wAttributes, + dwConSize, coordScreen, &cCharsWritten ); +} /* 
phyRestoreConsoleAttributes */ + + +void phyFillScreenColor() +{ + COORD coordScreen = { 0, 0 }; + DWORD cCharsWritten; + CONSOLE_SCREEN_BUFFER_INFO csbi; /* to get buffer info */ + DWORD dwConSize; + + GetConsoleScreenBufferInfo( hConsoleOutput, &csbi ); + dwConSize = csbi.dwSize.X * csbi.dwSize.Y; + + FillConsoleOutputAttribute( hConsoleOutput, csbi.wAttributes, + dwConSize, coordScreen, &cCharsWritten ); +} /* PhyFillScreenColor */ + + +void phyClearScreen() +{ + COORD coordScreen = { 0, 0 }; /* here's where we'll home the + cursor */ + DWORD cCharsWritten; + CONSOLE_SCREEN_BUFFER_INFO csbi; /* to get buffer info */ + DWORD dwConSize; /* number of character cells in + the current buffer */ + + /* get the number of character cells in the current buffer */ + + GetConsoleScreenBufferInfo( hConsoleOutput, &csbi ); + dwConSize = csbi.dwSize.X * csbi.dwSize.Y; + + /* fill the entire screen with blanks */ + + FillConsoleOutputCharacter( hConsoleOutput, (TCHAR) ' ', + dwConSize, coordScreen, &cCharsWritten ); + + /* get the current text attribute */ + + GetConsoleScreenBufferInfo( hConsoleOutput, &csbi ); + + /* now set the buffer's attributes accordingly */ + + FillConsoleOutputAttribute( hConsoleOutput, csbi.wAttributes, + dwConSize, coordScreen, &cCharsWritten ); + + /* put the cursor at (0, 0) */ + + SetConsoleCursorPosition( hConsoleOutput, coordScreen ); + return; +} /* PhyClearScreen */ +#endif + diff --git a/forester/archive/RIO/others/phylip_mod/src/phylip.h b/forester/archive/RIO/others/phylip_mod/src/phylip.h new file mode 100644 index 0000000..205d9de --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/phylip.h @@ -0,0 +1,607 @@ +/*Modified by Christian Zmasek. Use at your own risk.*/ + + +#ifndef _PHYLIP_H_ +#define _PHYLIP_H_ + +/* version 3.6. (c) Copyright 1993-2004 by the University of Washington. + Written by Joseph Felsenstein, Akiko Fuseki, Sean Lamont, Andrew Keeffe, + Mike Palczewski, Doug Buxton and Dan Fineman. 
+ Permission is granted to copy and use this program provided no fee is + charged for it and provided that this copyright notice is not removed. */ + +#define VERSION "3.65" + +/* machine-specific stuff: + based on a number of factors in the library stdlib.h, we will try + to determine what kind of machine/compiler this program is being + built on. However, it doesn't always succeed. However, if you have + ANSI conforming C, it will probably work. + + We will try to figure out machine type + based on defines in stdio, and compiler-defined things as well.: */ + +#include +#include +#ifdef WIN32 +#include + +void phyClearScreen(void); +void phySaveConsoleAttributes(void); +void phySetConsoleAttributes(void); +void phyRestoreConsoleAttributes(void); +void phyFillScreenColor(void); + +#endif + +#ifdef GNUDOS +#define DJGPP +#define DOS +#endif + +#ifdef THINK_C +#define MAC +#endif +#ifdef __MWERKS__ +#ifndef WIN32 +#define MAC +#endif +#endif + +#ifdef __CMS_OPEN +#define CMS +#define EBCDIC true +#define INFILE "infile data" +#define OUTFILE "outfile data" +#define FONTFILE "fontfile data" +#define PLOTFILE "plotfile data" +#define INTREE "intree data" +#define INTREE2 "intree data 2" +#define OUTTREE "outtree data" +#define CATFILE "categories data" +#define WEIGHTFILE "weights data" +#define ANCFILE "ancestors data" +#define MIXFILE "mixture data" +#define FACTFILE "factors data" +#else +#define EBCDIC false +#define INFILE "infile" +#define OUTFILE "outfile" +#define FONTFILE "fontfile" /* on unix this might be /usr/local/lib/fontfile */ +#define PLOTFILE "plotfile" +#define INTREE "intree" +#define INTREE2 "intree2" +#define OUTTREE "outtree" +#define CATFILE "categories" +#define WEIGHTFILE "weights" +#define ANCFILE "ancestors" +#define MIXFILE "mixture" +#define FACTFILE "factors" +#endif + +#ifdef L_ctermid /* try and detect for sysV or V7. 
*/ +#define SYSTEM_FIVE +#endif + +#ifdef sequent +#define SYSTEM_FIVE +#endif + +#ifndef SYSTEM_FIVE +#include +# if defined(_STDLIB_H_) || defined(_H_STDLIB) || defined(H_SCCSID) || defined(unix) +# define UNIX +# define MACHINE_TYPE "BSD Unix C" +# endif +#endif + + +#ifdef __STDIO_LOADED +#define VMS +#define MACHINE_TYPE "VAX/VMS C" +#endif + +#ifdef __WATCOMC__ +#define QUICKC +#define WATCOM +#define DOS +#include "graph.h" +#endif +/* watcom-c has graphics library calls that are almost identical to * + * quick-c, so the "QUICKC" symbol name stays. */ + + +#ifdef _QC +#define MACHINE_TYPE "MS-DOS / Quick C" +#define QUICKC +#include "graph.h" +#define DOS +#endif + +#ifdef _DOS_MODE +#define MACHINE_TYPE "MS-DOS /Microsoft C " +#define DOS /* DOS is always defined if on a DOS machine */ +#define MSC /* MSC is defined for microsoft C */ +#endif + +#ifdef __MSDOS__ /* TURBO c compiler, ONLY (no other DOS C compilers) */ +#define DOS +#define TURBOC +#include +#include +#endif + +#ifdef DJGPP /* DJ Delorie's original gnu C/C++ port */ +#include +#endif + +#ifndef MACHINE_TYPE +#define MACHINE_TYPE "ANSI C" +#endif + +#ifdef DOS +#define MALLOCRETURN void +#else +#define MALLOCRETURN void +#endif +#ifdef VMS +#define signed /* signed doesn't exist in VMS */ +#endif + +/* default screen types */ +/* if on a DOS but not a Windows system can use IBM PC screen controls */ +#ifdef DOS +#ifndef WIN32 +#define IBMCRT true +#define ANSICRT false +#endif +#endif +/* if on a Mac cannot use screen controls */ +#ifdef MAC +#define IBMCRT false +#define ANSICRT false +#endif +/* if on a Windows system can use IBM PC screen controls */ +#ifdef WIN32 +#define IBMCRT true +#define ANSICRT false +#endif +/* otherwise, let's assume we are on a Linux or Unix system + with ANSI terminal controls */ +#ifndef MAC +#ifndef DOS +#ifndef WIN32 +#define IBMCRT false +#define ANSICRT true +#endif +#endif +#endif + +#ifdef DJGPP +#undef MALLOCRETURN +#define MALLOCRETURN void +#endif + + 
+/* includes: */ +#ifdef UNIX +#include +#else +#include +#endif + +#include +#include + +#ifdef MAC +#ifdef DRAW +#include "interface.h" +#else +#include "macface.h" +#endif +#define getch gettch +#endif + +/* directory delimiters */ +#ifdef MAC +#define DELIMITER ':' +#else +#ifdef WIN32 +#define DELIMITER '\\' +#else +#define DELIMITER '/' +#endif +#endif + + +#define FClose(file) if (file) fclose(file) ; file=NULL +#define Malloc(x) mymalloc((long)x) + +typedef void *Anyptr; +#define Signed signed +#define Const const +#define Volatile volatile +#define Char char /* Characters (not bytes) */ +#define Static static /* Private global funcs and vars */ +#define Local static /* Nested functions */ + +typedef unsigned char boolean; + +#define true 1 +#define false 0 +#define SETBITS 31 + +MALLOCRETURN *mymalloc(long); + +#define FNMLNGTH 200 /* length of array to store a file name */ +#define MAXNCH 26 /*changed from to 20 to 26 by CZ 2006-07-28 */ +#define nmlngth 26 /*changed from to 10 to 26 by CZ 2006-07-28 */ /* number of characters in species name */ +#define maxcategs 9 /* maximum number of site types */ +#define maxcategs2 11 /* maximum number of site types + 2 */ +#define point "." +#define pointe '.' 
+#define down 2 +#define MAXSHIMOTREES 100 + +#define smoothings 4 /* number of passes through smoothing algorithm */ +#define iterations 4 /* number of iterates for each branch */ +#define epsilon 0.0001 /* small number used in makenewv */ +#define EPSILON 0.00001 /* small number used in hermite root-finding */ +#define initialv 0.1 /* starting branch length unless otherwise */ +#define over 60 /* maximum width all branches of tree on screen */ +#define SQRTPI 1.7724538509055160273 +#define SQRT2 1.4142135623730950488 + +#define NLRSAVES 5 /* number of views that need to be saved during local * + * rearrangement */ + +typedef long *steptr; +typedef long longer[6]; +typedef char naym[MAXNCH]; +typedef long *bitptr; +typedef double raterootarray[maxcategs2][maxcategs2]; + +typedef struct bestelm { + long *btree; + boolean gloreange; + boolean locreange; + boolean collapse; +} bestelm; + +extern FILE *infile, *outfile, *intree, *intree2, *outtree, + *weightfile, *catfile, *ancfile, *mixfile, *factfile; +extern long spp, words, bits; +extern boolean ibmpc, ansi, tranvsp; +extern naym *nayme; /* names of species */ + + +#define ebcdic EBCDIC + +typedef Char plotstring[MAXNCH]; + +/* Approx. 1GB, used to test for memory request errors */ +#define TOO_MUCH_MEMORY 1000000000 + + +/* The below pre-processor commands define the type used to store + group arrays. We can't use #elif for metrowerks, so we use + cascaded if statements */ +#include + +/* minimum double we feel safe with, anything less will be considered + underflow */ +#define MIN_DOUBLE 10e-100 + +/* K&R says that there should be a plus in front of the number, but no + machine we've seen actually uses one; we'll include it just in + case. 
*/ +#define MAX_32BITS 2147483647 +#define MAX_32BITS_PLUS +2147483647 + +/* If ints are 4 bytes, use them */ +#if INT_MAX == MAX_32BITS +typedef int group_type; + +#else + #if INT_MAX == MAX_32BITS_PLUS + typedef int group_type; + + #else + /* Else, if longs are 4 bytes, use them */ + #if LONG_MAX == MAX_32BITS + typedef long group_type; + + #else + #if LONG_MAX == MAX_32BITS_PLUS + typedef long group_type; + + /* Default to longs */ + #else + typedef long group_type; + #endif + + #endif + #endif +#endif + +/* for many programs */ + +#define maxuser 1000 /* maximum number of user-defined trees */ + +typedef Char **sequence; + +typedef enum { + A, C, G, T, O +} bases; + +typedef enum { + alanine, arginine, asparagine, aspartic, cysteine, + glutamine, glutamic, glycine, histidine, isoleucine, + leucine, lysine, methionine, phenylalanine, proline, + serine, threonine, tryptophan, tyrosine, valine +} acids; + +/* for Pars */ + +typedef enum { + zero = 0, one, two, three, four, five, six, seven +} discbases; + +/* for Protpars */ + +typedef enum { + ala, arg, asn, asp, cys, gln, glu, gly, his, ileu, leu, lys, met, phe, pro, + ser1, ser2, thr, trp, tyr, val, del, stop, asx, glx, ser, unk, quest +} aas; + +typedef double sitelike[(long)T - (long)A + 1]; /* used in dnaml, dnadist */ +typedef double psitelike[(long)valine - (long)alanine + 1]; + /* used in proml */ + +typedef long *baseptr; /* baseptr used in dnapars, dnacomp & dnapenny */ +typedef long *baseptr2; /* baseptr used in dnamove */ +typedef unsigned char *discbaseptr; /* discbaseptr used in pars */ +typedef sitelike *ratelike; /* used in dnaml ... 
*/ +typedef psitelike *pratelike; /* used in proml */ +typedef ratelike *phenotype; /* phenotype used in dnaml, dnamlk, dnadist */ +typedef pratelike *pphenotype; /* pphenotype used in proml */ +typedef double *sitelike2; +typedef sitelike2 *phenotype2; /* phenotype2 used in restml */ +typedef double *phenotype3; /* for continuous char programs */ + +typedef double *vector; /* used in distance programs */ + +typedef long nucarray[(long)O - (long)A + 1]; +typedef long discnucarray[(long)seven - (long)zero + 1]; + +typedef enum { nocollap, tocollap, undefined } collapstates; + +typedef enum { bottom, nonbottom, hslength, tip, iter, length, + hsnolength, treewt, unittrwt } initops; + + +typedef double **transmatrix; +typedef transmatrix *transptr; /* transptr used in restml */ + +typedef long sitearray[3]; +typedef sitearray *seqptr; /* seqptr used in protpars */ + +typedef struct node { /* one node record shared by all programs; most fields are used only by the programs named in their comments */ + struct node *next, *back; + plotstring nayme; + long naymlength, tipsabove, index; + double times_in_tree; /* Previously known as cons_index */ + double xcoord, ycoord; + long long_xcoord, long_ycoord; /* for use in cons. 
*/ + double oldlen, length, r, theta, oldtheta, width, depth, + tipdist, lefttheta, righttheta; + group_type *nodeset; /* used by accumulate -plc */ + long ymin, ymax; /* used by printree -plc */ + boolean haslength; /* haslength used in dnamlk */ + boolean iter; /* iter used in dnaml, fitch & restml */ + boolean initialized; /* initialized used in dnamlk & restml */ + long branchnum; /* branchnum used in restml */ + phenotype x; /* x used in dnaml, dnamlk, dnadist */ + phenotype2 x2; /* x2 used in restml */ + phenotype3 view; /* contml etc */ + pphenotype protx; /* protx used in proml */ + aas *seq; /* the sequence used in protpars */ + seqptr siteset; /* temporary storage for aa's used in protpars */ + double v, deltav, ssq; /* ssq used only in contrast */ + double bigv; /* bigv used in contml */ + double tyme, oldtyme; /* used in dnamlk */ + double t; /* time in kitsch */ + boolean sametime; /* bookkeeps scrunched nodes in kitsch */ + double weight; /* weight of node used by scrunch in kitsch */ + boolean processed; /* used by evaluate in kitsch */ + boolean deleted; /* true if node is deleted (retree) */ + boolean hasname; /* true if tip has a name (retree) */ + double beyond; /* distance beyond this node to most distant tip */ + /* (retree) */ + boolean deadend; /* true if no undeleted nodes beyond this node */ + /* (retree) */ + boolean onebranch; /* true if there is one undeleted node beyond */ + /* this node (retree) */ + struct node *onebranchnode; + /* if there is, a pointer to that node (retree) */ + double onebranchlength; /* if there is, the distance from here to there */ + /* (retree) */ + boolean onebranchhaslength; /* true if there is a valid combined length */ + /* from here to there (retree) */ + collapstates collapse; /* used in dnapars & dnacomp */ + boolean tip; + boolean bottom; /* used in dnapars & dnacomp, disc char */ + boolean visited; /* used in dnapars & dnacomp disc char */ + baseptr base; /* the sequence in dnapars/comp/penny */ + 
discbaseptr discbase; /* the sequence in pars */ + baseptr2 base2; /* the sequence in dnamove */ + baseptr oldbase; /* record previous sequence */ + discbaseptr olddiscbase; /* record previous sequence */ + long numdesc; /* number of immediate descendants */ + nucarray *numnuc; /* bookkeeps number of nucleotides */ + discnucarray *discnumnuc; /* bookkeeps number of nucleotides */ + steptr numsteps; /* bookkeeps steps */ + steptr oldnumsteps; /* record previous steps */ + double sumsteps; /* bookkeeps sum of steps */ + nucarray cumlengths; /* bookkeeps cumulative minimum lengths */ + discnucarray disccumlengths; /* bookkeeps cumulative minimum lengths */ + nucarray numreconst; /* bookkeeps number of reconstructions */ + discnucarray discnumreconst; /* bookkeeps number of reconstructions */ + vector d, w; /* for distance matrix programs */ + double dist; /* dist used in fitch */ + bitptr stateone, statezero; /* discrete char programs */ + long maxpos; /* maxpos used in Clique */ + Char state; /* state used in Dnamove, Dolmove & Move */ + double* underflows; /* used to record underflow */ +} node; + +typedef node **pointarray; + +typedef struct tree { + pointarray nodep; + double likelihood; + transptr trans; /* restml */ + long *freetrans; /* restml */ + long transindex; /* restml */ + node *start; + node *root; +} tree; + +typedef void (*initptr)(node **, node **, node *, long, long, + long *, long *, initops, pointarray, + pointarray, Char *, Char *, FILE *); + +#ifndef OLDC +/* function prototypes */ +void scan_eoln(FILE *); +boolean eoff(FILE *); +boolean eoln(FILE *); +int filexists(char *); +const char* get_command_name (const char *); +void getstryng(char *); +void openfile(FILE **,const char *,const char *,const char *,const char *, + char *); +void cleerhome(void); +void loopcount(long *, long); +double randum(longer); +void randumize(longer, long *); +double normrand(longer); +long readlong(const char *); + +void uppercase(Char *); +void initseed(long *, 
long *, longer); +void initjumble(long *, long *, longer, long *); +void initoutgroup(long *, long); +void initthreshold(double *); +void initcatn(long *); +void initcategs(long, double *); +void initprobcat(long, double *, double *); +double logfac (long); +double halfroot(double (*func)(long , double), long, double, double); +double hermite(long, double); +void initlaguerrecat(long, double, double *, double *); +void root_hermite(long, double *); +void hermite_weight(long, double *, double *); +void inithermitcat(long, double, double *, double *); +void lgr(long, double, raterootarray); +double glaguerre(long, double, double); +void initgammacat(long, double, double *, double *); +void inithowmany(long *, long); +void inithowoften(long *); + +void initlambda(double *); +void initfreqs(double *, double *, double *, double *); +void initratio(double *); +void initpower(double *); +void initdatasets(long *); +void justweights(long *); +void initterminal(boolean *, boolean *); +void initnumlines(long *); +void initbestrees(bestelm *, long, boolean); +void newline(FILE *, long, long, long); + +void inputnumbers(long *, long *, long *, long); +void inputnumbersold(long *, long *, long *, long); +void inputnumbers2(long *, long *, long n); +void inputnumbers3(long *, long *); +void samenumsp(long *, long); +void samenumsp2(long); +void readoptions(long *, const char *); +void matchoptions(Char *, const char *); +void inputweights(long, steptr, boolean *); +void inputweightsold(long, steptr, boolean *); +void inputweights2(long, long, long *, steptr, boolean *, const char *); +void printweights(FILE *, long, long, steptr, const char *); + +void inputcategs(long, long, steptr, long, const char *); +void printcategs(FILE *, long, steptr, const char *); +void inputfactors(long, Char *, boolean *); +void inputfactorsnew(long, Char *, boolean *); +void printfactors(FILE *, long, Char *, const char *); +void headings(long, const char *, const char *); +void initname(long); 
+void findtree(boolean *,long *,long,long *,bestelm *); +void addtree(long,long *,boolean,long *,bestelm *); +long findunrearranged(bestelm *, long, boolean); +boolean torearrange(bestelm *, long); + +void reducebestrees(bestelm *, long *); +void shellsort(double *, long *, long); +void getch(Char *, long *, FILE *); +void getch2(Char *, long *); +void findch(Char, Char *, long); +void findch2(Char, long *, long *, Char *); +void findch3(Char, Char *, long, long); +void processlength(double *,double *,Char *,boolean *,FILE *,long *); +void writename(long, long, long *); +void memerror(void); + +void odd_malloc(long); + +void gnu(node **, node **); +void chuck(node **, node *); +void zeronumnuc(node *, long); +void zerodiscnumnuc(node *, long); +void allocnontip(node *, long *, long); +void allocdiscnontip(node *, long *, unsigned char *, long ); +void allocnode(node **, long *, long); +void allocdiscnode(node **, long *, unsigned char *, long ); +void gnutreenode(node **, node **, long, long, long *); +void gnudisctreenode(node **, node **, long , long, long *, + unsigned char *); + +void chucktreenode(node **, node *); +void setupnode(node *, long); +long count_sibs (node *); +void inittrav (node *); +void commentskipper(FILE ***, long *); +long countcomma(FILE **, long *); +long countsemic(FILE **); +void hookup(node *, node *); +void link_trees(long, long , long, pointarray); +void allocate_nodep(pointarray *, FILE **, long *); + +void malloc_pheno(node *, long, long); +void malloc_ppheno(node *, long, long); +long take_name_from_tree (Char *, Char *, FILE *); +void match_names_to_data (Char *, pointarray, node **, long); +void addelement(node **, node *, Char *, long *, FILE *, pointarray, + boolean *, boolean *, pointarray, long *, long *, boolean *, + node **, initptr,boolean,long); +void treeread (FILE *, node **, pointarray, boolean *, boolean *, + pointarray, long *, boolean *, node **, initptr,boolean,long); +void addelement2(node *, Char *, long *, FILE 
*, pointarray, boolean, + double *, boolean *, long *, long *, long, boolean *,boolean, + long); +void treeread2 (FILE *, node **, pointarray, boolean, double *, + boolean *, boolean *, long *,boolean,long); +void exxit (int); +void countup(long *loopcount, long maxcount); +char gettc(FILE* file); +void unroot_r(node* p,node ** nodep, long nonodes); +void unroot(tree* t,long nonodes); +void unroot_here(node* root, node** nodep, long nonodes); +void clear_connections(tree *t, long nonodes); +void init(int argc, char** argv); +#endif /* OLDC */ +#endif /* _PHYLIP_H_ */ diff --git a/forester/archive/RIO/others/phylip_mod/src/proml.c b/forester/archive/RIO/others/phylip_mod/src/proml.c new file mode 100644 index 0000000..0302897 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/proml.c @@ -0,0 +1,3202 @@ + +#include "phylip.h" +#include "seq.h" + +/* version 3.6. (c) Copyright 1993-2004 by the University of Washington. + Written by Joseph Felsenstein, Lucas Mix, Akiko Fuseki, Sean Lamont, + Andrew Keeffe, Dan Fineman, and Patrick Colacurcio. + Permission is granted to copy and use this program provided no fee is + charged for it and provided that this copyright notice is not removed. 
*/ + + +typedef long vall[maxcategs]; +typedef double contribarr[maxcategs]; + +#ifndef OLDC +/* function prototypes */ +void init_protmats(void); +void getoptions(void); +void makeprotfreqs(void); +void allocrest(void); +void doinit(void); +void inputoptions(void); +void input_protdata(long); +void makeweights(void); +void prot_makevalues(long, pointarray, long, long, sequence, steptr); +void prot_inittable(void); + +void alloc_pmatrix(long); +void getinput(void); +void inittravtree(node *); +void prot_nuview(node *); +void prot_slopecurv(node *, double, double *, double *, double *); +void makenewv(node *); +void update(node *); +void smooth(node *); +void make_pmatrix(double **, double **, double **, long, double, + double, double *, double **); +double prot_evaluate(node *, boolean); + +void treevaluate(void); +void promlcopy(tree *, tree *, long, long); +void proml_re_move(node **, node **); +void insert_(node *, node *, boolean); +void addtraverse(node *, node *, boolean); +void rearrange(node *, node *); +void proml_coordinates(node *, double, long *, double *); +void proml_printree(void); +void sigma(node *, double *, double *, double *); +void describe(node *); + +void prot_reconstr(node *, long); +void rectrav(node *, long, long); +void summarize(void); +void initpromlnode(node **, node **, node *, long, long, long *, long *, + initops, pointarray, pointarray, Char *, Char *, FILE *); +void dnaml_treeout(node *); +void buildnewtip(long, tree *); +void buildsimpletree(tree *); +void free_all_protx (long, pointarray); +void maketree(void); +void clean_up(void); +void globrearrange(void); +void proml_unroot(node* root, node** nodep, long nonodes) ; +void reallocsites(void); +void prot_freetable(void); +void free_pmatrix(long sib); +void alloclrsaves(void); +void freelrsaves(void); +void resetlrsaves(void); +/* function prototypes */ +#endif + + +long rcategs; +boolean haslengths; +long oldendsite=0; + +Char infilename[100], outfilename[100], intreename[100], 
outtreename[100], + catfilename[100], weightfilename[100]; +double *rate, *rrate, *probcat; +long nonodes2, sites, weightsum, categs, + datasets, ith, njumble, jumb; +long inseed, inseed0, parens; +boolean global, jumble, weights, trout, usertree, inserting = false, + ctgry, rctgry, auto_, hypstate, progress, mulsets, justwts, firstset, + improve, smoothit, polishing, lngths, gama, invar, usepmb, usepam, usejtt; +tree curtree, bestree, bestree2, priortree; +node *qwhere, *grbg, *addwhere; +double cv, alpha, lambda, invarfrac, bestyet; +long *enterorder; +steptr aliasweight; +contribarr *contribution, like, nulike, clai; +double **term, **slopeterm, **curveterm; +longer seed; +char *progname; +char aachar[26]="ARNDCQEGHILKMFPSTWYVBZX?*-"; /* initializer is exactly 26 characters, so the array has no terminating NUL and is not a C string */ +node **lrsaves; + +/* Local variables for maketree, propagated globally for c version: */ +long k, nextsp, numtrees, maxwhich, mx, mx0, mx1, shimotrees; +double dummy, maxlogl; +boolean succeeded, smoothed; +double **l0gf; +double *l0gl; +double **tbl; +Char ch, ch2; +long col; +vall *mp; + + +/* Variables introduced to allow for protein probability calculations */ +long max_num_sibs; /* maximum number of siblings used in a */ + /* nuview calculation; determines the */ + /* final size of pmatrices */ +double *eigmat; /* eig matrix variable */ +double **probmat; /* prob matrix variable */ +double ****dpmatrix; /* derivative of pmatrix */ +double ****ddpmatrix; /* derivative of xpmatrix -- NOTE(review): no xpmatrix is declared here; presumably the second derivative of pmatrix, confirm */ +double *****pmatrices; /* matrix of probabilities of protein */ + /* conversion. The 5 subscripts refer */ + /* to sibs, rcategs, categs, final and */ + /* initial states, respectively. 
*/ +double freqaa[20]; /* amino acid frequencies */ + +/* this JTT matrix decomposition thanks to Elisabeth Tillier */ +static double jtteigmat[] = +{0.0, -0.7031123, -0.6484345, -0.6086499, -0.5514432, +-0.772664, -0.8643413, -1.0620756, -0.9965552, -1.1671808, +-1.2222418,-0.4589201, -1.3103714, -1.4048038, -0.3170582, +-0.347935, -1.5311677, -1.6021194, -1.7991454, -1.8911888}; + +static double jttprobmat[20][20] = +{{0.076999996, 0.051000003, 0.043000004, 0.051999998, 0.019999996, 0.041, + 0.061999994, 0.073999997, 0.022999999, 0.052000004, 0.090999997, 0.058999988, + 0.024000007, 0.04, 0.050999992, 0.069, 0.059000006, 0.014000008, 0.032000004, + 0.066000005}, + {0.015604455, -0.068062363, 0.020106264, 0.070723273, 0.011702977, 0.009674053, + 0.074000798, -0.169750458, 0.005560808, -0.008208636, -0.012305869, + -0.063730179, -0.005674643, -0.02116828, 0.104586169, 0.016480839, 0.016765139, + 0.005936994, 0.006046367, -0.0082877}, + {-0.049778281, -0.007118197, 0.003801272, 0.070749616, 0.047506147, + 0.006447017, 0.090522425, -0.053620432, -0.008508175, 0.037170603, + 0.051805545, 0.015413608, 0.019939916, -0.008431976, -0.143511376, + -0.052486072, -0.032116542, -0.000860626, -0.02535993, 0.03843545}, + {-0.028906423, 0.092952047, -0.009615343, -0.067870117, 0.031970392, + 0.048338335, -0.054396304, -0.135916654, 0.017780083, 0.000129242, + 0.031267424, 0.116333586, 0.007499746, -0.032153596, 0.033517051, + -0.013719269, -0.00347293, -0.003291821, -0.02158326, -0.008862168}, + {0.037181176, -0.023106564, -0.004482225, -0.029899635, 0.118139633, + -0.032298569, -0.04683198, 0.05566988, -0.012622847, 0.002023096, + -0.043921088, -0.04792557, -0.003452711, -0.037744513, 0.020822974, + 0.036580187, 0.02331425, -0.004807711, -0.017504496, 0.01086673}, + {0.044754061, -0.002503471, 0.019452517, -0.015611487, -0.02152807, + -0.013131425, -0.03465365, -0.047928912, 0.020608851, 0.067843095, + -0.122130014, 0.002521499, 0.013021646, -0.082891087, -0.061590119, + 
0.016270856, 0.051468938, 0.002079063, 0.081019713, 0.082927944}, + {0.058917882, 0.007320741, 0.025278141, 0.000357541, -0.002831285, + -0.032453034, -0.010177288, -0.069447924, -0.034467324, 0.011422358, + -0.128478324, 0.04309667, -0.015319944, 0.113302422, -0.035052393, + 0.046885372, 0.06185183, 0.00175743, -0.06224497, 0.020282093}, + {-0.014562092, 0.022522921, -0.007094389, 0.03480089, -0.000326144, + -0.124039037, 0.020577906, -0.005056454, -0.081841576, -0.004381786, + 0.030826152, 0.091261631, 0.008878828, -0.02829487, 0.042718836, + -0.011180886, -0.012719227, -0.000753926, 0.048062375, -0.009399129}, + {0.033789571, -0.013512235, 0.088010984, 0.017580292, -0.006608005, + -0.037836971, -0.061344686, -0.034268357, 0.018190209, -0.068484614, + 0.120024744, -0.00319321, -0.001349477, -0.03000546, -0.073063759, + 0.081912399, 0.0635245, 0.000197, -0.002481798, -0.09108114}, + {-0.113947615, 0.019230545, 0.088819683, 0.064832765, 0.001801467, + -0.063829682, -0.072001633, 0.018429333, 0.057465965, 0.043901014, + -0.048050874, -0.001705918, 0.022637173, 0.017404665, 0.043877902, + -0.017089594, -0.058489485, 0.000127498, -0.029357194, 0.025943972}, + {0.01512923, 0.023603725, 0.006681954, 0.012360216, -0.000181447, + -0.023011838, -0.008960024, -0.008533239, 0.012569835, 0.03216118, + 0.061986403, -0.001919083, -0.1400832, -0.010669741, -0.003919454, + -0.003707024, -0.026806029, -0.000611603, -0.001402648, 0.065312824}, + {-0.036405351, 0.020816769, 0.011408213, 0.019787053, 0.038897829, + 0.017641789, 0.020858533, -0.006067252, 0.028617353, -0.064259496, + -0.081676567, 0.024421823, -0.028751676, 0.07095096, -0.024199434, + -0.007513119, -0.028108766, -0.01198095, 0.111761119, -0.076198809}, + {0.060831772, 0.144097327, -0.069151377, 0.023754576, -0.003322955, + -0.071618574, 0.03353154, -0.02795295, 0.039519769, -0.023453968, + -0.000630308, -0.098024591, 0.017672997, 0.003813378, -0.009266499, + -0.011192111, 0.016013873, -0.002072968, -0.010022044, 
-0.012526904}, + {-0.050776604, 0.092833081, 0.044069596, 0.050523021, -0.002628417, + 0.076542572, -0.06388631, -0.00854892, -0.084725311, 0.017401063, + -0.006262541, -0.094457679, -0.002818678, -0.0044122, -0.002883973, + 0.028729685, -0.004961596, -0.001498627, 0.017994575, -0.000232779}, + {-0.01894566, -0.007760205, -0.015160993, -0.027254587, 0.009800903, + -0.013443561, -0.032896517, -0.022734138, -0.001983861, 0.00256111, + 0.024823166, -0.021256768, 0.001980052, 0.028136263, -0.012364384, + -0.013782446, -0.013061091, 0.111173981, 0.021702122, 0.00046654}, + {-0.009444193, -0.042106824, -0.02535015, -0.055125574, 0.006369612, + -0.02945416, -0.069922064, -0.067221068, -0.003004999, 0.053624311, + 0.128862984, -0.057245803, 0.025550508, 0.087741073, -0.001119043, + -0.012036202, -0.000913488, -0.034864475, 0.050124813, 0.055534723}, + {0.145782464, -0.024348311, -0.031216873, 0.106174443, 0.00202862, + 0.02653866, -0.113657267, -0.00755018, 0.000307232, -0.051241158, + 0.001310685, 0.035275877, 0.013308898, 0.002957626, -0.002925034, + -0.065362319, -0.071844582, 0.000475894, -0.000112419, 0.034097762}, + {0.079840455, 0.018769331, 0.078685899, -0.084329807, -0.00277264, + -0.010099754, 0.059700608, -0.019209715, -0.010442992, -0.042100476, + -0.006020556, -0.023061786, 0.017246106, -0.001572858, -0.006703785, + 0.056301316, -0.156787357, -0.000303638, 0.001498195, 0.051363455}, + {0.049628261, 0.016475144, 0.094141653, -0.04444633, 0.005206131, + -0.001827555, 0.02195624, 0.013066683, -0.010415582, -0.022338403, + 0.007837197, -0.023397671, -0.002507095, 0.005177694, 0.017109561, + -0.202340113, 0.069681441, 0.000120736, 0.002201146, 0.004670849}, + {0.089153689, 0.000233354, 0.010826822, -0.004273519, 0.001440618, + 0.000436077, 0.001182351, -0.002255508, -0.000700465, 0.150589876, + -0.003911914, -0.00050154, -0.004564983, 0.00012701, -0.001486973, + -0.018902754, -0.054748555, 0.000217377, -0.000319302, -0.162541651}}; + +/* this PMB matrix 
decomposition due to Elisabeth Tillier */ +static double pmbeigmat[20] = +{0.0000001586972220,-1.8416770496147100, -1.6025046986139100,-1.5801012515121300, +-1.4987794099715900,-1.3520794233801900,-1.3003469390479700,-1.2439503327631300, +-1.1962574080244200,-1.1383730501367500,-1.1153278910708000,-0.4934843510654760, +-0.5419014550215590,-0.9657997830826700,-0.6276075673757390,-0.6675927795018510, +-0.6932641383465870,-0.8897872681859630,-0.8382698977371710,-0.8074694642446040}; + +static double pmbprobmat[20][20] = +{{0.0771762457248147,0.0531913844998640,0.0393445076407294,0.0466756566755510, +0.0286348361997465,0.0312327748383639,0.0505410248721427,0.0767106611472993, +0.0258916271688597,0.0673140562194124,0.0965705469252199,0.0515979465932174, +0.0250628079438675,0.0503492018628350,0.0399908189418273,0.0641898881894471, +0.0517539616710987,0.0143507440546115,0.0357994592438322,0.0736218495862984}, +{0.0368263046116572,-0.0006728917107827,0.0008590805287740,-0.0002764255356960, +0.0020152937187455,0.0055743720652960,0.0003213317669367,0.0000449190281568, +-0.0004226254397134,0.1805040629634510,-0.0272246813586204,0.0005904606533477, +-0.0183743200073889,-0.0009194625608688,0.0008173657533167,-0.0262629806302238, +0.0265738757209787,0.0002176606241904,0.0021315644838566,-0.1823229927207580}, +{-0.0194800075560895,0.0012068088610652,-0.0008803318319596,-0.0016044273960017, +-0.0002938633803197,-0.0535796754602196,0.0155163896648621,-0.0015006360762140, +0.0021601372013703,0.0268513218744797,-0.1085292493742730,0.0149753083138452, +0.1346457366717310,-0.0009371698759829,0.0013501708044116,0.0346352293103622, +-0.0276963770242276,0.0003643142783940,0.0002074817333067,-0.0174108903914110}, +{0.0557839400850153,0.0023271577185437,0.0183481103396687,0.0023339480096311, +0.0002013267015151,-0.0227406863569852,0.0098644845475047,0.0064721276774396, +0.0001389408104210,-0.0473713878768274,-0.0086984445005797,0.0026913674934634, 
+0.0283724052562196,0.0001063665179457,0.0027442574779383,-0.1875312134708470, +0.1279864877057640,0.0005103347834563,0.0003155113168637,0.0081451082759554}, +{0.0037510125027265,0.0107095920636885,0.0147305410328404,-0.0112351252180332, +-0.0001500408626446,-0.1523450933729730,0.0611532413339872,-0.0005496748939503, +0.0048714378736644,-0.0003826320053999,0.0552010244407311,0.0482555671001955, +-0.0461664995115847,-0.0021165008617978,-0.0004574454232187,0.0233755883688949, +-0.0035484915422384,0.0009090698422851,0.0013840637687758,-0.0073895139302231}, +{-0.0111512564930024,0.1025460064723080,0.0396772456883791,-0.0298408501361294, +-0.0001656742634733,-0.0079876311843289,0.0712644184507945,-0.0010780604625230, +-0.0035880882043592,0.0021070399334252,0.0016716329894279,-0.1810123023850110, +0.0015141703608724,-0.0032700852781804,0.0035503782441679,0.0118634302028026, +0.0044561606458028,-0.0001576678495964,0.0023470722225751,-0.0027457045397157}, +{0.1474525743949170,-0.0054432538500293,0.0853848892349828,-0.0137787746207348, +-0.0008274830358513,0.0042248844582553,0.0019556229305563,-0.0164191435175148, +-0.0024501858854849,0.0120908948084233,-0.0381456105972653,0.0101271614855119, +-0.0061945941321859,0.0178841099895867,-0.0014577779202600,-0.0752120602555032, +-0.1426985695849920,0.0002862275078983,-0.0081191734261838,0.0313401149422531}, +{0.0542034611735289,-0.0078763926211829,0.0060433542506096,0.0033396210615510, +0.0013965072374079,0.0067798903832256,-0.0135291136622509,-0.0089982442731848, +-0.0056744537593887,-0.0766524225176246,0.1881210263933930,-0.0065875518675173, +0.0416627569300375,-0.0953804133524747,-0.0012559228448735,0.0101622644292547, +-0.0304742453119050,0.0011702318499737,0.0454733434783982,-0.1119239362388150}, +{0.1069409037912470,0.0805064400880297,-0.1127352030714600,0.1001181253523260, +-0.0021480427488769,-0.0332884841459003,-0.0679837575848452,-0.0043812841356657, 
+0.0153418716846395,-0.0079441315103188,-0.0121766182046363,-0.0381127991037620, +-0.0036338726532673,0.0195324059593791,-0.0020165963699984,-0.0061222685010268, +-0.0253761448771437,-0.0005246410999057,-0.0112205170502433,0.0052248485517237}, +{-0.0325247648326262,0.0238753651653669,0.0203684886605797,0.0295666232678825, +-0.0003946714764213,-0.0157242718469554,-0.0511737848084862,0.0084725632040180, +-0.0167068828528921,0.0686962159427527,-0.0659702890616198,-0.0014289912494271, +-0.0167000964093416,-0.1276689083678200,0.0036575057830967,-0.0205958145531018, +0.0000368919612829,0.0014413626622426,0.1064360941926030,0.0863372661517408}, +{-0.0463777468104402,0.0394712148670596,0.1118686750747160,0.0440711686389031, +-0.0026076286506751,-0.0268454015202516,-0.1464943067133240,-0.0137514051835380, +-0.0094395514284145,-0.0144124844774228,0.0249103379323744,-0.0071832157138676, +0.0035592787728526,0.0415627419826693,0.0027040097365669,0.0337523666612066, +0.0316121324137152,-0.0011350177559026,-0.0349998884574440,-0.0302651879823361}, +{0.0142360925194728,0.0413145623127025,0.0324976427846929,0.0580930922002398, +-0.0586974207121084,0.0202001168873069,0.0492204086749069,0.1126593173463060, +0.0116620013776662,-0.0780333711712066,-0.1109786767320410,0.0407775100936731, +-0.0205013161312652,-0.0653458585025237,0.0347351829703865,0.0304448983224773, +0.0068813748197884,-0.0189002309261882,-0.0334507528405279,-0.0668143558699485}, +{-0.0131548829657936,0.0044244322828034,-0.0050639951827271,-0.0038668197633889, +-0.1536822386530220,0.0026336969165336,0.0021585651200470,-0.0459233839062969, +0.0046854727140565,0.0393815434593599,0.0619554007991097,0.0027456299925622, +0.0117574347936383,0.0373018612990383,0.0024818527553328,-0.0133956606027299, +-0.0020457128424105,0.0154178819990401,0.0246524142683911,0.0275363065682921}, +{-0.1542307272455030,0.0364861558267547,-0.0090880407008181,0.0531673937889863, 
+0.0157585615170580,0.0029986538457297,0.0180194047699875,0.0652152443589317, +0.0266842840376180,0.0388457366405908,0.0856237634510719,0.0126955778952183, +0.0099593861698250,-0.0013941794862563,0.0294065511237513,-0.1151906949298290, +-0.0852991447389655,0.0028699120202636,-0.0332087026659522,0.0006811857297899}, +{0.0281300736924501,-0.0584072081898638,-0.0178386569847853,-0.0536470338171487, +-0.0186881656029960,-0.0240008730656106,-0.0541064820498883,0.2217137098936020, +-0.0260500001542033,0.0234505236798375,0.0311127151218573,-0.0494139126682672, +0.0057093465049849,0.0124937286655911,-0.0298322975915689,0.0006520211333102, +-0.0061018680727128,-0.0007081999479528,-0.0060523759094034,0.0215845995364623}, +{0.0295321046399105,-0.0088296411830544,-0.0065057049917325,-0.0053478115612781, +-0.0100646496794634,-0.0015473619084872,0.0008539960632865,-0.0376381933046211, +-0.0328135588935604,0.0672161874239480,0.0667626853916552,-0.0026511651464901, +0.0140451514222062,-0.0544836996133137,0.0427485157912094,0.0097455780205802, +0.0177309072915667,-0.0828759701187452,-0.0729504795471370,0.0670731961252313}, +{0.0082646581043963,-0.0319918630534466,-0.0188454445200422,-0.0374976353856606, +0.0037131290686848,-0.0132507796987883,-0.0306958830735725,-0.0044119395527308, +-0.0140786756619672,-0.0180512599925078,-0.0208243802903953,-0.0232202769398931, +-0.0063135878270273,0.0110442171178168,0.1824538048228460,-0.0006644614422758, +-0.0069909097436659,0.0255407650654681,0.0099119399501151,-0.0140911517070698}, +{0.0261344441524861,-0.0714454044548650,0.0159436926233439,0.0028462736216688, +-0.0044572637889080,-0.0089474834434532,-0.0177570282144517,-0.0153693244094452, +0.1160919467206400,0.0304911481385036,0.0047047513411774,-0.0456535116423972, +0.0004491494948617,-0.0767108879444462,-0.0012688533741441,0.0192445965934123, +0.0202321954782039,0.0281039933233607,-0.0590403018490048,0.0364080426546883}, 
+{0.0115826306265004,0.1340228176509380,-0.0236200652949049,-0.1284484655137340, +-0.0004742338006503,0.0127617346949511,-0.0428560878860394,0.0060030732454125, +0.0089182609926781,0.0085353834972860,0.0048464809638033,0.0709740071429510, +0.0029940462557054,-0.0483434904493132,-0.0071713680727884,-0.0036840391887209, +0.0031454003250096,0.0246243550241551,-0.0449551277644180,0.0111449232769393}, +{0.0140356721886765,-0.0196518236826680,0.0030517022326582,0.0582672093364850, +-0.0000973895685457,0.0021704767224292,0.0341806268602705,-0.0152035987563018, +-0.0903198657739177,0.0259623214586925,0.0155832497882743,-0.0040543568451651, +0.0036477631918247,-0.0532892744763217,-0.0142569373662724,0.0104500681408622, +0.0103483945857315,0.0679534422398752,-0.0768068882938636,0.0280289727046158}} +; + + +static double pameigmat[] = {0.0, -0.2350753691875762, -0.2701991863800379, + -0.2931612442853115, -0.4262492032364507, -0.5395980482561625, + -0.7141172690079523, -0.7392844756151318, -0.7781761342200766, + -0.810032066366362, -0.875299712761124, -1.048227332164386, + -1.109594097332267, -1.298616073142234, -1.342036228188581, + -1.552599145527578, -1.658762802054814, -1.74893445623765, + -1.933280832903272, -2.206353522613025}; + +static double pamprobmat[20][20] = + {{0.087683339901135, 0.04051291829598762, 0.04087846315185977, + 0.04771603459744777, 0.03247095396561266, 0.03784612688594957, + 0.0504933695604875, 0.0898249006830755, 0.03285885059543713, + 0.0357514442352119, 0.0852464099207521, 0.07910313444070642, + 0.01488243946396588, 0.04100101908956829, 0.05158026947089499, + 0.06975497205982451, 0.05832757042475474, 0.00931264523877807, + 0.03171540880870517, 0.06303972920984541}, + {0.01943453646811026, -0.004492574160484092, 0.007694891061220776, + 0.01278399096887701, 0.0106157418450234, 0.007542140341575122, + 0.01326994069032819, 0.02615565199894889, 0.003123125764490066, + 0.002204507682495444, -0.004782898215768979, 0.01204241965177619, + 
0.0007847400096924341, -0.03043626073172116, 0.01221202591902536, + 0.01100527004684405, 0.01116495631339549, -0.0925364931988571, + -0.02622065387931562, 0.00843494142432107}, + {0.01855357100209072, 0.01493642835763868, 0.0127983090766285, + 0.0200533250704364, -0.1681898360107787, 0.01551657969909255, + 0.02128060163107209, 0.03100633591848964, 0.00845480845269879, + 0.000927149370785571, 0.00937207565817036, 0.03490557769673472, + 0.00300443019551563, -0.02590837220264415, 0.01329376859943192, + 0.006854110889741407, 0.01102593860528263, 0.003360844186685888, + -0.03459712356647764, 0.003351477369404443}, + {0.02690642688200102, 0.02131745801890152, 0.0143626616005213, + 0.02405101425725929, 0.05041008641436849, 0.01430925051050233, + 0.02362114036816964, 0.04688381789373886, 0.005250115453626377, + -0.02040112168595516, -0.0942720776915669, 0.03773004996758644, + -0.00822831940782616, -0.1164872809439224, 0.02286281877257392, + 0.02849551240669926, 0.01468856796295663, 0.02377110964207936, + -0.094380545436577, -0.02089068498518036}, + {0.00930172577225213, 0.01493463068441099, 0.020186920775608, + 0.02892154953912524, -0.01224593358361567, 0.01404228329986624, + 0.02671186617119041, 0.04537535161795231, 0.02229995804098249, + -0.04635704133961575, -0.1966910360247138, 0.02796648065439046, + -0.02263484732621436, 0.0440490503242072, 0.01148782948302166, + 0.01989170531824069, 0.001306805142981245, -0.005676690969116321, + 0.07680476281625202, -0.07967537039721849}, + {0.06602274245435476, -0.0966661981471856, -0.005241648783844579, + 0.00859135188171146, -0.007762129660943368, -0.02888965572526196, + 0.003592291525888222, 0.1668410669287673, -0.04082039290551406, + 0.005233775047553415, -0.01758244726137135, -0.1493955762326898, + -0.00855819137835548, 0.004211419253492328, 0.01929306335052688, + 0.03008056746359405, 0.0190444422412472, 0.005577189741419315, + 0.0000874156155112068, 0.02634091459108298}, + {0.01933897472880726, 0.05874583569377844, 
-0.02293534606228405, + -0.07206314017962175, -0.004580681581546643, -0.0628814337610561, + -0.0850783812795136, 0.07988417636610614, -0.0852798990133397, + 0.01649047166155952, -0.05416647263757423, 0.1089834536254064, + 0.005093403979413865, 0.02520300254161142, 0.0005951431406455604, + 0.02441251821224675, 0.02796099482240553, -0.002574933994926502, + -0.007172237553012804, 0.03002455129086954}, + {0.04041118479094272, -0.002476225672095412, -0.01494505811263243, + -0.03759443758599911, -0.00892246902492875, -0.003634714029239211, + -0.03085671837973749, -0.126176309029931, 0.005814031139083794, + 0.01313561962646063, -0.04760487162503322, -0.0490563712725484, + -0.005082243450421558, -0.01213634309383557, 0.1806666927079249, + 0.02111663336185495, 0.02963486860587087, -0.0000175020101657785, + 0.01197155383597686, 0.0357526792184636}, + {-0.01184769557720525, 0.01582776076338872, -0.006570708266564639, + -0.01471915653734024, 0.00894343616503608, 0.00562664968033149, + -0.01465878888356943, 0.05365282692645818, 0.00893509735776116, + -0.05879312944436473, 0.0806048683392995, -0.007722897986905326, + -0.001819943882718859, 0.0942535573077267, 0.07483883782251654, + 0.004354639673913651, -0.02828804845740341, -0.001318222184691827, + -0.07613149604246563, -0.1251675867732172}, + {0.00834167031558193, -0.01509357596974962, 0.007098172811092488, + 0.03127677418040319, 0.001992448468465455, 0.00915441566808454, + 0.03430175973499201, -0.0730648147535803, -0.001402707145575659, + 0.04780949194330815, -0.1115035603461273, -0.01292297197609604, + -0.005056270550868528, 0.1112053349612027, -0.03801929822379964, + -0.001191241001736563, 0.01872874622910247, 0.0005314214903865993, + -0.0882576318311789, 0.07607183599610171}, + {-0.01539460099727769, 0.04988596184297883, -0.01187240760647617, + -0.06987843637091853, -0.002490472846497859, 0.01009857892494956, + -0.07473588067847209, 0.0906009925879084, 0.1243612446505172, + 0.02152806401345371, -0.03504879644860233, 
-0.06680752427613573, + -0.005574485153629651, 0.001518282948127752, -0.01999168507510701, + -0.01478606199529457, -0.02203749419458996, -0.00132680708294333, + -0.01137505997867614, 0.05332658773667142}, + {-0.06104378736432388, 0.0869446603393548, -0.03298331234537257, + 0.03128515657456024, 0.003906358569208259, 0.03578694104193928, + 0.06241936133189683, 0.06182827284921748, -0.05566564263245907, + 0.02640868588189002, -0.01349751243059039, -0.05507866642582638, + -0.006671347738489326, -0.001470096466016046, 0.05185743641479938, + -0.07494697511168257, -0.1175185439057584, -0.001188074094105709, + 0.00937934805737347, 0.05024773745437657}, + {-0.07252555582124737, -0.116554459356382, 0.003605361887406413, + -0.00836518656029184, 0.004615715410745561, 0.005105376617651312, + -0.00944938657024391, 0.05602449420950007, 0.02722719610561933, + 0.01959357494748446, -0.0258655103753962, 0.1440733975689835, + 0.01446782819722976, 0.003718896062070054, 0.05825843045655135, + -0.06230154142733073, -0.07833704962300169, 0.003160836143568724, + -0.001169873777936648, 0.03471745590503304}, + {-0.03204352258752698, 0.01019272923862322, 0.04509668708733181, + 0.05756522429120813, -0.0004601149081726732, -0.0984718150777423, + -0.01107826100664925, -0.005680277810520585, 0.01962359392320817, + 0.01550006899131986, 0.05143956925922197, 0.02462476682588468, + -0.0888843861002653, -0.00171553583659411, 0.01606331750661664, + 0.001176847743518958, -0.02070972978912828, -0.000341523293579971, + -0.002654732745607882, 0.02075709428885848}, + {0.03595199666430258, -0.02800219615234468, -0.04341570015493925, + -0.0748275906176658, 0.0001051403676377422, 0.1137431321746627, + 0.005852087565974318, 0.003443037513847801, -0.02481931657706633, + -0.003651181839831423, 0.03195794176786321, 0.04135411406392523, + -0.07562030263210619, 0.001769332364699, -0.01984381173403915, + -0.005029750745010152, 0.02649253902476472, 0.000518085571702734, + 0.001062936684474851, 0.01295950668914449}, + 
{-0.16164552322896, -0.0006050035060464324, 0.0258380054414968, + 0.003188424740960557, -0.0002058911341821877, 0.03157555987384681, + -0.01678913462596107, 0.03096216145389774, -0.0133791110666919, + 0.1125249625204277, -0.00769017706442472, -0.02653938062180483, + -0.002555329863523985, -0.00861833362947954, 0.01775148884754278, + 0.02529310679774722, 0.0826243417011238, -0.0001036728183032624, + 0.001963562313294209, -0.0935900561309786}, + {0.1652394174588469, -0.002814245280784351, -0.0328982001821263, + -0.02000104712964131, 0.0002208121995725443, -0.02733462178511839, + 0.02648078162927627, -0.01788316626401427, 0.01630747623755998, + 0.1053849023838147, -0.005447706553811218, 0.01810876922536839, + -0.001808914710282444, -0.007687912115607397, -0.01332593672114388, + -0.02110750894891371, -0.07456116592983384, 0.000219072589592394, + 0.001270886972191055, -0.1083616930749109}, + {0.02453279389716254, -0.005820072356487439, 0.100260287284095, + 0.01277522280305745, -0.003184943445296999, 0.05814689527984152, + -0.0934012278200201, -0.03017986487349484, -0.03136625380994165, + 0.00988668352785117, -0.00358900410973142, -0.02017443675004764, + 0.000915384582922184, -0.001460963415183106, -0.01370112443251124, + 0.1130040979284457, -0.1196161771323699, -0.0005800211204222045, + -0.0006153403201024954, 0.00416806428223025}, + {-0.0778089244252535, -0.007055161182430869, -0.0349307504860869, + -0.0811915584276571, -0.004689825871599125, -0.03726108871471753, + 0.1072225647141469, -0.00917015113070944, 0.01381628985996913, + -0.00123227881492089, 0.001815954515275675, 0.005708744099349901, + -0.0001448985044877925, -0.001306578795561384, -0.006992743514185243, + 0.1744720240732789, -0.05353628497814023, -0.0007613684227234787, + -0.0003550282315997644, 0.01340106423804634}, + {-0.0159527329868513, -0.007622151568160798, -0.1389875105184963, + 0.1165051999914764, -0.002217810389087748, 0.01550003226513692, + -0.07427664222230566, -0.003371438498619264, 
0.01385754771325365, + 0.004759020167383304, 0.001624078805220564, 0.02011638303109029, + -0.001717827082842178, -0.0007424036708598594, -0.003978884451898934, + 0.0866418927301209, -0.01280817739158123, -0.00023039242454603, + 0.002309205802479111, 0.0005926106991001195}}; + + +void init_protmats() +{ + long l, m; + + eigmat = (double *) Malloc (20 * sizeof(double)); + for (l = 0; l <= 19; l++) + if (usejtt) + eigmat[l] = jtteigmat[l]; + else { + if (usepmb) + eigmat[l] = pmbeigmat[l]; + else + eigmat[l] = pameigmat[l]; + } + probmat = (double **) Malloc (20 * sizeof(double *)); + for (l = 0; l <= 19; l++) + for (m= 0; m <= 19; m++) + if (usejtt) + probmat[l] = jttprobmat[l]; + else { + if (usepmb) + probmat[l] = pmbprobmat[l]; + else + probmat[l] = pamprobmat[l]; + } +} /* init_protmats */ + + +void getoptions() +{ + /* interactively set options */ + long i, loopcount, loopcount2; + Char ch; + boolean didchangecat, didchangercat; + double probsum; + + fprintf(outfile, "\nAmino acid sequence Maximum Likelihood"); + fprintf(outfile, " method, version %s\n\n",VERSION); + putchar('\n'); + ctgry = false; + didchangecat = false; + rctgry = false; + didchangercat = false; + categs = 1; + rcategs = 1; + auto_ = false; + gama = false; + global = false; + hypstate = false; + improve = false; + invar = false; + jumble = false; + njumble = 1; + lngths = false; + lambda = 1.0; + outgrno = 1; + outgropt = false; + trout = true; + usertree = false; + weights = false; + printdata = false; + progress = true; + treeprint = true; + usejtt = true; + usepmb = false; + usepam = false; + interleaved = true; + loopcount = 0; + for (;;){ + cleerhome(); + printf("Amino acid sequence Maximum Likelihood"); + printf(" method, version %s\n\n",VERSION); + printf("Settings for this run:\n"); + printf(" U Search for best tree? %s\n", + (usertree ? "No, use user trees in input file" : "Yes")); + if (usertree) { + printf(" L Use lengths from user trees? %s\n", + (lngths ? 
"Yes" : "No")); + } + printf(" P JTT, PMB or PAM probability model? %s\n", + usejtt ? "Jones-Taylor-Thornton" : + usepmb ? "Henikoff/Tillier PMB" : "Dayhoff PAM"); + printf(" C One category of sites?"); + if (!ctgry || categs == 1) + printf(" Yes\n"); + else + printf(" %ld categories of sites\n", categs); + printf(" R Rate variation among sites?"); + if (!rctgry) + printf(" constant rate of change\n"); + else { + if (gama) + printf(" Gamma distributed rates\n"); + else { + if (invar) + printf(" Gamma+Invariant sites\n"); + else + printf(" user-defined HMM of rates\n"); + } + printf(" A Rates at adjacent sites correlated?"); + if (!auto_) + printf(" No, they are independent\n"); + else + printf(" Yes, mean block length =%6.1f\n", 1.0 / lambda); + } + printf(" W Sites weighted? %s\n", + (weights ? "Yes" : "No")); + if (!usertree) { + printf(" S Speedier but rougher analysis? %s\n", + (improve ? "No, not rough" : "Yes")); + printf(" G Global rearrangements? %s\n", + (global ? "Yes" : "No")); + } + if (!usertree) { + printf(" J Randomize input order of sequences?"); + if (jumble) + printf(" Yes (seed =%8ld,%3ld times)\n", inseed0, njumble); + else + printf(" No. Use input order\n"); + } + printf(" O Outgroup root? %s%3ld\n", + (outgropt ? "Yes, at sequence number" : + "No, use as outgroup species"),outgrno); + printf(" M Analyze multiple data sets?"); + if (mulsets) + printf(" Yes, %2ld %s\n", datasets, + (justwts ? "sets of weights" : "data sets")); + else + printf(" No\n"); + printf(" I Input sequences interleaved? %s\n", + (interleaved ? "Yes" : "No, sequential")); + printf(" 0 Terminal type (IBM PC, ANSI, none)? %s\n", + (ibmpc ? "IBM PC" : ansi ? "ANSI" : "(none)")); + printf(" 1 Print out the data at start of run %s\n", + (printdata ? "Yes" : "No")); + printf(" 2 Print indications of progress of run %s\n", + (progress ? "Yes" : "No")); + printf(" 3 Print out tree %s\n", + (treeprint ? "Yes" : "No")); + printf(" 4 Write out trees onto tree file? %s\n", + (trout ? 
"Yes" : "No")); + printf(" 5 Reconstruct hypothetical sequences? %s\n", + (hypstate ? "Yes" : "No")); + printf("\n Y to accept these or type the letter for one to change\n"); +#ifdef WIN32 + phyFillScreenColor(); +#endif + scanf("%c%*[^\n]", &ch); + getchar(); + if (ch == '\n') + ch = ' '; + uppercase(&ch); + if (ch == 'Y') + break; + if (strchr("UPLCRAWSGJOMI012345",ch) != NULL){ + switch (ch) { + + case 'C': + ctgry = !ctgry; + if (ctgry) { + printf("\nSitewise user-assigned categories:\n\n"); + initcatn(&categs); + if (rate){ + free(rate); + } + rate = (double *) Malloc(categs * sizeof(double)); + didchangecat = true; + initcategs(categs, rate); + } + break; + + case 'P': + if (usejtt) { + usejtt = false; + usepmb = true; + } else { + if (usepmb) { + usepmb = false; + usepam = true; + } else { + usepam = false; + usejtt = true; + } + } + break; + + case 'R': + if (!rctgry) { + rctgry = true; + gama = true; + } else { + if (gama) { + gama = false; + invar = true; + } else { + if (invar) + invar = false; + else + rctgry = false; + } + } + break; + + case 'A': + auto_ = !auto_; + if (auto_) + initlambda(&lambda); + break; + + case 'W': + weights = !weights; + break; + + case 'S': + improve = !improve; + break; + + case 'G': + global = !global; + break; + + case 'J': + jumble = !jumble; + if (jumble) + initjumble(&inseed, &inseed0, seed, &njumble); + else njumble = 1; + break; + + case 'L': + lngths = !lngths; + break; + + case 'O': + outgropt = !outgropt; + if (outgropt) + initoutgroup(&outgrno, spp); + break; + + case 'U': + usertree = !usertree; + break; + + case 'M': + mulsets = !mulsets; + if (mulsets) { + printf("Multiple data sets or multiple weights?"); + loopcount2 = 0; + do { + printf(" (type D or W)\n"); +#ifdef WIN32 + phyFillScreenColor(); +#endif + scanf("%c%*[^\n]", &ch2); + getchar(); + if (ch2 == '\n') + ch2 = ' '; + uppercase(&ch2); + countup(&loopcount2, 10); + } while ((ch2 != 'W') && (ch2 != 'D')); + justwts = (ch2 == 'W'); + if (justwts) + 
justweights(&datasets); + else + initdatasets(&datasets); + if (!jumble) { + jumble = true; + initjumble(&inseed, &inseed0, seed, &njumble); + } + } + break; + + case 'I': + interleaved = !interleaved; + break; + + case '0': + initterminal(&ibmpc, &ansi); + break; + + case '1': + printdata = !printdata; + break; + + case '2': + progress = !progress; + break; + + case '3': + treeprint = !treeprint; + break; + + case '4': + trout = !trout; + break; + + case '5': + hypstate = !hypstate; + break; + } + } else + printf("Not a possible option!\n"); + countup(&loopcount, 100); + } + if (gama || invar) { + loopcount = 0; + do { + printf( +"\nCoefficient of variation of substitution rate among sites (must be positive)\n"); + printf( + " In gamma distribution parameters, this is 1/(square root of alpha)\n"); +#ifdef WIN32 + phyFillScreenColor(); +#endif + scanf("%lf%*[^\n]", &cv); + getchar(); + countup(&loopcount, 10); + } while (cv <= 0.0); + alpha = 1.0 / (cv * cv); + } + if (!rctgry) + auto_ = false; + if (rctgry) { + printf("\nRates in HMM"); + if (invar) + printf(" (including one for invariant sites)"); + printf(":\n"); + initcatn(&rcategs); + if (probcat){ + free(probcat); + free(rrate); + } + probcat = (double *) Malloc(rcategs * sizeof(double)); + rrate = (double *) Malloc(rcategs * sizeof(double)); + didchangercat = true; + if (gama) + initgammacat(rcategs, alpha, rrate, probcat); + else { + if (invar) { + loopcount = 0; + do { + printf("Fraction of invariant sites?\n"); + scanf("%lf%*[^\n]", &invarfrac); + getchar(); + countup (&loopcount, 10); + } while ((invarfrac <= 0.0) || (invarfrac >= 1.0)); + initgammacat(rcategs-1, alpha, rrate, probcat); + for (i = 0; i < rcategs-1; i++) + probcat[i] = probcat[i]*(1.0-invarfrac); + probcat[rcategs-1] = invarfrac; + rrate[rcategs-1] = 0.0; + } else { + initcategs(rcategs, rrate); + initprobcat(rcategs, &probsum, probcat); + } + } + } + if (!didchangercat){ + rrate = (double *) Malloc(rcategs*sizeof(double)); + probcat = 
(double *) Malloc(rcategs*sizeof(double)); + rrate[0] = 1.0; + probcat[0] = 1.0; + } + if (!didchangecat) { + rate = (double *) Malloc(categs*sizeof(double)); + rate[0] = 1.0; + } + init_protmats(); +} /* getoptions */ + + +void makeprotfreqs() +{ + /* calculate amino acid frequencies based on eigmat */ + long i, mineig; + + mineig = 0; + for (i = 0; i <= 19; i++) + if (fabs(eigmat[i]) < fabs(eigmat[mineig])) + mineig = i; + memcpy(freqaa, probmat[mineig], 20 * sizeof(double)); + for (i = 0; i <= 19; i++) + freqaa[i] = fabs(freqaa[i]); +} /* makeprotfreqs */ + +void reallocsites() +{ + long i; + for (i = 0; i < spp; i++) + y[i] = (Char *) Malloc(sites*sizeof(Char)); + + free(category); + free(weight); + free(alias); + free(ally); + free(location); + free(aliasweight); + + category = (long *) Malloc(sites*sizeof(long)); + weight = (long *) Malloc(sites*sizeof(long)); + alias = (long *) Malloc(sites*sizeof(long)); + ally = (long *) Malloc(sites*sizeof(long)); + location = (long *) Malloc(sites*sizeof(long)); + aliasweight = (long *) Malloc(sites*sizeof(long)); + for (i = 0; i < sites; i++) + category[i] = 1; + for (i = 0; i < sites; i++) + weight[i] = 1; + makeweights(); +} + +void allocrest() +{ + long i; + + y = (Char **) Malloc(spp*sizeof(Char *)); + for (i = 0; i < spp; i++) + y[i] = (Char *) Malloc(sites*sizeof(Char)); + nayme = (naym *) Malloc(spp*sizeof(naym)); + enterorder = (long *) Malloc(spp*sizeof(long)); + category = (long *) Malloc(sites*sizeof(long)); + weight = (long *) Malloc(sites*sizeof(long)); + alias = (long *) Malloc(sites*sizeof(long)); + ally = (long *) Malloc(sites*sizeof(long)); + location = (long *) Malloc(sites*sizeof(long)); + aliasweight = (long *) Malloc(sites*sizeof(long)); +} /* allocrest */ + + +void doinit() +{ /* initializes variables */ + inputnumbers(&spp, &sites, &nonodes2, 1); + getoptions(); + if (!usertree) + nonodes2--; + makeprotfreqs(); + if (printdata) + fprintf(outfile, "%2ld species, %3ld sites\n", spp, sites); + 
alloctree(&curtree.nodep, nonodes2, usertree); + allocrest(); + if (usertree) + return; + alloctree(&bestree.nodep, nonodes2, 0); + alloctree(&priortree.nodep, nonodes2, 0); + if (njumble <= 1) + return; + alloctree(&bestree2.nodep, nonodes2, 0); +} /* doinit */ + + +void inputoptions() +{ + long i; + + if (!firstset) { + samenumsp(&sites, ith); + reallocsites(); + } + if (firstset) { + for (i = 0; i < sites; i++) + category[i] = 1; + for (i = 0; i < sites; i++) + weight[i] = 1; + } + if (justwts || weights) + inputweights(sites, weight, &weights); + weightsum = 0; + for (i = 0; i < sites; i++) + weightsum += weight[i]; + if ((ctgry && categs > 1) && (firstset || !justwts)) { + inputcategs(0, sites, category, categs, "ProML"); + if (printdata) + printcategs(outfile, sites, category, "Site categories"); + } + if (weights && printdata) + printweights(outfile, 0, sites, weight, "Sites"); + fprintf(outfile, "%s model of amino acid change\n\n", + (usejtt ? "Jones-Taylor-Thornton" : + usepmb ? "Henikoff/Tillier PMB" : "Dayhoff PAM")); +} /* inputoptions */ + + +void input_protdata(long chars) +{ + /* input the names and sequences for each species */ + /* used by proml */ + long i, j, k, l, basesread, basesnew; + Char charstate; + boolean allread, done; + + if (printdata) + headings(chars, "Sequences", "---------"); + basesread = 0; + basesnew = 0; + allread = false; + while (!(allread)) { + /* eat white space -- if the separator line has spaces on it*/ + do { + charstate = gettc(infile); + } while (charstate == ' ' || charstate == '\t'); + ungetc(charstate, infile); + if (eoln(infile)) + scan_eoln(infile); + i = 1; + while (i <= spp) { + if ((interleaved && basesread == 0) || !interleaved) + initname(i - 1); + j = (interleaved) ? 
basesread : 0; + done = false; + while (!done && !eoff(infile)) { + if (interleaved) + done = true; + while (j < chars && !(eoln(infile) || eoff(infile))) { + charstate = gettc(infile); + if (charstate == '\n' || charstate == '\t') + charstate = ' '; + if (charstate == ' ' || (charstate >= '0' && charstate <= '9')) + continue; + uppercase(&charstate); + if ((strchr("ABCDEFGHIKLMNPQRSTVWXYZ*?-", charstate)) == NULL) { + printf("ERROR: bad amino acid: %c at position %ld of species %ld\n", + charstate, j+1, i); + if (charstate == '.') { + printf(" Periods (.) may not be used as gap characters.\n"); + printf(" The correct gap character is (-)\n"); + } + exxit(-1); + } + j++; + y[i - 1][j - 1] = charstate; + } + if (interleaved) + continue; + if (j < chars) + scan_eoln(infile); + else if (j == chars) + done = true; + } + if (interleaved && i == 1) + basesnew = j; + + scan_eoln(infile); + + if ((interleaved && j != basesnew) || + (!interleaved && j != chars)) { + printf("ERROR: SEQUENCES OUT OF ALIGNMENT AT POSITION %ld.\n", j); + exxit(-1); + } + i++; + } + + if (interleaved) { + basesread = basesnew; + allread = (basesread == chars); + } else + allread = (i > spp); + } + if (!printdata) + return; + for (i = 1; i <= ((chars - 1) / 60 + 1); i++) { + for (j = 1; j <= spp; j++) { + for (k = 0; k < nmlngth; k++) + putc(nayme[j - 1][k], outfile); + fprintf(outfile, " "); + l = i * 60; + if (l > chars) + l = chars; + for (k = (i - 1) * 60 + 1; k <= l; k++) { + if (j > 1 && y[j - 1][k - 1] == y[0][k - 1]) + charstate = '.'; + else + charstate = y[j - 1][k - 1]; + putc(charstate, outfile); + if (k % 10 == 0 && k % 60 != 0) + putc(' ', outfile); + } + putc('\n', outfile); + } + putc('\n', outfile); + } + putc('\n', outfile); +} /* input_protdata */ + + +void makeweights() +{ + /* make up weights vector to avoid duplicate computations */ + long i; + + for (i = 1; i <= sites; i++) { + alias[i - 1] = i; + ally[i - 1] = 0; + aliasweight[i - 1] = weight[i - 1]; + location[i - 1] = 0; 
+ } + sitesort2 (sites, aliasweight); + sitecombine2(sites, aliasweight); + sitescrunch2(sites, 1, 2, aliasweight); + for (i = 1; i <= sites; i++) { + if (aliasweight[i - 1] > 0) + endsite = i; + } + for (i = 1; i <= endsite; i++) { + location[alias[i - 1] - 1] = i; + ally[alias[i - 1] - 1] = alias[i - 1]; + } + term = (double **) Malloc(endsite * sizeof(double *)); + for (i = 0; i < endsite; i++) + term[i] = (double *) Malloc(rcategs * sizeof(double)); + slopeterm = (double **) Malloc(endsite * sizeof(double *)); + for (i = 0; i < endsite; i++) + slopeterm[i] = (double *) Malloc(rcategs * sizeof(double)); + curveterm = (double **) Malloc(endsite * sizeof(double *)); + for (i = 0; i < endsite; i++) + curveterm[i] = (double *) Malloc(rcategs * sizeof(double)); + mp = (vall *) Malloc(sites*sizeof(vall)); + contribution = (contribarr *) Malloc(endsite*sizeof(contribarr)); +} /* makeweights */ + + +void prot_makevalues(long categs, pointarray treenode, long endsite, + long spp, sequence y, steptr alias) +{ + /* set up fractional likelihoods at tips */ + /* a version of makevalues2 found in seq.c */ + /* used by proml */ + long i, j, k, l; + long b; + + for (k = 0; k < endsite; k++) { + j = alias[k]; + for (i = 0; i < spp; i++) { + for (l = 0; l < categs; l++) { + memset(treenode[i]->protx[k][l], 0, sizeof(double)*20); + switch (y[i][j - 1]) { + + case 'A': + treenode[i]->protx[k][l][0] = 1.0; + break; + + case 'R': + treenode[i]->protx[k][l][(long)arginine - (long)alanine] = 1.0; + break; + + case 'N': + treenode[i]->protx[k][l][(long)asparagine - (long)alanine] = 1.0; + break; + + case 'D': + treenode[i]->protx[k][l][(long)aspartic - (long)alanine] = 1.0; + break; + + case 'C': + treenode[i]->protx[k][l][(long)cysteine - (long)alanine] = 1.0; + break; + + case 'Q': + treenode[i]->protx[k][l][(long)glutamine - (long)alanine] = 1.0; + break; + + case 'E': + treenode[i]->protx[k][l][(long)glutamic - (long)alanine] = 1.0; + break; + + case 'G': + 
treenode[i]->protx[k][l][(long)glycine - (long)alanine] = 1.0; + break; + + case 'H': + treenode[i]->protx[k][l][(long)histidine - (long)alanine] = 1.0; + break; + + case 'I': + treenode[i]->protx[k][l][(long)isoleucine - (long)alanine] = 1.0; + break; + + case 'L': + treenode[i]->protx[k][l][(long)leucine - (long)alanine] = 1.0; + break; + + case 'K': + treenode[i]->protx[k][l][(long)lysine - (long)alanine] = 1.0; + break; + + case 'M': + treenode[i]->protx[k][l][(long)methionine - (long)alanine] = 1.0; + break; + + case 'F': + treenode[i]->protx[k][l][(long)phenylalanine - (long)alanine] = 1.0; + break; + + case 'P': + treenode[i]->protx[k][l][(long)proline - (long)alanine] = 1.0; + break; + + case 'S': + treenode[i]->protx[k][l][(long)serine - (long)alanine] = 1.0; + break; + + case 'T': + treenode[i]->protx[k][l][(long)threonine - (long)alanine] = 1.0; + break; + + case 'W': + treenode[i]->protx[k][l][(long)tryptophan - (long)alanine] = 1.0; + break; + + case 'Y': + treenode[i]->protx[k][l][(long)tyrosine - (long)alanine] = 1.0; + break; + + case 'V': + treenode[i]->protx[k][l][(long)valine - (long)alanine] = 1.0; + break; + + case 'B': + treenode[i]->protx[k][l][(long)asparagine - (long)alanine] = 1.0; + treenode[i]->protx[k][l][(long)aspartic - (long)alanine] = 1.0; + break; + + case 'Z': + treenode[i]->protx[k][l][(long)glutamine - (long)alanine] = 1.0; + treenode[i]->protx[k][l][(long)glutamic - (long)alanine] = 1.0; + break; + + case 'X': /* unknown aa */ + for (b = 0; b <= 19; b++) + treenode[i]->protx[k][l][b] = 1.0; + break; + + case '?': /* unknown aa */ + for (b = 0; b <= 19; b++) + treenode[i]->protx[k][l][b] = 1.0; + break; + + case '*': /* stop codon symbol */ + for (b = 0; b <= 19; b++) + treenode[i]->protx[k][l][b] = 1.0; + break; + + case '-': /* deletion event-absent data or aa */ + for (b = 0; b <= 19; b++) + treenode[i]->protx[k][l][b] = 1.0; + break; + } + } + } + } +} /* prot_makevalues */ + + +void free_pmatrix(long sib) +{ + long j,k,l; + 
+ for (j = 0; j < rcategs; j++) { + for (k = 0; k < categs; k++) { + for (l = 0; l < 20; l++) + free(pmatrices[sib][j][k][l]); + free(pmatrices[sib][j][k]); + } + free(pmatrices[sib][j]); + } + free(pmatrices[sib]); +} + +void alloc_pmatrix(long sib) +{ + /* Allocate memory for a new pmatrix. Called iff num_sibs>max_num_sibs */ + long j, k, l; + double ****temp_matrix; + + temp_matrix = (double ****) Malloc (rcategs * sizeof(double ***)); + for (j = 0; j < rcategs; j++) { + temp_matrix[j] = (double ***) Malloc(categs * sizeof(double **)); + for (k = 0; k < categs; k++) { + temp_matrix[j][k] = (double **) Malloc(20 * sizeof (double *)); + for (l = 0; l < 20; l++) + temp_matrix[j][k][l] = (double *) Malloc(20 * sizeof(double)); + } + } + pmatrices[sib] = temp_matrix; + max_num_sibs++; +} /* alloc_pmatrix */ + +void prot_freetable() +{ + long i,j,k,l; + for (j = 0; j < rcategs; j++) { + for (k = 0; k < categs; k++) { + for (l = 0; l < 20; l++) + free(ddpmatrix[j][k][l]); + free(ddpmatrix[j][k]); + } + free(ddpmatrix[j]); + } + free(ddpmatrix); + + for (j = 0; j < rcategs; j++) { + for (k = 0; k < categs; k++) { + for (l = 0; l < 20; l++) + free(dpmatrix[j][k][l]); + free(dpmatrix[j][k]); + } + free(dpmatrix[j]); + } + free(dpmatrix); + + + for (j = 0; j < rcategs; j++) + free(tbl[j]); + free(tbl); + + for ( i = 0 ; i < max_num_sibs ; i++ ) + free_pmatrix(i); + free(pmatrices); +} + +void prot_inittable() +{ + /* Define a lookup table. Precompute values and print them out in tables */ + /* Allocate memory for the pmatrices, dpmatices and ddpmatrices */ + long i, j, k, l; + double sumrates; + + /* Allocate memory for pmatrices, the array of pointers to pmatrices */ + + pmatrices = (double *****) Malloc ( spp * sizeof(double ****)); + + /* Allocate memory for the first 2 pmatrices, the matrix of conversion */ + /* probabilities, but only once per run (aka not on the second jumble. 
*/ + + alloc_pmatrix(0); + alloc_pmatrix(1); + + /* Allocate memory for one dpmatrix, the first derivative matrix */ + + dpmatrix = (double ****) Malloc( rcategs * sizeof(double ***)); + for (j = 0; j < rcategs; j++) { + dpmatrix[j] = (double ***) Malloc( categs * sizeof(double **)); + for (k = 0; k < categs; k++) { + dpmatrix[j][k] = (double **) Malloc( 20 * sizeof(double *)); + for (l = 0; l < 20; l++) + dpmatrix[j][k][l] = (double *) Malloc( 20 * sizeof(double)); + } + } + + /* Allocate memory for one ddpmatrix, the second derivative matrix */ + ddpmatrix = (double ****) Malloc( rcategs * sizeof(double ***)); + for (j = 0; j < rcategs; j++) { + ddpmatrix[j] = (double ***) Malloc( categs * sizeof(double **)); + for (k = 0; k < categs; k++) { + ddpmatrix[j][k] = (double **) Malloc( 20 * sizeof(double *)); + for (l = 0; l < 20; l++) + ddpmatrix[j][k][l] = (double *) Malloc( 20 * sizeof(double)); + } + } + + /* Allocate memory and assign values to tbl, the matrix of possible rates*/ + + tbl = (double **) Malloc( rcategs * sizeof(double *)); + for (j = 0; j < rcategs; j++) + tbl[j] = (double *) Malloc( categs * sizeof(double)); + + for (j = 0; j < rcategs; j++) + for (k = 0; k < categs; k++) + tbl[j][k] = rrate[j]*rate[k]; + + sumrates = 0.0; + for (i = 0; i < endsite; i++) { + for (j = 0; j < rcategs; j++) + sumrates += aliasweight[i] * probcat[j] + * tbl[j][category[alias[i] - 1] - 1]; + } + sumrates /= (double)sites; + for (j = 0; j < rcategs; j++) + for (k = 0; k < categs; k++) { + tbl[j][k] /= sumrates; + } + + if(jumb > 1) + return; + + if (gama) { + fprintf(outfile, "\nDiscrete approximation to gamma distributed rates\n"); + fprintf(outfile, + " Coefficient of variation of rates = %f (alpha = %f)\n", + cv, alpha); + } + if (rcategs > 1) { + fprintf(outfile, "\nStates in HMM Rate of change Probability\n\n"); + for (i = 0; i < rcategs; i++) + if (probcat[i] < 0.0001) + fprintf(outfile, "%9ld%16.3f%20.6f\n", i+1, rrate[i], probcat[i]); + else if (probcat[i] < 
0.001) + fprintf(outfile, "%9ld%16.3f%19.5f\n", i+1, rrate[i], probcat[i]); + else if (probcat[i] < 0.01) + fprintf(outfile, "%9ld%16.3f%18.4f\n", i+1, rrate[i], probcat[i]); + else + fprintf(outfile, "%9ld%16.3f%17.3f\n", i+1, rrate[i], probcat[i]); + putc('\n', outfile); + if (auto_) + fprintf(outfile, + "Expected length of a patch of sites having the same rate = %8.3f\n", + 1/lambda); + putc('\n', outfile); + } + if (categs > 1) { + fprintf(outfile, "\nSite category Rate of change\n\n"); + for (k = 0; k < categs; k++) + fprintf(outfile, "%9ld%16.3f\n", k+1, rate[k]); + } + if ((rcategs > 1) || (categs >> 1)) + fprintf(outfile, "\n\n"); +} /* prot_inittable */ + + +void getinput() +{ + /* reads the input data */ + if (!justwts || firstset) + inputoptions(); + if (!justwts || firstset) + input_protdata(sites); + if ( !firstset ) freelrsaves(); + makeweights(); + alloclrsaves(); + setuptree2(curtree); + if (!usertree) { + setuptree2(bestree); + setuptree2(priortree); + if (njumble > 1) + setuptree2(bestree2); + } + prot_allocx(nonodes2, rcategs, curtree.nodep, usertree); + if (!usertree) { + prot_allocx(nonodes2, rcategs, bestree.nodep, 0); + prot_allocx(nonodes2, rcategs, priortree.nodep, 0); + if (njumble > 1) + prot_allocx(nonodes2, rcategs, bestree2.nodep, 0); + } + prot_makevalues(rcategs, curtree.nodep, endsite, spp, y, alias); +} /* getinput */ + + +void inittravtree(node *p) +{ + /* traverse tree to set initialized and v to initial values */ + node* q; + + p->initialized = false; + p->back->initialized = false; + if ((!lngths) || p->iter) { + p->v = initialv; + p->back->v = initialv; + } + + if ( !p->tip ) { + q = p->next; + while ( q != p ) { + inittravtree(q->back); + q = q->next; + } + } +} /* inittravtree */ + + +void prot_nuview(node *p) +{ + long i, j, k, l, m, num_sibs, sib_index; + node *sib_ptr, *sib_back_ptr; + psitelike prot_xx, x2; + double lw, prod7; + double **pmat; + double maxx; + double correction; + + /* Figure out how many siblings the 
current node has */ + /* and be sure that pmatrices is large enough */ + num_sibs = count_sibs(p); + for (i = 0; i < num_sibs; i++) + if (pmatrices[i] == NULL) + alloc_pmatrix(i); + + /* Recursive calls, should be called for all children */ + sib_ptr = p; + for (i=0 ; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + sib_back_ptr = sib_ptr->back; + + if (!sib_back_ptr->tip && + !sib_back_ptr->initialized) + prot_nuview(sib_back_ptr); + } + + /* Make pmatrices for all possible combinations of category, rcateg */ + /* and sib */ + sib_ptr = p; /* return to p */ + for (sib_index=0; sib_index < num_sibs; sib_index++) { + sib_ptr = sib_ptr->next; + sib_back_ptr = sib_ptr->back; + + lw = sib_back_ptr->v; + + for (j = 0; j < rcategs; j++) + for (k = 0; k < categs; k++) + make_pmatrix(pmatrices[sib_index][j][k], NULL, NULL, 0, lw, + tbl[j][k], eigmat, probmat); + } + + for (i = 0; i < endsite; i++) { + maxx = 0; + correction = 0; + + k = category[alias[i]-1] - 1; + for (j = 0; j < rcategs; j++) { + + /* initialize to 1 all values of prot_xx */ + for (m = 0; m <= 19; m++) + prot_xx[m] = 1; + + sib_ptr = p; /* return to p */ + /* loop through all sibs and calculate likelihoods for all possible*/ + /* amino acid combinations */ + for (sib_index=0; sib_index < num_sibs; sib_index++) { + sib_ptr = sib_ptr->next; + sib_back_ptr = sib_ptr->back; + + if ( j == 0) + correction += sib_back_ptr->underflows[i]; + + memcpy(x2, sib_back_ptr->protx[i][j], sizeof(psitelike)); + pmat = pmatrices[sib_index][j][k]; + for (m = 0; m <= 19; m++) { + prod7 = 0; + for (l = 0; l <= 19; l++) + prod7 += (pmat[m][l] * x2[l]); + prot_xx[m] *= prod7; + if ( prot_xx[m] > maxx && sib_index == (num_sibs - 1)) + maxx = prot_xx[m]; + } + } + /* And the final point of this whole function: */ + memcpy(p->protx[i][j], prot_xx, sizeof(psitelike)); + } + p->underflows[i] = 0; + if ( maxx < MIN_DOUBLE ) + fix_protx(p,i,maxx,rcategs); + p->underflows[i] += correction; + } + + p->initialized = true; +} /* 
prot_nuview */ + + +void prot_slopecurv(node *p,double y,double *like,double *slope,double *curve) +{ + /* compute log likelihood, slope and curvature at node p */ + long i, j, k, l, m, lai; + double sum, sumc, sumterm, lterm, sumcs, sumcc, sum2, slope2, curve2; + double frexm = 0; /* frexm = freqaa[m]*x1[m] */ + /* frexml = frexm*x2[l] */ + double prod4m, prod5m, prod6m; /* elements of prod4-5 for */ + /* each m */ + double **pmat, **dpmat, **ddpmat; /* local pointers to global*/ + /* matrices */ + double prod4, prod5, prod6; + contribarr thelike, nulike, nuslope, nucurve, + theslope, thecurve, clai, cslai, cclai; + node *q; + psitelike x1, x2; + + q = p->back; + sum = 0.0; + for (j = 0; j < rcategs; j++) { + for (k = 0; k < categs; k++) { + make_pmatrix(pmatrices[0][j][k], dpmatrix[j][k], ddpmatrix[j][k], + 2, y, tbl[j][k], eigmat, probmat); + } + } + for (i = 0; i < endsite; i++) { + k = category[alias[i]-1] - 1; + for (j = 0; j < rcategs; j++) { + memcpy(x1, p->protx[i][j], sizeof(psitelike)); + memcpy(x2, q->protx[i][j], sizeof(psitelike)); + pmat = pmatrices[0][j][k]; + dpmat = dpmatrix[j][k]; + ddpmat = ddpmatrix[j][k]; + prod4 = 0.0; + prod5 = 0.0; + prod6 = 0.0; + for (m = 0; m <= 19; m++) { + prod4m = 0.0; + prod5m = 0.0; + prod6m = 0.0; + frexm = x1[m] * freqaa[m]; + for (l = 0; l <= 19; l++) { + prod4m += x2[l] * pmat[m][l]; + prod5m += x2[l] * dpmat[m][l]; + prod6m += x2[l] * ddpmat[m][l]; + } + prod4 += frexm * prod4m; + prod5 += frexm * prod5m; + prod6 += frexm * prod6m; + } + term[i][j] = prod4; + slopeterm[i][j] = prod5; + curveterm[i][j] = prod6; + } + sumterm = 0.0; + for (j = 0; j < rcategs; j++) + sumterm += probcat[j] * term[i][j]; + if (sumterm <= 0.0) + sumterm = 0.000000001; /* ? shouldn't get here ?? 
*/ + lterm = log(sumterm) + p->underflows[i] + q->underflows[i]; + for (j = 0; j < rcategs; j++) { + term[i][j] = term[i][j] / sumterm; + slopeterm[i][j] = slopeterm[i][j] / sumterm; + curveterm[i][j] = curveterm[i][j] / sumterm; + } + sum += (aliasweight[i] * lterm); + } + for (i = 0; i < rcategs; i++) { + thelike[i] = 1.0; + theslope[i] = 0.0; + thecurve[i] = 0.0; + } + for (i = 0; i < sites; i++) { + sumc = 0.0; + sumcs = 0.0; + sumcc = 0.0; + for (k = 0; k < rcategs; k++) { + sumc += probcat[k] * thelike[k]; + sumcs += probcat[k] * theslope[k]; + sumcc += probcat[k] * thecurve[k]; + } + sumc *= lambda; + sumcs *= lambda; + sumcc *= lambda; + if ((ally[i] > 0) && (location[ally[i]-1] > 0)) { + lai = location[ally[i] - 1]; + memcpy(clai, term[lai - 1], rcategs*sizeof(double)); + memcpy(cslai, slopeterm[lai - 1], rcategs*sizeof(double)); + memcpy(cclai, curveterm[lai - 1], rcategs*sizeof(double)); + if (weight[i] > 1) { + for (j = 0; j < rcategs; j++) { + if (clai[j] > 0.0) + clai[j] = exp(weight[i]*log(clai[j])); + else clai[j] = 0.0; + if (cslai[j] > 0.0) + cslai[j] = exp(weight[i]*log(cslai[j])); + else cslai[j] = 0.0; + if (cclai[j] > 0.0) + cclai[j] = exp(weight[i]*log(cclai[j])); + else cclai[j] = 0.0; + } + } + for (j = 0; j < rcategs; j++) { + nulike[j] = ((1.0 - lambda) * thelike[j] + sumc) * clai[j]; + nuslope[j] = ((1.0 - lambda) * theslope[j] + sumcs) * clai[j] + + ((1.0 - lambda) * thelike[j] + sumc) * cslai[j]; + nucurve[j] = ((1.0 - lambda) * thecurve[j] + sumcc) * clai[j] + + 2.0 * ((1.0 - lambda) * theslope[j] + sumcs) * cslai[j] + + ((1.0 - lambda) * thelike[j] + sumc) * cclai[j]; + } + } else { + for (j = 0; j < rcategs; j++) { + nulike[j] = ((1.0 - lambda) * thelike[j] + sumc); + nuslope[j] = ((1.0 - lambda) * theslope[j] + sumcs); + nucurve[j] = ((1.0 - lambda) * thecurve[j] + sumcc); + } + } + memcpy(thelike, nulike, rcategs*sizeof(double)); + memcpy(theslope, nuslope, rcategs*sizeof(double)); + memcpy(thecurve, nucurve, 
rcategs*sizeof(double)); + } + sum2 = 0.0; + slope2 = 0.0; + curve2 = 0.0; + for (i = 0; i < rcategs; i++) { + sum2 += probcat[i] * thelike[i]; + slope2 += probcat[i] * theslope[i]; + curve2 += probcat[i] * thecurve[i]; + } + sum += log(sum2); + (*like) = sum; + (*slope) = slope2 / sum2; + (*curve) = (curve2 - slope2 * slope2 / sum2) / sum2; +} /* prot_slopecurv */ + + +void makenewv(node *p) +{ + /* Newton-Raphson algorithm improvement of a branch length */ + long it, ite; + double y, yold=0, yorig, like, slope, curve, oldlike=0; + boolean done, firsttime, better; + node *q; + + q = p->back; + y = p->v; + yorig = y; + done = false; + firsttime = true; + it = 1; + ite = 0; + while ((it < iterations) && (ite < 20) && (!done)) { + prot_slopecurv(p, y, &like, &slope, &curve); + better = false; + if (firsttime) { + yold = y; + oldlike = like; + firsttime = false; + better = true; + } else { + if (like > oldlike) { + yold = y; + oldlike = like; + better = true; + it++; + } + } + if (better) { + y = y + slope/fabs(curve); + if (y < epsilon) + y = epsilon; + } else { + if (fabs(y - yold) < epsilon) + ite = 20; + y = (y + (7 * yold)) / 8; + } + ite++; + done = fabs(y-yold) < epsilon; + } + smoothed = (fabs(yold-yorig) < epsilon) && (yorig > 1000.0*epsilon); + p->v = yold; + q->v = yold; + curtree.likelihood = oldlike; +} /* makenewv */ + + +void update(node *p) +{ + if (!p->tip && !p->initialized) + prot_nuview(p); + if (!p->back->tip && !p->back->initialized) + prot_nuview(p->back); + if ((!usertree) || (usertree && !lngths) || p->iter) { + makenewv(p); + if ( smoothit ) { + inittrav(p); + inittrav(p->back); + } + else if ( inserting && !p->tip ) { + p->next->initialized = false; + p->next->next->initialized = false; + } + } +} /* update */ + + +void smooth(node *p) +{ + long i, num_sibs; + node *sib_ptr; + + smoothed = false; + update(p); + if (p->tip) + return; + + num_sibs = count_sibs(p); + sib_ptr = p; + + for (i=0; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + + 
if (polishing || (smoothit && !smoothed)) { + smooth(sib_ptr->back); + p->initialized = false; + sib_ptr->initialized = false; + } + } +} /* smooth */ + + +void make_pmatrix(double **matrix, double **dmat, double **ddmat, + long derivative, double lz, double rat, + double *eigmat, double **probmat) +{ + /* Fills the 20 x 20 array matrix[m][l] with R[m][l] / freqaa[m], where */ + /* R[m][l] = sum over k of probmat[k][m] * probmat[k][l] * exp(lz * rat * eigmat[k]). */ + /* R was described by the original author as the joint probability of m and l. NOTE(review): the divisor is the frequency of m (the old text said "of l", and "m given l") - confirm which state is conditioned on. */ + /* If derivative != 0, dmat and ddmat receive the same sums built from the first and second */ + /* derivatives of the exponential with respect to lz; if derivative == 0 they are never written, so NULL may be passed. */ + + long k, l, m; /* (l) original character state */ + /* (m) final character state */ + /* (k) lambda counter */ + double p0, p1, p2, q; + double elambdat[20], delambdat[20], ddelambdat[20]; + /* exponential term for matrix */ + /* and both derivative matrices */ + for (k = 0; k <= 19; k++) { + elambdat[k] = exp(lz * rat * eigmat[k]); + if(derivative != 0) { + delambdat[k] = (elambdat[k] * rat * eigmat[k]); + ddelambdat[k] = (delambdat[k] * rat * eigmat[k]); + } + } + for (m = 0; m <= 19; m++) { + for (l = 0; l <= 19; l++) { + p0 = 0.0; + p1 = 0.0; + p2 = 0.0; + for (k = 0; k <= 19; k++) { + q = probmat[k][m] * probmat[k][l]; + p0 += (q * elambdat[k]); + if(derivative !=0) { + p1 += (q * delambdat[k]); + p2 += (q * ddelambdat[k]); + } + } + matrix[m][l] = p0 / freqaa[m]; + if(derivative != 0) { + dmat[m][l] = p1 / freqaa[m]; + ddmat[m][l] = p2 / freqaa[m]; + } + } + } +} /* make_pmatrix */ + + +double prot_evaluate(node *p, boolean saveit) +{ /* Computes the log likelihood across the branch joining p and p->back (length p->v), stores it in curtree.likelihood and returns it. When saveit is set for a user tree (and auto_ is off) it also records the per-site and per-tree values in l0gf / l0gl and updates maxwhich / maxlogl. */ + contribarr tterm; + double sum, sum2, sumc, y, prod4, prodl, frexm, sumterm, lterm; + double **pmat; + long i, j, k, l, m, lai; + node *q; + psitelike x1, x2; + + sum = 0.0; + q = p->back; + y = p->v; + for (j = 0; j < rcategs; j++) + for (k = 0; k < categs; k++) + make_pmatrix(pmatrices[0][j][k],NULL,NULL,0,y,tbl[j][k],eigmat,probmat); + for (i = 0; i < endsite; i++) { + k = category[alias[i]-1] - 1; + for (j = 0; j < rcategs; 
j++) { + memcpy(x1, p->protx[i][j], sizeof(psitelike)); + memcpy(x2, q->protx[i][j], sizeof(psitelike)); + prod4 = 0.0; + pmat = pmatrices[0][j][k]; + for (m = 0; m <= 19; m++) { + prodl = 0.0; + for (l = 0; l <= 19; l++) + prodl += (pmat[m][l] * x2[l]); + frexm = x1[m] * freqaa[m]; + prod4 += (prodl * frexm); + } + tterm[j] = prod4; /* site term for rate category j */ + } + sumterm = 0.0; + for (j = 0; j < rcategs; j++) + sumterm += probcat[j] * tterm[j]; + if (sumterm <= 0.0) /* include exact zero, as prot_slopecurv does */ + sumterm = 0.00000001; /* floor: a zero sum would give log(0) and a division by zero just below */ + lterm = log(sumterm) + p->underflows[i] + q->underflows[i]; + for (j = 0; j < rcategs; j++) + clai[j] = tterm[j] / sumterm; + memcpy(contribution[i], clai, rcategs*sizeof(double)); + if (saveit && !auto_ && usertree && (which <= shimotrees)) + l0gf[which - 1][i] = lterm; + sum += aliasweight[i] * lterm; + } + for (j = 0; j < rcategs; j++) + like[j] = 1.0; + for (i = 0; i < sites; i++) { + sumc = 0.0; + for (k = 0; k < rcategs; k++) + sumc += probcat[k] * like[k]; + sumc *= lambda; + if ((ally[i] > 0) && (location[ally[i]-1] > 0)) { + lai = location[ally[i] - 1]; + memcpy(clai, contribution[lai - 1], rcategs*sizeof(double)); + for (j = 0; j < rcategs; j++) + nulike[j] = ((1.0 - lambda) * like[j] + sumc) * clai[j]; + } else { + for (j = 0; j < rcategs; j++) + nulike[j] = ((1.0 - lambda) * like[j] + sumc); + } + memcpy(like, nulike, rcategs*sizeof(double)); + } + sum2 = 0.0; + for (i = 0; i < rcategs; i++) + sum2 += probcat[i] * like[i]; + sum += log(sum2); + curtree.likelihood = sum; /* result is always stored on the current tree */ + if (!saveit || auto_ || !usertree) + return sum; + if(which <= shimotrees) + l0gl[which - 1] = sum; + if (which == 1) { + maxwhich = 1; + maxlogl = sum; + return sum; + } + if (sum > maxlogl) { + maxwhich = which; + maxlogl = sum; + } + return sum; +} /* prot_evaluate */ + + +void treevaluate() +{ + /* evaluate a user tree */ + long i; + + inittravtree(curtree.start); + polishing = true; + smoothit = true; + for (i = 1; i <= smoothings * 4; i++) + smooth (curtree.start); + dummy = prot_evaluate(curtree.start, 
true); +} /* treevaluate */ + + +void promlcopy(tree *a, tree *b, long nonodes, long categs) +{ + /* copy tree a to tree b */ + long i, j=0; + node *p, *q; + + for (i = 0; i < spp; i++) { + prot_copynode(a->nodep[i], b->nodep[i], categs); + if (a->nodep[i]->back) { + if (a->nodep[i]->back == a->nodep[a->nodep[i]->back->index - 1]) + b->nodep[i]->back = b->nodep[a->nodep[i]->back->index - 1]; + else if (a->nodep[i]->back == a->nodep[a->nodep[i]->back->index - 1]->next +) + b->nodep[i]->back = b->nodep[a->nodep[i]->back->index - 1]->next; + else + b->nodep[i]->back = b->nodep[a->nodep[i]->back->index - 1]->next->next; + } + else b->nodep[i]->back = NULL; + } + for (i = spp; i < nonodes; i++) { + p = a->nodep[i]; + q = b->nodep[i]; + for (j = 1; j <= 3; j++) { + prot_copynode(p, q, categs); + if (p->back) { + if (p->back == a->nodep[p->back->index - 1]) + q->back = b->nodep[p->back->index - 1]; + else if (p->back == a->nodep[p->back->index - 1]->next) + q->back = b->nodep[p->back->index - 1]->next; + else + q->back = b->nodep[p->back->index - 1]->next->next; + } + else + q->back = NULL; + p = p->next; + q = q->next; + } + } + b->likelihood = a->likelihood; + b->start = a->start; /* start used in dnaml only */ + b->root = a->root; /* root used in dnamlk only */ +} /* promlcopy */ + + +void proml_re_move(node **p, node **q) +{ + /* remove p and record in q where it was */ + long i; + + /** assumes bifurcations */ + *q = (*p)->next->back; + hookup(*q, (*p)->next->next->back); + (*p)->next->back = NULL; + (*p)->next->next->back = NULL; + (*q)->v += (*q)->back->v; + (*q)->back->v = (*q)->v; + if ( smoothit ) { + inittrav((*q)); + inittrav((*q)->back); + inittrav((*p)->back); + } + if ( smoothit ) { + for ( i = 0 ; i < smoothings ; i++ ) { + smooth(*q); + smooth((*q)->back); + } + } + else + smooth(*q); +} /* proml_re_move */ + + +void insert_(node *p, node *q, boolean dooinit) +{ + /* Insert q near p */ + long i, j, num_sibs; + node *r, *sib_ptr; + + r = p->next->next; + 
hookup(r, q->back); + hookup(p->next, q); + q->v = 0.5 * q->v; + q->back->v = q->v; + r->v = q->v; + r->back->v = r->v; + p->initialized = false; + if (dooinit) { + inittrav(p); + inittrav(q); + inittrav(q->back); + } + i = 1; + inserting = true; + while (i <= smoothings) { + smooth(p); + if (!p->tip) { + num_sibs = count_sibs(p); + sib_ptr = p; + for (j=0; j < num_sibs; j++) { + smooth(sib_ptr->next->back); + sib_ptr = sib_ptr->next; + } + } + i++; + } + inserting = false; +} /* insert_ */ + + +void addtraverse(node *p, node *q, boolean contin) +{ + /* try adding p at q, proceed recursively through tree */ + long i, num_sibs; + double like, vsave = 0; + node *qback = NULL, *sib_ptr; + + if (!smoothit) { + vsave = q->v; + qback = q->back; + } + insert_(p, q, false); + like = prot_evaluate(p, false); + if (like > bestyet || bestyet == UNDEFINED) { + bestyet = like; + if (smoothit) { + addwhere = q; + promlcopy(&curtree, &bestree, nonodes2, rcategs); + } + else + qwhere = q; + succeeded = true; + } + if (smoothit) + promlcopy(&priortree, &curtree, nonodes2, rcategs); + else { + hookup (q, qback); + q->v = vsave; + q->back->v = vsave; + curtree.likelihood = bestyet; + } + if (!q->tip && contin) { + num_sibs = count_sibs(q); + if (q == curtree.start) + num_sibs++; + sib_ptr = q; + for (i=0; i < num_sibs; i++) { + addtraverse(p, sib_ptr->next->back, contin); + sib_ptr = sib_ptr->next; + } + } +} /* addtraverse */ + + +void globrearrange() +{ + /* does global rearrangements */ + tree globtree; + tree oldtree; + int i,j,k,l,num_sibs,num_sibs2; + node *where,*sib_ptr,*sib_ptr2; + double oldbestyet = curtree.likelihood; + int success = false; + + alloctree(&globtree.nodep,nonodes2,0); + alloctree(&oldtree.nodep,nonodes2,0); + setuptree2(globtree); + setuptree2(oldtree); + prot_allocx(nonodes2, rcategs, globtree.nodep, 0); + prot_allocx(nonodes2, rcategs, oldtree.nodep, 0); + promlcopy(&curtree,&globtree,nonodes2,rcategs); + promlcopy(&curtree,&oldtree,nonodes2,rcategs); + 
bestyet = curtree.likelihood; + for ( i = spp ; i < nonodes2 ; i++ ) { + num_sibs = count_sibs(curtree.nodep[i]); + sib_ptr = curtree.nodep[i]; + if ( (i - spp) % (( nonodes2 / 72 ) + 1 ) == 0 ) + putchar('.'); + fflush(stdout); + for ( j = 0 ; j <= num_sibs ; j++ ) { + proml_re_move(&sib_ptr,&where); + promlcopy(&curtree,&priortree,nonodes2,rcategs); + qwhere = where; + + if (where->tip) { + promlcopy(&oldtree,&curtree,nonodes2,rcategs); + promlcopy(&oldtree,&bestree,nonodes2,rcategs); + sib_ptr=sib_ptr->next; + continue; + } + else num_sibs2 = count_sibs(where); + sib_ptr2 = where; + for ( k = 0 ; k < num_sibs2 ; k++ ) { + addwhere = NULL; + addtraverse(sib_ptr,sib_ptr2->back,true); + if ( !smoothit ) { + if (succeeded && qwhere != where && qwhere != where->back) { + insert_(sib_ptr,qwhere,true); + smoothit = true; + for (l = 1; l<=smoothings; l++) { + smooth (where); + smooth (where->back); + } + smoothit = false; + success = true; + promlcopy(&curtree,&globtree,nonodes2,rcategs); + promlcopy(&priortree,&curtree,nonodes2,rcategs); + } + } + else if ( addwhere && where != addwhere && where->back != addwhere + && bestyet > globtree.likelihood) { + promlcopy(&bestree,&globtree,nonodes2,rcategs); + success = true; + } + sib_ptr2 = sib_ptr2->next; + } + promlcopy(&oldtree,&curtree,nonodes2,rcategs); + promlcopy(&oldtree,&bestree,nonodes2,rcategs); + sib_ptr = sib_ptr->next; + } + } + promlcopy(&globtree,&curtree,nonodes2,rcategs); + promlcopy(&globtree,&bestree,nonodes2,rcategs); + if (success && globtree.likelihood > oldbestyet) { + succeeded = true; + } + else { + succeeded = false; + } + bestyet = globtree.likelihood; + prot_freex(nonodes2,oldtree.nodep); + prot_freex(nonodes2,globtree.nodep); + freetree2(globtree.nodep,nonodes2); + freetree2(oldtree.nodep,nonodes2); +} /* globrearrange */ + + +void freelrsaves() +{ + long i,j; + for ( i = 0 ; i < NLRSAVES ; i++ ) { + for (j = 0; j < oldendsite; j++) + free(lrsaves[i]->protx[j]); + free(lrsaves[i]->protx); + 
free(lrsaves[i]->underflows); + free(lrsaves[i]); + } + free(lrsaves); +} + + +void alloclrsaves() +{ + long i,j; + lrsaves = Malloc(NLRSAVES * sizeof(node*)); + oldendsite = endsite; + for ( i = 0 ; i < NLRSAVES ; i++ ) { + lrsaves[i] = Malloc(sizeof(node)); + lrsaves[i]->protx = Malloc(endsite*sizeof(ratelike)); + lrsaves[i]->underflows = Malloc(endsite * sizeof (double)); + for (j = 0; j < endsite; j++) + lrsaves[i]->protx[j] = (pratelike)Malloc(rcategs*sizeof(psitelike)); + } +} /* alloclrsaves */ + + +void rearrange(node *p, node *pp) +{ + /* rearranges the tree locally moving pp around near p */ + long i, num_sibs; + node *q, *r, *sib_ptr; + node *rnb, *rnnb; + + if (!p->tip && !p->back->tip) { + curtree.likelihood = bestyet; + if (p->back->next != pp) + r = p->back->next; + else + r = p->back->next->next; + /* assumes bifurcations? */ + if (!smoothit) { + rnb = r->next->back; + rnnb = r->next->next->back; + prot_copynode(r,lrsaves[0],categs); + prot_copynode(r->next,lrsaves[1],categs); + prot_copynode(r->next->next,lrsaves[2],categs); + prot_copynode(p->next,lrsaves[3],categs); + prot_copynode(p->next->next,lrsaves[4],categs); + } + else + promlcopy(&curtree, &bestree, nonodes2, rcategs); + proml_re_move(&r, &q); + if (smoothit) + promlcopy(&curtree, &priortree, nonodes2, rcategs); + else + qwhere = q; + num_sibs = count_sibs (p); + sib_ptr = p; + for (i=0; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + addtraverse(r, sib_ptr->back, false); + } + if (smoothit) + promlcopy(&bestree, &curtree, nonodes2, rcategs); + else { + if (qwhere == q) { + hookup(rnb,r->next); + hookup(rnnb,r->next->next); + prot_copynode(lrsaves[0],r,categs); + prot_copynode(lrsaves[1],r->next,categs); + prot_copynode(lrsaves[2],r->next->next,categs); + prot_copynode(lrsaves[3],p->next,categs); + prot_copynode(lrsaves[4],p->next->next,categs); + rnb->v = r->next->v; + rnnb->v = r->next->next->v; + r->back->v = r->v; + curtree.likelihood = bestyet; + } + else { + insert_(r, qwhere, 
true); + smoothit = true; + for (i = 1; i<=smoothings; i++) { + smooth(r); + smooth(r->back); + } + smoothit = false; + promlcopy(&curtree, &bestree, nonodes2, rcategs); + } + } + } + if (!p->tip) { + num_sibs = count_sibs(p); + if (p == curtree.start) + num_sibs++; + sib_ptr = p; + for (i=0; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + rearrange(sib_ptr->back, p); + } + } +} /* rearrange */ + + +void proml_coordinates(node *p, double lengthsum, long *tipy, + double *tipmax) +{ + /* establishes coordinates of nodes */ + node *q, *first, *last; + double xx; + + if (p->tip) { + p->xcoord = (long)(over * lengthsum + 0.5); + p->ycoord = (*tipy); + p->ymin = (*tipy); + p->ymax = (*tipy); + (*tipy) += down; + if (lengthsum > (*tipmax)) + (*tipmax) = lengthsum; + return; + } + q = p->next; + do { + xx = q->v; + if (xx > 100.0) + xx = 100.0; + proml_coordinates(q->back, lengthsum + xx, tipy,tipmax); + q = q->next; + } while ((p == curtree.start || p != q) && + (p != curtree.start || p->next != q)); + first = p->next->back; + q = p; + while (q->next != p) + q = q->next; + last = q->back; + p->xcoord = (long)(over * lengthsum + 0.5); + if (p == curtree.start) + p->ycoord = p->next->next->back->ycoord; + else + p->ycoord = (first->ycoord + last->ycoord) / 2; + p->ymin = first->ymin; + p->ymax = last->ymax; +} /* proml_coordinates */ + + +void proml_printree() +{ + /* prints out diagram of the tree2 */ + long tipy; + double scale, tipmax; + long i; + + if (!treeprint) + return; + putc('\n', outfile); + tipy = 1; + tipmax = 0.0; + proml_coordinates(curtree.start, 0.0, &tipy, &tipmax); + scale = 1.0 / (long)(tipmax + 1.000); + for (i = 1; i <= (tipy - down); i++) + drawline2(i, scale, curtree); + putc('\n', outfile); +} /* proml_printree */ + + +void sigma(node *p, double *sumlr, double *s1, double *s2) +{ + /* compute standard deviation */ + double tt, aa, like, slope, curv; + + prot_slopecurv(p, p->v, &like, &slope, &curv); + tt = p->v; + p->v = epsilon; + p->back->v = 
epsilon; + aa = prot_evaluate(p, false); + p->v = tt; + p->back->v = tt; + (*sumlr) = prot_evaluate(p, false) - aa; + if (curv < -epsilon) { + (*s1) = p->v + (-slope - sqrt(slope * slope - 3.841 * curv)) / curv; + (*s2) = p->v + (-slope + sqrt(slope * slope - 3.841 * curv)) / curv; + } + else { + (*s1) = -1.0; + (*s2) = -1.0; + } +} /* sigma */ + + +void describe(node *p) +{ + /* print out information for one branch */ + long i, num_sibs; + node *q, *sib_ptr; + double sumlr, sigma1, sigma2; + + if (!p->tip && !p->initialized) + prot_nuview(p); + if (!p->back->tip && !p->back->initialized) + prot_nuview(p->back); + q = p->back; + if (q->tip) { + fprintf(outfile, " "); + for (i = 0; i < nmlngth; i++) + putc(nayme[q->index-1][i], outfile); + fprintf(outfile, " "); + } else + fprintf(outfile, " %4ld ", q->index - spp); + if (p->tip) { + for (i = 0; i < nmlngth; i++) + putc(nayme[p->index-1][i], outfile); + } else + fprintf(outfile, "%4ld ", p->index - spp); + fprintf(outfile, "%15.5f", q->v); + if (!usertree || (usertree && !lngths) || p->iter) { + sigma(q, &sumlr, &sigma1, &sigma2); + if (sigma1 <= sigma2) + fprintf(outfile, " ( zero, infinity)"); + else { + fprintf(outfile, " ("); + if (sigma2 <= 0.0) + fprintf(outfile, " zero"); + else + fprintf(outfile, "%9.5f", sigma2); + fprintf(outfile, ",%12.5f", sigma1); + putc(')', outfile); + } + if (sumlr > 1.9205) + fprintf(outfile, " *"); + if (sumlr > 2.995) + putc('*', outfile); + } + putc('\n', outfile); + if (!p->tip) { + num_sibs = count_sibs(p); + sib_ptr = p; + for (i=0; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + describe(sib_ptr->back); + } + } +} /* describe */ + + +void prot_reconstr(node *p, long n) +{ + /* reconstruct and print out acid at site n+1 at node p */ + long i, j, k, first, num_sibs = 0; + double f, sum, xx[20]; + node *q = NULL; + + if (p->tip) + putc(y[p->index-1][n], outfile); + else { + num_sibs = count_sibs(p); + if ((ally[n] == 0) || (location[ally[n]-1] == 0)) + putc('.', outfile); + 
else { + j = location[ally[n]-1] - 1; + sum = 0; + for (i = 0; i <= 19; i++) { + f = p->protx[j][mx-1][i]; + if (!p->tip) { + q = p; + for (k = 0; k < num_sibs; k++) { + q = q->next; + f *= q->protx[j][mx-1][i]; + } + } + f = sqrt(f); + xx[i] = f * freqaa[i]; + sum += xx[i]; + } + for (i = 0; i <= 19; i++) + xx[i] /= sum; + first = 0; + for (i = 0; i <= 19; i++) + if (xx[i] > xx[first]) + first = i; + if (xx[first] > 0.95) + putc(aachar[first], outfile); + else + putc(tolower(aachar[first]), outfile); + if (rctgry && rcategs > 1) + mx = mp[n][mx - 1]; + else + mx = 1; + } + } +} /* prot_reconstr */ + + +void rectrav(node *p, long m, long n) +{ + /* print out segment of reconstructed sequence for one branch */ + long i; + + putc(' ', outfile); + if (p->tip) { + for (i = 0; i < nmlngth; i++) + putc(nayme[p->index-1][i], outfile); + } else + fprintf(outfile, "%4ld ", p->index - spp); + fprintf(outfile, " "); + mx = mx0; + for (i = m; i <= n; i++) { + if ((i % 10 == 0) && (i != m)) + putc(' ', outfile); + prot_reconstr(p, i); + } + putc('\n', outfile); + if (!p->tip) { + rectrav(p->next->back, m, n); + rectrav(p->next->next->back, m, n); + } + mx1 = mx; +} /* rectrav */ + + +void summarize() +{ + /* print out branch length information and node numbers */ + long i, j, mm, num_sibs; + double mode, sum; + double like[maxcategs],nulike[maxcategs]; + double **marginal; + node *sib_ptr; + + if (!treeprint) + return; + fprintf(outfile, "\nremember: "); + if (outgropt) + fprintf(outfile, "(although rooted by outgroup) "); + fprintf(outfile, "this is an unrooted tree!\n\n"); + fprintf(outfile, "Ln Likelihood = %11.5f\n", curtree.likelihood); + fprintf(outfile, "\n Between And Length"); + if (!(usertree && lngths && haslengths)) + fprintf(outfile, " Approx. 
Confidence Limits"); + fprintf(outfile, "\n"); + fprintf(outfile, " ------- --- ------"); + if (!(usertree && lngths && haslengths)) + fprintf(outfile, " ------- ---------- ------"); + fprintf(outfile, "\n\n"); + for (i = spp; i < nonodes2; i++) { + /* So this works with arbitrary multifurcations */ + if (curtree.nodep[i]) { + num_sibs = count_sibs (curtree.nodep[i]); + sib_ptr = curtree.nodep[i]; + for (j = 0; j < num_sibs; j++) { + sib_ptr->initialized = false; + sib_ptr = sib_ptr->next; + } + } + } + + describe(curtree.start->back); + + /* So this works with arbitrary multifurcations */ + num_sibs = count_sibs(curtree.start); + sib_ptr = curtree.start; + for (i=0; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + describe(sib_ptr->back); + } + + fprintf(outfile, "\n"); + if (!(usertree && lngths && haslengths)) { + fprintf(outfile, " * = significantly positive, P < 0.05\n"); + fprintf(outfile, " ** = significantly positive, P < 0.01\n\n"); + } + dummy = prot_evaluate(curtree.start, false); + if (rctgry && rcategs > 1) { + for (i = 0; i < rcategs; i++) + like[i] = 1.0; + for (i = sites - 1; i >= 0; i--) { + sum = 0.0; + for (j = 0; j < rcategs; j++) { + nulike[j] = (1.0 - lambda + lambda * probcat[j]) * like[j]; + mp[i][j] = j + 1; + for (k = 1; k <= rcategs; k++) { + if (k != j + 1) { + if (lambda * probcat[k - 1] * like[k - 1] > nulike[j]) { + nulike[j] = lambda * probcat[k - 1] * like[k - 1]; + mp[i][j] = k; + } + } + } + if ((ally[i] > 0) && (location[ally[i]-1] > 0)) + nulike[j] *= contribution[location[ally[i] - 1] - 1][j]; + sum += nulike[j]; + } + for (j = 0; j < rcategs; j++) + nulike[j] /= sum; + memcpy(like, nulike, rcategs * sizeof(double)); + } + mode = 0.0; + mx = 1; + for (i = 1; i <= rcategs; i++) { + if (probcat[i - 1] * like[i - 1] > mode) { + mx = i; + mode = probcat[i - 1] * like[i - 1]; + } + } + mx0 = mx; + fprintf(outfile, + "Combination of categories that contributes the most to the likelihood:\n\n"); + for (i = 1; i <= nmlngth + 3; i++) + 
putc(' ', outfile); + for (i = 1; i <= sites; i++) { + fprintf(outfile, "%ld", mx); + if (i % 10 == 0) + putc(' ', outfile); + if (i % 60 == 0 && i != sites) { + putc('\n', outfile); + for (j = 1; j <= nmlngth + 3; j++) + putc(' ', outfile); + } + mx = mp[i - 1][mx - 1]; + } + fprintf(outfile, "\n\n"); + marginal = (double **) Malloc(sites*sizeof(double *)); + for (i = 0; i < sites; i++) + marginal[i] = (double *) Malloc(rcategs*sizeof(double)); + for (i = 0; i < rcategs; i++) + like[i] = 1.0; + for (i = sites - 1; i >= 0; i--) { + sum = 0.0; + for (j = 0; j < rcategs; j++) { + nulike[j] = (1.0 - lambda + lambda * probcat[j]) * like[j]; + for (k = 1; k <= rcategs; k++) { + if (k != j + 1) + nulike[j] += lambda * probcat[k - 1] * like[k - 1]; + } + if ((ally[i] > 0) && (location[ally[i]-1] > 0)) + nulike[j] *= contribution[location[ally[i] - 1] - 1][j]; + sum += nulike[j]; + } + for (j = 0; j < rcategs; j++) { + nulike[j] /= sum; + marginal[i][j] = nulike[j]; + } + memcpy(like, nulike, rcategs * sizeof(double)); + } + for (i = 0; i < rcategs; i++) + like[i] = 1.0; + for (i = 0; i < sites; i++) { + sum = 0.0; + for (j = 0; j < rcategs; j++) { + nulike[j] = (1.0 - lambda + lambda * probcat[j]) * like[j]; + for (k = 1; k <= rcategs; k++) { + if (k != j + 1) + nulike[j] += lambda * probcat[k - 1] * like[k - 1]; + } + marginal[i][j] *= like[j] * probcat[j]; + sum += nulike[j]; + } + for (j = 0; j < rcategs; j++) + nulike[j] /= sum; + memcpy(like, nulike, rcategs * sizeof(double)); + sum = 0.0; + for (j = 0; j < rcategs; j++) + sum += marginal[i][j]; + for (j = 0; j < rcategs; j++) + marginal[i][j] /= sum; + } + fprintf(outfile, "Most probable category at each site if > 0.95"); + fprintf(outfile, " probability (\".\" otherwise)\n\n"); + for (i = 1; i <= nmlngth + 3; i++) + putc(' ', outfile); + for (i = 0; i < sites; i++) { + sum = 0.0; + for (j = 0; j < rcategs; j++) + if (marginal[i][j] > sum) { + sum = marginal[i][j]; + mm = j; + } + if (sum >= 0.95) + fprintf(outfile, 
"%ld", mm+1); + else + putc('.', outfile); + if ((i+1) % 60 == 0) { + if (i != 0) { + putc('\n', outfile); + for (j = 1; j <= nmlngth + 3; j++) + putc(' ', outfile); + } + } + else if ((i+1) % 10 == 0) + putc(' ', outfile); + } + putc('\n', outfile); + for (i = 0; i < sites; i++) + free(marginal[i]); + free(marginal); + } + putc('\n', outfile); + if (hypstate) { + fprintf(outfile, "Probable sequences at interior nodes:\n\n"); + fprintf(outfile, " node "); + for (i = 0; (i < 13) && (i < ((sites + (sites-1)/10 - 39) / 2)); i++) + putc(' ', outfile); + fprintf(outfile, "Reconstructed sequence (caps if > 0.95)\n\n"); + if (!rctgry || (rcategs == 1)) + mx0 = 1; + for (i = 0; i < sites; i += 60) { + k = i + 59; + if (k >= sites) + k = sites - 1; + rectrav(curtree.start, i, k); + rectrav(curtree.start->back, i, k); + putc('\n', outfile); + mx0 = mx1; + } + } +} /* summarize */ + + +void initpromlnode(node **p, node **grbg, node *q, long len, long nodei, + long *ntips, long *parens, initops whichinit, + pointarray treenode, pointarray nodep, Char *str, + Char *ch, FILE *intree) +{ + /* initializes a node */ + boolean minusread; + double valyew, divisor; + + switch (whichinit) { + case bottom: + gnu(grbg, p); + (*p)->index = nodei; + (*p)->tip = false; + malloc_ppheno((*p), endsite, rcategs); + nodep[(*p)->index - 1] = (*p); + break; + case nonbottom: + gnu(grbg, p); + malloc_ppheno(*p, endsite, rcategs); + (*p)->index = nodei; + break; + case tip: + match_names_to_data(str, nodep, p, spp); + break; + case iter: + (*p)->initialized = false; + (*p)->v = initialv; + (*p)->iter = true; + if ((*p)->back != NULL){ + (*p)->back->iter = true; + (*p)->back->v = initialv; + (*p)->back->initialized = false; + } + break; + case length: + processlength(&valyew, &divisor, ch, &minusread, intree, parens); + (*p)->v = valyew / divisor; + (*p)->iter = false; + if ((*p)->back != NULL) { + (*p)->back->v = (*p)->v; + (*p)->back->iter = false; + } + break; + case hsnolength: + haslengths = 
false; + break; + default: /* cases hslength, treewt, unittrwt */ + break; /* should never occur */ + } +} /* initpromlnode */ + + +void dnaml_treeout(node *p) +{ + /* write out file with representation of final tree2 */ + /* Only works for bifurcations! */ + long i, n, w; + Char c; + double x; + node *q; + boolean inloop; + + if (p->tip) { + n = 0; + for (i = 1; i <= nmlngth; i++) { + if (nayme[p->index-1][i - 1] != ' ') + n = i; + } + for (i = 0; i < n; i++) { + c = nayme[p->index-1][i]; + if (c == ' ') + c = '_'; + putc(c, outtree); + } + col += n; + } else { + putc('(', outtree); + col++; + + inloop = false; + q = p->next; + do { + if (inloop) { + putc(',', outtree); + col++; + if (col > 45) { + putc('\n', outtree); + col = 0; + } + } + inloop = true; + dnaml_treeout(q->back); + q = q->next; + } while ((p == curtree.start || p != q) && + (p != curtree.start || p->next != q)); + + putc(')', outtree); + col++; + } + x = p->v; + if (x > 0.0) + w = (long)(0.43429448222 * log(x)); + else if (x == 0.0) + w = 0; + else + w = (long)(0.43429448222 * log(-x)) + 1; + if (w < 0) + w = 0; + if (p == curtree.start) + fprintf(outtree, ";\n"); + else { + fprintf(outtree, ":%*.5f", (int)(w + 7), x); + col += w + 8; + } +} /* dnaml_treeout */ + + +void buildnewtip(long m, tree *tr) +{ + node *p; + + p = tr->nodep[nextsp + spp - 3]; + hookup(tr->nodep[m - 1], p); + p->v = initialv; + p->back->v = initialv; +} /* buildnewtip */ + + +void buildsimpletree(tree *tr) +{ + hookup(tr->nodep[enterorder[0] - 1], tr->nodep[enterorder[1] - 1]); + tr->nodep[enterorder[0] - 1]->v = 1.0; + tr->nodep[enterorder[0] - 1]->back->v = 1.0; + tr->nodep[enterorder[1] - 1]->v = 1.0; + tr->nodep[enterorder[1] - 1]->back->v = 1.0; + buildnewtip(enterorder[2], tr); + insert_(tr->nodep[enterorder[2] - 1]->back, + tr->nodep[enterorder[0] - 1], false); +} /* buildsimpletree */ + + +void free_all_protx (long nonodes, pointarray treenode) +{ + /* used in proml */ + long i, j, k; + node *p; + + /* Zero thru spp 
are tips, */ + for (i = 0; i < spp; i++) { + for (j = 0; j < endsite; j++) + free(treenode[i]->protx[j]); + free(treenode[i]->protx); + } + + /* The rest are rings (i.e. triads) */ + for (i = spp; i < nonodes; i++) { + if (treenode[i] != NULL) { + p = treenode[i]; + do { + for (k = 0; k < endsite; k++) + free(p->protx[k]); + free(p->protx); + p = p->next; + } while (p != treenode[i]); + } + } +} /* free_all_protx */ + +void proml_unroot(node* root, node** nodep, long nonodes) +{ + node *r,*q,*tmpnode; + double newl; + long i; + long numsibs; + + numsibs = count_sibs(root); + + if ( numsibs > 2 ) { + q = root; + r = root; + while (!(q->next == root)) + q = q->next; + q->next = root->next; + root = q; + for(i=0 ; i < endsite ; i++){ + free(r->protx[i]); + r->protx[i] = NULL; + } + free(r->protx); + r->protx = NULL; + chucktreenode(&grbg, r); + curtree.nodep[spp] = q; + } else if ( root->next->next->next == root) { + newl = root->next->oldlen + root->next->next->oldlen; + root->next->back->oldlen = newl; + root->next->next->back->oldlen = newl; + + newl = root->next->v + root->next->next->v; + root->next->back->v = newl; + root->next->next->back->v = newl; + + root->next->back->back=root->next->next->back; + root->next->next->back->back = root->next->back; + while ( root->index != nonodes ) { + tmpnode = nodep[ root->index ]; + nodep[root->index] = root; + root->index++; + root->next->index++; + root->next->next->index++; + nodep[root->index - 2] = tmpnode; + tmpnode->index--; + tmpnode->next->index--; + tmpnode->next->next->index--; + } + nodep[nonodes -1] = NULL; + for(i=0 ; i < endsite ; i++){ + free(root->protx[i]); + free(root->next->protx[i]); + free(root->next->next->protx[i]); + root->protx[i] = NULL; + root->next->protx[i] = NULL; + root->next->next->protx[i] = NULL; + } + free(root->protx); + free(root->next->protx); + free(root->next->next->protx); + + chucktreenode(&grbg,root->next->next); + chucktreenode(&grbg,root->next); + chucktreenode(&grbg,root); + + 
} +} + + + +void maketree() +{ + long i, j; + boolean dummy_first, goteof; + pointarray dummy_treenode=NULL; + long nextnode; + node *root; + + prot_inittable(); + + if (usertree) { + openfile(&intree,INTREE,"input tree file", "r",progname,intreename); + numtrees = countsemic(&intree); + if(numtrees > MAXSHIMOTREES) + shimotrees = MAXSHIMOTREES; + else + shimotrees = numtrees; + if (numtrees > 2) + initseed(&inseed, &inseed0, seed); + l0gl = (double *) Malloc(shimotrees * sizeof(double)); + l0gf = (double **) Malloc(shimotrees * sizeof(double *)); + for (i=0; i < shimotrees; ++i) + l0gf[i] = (double *) Malloc(endsite * sizeof(double)); + if (treeprint) { + fprintf(outfile, "User-defined tree"); + if (numtrees > 1) + putc('s', outfile); + fprintf(outfile, ":\n\n"); + } + which = 1; + + /* This taken out of tree read, used to be [spp-1], but referring + to [0] produces output identical to what the pre-modified dnaml + produced. */ + + while (which <= numtrees) { + + /* These initializations required each time through the loop + since multiple trees require re-initialization */ + haslengths = true; + nextnode = 0; + dummy_first = true; + goteof = false; + + treeread(intree, &root, dummy_treenode, &goteof, &dummy_first, + curtree.nodep, &nextnode, &haslengths, &grbg, + initpromlnode,false,nonodes2); + proml_unroot(root,curtree.nodep,nonodes2); + if (goteof && (which <= numtrees)) { + /* if we hit the end of the file prematurely */ + printf ("\n"); + printf ("ERROR: trees missing at end of file.\n"); + printf ("\tExpected number of trees:\t\t%ld\n", numtrees); + printf ("\tNumber of trees actually in file:\t%ld.\n\n", which - 1); + exxit (-1); + } + + curtree.start = curtree.nodep[0]->back; + if ( outgropt ) + curtree.start = curtree.nodep[outgrno - 1]->back; + + treevaluate(); + proml_printree(); + summarize(); + if (trout) { + col = 0; + dnaml_treeout(curtree.start); + } + if(which < numtrees){ + prot_freex_notip(nextnode, curtree.nodep); + gdispose(curtree.start, 
&grbg, curtree.nodep); + } else nonodes2 = nextnode; + which++; + } + FClose(intree); + putc('\n', outfile); + if (!auto_ && numtrees > 1 && weightsum > 1 ) + standev2(numtrees, maxwhich, 0, endsite-1, maxlogl, + l0gl, l0gf, aliasweight, seed); + } else { + /* If there's no input user tree, */ + for (i = 1; i <= spp; i++) + enterorder[i - 1] = i; + if (jumble) + randumize(seed, enterorder); + if (progress) { + printf("\nAdding species:\n"); + writename(0, 3, enterorder); +#ifdef WIN32 + phyFillScreenColor(); +#endif + } + nextsp = 3; + polishing = false; + buildsimpletree(&curtree); + curtree.start = curtree.nodep[enterorder[0] - 1]->back; + smoothit = improve; + nextsp = 4; + while (nextsp <= spp) { + buildnewtip(enterorder[nextsp - 1], &curtree); + bestyet = UNDEFINED; + if (smoothit) + promlcopy(&curtree, &priortree, nonodes2, rcategs); + addtraverse(curtree.nodep[enterorder[nextsp - 1] - 1]->back, + curtree.start, true); + if (smoothit) + promlcopy(&bestree, &curtree, nonodes2, rcategs); + else { + insert_(curtree.nodep[enterorder[nextsp - 1] - 1]->back, qwhere, true); + smoothit = true; + for (i = 1; i<=smoothings; i++) { + smooth(curtree.start); + smooth(curtree.start->back); + } + smoothit = false; + promlcopy(&curtree, &bestree, nonodes2, rcategs); + bestyet = curtree.likelihood; + } + if (progress) { + writename(nextsp - 1, 1, enterorder); +#ifdef WIN32 + phyFillScreenColor(); +#endif + } + if (global && nextsp == spp && progress) { + printf("Doing global rearrangements\n"); + printf(" !"); + for (j = spp ; j < nonodes2 ; j++) + if ( (j - spp) % (( nonodes2 / 72 ) + 1 ) == 0 ) + putchar('-'); + printf("!\n"); +#ifdef WIN32 + phyFillScreenColor(); +#endif + } + succeeded = true; + while (succeeded) { + succeeded = false; + if (global && nextsp == spp && progress) { + printf(" "); + fflush(stdout); + } + if (global && nextsp == spp) + globrearrange(); + else + rearrange(curtree.start, curtree.start->back); + if (global && nextsp == spp && progress) + 
putchar('\n'); + } + nextsp++; + } + if (global && progress) { + putchar('\n'); + fflush(stdout); +#ifdef WIN32 + phyFillScreenColor(); +#endif + } + promlcopy(&curtree, &bestree, nonodes2, rcategs); + if (njumble > 1) { + if (jumb == 1) + promlcopy(&bestree, &bestree2, nonodes2, rcategs); + else + if (bestree2.likelihood < bestree.likelihood) + promlcopy(&bestree, &bestree2, nonodes2, rcategs); + } + if (jumb == njumble) { + if (njumble > 1) + promlcopy(&bestree2, &curtree, nonodes2, rcategs); + curtree.start = curtree.nodep[outgrno - 1]->back; + for (i = 0; i < nonodes2; i++) { + if (i < spp) + curtree.nodep[i]->initialized = false; + else { + curtree.nodep[i]->initialized = false; + curtree.nodep[i]->next->initialized = false; + curtree.nodep[i]->next->next->initialized = false; + } + } + treevaluate(); + proml_printree(); + summarize(); + if (trout) { + col = 0; + dnaml_treeout(curtree.start); + } + } + } + if (usertree) { + free(l0gl); + for (i=0; i < shimotrees; i++) + free(l0gf[i]); + free(l0gf); + } + prot_freetable(); + if (jumb < njumble) + return; + free(contribution); + free(mp); + for (i=0; i < endsite; i++) + free(term[i]); + free(term); + for (i=0; i < endsite; i++) + free(slopeterm[i]); + free(slopeterm); + for (i=0; i < endsite; i++) + free(curveterm[i]); + free(curveterm); + free_all_protx(nonodes2, curtree.nodep); + if (!usertree) { + free_all_protx(nonodes2, bestree.nodep); + free_all_protx(nonodes2, priortree.nodep); + if (njumble > 1) + free_all_protx(nonodes2, bestree2.nodep); + } + if (progress) { + printf("\n\nOutput written to file \"%s\"\n\n", outfilename); + if (trout) + printf("Tree also written onto file \"%s\"\n", outtreename); + putchar('\n'); + } +} /* maketree */ + + +void clean_up() +{ + /* Free and/or close stuff */ + long i; + + free (rrate); + free (probcat); + free (rate); + /* Seems to require freeing every time... 
*/ + for (i = 0; i < spp; i++) { + free (y[i]); + } + free (y); + free (nayme); + free (enterorder); + free (category); + free (weight); + free (alias); + free (ally); + free (location); + free (aliasweight); + free (probmat); + free (eigmat); + + FClose(infile); + FClose(outfile); + FClose(outtree); +#ifdef MAC + fixmacfile(outfilename); + fixmacfile(outtreename); +#endif +} /* clean_up */ + + +int main(int argc, Char *argv[]) +{ /* Protein Maximum Likelihood */ +#ifdef MAC + argc = 1; /* macsetup("ProML",""); */ + argv[0] = "ProML"; +#endif + init(argc,argv); + progname = argv[0]; + openfile(&infile,INFILE,"input file","r",argv[0],infilename); + openfile(&outfile,OUTFILE,"output file","w",argv[0],outfilename); + mulsets = false; + datasets = 1; + firstset = true; + ibmpc = IBMCRT; + ansi = ANSICRT; + grbg = NULL; + doinit(); + if (ctgry) + openfile(&catfile,CATFILE,"categories file","r",argv[0],catfilename); + if (weights || justwts) + openfile(&weightfile,WEIGHTFILE,"weights file","r",argv[0],weightfilename); + if (trout) + openfile(&outtree,OUTTREE,"output tree file","w",argv[0],outtreename); + for (ith = 1; ith <= datasets; ith++) { + if (datasets > 1) { + fprintf(outfile, "Data set # %ld:\n", ith); + printf("\nData set # %ld:\n", ith); + } + getinput(); + if (ith == 1) + firstset = false; + for (jumb = 1; jumb <= njumble; jumb++) { + max_num_sibs = 0; + maketree(); + } + } + + clean_up(); + printf("Done.\n\n"); +#ifdef WIN32 + phyRestoreConsoleAttributes(); +#endif + return 0; +} /* Protein Maximum Likelihood */ + diff --git a/forester/archive/RIO/others/phylip_mod/src/promlk.c b/forester/archive/RIO/others/phylip_mod/src/promlk.c new file mode 100644 index 0000000..7717f07 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/promlk.c @@ -0,0 +1,3176 @@ + +#include "phylip.h" +#include "seq.h" + +/* version 3.6. (c) Copyright 1986-2004 by the University of Washington + and by Joseph Felsenstein. Written by Joseph Felsenstein and Lucas Mix. 
+ Permission is granted to copy and use this program provided no fee is + charged for it and provided that this copyright notice is not removed. */ + +#define epsilon 0.0001 /* used in makenewv, getthree, update */ +#define over 60 + +typedef long vall[maxcategs]; +typedef double contribarr[maxcategs]; + +#ifndef OLDC +/* function prototypes */ +void init_protmats(void); +void getoptions(void); +void makeprotfreqs(void); +void allocrest(void); +void doinit(void); +void inputoptions(void); +void input_protdata(long); +void makeweights(void); +void prot_makevalues(long, pointarray, long, long, sequence, steptr); +void getinput(void); + +void prot_inittable(void); +void alloc_pmatrix(long); +void make_pmatrix(double **, double **, double **, long, double, double, + double *, double **); +void prot_nuview(node *); +void getthree(node *p, double thigh, double tlow); +void makenewv(node *); +void update(node *); +void smooth(node *); +void promlk_add(node *, node *, node *, boolean); +void promlk_re_move(node **, node **, boolean); + +double prot_evaluate(node *); +void tryadd(node *, node **, node **); +void addpreorder(node *, node *, node *, boolean, boolean); +void restoradd(node *, node *, node *, double); +void tryrearr(node *, boolean *); +void repreorder(node *, boolean *); +void rearrange(node **); +void nodeinit(node *); +void initrav(node *); +void travinit(node *); + +void travsp(node *); +void treevaluate(void); +void promlk_coordinates(node *, long *); +void promlk_drawline(long, double); +void promlk_printree(void); +void describe(node *); +void prot_reconstr(node *, long); +void rectrav(node *, long, long); +void summarize(void); +void promlk_treeout(node *); +void initpromlnode(node **, node **, node *, long, long, long *, long *, + initops, pointarray, pointarray, Char *, Char *, FILE *); +void tymetrav(node *, double *); + +void free_all_protx(long, pointarray); +void maketree(void); +void clean_up(void); +void reallocsites(void); +void 
prot_freetable(void); +void free_pmatrix(long sib); +/* function prototypes */ +#endif + + +double **tbl; + +Char infilename[100], outfilename[100], intreename[100], outtreename[100], + catfilename[100], weightfilename[100]; +double *rrate; +long sites, weightsum, categs, datasets, ith, njumble, jumb, numtrees, shimotrees; +/* sites = number of sites in actual sequences + numtrees = number of user-defined trees */ +long inseed, inseed0, mx, mx0, mx1; +boolean global, jumble, lngths, trout, usertree, weights, rctgry, ctgry, + auto_, progress, mulsets, firstset, hypstate, smoothit, + polishing, justwts, gama, invar, usejtt, usepmb, usepam; +tree curtree, bestree, bestree2; +node *qwhere, *grbg; +double sumrates, cv, alpha, lambda, lambda1, invarfrac; +long *enterorder; +steptr aliasweight; +double *rate; +longer seed; +double *probcat; +contribarr *contribution; +/* the initializer below has exactly 26 characters, so all 26 slots are used and the array has no terminating NUL: index it, do not pass it to string functions */ +char aachar[26]="ARNDCQEGHILKMFPSTWYVBZX?*-"; +char *progname; +long rcategs, nonodes2; + + +/* Local variables for maketree, propagated globally for C version: */ +long k, maxwhich, col; +double like, bestyet, tdelta, lnlike, slope, curv, maxlogl; +boolean lastsp, smoothed, succeeded; +double *l0gl; +double x[3], lnl[3]; +double expon1i[maxcategs], expon1v[maxcategs], + expon2i[maxcategs], expon2v[maxcategs]; +node *there; +double **l0gf; +Char ch, ch2; +long **mp; + + +/* Variables introduced to allow for protein probability calculations */ +long max_num_sibs; /* maximum number of siblings used in a */ + /* nuview calculation; determines the */ + /* final size of pmatrices */ +double *eigmat; /* eig matrix variable */ +double **probmat; /* prob matrix variable */ +double ****dpmatrix; /* derivative of pmatrix */ +double ****ddpmatrix; /* NOTE(review): comment used to read "derivative of xpmatrix"; presumably the second derivative of pmatrix -- confirm */ +double *****pmatrices; /* matrix of probabilities of protein */ + /* conversion. The 5 subscripts refer */ + /* to sibs, rcategs, categs, final and */ + /* initial states, respectively.
*/ +double freqaa[20]; /* amino acid frequencies */ + +/* this jtt matrix decomposition due to Elisabeth Tillier */ +static double jtteigmat[] = +{0.0, -0.007031123, -0.006484345, -0.006086499, -0.005514432, +-0.00772664, -0.008643413, -0.010620756, -0.009965552, -0.011671808, +-0.012222418,-0.004589201, -0.013103714, -0.014048038, -0.003170582, +-0.00347935, -0.015311677, -0.016021194, -0.017991454, -0.018911888}; + +static double jttprobmat[20][20] = +{{0.076999996, 0.051000003, 0.043000004, 0.051999998, 0.019999996, 0.041, + 0.061999994, 0.073999997, 0.022999999, 0.052000004, 0.090999997, 0.058999988, + 0.024000007, 0.04, 0.050999992, 0.069, 0.059000006, 0.014000008, 0.032000004, + 0.066000005}, + {0.015604455, -0.068062363, 0.020106264, 0.070723273, 0.011702977, 0.009674053, + 0.074000798, -0.169750458, 0.005560808, -0.008208636, -0.012305869, + -0.063730179, -0.005674643, -0.02116828, 0.104586169, 0.016480839, 0.016765139, + 0.005936994, 0.006046367, -0.0082877}, + {-0.049778281, -0.007118197, 0.003801272, 0.070749616, 0.047506147, + 0.006447017, 0.090522425, -0.053620432, -0.008508175, 0.037170603, + 0.051805545, 0.015413608, 0.019939916, -0.008431976, -0.143511376, + -0.052486072, -0.032116542, -0.000860626, -0.02535993, 0.03843545}, + {-0.028906423, 0.092952047, -0.009615343, -0.067870117, 0.031970392, + 0.048338335, -0.054396304, -0.135916654, 0.017780083, 0.000129242, + 0.031267424, 0.116333586, 0.007499746, -0.032153596, 0.033517051, + -0.013719269, -0.00347293, -0.003291821, -0.02158326, -0.008862168}, + {0.037181176, -0.023106564, -0.004482225, -0.029899635, 0.118139633, + -0.032298569, -0.04683198, 0.05566988, -0.012622847, 0.002023096, + -0.043921088, -0.04792557, -0.003452711, -0.037744513, 0.020822974, + 0.036580187, 0.02331425, -0.004807711, -0.017504496, 0.01086673}, + {0.044754061, -0.002503471, 0.019452517, -0.015611487, -0.02152807, + -0.013131425, -0.03465365, -0.047928912, 0.020608851, 0.067843095, + -0.122130014, 0.002521499, 0.013021646, 
-0.082891087, -0.061590119, + 0.016270856, 0.051468938, 0.002079063, 0.081019713, 0.082927944}, + {0.058917882, 0.007320741, 0.025278141, 0.000357541, -0.002831285, + -0.032453034, -0.010177288, -0.069447924, -0.034467324, 0.011422358, + -0.128478324, 0.04309667, -0.015319944, 0.113302422, -0.035052393, + 0.046885372, 0.06185183, 0.00175743, -0.06224497, 0.020282093}, + {-0.014562092, 0.022522921, -0.007094389, 0.03480089, -0.000326144, + -0.124039037, 0.020577906, -0.005056454, -0.081841576, -0.004381786, + 0.030826152, 0.091261631, 0.008878828, -0.02829487, 0.042718836, + -0.011180886, -0.012719227, -0.000753926, 0.048062375, -0.009399129}, + {0.033789571, -0.013512235, 0.088010984, 0.017580292, -0.006608005, + -0.037836971, -0.061344686, -0.034268357, 0.018190209, -0.068484614, + 0.120024744, -0.00319321, -0.001349477, -0.03000546, -0.073063759, + 0.081912399, 0.0635245, 0.000197, -0.002481798, -0.09108114}, + {-0.113947615, 0.019230545, 0.088819683, 0.064832765, 0.001801467, + -0.063829682, -0.072001633, 0.018429333, 0.057465965, 0.043901014, + -0.048050874, -0.001705918, 0.022637173, 0.017404665, 0.043877902, + -0.017089594, -0.058489485, 0.000127498, -0.029357194, 0.025943972}, + {0.01512923, 0.023603725, 0.006681954, 0.012360216, -0.000181447, + -0.023011838, -0.008960024, -0.008533239, 0.012569835, 0.03216118, + 0.061986403, -0.001919083, -0.1400832, -0.010669741, -0.003919454, + -0.003707024, -0.026806029, -0.000611603, -0.001402648, 0.065312824}, + {-0.036405351, 0.020816769, 0.011408213, 0.019787053, 0.038897829, + 0.017641789, 0.020858533, -0.006067252, 0.028617353, -0.064259496, + -0.081676567, 0.024421823, -0.028751676, 0.07095096, -0.024199434, + -0.007513119, -0.028108766, -0.01198095, 0.111761119, -0.076198809}, + {0.060831772, 0.144097327, -0.069151377, 0.023754576, -0.003322955, + -0.071618574, 0.03353154, -0.02795295, 0.039519769, -0.023453968, + -0.000630308, -0.098024591, 0.017672997, 0.003813378, -0.009266499, + -0.011192111, 0.016013873, 
-0.002072968, -0.010022044, -0.012526904}, + {-0.050776604, 0.092833081, 0.044069596, 0.050523021, -0.002628417, + 0.076542572, -0.06388631, -0.00854892, -0.084725311, 0.017401063, + -0.006262541, -0.094457679, -0.002818678, -0.0044122, -0.002883973, + 0.028729685, -0.004961596, -0.001498627, 0.017994575, -0.000232779}, + {-0.01894566, -0.007760205, -0.015160993, -0.027254587, 0.009800903, + -0.013443561, -0.032896517, -0.022734138, -0.001983861, 0.00256111, + 0.024823166, -0.021256768, 0.001980052, 0.028136263, -0.012364384, + -0.013782446, -0.013061091, 0.111173981, 0.021702122, 0.00046654}, + {-0.009444193, -0.042106824, -0.02535015, -0.055125574, 0.006369612, + -0.02945416, -0.069922064, -0.067221068, -0.003004999, 0.053624311, + 0.128862984, -0.057245803, 0.025550508, 0.087741073, -0.001119043, + -0.012036202, -0.000913488, -0.034864475, 0.050124813, 0.055534723}, + {0.145782464, -0.024348311, -0.031216873, 0.106174443, 0.00202862, + 0.02653866, -0.113657267, -0.00755018, 0.000307232, -0.051241158, + 0.001310685, 0.035275877, 0.013308898, 0.002957626, -0.002925034, + -0.065362319, -0.071844582, 0.000475894, -0.000112419, 0.034097762}, + {0.079840455, 0.018769331, 0.078685899, -0.084329807, -0.00277264, + -0.010099754, 0.059700608, -0.019209715, -0.010442992, -0.042100476, + -0.006020556, -0.023061786, 0.017246106, -0.001572858, -0.006703785, + 0.056301316, -0.156787357, -0.000303638, 0.001498195, 0.051363455}, + {0.049628261, 0.016475144, 0.094141653, -0.04444633, 0.005206131, + -0.001827555, 0.02195624, 0.013066683, -0.010415582, -0.022338403, + 0.007837197, -0.023397671, -0.002507095, 0.005177694, 0.017109561, + -0.202340113, 0.069681441, 0.000120736, 0.002201146, 0.004670849}, + {0.089153689, 0.000233354, 0.010826822, -0.004273519, 0.001440618, + 0.000436077, 0.001182351, -0.002255508, -0.000700465, 0.150589876, + -0.003911914, -0.00050154, -0.004564983, 0.00012701, -0.001486973, + -0.018902754, -0.054748555, 0.000217377, -0.000319302, -0.162541651}}; + + 
+static double pameigmat[] = {0.0, -0.002350753691875762, -0.002701991863800379, + -0.002931612442853115, -0.004262492032364507, -0.005395980482561625, + -0.007141172690079523, -0.007392844756151318, -0.007781761342200766, + -0.00810032066366362, -0.00875299712761124, -0.01048227332164386, + -0.01109594097332267, -0.01298616073142234, -0.01342036228188581, + -0.01552599145527578, -0.01658762802054814, -0.0174893445623765, + -0.01933280832903272, -0.02206353522613025}; + +static double pamprobmat[20][20] = + {{0.087683339901135, 0.04051291829598762, 0.04087846315185977, + 0.04771603459744777, 0.03247095396561266, 0.03784612688594957, + 0.0504933695604875, 0.0898249006830755, 0.03285885059543713, + 0.0357514442352119, 0.0852464099207521, 0.07910313444070642, + 0.01488243946396588, 0.04100101908956829, 0.05158026947089499, + 0.06975497205982451, 0.05832757042475474, 0.00931264523877807, + 0.03171540880870517, 0.06303972920984541}, + {0.01943453646811026, -0.004492574160484092, 0.007694891061220776, + 0.01278399096887701, 0.0106157418450234, 0.007542140341575122, + 0.01326994069032819, 0.02615565199894889, 0.003123125764490066, + 0.002204507682495444, -0.004782898215768979, 0.01204241965177619, + 0.0007847400096924341, -0.03043626073172116, 0.01221202591902536, + 0.01100527004684405, 0.01116495631339549, -0.0925364931988571, + -0.02622065387931562, 0.00843494142432107}, + {0.01855357100209072, 0.01493642835763868, 0.0127983090766285, + 0.0200533250704364, -0.1681898360107787, 0.01551657969909255, + 0.02128060163107209, 0.03100633591848964, 0.00845480845269879, + 0.000927149370785571, 0.00937207565817036, 0.03490557769673472, + 0.00300443019551563, -0.02590837220264415, 0.01329376859943192, + 0.006854110889741407, 0.01102593860528263, 0.003360844186685888, + -0.03459712356647764, 0.003351477369404443}, + {0.02690642688200102, 0.02131745801890152, 0.0143626616005213, + 0.02405101425725929, 0.05041008641436849, 0.01430925051050233, + 0.02362114036816964, 
0.04688381789373886, 0.005250115453626377, + -0.02040112168595516, -0.0942720776915669, 0.03773004996758644, + -0.00822831940782616, -0.1164872809439224, 0.02286281877257392, + 0.02849551240669926, 0.01468856796295663, 0.02377110964207936, + -0.094380545436577, -0.02089068498518036}, + {0.00930172577225213, 0.01493463068441099, 0.020186920775608, + 0.02892154953912524, -0.01224593358361567, 0.01404228329986624, + 0.02671186617119041, 0.04537535161795231, 0.02229995804098249, + -0.04635704133961575, -0.1966910360247138, 0.02796648065439046, + -0.02263484732621436, 0.0440490503242072, 0.01148782948302166, + 0.01989170531824069, 0.001306805142981245, -0.005676690969116321, + 0.07680476281625202, -0.07967537039721849}, + {0.06602274245435476, -0.0966661981471856, -0.005241648783844579, + 0.00859135188171146, -0.007762129660943368, -0.02888965572526196, + 0.003592291525888222, 0.1668410669287673, -0.04082039290551406, + 0.005233775047553415, -0.01758244726137135, -0.1493955762326898, + -0.00855819137835548, 0.004211419253492328, 0.01929306335052688, + 0.03008056746359405, 0.0190444422412472, 0.005577189741419315, + 0.0000874156155112068, 0.02634091459108298}, + {0.01933897472880726, 0.05874583569377844, -0.02293534606228405, + -0.07206314017962175, -0.004580681581546643, -0.0628814337610561, + -0.0850783812795136, 0.07988417636610614, -0.0852798990133397, + 0.01649047166155952, -0.05416647263757423, 0.1089834536254064, + 0.005093403979413865, 0.02520300254161142, 0.0005951431406455604, + 0.02441251821224675, 0.02796099482240553, -0.002574933994926502, + -0.007172237553012804, 0.03002455129086954}, + {0.04041118479094272, -0.002476225672095412, -0.01494505811263243, + -0.03759443758599911, -0.00892246902492875, -0.003634714029239211, + -0.03085671837973749, -0.126176309029931, 0.005814031139083794, + 0.01313561962646063, -0.04760487162503322, -0.0490563712725484, + -0.005082243450421558, -0.01213634309383557, 0.1806666927079249, + 0.02111663336185495, 
0.02963486860587087, -0.0000175020101657785, + 0.01197155383597686, 0.0357526792184636}, + {-0.01184769557720525, 0.01582776076338872, -0.006570708266564639, + -0.01471915653734024, 0.00894343616503608, 0.00562664968033149, + -0.01465878888356943, 0.05365282692645818, 0.00893509735776116, + -0.05879312944436473, 0.0806048683392995, -0.007722897986905326, + -0.001819943882718859, 0.0942535573077267, 0.07483883782251654, + 0.004354639673913651, -0.02828804845740341, -0.001318222184691827, + -0.07613149604246563, -0.1251675867732172}, + {0.00834167031558193, -0.01509357596974962, 0.007098172811092488, + 0.03127677418040319, 0.001992448468465455, 0.00915441566808454, + 0.03430175973499201, -0.0730648147535803, -0.001402707145575659, + 0.04780949194330815, -0.1115035603461273, -0.01292297197609604, + -0.005056270550868528, 0.1112053349612027, -0.03801929822379964, + -0.001191241001736563, 0.01872874622910247, 0.0005314214903865993, + -0.0882576318311789, 0.07607183599610171}, + {-0.01539460099727769, 0.04988596184297883, -0.01187240760647617, + -0.06987843637091853, -0.002490472846497859, 0.01009857892494956, + -0.07473588067847209, 0.0906009925879084, 0.1243612446505172, + 0.02152806401345371, -0.03504879644860233, -0.06680752427613573, + -0.005574485153629651, 0.001518282948127752, -0.01999168507510701, + -0.01478606199529457, -0.02203749419458996, -0.00132680708294333, + -0.01137505997867614, 0.05332658773667142}, + {-0.06104378736432388, 0.0869446603393548, -0.03298331234537257, + 0.03128515657456024, 0.003906358569208259, 0.03578694104193928, + 0.06241936133189683, 0.06182827284921748, -0.05566564263245907, + 0.02640868588189002, -0.01349751243059039, -0.05507866642582638, + -0.006671347738489326, -0.001470096466016046, 0.05185743641479938, + -0.07494697511168257, -0.1175185439057584, -0.001188074094105709, + 0.00937934805737347, 0.05024773745437657}, + {-0.07252555582124737, -0.116554459356382, 0.003605361887406413, + -0.00836518656029184, 0.004615715410745561, 
0.005105376617651312, + -0.00944938657024391, 0.05602449420950007, 0.02722719610561933, + 0.01959357494748446, -0.0258655103753962, 0.1440733975689835, + 0.01446782819722976, 0.003718896062070054, 0.05825843045655135, + -0.06230154142733073, -0.07833704962300169, 0.003160836143568724, + -0.001169873777936648, 0.03471745590503304}, + {-0.03204352258752698, 0.01019272923862322, 0.04509668708733181, + 0.05756522429120813, -0.0004601149081726732, -0.0984718150777423, + -0.01107826100664925, -0.005680277810520585, 0.01962359392320817, + 0.01550006899131986, 0.05143956925922197, 0.02462476682588468, + -0.0888843861002653, -0.00171553583659411, 0.01606331750661664, + 0.001176847743518958, -0.02070972978912828, -0.000341523293579971, + -0.002654732745607882, 0.02075709428885848}, + {0.03595199666430258, -0.02800219615234468, -0.04341570015493925, + -0.0748275906176658, 0.0001051403676377422, 0.1137431321746627, + 0.005852087565974318, 0.003443037513847801, -0.02481931657706633, + -0.003651181839831423, 0.03195794176786321, 0.04135411406392523, + -0.07562030263210619, 0.001769332364699, -0.01984381173403915, + -0.005029750745010152, 0.02649253902476472, 0.000518085571702734, + 0.001062936684474851, 0.01295950668914449}, + {-0.16164552322896, -0.0006050035060464324, 0.0258380054414968, + 0.003188424740960557, -0.0002058911341821877, 0.03157555987384681, + -0.01678913462596107, 0.03096216145389774, -0.0133791110666919, + 0.1125249625204277, -0.00769017706442472, -0.02653938062180483, + -0.002555329863523985, -0.00861833362947954, 0.01775148884754278, + 0.02529310679774722, 0.0826243417011238, -0.0001036728183032624, + 0.001963562313294209, -0.0935900561309786}, + {0.1652394174588469, -0.002814245280784351, -0.0328982001821263, + -0.02000104712964131, 0.0002208121995725443, -0.02733462178511839, + 0.02648078162927627, -0.01788316626401427, 0.01630747623755998, + 0.1053849023838147, -0.005447706553811218, 0.01810876922536839, + -0.001808914710282444, -0.007687912115607397, 
-0.01332593672114388, + -0.02110750894891371, -0.07456116592983384, 0.000219072589592394, + 0.001270886972191055, -0.1083616930749109}, + {0.02453279389716254, -0.005820072356487439, 0.100260287284095, + 0.01277522280305745, -0.003184943445296999, 0.05814689527984152, + -0.0934012278200201, -0.03017986487349484, -0.03136625380994165, + 0.00988668352785117, -0.00358900410973142, -0.02017443675004764, + 0.000915384582922184, -0.001460963415183106, -0.01370112443251124, + 0.1130040979284457, -0.1196161771323699, -0.0005800211204222045, + -0.0006153403201024954, 0.00416806428223025}, + {-0.0778089244252535, -0.007055161182430869, -0.0349307504860869, + -0.0811915584276571, -0.004689825871599125, -0.03726108871471753, + 0.1072225647141469, -0.00917015113070944, 0.01381628985996913, + -0.00123227881492089, 0.001815954515275675, 0.005708744099349901, + -0.0001448985044877925, -0.001306578795561384, -0.006992743514185243, + 0.1744720240732789, -0.05353628497814023, -0.0007613684227234787, + -0.0003550282315997644, 0.01340106423804634}, + {-0.0159527329868513, -0.007622151568160798, -0.1389875105184963, + 0.1165051999914764, -0.002217810389087748, 0.01550003226513692, + -0.07427664222230566, -0.003371438498619264, 0.01385754771325365, + 0.004759020167383304, 0.001624078805220564, 0.02011638303109029, + -0.001717827082842178, -0.0007424036708598594, -0.003978884451898934, + 0.0866418927301209, -0.01280817739158123, -0.00023039242454603, + 0.002309205802479111, 0.0005926106991001195}}; + +/* this pmb matrix decomposition due to Elisabeth Tillier */ +static double pmbeigmat[20] = +{0.0000001586972220,-1.8416770496147100, -1.6025046986139100,-1.5801012515121300, +-1.4987794099715900,-1.3520794233801900,-1.3003469390479700,-1.2439503327631300, +-1.1962574080244200,-1.1383730501367500,-1.1153278910708000,-0.4934843510654760, +-0.5419014550215590,-0.9657997830826700,-0.6276075673757390,-0.6675927795018510, 
+-0.6932641383465870,-0.8897872681859630,-0.8382698977371710,-0.8074694642446040}; + +static double pmbprobmat[20][20] = +{{0.0771762457248147,0.0531913844998640,0.0393445076407294,0.0466756566755510, +0.0286348361997465,0.0312327748383639,0.0505410248721427,0.0767106611472993, +0.0258916271688597,0.0673140562194124,0.0965705469252199,0.0515979465932174, +0.0250628079438675,0.0503492018628350,0.0399908189418273,0.0641898881894471, +0.0517539616710987,0.0143507440546115,0.0357994592438322,0.0736218495862984}, +{0.0368263046116572,-0.0006728917107827,0.0008590805287740,-0.0002764255356960, +0.0020152937187455,0.0055743720652960,0.0003213317669367,0.0000449190281568, +-0.0004226254397134,0.1805040629634510,-0.0272246813586204,0.0005904606533477, +-0.0183743200073889,-0.0009194625608688,0.0008173657533167,-0.0262629806302238, +0.0265738757209787,0.0002176606241904,0.0021315644838566,-0.1823229927207580}, +{-0.0194800075560895,0.0012068088610652,-0.0008803318319596,-0.0016044273960017, +-0.0002938633803197,-0.0535796754602196,0.0155163896648621,-0.0015006360762140, +0.0021601372013703,0.0268513218744797,-0.1085292493742730,0.0149753083138452, +0.1346457366717310,-0.0009371698759829,0.0013501708044116,0.0346352293103622, +-0.0276963770242276,0.0003643142783940,0.0002074817333067,-0.0174108903914110}, +{0.0557839400850153,0.0023271577185437,0.0183481103396687,0.0023339480096311, +0.0002013267015151,-0.0227406863569852,0.0098644845475047,0.0064721276774396, +0.0001389408104210,-0.0473713878768274,-0.0086984445005797,0.0026913674934634, +0.0283724052562196,0.0001063665179457,0.0027442574779383,-0.1875312134708470, +0.1279864877057640,0.0005103347834563,0.0003155113168637,0.0081451082759554}, +{0.0037510125027265,0.0107095920636885,0.0147305410328404,-0.0112351252180332, +-0.0001500408626446,-0.1523450933729730,0.0611532413339872,-0.0005496748939503, +0.0048714378736644,-0.0003826320053999,0.0552010244407311,0.0482555671001955, 
+-0.0461664995115847,-0.0021165008617978,-0.0004574454232187,0.0233755883688949, +-0.0035484915422384,0.0009090698422851,0.0013840637687758,-0.0073895139302231}, +{-0.0111512564930024,0.1025460064723080,0.0396772456883791,-0.0298408501361294, +-0.0001656742634733,-0.0079876311843289,0.0712644184507945,-0.0010780604625230, +-0.0035880882043592,0.0021070399334252,0.0016716329894279,-0.1810123023850110, +0.0015141703608724,-0.0032700852781804,0.0035503782441679,0.0118634302028026, +0.0044561606458028,-0.0001576678495964,0.0023470722225751,-0.0027457045397157}, +{0.1474525743949170,-0.0054432538500293,0.0853848892349828,-0.0137787746207348, +-0.0008274830358513,0.0042248844582553,0.0019556229305563,-0.0164191435175148, +-0.0024501858854849,0.0120908948084233,-0.0381456105972653,0.0101271614855119, +-0.0061945941321859,0.0178841099895867,-0.0014577779202600,-0.0752120602555032, +-0.1426985695849920,0.0002862275078983,-0.0081191734261838,0.0313401149422531}, +{0.0542034611735289,-0.0078763926211829,0.0060433542506096,0.0033396210615510, +0.0013965072374079,0.0067798903832256,-0.0135291136622509,-0.0089982442731848, +-0.0056744537593887,-0.0766524225176246,0.1881210263933930,-0.0065875518675173, +0.0416627569300375,-0.0953804133524747,-0.0012559228448735,0.0101622644292547, +-0.0304742453119050,0.0011702318499737,0.0454733434783982,-0.1119239362388150}, +{0.1069409037912470,0.0805064400880297,-0.1127352030714600,0.1001181253523260, +-0.0021480427488769,-0.0332884841459003,-0.0679837575848452,-0.0043812841356657, +0.0153418716846395,-0.0079441315103188,-0.0121766182046363,-0.0381127991037620, +-0.0036338726532673,0.0195324059593791,-0.0020165963699984,-0.0061222685010268, +-0.0253761448771437,-0.0005246410999057,-0.0112205170502433,0.0052248485517237}, +{-0.0325247648326262,0.0238753651653669,0.0203684886605797,0.0295666232678825, +-0.0003946714764213,-0.0157242718469554,-0.0511737848084862,0.0084725632040180, 
+-0.0167068828528921,0.0686962159427527,-0.0659702890616198,-0.0014289912494271, +-0.0167000964093416,-0.1276689083678200,0.0036575057830967,-0.0205958145531018, +0.0000368919612829,0.0014413626622426,0.1064360941926030,0.0863372661517408}, +{-0.0463777468104402,0.0394712148670596,0.1118686750747160,0.0440711686389031, +-0.0026076286506751,-0.0268454015202516,-0.1464943067133240,-0.0137514051835380, +-0.0094395514284145,-0.0144124844774228,0.0249103379323744,-0.0071832157138676, +0.0035592787728526,0.0415627419826693,0.0027040097365669,0.0337523666612066, +0.0316121324137152,-0.0011350177559026,-0.0349998884574440,-0.0302651879823361}, +{0.0142360925194728,0.0413145623127025,0.0324976427846929,0.0580930922002398, +-0.0586974207121084,0.0202001168873069,0.0492204086749069,0.1126593173463060, +0.0116620013776662,-0.0780333711712066,-0.1109786767320410,0.0407775100936731, +-0.0205013161312652,-0.0653458585025237,0.0347351829703865,0.0304448983224773, +0.0068813748197884,-0.0189002309261882,-0.0334507528405279,-0.0668143558699485}, +{-0.0131548829657936,0.0044244322828034,-0.0050639951827271,-0.0038668197633889, +-0.1536822386530220,0.0026336969165336,0.0021585651200470,-0.0459233839062969, +0.0046854727140565,0.0393815434593599,0.0619554007991097,0.0027456299925622, +0.0117574347936383,0.0373018612990383,0.0024818527553328,-0.0133956606027299, +-0.0020457128424105,0.0154178819990401,0.0246524142683911,0.0275363065682921}, +{-0.1542307272455030,0.0364861558267547,-0.0090880407008181,0.0531673937889863, +0.0157585615170580,0.0029986538457297,0.0180194047699875,0.0652152443589317, +0.0266842840376180,0.0388457366405908,0.0856237634510719,0.0126955778952183, +0.0099593861698250,-0.0013941794862563,0.0294065511237513,-0.1151906949298290, +-0.0852991447389655,0.0028699120202636,-0.0332087026659522,0.0006811857297899}, +{0.0281300736924501,-0.0584072081898638,-0.0178386569847853,-0.0536470338171487, 
+-0.0186881656029960,-0.0240008730656106,-0.0541064820498883,0.2217137098936020, +-0.0260500001542033,0.0234505236798375,0.0311127151218573,-0.0494139126682672, +0.0057093465049849,0.0124937286655911,-0.0298322975915689,0.0006520211333102, +-0.0061018680727128,-0.0007081999479528,-0.0060523759094034,0.0215845995364623}, +{0.0295321046399105,-0.0088296411830544,-0.0065057049917325,-0.0053478115612781, +-0.0100646496794634,-0.0015473619084872,0.0008539960632865,-0.0376381933046211, +-0.0328135588935604,0.0672161874239480,0.0667626853916552,-0.0026511651464901, +0.0140451514222062,-0.0544836996133137,0.0427485157912094,0.0097455780205802, +0.0177309072915667,-0.0828759701187452,-0.0729504795471370,0.0670731961252313}, +{0.0082646581043963,-0.0319918630534466,-0.0188454445200422,-0.0374976353856606, +0.0037131290686848,-0.0132507796987883,-0.0306958830735725,-0.0044119395527308, +-0.0140786756619672,-0.0180512599925078,-0.0208243802903953,-0.0232202769398931, +-0.0063135878270273,0.0110442171178168,0.1824538048228460,-0.0006644614422758, +-0.0069909097436659,0.0255407650654681,0.0099119399501151,-0.0140911517070698}, +{0.0261344441524861,-0.0714454044548650,0.0159436926233439,0.0028462736216688, +-0.0044572637889080,-0.0089474834434532,-0.0177570282144517,-0.0153693244094452, +0.1160919467206400,0.0304911481385036,0.0047047513411774,-0.0456535116423972, +0.0004491494948617,-0.0767108879444462,-0.0012688533741441,0.0192445965934123, +0.0202321954782039,0.0281039933233607,-0.0590403018490048,0.0364080426546883}, +{0.0115826306265004,0.1340228176509380,-0.0236200652949049,-0.1284484655137340, +-0.0004742338006503,0.0127617346949511,-0.0428560878860394,0.0060030732454125, +0.0089182609926781,0.0085353834972860,0.0048464809638033,0.0709740071429510, +0.0029940462557054,-0.0483434904493132,-0.0071713680727884,-0.0036840391887209, +0.0031454003250096,0.0246243550241551,-0.0449551277644180,0.0111449232769393}, 
+{0.0140356721886765,-0.0196518236826680,0.0030517022326582,0.0582672093364850, +-0.0000973895685457,0.0021704767224292,0.0341806268602705,-0.0152035987563018, +-0.0903198657739177,0.0259623214586925,0.0155832497882743,-0.0040543568451651, +0.0036477631918247,-0.0532892744763217,-0.0142569373662724,0.0104500681408622, +0.0103483945857315,0.0679534422398752,-0.0768068882938636,0.0280289727046158}} +; + + +void init_protmats() +{ + long l, m; + + eigmat = (double *) Malloc (20 * sizeof(double)); + for (l = 0; l <= 19; l++) + if (usejtt) + eigmat[l] = jtteigmat[l]*100.0; + else { + if (usepmb) + eigmat[l] = pmbeigmat[l]; + else + eigmat[l] = pameigmat[l]*100.0; + } + probmat = (double **) Malloc (20 * sizeof(double *)); + for (l = 0; l < 20; l++) + probmat[l] = (double *) Malloc (20 * sizeof(double)); + for (l = 0; l <= 19; l++) + for (m= 0; m <= 19; m++) + if (usejtt) + probmat[l][m] = jttprobmat[l][m]; + else { + if (usepmb) + probmat[l][m] = pmbprobmat[l][m]; + else + probmat[l][m] = pamprobmat[l][m]; + } +} /* init_protmats */ + + +void getoptions() +{ + /* interactively set options */ + long i, loopcount, loopcount2; + Char ch; + boolean done; + boolean didchangecat, didchangercat; + double probsum; + + fprintf(outfile, "\nAmino acid sequence\n"); + fprintf(outfile, " Maximum Likelihood method with molecular "); + fprintf(outfile, "clock, version %s\n\n", VERSION); + + putchar('\n'); + auto_ = false; + ctgry = false; + didchangecat = false; + rctgry = false; + didchangercat = false; + categs = 1; + rcategs = 1; + gama = false; + global = false; + hypstate = false; + invar = false; + jumble = false; + njumble = 1; + lambda = 1.0; + lambda1 = 0.0; + lngths = false; + trout = true; + usepam = false; + usepmb = false; + usejtt = true; + usertree = false; + weights = false; + printdata = false; + progress = true; + treeprint = true; + interleaved = true; + loopcount = 0; + do { + cleerhome(); + printf("\nAmino acid sequence\n"); + printf(" Maximum Likelihood method 
with molecular clock, version %s\n\n", + VERSION); + printf("Settings for this run:\n"); + printf(" U Search for best tree?"); + if (usertree) + printf(" No, use user trees in input file\n"); + else + printf(" Yes\n"); + printf(" P JTT, PMB or PAM probability model? %s\n", + usejtt ? "Jones-Taylor-Thornton" : + usepmb ? "Henikoff/Tillier PMB" : "Dayhoff PAM"); + if (usertree) { + printf(" L Use lengths from user tree?"); + if (lngths) + printf(" Yes\n"); + else + printf(" No\n"); + } + printf(" C One category of substitution rates?"); + if (!ctgry) + printf(" Yes\n"); + else + printf(" %ld categories\n", categs); + printf(" R Rate variation among sites?"); + if (!rctgry) + printf(" constant rate of change\n"); + else { + if (gama) + printf(" Gamma distributed rates\n"); + else { + if (invar) + printf(" Gamma+Invariant sites\n"); + else + printf(" user-defined HMM of rates\n"); + } + printf(" A Rates at adjacent sites correlated?"); + if (!auto_) + printf(" No, they are independent\n"); + else + printf(" Yes, mean block length =%6.1f\n", 1.0 / lambda); + } + if (!usertree) { + printf(" G Global rearrangements?"); + if (global) + printf(" Yes\n"); + else + printf(" No\n"); + } + printf(" W Sites weighted? %s\n", + (weights ? "Yes" : "No")); + if (!usertree) { + printf(" J Randomize input order of sequences?"); + if (jumble) + printf(" Yes (seed = %8ld, %3ld times)\n", inseed0, njumble); + else + printf(" No. Use input order\n"); + } + printf(" M Analyze multiple data sets?"); + if (mulsets) + printf(" Yes, %2ld %s\n", datasets, + (justwts ? 
"sets of weights" : "data sets")); + else + printf(" No\n"); + printf(" I Input sequences interleaved?"); + if (interleaved) + printf(" Yes\n"); + else + printf(" No, sequential\n"); + printf(" 0 Terminal type (IBM PC, ANSI, none)?"); + if (ibmpc) + printf(" IBM PC\n"); + if (ansi) + printf(" ANSI\n"); + if (!(ibmpc || ansi)) + printf(" (none)\n"); + printf(" 1 Print out the data at start of run"); + if (printdata) + printf(" Yes\n"); + else + printf(" No\n"); + printf(" 2 Print indications of progress of run"); + if (progress) + printf(" Yes\n"); + else + printf(" No\n"); + printf(" 3 Print out tree"); + if (treeprint) + printf(" Yes\n"); + else + printf(" No\n"); + printf(" 4 Write out trees onto tree file?"); + if (trout) + printf(" Yes\n"); + else + printf(" No\n"); + printf(" 5 Reconstruct hypothetical sequences? %s\n", + (hypstate ? "Yes" : "No")); + printf("\nAre these settings correct? "); + printf("(type Y or the letter for one to change)\n"); +#ifdef WIN32 + phyFillScreenColor(); +#endif + scanf("%c%*[^\n]", &ch); + getchar(); + if (ch == '\n') + ch = ' '; + uppercase(&ch); + done = (ch == 'Y'); + if (!done) { + uppercase(&ch); + if (strchr("UPCRJAFWGLTMI012345", ch) != NULL){ + switch (ch) { + + case 'C': + ctgry = !ctgry; + if (ctgry) { + printf("\nSitewise user-assigned categories:\n\n"); + initcatn(&categs); + if (rate){ + free(rate); + } + rate = (double *) Malloc( categs * sizeof(double)); + didchangecat = true; + initcategs(categs, rate); + } + break; + + case 'P': + if (usejtt) { + usejtt = false; + usepmb = true; + } else { + if (usepmb) { + usepmb = false; + usepam = true; + } else { + usepam = false; + usejtt = true; + } + } + break; + + case 'R': + if (!rctgry) { + rctgry = true; + gama = true; + } else { + if (gama) { + gama = false; + invar = true; + } else { + if (invar) + invar = false; + else + rctgry = false; + } + } + break; + + case 'A': + auto_ = !auto_; + if (auto_) { + initlambda(&lambda); + lambda1 = 1.0 - lambda; + } + break; + + 
case 'G': + global = !global; + break; + + case 'W': + weights = !weights; + break; + + case 'J': + jumble = !jumble; + if (jumble) + initjumble(&inseed, &inseed0, seed, &njumble); + else njumble = 1; + break; + + case 'L': + lngths = !lngths; + break; + + case 'U': + usertree = !usertree; + break; + + case 'M': + mulsets = !mulsets; + if (mulsets) { + printf("Multiple data sets or multiple weights?"); + loopcount2 = 0; + do { + printf(" (type D or W)\n"); +#ifdef WIN32 + phyFillScreenColor(); +#endif + scanf("%c%*[^\n]", &ch2); + getchar(); + if (ch2 == '\n') + ch2 = ' '; + uppercase(&ch2); + countup(&loopcount2, 10); + } while ((ch2 != 'W') && (ch2 != 'D')); + justwts = (ch2 == 'W'); + if (justwts) + justweights(&datasets); + else + initdatasets(&datasets); + if (!jumble) { + jumble = true; + initjumble(&inseed, &inseed0, seed, &njumble); + } + } + break; + + case 'I': + interleaved = !interleaved; + break; + + case '0': + initterminal(&ibmpc, &ansi); + break; + + case '1': + printdata = !printdata; + break; + + case '2': + progress = !progress; + break; + + case '3': + treeprint = !treeprint; + break; + + case '4': + trout = !trout; + break; + + case '5': + hypstate = !hypstate; + break; + } + } else + printf("Not a possible option!\n"); + } + countup(&loopcount, 100); + } while (!done); + if (gama || invar) { + loopcount = 0; + do { + printf( +"\nCoefficient of variation of substitution rate among sites (must be positive)\n"); + printf( + " In gamma distribution parameters, this is 1/(square root of alpha)\n"); +#ifdef WIN32 + phyFillScreenColor(); +#endif + scanf("%lf%*[^\n]", &cv); + getchar(); + countup(&loopcount, 10); + } while (cv <= 0.0); + alpha = 1.0 / (cv * cv); + } + if (!rctgry) + auto_ = false; + if (rctgry) { + printf("\nRates in HMM"); + if (invar) + printf(" (including one for invariant sites)"); + printf(":\n"); + initcatn(&rcategs); + if (probcat){ + free(probcat); + free(rrate); + } + probcat = (double *) Malloc(rcategs * sizeof(double)); + 
rrate = (double *) Malloc(rcategs * sizeof(double)); + didchangercat = true; + if (gama) + initgammacat(rcategs, alpha, rrate, probcat); + else { + if (invar) { + loopcount = 0; + do { + printf("Fraction of invariant sites?\n"); + scanf("%lf%*[^\n]", &invarfrac); + getchar(); + countup (&loopcount, 10); + } while ((invarfrac <= 0.0) || (invarfrac >= 1.0)); + initgammacat(rcategs-1, alpha, rrate, probcat); + for (i = 0; i < rcategs-1; i++) + probcat[i] = probcat[i]*(1.0-invarfrac); + probcat[rcategs-1] = invarfrac; + rrate[rcategs-1] = 0.0; + } else { + initcategs(rcategs, rrate); + initprobcat(rcategs, &probsum, probcat); + } + } + } + if (!didchangercat){ + rrate = Malloc( rcategs*sizeof(double)); + probcat = Malloc( rcategs*sizeof(double)); + rrate[0] = 1.0; + probcat[0] = 1.0; + } + if (!didchangecat){ + rate = Malloc( categs*sizeof(double)); + rate[0] = 1.0; + } + init_protmats(); +} /* getoptions */ + + +void makeprotfreqs() +{ + /* calculate amino acid frequencies based on eigmat */ + long i, mineig; + + mineig = 0; + for (i = 0; i <= 19; i++) + if (fabs(eigmat[i]) < fabs(eigmat[mineig])) + mineig = i; + memcpy(freqaa, probmat[mineig], 20 * sizeof(double)); + for (i = 0; i <= 19; i++) + freqaa[i] = fabs(freqaa[i]); +} /* makeprotfreqs */ + +void reallocsites() +{ + long i; + for (i = 0; i < spp; i++) + y[i] = (char *)Malloc(sites * sizeof(char)); + enterorder = (long *)Malloc(spp*sizeof(long)); + weight = (long *)Malloc(sites*sizeof(long)); + category = (long *)Malloc(sites*sizeof(long)); + alias = (long *)Malloc(sites*sizeof(long)); + aliasweight = (long *)Malloc(sites*sizeof(long)); + ally = (long *)Malloc(sites*sizeof(long)); + location = (long *)Malloc(sites*sizeof(long)); + for (i = 0; i < sites; i++) + category[i] = 1; + for (i = 0; i < sites; i++) + weight[i] = 1; + makeweights(); +} + +void allocrest() +{ + long i; + + y = (Char **)Malloc(spp*sizeof(Char *)); + nayme = (naym *)Malloc(spp*sizeof(naym)); + for (i = 0; i < spp; i++) + y[i] = (char 
*)Malloc(sites * sizeof(char)); + enterorder = (long *)Malloc(spp*sizeof(long)); + weight = (long *)Malloc(sites*sizeof(long)); + category = (long *)Malloc(sites*sizeof(long)); + alias = (long *)Malloc(sites*sizeof(long)); + aliasweight = (long *)Malloc(sites*sizeof(long)); + ally = (long *)Malloc(sites*sizeof(long)); + location = (long *)Malloc(sites*sizeof(long)); +} /* allocrest */ + + +void doinit() +{ + /* initializes variables */ + + inputnumbers(&spp, &sites, &nonodes, 1); + getoptions(); + makeprotfreqs(); + if (printdata) + fprintf(outfile, "%2ld species, %3ld sites\n", spp, sites); + alloctree(&curtree.nodep, nonodes, usertree); + allocrest(); + if (usertree) + return; + alloctree(&bestree.nodep, nonodes, 0); + if (njumble <= 1) + return; + alloctree(&bestree2.nodep, nonodes, 0); +} /* doinit */ + + +void inputoptions() +{ + long i; + + if (!firstset) { + samenumsp(&sites, ith); + reallocsites(); + } + if (firstset) { + for (i = 0; i < sites; i++) + category[i] = 1; + for (i = 0; i < sites; i++) + weight[i] = 1; + } + if (justwts || weights) + inputweights(sites, weight, &weights); + weightsum = 0; + for (i = 0; i < sites; i++) + weightsum += weight[i]; + if ((ctgry && categs > 1) && (firstset || !justwts)) { + inputcategs(0, sites, category, categs, "ProMLK"); + if (printdata) + printcategs(outfile, sites, category, "Site categories"); + } + if (weights && printdata) + printweights(outfile, 0, sites, weight, "Sites"); + fprintf(outfile, "%s model of amino acid change\n\n", + (usejtt ? "Jones-Taylor-Thornton" : + usepmb ? 
"Henikoff/Tillier PMB" : "Dayhoff PAM")); +} /* inputoptions */ + + +void input_protdata(long chars) +{ + /* input the names and sequences for each species */ + /* used by proml */ + long i, j, k, l, basesread, basesnew; + Char charstate; + boolean allread, done; + + if (printdata) + headings(chars, "Sequences", "---------"); + basesread = 0; + basesnew = 0; + allread = false; + while (!(allread)) { + /* eat white space -- if the separator line has spaces on it*/ + do { + charstate = gettc(infile); + } while (charstate == ' ' || charstate == '\t'); + ungetc(charstate, infile); + if (eoln(infile)) + scan_eoln(infile); + i = 1; + while (i <= spp) { + if ((interleaved && basesread == 0) || !interleaved) + initname(i - 1); + j = (interleaved) ? basesread : 0; + done = false; + while (!done && !eoff(infile)) { + if (interleaved) + done = true; + while (j < chars && !(eoln(infile) || eoff(infile))) { + charstate = gettc(infile); + if (charstate == '\n' || charstate == '\t') + charstate = ' '; + if (charstate == ' ' || (charstate >= '0' && charstate <= '9')) + continue; + uppercase(&charstate); + if ((strchr("ABCDEFGHIKLMNPQRSTVWXYZ*?-", charstate)) == NULL){ + printf("ERROR: bad amino acid: %c at position %ld of species %ld\n", + charstate, j, i); + if (charstate == '.') { + printf(" Periods (.) 
may not be used as gap characters.\n"); + printf(" The correct gap character is (-)\n"); + } + exxit(-1); + } + j++; + y[i - 1][j - 1] = charstate; + } + if (interleaved) + continue; + if (j < chars) + scan_eoln(infile); + else if (j == chars) + done = true; + } + if (interleaved && i == 1) + basesnew = j; + + scan_eoln(infile); + + if ((interleaved && j != basesnew) || + (!interleaved && j != chars)) { + printf("ERROR: SEQUENCES OUT OF ALIGNMENT AT POSITION %ld.\n", j); + exxit(-1); + } + i++; + } + + if (interleaved) { + basesread = basesnew; + allread = (basesread == chars); + } else + allread = (i > spp); + } + if (!printdata) + return; + for (i = 1; i <= ((chars - 1) / 60 + 1); i++) { + for (j = 1; j <= spp; j++) { + for (k = 0; k < nmlngth; k++) + putc(nayme[j - 1][k], outfile); + fprintf(outfile, " "); + l = i * 60; + if (l > chars) + l = chars; + for (k = (i - 1) * 60 + 1; k <= l; k++) { + if (j > 1 && y[j - 1][k - 1] == y[0][k - 1]) + charstate = '.'; + else + charstate = y[j - 1][k - 1]; + putc(charstate, outfile); + if (k % 10 == 0 && k % 60 != 0) + putc(' ', outfile); + } + putc('\n', outfile); + } + putc('\n', outfile); + } + putc('\n', outfile); +} /* input_protdata */ + + +void makeweights() +{ + /* make up weights vector to avoid duplicate computations */ + long i; + + for (i = 1; i <= sites; i++) { + alias[i - 1] = i; + ally[i - 1] = 0; + aliasweight[i - 1] = weight[i - 1]; + location[i - 1] = 0; + } + sitesort2(sites, aliasweight); + sitecombine2(sites, aliasweight); + sitescrunch2(sites, 1, 2, aliasweight); + for (i = 1; i <= sites; i++) { + if (aliasweight[i - 1] > 0) + endsite = i; + } + for (i = 1; i <= endsite; i++) { + ally[alias[i - 1] - 1] = alias[i - 1]; + location[alias[i - 1] - 1] = i; + } + contribution = (contribarr *) Malloc( endsite*sizeof(contribarr)); +} /* makeweights */ + + +void prot_makevalues(long categs, pointarray treenode, long endsite, + long spp, sequence y, steptr alias) +{ + /* set up fractional likelihoods at tips */ 
+ /* a version of makevalues2 found in seq.c */ + /* used by proml */ + long i, j, k, l; + long b; + + for (k = 0; k < endsite; k++) { + j = alias[k]; + for (i = 0; i < spp; i++) { + for (l = 0; l < categs; l++) { + memset(treenode[i]->protx[k][l], 0, sizeof(double)*20); + switch (y[i][j - 1]) { + + case 'A': + treenode[i]->protx[k][l][0] = 1.0; + break; + + case 'R': + treenode[i]->protx[k][l][(long)arginine - (long)alanine] = 1.0; + break; + + case 'N': + treenode[i]->protx[k][l][(long)asparagine - (long)alanine] = 1.0; + break; + + case 'D': + treenode[i]->protx[k][l][(long)aspartic - (long)alanine] = 1.0; + break; + + case 'C': + treenode[i]->protx[k][l][(long)cysteine - (long)alanine] = 1.0; + break; + + case 'Q': + treenode[i]->protx[k][l][(long)glutamine - (long)alanine] = 1.0; + break; + + case 'E': + treenode[i]->protx[k][l][(long)glutamic - (long)alanine] = 1.0; + break; + + case 'G': + treenode[i]->protx[k][l][(long)glycine - (long)alanine] = 1.0; + break; + + case 'H': + treenode[i]->protx[k][l][(long)histidine - (long)alanine] = 1.0; + break; + + case 'I': + treenode[i]->protx[k][l][(long)isoleucine - (long)alanine] = 1.0; + break; + + case 'L': + treenode[i]->protx[k][l][(long)leucine - (long)alanine] = 1.0; + break; + + case 'K': + treenode[i]->protx[k][l][(long)lysine - (long)alanine] = 1.0; + break; + + case 'M': + treenode[i]->protx[k][l][(long)methionine - (long)alanine] = 1.0; + break; + + case 'F': + treenode[i]->protx[k][l][(long)phenylalanine - (long)alanine] = 1.0; + break; + + case 'P': + treenode[i]->protx[k][l][(long)proline - (long)alanine] = 1.0; + break; + + case 'S': + treenode[i]->protx[k][l][(long)serine - (long)alanine] = 1.0; + break; + + case 'T': + treenode[i]->protx[k][l][(long)threonine - (long)alanine] = 1.0; + break; + + case 'W': + treenode[i]->protx[k][l][(long)tryptophan - (long)alanine] = 1.0; + break; + + case 'Y': + treenode[i]->protx[k][l][(long)tyrosine - (long)alanine] = 1.0; + break; + + case 'V': + 
treenode[i]->protx[k][l][(long)valine - (long)alanine] = 1.0; + break; + + case 'B': + treenode[i]->protx[k][l][(long)asparagine - (long)alanine] = 1.0; + treenode[i]->protx[k][l][(long)aspartic - (long)alanine] = 1.0; + break; + + case 'Z': + treenode[i]->protx[k][l][(long)glutamine - (long)alanine] = 1.0; + treenode[i]->protx[k][l][(long)glutamic - (long)alanine] = 1.0; + break; + + case 'X': /* unknown aa */ + for (b = 0; b <= 19; b++) + treenode[i]->protx[k][l][b] = 1.0; + break; + + case '?': /* unknown aa */ + for (b = 0; b <= 19; b++) + treenode[i]->protx[k][l][b] = 1.0; + break; + + case '*': /* stop codon symbol */ + for (b = 0; b <= 19; b++) + treenode[i]->protx[k][l][b] = 1.0; + break; + + case '-': /* deletion event-absent data or aa */ + for (b = 0; b <= 19; b++) + treenode[i]->protx[k][l][b] = 1.0; + break; + } + } + } + } +} /* prot_makevalues */ + + +void getinput() +{ + long grcategs; + + /* reads the input data */ + if (!justwts || firstset) + inputoptions(); + if (!justwts || firstset) + input_protdata(sites); + makeweights(); + setuptree2(curtree); + if (!usertree) { + setuptree2(bestree); + if (njumble > 1) + setuptree2(bestree2); + } + grcategs = (categs > rcategs) ? 
categs : rcategs; + prot_allocx(nonodes, grcategs, curtree.nodep, usertree); + if (!usertree) { + prot_allocx(nonodes, grcategs, bestree.nodep, 0); + if (njumble > 1) + prot_allocx(nonodes, grcategs, bestree2.nodep, 0); + } + prot_makevalues(rcategs, curtree.nodep, endsite, spp, y, alias); +} /* getinput */ + +void prot_freetable(void) +{ + long i,j,k,l; + for (j = 0; j < rcategs; j++) { + for (k = 0; k < categs; k++) { + for (l = 0; l < 20; l++) + free(ddpmatrix[j][k][l]); + free(ddpmatrix[j][k]); + } + free(ddpmatrix[j]); + } + free(ddpmatrix); + + for (j = 0; j < rcategs; j++) { + for (k = 0; k < categs; k++) { + for (l = 0; l < 20; l++) + free(dpmatrix[j][k][l]); + free(dpmatrix[j][k]); + } + free(dpmatrix[j]); + } + free(dpmatrix); + + + for (j = 0; j < rcategs; j++) + free(tbl[j]); + free(tbl); + + for ( i = 0 ; i < max_num_sibs ; i++ ) + free_pmatrix(i); + free(pmatrices); +} + +void prot_inittable() +{ + /* Define a lookup table. Precompute values and print them out in tables */ + /* Allocate memory for the pmatrices, dpmatices and ddpmatrices */ + long i, j, k, l; + double sumrates; + + /* Allocate memory for pmatrices, the array of pointers to pmatrices */ + + pmatrices = (double *****) Malloc (spp * sizeof(double ****)); + + /* Allocate memory for the first 2 pmatrices, the matrix of conversion */ + /* probabilities, but only once per run (aka not on the second jumble. 
*/ + + alloc_pmatrix(0); + alloc_pmatrix(1); + + /* Allocate memory for one dpmatrix, the first derivative matrix */ + + dpmatrix = (double ****) Malloc( rcategs * sizeof(double ***)); + for (j = 0; j < rcategs; j++) { + dpmatrix[j] = (double ***) Malloc( categs * sizeof(double **)); + for (k = 0; k < categs; k++) { + dpmatrix[j][k] = (double **) Malloc( 20 * sizeof(double *)); + for (l = 0; l < 20; l++) + dpmatrix[j][k][l] = (double *) Malloc( 20 * sizeof(double)); + } + } + + /* Allocate memory for one ddpmatrix, the second derivative matrix */ + ddpmatrix = (double ****) Malloc( rcategs * sizeof(double ***)); + for (j = 0; j < rcategs; j++) { + ddpmatrix[j] = (double ***) Malloc( categs * sizeof(double **)); + for (k = 0; k < categs; k++) { + ddpmatrix[j][k] = (double **) Malloc( 20 * sizeof(double *)); + for (l = 0; l < 20; l++) + ddpmatrix[j][k][l] = (double *) Malloc( 20 * sizeof(double)); + } + } + + /* Allocate memory and assign values to tbl, the matrix of possible rates*/ + + tbl = (double **) Malloc( rcategs * sizeof(double *)); + for (j = 0; j < rcategs; j++) + tbl[j] = (double *) Malloc( categs * sizeof(double)); + + for (j = 0; j < rcategs; j++) + for (k = 0; k < categs; k++) + tbl[j][k] = rrate[j]*rate[k]; + + sumrates = 0.0; + for (i = 0; i < endsite; i++) { + for (j = 0; j < rcategs; j++) + sumrates += aliasweight[i] * probcat[j] + * tbl[j][category[alias[i] - 1] - 1]; + } + sumrates /= (double)sites; + for (j = 0; j < rcategs; j++) + for (k = 0; k < categs; k++) { + tbl[j][k] /= sumrates; + } + + if(jumb > 1) + return; + + if (gama || invar) { + fprintf(outfile, "\nDiscrete approximation to gamma distributed rates\n"); + fprintf(outfile, + " Coefficient of variation of rates = %f (alpha = %f)\n", cv, alpha); + } + if (rcategs > 1) { + fprintf(outfile, "\nState in HMM Rate of change Probability\n\n"); + for (i = 0; i < rcategs; i++) + if (probcat[i] < 0.0001) + fprintf(outfile, "%9ld%16.3f%20.6f\n", i+1, rrate[i], probcat[i]); + else if (probcat[i] 
< 0.001) + fprintf(outfile, "%9ld%16.3f%19.5f\n", i+1, rrate[i], probcat[i]); + else if (probcat[i] < 0.01) + fprintf(outfile, "%9ld%16.3f%18.4f\n", i+1, rrate[i], probcat[i]); + else + fprintf(outfile, "%9ld%16.3f%17.3f\n", i+1, rrate[i], probcat[i]); + putc('\n', outfile); + if (auto_) { + fprintf(outfile, + "Expected length of a patch of sites having the same rate = %8.3f\n", + 1/lambda); + putc('\n', outfile); + } + } + if (categs > 1) { + fprintf(outfile, "\nSite category Rate of change\n\n"); + for (k = 0; k < categs; k++) + fprintf(outfile, "%9ld%16.3f\n", k+1, rate[k]); + fprintf(outfile, "\n\n"); + } +} /* prot_inittable */ + +void free_pmatrix(long sib) +{ + long j,k,l; + + for (j = 0; j < rcategs; j++) { + for (k = 0; k < categs; k++) { + for (l = 0; l < 20; l++) + free(pmatrices[sib][j][k][l]); + free(pmatrices[sib][j][k]); + } + free(pmatrices[sib][j]); + } + free(pmatrices[sib]); +} + +void alloc_pmatrix(long sib) +{ + /* Allocate memory for a new pmatrix. Called iff num_sibs>max_num_sibs */ + long j, k, l; + double ****temp_matrix; + + temp_matrix = (double ****) Malloc (rcategs * sizeof(double ***)); + for (j = 0; j < rcategs; j++) { + temp_matrix[j] = (double ***) Malloc(categs * sizeof(double **)); + for (k = 0; k < categs; k++) { + temp_matrix[j][k] = (double **) Malloc(20 * sizeof (double *)); + for (l = 0; l < 20; l++) + temp_matrix[j][k][l] = (double *) Malloc(20 * sizeof(double)); + } + } + pmatrices[sib] = temp_matrix; + max_num_sibs++; +} /* alloc_pmatrix */ + + +void make_pmatrix(double **matrix, double **dmat, double **ddmat, + long derivative, double lz, double rat, + double *eigmat, double **probmat) +{ + /* Computes the R matrix such that matrix[m][l] is the joint probability */ + /* of m and l. */ + /* Computes a P matrix such that matrix[m][l] is the conditional */ + /* probability of m given l. This is accomplished by dividing all terms */ + /* in the R matrix by freqaa[m], the frequency of l. 
*/ + + long k, l, m; /* (l) original character state */ + /* (m) final character state */ + /* (k) lambda counter */ + double p0, p1, p2, q; + double elambdat[20], delambdat[20], ddelambdat[20]; + /* exponential term for matrix */ + /* and both derivative matrices */ + + for (k = 0; k <= 19; k++) { + elambdat[k] = exp(lz * rat * eigmat[k]); + if(derivative != 0) { + delambdat[k] = (elambdat[k] * rat * eigmat[k]); + ddelambdat[k] = (delambdat[k] * rat * eigmat[k]); + } + } + for (m = 0; m <= 19; m++) { + for (l = 0; l <= 19; l++) { + p0 = 0.0; + p1 = 0.0; + p2 = 0.0; + for (k = 0; k <= 19; k++) { + q = probmat[k][m] * probmat[k][l]; + p0 += (q * elambdat[k]); + if(derivative !=0) { + p1 += (q * delambdat[k]); + p2 += (q * ddelambdat[k]); + } + } + matrix[m][l] = p0 / freqaa[m]; + if(derivative != 0) { + dmat[m][l] = p1 / freqaa[m]; + ddmat[m][l] = p2 / freqaa[m]; + } + } + } +} /* make_pmatrix */ + + +void prot_nuview(node *p) +{ + long b, i, j, k, l, m, num_sibs, sib_index; + node *sib_ptr, *sib_back_ptr; + psitelike prot_xx, x2; + double lw, prod7; + double **pmat; + double maxx,correction; + + /* Figure out how many siblings the current node has */ + /* and be sure that pmatrices is large enough */ + num_sibs = count_sibs(p); + for (i = 0; i < num_sibs; i++) + if (pmatrices[i] == NULL) + alloc_pmatrix(i); + + /* Recursive calls, should be called for all children */ + sib_ptr = p; + for (i=0 ; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + sib_back_ptr = sib_ptr->back; + + if (!(sib_back_ptr == NULL)) + if (!sib_back_ptr->tip && !sib_back_ptr->initialized) + prot_nuview(sib_back_ptr); + } + + /* Make pmatrices for all possible combinations of category, rcateg */ + /* and sib */ + sib_ptr = p; /* return to p */ + for (sib_index=0; sib_index < num_sibs; sib_index++) { + sib_ptr = sib_ptr->next; + sib_back_ptr = sib_ptr->back; + + if (sib_back_ptr != NULL) + lw = fabs(p->tyme - sib_back_ptr->tyme); + else + lw = 0.0; + + for (j = 0; j < rcategs; j++) + for (k = 0; 
k < categs; k++) + make_pmatrix(pmatrices[sib_index][j][k], NULL, NULL, 0, lw, + tbl[j][k], eigmat, probmat); + } + + for (i = 0; i < endsite; i++) { + correction = 0; + maxx = 0; + k = category[alias[i]-1] - 1; + for (j = 0; j < rcategs; j++) { + + /* initialize to 1 all values of prot_xx */ + for (m = 0; m <= 19; m++) + prot_xx[m] = 1; + + sib_ptr = p; /* return to p */ + /* loop through all sibs and calculate likelihoods for all possible*/ + /* amino acid combinations */ + for (sib_index=0; sib_index < num_sibs; sib_index++) { + sib_ptr = sib_ptr->next; + sib_back_ptr = sib_ptr->back; + + + if (sib_back_ptr != NULL) { + memcpy(x2, sib_back_ptr->protx[i][j], sizeof(psitelike)); + if ( j == 0 ) + correction += sib_back_ptr->underflows[i]; + } + + else + for (b = 0; b <= 19; b++) + x2[b] = 1.0; + pmat = pmatrices[sib_index][j][k]; + for (m = 0; m <= 19; m++) { + prod7 = 0; + for (l = 0; l <= 19; l++) + prod7 += (pmat[m][l] * x2[l]); + prot_xx[m] *= prod7; + if ( prot_xx[m] > maxx && sib_index == (num_sibs - 1 )) + maxx = prot_xx[m]; + } + } + /* And the final point of this whole function: */ + memcpy(p->protx[i][j], prot_xx, sizeof(psitelike)); + } + p->underflows[i] = 0; + if ( maxx < MIN_DOUBLE ) + fix_protx(p,i,maxx,rcategs); + p->underflows[i] += correction; + } + + p->initialized = true; +} /* prot_nuview */ + + +void getthree(node *p, double thigh, double tlow) +{ + /* compute likelihood at a new triple of points */ + int i; + double tt = p->tyme; + double td = fabs(tdelta); + + x[0] = tt - td; + x[1] = tt; + x[2] = tt + td; + + if ( x[0] < tlow + epsilon ) { + x[0] = tlow + epsilon; + x[1] = ( x[0] + x[2] ) / 2; + } + + if ( x[2] > thigh - epsilon ) { + x[2] = thigh - epsilon; + x[1] = ( x[0] + x[2] ) / 2; + } + + for ( i = 0 ; i < 3 ; i++ ) { + p->tyme = x[i]; + prot_nuview(p); + lnl[i] = prot_evaluate(p); + } +} /* getthree */ + +void makenewv(node *p) +{ + /* improve a node time */ + long it, imin, imax, i, num_sibs; + double tt, tfactor, tlow, thigh, 
oldlike, ymin, ymax, s32, s21, yold; + boolean done, already; + node *s, *sdown, *sib_ptr, *sib_back_ptr; + + s = curtree.nodep[p->index - 1]; + sdown = s->back; + if (s == curtree.root) + tlow = -10.0; + else + tlow = sdown->tyme; + + sib_ptr = s; + num_sibs = count_sibs(p); + + thigh = s->next->back->tyme; + for (i=0 ; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + sib_back_ptr = sib_ptr->back; + if (sib_back_ptr->tyme < thigh) + thigh = sib_back_ptr->tyme; + } + done = (thigh - tlow < 4.0*epsilon); + it = 1; + if (s != curtree.root) + tdelta = (thigh - tlow) / 10.0; + else + tdelta = (thigh - s->tyme) / 5.0; + tfactor = 1.0; + if (!done) + getthree(s, thigh, tlow); + while (it < iterations && !done) { + ymax = lnl[0]; + imax = 1; + for (i = 2; i <= 3; i++) { + if (lnl[i - 1] > ymax) { + ymax = lnl[i - 1]; + imax = i; + } + } + if (imax != 2) { + ymax = x[1]; + x[1] = x[imax - 1]; + x[imax - 1] = ymax; + ymax = lnl[1]; + lnl[1] = lnl[imax - 1]; + lnl[imax - 1] = ymax; + } + tt = x[1]; + oldlike = lnl[1]; + yold = tt; + s32 = (lnl[2] - lnl[1]) / (x[2] - x[1]); + s21 = (lnl[1] - lnl[0]) / (x[1] - x[0]); + if (fabs(x[2] - x[0]) > epsilon) + curv = (s32 - s21) / ((x[2] - x[0]) / 2); + else + curv = 0.0; + slope = (s32 + s21) / 2 - curv * (x[2] - 2 * x[1] + x[0]) / 4; + if (curv >= 0.0) { + if (slope < 0) + tdelta = -fabs(tdelta); + else + tdelta = fabs(tdelta); + } else + tdelta = -(tfactor * slope / curv); + if (tt + tdelta <= tlow + epsilon) + tdelta = tlow + epsilon - tt; + if (tt + tdelta >= thigh - epsilon) + tdelta = thigh - epsilon - tt; + tt += tdelta; + done = (fabs(yold - tt) < epsilon || fabs(tdelta) < epsilon); + s->tyme = tt; + prot_nuview(s); + lnlike = prot_evaluate(s); + ymin = lnl[0]; + imin = 1; + for (i = 2; i <= 3; i++) { + if (lnl[i - 1] < ymin) { + ymin = lnl[i - 1]; + imin = i; + } + } + already = (tt == x[0]) || (tt == x[1]) || (tt == x[2]); + if (!already && ymin < lnlike) { + x[imin - 1] = tt; + lnl[imin - 1] = lnlike; + } + if (already 
|| lnlike < oldlike) { + tt = x[1]; + tfactor /= 2; + tdelta /= 2; + curtree.likelihood = oldlike; + lnlike = oldlike; + } else + tfactor = 1.0; + + if (!done) { + sib_ptr = p; + num_sibs = count_sibs(p); + p->tyme = tt; + for (i=0 ; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + sib_ptr->tyme = tt; + } + + sib_ptr = p; + prot_nuview(p); + for (i=0 ; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + prot_nuview(sib_ptr); + } + } + + it++; + } + sib_ptr = p; + for (i=0 ; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + inittrav (sib_ptr); + } + smoothed = smoothed && done; +} /* makenewv */ + + +void update(node *p) +{ + node *sib_ptr, *sib_back_ptr; + long i, num_sibs; + + /* improve time and recompute views at a node */ + if (p == NULL) + return; + if (p->back != NULL) { + if (!p->back->tip && !p->back->initialized) + prot_nuview(p->back); + } + + sib_ptr = p; + num_sibs = count_sibs(p); + for (i=0 ; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + sib_back_ptr = sib_ptr->back; + if (sib_back_ptr != NULL) { + if (!sib_back_ptr->tip && !sib_back_ptr->initialized) + prot_nuview(sib_back_ptr); + } + } + + if ((!usertree) || (usertree && !lngths) || p->iter) { + makenewv(p); + return; + } + prot_nuview(p); + + sib_ptr = p; + num_sibs = count_sibs(p); + for (i=0 ; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + prot_nuview(sib_ptr); + } +} /* update */ + + +void smooth(node *p) +{ + node *sib_ptr; + long i, num_sibs; + + if (p == NULL) + return; + if (p->tip) + return; + + update(p); + + smoothed = false; + sib_ptr = p; + num_sibs = count_sibs(p); + for (i=0; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + if (polishing || (smoothit && !smoothed)) { + smooth(sib_ptr->back); + p->initialized = false; + sib_ptr->initialized = false; + } + update(p); + } +} /* smooth */ + + +void promlk_add(node *below, node *newtip, node *newfork, boolean tempadd) +{ + /* inserts the nodes newfork and its descendant, newtip, into the tree. 
*/ + long i; + boolean done; + node *p; + + below = curtree.nodep[below->index - 1]; + newfork = curtree.nodep[newfork->index-1]; + newtip = curtree.nodep[newtip->index-1]; + if (below->back != NULL) + below->back->back = newfork; + newfork->back = below->back; + below->back = newfork->next->next; + newfork->next->next->back = below; + newfork->next->back = newtip; + newtip->back = newfork->next; + if (newtip->tyme < below->tyme) + p = newtip; + else p = below; + newfork->tyme = p->tyme; + if (curtree.root == below) + curtree.root = newfork; + if (newfork->back != NULL) { + if (p->tyme > newfork->back->tyme) + newfork->tyme = (p->tyme + newfork->back->tyme) / 2.0; + else newfork->tyme = p->tyme - epsilon; + newfork->next->tyme = newfork->tyme; + newfork->next->next->tyme = newfork->tyme; + do { + p = curtree.nodep[p->back->index - 1]; + done = (p == curtree.root); + if (!done) + done = (curtree.nodep[p->back->index - 1]->tyme < p->tyme - epsilon); + if (!done) { + curtree.nodep[p->back->index - 1]->tyme = p->tyme - epsilon; + curtree.nodep[p->back->index - 1]->next->tyme = p->tyme - epsilon; + curtree.nodep[p->back->index - 1]->next->next->tyme = p->tyme - epsilon; + } + } while (!done); + } else { + newfork->tyme = newfork->tyme - 2*epsilon; + newfork->next->tyme = newfork->tyme; + newfork->next->next->tyme = newfork->tyme; + } + inittrav(newtip); + inittrav(newtip->back); + smoothed = false; + i = 1; + while (i < smoothings && !smoothed) { + smoothed = true; + smooth(newfork); + smooth(newfork->back); + i++; + } +} /* promlk_add */ + + +void promlk_re_move(node **item, node **fork, boolean tempadd) +{ + /* removes nodes item and its ancestor, fork, from the tree. + the new descendant of fork's ancestor is made to be + fork's second descendant (other than item). 
Also + returns pointers to the deleted nodes, item and fork */ + node *p, *q; + long i; + + if ((*item)->back == NULL) { + *fork = NULL; + return; + } + *item = curtree.nodep[(*item)->index-1]; + *fork = curtree.nodep[(*item)->back->index - 1]; + if (curtree.root == *fork) { + if (*item == (*fork)->next->back) + curtree.root = (*fork)->next->next->back; + else + curtree.root = (*fork)->next->back; + } + p = (*item)->back->next->back; + q = (*item)->back->next->next->back; + if (p != NULL) + p->back = q; + if (q != NULL) + q->back = p; + (*fork)->back = NULL; + p = (*fork)->next; + while (p != *fork) { + p->back = NULL; + p = p->next; + } + (*item)->back = NULL; + inittrav(p); + inittrav(q); + if (tempadd) + return; + i = 1; + while (i <= smoothings) { + smooth(q); + if (smoothit) + smooth(q->back); + i++; + } +} /* promlk_re_move */ + + +double prot_evaluate(node *p) +{ + contribarr tterm; + static contribarr like, nulike, clai; + double sum, sum2, sumc=0, y, prod4, prodl, frexm, sumterm, lterm; + double **pmat; + long i, j, k, l, m, lai; + node *q, *r; + psitelike x1, x2; + + sum = 0.0; + + if (p == curtree.root && (count_sibs(p) == 2)) { + r = p->next->back; + q = p->next->next->back; + y = r->tyme + q->tyme - 2 * p->tyme; + if (!r->tip && !r->initialized) prot_nuview (r); + if (!q->tip && !q->initialized) prot_nuview (q); + } else if (p == curtree.root) { + /* the next two lines copy tyme and x to p->next. Normally they are + not initialized for an internal node. 
*/ + /* assumes bifurcation */ + p->next->tyme = p->tyme; + prot_nuview(p->next); + r = p->next; + q = p->next->back; + y = fabs(p->next->tyme - q->tyme); + } else { + r = p; + q = p->back; + if (!r->tip && !r->initialized) prot_nuview (r); + if (!q->tip && !q->initialized) prot_nuview (q); + y = fabs(r->tyme - q->tyme); + } + + for (j = 0; j < rcategs; j++) + for (k = 0; k < categs; k++) + make_pmatrix(pmatrices[0][j][k],NULL,NULL,0,y,tbl[j][k],eigmat,probmat); + for (i = 0; i < endsite; i++) { + k = category[alias[i]-1] - 1; + for (j = 0; j < rcategs; j++) { + memcpy(x1, r->protx[i][j], sizeof(psitelike)); + memcpy(x2, q->protx[i][j], sizeof(psitelike)); + prod4 = 0.0; + pmat = pmatrices[0][j][k]; + for (m = 0; m <= 19; m++) { + prodl = 0.0; + for (l = 0; l <= 19; l++) + prodl += (pmat[m][l] * x2[l]); + frexm = x1[m] * freqaa[m]; + prod4 += (prodl * frexm); + } + tterm[j] = prod4; + } + sumterm = 0.0; + for (j = 0; j < rcategs; j++) + sumterm += probcat[j] * tterm[j]; + if (sumterm < 0.0) + sumterm = 0.00000001; /* ??? 
*/ + lterm = log(sumterm) + p->underflows[i] + q->underflows[i]; + for (j = 0; j < rcategs; j++) + clai[j] = tterm[j] / sumterm; + memcpy(contribution[i], clai, rcategs * sizeof(double)); + if (!auto_ && usertree && (which <= shimotrees)) + l0gf[which - 1][i] = lterm; + sum += aliasweight[i] * lterm; + } + if (auto_) { + for (j = 0; j < rcategs; j++) + like[j] = 1.0; + for (i = 0; i < sites; i++) { + sumc = 0.0; + for (k = 0; k < rcategs; k++) + sumc += probcat[k] * like[k]; + sumc *= lambda; + if ((ally[i] > 0) && (location[ally[i]-1] > 0)) { + lai = location[ally[i] - 1]; + memcpy(clai, contribution[lai - 1], rcategs*sizeof(double)); + for (j = 0; j < rcategs; j++) + nulike[j] = ((1.0 - lambda) * like[j] + sumc) * clai[j]; + } else { + for (j = 0; j < rcategs; j++) + nulike[j] = ((1.0 - lambda) * like[j] + sumc); + } + memcpy(like, nulike, rcategs * sizeof(double)); + } + sum2 = 0.0; + for (i = 0; i < rcategs; i++) + sum2 += probcat[i] * like[i]; + sum += log(sum2); + } + curtree.likelihood = sum; + if (auto_ || !usertree) + return sum; + if(which <= shimotrees) + l0gl[which - 1] = sum; + if (which == 1) { + maxwhich = 1; + maxlogl = sum; + return sum; + } + if (sum > maxlogl) { + maxwhich = which; + maxlogl = sum; + } + return sum; +} /* prot_evaluate */ + + +void tryadd(node *p, node **item, node **nufork) +{ /* temporarily adds one fork and one tip to the tree. + if the location where they are added yields greater + likelihood than other locations tested up to that + time, then keeps that location as there */ + + long grcategs; + grcategs = (categs > rcategs) ? 
categs : rcategs; + + promlk_add(p, *item, *nufork, true); + like = prot_evaluate(p); + if (lastsp) { + if (like >= bestyet || bestyet == UNDEFINED) + prot_copy_(&curtree, &bestree, nonodes, grcategs); + } + if (like > bestyet || bestyet == UNDEFINED) { + bestyet = like; + there = p; + } + promlk_re_move(item, nufork, true); +} /* tryadd */ + + +void addpreorder(node *p, node *item_, node *nufork_, boolean contin, + boolean continagain) +{ + /* traverses a binary tree, calling function tryadd + at a node before calling tryadd at its descendants */ + node *item, *nufork; + + item = item_; + nufork = nufork_; + if (p == NULL) + return; + tryadd(p, &item, &nufork); + contin = continagain; + if ((!p->tip) && contin) { + addpreorder(p->next->back, item, nufork, contin, continagain); + addpreorder(p->next->next->back, item, nufork, contin, continagain); + } +} /* addpreorder */ + + +void restoradd(node *below, node *newtip, node *newfork, double prevtyme) +{ +/* restore "new" tip and fork to place "below". restore tymes */ +/* assumes bifurcation */ + hookup(newfork, below->back); + hookup(newfork->next, below); + hookup(newtip, newfork->next->next); + curtree.nodep[newfork->index-1] = newfork; + newfork->tyme = prevtyme; +/* assumes bifurcations */ + newfork->next->tyme = prevtyme; + newfork->next->next->tyme = prevtyme; +} /* restoradd */ + + +void tryrearr(node *p, boolean *success) +{ + /* evaluates one rearrangement of the tree. + if the new tree has greater likelihood than the old + one sets success = TRUE and keeps the new tree. 
+ otherwise, restores the old tree */ + node *frombelow, *whereto, *forknode; + double oldlike, prevtyme; + boolean wasonleft; + + if (p == curtree.root) + return; + forknode = curtree.nodep[p->back->index - 1]; + if (forknode == curtree.root) + return; + oldlike = bestyet; + prevtyme = forknode->tyme; +/* the following statement presumes bifurcating tree */ + if (forknode->next->back == p) { + frombelow = forknode->next->next->back; + wasonleft = true; + } + else { + frombelow = forknode->next->back; + wasonleft = false; + } + whereto = curtree.nodep[forknode->back->index - 1]; + promlk_re_move(&p, &forknode, true); + promlk_add(whereto, p, forknode, true); + like = prot_evaluate(p); + if (like <= oldlike && oldlike != UNDEFINED) { + promlk_re_move(&p, &forknode, true); + restoradd(frombelow, p, forknode, prevtyme); + if (wasonleft && (forknode->next->next->back == p)) { + hookup (forknode->next->back, forknode->next->next); + hookup (forknode->next, p); + } + curtree.likelihood = oldlike; + inittrav(forknode); + inittrav(forknode->next); + inittrav(forknode->next->next); + } else { + (*success) = true; + bestyet = like; + } +} /* tryrearr */ + + +void repreorder(node *p, boolean *success) +{ + /* traverses a binary tree, calling function tryrearr + at a node before calling tryrearr at its descendants */ + if (p == NULL) + return; + tryrearr(p, success); + if (p->tip) + return; + if (!(*success)) + repreorder(p->next->back, success); + if (!(*success)) + repreorder(p->next->next->back, success); +} /* repreorder */ + + +void rearrange(node **r) +{ + /* traverses the tree (preorder), finding any local + rearrangement which increases the likelihood. 
+ if traversal succeeds in increasing the tree's + likelihood, function rearrange runs traversal again */ + boolean success; + success = true; + while (success) { + success = false; + repreorder(*r, &success); + } +} /* rearrange */ + + +void nodeinit(node *p) +{ + /* set up times at one node */ + node *sib_ptr, *sib_back_ptr; + long i, num_sibs; + double lowertyme; + + sib_ptr = p; + num_sibs = count_sibs(p); + + /* lowertyme = lowest of children's times */ + lowertyme = p->next->back->tyme; + for (i=0 ; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + sib_back_ptr = sib_ptr->back; + if (sib_back_ptr->tyme < lowertyme) + lowertyme = sib_back_ptr->tyme; + } + + p->tyme = lowertyme - 0.1; + + sib_ptr = p; + for (i=0 ; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + sib_back_ptr = sib_ptr->back; + + sib_ptr->tyme = p->tyme; + sib_back_ptr->v = sib_back_ptr->tyme - p->tyme; + sib_ptr->v = sib_back_ptr->v; + } +} /* nodeinit */ + + +void initrav(node *p) +{ + + long i, num_sibs; + node *sib_ptr, *sib_back_ptr; + + /* traverse to set up times throughout tree */ + if (p->tip) + return; + + sib_ptr = p; + num_sibs = count_sibs(p); + for (i=0 ; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + sib_back_ptr = sib_ptr->back; + initrav(sib_back_ptr); + } + + nodeinit(p); +} /* initrav */ + + +void travinit(node *p) +{ + long i, num_sibs; + node *sib_ptr, *sib_back_ptr; + + /* traverse to set up initial values */ + if (p == NULL) + return; + if (p->tip) + return; + if (p->initialized) + return; + + + sib_ptr = p; + num_sibs = count_sibs(p); + for (i=0 ; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + sib_back_ptr = sib_ptr->back; + travinit(sib_back_ptr); + } + + prot_nuview(p); + p->initialized = true; +} /* travinit */ + + +void travsp(node *p) +{ + long i, num_sibs; + node *sib_ptr, *sib_back_ptr; + + /* traverse to find tips */ + if (p == curtree.root) + travinit(p); + if (p->tip) + travinit(p->back); + else { + sib_ptr = p; + num_sibs = count_sibs(p); + for (i=0 ; i 
< num_sibs; i++) { + sib_ptr = sib_ptr->next; + sib_back_ptr = sib_ptr->back; + travsp(sib_back_ptr); + } + } +} /* travsp */ + + +void treevaluate() +{ + /* evaluate likelihood of tree, after iterating branch lengths */ + long i, j, num_sibs; + node *sib_ptr; + + polishing = true; + smoothit = true; + for (i = 0; i < spp; i++) + curtree.nodep[i]->initialized = false; + for (i = spp; i < nonodes; i++) { + sib_ptr = curtree.nodep[i]; + sib_ptr->initialized = false; + num_sibs = count_sibs(sib_ptr); + for (j=0 ; j < num_sibs; j++) { + sib_ptr = sib_ptr->next; + sib_ptr->initialized = false; + } + + } + if (!lngths) + initrav(curtree.root); + travsp(curtree.root); + for (i = 1; i <= smoothings * 4; i++) + smooth(curtree.root); + prot_evaluate(curtree.root); +} /* treevaluate */ + + +void promlk_coordinates(node *p, long *tipy) +{ + /* establishes coordinates of nodes */ + node *q, *first, *last, *pp1 =NULL, *pp2 =NULL; + long num_sibs, p1, p2, i; + + if (p->tip) { + p->xcoord = 0; + p->ycoord = (*tipy); + p->ymin = (*tipy); + p->ymax = (*tipy); + (*tipy) += down; + return; + } + q = p->next; + do { + promlk_coordinates(q->back, tipy); + q = q->next; + } while (p != q); + num_sibs = count_sibs(p); + p1 = (long)((num_sibs+1)/2.0); + p2 = (long)((num_sibs+2)/2.0); + i = 1; + q = p->next; + first = q->back; + do { + if (i == p1) pp1 = q->back; + if (i == p2) pp2 = q->back; + last = q->back; + q = q->next; + i++; + } while (q != p); + p->xcoord = (long)(0.5 - over * p->tyme); + p->ycoord = (pp1->ycoord + pp2->ycoord) / 2; + p->ymin = first->ymin; + p->ymax = last->ymax; +} /* promlk_coordinates */ + + +void promlk_drawline(long i, double scale) +{ + /* draws one row of the tree diagram by moving up tree */ + node *p, *q, *r, *first =NULL, *last =NULL; + long n, j; + boolean extra, done; + + p = curtree.root; + q = curtree.root; + extra = false; + if ((long)(p->ycoord) == i) { + if (p->index - spp >= 10) + fprintf(outfile, "-%2ld", p->index - spp); + else + fprintf(outfile, 
"--%ld", p->index - spp); + extra = true; + } else + fprintf(outfile, " "); + do { + if (!p->tip) { + r = p->next; + done = false; + do { + if (i >= r->back->ymin && i <= r->back->ymax) { + q = r->back; + done = true; + } + r = r->next; + } while (!(done || r == p)); + first = p->next->back; + r = p->next; + while (r->next != p) + r = r->next; + last = r->back; + } + done = (p == q); + n = (long)(scale * ((long)(p->xcoord) - (long)(q->xcoord)) + 0.5); + if (n < 3 && !q->tip) + n = 3; + if (extra) { + n--; + extra = false; + } + if ((long)(q->ycoord) == i && !done) { + if (p->ycoord != q->ycoord) + putc('+', outfile); + else + putc('-', outfile); + if (!q->tip) { + for (j = 1; j <= n - 2; j++) + putc('-', outfile); + if (q->index - spp >= 10) + fprintf(outfile, "%2ld", q->index - spp); + else + fprintf(outfile, "-%ld", q->index - spp); + extra = true; + } else { + for (j = 1; j < n; j++) + putc('-', outfile); + } + } else if (!p->tip) { + if ((long)(last->ycoord) > i && (long)(first->ycoord) < i && + i != (long)(p->ycoord)) { + putc('!', outfile); + for (j = 1; j < n; j++) + putc(' ', outfile); + } else { + for (j = 1; j <= n; j++) + putc(' ', outfile); + } + } else { + for (j = 1; j <= n; j++) + putc(' ', outfile); + } + if (p != q) + p = q; + } while (!done); + if ((long)(p->ycoord) == i && p->tip) { + for (j = 0; j < nmlngth; j++) + putc(nayme[p->index - 1][j], outfile); + } + putc('\n', outfile); +} /* promlk_drawline */ + + +void promlk_printree() +{ + /* prints out diagram of the tree */ + long tipy; + double scale; + long i; + node *p; + + if (!treeprint) + return; + putc('\n', outfile); + tipy = 1; + promlk_coordinates(curtree.root, &tipy); + p = curtree.root; + while (!p->tip) + p = p->next->back; + scale = 1.0 / (long)(p->tyme - curtree.root->tyme + 1.000); + putc('\n', outfile); + for (i = 1; i <= tipy - down; i++) + promlk_drawline(i, scale); + putc('\n', outfile); +} /* promlk_printree */ + + +void describe(node *p) +{ + long i, num_sibs; + node 
*sib_ptr, *sib_back_ptr; + double v; + + if (p == curtree.root) + fprintf(outfile, " root "); + else + fprintf(outfile, "%4ld ", p->back->index - spp); + if (p->tip) { + for (i = 0; i < nmlngth; i++) + putc(nayme[p->index - 1][i], outfile); + } else + fprintf(outfile, "%4ld ", p->index - spp); + if (p != curtree.root) { + fprintf(outfile, "%11.5f", (p->tyme - curtree.root->tyme)); + v = (p->tyme - curtree.nodep[p->back->index - 1]->tyme); + fprintf(outfile, "%13.5f", v); + } + putc('\n', outfile); + if (!p->tip) { + + sib_ptr = p; + num_sibs = count_sibs(p); + for (i=0 ; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + sib_back_ptr = sib_ptr->back; + describe(sib_back_ptr); + } + } +} /* describe */ + + +void prot_reconstr(node *p, long n) +{ + /* reconstruct and print out acid at site n+1 at node p */ + long i, j, k, first, num_sibs = 0; + double f, sum, xx[20]; + node *q = NULL; + + if (p->tip) + putc(y[p->index-1][n], outfile); + else { + num_sibs = count_sibs(p); + if ((ally[n] == 0) || (location[ally[n]-1] == 0)) + putc('.', outfile); + else { + j = location[ally[n]-1] - 1; + sum = 0; + for (i = 0; i <= 19; i++) { + f = p->protx[j][mx-1][i]; + if (!p->tip) { + q = p; + for (k = 0; k < num_sibs; k++) { + q = q->next; + f *= q->protx[j][mx-1][i]; + } + } + f = sqrt(f); + xx[i] = f * freqaa[i]; + sum += xx[i]; + } + for (i = 0; i <= 19; i++) + xx[i] /= sum; + first = 0; + for (i = 0; i <= 19; i++) + if (xx[i] > xx[first]) + first = i; + if (xx[first] > 0.95) + putc(aachar[first], outfile); + else + putc(tolower(aachar[first]), outfile); + if (rctgry && rcategs > 1) + mx = mp[n][mx - 1]; + else + mx = 1; + } + } +} /* prot_reconstr */ + + +void rectrav(node *p, long m, long n) +{ + /* print out segment of reconstructed sequence for one branch */ + long num_sibs, i; + node *sib_ptr; + + putc(' ', outfile); + if (p->tip) { + for (i = 0; i < nmlngth; i++) + putc(nayme[p->index-1][i], outfile); + } else + fprintf(outfile, "%4ld ", p->index - spp); + fprintf(outfile, 
" "); + mx = mx0; + for (i = m; i <= n; i++) { + if ((i % 10 == 0) && (i != m)) + putc(' ', outfile); + prot_reconstr(p, i); + } + putc('\n', outfile); + if (!p->tip) { + num_sibs = count_sibs(p); + sib_ptr = p; + for (i = 0; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + rectrav(sib_ptr->back, m, n); + } + } + mx1 = mx; +} /* rectrav */ + + +void summarize() +{ + long i, j, mm; + double mode, sum; + double like[maxcategs], nulike[maxcategs]; + double **marginal; + + mp = (long **)Malloc(sites * sizeof(long *)); + for (i = 0; i <= sites-1; ++i) + mp[i] = (long *)Malloc(sizeof(long)*rcategs); + fprintf(outfile, "\nLn Likelihood = %11.5f\n\n", curtree.likelihood); + fprintf(outfile, " Ancestor Node Node Height Length\n"); + fprintf(outfile, " -------- ---- ---- ------ ------\n"); + describe(curtree.root); + putc('\n', outfile); + if (rctgry && rcategs > 1) { + for (i = 0; i < rcategs; i++) + like[i] = 1.0; + for (i = sites - 1; i >= 0; i--) { + sum = 0.0; + for (j = 0; j < rcategs; j++) { + nulike[j] = (lambda1 + lambda * probcat[j]) * like[j]; + mp[i][j] = j + 1; + for (k = 1; k <= rcategs; k++) { + if (k != j + 1) { + if (lambda * probcat[k - 1] * like[k - 1] > nulike[j]) { + nulike[j] = lambda * probcat[k - 1] * like[k - 1]; + mp[i][j] = k; + } + } + } + if ((ally[i] > 0) && (location[ally[i]-1] > 0)) + nulike[j] *= contribution[location[ally[i] - 1] - 1][j]; + sum += nulike[j]; + } + for (j = 0; j < rcategs; j++) + nulike[j] /= sum; + memcpy(like, nulike, rcategs * sizeof(double)); + } + mode = 0.0; + mx = 1; + for (i = 1; i <= rcategs; i++) { + if (probcat[i - 1] * like[i - 1] > mode) { + mx = i; + mode = probcat[i - 1] * like[i - 1]; + } + } + mx0 = mx; + fprintf(outfile, + "Combination of categories that contributes the most to the likelihood:\n\n"); + for (i = 1; i <= nmlngth + 3; i++) + putc(' ', outfile); + for (i = 1; i <= sites; i++) { + fprintf(outfile, "%ld", mx); + if (i % 10 == 0) + putc(' ', outfile); + if (i % 60 == 0 && i != sites) { + 
putc('\n', outfile); + for (j = 1; j <= nmlngth + 3; j++) + putc(' ', outfile); + } + mx = mp[i - 1][mx - 1]; + } + fprintf(outfile, "\n\n"); + marginal = (double **) Malloc( sites*sizeof(double *)); + for (i = 0; i < sites; i++) + marginal[i] = (double *) Malloc( rcategs*sizeof(double)); + for (i = 0; i < rcategs; i++) + like[i] = 1.0; + for (i = sites - 1; i >= 0; i--) { + sum = 0.0; + for (j = 0; j < rcategs; j++) { + nulike[j] = (lambda1 + lambda * probcat[j]) * like[j]; + for (k = 1; k <= rcategs; k++) { + if (k != j + 1) + nulike[j] += lambda * probcat[k - 1] * like[k - 1]; + } + if ((ally[i] > 0) && (location[ally[i]-1] > 0)) + nulike[j] *= contribution[location[ally[i] - 1] - 1][j]; + sum += nulike[j]; + } + for (j = 0; j < rcategs; j++) { + nulike[j] /= sum; + marginal[i][j] = nulike[j]; + } + memcpy(like, nulike, rcategs * sizeof(double)); + } + for (i = 0; i < rcategs; i++) + like[i] = 1.0; + for (i = 0; i < sites; i++) { + sum = 0.0; + for (j = 0; j < rcategs; j++) { + nulike[j] = (lambda1 + lambda * probcat[j]) * like[j]; + for (k = 1; k <= rcategs; k++) { + if (k != j + 1) + nulike[j] += lambda * probcat[k - 1] * like[k - 1]; + } + marginal[i][j] *= like[j] * probcat[j]; + sum += nulike[j]; + } + for (j = 0; j < rcategs; j++) + nulike[j] /= sum; + memcpy(like, nulike, rcategs * sizeof(double)); + sum = 0.0; + for (j = 0; j < rcategs; j++) + sum += marginal[i][j]; + for (j = 0; j < rcategs; j++) + marginal[i][j] /= sum; + } + fprintf(outfile, "Most probable category at each site if > 0.95"); + fprintf(outfile, " probability (\".\" otherwise)\n\n"); + for (i = 1; i <= nmlngth + 3; i++) + putc(' ', outfile); + for (i = 0; i < sites; i++) { + sum = 0.0; + for (j = 0; j < rcategs; j++) + if (marginal[i][j] > sum) { + sum = marginal[i][j]; + mm = j; + } + if (sum >= 0.95) + fprintf(outfile, "%ld", mm+1); + else + putc('.', outfile); + if ((i+1) % 60 == 0) { + if (i != 0) { + putc('\n', outfile); + for (j = 1; j <= nmlngth + 3; j++) + putc(' ', outfile); + } 
+ } + else if ((i+1) % 10 == 0) + putc(' ', outfile); + } + putc('\n', outfile); + for (i = 0; i < sites; i++) + free(marginal[i]); + free(marginal); + } + putc('\n', outfile); + putc('\n', outfile); + putc('\n', outfile); + if (hypstate) { + fprintf(outfile, "Probable sequences at interior nodes:\n\n"); + fprintf(outfile, " node "); + for (i = 0; (i < 13) && (i < ((sites + (sites-1)/10 - 39) / 2)); i++) + putc(' ', outfile); + fprintf(outfile, "Reconstructed sequence (caps if > 0.95)\n\n"); + if (!rctgry || (rcategs == 1)) + mx0 = 1; + for (i = 0; i < sites; i += 60) { + k = i + 59; + if (k >= sites) + k = sites - 1; + rectrav(curtree.root, i, k); + putc('\n', outfile); + mx0 = mx1; + } + } + for (i = 0; i <= sites-1; ++i) + free(mp[i]); + free(mp); +} /* summarize */ + + +void promlk_treeout(node *p) +{ + /* write out file with representation of final tree */ + node *sib_ptr; + long i, n, w, num_sibs; + Char c; + double x; + + if (p->tip) { + n = 0; + for (i = 1; i <= nmlngth; i++) { + if (nayme[p->index - 1][i - 1] != ' ') + n = i; + } + for (i = 0; i < n; i++) { + c = nayme[p->index - 1][i]; + if (c == ' ') + c = '_'; + putc(c, outtree); + } + col += n; + } else { + sib_ptr = p; + num_sibs = count_sibs(p); + putc('(', outtree); + col++; + + for (i=0; i < (num_sibs - 1); i++) { + sib_ptr = sib_ptr->next; + promlk_treeout(sib_ptr->back); + putc(',', outtree); + col++; + if (col > 55) { + putc('\n', outtree); + col = 0; + } + } + sib_ptr = sib_ptr->next; + promlk_treeout(sib_ptr->back); + putc(')', outtree); + col++; + } + if (p == curtree.root) { + fprintf(outtree, ";\n"); + return; + } + x = (p->tyme - curtree.nodep[p->back->index - 1]->tyme); + if (x > 0.0) + w = (long)(0.4342944822 * log(x)); + else if (x == 0.0) + w = 0; + else + w = (long)(0.4342944822 * log(-x)) + 1; + if (w < 0) + w = 0; + fprintf(outtree, ":%*.5f", (int)(w + 7), x); + col += w + 8; +} /* promlk_treeout */ + + +void initpromlnode(node **p, node **grbg, node *q, long len, long nodei, + long 
*ntips, long *parens, initops whichinit, + pointarray treenode, pointarray nodep, Char *str, + Char *ch, FILE *intree) +{ + /* initializes a node */ + boolean minusread; + double valyew, divisor; + + switch (whichinit) { + case bottom: + gnu(grbg, p); + (*p)->index = nodei; + (*p)->tip = false; + malloc_ppheno((*p), endsite, rcategs); + nodep[(*p)->index - 1] = (*p); + break; + case nonbottom: + gnu(grbg, p); + malloc_ppheno(*p, endsite, rcategs); + (*p)->index = nodei; + break; + case tip: + match_names_to_data(str, nodep, p, spp); + break; + case iter: + (*p)->initialized = false; + (*p)->v = initialv; + (*p)->iter = true; + if ((*p)->back != NULL) + (*p)->back->iter = true; + break; + case length: + processlength(&valyew, &divisor, ch, &minusread, intree, parens); + (*p)->v = valyew / divisor; + (*p)->iter = false; + if ((*p)->back != NULL) { + (*p)->back->v = (*p)->v; + (*p)->back->iter = false; + } + break; + case unittrwt: + curtree.nodep[spp]->iter = false; + break; + default: /* cases hslength, hsnolength, treewt */ + break; /* should never occur */ + } +} /* initpromlnode */ + + +void tymetrav(node *p, double *x) +{ + /* set up times of nodes */ + node *sib_ptr, *q; + long i, num_sibs; + double xmax; + + xmax = 0.0; + if (!p->tip) { + sib_ptr = p; + num_sibs = count_sibs(p); + for (i=0; i < num_sibs; i++) { + sib_ptr = sib_ptr->next; + tymetrav(sib_ptr->back, x); + if (xmax > (*x)) + xmax = (*x); + } + } else + (*x) = 0.0; + p->tyme = xmax; + if (!p->tip) { + q = p; + while (q->next != p) { + q = q->next; + q->tyme = p->tyme; + } + } + (*x) = p->tyme - p->v; +} /* tymetrav */ + + +void free_all_protx (long nonodes, pointarray treenode) +{ + /* used in proml */ + long i, j, k; + node *p; + + /* Zero thru spp are tips, */ + for (i = 0; i < spp; i++) { + for (j = 0; j < endsite; j++) + free(treenode[i]->protx[j]); + free(treenode[i]->protx); + } + + /* The rest are rings (i.e. 
triads) */ + for (i = spp; i < nonodes; i++) { + if (treenode[i] != NULL) { + p = treenode[i]; + for (j = 1; j <= 3; j++) { + for (k = 0; k < endsite; k++) + free(p->protx[k]); + free(p->protx); + p = p->next; + } + } + } +} /* free_all_protx */ + + +void maketree() +{ + /* constructs a binary tree from the pointers in curtree.nodep, + adds each node at location which yields highest likelihood + then rearranges the tree for greatest likelihood */ + + long i, j; + long numtrees = 0; + double bestlike, gotlike, x; + node *item, *nufork, *dummy, *q, *root=NULL; + boolean dummy_haslengths, dummy_first, goteof; + long nextnode; + long grcategs; + pointarray dummy_treenode=NULL; + + grcategs = (categs > rcategs) ? categs : rcategs; + + prot_inittable(); + + if (!usertree) { + for (i = 1; i <= spp; i++) + enterorder[i - 1] = i; + if (jumble) + randumize(seed, enterorder); + curtree.root = curtree.nodep[spp]; + curtree.root->back = NULL; + for (i = 0; i < spp; i++) + curtree.nodep[i]->back = NULL; + for (i = spp; i < nonodes; i++) { + q = curtree.nodep[i]; + q->back = NULL; + while ((q = q->next) != curtree.nodep[i]) + q->back = NULL; + } + polishing = false; + promlk_add(curtree.nodep[enterorder[0]-1], curtree.nodep[enterorder[1]-1], + curtree.nodep[spp], false); + if (progress) { + printf("\nAdding species:\n"); + writename(0, 2, enterorder); +#ifdef WIN32 + phyFillScreenColor(); +#endif + } + lastsp = false; + smoothit = false; + for (i = 3; i <= spp; i++) { + bestyet = UNDEFINED; + bestree.likelihood = bestyet; + there = curtree.root; + item = curtree.nodep[enterorder[i - 1] - 1]; + nufork = curtree.nodep[spp + i - 2]; + lastsp = (i == spp); + addpreorder(curtree.root, item, nufork, true, true); + promlk_add(there, item, nufork, false); + like = prot_evaluate(curtree.root); + rearrange(&curtree.root); + if (curtree.likelihood > bestree.likelihood) { + prot_copy_(&curtree, &bestree, nonodes, grcategs); + } + if (progress) { + writename(i - 1, 1, enterorder); +#ifdef 
WIN32 + phyFillScreenColor(); +#endif + } + if (lastsp && global) { + if (progress) { + printf("Doing global rearrangements\n"); + printf(" !"); + for (j = 1; j <= nonodes; j++) + if ( j % (( nonodes / 72 ) + 1 ) == 0 ) + putchar('-'); + printf("!\n"); + } + bestlike = bestyet; + do { + if (progress) + printf(" "); + gotlike = bestlike; + for (j = 0; j < nonodes; j++) { + bestyet = UNDEFINED; + item = curtree.nodep[j]; + if (item != curtree.root) { + nufork = curtree.nodep[curtree.nodep[j]->back->index - 1]; + promlk_re_move(&item, &nufork, false); + there = curtree.root; + addpreorder(curtree.root, item, nufork, true, true); + promlk_add(there, item, nufork, false); + } + if (progress) { + if ( j % (( nonodes / 72 ) + 1 ) == 0 ) + putchar('.'); + fflush(stdout); + } + } + if (progress) + putchar('\n'); + } while (bestlike < gotlike); + } + } + if (njumble > 1 && lastsp) { + for (i = 0; i < spp; i++ ) + promlk_re_move(&curtree.nodep[i], &dummy, false); + if (jumb == 1 || bestree2.likelihood < bestree.likelihood) + prot_copy_(&bestree, &bestree2, nonodes, grcategs); + } + if (jumb == njumble) { + if (njumble > 1) + prot_copy_(&bestree2, &curtree, nonodes, grcategs); + else + prot_copy_(&bestree, &curtree, nonodes, grcategs); + fprintf(outfile, "\n\n"); + treevaluate(); + curtree.likelihood = prot_evaluate(curtree.root); + promlk_printree(); + summarize(); + if (trout) { + col = 0; + promlk_treeout(curtree.root); + } + } + } else { + openfile(&intree, INTREE, "input tree file", "r", progname, intreename); + numtrees = countsemic(&intree); + if(numtrees > MAXSHIMOTREES) + shimotrees = MAXSHIMOTREES; + else + shimotrees = numtrees; + if (numtrees > 2) + initseed(&inseed, &inseed0, seed); + l0gl = (double *) Malloc(shimotrees * sizeof(double)); + l0gf = (double **) Malloc(shimotrees * sizeof(double *)); + for (i=0; i < shimotrees; ++i) + l0gf[i] = (double *)Malloc(endsite * sizeof(double)); + if (treeprint) { + fprintf(outfile, "User-defined tree"); + if (numtrees > 1) 
+ putc('s', outfile); + fprintf(outfile, ":\n\n"); + } + fprintf(outfile, "\n\n"); + which = 1; + while (which <= numtrees) { + + /* These initializations required each time through the loop + since multiple trees require re-initialization */ + dummy_haslengths = true; + nextnode = 0; + dummy_first = true; + goteof = false; + + treeread(intree, &root, dummy_treenode, &goteof, &dummy_first, + curtree.nodep, &nextnode, &dummy_haslengths, &grbg, + initpromlnode,false,nonodes); + + nonodes = nextnode; + + root = curtree.nodep[root->index - 1]; + curtree.root = root; + + if (lngths) + tymetrav(curtree.root, &x); + + if (goteof && (which <= numtrees)) { + /* if we hit the end of the file prematurely */ + printf ("\n"); + printf ("ERROR: trees missing at end of file.\n"); + printf ("\tExpected number of trees:\t\t%ld\n", numtrees); + printf ("\tNumber of trees actually in file:\t%ld.\n\n", which - 1); + exxit(-1); + } + curtree.start = curtree.nodep[0]->back; + treevaluate(); + promlk_printree(); + summarize(); + if (trout) { + col = 0; + promlk_treeout(curtree.root); + } + if(which < numtrees){ + prot_freex_notip(nonodes, curtree.nodep); + gdispose(curtree.root, &grbg, curtree.nodep); + } + which++; + } + + FClose(intree); + if (!auto_ && numtrees > 1 && weightsum > 1 ) + standev2(numtrees, maxwhich, 0, endsite, maxlogl, l0gl, l0gf, + aliasweight, seed); + } + if (usertree) { + free(l0gl); + for (i=0; i < shimotrees; i++) + free(l0gf[i]); + free(l0gf); + } + prot_freetable(); + if (jumb < njumble) + return; + free(contribution); + free_all_protx(nonodes2, curtree.nodep); + if (!usertree) { + free_all_protx(nonodes2, bestree.nodep); + if (njumble > 1) + free_all_protx(nonodes2, bestree2.nodep); + } + if (progress) { + printf("\n\nOutput written to file \"%s\"\n\n", outfilename); + if (trout) + printf("Tree also written onto file \"%s\"\n", outtreename); + putchar('\n'); + } + + free(root); +} /* maketree */ + + +void clean_up() +{ + /* Free and/or close stuff */ + long i; 
+ + free (rrate); + free (probcat); + free (rate); + /* Seems to require freeing every time... */ + for (i = 0; i < spp; i++) { + free (y[i]); + } + free (y); + free (nayme); + free (enterorder); + free (category); + free (weight); + free (alias); + free (ally); + free (location); + free (aliasweight); + free (probmat); + free (eigmat); + if (! (njumble <= 1)) + freetree2(bestree2.nodep, nonodes2); + FClose(infile); + FClose(outfile); + FClose(outtree); +#ifdef MAC + fixmacfile(outfilename); + fixmacfile(outtreename); +#endif +} /* clean_up */ + + +int main(int argc, Char *argv[]) +{ /* Protein Maximum Likelihood with molecular clock */ + +#ifdef MAC + argc = 1; /* macsetup("Promlk", ""); */ + argv[0] = "Promlk"; +#endif + init(argc,argv); + progname = argv[0]; + openfile(&infile, INFILE, "input file", "r", argv[0], infilename); + openfile(&outfile, OUTFILE, "output file", "w", argv[0], outfilename); + + ibmpc = IBMCRT; + ansi = ANSICRT; + datasets = 1; + mulsets = false; + firstset = true; + doinit(); + + if (trout) + openfile(&outtree,OUTTREE,"output tree file","w",argv[0],outtreename); + if (ctgry) + openfile(&catfile,CATFILE,"categories file","r",argv[0],catfilename); + if (weights || justwts) + openfile(&weightfile,WEIGHTFILE,"weights file","r",argv[0],weightfilename); + for (ith = 1; ith <= datasets; ith++) { + if (datasets > 1) { + fprintf(outfile, "Data set # %ld:\n\n", ith); + if (progress) + printf("\nData set # %ld:\n", ith); + } + getinput(); + + if (ith == 1) + firstset = false; + for (jumb = 1; jumb <= njumble; jumb++){ + max_num_sibs = 0; + maketree(); + } + } + + clean_up(); + printf("Done.\n\n"); +#ifdef WIN32 + phyRestoreConsoleAttributes(); +#endif + return 0; +} /* Protein Maximum Likelihood with molecular clock */ + diff --git a/forester/archive/RIO/others/phylip_mod/src/protdist.c b/forester/archive/RIO/others/phylip_mod/src/protdist.c new file mode 100644 index 0000000..b6a9428 --- /dev/null +++ 
b/forester/archive/RIO/others/phylip_mod/src/protdist.c @@ -0,0 +1,1973 @@ +/*Modified by Christian Zmasek. Use at your own risk.*/ + +#include "phylip.h" +#include "seq.h" + +/* version 3.6. (c) Copyright 1993-2004 by the University of Washington. + Written by Joseph Felsenstein, Akiko Fuseki, Sean Lamont, and Andrew Keeffe. + Permission is granted to copy and use this program provided no fee is + charged for it and provided that this copyright notice is not removed. */ + +#define nmlngth 26 /*changed from to 10 to 26 by CZ 2006-07-28 */ /* number of characters in species name */ +#define protepsilon .00001 +typedef long *steparray; +typedef enum { + universal, ciliate, mito, vertmito, flymito, yeastmito +} codetype; +typedef enum { + chemical, hall, george +} cattype; + +typedef double matrix[20][20]; + +#ifndef OLDC +/* function prototypes */ +void protdist_uppercase(Char *); +void protdist_inputnumbers(void); +void getoptions(void); +void transition(void); +void doinit(void); +void printcategories(void); +void inputoptions(void); +void protdist_inputdata(void); +void doinput(void); +void code(void); +void protdist_cats(void); +void maketrans(void); +void givens(matrix, long, long, long, double, double, boolean); +void coeffs(double, double, double *, double *, double); +void tridiag(matrix, long, double); +void shiftqr(matrix, long, double); +void qreigen(matrix, long); +void pmbeigen(void); +void pameigen(void); +void jtteigen(void); +void predict(long, long, long); +void makedists(void); +void reallocchars(void); +/* function prototypes */ +#endif + +long chars, datasets, ith, ctgry, categs; +/* spp = number of species + chars = number of positions in actual sequences */ +double freqa, freqc, freqg, freqt, cvi, invarfrac, ttratio, xi, xv, + ease, fracchange; +boolean weights, justwts, progress, mulsets, gama, invar, basesequal, + usepmb, usejtt, usepam, kimura, similarity, firstset; +codetype whichcode; +cattype whichcat; +steptr oldweight; +double 
rate[maxcategs]; +aas **gnode; +aas trans[4][4][4]; +double pie[20]; +long cat[(long)ser - (long)ala + 1], numaa[(long)ser - (long)ala + 1]; +double eig[20]; +matrix prob, eigvecs; +double **d; +char infilename[100], outfilename[100], catfilename[100], weightfilename[100]; + +/* Local variables for makedists, propagated globally for c version: */ + double tt, p, dp, d2p, q, elambdat; + + +/* this jtt matrix decomposition due to Elisabeth Tillier */ +static double jtteigs[] = +{0.0, -0.007031123, -0.006484345, -0.006086499, -0.005514432, +-0.00772664, -0.008643413, -0.010620756, -0.009965552, -0.011671808, +-0.012222418,-0.004589201, -0.013103714, -0.014048038, -0.003170582, +-0.00347935, -0.015311677, -0.016021194, -0.017991454, -0.018911888}; + +static double jttprobs[20][20] = +{{0.076999996, 0.051000003, 0.043000004, 0.051999998, 0.019999996, 0.041, + 0.061999994, 0.073999997, 0.022999999, 0.052000004, 0.090999997, 0.058999988, + 0.024000007, 0.04, 0.050999992, 0.069, 0.059000006, 0.014000008, 0.032000004, + 0.066000005}, + {0.015604455, -0.068062363, 0.020106264, 0.070723273, 0.011702977, 0.009674053, + 0.074000798, -0.169750458, 0.005560808, -0.008208636, -0.012305869, + -0.063730179, -0.005674643, -0.02116828, 0.104586169, 0.016480839, 0.016765139, + 0.005936994, 0.006046367, -0.0082877}, + {-0.049778281, -0.007118197, 0.003801272, 0.070749616, 0.047506147, + 0.006447017, 0.090522425, -0.053620432, -0.008508175, 0.037170603, + 0.051805545, 0.015413608, 0.019939916, -0.008431976, -0.143511376, + -0.052486072, -0.032116542, -0.000860626, -0.02535993, 0.03843545}, + {-0.028906423, 0.092952047, -0.009615343, -0.067870117, 0.031970392, + 0.048338335, -0.054396304, -0.135916654, 0.017780083, 0.000129242, + 0.031267424, 0.116333586, 0.007499746, -0.032153596, 0.033517051, + -0.013719269, -0.00347293, -0.003291821, -0.02158326, -0.008862168}, + {0.037181176, -0.023106564, -0.004482225, -0.029899635, 0.118139633, + -0.032298569, -0.04683198, 0.05566988, -0.012622847, 
0.002023096, + -0.043921088, -0.04792557, -0.003452711, -0.037744513, 0.020822974, + 0.036580187, 0.02331425, -0.004807711, -0.017504496, 0.01086673}, + {0.044754061, -0.002503471, 0.019452517, -0.015611487, -0.02152807, + -0.013131425, -0.03465365, -0.047928912, 0.020608851, 0.067843095, + -0.122130014, 0.002521499, 0.013021646, -0.082891087, -0.061590119, + 0.016270856, 0.051468938, 0.002079063, 0.081019713, 0.082927944}, + {0.058917882, 0.007320741, 0.025278141, 0.000357541, -0.002831285, + -0.032453034, -0.010177288, -0.069447924, -0.034467324, 0.011422358, + -0.128478324, 0.04309667, -0.015319944, 0.113302422, -0.035052393, + 0.046885372, 0.06185183, 0.00175743, -0.06224497, 0.020282093}, + {-0.014562092, 0.022522921, -0.007094389, 0.03480089, -0.000326144, + -0.124039037, 0.020577906, -0.005056454, -0.081841576, -0.004381786, + 0.030826152, 0.091261631, 0.008878828, -0.02829487, 0.042718836, + -0.011180886, -0.012719227, -0.000753926, 0.048062375, -0.009399129}, + {0.033789571, -0.013512235, 0.088010984, 0.017580292, -0.006608005, + -0.037836971, -0.061344686, -0.034268357, 0.018190209, -0.068484614, + 0.120024744, -0.00319321, -0.001349477, -0.03000546, -0.073063759, + 0.081912399, 0.0635245, 0.000197, -0.002481798, -0.09108114}, + {-0.113947615, 0.019230545, 0.088819683, 0.064832765, 0.001801467, + -0.063829682, -0.072001633, 0.018429333, 0.057465965, 0.043901014, + -0.048050874, -0.001705918, 0.022637173, 0.017404665, 0.043877902, + -0.017089594, -0.058489485, 0.000127498, -0.029357194, 0.025943972}, + {0.01512923, 0.023603725, 0.006681954, 0.012360216, -0.000181447, + -0.023011838, -0.008960024, -0.008533239, 0.012569835, 0.03216118, + 0.061986403, -0.001919083, -0.1400832, -0.010669741, -0.003919454, + -0.003707024, -0.026806029, -0.000611603, -0.001402648, 0.065312824}, + {-0.036405351, 0.020816769, 0.011408213, 0.019787053, 0.038897829, + 0.017641789, 0.020858533, -0.006067252, 0.028617353, -0.064259496, + -0.081676567, 0.024421823, -0.028751676, 
0.07095096, -0.024199434, + -0.007513119, -0.028108766, -0.01198095, 0.111761119, -0.076198809}, + {0.060831772, 0.144097327, -0.069151377, 0.023754576, -0.003322955, + -0.071618574, 0.03353154, -0.02795295, 0.039519769, -0.023453968, + -0.000630308, -0.098024591, 0.017672997, 0.003813378, -0.009266499, + -0.011192111, 0.016013873, -0.002072968, -0.010022044, -0.012526904}, + {-0.050776604, 0.092833081, 0.044069596, 0.050523021, -0.002628417, + 0.076542572, -0.06388631, -0.00854892, -0.084725311, 0.017401063, + -0.006262541, -0.094457679, -0.002818678, -0.0044122, -0.002883973, + 0.028729685, -0.004961596, -0.001498627, 0.017994575, -0.000232779}, + {-0.01894566, -0.007760205, -0.015160993, -0.027254587, 0.009800903, + -0.013443561, -0.032896517, -0.022734138, -0.001983861, 0.00256111, + 0.024823166, -0.021256768, 0.001980052, 0.028136263, -0.012364384, + -0.013782446, -0.013061091, 0.111173981, 0.021702122, 0.00046654}, + {-0.009444193, -0.042106824, -0.02535015, -0.055125574, 0.006369612, + -0.02945416, -0.069922064, -0.067221068, -0.003004999, 0.053624311, + 0.128862984, -0.057245803, 0.025550508, 0.087741073, -0.001119043, + -0.012036202, -0.000913488, -0.034864475, 0.050124813, 0.055534723}, + {0.145782464, -0.024348311, -0.031216873, 0.106174443, 0.00202862, + 0.02653866, -0.113657267, -0.00755018, 0.000307232, -0.051241158, + 0.001310685, 0.035275877, 0.013308898, 0.002957626, -0.002925034, + -0.065362319, -0.071844582, 0.000475894, -0.000112419, 0.034097762}, + {0.079840455, 0.018769331, 0.078685899, -0.084329807, -0.00277264, + -0.010099754, 0.059700608, -0.019209715, -0.010442992, -0.042100476, + -0.006020556, -0.023061786, 0.017246106, -0.001572858, -0.006703785, + 0.056301316, -0.156787357, -0.000303638, 0.001498195, 0.051363455}, + {0.049628261, 0.016475144, 0.094141653, -0.04444633, 0.005206131, + -0.001827555, 0.02195624, 0.013066683, -0.010415582, -0.022338403, + 0.007837197, -0.023397671, -0.002507095, 0.005177694, 0.017109561, + -0.202340113, 
0.069681441, 0.000120736, 0.002201146, 0.004670849}, + {0.089153689, 0.000233354, 0.010826822, -0.004273519, 0.001440618, + 0.000436077, 0.001182351, -0.002255508, -0.000700465, 0.150589876, + -0.003911914, -0.00050154, -0.004564983, 0.00012701, -0.001486973, + -0.018902754, -0.054748555, 0.000217377, -0.000319302, -0.162541651}}; + +/* PMB matrix decomposition courtesy of Elisabeth Tillier */ +static double pmbeigs[] = +{0.0000001586972220,-1.8416770496147100, -1.6025046986139100,-1.5801012515121300, +-1.4987794099715900,-1.3520794233801900,-1.3003469390479700,-1.2439503327631300, +-1.1962574080244200,-1.1383730501367500,-1.1153278910708000,-0.4934843510654760, +-0.5419014550215590,-0.9657997830826700,-0.6276075673757390,-0.6675927795018510, +-0.6932641383465870,-0.8897872681859630,-0.8382698977371710,-0.8074694642446040}; +static double pmbprobs[20][20] = +{{0.0771762457248147,0.0531913844998640,0.0393445076407294,0.0466756566755510, +0.0286348361997465,0.0312327748383639,0.0505410248721427,0.0767106611472993, +0.0258916271688597,0.0673140562194124,0.0965705469252199,0.0515979465932174, +0.0250628079438675,0.0503492018628350,0.0399908189418273,0.0641898881894471, +0.0517539616710987,0.0143507440546115,0.0357994592438322,0.0736218495862984}, +{0.0368263046116572,-0.0006728917107827,0.0008590805287740,-0.0002764255356960, +0.0020152937187455,0.0055743720652960,0.0003213317669367,0.0000449190281568, +-0.0004226254397134,0.1805040629634510,-0.0272246813586204,0.0005904606533477, +-0.0183743200073889,-0.0009194625608688,0.0008173657533167,-0.0262629806302238, +0.0265738757209787,0.0002176606241904,0.0021315644838566,-0.1823229927207580}, +{-0.0194800075560895,0.0012068088610652,-0.0008803318319596,-0.0016044273960017, +-0.0002938633803197,-0.0535796754602196,0.0155163896648621,-0.0015006360762140, +0.0021601372013703,0.0268513218744797,-0.1085292493742730,0.0149753083138452, +0.1346457366717310,-0.0009371698759829,0.0013501708044116,0.0346352293103622, 
+-0.0276963770242276,0.0003643142783940,0.0002074817333067,-0.0174108903914110}, +{0.0557839400850153,0.0023271577185437,0.0183481103396687,0.0023339480096311, +0.0002013267015151,-0.0227406863569852,0.0098644845475047,0.0064721276774396, +0.0001389408104210,-0.0473713878768274,-0.0086984445005797,0.0026913674934634, +0.0283724052562196,0.0001063665179457,0.0027442574779383,-0.1875312134708470, +0.1279864877057640,0.0005103347834563,0.0003155113168637,0.0081451082759554}, +{0.0037510125027265,0.0107095920636885,0.0147305410328404,-0.0112351252180332, +-0.0001500408626446,-0.1523450933729730,0.0611532413339872,-0.0005496748939503, +0.0048714378736644,-0.0003826320053999,0.0552010244407311,0.0482555671001955, +-0.0461664995115847,-0.0021165008617978,-0.0004574454232187,0.0233755883688949, +-0.0035484915422384,0.0009090698422851,0.0013840637687758,-0.0073895139302231}, +{-0.0111512564930024,0.1025460064723080,0.0396772456883791,-0.0298408501361294, +-0.0001656742634733,-0.0079876311843289,0.0712644184507945,-0.0010780604625230, +-0.0035880882043592,0.0021070399334252,0.0016716329894279,-0.1810123023850110, +0.0015141703608724,-0.0032700852781804,0.0035503782441679,0.0118634302028026, +0.0044561606458028,-0.0001576678495964,0.0023470722225751,-0.0027457045397157}, +{0.1474525743949170,-0.0054432538500293,0.0853848892349828,-0.0137787746207348, +-0.0008274830358513,0.0042248844582553,0.0019556229305563,-0.0164191435175148, +-0.0024501858854849,0.0120908948084233,-0.0381456105972653,0.0101271614855119, +-0.0061945941321859,0.0178841099895867,-0.0014577779202600,-0.0752120602555032, +-0.1426985695849920,0.0002862275078983,-0.0081191734261838,0.0313401149422531}, +{0.0542034611735289,-0.0078763926211829,0.0060433542506096,0.0033396210615510, +0.0013965072374079,0.0067798903832256,-0.0135291136622509,-0.0089982442731848, +-0.0056744537593887,-0.0766524225176246,0.1881210263933930,-0.0065875518675173, 
+0.0416627569300375,-0.0953804133524747,-0.0012559228448735,0.0101622644292547, +-0.0304742453119050,0.0011702318499737,0.0454733434783982,-0.1119239362388150}, +{0.1069409037912470,0.0805064400880297,-0.1127352030714600,0.1001181253523260, +-0.0021480427488769,-0.0332884841459003,-0.0679837575848452,-0.0043812841356657, +0.0153418716846395,-0.0079441315103188,-0.0121766182046363,-0.0381127991037620, +-0.0036338726532673,0.0195324059593791,-0.0020165963699984,-0.0061222685010268, +-0.0253761448771437,-0.0005246410999057,-0.0112205170502433,0.0052248485517237}, +{-0.0325247648326262,0.0238753651653669,0.0203684886605797,0.0295666232678825, +-0.0003946714764213,-0.0157242718469554,-0.0511737848084862,0.0084725632040180, +-0.0167068828528921,0.0686962159427527,-0.0659702890616198,-0.0014289912494271, +-0.0167000964093416,-0.1276689083678200,0.0036575057830967,-0.0205958145531018, +0.0000368919612829,0.0014413626622426,0.1064360941926030,0.0863372661517408}, +{-0.0463777468104402,0.0394712148670596,0.1118686750747160,0.0440711686389031, +-0.0026076286506751,-0.0268454015202516,-0.1464943067133240,-0.0137514051835380, +-0.0094395514284145,-0.0144124844774228,0.0249103379323744,-0.0071832157138676, +0.0035592787728526,0.0415627419826693,0.0027040097365669,0.0337523666612066, +0.0316121324137152,-0.0011350177559026,-0.0349998884574440,-0.0302651879823361}, +{0.0142360925194728,0.0413145623127025,0.0324976427846929,0.0580930922002398, +-0.0586974207121084,0.0202001168873069,0.0492204086749069,0.1126593173463060, +0.0116620013776662,-0.0780333711712066,-0.1109786767320410,0.0407775100936731, +-0.0205013161312652,-0.0653458585025237,0.0347351829703865,0.0304448983224773, +0.0068813748197884,-0.0189002309261882,-0.0334507528405279,-0.0668143558699485}, +{-0.0131548829657936,0.0044244322828034,-0.0050639951827271,-0.0038668197633889, +-0.1536822386530220,0.0026336969165336,0.0021585651200470,-0.0459233839062969, 
+0.0046854727140565,0.0393815434593599,0.0619554007991097,0.0027456299925622, +0.0117574347936383,0.0373018612990383,0.0024818527553328,-0.0133956606027299, +-0.0020457128424105,0.0154178819990401,0.0246524142683911,0.0275363065682921}, +{-0.1542307272455030,0.0364861558267547,-0.0090880407008181,0.0531673937889863, +0.0157585615170580,0.0029986538457297,0.0180194047699875,0.0652152443589317, +0.0266842840376180,0.0388457366405908,0.0856237634510719,0.0126955778952183, +0.0099593861698250,-0.0013941794862563,0.0294065511237513,-0.1151906949298290, +-0.0852991447389655,0.0028699120202636,-0.0332087026659522,0.0006811857297899}, +{0.0281300736924501,-0.0584072081898638,-0.0178386569847853,-0.0536470338171487, +-0.0186881656029960,-0.0240008730656106,-0.0541064820498883,0.2217137098936020, +-0.0260500001542033,0.0234505236798375,0.0311127151218573,-0.0494139126682672, +0.0057093465049849,0.0124937286655911,-0.0298322975915689,0.0006520211333102, +-0.0061018680727128,-0.0007081999479528,-0.0060523759094034,0.0215845995364623}, +{0.0295321046399105,-0.0088296411830544,-0.0065057049917325,-0.0053478115612781, +-0.0100646496794634,-0.0015473619084872,0.0008539960632865,-0.0376381933046211, +-0.0328135588935604,0.0672161874239480,0.0667626853916552,-0.0026511651464901, +0.0140451514222062,-0.0544836996133137,0.0427485157912094,0.0097455780205802, +0.0177309072915667,-0.0828759701187452,-0.0729504795471370,0.0670731961252313}, +{0.0082646581043963,-0.0319918630534466,-0.0188454445200422,-0.0374976353856606, +0.0037131290686848,-0.0132507796987883,-0.0306958830735725,-0.0044119395527308, +-0.0140786756619672,-0.0180512599925078,-0.0208243802903953,-0.0232202769398931, +-0.0063135878270273,0.0110442171178168,0.1824538048228460,-0.0006644614422758, +-0.0069909097436659,0.0255407650654681,0.0099119399501151,-0.0140911517070698}, +{0.0261344441524861,-0.0714454044548650,0.0159436926233439,0.0028462736216688, 
+-0.0044572637889080,-0.0089474834434532,-0.0177570282144517,-0.0153693244094452, +0.1160919467206400,0.0304911481385036,0.0047047513411774,-0.0456535116423972, +0.0004491494948617,-0.0767108879444462,-0.0012688533741441,0.0192445965934123, +0.0202321954782039,0.0281039933233607,-0.0590403018490048,0.0364080426546883}, +{0.0115826306265004,0.1340228176509380,-0.0236200652949049,-0.1284484655137340, +-0.0004742338006503,0.0127617346949511,-0.0428560878860394,0.0060030732454125, +0.0089182609926781,0.0085353834972860,0.0048464809638033,0.0709740071429510, +0.0029940462557054,-0.0483434904493132,-0.0071713680727884,-0.0036840391887209, +0.0031454003250096,0.0246243550241551,-0.0449551277644180,0.0111449232769393}, +{0.0140356721886765,-0.0196518236826680,0.0030517022326582,0.0582672093364850, +-0.0000973895685457,0.0021704767224292,0.0341806268602705,-0.0152035987563018, +-0.0903198657739177,0.0259623214586925,0.0155832497882743,-0.0040543568451651, +0.0036477631918247,-0.0532892744763217,-0.0142569373662724,0.0104500681408622, +0.0103483945857315,0.0679534422398752,-0.0768068882938636,0.0280289727046158}} +; + +static double pameigs[] = {0.0, -0.002350753691875762, -0.002701991863800379, + -0.002931612442853115, -0.004262492032364507, -0.005395980482561625, + -0.007141172690079523, -0.007392844756151318, -0.007781761342200766, + -0.00810032066366362, -0.00875299712761124, -0.01048227332164386, + -0.01109594097332267, -0.01298616073142234, -0.01342036228188581, + -0.01552599145527578, -0.01658762802054814, -0.0174893445623765, + -0.01933280832903272, -0.02206353522613025}; + +static double pamprobs[20][20] = + {{0.087683339901135, 0.04051291829598762, 0.04087846315185977, + 0.04771603459744777, 0.03247095396561266, 0.03784612688594957, + 0.0504933695604875, 0.0898249006830755, 0.03285885059543713, + 0.0357514442352119, 0.0852464099207521, 0.07910313444070642, + 0.01488243946396588, 0.04100101908956829, 0.05158026947089499, + 0.06975497205982451, 0.05832757042475474, 
0.00931264523877807, + 0.03171540880870517, 0.06303972920984541}, + {0.01943453646811026, -0.004492574160484092, 0.007694891061220776, + 0.01278399096887701, 0.0106157418450234, 0.007542140341575122, + 0.01326994069032819, 0.02615565199894889, 0.003123125764490066, + 0.002204507682495444, -0.004782898215768979, 0.01204241965177619, + 0.0007847400096924341, -0.03043626073172116, 0.01221202591902536, + 0.01100527004684405, 0.01116495631339549, -0.0925364931988571, + -0.02622065387931562, 0.00843494142432107}, + {0.01855357100209072, 0.01493642835763868, 0.0127983090766285, + 0.0200533250704364, -0.1681898360107787, 0.01551657969909255, + 0.02128060163107209, 0.03100633591848964, 0.00845480845269879, + 0.000927149370785571, 0.00937207565817036, 0.03490557769673472, + 0.00300443019551563, -0.02590837220264415, 0.01329376859943192, + 0.006854110889741407, 0.01102593860528263, 0.003360844186685888, + -0.03459712356647764, 0.003351477369404443}, + {0.02690642688200102, 0.02131745801890152, 0.0143626616005213, + 0.02405101425725929, 0.05041008641436849, 0.01430925051050233, + 0.02362114036816964, 0.04688381789373886, 0.005250115453626377, + -0.02040112168595516, -0.0942720776915669, 0.03773004996758644, + -0.00822831940782616, -0.1164872809439224, 0.02286281877257392, + 0.02849551240669926, 0.01468856796295663, 0.02377110964207936, + -0.094380545436577, -0.02089068498518036}, + {0.00930172577225213, 0.01493463068441099, 0.020186920775608, + 0.02892154953912524, -0.01224593358361567, 0.01404228329986624, + 0.02671186617119041, 0.04537535161795231, 0.02229995804098249, + -0.04635704133961575, -0.1966910360247138, 0.02796648065439046, + -0.02263484732621436, 0.0440490503242072, 0.01148782948302166, + 0.01989170531824069, 0.001306805142981245, -0.005676690969116321, + 0.07680476281625202, -0.07967537039721849}, + {0.06602274245435476, -0.0966661981471856, -0.005241648783844579, + 0.00859135188171146, -0.007762129660943368, -0.02888965572526196, + 0.003592291525888222, 
0.1668410669287673, -0.04082039290551406, + 0.005233775047553415, -0.01758244726137135, -0.1493955762326898, + -0.00855819137835548, 0.004211419253492328, 0.01929306335052688, + 0.03008056746359405, 0.0190444422412472, 0.005577189741419315, + 0.0000874156155112068, 0.02634091459108298}, + {0.01933897472880726, 0.05874583569377844, -0.02293534606228405, + -0.07206314017962175, -0.004580681581546643, -0.0628814337610561, + -0.0850783812795136, 0.07988417636610614, -0.0852798990133397, + 0.01649047166155952, -0.05416647263757423, 0.1089834536254064, + 0.005093403979413865, 0.02520300254161142, 0.0005951431406455604, + 0.02441251821224675, 0.02796099482240553, -0.002574933994926502, + -0.007172237553012804, 0.03002455129086954}, + {0.04041118479094272, -0.002476225672095412, -0.01494505811263243, + -0.03759443758599911, -0.00892246902492875, -0.003634714029239211, + -0.03085671837973749, -0.126176309029931, 0.005814031139083794, + 0.01313561962646063, -0.04760487162503322, -0.0490563712725484, + -0.005082243450421558, -0.01213634309383557, 0.1806666927079249, + 0.02111663336185495, 0.02963486860587087, -0.0000175020101657785, + 0.01197155383597686, 0.0357526792184636}, + {-0.01184769557720525, 0.01582776076338872, -0.006570708266564639, + -0.01471915653734024, 0.00894343616503608, 0.00562664968033149, + -0.01465878888356943, 0.05365282692645818, 0.00893509735776116, + -0.05879312944436473, 0.0806048683392995, -0.007722897986905326, + -0.001819943882718859, 0.0942535573077267, 0.07483883782251654, + 0.004354639673913651, -0.02828804845740341, -0.001318222184691827, + -0.07613149604246563, -0.1251675867732172}, + {0.00834167031558193, -0.01509357596974962, 0.007098172811092488, + 0.03127677418040319, 0.001992448468465455, 0.00915441566808454, + 0.03430175973499201, -0.0730648147535803, -0.001402707145575659, + 0.04780949194330815, -0.1115035603461273, -0.01292297197609604, + -0.005056270550868528, 0.1112053349612027, -0.03801929822379964, + -0.001191241001736563, 
0.01872874622910247, 0.0005314214903865993, + -0.0882576318311789, 0.07607183599610171}, + {-0.01539460099727769, 0.04988596184297883, -0.01187240760647617, + -0.06987843637091853, -0.002490472846497859, 0.01009857892494956, + -0.07473588067847209, 0.0906009925879084, 0.1243612446505172, + 0.02152806401345371, -0.03504879644860233, -0.06680752427613573, + -0.005574485153629651, 0.001518282948127752, -0.01999168507510701, + -0.01478606199529457, -0.02203749419458996, -0.00132680708294333, + -0.01137505997867614, 0.05332658773667142}, + {-0.06104378736432388, 0.0869446603393548, -0.03298331234537257, + 0.03128515657456024, 0.003906358569208259, 0.03578694104193928, + 0.06241936133189683, 0.06182827284921748, -0.05566564263245907, + 0.02640868588189002, -0.01349751243059039, -0.05507866642582638, + -0.006671347738489326, -0.001470096466016046, 0.05185743641479938, + -0.07494697511168257, -0.1175185439057584, -0.001188074094105709, + 0.00937934805737347, 0.05024773745437657}, + {-0.07252555582124737, -0.116554459356382, 0.003605361887406413, + -0.00836518656029184, 0.004615715410745561, 0.005105376617651312, + -0.00944938657024391, 0.05602449420950007, 0.02722719610561933, + 0.01959357494748446, -0.0258655103753962, 0.1440733975689835, + 0.01446782819722976, 0.003718896062070054, 0.05825843045655135, + -0.06230154142733073, -0.07833704962300169, 0.003160836143568724, + -0.001169873777936648, 0.03471745590503304}, + {-0.03204352258752698, 0.01019272923862322, 0.04509668708733181, + 0.05756522429120813, -0.0004601149081726732, -0.0984718150777423, + -0.01107826100664925, -0.005680277810520585, 0.01962359392320817, + 0.01550006899131986, 0.05143956925922197, 0.02462476682588468, + -0.0888843861002653, -0.00171553583659411, 0.01606331750661664, + 0.001176847743518958, -0.02070972978912828, -0.000341523293579971, + -0.002654732745607882, 0.02075709428885848}, + {0.03595199666430258, -0.02800219615234468, -0.04341570015493925, + -0.0748275906176658, 0.0001051403676377422, 
0.1137431321746627, + 0.005852087565974318, 0.003443037513847801, -0.02481931657706633, + -0.003651181839831423, 0.03195794176786321, 0.04135411406392523, + -0.07562030263210619, 0.001769332364699, -0.01984381173403915, + -0.005029750745010152, 0.02649253902476472, 0.000518085571702734, + 0.001062936684474851, 0.01295950668914449}, + {-0.16164552322896, -0.0006050035060464324, 0.0258380054414968, + 0.003188424740960557, -0.0002058911341821877, 0.03157555987384681, + -0.01678913462596107, 0.03096216145389774, -0.0133791110666919, + 0.1125249625204277, -0.00769017706442472, -0.02653938062180483, + -0.002555329863523985, -0.00861833362947954, 0.01775148884754278, + 0.02529310679774722, 0.0826243417011238, -0.0001036728183032624, + 0.001963562313294209, -0.0935900561309786}, + {0.1652394174588469, -0.002814245280784351, -0.0328982001821263, + -0.02000104712964131, 0.0002208121995725443, -0.02733462178511839, + 0.02648078162927627, -0.01788316626401427, 0.01630747623755998, + 0.1053849023838147, -0.005447706553811218, 0.01810876922536839, + -0.001808914710282444, -0.007687912115607397, -0.01332593672114388, + -0.02110750894891371, -0.07456116592983384, 0.000219072589592394, + 0.001270886972191055, -0.1083616930749109}, + {0.02453279389716254, -0.005820072356487439, 0.100260287284095, + 0.01277522280305745, -0.003184943445296999, 0.05814689527984152, + -0.0934012278200201, -0.03017986487349484, -0.03136625380994165, + 0.00988668352785117, -0.00358900410973142, -0.02017443675004764, + 0.000915384582922184, -0.001460963415183106, -0.01370112443251124, + 0.1130040979284457, -0.1196161771323699, -0.0005800211204222045, + -0.0006153403201024954, 0.00416806428223025}, + {-0.0778089244252535, -0.007055161182430869, -0.0349307504860869, + -0.0811915584276571, -0.004689825871599125, -0.03726108871471753, + 0.1072225647141469, -0.00917015113070944, 0.01381628985996913, + -0.00123227881492089, 0.001815954515275675, 0.005708744099349901, + -0.0001448985044877925, 
-0.001306578795561384, -0.006992743514185243, + 0.1744720240732789, -0.05353628497814023, -0.0007613684227234787, + -0.0003550282315997644, 0.01340106423804634}, + {-0.0159527329868513, -0.007622151568160798, -0.1389875105184963, + 0.1165051999914764, -0.002217810389087748, 0.01550003226513692, + -0.07427664222230566, -0.003371438498619264, 0.01385754771325365, + 0.004759020167383304, 0.001624078805220564, 0.02011638303109029, + -0.001717827082842178, -0.0007424036708598594, -0.003978884451898934, + 0.0866418927301209, -0.01280817739158123, -0.00023039242454603, + 0.002309205802479111, 0.0005926106991001195}}; + + +void protdist_uppercase(Char *ch) +{ + (*ch) = (isupper(*ch) ? (*ch) : toupper(*ch)); +} /* protdist_uppercase */ + + +void protdist_inputnumbers() +{ + /* input the numbers of species and of characters */ + long i; + + fscanf(infile, "%ld%ld", &spp, &chars); + + if (printdata) + fprintf(outfile, "%2ld species, %3ld positions\n\n", spp, chars); + gnode = (aas **)Malloc(spp * sizeof(aas *)); + if (firstset) { + for (i = 0; i < spp; i++) + gnode[i] = (aas *)Malloc(chars * sizeof(aas )); + } + weight = (steparray)Malloc(chars*sizeof(long)); + oldweight = (steparray)Malloc(chars*sizeof(long)); + category = (steparray)Malloc(chars*sizeof(long)); + d = (double **)Malloc(spp*sizeof(double *)); + nayme = (naym *)Malloc(spp*sizeof(naym)); + + for (i = 0; i < spp; ++i) + d[i] = (double *)Malloc(spp*sizeof(double)); +} /* protdist_inputnumbers */ + + +void getoptions() +{ + /* interactively set options */ + long loopcount, loopcount2; + Char ch, ch2; + Char in[100]; + boolean done; + + if (printdata) + fprintf(outfile, "\nProtein distance algorithm, version %s\n\n",VERSION); + putchar('\n'); + weights = false; + printdata = false; + progress = true; + interleaved = true; + similarity = false; + ttratio = 2.0; + whichcode = universal; + whichcat = george; + basesequal = true; + freqa = 0.25; + freqc = 0.25; + freqg = 0.25; + freqt = 0.25; + usejtt = true; + usepmb = 
false; + usepam = false; + kimura = false; + gama = false; + invar = false; + invarfrac = 0.0; + ease = 0.457; + loopcount = 0; + do { + cleerhome(); + printf("\nProtein distance algorithm, version %s\n\n",VERSION); + printf("Settings for this run:\n"); + printf(" P Use JTT, PMB, PAM, Kimura, categories model? %s\n", + usejtt ? "Jones-Taylor-Thornton matrix" : + usepmb ? "Henikoff/Tillier PMB matrix" : + usepam ? "Dayhoff PAM matrix" : + kimura ? "Kimura formula" : + similarity ? "Similarity table" : "Categories model"); + if (!kimura && !similarity) { + printf(" G Gamma distribution of rates among positions?"); + if (gama) + printf(" Yes\n"); + else { + if (invar) + printf(" Gamma+Invariant\n"); + else + printf(" No\n"); + } + } + printf(" C One category of substitution rates?"); + if (!ctgry || categs == 1) + printf(" Yes\n"); + else + printf(" %ld categories\n", categs); + printf(" W Use weights for positions?"); + if (weights) + printf(" Yes\n"); + else + printf(" No\n"); + if (!(usejtt || usepmb || usepam || kimura || similarity)) { + printf(" U Use which genetic code? %s\n", + (whichcode == universal) ? "Universal" : + (whichcode == ciliate) ? "Ciliate" : + (whichcode == mito) ? "Universal mitochondrial" : + (whichcode == vertmito) ? "Vertebrate mitochondrial" : + (whichcode == flymito) ? "Fly mitochondrial" : + (whichcode == yeastmito) ? "Yeast mitochondrial" : ""); + printf(" A Which categorization of amino acids? %s\n", + (whichcat == chemical) ? "Chemical" : + (whichcat == george) ? "George/Hunt/Barker" : "Hall"); + + printf(" E Prob change category (1.0=easy):%8.4f\n",ease); + printf(" T Transition/transversion ratio:%7.3f\n",ttratio); + printf(" F Base Frequencies:"); + if (basesequal) + printf(" Equal\n"); + else + printf("%7.3f%6.3f%6.3f%6.3f\n", freqa, freqc, freqg, freqt); + } + printf(" M Analyze multiple data sets?"); + if (mulsets) + printf(" Yes, %2ld %s\n", datasets, + (justwts ? 
"sets of weights" : "data sets")); + else + printf(" No\n"); + printf(" I Input sequences interleaved? %s\n", + (interleaved ? "Yes" : "No, sequential")); + printf(" 0 Terminal type (IBM PC, ANSI)? %s\n", + ibmpc ? "IBM PC" : + ansi ? "ANSI" : "(none)"); + printf(" 1 Print out the data at start of run %s\n", + (printdata ? "Yes" : "No")); + printf(" 2 Print indications of progress of run %s\n", + progress ? "Yes" : "No"); + printf("\nAre these settings correct? (type Y or the letter for one to change)\n"); + in[0] = '\0'; + getstryng(in); + ch=in[0]; + if (ch == '\n') + ch = ' '; + protdist_uppercase(&ch); + done = (ch == 'Y'); + if (!done) { + if (((strchr("CPGMWI120",ch) != NULL) && (usejtt || usepmb || usepam)) || + ((strchr("CPMWI120",ch) != NULL) && (kimura || similarity)) || + ((strchr("CUAPGETFMWI120",ch) != NULL) && + (! (usejtt || usepmb || usepam || kimura || similarity)))) { + switch (ch) { + + case 'U': + printf("Which genetic code?\n"); + printf(" type for\n\n"); + printf(" U Universal\n"); + printf(" M Mitochondrial\n"); + printf(" V Vertebrate mitochondrial\n"); + printf(" F Fly mitochondrial\n"); + printf(" Y Yeast mitochondrial\n\n"); + loopcount2 = 0; + do { + printf("type U, M, V, F, or Y\n"); + scanf("%c%*[^\n]", &ch); + getchar(); + if (ch == '\n') + ch = ' '; + protdist_uppercase(&ch); + countup(&loopcount2, 10); + } while (ch != 'U' && ch != 'M' && ch != 'V' && ch != 'F' && ch != 'Y'); + switch (ch) { + + case 'U': + whichcode = universal; + break; + + case 'M': + whichcode = mito; + break; + + case 'V': + whichcode = vertmito; + break; + + case 'F': + whichcode = flymito; + break; + + case 'Y': + whichcode = yeastmito; + break; + } + break; + + case 'A': + printf( + "Which of these categorizations of amino acids do you want to use:\n\n"); + printf( + " all have groups: (Glu Gln Asp Asn), (Lys Arg His), (Phe Tyr Trp)\n"); + printf(" plus:\n"); + printf("George/Hunt/Barker:"); + printf(" (Cys), (Met Val Leu Ileu), (Gly Ala Ser Thr Pro)\n"); + 
printf("Chemical: "); + printf(" (Cys Met), (Val Leu Ileu Gly Ala Ser Thr), (Pro)\n"); + printf("Hall: "); + printf(" (Cys), (Met Val Leu Ileu), (Gly Ala Ser Thr), (Pro)\n\n"); + printf("Which do you want to use (type C, H, or G)\n"); + loopcount2 = 0; + do { + scanf("%c%*[^\n]", &ch); + getchar(); + if (ch == '\n') + ch = ' '; + protdist_uppercase(&ch); + countup(&loopcount2, 10); + } while (ch != 'C' && ch != 'H' && ch != 'G'); + switch (ch) { + + case 'C': + whichcat = chemical; + break; + + case 'H': + whichcat = hall; + break; + + case 'G': + whichcat = george; + break; + } + break; + + case 'C': + ctgry = !ctgry; + if (ctgry) { + initcatn(&categs); + initcategs(categs, rate); + } + break; + + case 'W': + weights = !weights; + break; + + case 'P': + if (usejtt) { + usejtt = false; + usepmb = true; + } else { + if (usepmb) { + usepmb = false; + usepam = true; + } else { + if (usepam) { + usepam = false; + kimura = true; + } else { + if (kimura) { + kimura = false; + similarity = true; + } else { + if (similarity) + similarity = false; + else + usejtt = true; + } + } + } + } + break; + + case 'G': + if (!(gama || invar)) + gama = true; + else { + if (gama) { + gama = false; + invar = true; + } else { + if (invar) + invar = false; + } + } + break; + + + case 'E': + printf("Ease of changing category of amino acid?\n"); + loopcount2 = 0; + do { + printf(" (1.0 if no difficulty of changing,\n"); + printf(" less if less easy. 
Can't be negative\n"); + scanf("%lf%*[^\n]", &ease); + getchar(); + countup(&loopcount2, 10); + } while (ease > 1.0 || ease < 0.0); + break; + + case 'T': + loopcount2 = 0; + do { + printf("Transition/transversion ratio?\n"); + scanf("%lf%*[^\n]", &ttratio); + getchar(); + countup(&loopcount2, 10); + } while (ttratio < 0.0); + break; + + case 'F': + loopcount2 = 0; + do { + basesequal = false; + printf("Frequencies of bases A,C,G,T ?\n"); + scanf("%lf%lf%lf%lf%*[^\n]", &freqa, &freqc, &freqg, &freqt); + getchar(); + if (fabs(freqa + freqc + freqg + freqt - 1.0) >= 1.0e-3) + printf("FREQUENCIES MUST SUM TO 1\n"); + countup(&loopcount2, 10); + } while (fabs(freqa + freqc + freqg + freqt - 1.0) >= 1.0e-3); + break; + + case 'M': + mulsets = !mulsets; + if (mulsets) { + printf("Multiple data sets or multiple weights?"); + loopcount2 = 0; + do { + printf(" (type D or W)\n"); + scanf("%c%*[^\n]", &ch2); + getchar(); + if (ch2 == '\n') + ch2 = ' '; + uppercase(&ch2); + countup(&loopcount2, 10); + } while ((ch2 != 'W') && (ch2 != 'D')); + justwts = (ch2 == 'W'); + if (justwts) + justweights(&datasets); + else + initdatasets(&datasets); + } + break; + + case 'I': + interleaved = !interleaved; + break; + + case '0': + if (ibmpc) { + ibmpc = false; + ansi = true; + } else if (ansi) + ansi = false; + else + ibmpc = true; + break; + + case '1': + printdata = !printdata; + break; + + case '2': + progress = !progress; + break; + } + } else { + if (strchr("CUAPGETFMWI120",ch) == NULL) + printf("Not a possible option!\n"); + else + printf("That option not allowed with these settings\n"); + printf("\nPress Enter or Return key to continue\n"); + getchar(); + } + } + countup(&loopcount, 100); + } while (!done); + if (gama || invar) { + loopcount = 0; + do { + printf( +"\nCoefficient of variation of substitution rate among positions (must be positive)\n"); + printf( + " In gamma distribution parameters, this is 1/(square root of alpha)\n"); + scanf("%lf%*[^\n]", &cvi); + getchar(); + 
countup(&loopcount, 10); + } while (cvi <= 0.0); + cvi = 1.0 / (cvi * cvi); + } + if (invar) { + loopcount = 0; + do { + printf("Fraction of invariant positions?\n"); + scanf("%lf%*[^\n]", &invarfrac); + getchar(); + countup (&loopcount, 10); + } while ((invarfrac <= 0.0) || (invarfrac >= 1.0)); + } +} /* getoptions */ + + +void transition() +{ + /* calculations related to transition-transversion ratio */ + double aa, bb, freqr, freqy, freqgr, freqty; + + freqr = freqa + freqg; + freqy = freqc + freqt; + freqgr = freqg / freqr; + freqty = freqt / freqy; + aa = ttratio * freqr * freqy - freqa * freqg - freqc * freqt; + bb = freqa * freqgr + freqc * freqty; + xi = aa / (aa + bb); + xv = 1.0 - xi; + if (xi <= 0.0 && xi >= -epsilon) + xi = 0.0; + if (xi < 0.0){ + printf("THIS TRANSITION-TRANSVERSION RATIO IS IMPOSSIBLE WITH"); + printf(" THESE BASE FREQUENCIES\n"); + exxit(-1);} +} /* transition */ + + +void doinit() +{ + /* initializes variables */ + protdist_inputnumbers(); + getoptions(); + transition(); +} /* doinit*/ + + +void printcategories() +{ /* print out list of categories of positions */ + long i, j; + + fprintf(outfile, "Rate categories\n\n"); + for (i = 1; i <= nmlngth + 3; i++) + putc(' ', outfile); + for (i = 1; i <= chars; i++) { + fprintf(outfile, "%ld", category[i - 1]); + if (i % 60 == 0) { + putc('\n', outfile); + for (j = 1; j <= nmlngth + 3; j++) + putc(' ', outfile); + } else if (i % 10 == 0) + putc(' ', outfile); + } + fprintf(outfile, "\n\n"); +} /* printcategories */ + +void reallocchars(void) +{ + int i; + + free(weight); + free(oldweight); + free(category); + for (i = 0; i < spp; i++) { + free(gnode[i]); + gnode[i] = (aas *)Malloc(chars * sizeof(aas )); + } + weight = (steparray)Malloc(chars*sizeof(long)); + oldweight = (steparray)Malloc(chars*sizeof(long)); + category = (steparray)Malloc(chars*sizeof(long)); +} + +void inputoptions() +{ /* input the information on the options */ + long i; + + if (!firstset && !justwts) { + 
samenumsp(&chars, ith); + reallocchars(); + } if (firstset || !justwts) { + for (i = 0; i < chars; i++) { + category[i] = 1; + oldweight[i] = 1; + weight[i] = 1; + } + } + /* if (!justwts && weights) {*/ + if (justwts || weights) + inputweights(chars, oldweight, &weights); + if (printdata) + putc('\n', outfile); + if (usejtt && printdata) + fprintf(outfile, " Jones-Taylor-Thornton model distance\n"); + if (usepmb && printdata) + fprintf(outfile, " Henikoff/Tillier PMB model distance\n"); + if (usepam && printdata) + fprintf(outfile, " Dayhoff PAM model distance\n"); + if (kimura && printdata) + fprintf(outfile, " Kimura protein distance\n"); + if (!(usejtt || usepmb || usepam || kimura || similarity) && printdata) + fprintf(outfile, " Categories model distance\n"); + if (similarity) + fprintf(outfile, " \n Table of similarity between sequences\n"); + if ((ctgry && categs > 1) && (firstset || !justwts)) { + inputcategs(0, chars, category, categs, "ProtDist"); + if (printdata) + printcategs(outfile, chars, category, "Position categories"); + } else if (printdata && (categs > 1)) { + fprintf(outfile, "\nPosition category Rate of change\n\n"); + for (i = 1; i <= categs; i++) + fprintf(outfile, "%15ld%13.3f\n", i, rate[i - 1]); + putc('\n', outfile); + printcategories(); + } + if (weights && printdata) + printweights(outfile, 0, chars, oldweight, "Positions"); +} /* inputoptions */ + + +void protdist_inputdata() +{ + /* input the names and sequences for each species */ + long i, j, k, l, aasread=0, aasnew=0; + Char charstate; + boolean allread, done; + aas aa=0; /* temporary amino acid for input */ + + if (progress) + putchar('\n'); + j = nmlngth + (chars + (chars - 1) / 10) / 2 - 5; + if (j < nmlngth - 1) + j = nmlngth - 1; + if (j > 37) + j = 37; + if (printdata) { + fprintf(outfile, "\nName"); + for (i = 1; i <= j; i++) + putc(' ', outfile); + fprintf(outfile, "Sequences\n"); + fprintf(outfile, "----"); + for (i = 1; i <= j; i++) + putc(' ', outfile); + 
fprintf(outfile, "---------\n\n"); + } + aasread = 0; + allread = false; + while (!(allread)) { + /* eat white space -- if the separator line has spaces on it*/ + do { + charstate = gettc(infile); + } while (charstate == ' ' || charstate == '\t'); + ungetc(charstate, infile); + if (eoln(infile)) + scan_eoln(infile); + i = 1; + while (i <= spp) { + if ((interleaved && aasread == 0) || !interleaved) + initname(i-1); + if (interleaved) + j = aasread; + else + j = 0; + done = false; + while (((!done) && (!(eoln(infile) || eoff(infile))))) { + if (interleaved) + done = true; + while (((j < chars) & (!(eoln(infile) | eoff(infile))))) { + charstate = gettc(infile); + if (charstate == '\n' || charstate == '\t') + charstate = ' '; + if (charstate == ' ' || (charstate >= '0' && charstate <= '9')) + continue; + protdist_uppercase(&charstate); + if ((!isalpha(charstate) && charstate != '.' && charstate != '?' && + charstate != '-' && charstate != '*') || charstate == 'J' || + charstate == 'O' || charstate == 'U' || charstate == '.') { + printf("ERROR -- bad amino acid: %c at position %ld of species %3ld\n", + charstate, j, i); + if (charstate == '.') { + printf(" Periods (.) 
may not be used as gap characters.\n"); + printf(" The correct gap character is (-)\n"); + } + exxit(-1); + } + j++; + + switch (charstate) { + + case 'A': + aa = ala; + break; + + case 'B': + aa = asx; + break; + + case 'C': + aa = cys; + break; + + case 'D': + aa = asp; + break; + + case 'E': + aa = glu; + break; + + case 'F': + aa = phe; + break; + + case 'G': + aa = gly; + break; + + case 'H': + aa = his; + break; + + case 'I': + aa = ileu; + break; + + case 'K': + aa = lys; + break; + + case 'L': + aa = leu; + break; + + case 'M': + aa = met; + break; + + case 'N': + aa = asn; + break; + + case 'P': + aa = pro; + break; + + case 'Q': + aa = gln; + break; + + case 'R': + aa = arg; + break; + + case 'S': + aa = ser; + break; + + case 'T': + aa = thr; + break; + + case 'V': + aa = val; + break; + + case 'W': + aa = trp; + break; + + case 'X': + aa = unk; + break; + + case 'Y': + aa = tyr; + break; + + case 'Z': + aa = glx; + break; + + case '*': + aa = stop; + break; + + case '?': + aa = quest; + break; + + case '-': + aa = del; + break; + } + gnode[i - 1][j - 1] = aa; + } + if (interleaved) + continue; + if (j < chars) + scan_eoln(infile); + else if (j == chars) + done = true; + } + if (interleaved && i == 1) + aasnew = j; + scan_eoln(infile); + if ((interleaved && j != aasnew) || ((!interleaved) && j != chars)){ + printf("ERROR: SEQUENCES OUT OF ALIGNMENT\n"); + exxit(-1);} + i++; + } + if (interleaved) { + aasread = aasnew; + allread = (aasread == chars); + } else + allread = (i > spp); + } + if ( printdata) { + for (i = 1; i <= ((chars - 1) / 60 + 1); i++) { + for (j = 1; j <= spp; j++) { + for (k = 0; k < nmlngth; k++) + putc(nayme[j - 1][k], outfile); + fprintf(outfile, " "); + l = i * 60; + if (l > chars) + l = chars; + for (k = (i - 1) * 60 + 1; k <= l; k++) { + if (j > 1 && gnode[j - 1][k - 1] == gnode[0][k - 1]) + charstate = '.'; + else { + switch (gnode[j - 1][k - 1]) { + + case ala: + charstate = 'A'; + break; + + case asx: + charstate = 'B'; + 
break; + + case cys: + charstate = 'C'; + break; + + case asp: + charstate = 'D'; + break; + + case glu: + charstate = 'E'; + break; + + case phe: + charstate = 'F'; + break; + + case gly: + charstate = 'G'; + break; + + case his: + charstate = 'H'; + break; + + case ileu: + charstate = 'I'; + break; + + case lys: + charstate = 'K'; + break; + + case leu: + charstate = 'L'; + break; + + case met: + charstate = 'M'; + break; + + case asn: + charstate = 'N'; + break; + + case pro: + charstate = 'P'; + break; + + case gln: + charstate = 'Q'; + break; + + case arg: + charstate = 'R'; + break; + + case ser: + charstate = 'S'; + break; + + case thr: + charstate = 'T'; + break; + + case val: + charstate = 'V'; + break; + + case trp: + charstate = 'W'; + break; + + case tyr: + charstate = 'Y'; + break; + + case glx: + charstate = 'Z'; + break; + + case del: + charstate = '-'; + break; + + case stop: + charstate = '*'; + break; + + case unk: + charstate = 'X'; + break; + + case quest: + charstate = '?'; + break; + + default: /*cases ser1 and ser2 cannot occur*/ + break; + } + } + putc(charstate, outfile); + if (k % 10 == 0 && k % 60 != 0) + putc(' ', outfile); + } + putc('\n', outfile); + } + putc('\n', outfile); + } + putc('\n', outfile); + } + if (printdata) + putc('\n', outfile); +} /* protdist_inputdata */ + + +void doinput() +{ /* reads the input data */ + long i; + double sumrates, weightsum; + + inputoptions(); + if(!justwts || firstset) + protdist_inputdata(); + if (!ctgry) { + categs = 1; + rate[0] = 1.0; + } + weightsum = 0; + for (i = 0; i < chars; i++) + weightsum += oldweight[i]; + sumrates = 0.0; + for (i = 0; i < chars; i++) + sumrates += oldweight[i] * rate[category[i] - 1]; + for (i = 0; i < categs; i++) + rate[i] *= weightsum / sumrates; +} /* doinput */ + + +void code() +{ + /* make up table of the code 1 = u, 2 = c, 3 = a, 4 = g */ + long n; + aas b; + + trans[0][0][0] = phe; + trans[0][0][1] = phe; + trans[0][0][2] = leu; + trans[0][0][3] = leu; + 
trans[0][1][0] = ser; + trans[0][1][1] = ser; + trans[0][1][2] = ser; + trans[0][1][3] = ser; + trans[0][2][0] = tyr; + trans[0][2][1] = tyr; + trans[0][2][2] = stop; + trans[0][2][3] = stop; + trans[0][3][0] = cys; + trans[0][3][1] = cys; + trans[0][3][2] = stop; + trans[0][3][3] = trp; + trans[1][0][0] = leu; + trans[1][0][1] = leu; + trans[1][0][2] = leu; + trans[1][0][3] = leu; + trans[1][1][0] = pro; + trans[1][1][1] = pro; + trans[1][1][2] = pro; + trans[1][1][3] = pro; + trans[1][2][0] = his; + trans[1][2][1] = his; + trans[1][2][2] = gln; + trans[1][2][3] = gln; + trans[1][3][0] = arg; + trans[1][3][1] = arg; + trans[1][3][2] = arg; + trans[1][3][3] = arg; + trans[2][0][0] = ileu; + trans[2][0][1] = ileu; + trans[2][0][2] = ileu; + trans[2][0][3] = met; + trans[2][1][0] = thr; + trans[2][1][1] = thr; + trans[2][1][2] = thr; + trans[2][1][3] = thr; + trans[2][2][0] = asn; + trans[2][2][1] = asn; + trans[2][2][2] = lys; + trans[2][2][3] = lys; + trans[2][3][0] = ser; + trans[2][3][1] = ser; + trans[2][3][2] = arg; + trans[2][3][3] = arg; + trans[3][0][0] = val; + trans[3][0][1] = val; + trans[3][0][2] = val; + trans[3][0][3] = val; + trans[3][1][0] = ala; + trans[3][1][1] = ala; + trans[3][1][2] = ala; + trans[3][1][3] = ala; + trans[3][2][0] = asp; + trans[3][2][1] = asp; + trans[3][2][2] = glu; + trans[3][2][3] = glu; + trans[3][3][0] = gly; + trans[3][3][1] = gly; + trans[3][3][2] = gly; + trans[3][3][3] = gly; + if (whichcode == mito) + trans[0][3][2] = trp; + if (whichcode == vertmito) { + trans[0][3][2] = trp; + trans[2][3][2] = stop; + trans[2][3][3] = stop; + trans[2][0][2] = met; + } + if (whichcode == flymito) { + trans[0][3][2] = trp; + trans[2][0][2] = met; + trans[2][3][2] = ser; + } + if (whichcode == yeastmito) { + trans[0][3][2] = trp; + trans[1][0][2] = thr; + trans[2][0][2] = met; + } + n = 0; + for (b = ala; (long)b <= (long)val; b = (aas)((long)b + 1)) { + if (b != ser2) { + n++; + numaa[(long)b - (long)ala] = n; + } + } + numaa[(long)ser 
- (long)ala] = (long)ser1 - (long)(ala) + 1; +} /* code */ + + +void protdist_cats() +{ + /* define categories of amino acids: fills cat[] (indexed by amino acid minus ala) with subgroup numbers 1..8, then merges subgroups according to whichcat (george, chemical or hall) */ + aas b; + + /* fundamental subgroups */ + cat[0] = 1; /* for alanine */ + cat[(long)cys - (long)ala] = 1; + cat[(long)met - (long)ala] = 2; + cat[(long)val - (long)ala] = 3; + cat[(long)leu - (long)ala] = 3; + cat[(long)ileu - (long)ala] = 3; + cat[(long)gly - (long)ala] = 4; + cat[0] = 4; /* index 0 is alanine again: this later store overrides the 1 assigned above, so alanine ends up in subgroup 4 */ + cat[(long)ser - (long)ala] = 4; + cat[(long)thr - (long)ala] = 4; + cat[(long)pro - (long)ala] = 5; + cat[(long)phe - (long)ala] = 6; + cat[(long)tyr - (long)ala] = 6; + cat[(long)trp - (long)ala] = 6; + cat[(long)glu - (long)ala] = 7; + cat[(long)gln - (long)ala] = 7; + cat[(long)asp - (long)ala] = 7; + cat[(long)asn - (long)ala] = 7; + cat[(long)lys - (long)ala] = 8; + cat[(long)arg - (long)ala] = 8; + cat[(long)his - (long)ala] = 8; + if (whichcat == george) { + /* George, Hunt and Barker: sulfhydryl, small hydrophobic, small hydrophilic, + aromatic, acid/acid-amide/hydrophilic, basic */ + for (b = ala; (long)b <= (long)val; b = (aas)((long)b + 1)) { /* merge subgroup 3 into 2 and 5 into 4 */ + if (cat[(long)b - (long)ala] == 3) + cat[(long)b - (long)ala] = 2; + if (cat[(long)b - (long)ala] == 5) + cat[(long)b - (long)ala] = 4; + } + } + if (whichcat == chemical) { + /* Conn and Stumpf: monoamino, aliphatic, heterocyclic, + aromatic, dicarboxylic, basic */ + for (b = ala; (long)b <= (long)val; b = (aas)((long)b + 1)) { /* merge subgroup 2 into 1 and 4 into 3 */ + if (cat[(long)b - (long)ala] == 2) + cat[(long)b - (long)ala] = 1; + if (cat[(long)b - (long)ala] == 4) + cat[(long)b - (long)ala] = 3; + } + } + /* Ben Hall's personal opinion */ + if (whichcat != hall) + return; + for (b = ala; (long)b <= (long)val; b = (aas)((long)b + 1)) { /* hall: merge subgroup 3 into 2 only */ + if (cat[(long)b - (long)ala] == 3) + cat[(long)b - (long)ala] = 2; + } +} /* protdist_cats */ + + +void maketrans() +{ + /* Make up transition probability matrix from code and category tables */ + long i, j, k, m, n, s, nb1, nb2; + double x, sum; + long sub[3], newsub[3]; + double f[4], g[4]; + aas 
b1, b2; + double TEMP, TEMP1, TEMP2, TEMP3; + + for (i = 0; i <= 19; i++) { + pie[i] = 0.0; + for (j = 0; j <= 19; j++) + prob[i][j] = 0.0; + } + f[0] = freqt; + f[1] = freqc; + f[2] = freqa; + f[3] = freqg; + g[0] = freqc + freqt; + g[1] = freqc + freqt; + g[2] = freqa + freqg; + g[3] = freqa + freqg; + TEMP = f[0]; + TEMP1 = f[1]; + TEMP2 = f[2]; + TEMP3 = f[3]; + fracchange = xi * (2 * f[0] * f[1] / g[0] + 2 * f[2] * f[3] / g[2]) + + xv * (1 - TEMP * TEMP - TEMP1 * TEMP1 - TEMP2 * TEMP2 - TEMP3 * TEMP3); + sum = 0.0; + for (i = 0; i <= 3; i++) { + for (j = 0; j <= 3; j++) { + for (k = 0; k <= 3; k++) { + if (trans[i][j][k] != stop) + sum += f[i] * f[j] * f[k]; + } + } + } + for (i = 0; i <= 3; i++) { + sub[0] = i + 1; + for (j = 0; j <= 3; j++) { + sub[1] = j + 1; + for (k = 0; k <= 3; k++) { + sub[2] = k + 1; + b1 = trans[i][j][k]; + for (m = 0; m <= 2; m++) { + s = sub[m]; + for (n = 1; n <= 4; n++) { + memcpy(newsub, sub, sizeof(long) * 3L); + newsub[m] = n; + x = f[i] * f[j] * f[k] / (3.0 * sum); + if (((s == 1 || s == 2) && (n == 3 || n == 4)) || + ((n == 1 || n == 2) && (s == 3 || s == 4))) + x *= xv * f[n - 1]; + else + x *= xi * f[n - 1] / g[n - 1] + xv * f[n - 1]; + b2 = trans[newsub[0] - 1][newsub[1] - 1][newsub[2] - 1]; + if (b1 != stop) { + nb1 = numaa[(long)b1 - (long)ala]; + pie[nb1 - 1] += x; + if (b2 != stop) { + nb2 = numaa[(long)b2 - (long)ala]; + if (cat[(long)b1 - (long)ala] != cat[(long)b2 - (long)ala]) { + prob[nb1 - 1][nb2 - 1] += x * ease; + prob[nb1 - 1][nb1 - 1] += x * (1.0 - ease); + } else + prob[nb1 - 1][nb2 - 1] += x; + } else + prob[nb1 - 1][nb1 - 1] += x; + } + } + } + } + } + } + for (i = 0; i <= 19; i++) + prob[i][i] -= pie[i]; + for (i = 0; i <= 19; i++) { + for (j = 0; j <= 19; j++) + prob[i][j] /= sqrt(pie[i] * pie[j]); + } + /* computes pi^(1/2)*B*pi^(-1/2) */ +} /* maketrans */ + + +void givens(double (*a)[20], long i, long j, long n, double ctheta, + double stheta, boolean left) +{ /* Givens transform at i,j for 1..n with 
angle theta */ + long k; + double d; + + for (k = 0; k < n; k++) { + if (left) { /* left: rotate rows i and j (1-based) across columns 0..n-1 */ + d = ctheta * a[i - 1][k] + stheta * a[j - 1][k]; + a[j - 1][k] = ctheta * a[j - 1][k] - stheta * a[i - 1][k]; + a[i - 1][k] = d; + } else { /* otherwise rotate columns i and j across rows 0..n-1 */ + d = ctheta * a[k][i - 1] + stheta * a[k][j - 1]; + a[k][j - 1] = ctheta * a[k][j - 1] - stheta * a[k][i - 1]; + a[k][i - 1] = d; + } + } +} /* givens */ + + +void coeffs(double x, double y, double *c, double *s, double accuracy) +{ /* compute cosine and sine of theta: *c = x / root and *s = y / root with root = sqrt(x*x + y*y); when root < accuracy the pair (1, 0) is stored instead, so no division by a near-zero root happens */ + double root; + + root = sqrt(x * x + y * y); + if (root < accuracy) { + *c = 1.0; + *s = 0.0; + } else { + *c = x / root; + *s = y / root; + } +} /* coeffs */ + + +void tridiag(double (*a)[20], long n, double accuracy) +{ /* Givens tridiagonalization of the leading n x n part of a, in place; every rotation is also applied to the rows of the non-local eigvecs */ + long i, j; + double s, c; + + for (i = 2; i < n; i++) { + for (j = i + 1; j <= n; j++) { + coeffs(a[i - 2][i - 1], a[i - 2][j - 1], &c, &s,accuracy); + givens(a, i, j, n, c, s, true); + givens(a, i, j, n, c, s, false); + givens(eigvecs, i, j, n, c, s, true); + } + } +} /* tridiag */ + + +void shiftqr(double (*a)[20], long n, double accuracy) +{ /* QR eigenvalue-finder: for i = n down to 2, repeats shifted Givens sweeps over the leading i x i part of a until |a[i-1][i-2]| <= accuracy; the same rotations are applied to the rows of the non-local eigvecs. NOTE(review): the do-while has no iteration cap -- presumably convergence is assumed for the matrices passed in; confirm */ + long i, j; + double approx, s, c, d, TEMP, TEMP1; + + for (i = n; i >= 2; i--) { + do { + TEMP = a[i - 2][i - 2] - a[i - 1][i - 1]; + TEMP1 = a[i - 1][i - 2]; + d = sqrt(TEMP * TEMP + TEMP1 * TEMP1); + approx = a[i - 2][i - 2] + a[i - 1][i - 1]; + if (a[i - 1][i - 1] < a[i - 2][i - 2]) + approx = (approx - d) / 2.0; + else + approx = (approx + d) / 2.0; + for (j = 0; j < i; j++) + a[j][j] -= approx; /* subtract the shift from the diagonal */ + for (j = 1; j < i; j++) { + coeffs(a[j - 1][j - 1], a[j][j - 1], &c, &s, accuracy); + givens(a, j, j + 1, i, c, s, true); + givens(a, j, j + 1, i, c, s, false); + givens(eigvecs, j, j + 1, n, c, s, true); + } + for (j = 0; j < i; j++) + a[j][j] += approx; /* add the shift back */ + } while (fabs(a[i - 1][i - 2]) > accuracy); + } +} /* shiftqr */ + + +void qreigen(double (*prob)[20], long n) +{ /* QR eigenvector/eigenvalue method for symmetric matrix */ + double accuracy; + long i, j; 
+ + accuracy = 1.0e-6; + for (i = 0; i < n; i++) { /* start eigvecs as the n x n identity */ + for (j = 0; j < n; j++) + eigvecs[i][j] = 0.0; + eigvecs[i][i] = 1.0; + } + tridiag(prob, n, accuracy); + shiftqr(prob, n, accuracy); + for (i = 0; i < n; i++) + eig[i] = prob[i][i]; /* eigenvalues are read off the diagonal */ + for (i = 0; i <= 19; i++) { /* fixed 20 x 20 bounds, independent of n */ + for (j = 0; j <= 19; j++) + prob[i][j] = sqrt(pie[j]) * eigvecs[i][j]; + } + /* prob[i][j] is the value of U' times pi^(1/2) */ +} /* qreigen */ + + +void jtteigen() +{ /* eigenanalysis for JTT matrix, precomputed: copies the jttprobs and jtteigs tables into prob and eig and sets fracchange to 0.01 */ + memcpy(prob,jttprobs,sizeof(jttprobs)); + memcpy(eig,jtteigs,sizeof(jtteigs)); + fracchange = 0.01; +} /* jtteigen */ + + +void pmbeigen() +{ /* eigenanalysis for PMB matrix, precomputed: copies the pmbprobs and pmbeigs tables into prob and eig and sets fracchange to 1.0 */ + memcpy(prob,pmbprobs,sizeof(pmbprobs)); + memcpy(eig,pmbeigs,sizeof(pmbeigs)); + fracchange = 1.0; +} /* pmbeigen */ + + +void pameigen() +{ /* eigenanalysis for PAM matrix, precomputed: copies the pamprobs and pameigs tables into prob and eig and sets fracchange to 0.01 */ + memcpy(prob,pamprobs,sizeof(pamprobs)); + memcpy(eig,pameigs,sizeof(pameigs)); + fracchange = 0.01; +} /* pameigen */ + + +void predict(long nb1, long nb2, long cat) +{ /* make contribution to prediction of this aa pair: for amino-acid numbers nb1 and nb2 (1-based) and rate category cat (1-based), adds the probability at the current tt and its first and second derivatives with respect to tt to the non-local accumulators p, dp and d2p, which are not reset here */ + long m; + double TEMP; + + for (m = 0; m <= 19; m++) { + if (gama || invar) + elambdat = exp(-cvi*log(1.0-rate[cat-1]*tt*(eig[m]/(1.0-invarfrac))/cvi)); + else + elambdat = exp(rate[cat-1]*tt * eig[m]); + q = prob[m][nb1 - 1] * prob[m][nb2 - 1] * elambdat; + p += q; + if (!gama && !invar) + dp += rate[cat-1]*eig[m] * q; + else + dp += (rate[cat-1]*eig[m]/(1.0-rate[cat-1]*tt*(eig[m]/(1.0-invarfrac))/cvi)) * q; + TEMP = rate[cat-1] * eig[m]; /* the second derivative of exp(rate*tt*eig) is (rate*eig)^2 times it, consistent with the rate*eig factor used for dp; the rate was previously omitted here */ + if (!gama && !invar) + d2p += TEMP * TEMP * q; + else + d2p += (rate[cat-1]*rate[cat-1]*eig[m]*eig[m]*(1.0+1.0/cvi)/ + ((1.0-rate[cat-1]*tt*eig[m]/cvi) + *(1.0-rate[cat-1]*tt*eig[m]/cvi))) * q; /* NOTE(review): this denominator omits the 1/(1.0-invarfrac) factor that elambdat and dp apply to eig[m] -- confirm whether that is intended */ + } + if (nb1 == nb2) { /* identical residues: mix in the invariant fraction */ + p *= (1.0 - invarfrac); + p += invarfrac; + } + dp *= (1.0 - invarfrac); + d2p *= (1.0 - invarfrac); +} /* predict */ + +void makedists() +{ /* compute the distances */ + long i, j, k, m, n, itterations, nb1, nb2, cat; + double delta, lnlike, slope, 
curv; + boolean neginfinity, inf, overlap; + aas b1, b2; + + if (!(printdata || similarity)) + fprintf(outfile, "%5ld\n", spp); + if (progress) + printf("Computing distances:\n"); + for (i = 1; i <= spp; i++) { + if (progress) + printf(" "); + if (progress) { + for (j = 0; j < nmlngth; j++) + putchar(nayme[i - 1][j]); + } + if (progress) { + printf(" "); + fflush(stdout); + } + if (similarity) + d[i-1][i-1] = 1.0; + else + d[i-1][i-1] = 0.0; + for (j = 0; j <= i - 2; j++) { + if (!(kimura || similarity)) { + if (usejtt || usepmb || usepam) + tt = 0.1/fracchange; + else + tt = 1.0; + delta = tt / 2.0; + itterations = 0; + inf = false; + do { + lnlike = 0.0; + slope = 0.0; + curv = 0.0; + neginfinity = false; + overlap = false; + for (k = 0; k < chars; k++) { + if (oldweight[k] > 0) { + cat = category[k]; + b1 = gnode[i - 1][k]; + b2 = gnode[j][k]; + if (b1 != stop && b1 != del && b1 != quest && b1 != unk && + b2 != stop && b2 != del && b2 != quest && b2 != unk) { + overlap = true; + p = 0.0; + dp = 0.0; + d2p = 0.0; + nb1 = numaa[(long)b1 - (long)ala]; + nb2 = numaa[(long)b2 - (long)ala]; + if (b1 != asx && b1 != glx && b2 != asx && b2 != glx) + predict(nb1, nb2, cat); + else { + if (b1 == asx) { + if (b2 == asx) { + predict(3L, 3L, cat); + predict(3L, 4L, cat); + predict(4L, 3L, cat); + predict(4L, 4L, cat); + } else { + if (b2 == glx) { + predict(3L, 6L, cat); + predict(3L, 7L, cat); + predict(4L, 6L, cat); + predict(4L, 7L, cat); + } else { + predict(3L, nb2, cat); + predict(4L, nb2, cat); + } + } + } else { + if (b1 == glx) { + if (b2 == asx) { + predict(6L, 3L, cat); + predict(6L, 4L, cat); + predict(7L, 3L, cat); + predict(7L, 4L, cat); + } else { + if (b2 == glx) { + predict(6L, 6L, cat); + predict(6L, 7L, cat); + predict(7L, 6L, cat); + predict(7L, 7L, cat); + } else { + predict(6L, nb2, cat); + predict(7L, nb2, cat); + } + } + } else { + if (b2 == asx) { + predict(nb1, 3L, cat); + predict(nb1, 4L, cat); + predict(nb1, 3L, cat); + predict(nb1, 4L, cat); + } 
else if (b2 == glx) { + predict(nb1, 6L, cat); + predict(nb1, 7L, cat); + predict(nb1, 6L, cat); /* NOTE(review): the 6/7 pair is evaluated twice here, whereas the b1 == glx branch makes one pair of calls for an unambiguous b2 -- confirm intended */ + predict(nb1, 7L, cat); + } + } + } + } + if (p <= 0.0) + neginfinity = true; + else { + lnlike += oldweight[k]*log(p); + slope += oldweight[k]*dp / p; + curv += oldweight[k]*(d2p / p - dp * dp / (p * p)); + } + } + } + } + itterations++; + if (!overlap){ /* no site usable in both sequences: write the -1.0 sentinel and stop iterating */ + printf("\nWARNING: NO OVERLAP BETWEEN SEQUENCES %ld AND %ld; -1.0 WAS WRITTEN\n", i, j+1); + tt = -1.0/fracchange; + itterations = 20; + inf = true; + } else if (!neginfinity) { + if (curv < 0.0) { /* negative curvature: take a Newton step on the log-likelihood */ + tt -= slope / curv; + if (tt > 10000.0) { + printf("\nWARNING: INFINITE DISTANCE BETWEEN SPECIES %ld AND %ld; -1.0 WAS WRITTEN\n", i, j+1); + tt = -1.0/fracchange; + inf = true; + itterations = 20; + } + } + else { /* otherwise step by delta, halving and reversing it when the slope points the other way */ + if ((slope > 0.0 && delta < 0.0) || (slope < 0.0 && delta > 0.0)) + delta /= -2; + tt += delta; + } + } else { + delta /= -2; + tt += delta; + } + if (tt < protepsilon && !inf) + tt = protepsilon; + } while (itterations != 20); /* runs 20 passes unless a warning branch forces the counter to 20 earlier */ + } else { + m = 0; + n = 0; + for (k = 0; k < chars; k++) { /* n counts sites where both residues are comparable, m those where they are equal */ + b1 = gnode[i - 1][k]; + b2 = gnode[j][k]; + if ((((long)b1 <= (long)val) || ((long)b1 == (long)ser)) + && (((long)b2 <= (long)val) || ((long)b2 == (long)ser))) { + if (b1 == b2) + m++; + n++; + } + } + p = (n > 0) ? 1 - (double)m / n : 1.0; /* n == 0 (no comparable sites) used to give 0.0/0 = NaN in the output; it is now treated as maximal difference, so Kimura takes its "too large" path and writes -1.0 and similarity writes 0.0 */ + if (kimura) { + dp = 1.0 - p - 0.2 * p * p; + if (dp < 0.0) { + printf( +"\nDISTANCE BETWEEN SEQUENCES %3ld AND %3ld IS TOO LARGE FOR KIMURA FORMULA\n", + i, j + 1); + tt = -1.0; + } else + tt = -log(dp); + } else { /* if similarity */ + tt = 1.0 - p; + } + } + d[i - 1][j] = fracchange * tt; + d[j][i - 1] = d[i - 1][j]; /* keep the matrix symmetric */ + if (progress) { + putchar('.'); + fflush(stdout); + } + } + if (progress) { + putchar('\n'); + fflush(stdout); + } + } + if (!similarity) { + for (i = 0; i < spp; i++) { + for (j = 0; j < nmlngth; j++) + putc(nayme[i][j], outfile); + k = spp; + for (j = 1; j <= k; j++) { + fprintf(outfile, "%10.6f", d[i][j - 1]); + if ((j + 1) % 7 == 0 && j < k) + putc('\n', outfile); + } + 
putc('\n', outfile); + } + } else { + for (i = 0; i < spp; i += 6) { /* similarity table is written in blocks of up to six columns */ + if ((i+6) < spp) + n = i+6; + else + n = spp; + fprintf(outfile, " "); + for (j = i; j < n ; j++) { /* column headings: first nmlngth-2 characters of each name plus two spaces */ + for (k = 0; k < (nmlngth-2); k++) + putc(nayme[j][k], outfile); + putc(' ', outfile); + putc(' ', outfile); + } + putc('\n', outfile); + for (j = 0; j < spp; j++) { + for (k = 0; k < nmlngth; k++) + putc(nayme[j][k], outfile); + if ((i+6) < spp) + n = i+6; + else + n = spp; + for (k = i; k < n ; k++) + fprintf(outfile, "%10.6f", d[j][k]); + putc('\n', outfile); + } + putc('\n', outfile); + } + } + if (progress) + printf("\nOutput written to file \"%s\"\n\n", outfilename); +} /* makedists */ + + +int main(int argc, Char *argv[]) +{ /* ML Protein distances by PMB, JTT, PAM or categories model: opens the input and output files, runs doinit(), prepares the model selected by the usejtt/usepmb/usepam/kimura/similarity flags, then calls doinput() and makedists() once per data set. NOTE(review): catfile and weightfile are opened here but only outfile and infile are closed in this function -- confirm they are closed elsewhere */ +#ifdef MAC + argc = 1; /* macsetup("Protdist",""); */ + argv[0] = "Protdist"; +#endif + init(argc, argv); + openfile(&infile,INFILE,"input file","r",argv[0],infilename); + openfile(&outfile,OUTFILE,"output file","w",argv[0],outfilename); + ibmpc = IBMCRT; + ansi = ANSICRT; + mulsets = false; + datasets = 1; + firstset = true; + doinit(); + if (!(kimura || similarity)) /* the codon table is not built for kimura or similarity */ + code(); + if (!(usejtt || usepmb || usepam || kimura || similarity)) { /* categories model: build the matrix and eigen-decompose it */ + protdist_cats(); + maketrans(); + qreigen(prob, 20L); + } else { + if (kimura || similarity) + fracchange = 1.0; + else { /* precomputed eigen tables: JTT, then PMB, else PAM */ + if (usejtt) + jtteigen(); + else { + if (usepmb) + pmbeigen(); + else + pameigen(); + } + } + } + if (ctgry) + openfile(&catfile,CATFILE,"categories file","r",argv[0],catfilename); + if (weights || justwts) + openfile(&weightfile,WEIGHTFILE,"weights file","r",argv[0],weightfilename); + for (ith = 1; ith <= datasets; ith++) { + doinput(); + if (ith == 1) + firstset = false; /* cleared after the first data set has been read */ + if ((datasets > 1) && progress) + printf("\nData set # %ld:\n\n", ith); + makedists(); + } + FClose(outfile); + FClose(infile); +#ifdef MAC + fixmacfile(outfilename); +#endif + return 0; +} /* Protein distances */ + diff --git 
a/forester/archive/RIO/others/phylip_mod/src/protpars.c b/forester/archive/RIO/others/phylip_mod/src/protpars.c new file mode 100644 index 0000000..6020255 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/protpars.c @@ -0,0 +1,1925 @@ + +#include "phylip.h" +#include "seq.h" + +/* version 3.6. (c) Copyright 1993-2004 by the University of Washington. + Written by Joseph Felsenstein, Akiko Fuseki, Sean Lamont, and Andrew Keeffe. + Permission is granted to copy and use this program provided no fee is + charged for it and provided that this copyright notice is not removed. */ + +#define maxtrees 100 /* maximum number of tied trees stored */ + +typedef enum { + universal, ciliate, mito, vertmito, flymito, yeastmito +} codetype; + +/* nodes will form a binary tree */ + +typedef struct gseq { + seqptr seq; + struct gseq *next; +} gseq; + +#ifndef OLDC +/* function prototypes */ +void protgnu(gseq **); +void protchuck(gseq *); +void code(void); +void setup(void); +void getoptions(void); +void protalloctree(void); +void allocrest(void); +void doinit(void); +void protinputdata(void); + +void protmakevalues(void); +void doinput(void); +void protfillin(node *, node *, node *); +void protpreorder(node *); +void protadd(node *, node *, node *); +void protre_move(node **, node **); +void evaluate(node *); +void protpostorder(node *); +void protreroot(node *); +void protsavetraverse(node *, long *, boolean *); + +void protsavetree(long *, boolean *); +void tryadd(node *, node **, node **); +void addpreorder(node *, node *, node *); +void tryrearr(node *, boolean *); +void repreorder(node *, boolean *); +void rearrange(node **); +void protgetch(Char *); +void protaddelement(node **, long *, long *, boolean *); +void prottreeread(void); +void protancestset(long *, long *, long *, long *, long *); + +void prothyprint(long , long , boolean *, node *, boolean *, boolean *); +void prothyptrav(node *, sitearray *, long, long, long *, boolean *, + sitearray); +void 
prothypstates(long *); +void describe(void); +void maketree(void); +void reallocnode(node* p); +void reallocchars(void); +/* function prototypes */ +#endif + + + +Char infilename[FNMLNGTH], outfilename[FNMLNGTH], intreename[FNMLNGTH], outtreename[FNMLNGTH], weightfilename[FNMLNGTH]; +node *root; +long chars, col, msets, ith, njumble, jumb; +/* chars = number of sites in actual sequences */ +long inseed, inseed0; +boolean jumble, usertree, weights, thresh, trout, progress, stepbox, + justwts, ancseq, mulsets, firstset; +codetype whichcode; +long fullset, fulldel; +pointarray treenode; /* pointers to all nodes in tree */ +double threshold; +steptr threshwt; +longer seed; +long *enterorder; +sitearray translate[(long)quest - (long)ala + 1]; +aas trans[4][4][4]; +long **fsteps; +bestelm *bestrees; +boolean dummy; +gseq *garbage; +node *temp, *temp1; +Char ch; +aas tmpa; +char *progname; + +/* Local variables for maketree, propagated globally for c version: */ +long minwhich; +double like, bestyet, bestlike, minsteps, bstlike2; +boolean lastrearr, recompute; +node *there; +double nsteps[maxuser]; +long *place; +boolean *names; + + +void protgnu(gseq **p) +{ + /* this and the following are do-it-yourself garbage collectors. 
+ Make a new node or pull one off the garbage list */ + if (garbage != NULL) { + *p = garbage; + free((*p)->seq); + (*p)->seq = (seqptr)Malloc(chars*sizeof(sitearray)); + garbage = garbage->next; + } else { + *p = (gseq *)Malloc(sizeof(gseq)); + (*p)->seq = (seqptr)Malloc(chars*sizeof(sitearray)); + } + (*p)->next = NULL; +} /* protgnu */ + + +void protchuck(gseq *p) +{ + /* collect garbage on p -- put it on front of garbage list */ + p->next = garbage; + garbage = p; +} /* protchuck */ + + +void code() +{ + /* make up table of the code 1 = u, 2 = c, 3 = a, 4 = g */ + trans[0][0][0] = phe; + trans[0][0][1] = phe; + trans[0][0][2] = leu; + trans[0][0][3] = leu; + trans[0][1][0] = ser1; + trans[0][1][1] = ser1; + trans[0][1][2] = ser1; + trans[0][1][3] = ser1; + trans[0][2][0] = tyr; + trans[0][2][1] = tyr; + trans[0][2][2] = stop; + trans[0][2][3] = stop; + trans[0][3][0] = cys; + trans[0][3][1] = cys; + trans[0][3][2] = stop; + trans[0][3][3] = trp; + trans[1][0][0] = leu; + trans[1][0][1] = leu; + trans[1][0][2] = leu; + trans[1][0][3] = leu; + trans[1][1][0] = pro; + trans[1][1][1] = pro; + trans[1][1][2] = pro; + trans[1][1][3] = pro; + trans[1][2][0] = his; + trans[1][2][1] = his; + trans[1][2][2] = gln; + trans[1][2][3] = gln; + trans[1][3][0] = arg; + trans[1][3][1] = arg; + trans[1][3][2] = arg; + trans[1][3][3] = arg; + trans[2][0][0] = ileu; + trans[2][0][1] = ileu; + trans[2][0][2] = ileu; + trans[2][0][3] = met; + trans[2][1][0] = thr; + trans[2][1][1] = thr; + trans[2][1][2] = thr; + trans[2][1][3] = thr; + trans[2][2][0] = asn; + trans[2][2][1] = asn; + trans[2][2][2] = lys; + trans[2][2][3] = lys; + trans[2][3][0] = ser2; + trans[2][3][1] = ser2; + trans[2][3][2] = arg; + trans[2][3][3] = arg; + trans[3][0][0] = val; + trans[3][0][1] = val; + trans[3][0][2] = val; + trans[3][0][3] = val; + trans[3][1][0] = ala; + trans[3][1][1] = ala; + trans[3][1][2] = ala; + trans[3][1][3] = ala; + trans[3][2][0] = asp; + trans[3][2][1] = asp; + trans[3][2][2] = 
glu; + trans[3][2][3] = glu; + trans[3][3][0] = gly; + trans[3][3][1] = gly; + trans[3][3][2] = gly; + trans[3][3][3] = gly; + if (whichcode == mito) + trans[0][3][2] = trp; + if (whichcode == vertmito) { + trans[0][3][2] = trp; + trans[2][3][2] = stop; + trans[2][3][3] = stop; + trans[2][0][2] = met; + } + if (whichcode == flymito) { + trans[0][3][2] = trp; + trans[2][0][2] = met; + trans[2][3][2] = ser2; + } + if (whichcode == yeastmito) { + trans[0][3][2] = trp; + trans[1][0][2] = thr; + trans[2][0][2] = met; + } +} /* code */ + + +void setup() +{ + /* set up set table to get aasets from aas */ + aas a, b; + long i, j, k, l, s; + + for (a = ala; (long)a <= (long)stop; a = (aas)((long)a + 1)) { + translate[(long)a - (long)ala][0] = 1L << ((long)a); + translate[(long)a - (long)ala][1] = 1L << ((long)a); + } + for (i = 0; i <= 3; i++) { + for (j = 0; j <= 3; j++) { + for (k = 0; k <= 3; k++) { + for (l = 0; l <= 3; l++) { + translate[(long)trans[i][j][k]][1] |= (1L << (long)trans[l][j][k]); + translate[(long)trans[i][j][k]][1] |= (1L << (long)trans[i][l][k]); + translate[(long)trans[i][j][k]][1] |= (1L << (long)trans[i][j][l]); + } + } + } + } + translate[(long)del - (long)ala][1] = 1L << ((long)del); + fulldel = (1L << ((long)stop + 1)) - (1L << ((long)ala)); + fullset = fulldel & (~(1L << ((long)del))); + translate[(long)asx - (long)ala][0] + = (1L << ((long)asn)) | (1L << ((long)asp)); + translate[(long)glx - (long)ala][0] + = (1L << ((long)gln)) | (1L << ((long)glu)); + translate[(long)ser - (long)ala][0] + = (1L << ((long)ser1)) | (1L << ((long)ser2)); + translate[(long)unk - (long)ala][0] = fullset; + translate[(long)quest - (long)ala][0] = fulldel; + translate[(long)asx - (long)ala][1] = translate[(long)asn - (long)ala][1] + | translate[(long)asp - (long)ala][1]; + translate[(long)glx - (long)ala][1] = translate[(long)gln - (long)ala][1] + | translate[(long)glu - (long)ala][1]; + translate[(long)ser - (long)ala][1] = translate[(long)ser1 - (long)ala][1] + | 
translate[(long)ser2 - (long)ala][1]; + translate[(long)unk - (long)ala][1] = fullset; + translate[(long)quest - (long)ala][1] = fulldel; + for (a = ala; (long)a <= (long)quest; a = (aas)((long)a + 1)) { + s = 0; + for (b = ala; (long)b <= (long)stop; b = (aas)((long)b + 1)) { + if (((1L << ((long)b)) & translate[(long)a - (long)ala][1]) != 0) + s |= translate[(long)b - (long)ala][1]; + } + translate[(long)a - (long)ala][2] = s; + } +} /* setup */ + + +void getoptions() +{ + /* interactively set options */ + long loopcount, loopcount2; + Char ch, ch2; + + fprintf(outfile, "\nProtein parsimony algorithm, version %s\n\n",VERSION); + putchar('\n'); + jumble = false; + njumble = 1; + outgrno = 1; + outgropt = false; + thresh = false; + trout = true; + usertree = false; + weights = false; + whichcode = universal; + printdata = false; + progress = true; + treeprint = true; + stepbox = false; + ancseq = false; + dotdiff = true; + interleaved = true; + loopcount = 0; + for (;;) { + cleerhome(); + printf("\nProtein parsimony algorithm, version %s\n\n",VERSION); + printf("Setting for this run:\n"); + printf(" U Search for best tree? %s\n", + (usertree ? "No, use user trees in input file" : "Yes")); + if (!usertree) { + printf(" J Randomize input order of sequences?"); + if (jumble) + printf(" Yes (seed =%8ld,%3ld times)\n", inseed0, njumble); + else + printf(" No. Use input order\n"); + } + printf(" O Outgroup root?"); + if (outgropt) + printf(" Yes, at sequence number%3ld\n", outgrno); + else + printf(" No, use as outgroup species%3ld\n", outgrno); + printf(" T Use Threshold parsimony?"); + if (thresh) + printf(" Yes, count steps up to%4.1f per site\n", threshold); + else + printf(" No, use ordinary parsimony\n"); + printf(" C Use which genetic code? %s\n", + (whichcode == universal) ? "Universal" : + (whichcode == ciliate) ? "Ciliate" : + (whichcode == mito) ? "Universal mitochondrial" : + (whichcode == vertmito) ? "Vertebrate mitochondrial" : + (whichcode == flymito) ? 
"Fly mitochondrial" : + (whichcode == yeastmito) ? "Yeast mitochondrial" : ""); + printf(" W Sites weighted? %s\n", + (weights ? "Yes" : "No")); + printf(" M Analyze multiple data sets?"); + if (mulsets) + printf(" Yes, %2ld %s\n", msets, + (justwts ? "sets of weights" : "data sets")); + else + printf(" No\n"); + printf(" I Input sequences interleaved? %s\n", + (interleaved ? "Yes" : "No")); + printf(" 0 Terminal type (IBM PC, ANSI, none)? %s\n", + (ibmpc ? "IBM PC" : ansi ? "ANSI" : "(none)")); + printf(" 1 Print out the data at start of run %s\n", + (printdata ? "Yes" : "No")); + printf(" 2 Print indications of progress of run %s\n", + (progress ? "Yes" : "No")); + printf(" 3 Print out tree %s\n", + (treeprint ? "Yes" : "No")); + printf(" 4 Print out steps in each site %s\n", + (stepbox ? "Yes" : "No")); + printf(" 5 Print sequences at all nodes of tree %s\n", + (ancseq ? "Yes" : "No")); + if (ancseq || printdata) + printf(" . Use dot-differencing to display them %s\n", + dotdiff ? "Yes" : "No"); + printf(" 6 Write out trees onto tree file? %s\n", + (trout ? "Yes" : "No")); + if(weights && justwts){ + printf( + "WARNING: W option and Multiple Weights options are both on. "); + printf( + "The W menu option is unnecessary and has no additional effect. \n"); + } + printf( + "\nAre these settings correct? 
(type Y or the letter for one to change)\n"); + scanf("%c%*[^\n]", &ch); + getchar(); + uppercase(&ch); + if (ch == 'Y') + break; + if (strchr("WCJOTUMI12345.60",ch) != NULL) { + switch (ch) { + + case 'J': + jumble = !jumble; + if (jumble) + initjumble(&inseed, &inseed0, seed, &njumble); + else njumble = 1; + break; + + case 'W': + weights = !weights; + break; + + case 'O': + outgropt = !outgropt; + if (outgropt) + initoutgroup(&outgrno, spp); + else outgrno = 1; + break; + + case 'T': + thresh = !thresh; + if (thresh) + initthreshold(&threshold); + break; + + case 'C': + printf("\nWhich genetic code?\n"); + printf(" type for\n\n"); + printf(" U Universal\n"); + printf(" M Mitochondrial\n"); + printf(" V Vertebrate mitochondrial\n"); + printf(" F Fly mitochondrial\n"); + printf(" Y Yeast mitochondrial\n\n"); + loopcount2 = 0; + do { + printf("type U, M, V, F, or Y\n"); + scanf("%c%*[^\n]", &ch); + getchar(); + if (ch == '\n') + ch = ' '; + uppercase(&ch); + countup(&loopcount2, 10); + } while (ch != 'U' && ch != 'M' && ch != 'V' + && ch != 'F' && ch != 'Y'); + switch (ch) { + + case 'U': + whichcode = universal; + break; + + case 'M': + whichcode = mito; + break; + + case 'V': + whichcode = vertmito; + break; + + case 'F': + whichcode = flymito; + break; + + case 'Y': + whichcode = yeastmito; + break; + } + break; + + case 'M': + mulsets = !mulsets; + if (mulsets){ + printf("Multiple data sets or multiple weights?"); + loopcount2 = 0; + do { + printf(" (type D or W)\n"); +#ifdef WIN32 + phyFillScreenColor(); +#endif + scanf("%c%*[^\n]", &ch2); + getchar(); + if (ch2 == '\n') + ch2 = ' '; + uppercase(&ch2); + countup(&loopcount2, 10); + } while ((ch2 != 'W') && (ch2 != 'D')); + justwts = (ch2 == 'W'); + if (justwts) + justweights(&msets); + else + initdatasets(&msets); + if (!jumble) { + jumble = true; + initjumble(&inseed, &inseed0, seed, &njumble); + } + } + break; + + case 'I': + interleaved = !interleaved; + break; + + case 'U': + usertree = !usertree; + break; 
+ + case '0': + initterminal(&ibmpc, &ansi); + break; + + case '1': + printdata = !printdata; + break; + + case '2': + progress = !progress; + break; + + case '3': + treeprint = !treeprint; + break; + + case '4': + stepbox = !stepbox; + break; + + case '5': + ancseq = !ancseq; + break; + + case '.': + dotdiff = !dotdiff; + break; + + case '6': + trout = !trout; + break; + } + } else + printf("Not a possible option!\n"); + countup(&loopcount, 100); + } +} /* getoptions */


void protalloctree()
{
  /* Build the global treenode table.  Slots 0 .. spp-1 each hold a
     single node; slots spp .. nonodes-1 each hold one member of a
     circular list of three nodes chained through ->next.  Every node
     created here gets numsteps, siteset and seq arrays of chars
     entries. */
  long slot, member;
  node *fresh, *prev;

  treenode = (pointarray)Malloc(nonodes*sizeof(node *));

  for (slot = 0; slot < spp; slot++) {
    fresh = (node *)Malloc(sizeof(node));
    treenode[slot] = fresh;
    fresh->numsteps = (steptr)Malloc(chars*sizeof(long));
    fresh->siteset = (seqptr)Malloc(chars*sizeof(sitearray));
    fresh->seq = (aas *)Malloc(chars*sizeof(aas));
  }

  for (slot = spp; slot < nonodes; slot++) {
    prev = NULL;
    for (member = 0; member < 3; member++) {
      fresh = (node *)Malloc(sizeof(node));
      fresh->numsteps = (steptr)Malloc(chars*sizeof(long));
      fresh->siteset = (seqptr)Malloc(chars*sizeof(sitearray));
      fresh->seq = (aas *)Malloc(chars*sizeof(aas));
      fresh->next = prev;
      prev = fresh;
    }
    /* fresh is the node allocated last; the node allocated first still
       has next == NULL, so point it back at fresh to close the ring */
    fresh->next->next->next = fresh;
    treenode[slot] = fresh;
  }
}  /* protalloctree */


void reallocnode(node* p)
{
  /* Throw away the three per-site arrays of p and allocate new ones
     sized for the current value of chars.  Old contents are lost. */
  free(p->numsteps);
  free(p->siteset);
  free(p->seq);

  p->numsteps = (steptr)Malloc(chars*sizeof(long));
  p->siteset  = (seqptr)Malloc(chars*sizeof(sitearray));
  p->seq      = (aas *)Malloc(chars*sizeof(aas));
}  /* reallocnode */


void reallocchars(void)
{
  /* Re-create every array whose length is chars: the fsteps rows (only
     when usertree is set), the arrays of every node reachable from
     treenode, weight, threshwt, and the arrays of the scratch nodes
     temp and temp1.
     Open question carried over from the original comment: does the
     garbage list need reallocating too? */
  long i;
  node *member;

  if (usertree) {
    for (i = 0; i < maxuser; i++) {
      free(fsteps[i]);
      fsteps[i] = (long *)Malloc(chars*sizeof(long));
    }
  }

  for (i = 0; i < nonodes; i++) {
    reallocnode(treenode[i]);
    if (i < spp)
      continue;
    /* slots from spp upward are rings: visit the remaining members */
    for (member = treenode[i]->next; member != treenode[i];
         member = member->next)
      reallocnode(member);
  }

  free(weight);
  free(threshwt);
  free(temp->numsteps);
  free(temp->siteset);
  free(temp->seq);
  free(temp1->numsteps);
  free(temp1->siteset);
  free(temp1->seq);

  weight          = (steptr)Malloc(chars*sizeof(long));
  threshwt        = (steptr)Malloc(chars*sizeof(long));
  temp->numsteps  = (steptr)Malloc(chars*sizeof(long));
  temp->siteset   = (seqptr)Malloc(chars*sizeof(sitearray));
  temp->seq       = (aas *)Malloc(chars*sizeof(aas));
  temp1->numsteps = (steptr)Malloc(chars*sizeof(long));
  temp1->siteset  = (seqptr)Malloc(chars*sizeof(sitearray));
  temp1->seq      = (aas *)Malloc(chars*sizeof(aas));
}  /* reallocchars */


void allocrest()
{
  /* Allocate the remaining global work areas: fsteps (only when
     usertree is set), bestrees and its btree rows, nayme, enterorder,
     place, weight, threshwt and the two scratch nodes temp and temp1
     together with their per-site arrays. */
  long i;

  if (usertree) {
    fsteps = (long **)Malloc(maxuser*sizeof(long *));
    for (i = 0; i < maxuser; i++)
      fsteps[i] = (long *)Malloc(chars*sizeof(long));
  }

  bestrees = (bestelm *)Malloc(maxtrees*sizeof(bestelm));
  for (i = 0; i < maxtrees; i++)
    bestrees[i].btree = (long *)Malloc(spp*sizeof(long));

  nayme      = (naym *)Malloc(spp*sizeof(naym));
  enterorder = (long *)Malloc(spp*sizeof(long));
  place      = (long *)Malloc(nonodes*sizeof(long));
  weight     = (steptr)Malloc(chars*sizeof(long));
  threshwt   = (steptr)Malloc(chars*sizeof(long));

  temp = (node *)Malloc(sizeof(node));
  temp->numsteps = (steptr)Malloc(chars*sizeof(long));
  temp->siteset  = (seqptr)Malloc(chars*sizeof(sitearray));
  temp->seq      = (aas *)Malloc(chars*sizeof(aas));

  temp1 = (node *)Malloc(sizeof(node));
  temp1->numsteps = (steptr)Malloc(chars*sizeof(long));
  temp1->siteset  = (seqptr)Malloc(chars*sizeof(sitearray));
  temp1->seq      = (aas *)Malloc(chars*sizeof(aas));
}  /* allocrest */
+ + +void doinit() +{ + /* initializes variables */ + + inputnumbers(&spp, &chars, &nonodes, 1); + getoptions(); + if (printdata) + fprintf(outfile, "%2ld species, %3ld sites\n\n", spp, chars); + protalloctree(); + allocrest(); +} /* doinit*/ + + +void protinputdata() +{ + /* input the names and sequences for each species */ + long i, j, k, l, aasread, aasnew = 0; + Char charstate; + boolean allread, done; + aas aa; /* temporary amino acid for input */ + + if (printdata) + headings(chars, "Sequences", "---------"); + aasread = 0; + allread = false; + while (!(allread)) { + /* eat white space -- if the separator line has spaces on it*/ + do { + charstate = gettc(infile); + } while (charstate == ' ' || charstate == '\t'); + ungetc(charstate, infile); + if (eoln(infile)) { + scan_eoln(infile); + } + i = 1; + while (i <= spp) { + if ((interleaved && aasread == 0) || !interleaved) + initname(i - 1); + j = interleaved ? aasread : 0; + done = false; + while (!done && !eoff(infile)) { + if (interleaved) + done = true; + while (j < chars && !(eoln(infile) || eoff(infile))) { + charstate = gettc(infile); + if (charstate == '\n' || charstate == '\t') + charstate = ' '; + if (charstate == ' ' || (charstate >= '0' && charstate <= '9')) + continue; + uppercase(&charstate); + if ((!isalpha(charstate) && charstate != '?' && + charstate != '-' && charstate != '*') || charstate == 'J' || + charstate == 'O' || charstate == 'U') { + printf("WARNING -- BAD AMINO ACID:%c",charstate); + printf(" AT POSITION%5ld OF SPECIES %3ld\n",j,i); + exxit(-1); + } + j++; + aa = (charstate == 'A') ? ala : + (charstate == 'B') ? asx : + (charstate == 'C') ? cys : + (charstate == 'D') ? asp : + (charstate == 'E') ? glu : + (charstate == 'F') ? phe : + (charstate == 'G') ? gly : aa; + aa = (charstate == 'H') ? his : + (charstate == 'I') ? ileu : + (charstate == 'K') ? lys : + (charstate == 'L') ? leu : + (charstate == 'M') ? met : + (charstate == 'N') ? asn : + (charstate == 'P') ? 
pro : + (charstate == 'Q') ? gln : + (charstate == 'R') ? arg : aa; + aa = (charstate == 'S') ? ser : + (charstate == 'T') ? thr : + (charstate == 'V') ? val : + (charstate == 'W') ? trp : + (charstate == 'X') ? unk : + (charstate == 'Y') ? tyr : + (charstate == 'Z') ? glx : + (charstate == '*') ? stop : + (charstate == '?') ? quest: + (charstate == '-') ? del : aa; + + treenode[i - 1]->seq[j - 1] = aa; + memcpy(treenode[i - 1]->siteset[j - 1], + translate[(long)aa - (long)ala], sizeof(sitearray)); + } + if (interleaved) + continue; + if (j < chars) + scan_eoln(infile); + else if (j == chars) + done = true; + } + if (interleaved && i == 1) + aasnew = j; + scan_eoln(infile); + if ((interleaved && j != aasnew) || ((!interleaved) && j != chars)){ + printf("ERROR: SEQUENCES OUT OF ALIGNMENT\n"); + exxit(-1);} + i++; + } + if (interleaved) { + aasread = aasnew; + allread = (aasread == chars); + } else + allread = (i > spp); + } + if (printdata) { + for (i = 1; i <= ((chars - 1) / 60 + 1); i++) { + for (j = 1; j <= (spp); j++) { + for (k = 0; k < nmlngth; k++) + putc(nayme[j - 1][k], outfile); + fprintf(outfile, " "); + l = i * 60; + if (l > chars) + l = chars; + for (k = (i - 1) * 60 + 1; k <= l; k++) { + if (j > 1 && treenode[j - 1]->seq[k - 1] == treenode[0]->seq[k - 1]) + charstate = '.'; + else { + tmpa = treenode[j-1]->seq[k-1]; + charstate = (tmpa == ala) ? 'A' : + (tmpa == asx) ? 'B' : + (tmpa == cys) ? 'C' : + (tmpa == asp) ? 'D' : + (tmpa == glu) ? 'E' : + (tmpa == phe) ? 'F' : + (tmpa == gly) ? 'G' : + (tmpa == his) ? 'H' : + (tmpa ==ileu) ? 'I' : + (tmpa == lys) ? 'K' : + (tmpa == leu) ? 'L' : charstate; + charstate = (tmpa == met) ? 'M' : + (tmpa == asn) ? 'N' : + (tmpa == pro) ? 'P' : + (tmpa == gln) ? 'Q' : + (tmpa == arg) ? 'R' : + (tmpa == ser) ? 'S' : + (tmpa ==ser1) ? 'S' : + (tmpa ==ser2) ? 'S' : charstate; + charstate = (tmpa == thr) ? 'T' : + (tmpa == val) ? 'V' : + (tmpa == trp) ? 'W' : + (tmpa == unk) ? 'X' : + (tmpa == tyr) ? 
'Y' : + (tmpa == glx) ? 'Z' : + (tmpa == del) ? '-' : + (tmpa ==stop) ? '*' : + (tmpa==quest) ? '?' : charstate; + } + putc(charstate, outfile); + if (k % 10 == 0 && k % 60 != 0) + putc(' ', outfile); + } + putc('\n', outfile); + } + putc('\n', outfile); + } + putc('\n', outfile); + } + putc('\n', outfile); +} /* protinputdata */


void protmakevalues()
{
  /* Reset the bookkeeping fields of every node reachable from treenode:
     back is cleared, tip is true only for the first spp entries, index
     is the 1-based slot number, and all chars step counts are zeroed.
     For slots above spp the other members of the ring get the same
     treatment (with tip = false). */
  long num, site;
  node *head, *q;

  for (num = 1; num <= nonodes; num++) {
    head = treenode[num - 1];
    head->back = NULL;
    head->tip = (num <= spp);
    head->index = num;
    for (site = 0; site < chars; site++)
      head->numsteps[site] = 0;

    if (num <= spp)
      continue;

    for (q = head->next; q != head; q = q->next) {
      q->back = NULL;
      q->tip = false;
      q->index = num;
      for (site = 0; site < chars; site++)
        q->numsteps[site] = 0;
    }
  }
}  /* protmakevalues */


void doinput()
{
  /* Read the input for the current data set (or weight set), then
     derive the scaled weights and per-site thresholds and reset the
     tree nodes. */
  long site;

  if (!justwts) {
    /* a full data set: later sets may have a different number of sites */
    if (!firstset) {
      samenumsp(&chars, ith);
      reallocchars();
    }
    for (site = 0; site < chars; site++)
      weight[site] = 1;
    if (weights) {
      inputweights(chars, weight, &weights);
    }
    /* tested again on purpose: inputweights receives &weights */
    if (weights)
      printweights(outfile, 0, chars, weight, "Sites");
    protinputdata();
  } else {
    /* only the weights change; sequences are read with the first set */
    if (firstset)
      protinputdata();
    for (site = 0; site < chars; site++)
      weight[site] = 1;
    inputweights(chars, weight, &weights);
    if (justwts) {
      fprintf(outfile, "\n\nWeights set # %ld:\n\n", ith);
      if (progress)
        printf("\nWeights set # %ld:\n\n", ith);
    }
    if (printdata)
      printweights(outfile, 0, chars, weight, "Sites");
  }

  if (!thresh)
    threshold = spp * 3.0;

  /* weights are kept multiplied by 10; the threshold is rounded to long */
  for (site = 0; site < chars; site++) {
    weight[site] *= 10;
    threshwt[site] = (long)(threshold * weight[site] + 0.5);
  }

  protmakevalues();
}  /* doinput */


+void protfillin(node *p, node *left, node *rt) +{ + /* sets up for each node in the tree the aa set for site m + at that point and counts the changes. 
The program + spends much of its time in this function */ + boolean counted, done; + aas aa; + long s = 0; + sitearray ls, rs, qs; + long i, j, m, n; + + for (m = 0; m < chars; m++) { + if (left != NULL) + memcpy(ls, left->siteset[m], sizeof(sitearray)); + if (rt != NULL) + memcpy(rs, rt->siteset[m], sizeof(sitearray)); + if (left == NULL) { + n = rt->numsteps[m]; + memcpy(qs, rs, sizeof(sitearray)); + } + else if (rt == NULL) { + n = left->numsteps[m]; + memcpy(qs, ls, sizeof(sitearray)); + } + else { + n = left->numsteps[m] + rt->numsteps[m]; + if ((ls[0] == rs[0]) && (ls[1] == rs[1]) && (ls[2] == rs[2])) { + qs[0] = ls[0]; + qs[1] = ls[1]; + qs[2] = ls[2]; + } + else { + counted = false; + for (i = 0; (!counted) && (i <= 3); i++) { + switch (i) { + + case 0: + s = ls[0] & rs[0]; + break; + + case 1: + s = (ls[0] & rs[1]) | (ls[1] & rs[0]); + break; + + case 2: + s = (ls[0] & rs[2]) | (ls[1] & rs[1]) | (ls[2] & rs[0]); + break; + + case 3: + s = ls[0] | (ls[1] & rs[2]) | (ls[2] & rs[1]) | rs[0]; + break; + + } + if (s != 0) { + qs[0] = s; + counted = true; + } else + n += weight[m]; + } + switch (i) { + case 1: + qs[1] = qs[0] | (ls[0] & rs[1]) | (ls[1] & rs[0]); + qs[2] = qs[1] | (ls[0] & rs[2]) | (ls[1] & rs[1]) | (ls[2] & rs[0]); + break; + case 2: + qs[1] = qs[0] | (ls[0] & rs[2]) | (ls[1] & rs[1]) | (ls[2] & rs[0]); + qs[2] = qs[1] | ls[0] | (ls[1] & rs[2]) | (ls[2] & rs[1]) | rs[0]; + break; + case 3: + qs[1] = qs[0] | ls[0] | (ls[1] & rs[2]) | (ls[2] & rs[1]) | rs[0]; + qs[2] = qs[1] | ls[1] | (ls[2] & rs[2]) | rs[1]; + break; + case 4: + qs[1] = qs[0] | ls[1] | (ls[2] & rs[2]) | rs[1]; + qs[2] = qs[1] | ls[2] | rs[2]; + break; + } + for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1)) { + done = false; + for (i = 0; (!done) && (i <= 1); i++) { + if (((1L << ((long)aa)) & qs[i]) != 0) { + for (j = i+1; j <= 2; j++) + qs[j] |= translate[(long)aa - (long)ala][j-i]; + done = true; + } + } + } + } + } + p->numsteps[m] = n; + memcpy(p->siteset[m], 
qs, sizeof(sitearray)); + } +} /* protfillin */ + + +void protpreorder(node *p) +{ + /* recompute number of steps in preorder taking both ancestoral and + descendent steps into account */ + if (p != NULL && !p->tip) { + protfillin (p->next, p->next->next->back, p->back); + protfillin (p->next->next, p->back, p->next->back); + protpreorder (p->next->back); + protpreorder (p->next->next->back); + } +} /* protpreorder */ + + +void protadd(node *below, node *newtip, node *newfork) +{ + /* inserts the nodes newfork and its left descendant, newtip, + to the tree. below becomes newfork's right descendant */ + + if (below != treenode[below->index - 1]) + below = treenode[below->index - 1]; + if (below->back != NULL) + below->back->back = newfork; + newfork->back = below->back; + below->back = newfork->next->next; + newfork->next->next->back = below; + newfork->next->back = newtip; + newtip->back = newfork->next; + if (root == below) + root = newfork; + root->back = NULL; + + if (recompute) { + protfillin (newfork, newfork->next->back, newfork->next->next->back); + protpreorder(newfork); + if (newfork != root) + protpreorder(newfork->back); + } +} /* protadd */ + + +void protre_move(node **item, node **fork) +{ + /* removes nodes item and its ancestor, fork, from the tree. + the new descendant of fork's ancestor is made to be + fork's second descendant (other than item). 
Also + returns pointers to the deleted nodes, item and fork */ + node *p, *q, *other; + + if ((*item)->back == NULL) { + *fork = NULL; + return; + } + *fork = treenode[(*item)->back->index - 1]; + if ((*item) == (*fork)->next->back) + other = (*fork)->next->next->back; + else other = (*fork)->next->back; + if (root == *fork) + root = other; + p = (*item)->back->next->back; + q = (*item)->back->next->next->back; + if (p != NULL) p->back = q; + if (q != NULL) q->back = p; + (*fork)->back = NULL; + p = (*fork)->next; + do { + p->back = NULL; + p = p->next; + } while (p != (*fork)); + (*item)->back = NULL; + if (recompute) { + protpreorder(other); + if (other != root) protpreorder(other->back); + } +} /* protre_move */


void evaluate(node *r)
{
  /* Add up the step counts stored at r, site by site, with each site
     capped at threshwt[site], and store the negated total in like.
     While user trees are being scored (usertree set and which within
     maxuser) the per-site terms go into fsteps, the total into nsteps,
     and minwhich/minsteps track the smallest total seen so far. */
  long site, capped;
  double total = 0.0;
  boolean keep = (usertree && which <= maxuser);

  for (site = 0; site < chars; site++) {
    capped = (r->numsteps[site] <= threshwt[site]) ? r->numsteps[site]
                                                   : threshwt[site];
    total += capped;
    if (keep)
      fsteps[which - 1][site] = capped;
  }

  if (keep) {
    nsteps[which - 1] = total;
    /* the first tree always starts the running minimum */
    if (which == 1 || total < minsteps) {
      minwhich = which;
      minsteps = total;
    }
  }

  like = -total;
}  /* evaluate */


void protpostorder(node *p)
{
  /* Bottom-up pass: both descendants of a fork are filled in before the
     fork itself is filled in from them.  Tips need no work. */
  node *first, *second;

  if (p->tip)
    return;

  first = p->next;
  second = first->next;
  protpostorder(first->back);
  protpostorder(second->back);
  protfillin(p, first->back, second->back);
}  /* protpostorder */


+void protreroot(node *outgroup) +{ + /* reorients tree, putting outgroup in desired position. 
*/ + node *p, *q; + + if (outgroup->back->index == root->index) + return; + p = root->next; + q = root->next->next; + p->back->back = q->back; + q->back->back = p->back; + p->back = outgroup; + q->back = outgroup->back; + outgroup->back->back = q; + outgroup->back = p; +} /* protreroot */ + + +void protsavetraverse(node *p, long *pos, boolean *found) +{ + /* sets BOOLEANs that indicate which way is down */ + p->bottom = true; + if (p->tip) + return; + p->next->bottom = false; + protsavetraverse(p->next->back, pos,found); + p->next->next->bottom = false; + protsavetraverse(p->next->next->back, pos,found); +} /* protsavetraverse */ + + +void protsavetree(long *pos, boolean *found) +{ + /* record in place where each species has to be + added to reconstruct this tree */ + long i, j; + node *p; + boolean done; + + protreroot(treenode[outgrno - 1]); + protsavetraverse(root, pos,found); + for (i = 0; i < (nonodes); i++) + place[i] = 0; + place[root->index - 1] = 1; + for (i = 1; i <= (spp); i++) { + p = treenode[i - 1]; + while (place[p->index - 1] == 0) { + place[p->index - 1] = i; + while (!p->bottom) + p = p->next; + p = p->back; + } + if (i > 1) { + place[i - 1] = place[p->index - 1]; + j = place[p->index - 1]; + done = false; + while (!done) { + place[p->index - 1] = spp + i - 1; + while (!p->bottom) + p = p->next; + p = p->back; + done = (p == NULL); + if (!done) + done = (place[p->index - 1] != j); + } + } + } +} /* protsavetree */ + + +void tryadd(node *p, node **item, node **nufork) +{ + /* temporarily adds one fork and one tip to the tree. 
+ if the location where they are added yields greater + "likelihood" than other locations tested up to that + time, then keeps that location as there */ + long pos; + boolean found; + node *rute, *q; + + if (p == root) + protfillin(temp, *item, p); + else { + protfillin(temp1, *item, p); + protfillin(temp, temp1, p->back); + } + evaluate(temp); + if (lastrearr) { + if (like < bestlike) { + if ((*item) == (*nufork)->next->next->back) { + q = (*nufork)->next; + (*nufork)->next = (*nufork)->next->next; + (*nufork)->next->next = q; + q->next = (*nufork); + } + } + else if (like >= bstlike2) { + recompute = false; + protadd(p, (*item), (*nufork)); + rute = root->next->back; + protsavetree(&pos,&found); + protreroot(rute); + if (like > bstlike2) { + bestlike = bstlike2 = like; + pos = 1; + nextree = 1; + addtree(pos, &nextree, dummy, place, bestrees); + } else { + pos = 0; + findtree(&found, &pos, nextree, place, bestrees); + if (!found) { + if (nextree <= maxtrees) + addtree(pos, &nextree, dummy, place, bestrees); + } + } + protre_move (item, nufork); + recompute = true; + } + } + if (like >= bestyet) { + bestyet = like; + there = p; + } +} /* tryadd */


void addpreorder(node *p, node *item, node *nufork)
{
  /* Preorder walk that offers every node p of the subtree to tryadd as
     a candidate attachment point for item/nufork; p is tried before
     its descendants.  A NULL p ends the walk. */

  if (p == NULL)
    return;

  tryadd(p, &item,&nufork);
  if (p->tip)
    return;

  addpreorder(p->next->back, item, nufork);
  addpreorder(p->next->next->back, item, nufork);
}  /* addpreorder */


void tryrearr(node *p, boolean *success)
{
  /* Score one local rearrangement: p, together with the fork just below
     it, is moved to the branch below that fork's own ancestor.  The
     candidate is scored in the scratch nodes temp/temp1 without
     touching the tree.  If it beats bestyet the move is carried out,
     *success is set and bestyet updated; otherwise the tree keeps its
     shape.  Nothing is attempted when p or its fork has no ancestor. */
  node *frombelow, *whereto, *forknode, *q;
  double oldlike;

  if (p->back == NULL)
    return;
  forknode = treenode[p->back->index - 1];
  if (forknode->back == NULL)
    return;

  oldlike = bestyet;

  /* the descendant of forknode that is not p */
  frombelow = (p->back->next->next == forknode)
                ? forknode->next->next->back
                : forknode->next->back;

  /* the fork below forknode and its descendant that is not forknode */
  whereto = treenode[forknode->back->index - 1];
  q = (whereto->next->back == forknode)
        ? whereto->next->next->back
        : whereto->next->back;

  protfillin(temp1, frombelow, q);
  protfillin(temp, temp1, p);
  protfillin(temp1, temp, whereto->back);
  evaluate(temp1);

  if (like > oldlike) {
    /* better: really detach p with its fork and re-attach at whereto */
    recompute = false;
    protre_move(&p, &forknode);
    protfillin(whereto, whereto->next->back, whereto->next->next->back);
    recompute = true;
    protadd(whereto, p, forknode);
    *success = true;
    bestyet = like;
    return;
  }

  /* not better: when p hangs off the second ring slot, exchange the two
     non-head members of forknode's ring */
  if (p == forknode->next->next->back) {
    q = forknode->next;
    forknode->next = forknode->next->next;
    forknode->next->next = q;
    q->next = forknode;
  }
}  /* tryrearr */


void repreorder(node *p, boolean *success)
{
  /* Preorder walk applying tryrearr to p and then to everything below
     it.  A NULL p ends the walk. */
  if (p == NULL)
    return;

  tryrearr(p,success);
  if (p->tip)
    return;

  repreorder(p->next->back,success);
  repreorder(p->next->next->back,success);
}  /* repreorder */


+void rearrange(node **r) +{ + /* traverses the tree (preorder), finding any local + rearrangement which decreases the number of steps. 
+ if traversal succeeds in increasing the tree's + "likelihood", PROCEDURE rearrange runs traversal again */ + boolean success = true; + while (success) { + success = false; + repreorder(*r, &success); + } +} /* rearrange */ + + +void protgetch(Char *c) +{ + /* get next nonblank character */ + do { + if (eoln(intree)) + scan_eoln(intree); + *c = gettc(intree); + if (*c == '\n' || *c == '\t') + *c = ' '; + } while (!(*c != ' ' || eoff(intree))); +} /* protgetch */ + + +void protaddelement(node **p,long *nextnode,long *lparens,boolean *names) +{ + /* recursive procedure adds nodes to user-defined tree */ + node *q; + long i, n; + boolean found; + Char str[nmlngth]; + + protgetch(&ch); + + if (ch == '(' ) { + if ((*lparens) >= spp - 1) { + printf("\nERROR IN USER TREE: TOO MANY LEFT PARENTHESES\n"); + exxit(-1); + } + (*nextnode)++; + (*lparens)++; + q = treenode[(*nextnode) - 1]; + protaddelement(&q->next->back, nextnode,lparens,names); + q->next->back->back = q->next; + findch(',', &ch, which); + protaddelement(&q->next->next->back, nextnode,lparens,names); + q->next->next->back->back = q->next->next; + findch(')', &ch, which); + *p = q; + return; + } + for (i = 0; i < nmlngth; i++) + str[i] = ' '; + n = 1; + do { + if (ch == '_') + ch = ' '; + str[n - 1] = ch; + if (eoln(intree)) + scan_eoln(intree); + ch = gettc(intree); + n++; + } while (ch != ',' && ch != ')' && ch != ':' && n <= nmlngth); + n = 1; + do { + found = true; + for (i = 0; i < nmlngth; i++) + found = (found && ((str[i] == nayme[n - 1][i]) || + ((nayme[n - 1][i] == '_') && (str[i] == ' ')))); + if (found) { + if (names[n - 1] == false) { + *p = treenode[n - 1]; + names[n - 1] = true; + } else { + printf("\nERROR IN USER TREE: DUPLICATE NAME FOUND -- "); + for (i = 0; i < nmlngth; i++) + putchar(nayme[n - 1][i]); + putchar('\n'); + exxit(-1); + } + } else + n++; + } while (!(n > spp || found)); + if (n <= spp) + return; + printf("CANNOT FIND SPECIES: "); + for (i = 0; i < nmlngth; i++) + 
putchar(str[i]); + putchar('\n'); +} /* protaddelement */ + + +void prottreeread() +{ + /* read in user-defined tree and set it up */ + long nextnode, lparens, i; + + root = treenode[spp]; + nextnode = spp; + root->back = NULL; + names = (boolean *)Malloc(spp*sizeof(boolean)); + for (i = 0; i < (spp); i++) + names[i] = false; + lparens = 0; + protaddelement(&root, &nextnode,&lparens,names); + if (ch == '[') { + do + ch = gettc(intree); + while (ch != ']'); + ch = gettc(intree); + } + findch(';', &ch, which); + scan_eoln(intree); + free(names); +} /* prottreeread */ + + +void protancestset(long *a, long *b, long *c, long *d, long *k) +{ + /* sets up the aa set array. */ + aas aa; + long s, sa, sb; + long i, j, m, n; + boolean counted; + + counted = false; + *k = 0; + for (i = 0; i <= 5; i++) { + if (*k < 3) { + s = 0; + if (i > 3) + n = i - 3; + else + n = 0; + for (j = n; j <= (i - n); j++) { + if (j < 3) + sa = a[j]; + else + sa = fullset; + for (m = n; m <= (i - j - n); m++) { + if (m < 3) + sb = sa & b[m]; + else + sb = sa; + if (i - j - m < 3) + sb &= c[i - j - m]; + s |= sb; + } + } + if (counted || s != 0) { + d[*k] = s; + (*k)++; + counted = true; + } + } + } + for (i = 0; i <= 1; i++) { + for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1)) { + if (((1L << ((long)aa)) & d[i]) != 0) { + for (j = i + 1; j <= 2; j++) + d[j] |= translate[(long)aa - (long)ala][j - i]; + } + } + } +} /* protancestset */ + + +void prothyprint(long b1, long b2, boolean *bottom, node *r, + boolean *nonzero, boolean *maybe) +{ + /* print out states in sites b1 through b2 at node */ + long i; + boolean dot; + Char ch = 0; + aas aa; + + if (*bottom) { + if (!outgropt) + fprintf(outfile, " "); + else + fprintf(outfile, "root "); + } else + fprintf(outfile, "%3ld ", r->back->index - spp); + if (r->tip) { + for (i = 0; i < nmlngth; i++) + putc(nayme[r->index - 1][i], outfile); + } else + fprintf(outfile, "%4ld ", r->index - spp); + if (*bottom) + fprintf(outfile, " "); + else 
if (*nonzero) + fprintf(outfile, " yes "); + else if (*maybe) + fprintf(outfile, " maybe "); + else + fprintf(outfile, " no "); + for (i = b1 - 1; i < b2; i++) { + aa = r->seq[i]; + switch (aa) { + + case ala: + ch = 'A'; + break; + + case asx: + ch = 'B'; + break; + + case cys: + ch = 'C'; + break; + + case asp: + ch = 'D'; + break; + + case glu: + ch = 'E'; + break; + + case phe: + ch = 'F'; + break; + + case gly: + ch = 'G'; + break; + + case his: + ch = 'H'; + break; + + case ileu: + ch = 'I'; + break; + + case lys: + ch = 'K'; + break; + + case leu: + ch = 'L'; + break; + + case met: + ch = 'M'; + break; + + case asn: + ch = 'N'; + break; + + case pro: + ch = 'P'; + break; + + case gln: + ch = 'Q'; + break; + + case arg: + ch = 'R'; + break; + + case ser: + ch = 'S'; + break; + + case ser1: + ch = 'S'; + break; + + case ser2: + ch = 'S'; + break; + + case thr: + ch = 'T'; + break; + + case trp: + ch = 'W'; + break; + + case tyr: + ch = 'Y'; + break; + + case val: + ch = 'V'; + break; + + case glx: + ch = 'Z'; + break; + + case del: + ch = '-'; + break; + + case stop: + ch = '*'; + break; + + case unk: + ch = 'X'; + break; + + case quest: + ch = '?'; + break; + } + if (!(*bottom) && dotdiff) + dot = (r->siteset[i] [0] == treenode[r->back->index - 1]->siteset[i][0] + || ((r->siteset[i][0] & + (~((1L << ((long)ser1)) | (1L << ((long)ser2)) | + (1L << ((long)ser))))) == 0 && + (treenode[r->back->index - 1]->siteset[i] [0] & + (~((1L << ((long)ser1)) | (1L << ((long)ser2)) | + (1L << ((long)ser))))) == 0)); + else + dot = false; + if (dot) + putc('.', outfile); + else + putc(ch, outfile); + if ((i + 1) % 10 == 0) + putc(' ', outfile); + } + putc('\n', outfile); +} /* prothyprint */ + + +void prothyptrav(node *r, sitearray *hypset, long b1, long b2, long *k, + boolean *bottom, sitearray nothing) +{ + boolean maybe, nonzero; + long i; + aas aa; + long anc = 0, hset; + gseq *ancset, *temparray; + + protgnu(&ancset); + protgnu(&temparray); + maybe = false; + nonzero = 
false; + for (i = b1 - 1; i < b2; i++) { + if (!r->tip) { + protancestset(hypset[i], r->next->back->siteset[i], + r->next->next->back->siteset[i], temparray->seq[i], k); + memcpy(r->siteset[i], temparray->seq[i], sizeof(sitearray)); + } + if (!(*bottom)) + anc = treenode[r->back->index - 1]->siteset[i][0]; + if (!r->tip) { + hset = r->siteset[i][0]; + r->seq[i] = quest; + for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1)) { + if (hset == 1L << ((long)aa)) + r->seq[i] = aa; + } + if (hset == ((1L << ((long)asn)) | (1L << ((long)asp)))) + r->seq[i] = asx; + if (hset == ((1L << ((long)gln)) | (1L << ((long)gly)))) + r->seq[i] = glx; + if (hset == ((1L << ((long)ser1)) | (1L << ((long)ser2)))) + r->seq[i] = ser; + if (hset == fullset) + r->seq[i] = unk; + } + nonzero = (nonzero || (r->siteset[i][0] & anc) == 0); + maybe = (maybe || r->siteset[i][0] != anc); + } + prothyprint(b1, b2,bottom,r,&nonzero,&maybe); + *bottom = false; + if (!r->tip) { + memcpy(temparray->seq, r->next->back->siteset, chars*sizeof(sitearray)); + for (i = b1 - 1; i < b2; i++) + protancestset(hypset[i], r->next->next->back->siteset[i], nothing, + ancset->seq[i], k); + prothyptrav(r->next->back, ancset->seq, b1, b2,k,bottom,nothing ); + for (i = b1 - 1; i < b2; i++) + protancestset(hypset[i], temparray->seq[i], nothing, ancset->seq[i],k); + prothyptrav(r->next->next->back, ancset->seq, b1, b2, k,bottom,nothing); + } + protchuck(temparray); + protchuck(ancset); +} /* prothyptrav */ + + +void prothypstates(long *k) +{ + /* fill in and describe states at interior nodes */ + boolean bottom; + sitearray nothing; + long i, n; + seqptr hypset; + + fprintf(outfile, "\nFrom To Any Steps? State at upper node\n"); + fprintf(outfile, " "); + fprintf(outfile, "( . 
means same as in the node below it on tree)\n\n"); + memcpy(nothing, translate[(long)quest - (long)ala], sizeof(sitearray)); + hypset = (seqptr)Malloc(chars*sizeof(sitearray)); + for (i = 0; i < (chars); i++) + memcpy(hypset[i], nothing, sizeof(sitearray)); + bottom = true; + for (i = 1; i <= ((chars - 1) / 40 + 1); i++) { + putc('\n', outfile); + n = i * 40; + if (n > chars) + n = chars; + bottom = true; + prothyptrav(root, hypset, i * 40 - 39, n, k,&bottom,nothing); + } + free(hypset); +} /* prothypstates */ + + +void describe() +{ + /* prints ancestors, steps and table of numbers of steps in + each site */ + long i,j,k; + + if (treeprint) + fprintf(outfile, "\nrequires a total of %10.3f\n", like / -10); + if (stepbox) { + putc('\n', outfile); + if (weights) + fprintf(outfile, "weighted "); + fprintf(outfile, "steps in each position:\n"); + fprintf(outfile, " "); + for (i = 0; i <= 9; i++) + fprintf(outfile, "%4ld", i); + fprintf(outfile, "\n *-----------------------------------------\n"); + for (i = 0; i <= (chars / 10); i++) { + fprintf(outfile, "%5ld", i * 10); + putc('!', outfile); + for (j = 0; j <= 9; j++) { + k = i * 10 + j; + if (k == 0 || k > chars) + fprintf(outfile, " "); + else + fprintf(outfile, "%4ld", root->numsteps[k - 1] / 10); + } + putc('\n', outfile); + } + } + if (ancseq) { + prothypstates(&k); + putc('\n', outfile); + } + putc('\n', outfile); + if (trout) { + col = 0; + treeout(root, nextree, &col, root); + } +} /* describe */ + + +void maketree() +{ + /* constructs a binary tree from the pointers in treenode. 
adds each node at location which yields highest "likelihood" + then rearranges the tree for greatest "likelihood" */ + long i, j, numtrees; + double gotlike; + node *item, *nufork, *dummy; + + if (!usertree) { /* no user tree: build one by adding species in the order held in enterorder (1..spp, shuffled by randumize when jumble is set) */ + for (i = 1; i <= (spp); i++) + enterorder[i - 1] = i; + if (jumble) + randumize(seed, enterorder); + root = treenode[enterorder[0] - 1]; + recompute = true; + protadd(treenode[enterorder[0] - 1], treenode[enterorder[1] - 1], + treenode[spp]); + if (progress) { + printf("\nAdding species:\n"); + writename(0, 2, enterorder); + } + lastrearr = false; + for (i = 3; i <= (spp); i++) { /* add the remaining species one at a time: addpreorder is given the tree and the new item, then protadd attaches the item at the global position there, followed by a rearrange call */ + bestyet = -30.0*spp*chars; + there = root; + item = treenode[enterorder[i - 1] - 1]; + nufork = treenode[spp + i - 2]; + addpreorder(root, item, nufork); + protadd(there, item, nufork); + like = bestyet; + rearrange(&root); + if (progress) + writename(i - 1, 1, enterorder); + lastrearr = (i == spp); + if (lastrearr) { /* after the last species has been added, run global rearrangement passes */ + if (progress) { + printf("\nDoing global rearrangements\n"); + printf(" !"); + for (j = 1; j <= nonodes; j++) + if ( j % (( nonodes / 72 ) + 1 ) == 0 ) + putchar('-'); + printf("!\n"); + } + bestlike = bestyet; + if (jumb == 1) { + bstlike2 = bestlike = -30.0*spp*chars; + nextree = 1; + } + do { /* each pass removes every non-root node in turn and re-adds it; passes repeat while bestlike keeps improving on the value saved in gotlike */ + if (progress) + printf(" "); + gotlike = bestlike; + for (j = 0; j < (nonodes); j++) { + bestyet = -30.0*spp*chars; + item = treenode[j]; + if (item != root) { + nufork = treenode[treenode[j]->back->index - 1]; + protre_move(&item, &nufork); + there = root; + addpreorder(root, item, nufork); + protadd(there, item, nufork); + } + if (progress) { + if ( j % (( nonodes / 72 ) + 1 ) == 0 ) + putchar('.'); + fflush(stdout); + } + } + if (progress) + putchar('\n'); + } while (bestlike > gotlike); + } + } + if (progress) + putchar('\n'); + for (i = spp - 1; i >= 1; i--) /* take the tree apart again, removing tips spp down to 2 */ + protre_move(&treenode[i], &dummy); + if (jumb == njumble) { /* output happens only on the last jumble */ + if (treeprint) { + putc('\n', outfile); + if (nextree == 2) + fprintf(outfile, "One most parsimonious tree found:\n"); + else + fprintf(outfile, 
"%6ld trees in all found\n", nextree - 1); + } + if (nextree > maxtrees + 1) { + if (treeprint) + fprintf(outfile, "here are the first%4ld of them\n", (long)maxtrees); + nextree = maxtrees + 1; + } + if (treeprint) + putc('\n', outfile); + recompute = false; + for (i = 0; i <= (nextree - 2); i++) { /* rebuild each saved tree from bestrees[i].btree, reroot, evaluate, print and describe it, then take it apart again */ + root = treenode[0]; + protadd(treenode[0], treenode[1], treenode[spp]); + for (j = 3; j <= (spp); j++) + protadd(treenode[bestrees[i].btree[j - 1] - 1], treenode[j - 1], + treenode[spp + j - 2]); + protreroot(treenode[outgrno - 1]); + protpostorder(root); + evaluate(root); + printree(root, 1.0); + describe(); + for (j = 1; j < (spp); j++) + protre_move(&treenode[j], &dummy); + } + } + } else { /* user trees: open the input tree file, count trees with countsemic, then read, evaluate, print and describe each one */ + openfile(&intree,INTREE,"input tree file", "r",progname,intreename); + numtrees = countsemic(&intree); + if (treeprint) { + fprintf(outfile, "User-defined tree"); + if (numtrees > 1) + putc('s', outfile); + fprintf(outfile, ":\n\n\n\n"); + } + which = 1; + while (which <= numtrees) { + prottreeread(); + if (outgropt) + protreroot(treenode[outgrno - 1]); + protpostorder(root); + evaluate(root); + printree(root, 1.0); + describe(); + which++; + } + printf("\n"); + FClose(intree); + putc('\n', outfile); + if (numtrees > 1 && chars > 1 ) + standev(chars, numtrees, minwhich, minsteps, nsteps, fsteps, seed); + } + if (jumb == njumble && progress) { + printf("Output written to file \"%s\"\n\n", outfilename); + if (trout) + printf("Trees also written onto file \"%s\"\n\n", outtreename); + } +} /* maketree */ + + +int main(int argc, Char *argv[]) +{ /* Protein parsimony by uphill search */ /* entry point: opens the input and output files, initialises global settings, then processes each data set (see the loop further down) */ +#ifdef MAC + argc = 1; /* macsetup("Protpars",""); */ + argv[0] = "Protpars"; +#endif + init(argc,argv); + progname = argv[0]; + openfile(&infile,INFILE,"input file", "r",argv[0],infilename); + openfile(&outfile,OUTFILE,"output file", "w",argv[0],outfilename); + + ibmpc = IBMCRT; + ansi = ANSICRT; + garbage = NULL; + mulsets = false; + msets = 1; + firstset = true; + code(); + setup(); + doinit(); + 
if (weights || justwts) + openfile(&weightfile,WEIGHTFILE,"weights file","r",argv[0],weightfilename); + if (trout) + openfile(&outtree,OUTTREE,"output tree file", "w",argv[0],outtreename); + for (ith = 1; ith <= msets; ith++) { /* one iteration per data set; each reads its input with doinput and then calls maketree once per jumble */ + doinput(); + if (ith == 1) + firstset = false; + if (msets > 1 && !justwts) { + fprintf(outfile, "Data set # %ld:\n\n",ith); + if (progress) + printf("Data set # %ld:\n\n",ith); + } + for (jumb = 1; jumb <= njumble; jumb++) + maketree(); + } + FClose(infile); + FClose(outfile); + FClose(outtree); /* NOTE(review): outtree is opened above only when trout is set, yet it is closed unconditionally -- confirm that FClose tolerates a stream that was never opened */ +#ifdef MAC + fixmacfile(outfilename); + fixmacfile(outtreename); +#endif + return 0; +} /* Protein parsimony by uphill search */ diff --git a/forester/archive/RIO/others/phylip_mod/src/seq.c b/forester/archive/RIO/others/phylip_mod/src/seq.c new file mode 100644 index 0000000..ab0c7d9 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/seq.c @@ -0,0 +1,4178 @@ + +#include "phylip.h" +#include "seq.h" + +/* version 3.6. (c) Copyright 1993-2004 by the University of Washington. + Written by Joseph Felsenstein, Akiko Fuseki, Sean Lamont, and Andrew Keeffe. + Permission is granted to copy and use this program provided no fee is + charged for it and provided that this copyright notice is not removed. 
*/ + +long nonodes, endsite, outgrno, nextree, which; +boolean interleaved, printdata, outgropt, treeprint, dotdiff, transvp; +steptr weight, category, alias, location, ally; +sequence y; + + +void fix_x(node* p,long site, double maxx, long rcategs) +{ /* dnaml dnamlk: add log(maxx) to p->underflows[site], then divide every entry of p->x[site] (rate categories 0..rcategs-1, bases A through T) by maxx */ + long i,j; + p->underflows[site] += log(maxx); + + for ( i = 0 ; i < rcategs ; i++ ) { + for ( j = 0 ; j < ((long)T - (long)A + 1) ; j++) + p->x[site][i][j] /= maxx; + } +} /* fix_x */ + + +void fix_protx(node* p,long site, double maxx, long rcategs) +{ /* proml promlk: protein counterpart of fix_x; adds log(maxx) to p->underflows[site] and divides the 20 entries (indices 0..19) of p->protx[site] in each of the rcategs rate categories by maxx */ + long i,m; + + p->underflows[site] += log(maxx); + + for ( i = 0 ; i < rcategs ; i++ ) + for (m = 0; m <= 19; m++) + p->protx[site][i][m] /= maxx; +} /* fix_protx */ + + +void alloctemp(node **temp, long *zeros, long endsite) +{ + /* used in dnacomp and dnapenny: allocate a node into *temp together with its numsteps, base and numnuc arrays (endsite entries each); base and numsteps are initialised by copying endsite longs from zeros and numnuc is cleared through zeronumnuc. zeros presumably holds at least endsite zero values -- confirm with callers. */ + *temp = (node *)Malloc(sizeof(node)); + (*temp)->numsteps = (steptr)Malloc(endsite*sizeof(long)); + (*temp)->base = (baseptr)Malloc(endsite*sizeof(long)); + (*temp)->numnuc = (nucarray *)Malloc(endsite*sizeof(nucarray)); + memcpy((*temp)->base, zeros, endsite*sizeof(long)); + memcpy((*temp)->numsteps, zeros, endsite*sizeof(long)); + zeronumnuc(*temp, endsite); +} /* alloctemp */ + + +void freetemp(node **temp) +{ + /* used in dnacomp, dnapars, & dnapenny: free the numsteps, base and numnuc arrays of *temp and then the node itself; *temp is not reset, so the caller is left holding a dangling pointer */ + free((*temp)->numsteps); + free((*temp)->base); + free((*temp)->numnuc); + free(*temp); +} /* freetemp */ + + +void freetree2 (pointarray treenode, long nonodes) +{ + /* The natural complement to alloctree2. 
Free all elements of all + the rings (normally triads) in treenode */ + long i; + node *p, *q; + + /* The first spp elements are just nodes, not rings */ + for (i = 0; i < spp; i++) + free (treenode[i]); + + /* The rest are rings */ + for (i = spp; i < nonodes; i++) { + p = treenode[i]->next; + while (p != treenode[i]) { /* q saves the successor before p is freed */ + q = p->next; + free (p); + p = q; + } + /* p should now point to treenode[i], which has yet to be freed */ + free (p); + } + free (treenode); +} /* freetree2 */ + + +void inputdata(long chars) +{ + /* input the names and sequences for each species: reads spp sequences of chars sites from infile into the global y, in interleaved or sequential layout according to the global interleaved. Blanks, tabs, newlines and digits are skipped; every other character is uppercased and must be one of ABCDGHKMNRSTUVWXY?O- or the program exits through exxit(-1). A sequence whose length does not line up also exits. When printdata is set the data are echoed to outfile afterwards. */ + /* used by dnacomp, dnadist, dnainvar, dnaml, dnamlk, dnapars, & dnapenny */ + long i, j, k, l, basesread, basesnew=0; + Char charstate; + boolean allread, done; + + if (printdata) + headings(chars, "Sequences", "---------"); + basesread = 0; + allread = false; + while (!(allread)) { + /* eat white space -- if the separator line has spaces on it*/ + do { + charstate = gettc(infile); + } while (charstate == ' ' || charstate == '\t'); + ungetc(charstate, infile); + if (eoln(infile)) + scan_eoln(infile); + i = 1; + while (i <= spp) { /* one species per iteration; names are read on every pass when sequential but only on the first block when interleaved */ + if ((interleaved && basesread == 0) || !interleaved) + initname(i-1); + j = (interleaved) ? basesread : 0; + done = false; + while (!done && !eoff(infile)) { /* interleaved input reads a single line here; sequential input keeps reading lines until chars sites are stored */ + if (interleaved) + done = true; + while (j < chars && !(eoln(infile) || eoff(infile))) { + charstate = gettc(infile); + if (charstate == '\n' || charstate == '\t') + charstate = ' '; + if (charstate == ' ' || (charstate >= '0' && charstate <= '9')) + continue; + uppercase(&charstate); + if ((strchr("ABCDGHKMNRSTUVWXY?O-",charstate)) == NULL){ + printf("ERROR: bad base: %c at site %5ld of species %3ld\n", + charstate, j+1, i); + if (charstate == '.') { + printf(" Periods (.) 
may not be used as gap characters.\n"); + printf(" The correct gap character is (-)\n"); + } + exxit(-1); + } + j++; + y[i - 1][j - 1] = charstate; + } + if (interleaved) + continue; + if (j < chars) + scan_eoln(infile); + else if (j == chars) + done = true; + } + if (interleaved && i == 1) /* the first species fixes how many sites this interleaved block must reach */ + basesnew = j; + + scan_eoln(infile); + + if ((interleaved && j != basesnew) || + (!interleaved && j != chars)) { + printf("\nERROR: sequences out of alignment at position %ld", j+1); + printf(" of species %ld\n\n", i); + exxit(-1); + } + i++; + } + + if (interleaved) { + basesread = basesnew; + allread = (basesread == chars); + } else + allread = (i > spp); + } + if (!printdata) + return; + for (i = 1; i <= ((chars - 1) / 60 + 1); i++) { /* echo the data in blocks of 60 sites: name, then the sites with a space after every 10th; with dotdiff a site equal to the first species is shown as a period */ + for (j = 1; j <= spp; j++) { + for (k = 0; k < nmlngth; k++) + putc(nayme[j - 1][k], outfile); + fprintf(outfile, " "); + l = i * 60; + if (l > chars) + l = chars; + for (k = (i - 1) * 60 + 1; k <= l; k++) { + if (dotdiff && (j > 1 && y[j - 1][k - 1] == y[0][k - 1])) + charstate = '.'; + else + charstate = y[j - 1][k - 1]; + putc(charstate, outfile); + if (k % 10 == 0 && k % 60 != 0) + putc(' ', outfile); + } + putc('\n', outfile); + } + putc('\n', outfile); + } + putc('\n', outfile); +} /* inputdata */ + + +void alloctree(pointarray *treenode, long nonodes, boolean usertree) +{ + /* allocate treenode dynamically: *treenode receives an array of nonodes node pointers; the first spp entries are single tip nodes, and unless usertree is set each remaining entry is a ring of three interior nodes sharing the same index */ + /* used in dnapars, dnacomp, dnapenny & dnamove */ + long i, j; + node *p, *q; + + *treenode = (pointarray)Malloc(nonodes*sizeof(node *)); + for (i = 0; i < spp; i++) { + (*treenode)[i] = (node *)Malloc(sizeof(node)); + (*treenode)[i]->tip = true; + (*treenode)[i]->index = i+1; + (*treenode)[i]->iter = true; + (*treenode)[i]->branchnum = 0; + (*treenode)[i]->initialized = true; + } + if (!usertree) + for (i = spp; i < nonodes; i++) { + q = NULL; + for (j = 1; j <= 3; j++) { /* each new node points at the previously allocated one */ + p = (node *)Malloc(sizeof(node)); + p->tip = false; + p->index = i+1; + p->iter = true; + p->branchnum = 0; + p->initialized = false; + p->next = q; + q = 
p; + } + p->next->next->next = p; /* close the ring: the first-allocated node, whose next is still NULL, is linked back to the last one */ + (*treenode)[i] = p; + } +} /* alloctree */ + + +void allocx(long nonodes, long rcategs, pointarray treenode, boolean usertree) +{ + /* allocate x dynamically: for every tip, and unless usertree is set for each of the three members of every interior ring, allocate the underflows array (endsite doubles) and the x array of endsite entries, each holding rcategs sitelike elements. Nothing is initialised here. */ + /* used in dnaml & dnamlk */ + long i, j, k; + node *p; + + for (i = 0; i < spp; i++){ + treenode[i]->x = (phenotype)Malloc(endsite*sizeof(ratelike)); + treenode[i]->underflows = Malloc(endsite * sizeof (double)); + for (j = 0; j < endsite; j++) + treenode[i]->x[j] = (ratelike)Malloc(rcategs*sizeof(sitelike)); + } + if (!usertree) { + for (i = spp; i < nonodes; i++) { + p = treenode[i]; + for (j = 1; j <= 3; j++) { /* walk the three ring members through next */ + p->underflows = Malloc (endsite * sizeof (double)); + p->x = (phenotype)Malloc(endsite*sizeof(ratelike)); + for (k = 0; k < endsite; k++) + p->x[k] = (ratelike)Malloc(rcategs*sizeof(sitelike)); + p = p->next; + } + } + } +} /* allocx */ + + +void prot_allocx(long nonodes, long rcategs, pointarray treenode, + boolean usertree) +{ + /* allocate protx dynamically: same layout as allocx but for the protx member, with endsite entries of rcategs psitelike elements plus an underflows array of endsite doubles per node */ + /* used in proml */ + long i, j, k; + node *p; + + for (i = 0; i < spp; i++){ + treenode[i]->protx = (pphenotype)Malloc(endsite*sizeof(pratelike)); + treenode[i]->underflows = Malloc(endsite*sizeof(double)); + for (j = 0; j < endsite; j++) + treenode[i]->protx[j] = (pratelike)Malloc(rcategs*sizeof(psitelike)); + } + if (!usertree) { + for (i = spp; i < nonodes; i++) { + p = treenode[i]; + for (j = 1; j <= 3; j++) { + p->protx = (pphenotype)Malloc(endsite*sizeof(pratelike)); + p->underflows = Malloc(endsite*sizeof(double)); + for (k = 0; k < endsite; k++) + p->protx[k] = (pratelike)Malloc(rcategs*sizeof(psitelike)); + p = p->next; + } + } + } +} /* prot_allocx */ + + +void allocx2(long nonodes, long endsite, long sitelength, pointarray treenode, + boolean usertree) +{ + /* allocate x2 dynamically: every node gets endsite entries of sitelength + 1 doubles each; note that endsite here is the parameter, which hides the global of the same name */ + /* used in restml */ + long i, j, k, l; + node *p; + + for (i = 0; i < spp; i++) { + treenode[i]->x2 = (phenotype2)Malloc(endsite*sizeof(sitelike2)); + for ( j = 0 ; j < endsite ; j++ ) + 
treenode[i]->x2[j] = Malloc((sitelength + 1) * sizeof(double)); + } + if (!usertree) { /* interior rings: same allocation for each of the three ring members, and the first sitelength entries of every site are preset to 1.0 (the last entry and the tip arrays allocated above are left unset here) */ + for (i = spp; i < nonodes; i++) { + p = treenode[i]; + for (j = 1; j <= 3; j++) { + p->x2 = (phenotype2)Malloc(endsite*sizeof(sitelike2)); + for (k = 0; k < endsite; k++) { + p->x2[k] = Malloc((sitelength + 1) * sizeof(double)); + for (l = 0; l < sitelength; l++) + p->x2[k][l] = 1.0; + } + p = p->next; + } + } + } +} /* allocx2 */ + + +void setuptree(pointarray treenode, long nonodes, boolean usertree) +{ + /* initialize treenodes: resets back, tip, index, numdesc, iter, initialized and tyme on the node stored in each treenode slot (tips always, interior slots only when usertree is false), then does the same for the other members of each interior ring, which get initialized = false */ + long i; + node *p; + + for (i = 1; i <= nonodes; i++) { + if (i <= spp || !usertree) { + treenode[i-1]->back = NULL; + treenode[i-1]->tip = (i <= spp); + treenode[i-1]->index = i; + treenode[i-1]->numdesc = 0; + treenode[i-1]->iter = true; + treenode[i-1]->initialized = true; + treenode[i-1]->tyme = 0.0; + } + } + if (!usertree) { + for (i = spp + 1; i <= nonodes; i++) { + p = treenode[i-1]->next; + while (p != treenode[i-1]) { /* visit the ring members other than the one held in treenode */ + p->back = NULL; + p->tip = false; + p->index = i; + p->numdesc = 0; + p->iter = true; + p->initialized = false; + p->tyme = 0.0; + p = p->next; + } + } + } +} /* setuptree */ + + +void setuptree2(tree a) +{ + /* initialize a tree */ + /* used in dnaml, dnamlk, & restml */ + /* NOTE(review): a is received by value and accessed with the dot operator, so the three assignments below change only the local copy and are not visible to the caller -- confirm whether a pointer parameter was intended */ + + a.likelihood = -999999.0; + a.start = a.nodep[0]->back; + a.root = NULL; +} /* setuptree2 */ + + +void alloctip(node *p, long *zeros) +{ /* allocate a tip node: gives p four arrays of endsite longs (numsteps, oldnumsteps, base, oldbase) and fills each by copying endsite longs from zeros */ + /* used by dnacomp, dnapars, & dnapenny */ + + p->numsteps = (steptr)Malloc(endsite*sizeof(long)); + p->oldnumsteps = (steptr)Malloc(endsite*sizeof(long)); + p->base = (baseptr)Malloc(endsite*sizeof(long)); + p->oldbase = (baseptr)Malloc(endsite*sizeof(long)); + memcpy(p->base, zeros, endsite*sizeof(long)); + memcpy(p->numsteps, zeros, endsite*sizeof(long)); + memcpy(p->oldbase, zeros, endsite*sizeof(long)); + memcpy(p->oldnumsteps, zeros, endsite*sizeof(long)); +} /* alloctip */ + + +void freetrans(transptr *trans, long nonodes,long sitelength) +{ /* free the three-level array *trans: sitelength + 1 inner arrays for each of nonodes entries, then each entry, then the top-level array */ + long i ,j; + for ( i = 0 ; i < 
nonodes ; i++ ) { + for ( j = 0 ; j < sitelength + 1; j++) { + free ((*trans)[i][j]); + } + free ((*trans)[i]); + } + free(*trans); +} + + +void getbasefreqs(double freqa, double freqc, double freqg, double freqt, + double *freqr, double *freqy, double *freqar, double *freqcy, + double *freqgr, double *freqty, double *ttratio, double *xi, + double *xv, double *fracchange, boolean freqsfrom, + boolean printdata) +{ + /* used by dnadist, dnaml, & dnamlk: from the four base frequencies computes the purine and pyrimidine totals (freqr, freqy), the within-class proportions (freqar, freqgr, freqcy, freqty), the parameters xi and xv derived from *ttratio, and *fracchange. When printdata is set the frequencies are first written to outfile. If xi comes out negative a warning is printed, xi and xv are reset to 0 and 1, and *ttratio is overwritten. */ + double aa, bb; + + if (printdata) { + putc('\n', outfile); + if (freqsfrom) + fprintf(outfile, "Empirical "); + fprintf(outfile, "Base Frequencies:\n\n"); + fprintf(outfile, " A %10.5f\n", freqa); + fprintf(outfile, " C %10.5f\n", freqc); + fprintf(outfile, " G %10.5f\n", freqg); + fprintf(outfile, " T(U) %10.5f\n", freqt); + } + *freqr = freqa + freqg; + *freqy = freqc + freqt; + *freqar = freqa / *freqr; + *freqcy = freqc / *freqy; + *freqgr = freqg / *freqr; + *freqty = freqt / *freqy; + aa = *ttratio * (*freqr) * (*freqy) - freqa * freqg - freqc * freqt; + bb = freqa * (*freqgr) + freqc * (*freqty); + *xi = aa / (aa + bb); + *xv = 1.0 - *xi; + if (*xi < 0.0) { + printf("\n WARNING: This transition/transversion ratio\n"); + printf(" is impossible with these base frequencies!\n"); + *xi = 0.0; + *xv = 1.0; + (*ttratio) = (freqa*freqg+freqc*freqt)/((*freqr)*(*freqy)); + printf(" Transition/transversion parameter reset\n"); + printf(" so transition/transversion ratio is %10.6f\n\n", (*ttratio)); + } + /* NOTE(review): the clamps below only affect the local copies used for fracchange; the divisions by *freqr and *freqy above have already happened, so a zero purine or pyrimidine total is not guarded against here */ + if (freqa <= 0.0) + freqa = 0.000001; + if (freqc <= 0.0) + freqc = 0.000001; + if (freqg <= 0.0) + freqg = 0.000001; + if (freqt <= 0.0) + freqt = 0.000001; + *fracchange = (*xi) * (2 * freqa * (*freqgr) + 2 * freqc * (*freqty)) + + (*xv) * (1.0 - freqa * freqa - freqc * freqc - freqg * freqg + - freqt * freqt); +} /* getbasefreqs */ + + +void empiricalfreqs(double *freqa, double *freqc, double *freqg, + double *freqt, steptr weight, pointarray treenode) +{ + /* Get empirical base frequencies from the data */ + /* 
used in dnaml & dnamlk */ + long i, j, k; + double sum, suma, sumc, sumg, sumt, w; + + *freqa = 0.25; + *freqc = 0.25; + *freqg = 0.25; + *freqt = 0.25; + for (k = 1; k <= 8; k++) { /* eight fixed re-estimation passes starting from equal frequencies; each pass weights the category-0 tip values x[j][0][*] by the current frequencies, normalises per site, and sums with the site weights */ + suma = 0.0; + sumc = 0.0; + sumg = 0.0; + sumt = 0.0; + for (i = 0; i < spp; i++) { + for (j = 0; j < endsite; j++) { + w = weight[j]; + sum = (*freqa) * treenode[i]->x[j][0][0]; + sum += (*freqc) * treenode[i]->x[j][0][(long)C - (long)A]; + sum += (*freqg) * treenode[i]->x[j][0][(long)G - (long)A]; + sum += (*freqt) * treenode[i]->x[j][0][(long)T - (long)A]; + suma += w * (*freqa) * treenode[i]->x[j][0][0] / sum; + sumc += w * (*freqc) * treenode[i]->x[j][0][(long)C - (long)A] / sum; + sumg += w * (*freqg) * treenode[i]->x[j][0][(long)G - (long)A] / sum; + sumt += w * (*freqt) * treenode[i]->x[j][0][(long)T - (long)A] / sum; + } + } + sum = suma + sumc + sumg + sumt; + *freqa = suma / sum; + *freqc = sumc / sum; + *freqg = sumg / sum; + *freqt = sumt / sum; + } + /* keep every frequency strictly positive */ + if (*freqa <= 0.0) + *freqa = 0.000001; + if (*freqc <= 0.0) + *freqc = 0.000001; + if (*freqg <= 0.0) + *freqg = 0.000001; + if (*freqt <= 0.0) + *freqt = 0.000001; +} /* empiricalfreqs */ + + +void sitesort(long chars, steptr weight) +{ + /* Shell sort keeping sites, weights in same order: reorders the global alias array (1-based site numbers) and the weight array passed in, together, so that site columns of the global y are in ascending order when compared species by species */ + /* used in dnainvar, dnapars, dnacomp & dnapenny */ + long gap, i, j, jj, jg, k, itemp; + boolean flip, tied; + + gap = chars / 2; + while (gap > 0) { + for (i = gap + 1; i <= chars; i++) { + j = i - gap; + flip = true; + while (j > 0 && flip) { + jj = alias[j - 1]; + jg = alias[j + gap - 1]; + tied = true; + k = 1; + while (k <= spp && tied) { /* the first species at which the two columns differ decides flip; columns equal in all species leave flip false */ + flip = (y[k - 1][jj - 1] > y[k - 1][jg - 1]); + tied = (tied && y[k - 1][jj - 1] == y[k - 1][jg - 1]); + k++; + } + if (!flip) + break; + itemp = alias[j - 1]; + alias[j - 1] = alias[j + gap - 1]; + alias[j + gap - 1] = itemp; + itemp = weight[j - 1]; + weight[j - 1] = weight[j + gap - 1]; + weight[j + gap - 1] = itemp; + j -= gap; + } + } + gap /= 2; + } +} /* sitesort */ + + +void 
sitecombine(long chars) +{ + /* combine sites that have identical patterns: walking the sites in alias order, each run of consecutive sites whose columns of y agree in all spp species is collapsed onto its first member, which receives the summed weight; the others get weight 0 and their ally entry set to the representative site. Uses the global weight array. */ + /* used in dnapars, dnapenny, & dnacomp */ + long i, j, k; + boolean tied; + + i = 1; + while (i < chars) { + j = i + 1; + tied = true; + while (j <= chars && tied) { + k = 1; + while (k <= spp && tied) { + tied = (tied && + y[k - 1][alias[i - 1] - 1] == y[k - 1][alias[j - 1] - 1]); + k++; + } + if (tied) { + weight[i - 1] += weight[j - 1]; + weight[j - 1] = 0; + ally[alias[j - 1] - 1] = alias[i - 1]; + } + j++; + } + i = j - 1; /* j was advanced once more after the comparison that ended the run, so j - 1 is where the next run starts */ + } +} /* sitecombine */ + + +void sitescrunch(long chars) +{ + /* move so one representative of each pattern of sites comes first: position i is examined in turn, and when the site there is not its own ally it is swapped (alias and weight together) with the next later position j whose site is its own ally; the scan stops when no such j remains or i reaches chars */ + /* used in dnapars & dnacomp */ + long i, j, itemp; + boolean done, found; + + done = false; + i = 1; + j = 2; + while (!done) { + if (ally[alias[i - 1] - 1] != alias[i - 1]) { + if (j <= i) + j = i + 1; + if (j <= chars) { + do { + found = (ally[alias[j - 1] - 1] == alias[j - 1]); + j++; + } while (!(found || j > chars)); + if (found) { + j--; /* step back onto the position that was found */ + itemp = alias[i - 1]; + alias[i - 1] = alias[j - 1]; + alias[j - 1] = itemp; + itemp = weight[i - 1]; + weight[i - 1] = weight[j - 1]; + weight[j - 1] = itemp; + } else + done = true; + } else + done = true; + } + i++; + done = (done || i >= chars); + } +} /* sitescrunch */ + + +void sitesort2(long sites, steptr aliasweight) +{ + /* Shell sort keeping sites, weights in same order: like sitesort, but the ordering first separates sites by whether their entry in the global weight array is zero, then by category, and only then by the site columns of y; alias and aliasweight are permuted together */ + /* used in dnaml & dnamlk */ + long gap, i, j, jj, jg, k, itemp; + boolean flip, tied, samewt; + + gap = sites / 2; + while (gap > 0) { + for (i = gap + 1; i <= sites; i++) { + j = i - gap; + flip = true; + while (j > 0 && flip) { + jj = alias[j - 1]; + jg = alias[j + gap - 1]; + samewt = ((weight[jj - 1] != 0) && (weight[jg - 1] != 0)) + || ((weight[jj - 1] == 0) && (weight[jg - 1] == 0)); + tied = samewt && (category[jj - 1] == category[jg - 1]); + flip = ((!samewt) && (weight[jj - 1] == 0)) + || (samewt && (category[jj - 1] > category[jg - 1])); + k = 1; + while (k <= spp && tied) { + flip = (y[k - 
1][jj - 1] > y[k - 1][jg - 1]); + tied = (tied && y[k - 1][jj - 1] == y[k - 1][jg - 1]); + k++; + } + if (!flip) + break; + itemp = alias[j - 1]; + alias[j - 1] = alias[j + gap - 1]; + alias[j + gap - 1] = itemp; + itemp = aliasweight[j - 1]; + aliasweight[j - 1] = aliasweight[j + gap - 1]; + aliasweight[j + gap - 1] = itemp; + j -= gap; + } + } + gap /= 2; + } +} /* sitesort2 */ + + +void sitecombine2(long sites, steptr aliasweight) +{ + /* combine sites that have identical patterns: consecutive sites (in alias order) are merged when their aliasweight entries are both zero or both nonzero, their categories match, and their columns of y agree in all spp species; the first site of a run receives the summed aliasweight, the others get 0 and their ally entry set to the representative */ + /* used in dnaml & dnamlk */ + long i, j, k; + boolean tied, samewt; + + i = 1; + while (i < sites) { + j = i + 1; + tied = true; + while (j <= sites && tied) { + samewt = ((aliasweight[i - 1] != 0) && (aliasweight[j - 1] != 0)) + || ((aliasweight[i - 1] == 0) && (aliasweight[j - 1] == 0)); + tied = samewt + && (category[alias[i - 1] - 1] == category[alias[j - 1] - 1]); + k = 1; + while (k <= spp && tied) { + tied = (tied && + y[k - 1][alias[i - 1] - 1] == y[k - 1][alias[j - 1] - 1]); + k++; + } + if (!tied) + break; + aliasweight[i - 1] += aliasweight[j - 1]; + aliasweight[j - 1] = 0; + ally[alias[j - 1] - 1] = alias[i - 1]; + j++; + } + i = j; /* j is the first site that did not match (or sites + 1), so the next run starts there */ + } +} /* sitecombine2 */ + + +void sitescrunch2(long sites, long i, long j, steptr aliasweight) +{ + /* move so positively weighted sites come first: starting from the 1-based positions i and j supplied by the caller, every position i whose aliasweight is not positive is swapped (alias and aliasweight together) with the next later position j that has a positive aliasweight; stops when none remains or i reaches sites */ + /* used by dnainvar, dnaml, dnamlk, & restml */ + long itemp; + boolean done, found; + + done = false; + while (!done) { + if (aliasweight[i - 1] > 0) + i++; + else { + if (j <= i) + j = i + 1; + if (j <= sites) { + do { + found = (aliasweight[j - 1] > 0); + j++; + } while (!(found || j > sites)); + if (found) { + j--; /* step back onto the position that was found */ + itemp = alias[i - 1]; + alias[i - 1] = alias[j - 1]; + alias[j - 1] = itemp; + itemp = aliasweight[i - 1]; + aliasweight[i - 1] = aliasweight[j - 1]; + aliasweight[j - 1] = itemp; + } else + done = true; + } else + done = true; + } + done = (done || i >= sites); + } +} /* sitescrunch2 */ + + +void makevalues(pointarray treenode, long *zeros, boolean usertree) +{ + /* set 
up fractional likelihoods at tips */ + /* used by dnacomp, dnapars, & dnapenny */ + /* resets the tree with setuptree, allocates the per-node arrays (alloctip for tips, allocnontip for every ring member unless usertree is set), then encodes each tip character as a bit set over A, C, G, T and O in base[j], with numsteps[j] set to 0 */ + long i, j; + char ns = 0; + node *p; + + setuptree(treenode, nonodes, usertree); + for (i = 0; i < spp; i++) + alloctip(treenode[i], zeros); + if (!usertree) { + for (i = spp; i < nonodes; i++) { + p = treenode[i]; + do { + allocnontip(p, zeros, endsite); + p = p->next; + } while (p != treenode[i]); + } + } + for (j = 0; j < endsite; j++) { + for (i = 0; i < spp; i++) { + switch (y[i][alias[j] - 1]) { /* NOTE(review): there is no default case, so a character not listed below leaves ns at whatever the previous iteration stored */ + + case 'A': + ns = 1 << A; + break; + + case 'C': + ns = 1 << C; + break; + + case 'G': + ns = 1 << G; + break; + + case 'U': /* U is treated the same as T */ + ns = 1 << T; + break; + + case 'T': + ns = 1 << T; + break; + + case 'M': + ns = (1 << A) | (1 << C); + break; + + case 'R': + ns = (1 << A) | (1 << G); + break; + + case 'W': + ns = (1 << A) | (1 << T); + break; + + case 'S': + ns = (1 << C) | (1 << G); + break; + + case 'Y': + ns = (1 << C) | (1 << T); + break; + + case 'K': + ns = (1 << G) | (1 << T); + break; + + case 'B': + ns = (1 << C) | (1 << G) | (1 << T); + break; + + case 'D': + ns = (1 << A) | (1 << G) | (1 << T); + break; + + case 'H': + ns = (1 << A) | (1 << C) | (1 << T); + break; + + case 'V': + ns = (1 << A) | (1 << C) | (1 << G); + break; + + case 'N': + ns = (1 << A) | (1 << C) | (1 << G) | (1 << T); + break; + + case 'X': + ns = (1 << A) | (1 << C) | (1 << G) | (1 << T); + break; + + case '?': /* the only code that also includes the O state */ + ns = (1 << A) | (1 << C) | (1 << G) | (1 << T) | (1 << O); + break; + + case 'O': + ns = 1 << O; + break; + + case '-': /* a gap is stored as the O state */ + ns = 1 << O; + break; + } + treenode[i]->base[j] = ns; + treenode[i]->numsteps[j] = 0; + } + } +} /* makevalues */ + + +void makevalues2(long categs, pointarray treenode, long endsite, + long spp, sequence y, steptr alias) +{ + /* set up fractional likelihoods at tips: for every site k (looked up through alias), species i and rate category l, the four entries of x[k][l] are first cleared to 0.0 and then set to 1.0 for each base compatible with the character in y. The parameters endsite, spp, y and alias hide the globals of the same names. */ + /* used by dnaml & dnamlk */ + long i, j, k, l; + bases b; + + for (k = 0; k < endsite; k++) { + j = alias[k]; + for (i = 0; i < spp; i++) { + for (l = 0; l < categs; l++) { + for (b = A; (long)b <= (long)T; 
b = (bases)((long)b + 1)) + treenode[i]->x[k][l][(long)b - (long)A] = 0.0; + switch (y[i][j - 1]) { /* index 0 is A; the other bases are addressed relative to A */ + + case 'A': + treenode[i]->x[k][l][0] = 1.0; + break; + + case 'C': + treenode[i]->x[k][l][(long)C - (long)A] = 1.0; + break; + + case 'G': + treenode[i]->x[k][l][(long)G - (long)A] = 1.0; + break; + + case 'T': + treenode[i]->x[k][l][(long)T - (long)A] = 1.0; + break; + + case 'U': /* same as T */ + treenode[i]->x[k][l][(long)T - (long)A] = 1.0; + break; + + case 'M': /* A or C */ + treenode[i]->x[k][l][0] = 1.0; + treenode[i]->x[k][l][(long)C - (long)A] = 1.0; + break; + + case 'R': /* A or G */ + treenode[i]->x[k][l][0] = 1.0; + treenode[i]->x[k][l][(long)G - (long)A] = 1.0; + break; + + case 'W': /* A or T */ + treenode[i]->x[k][l][0] = 1.0; + treenode[i]->x[k][l][(long)T - (long)A] = 1.0; + break; + + case 'S': /* C or G */ + treenode[i]->x[k][l][(long)C - (long)A] = 1.0; + treenode[i]->x[k][l][(long)G - (long)A] = 1.0; + break; + + case 'Y': /* C or T */ + treenode[i]->x[k][l][(long)C - (long)A] = 1.0; + treenode[i]->x[k][l][(long)T - (long)A] = 1.0; + break; + + case 'K': /* G or T */ + treenode[i]->x[k][l][(long)G - (long)A] = 1.0; + treenode[i]->x[k][l][(long)T - (long)A] = 1.0; + break; + + case 'B': /* C, G or T */ + treenode[i]->x[k][l][(long)C - (long)A] = 1.0; + treenode[i]->x[k][l][(long)G - (long)A] = 1.0; + treenode[i]->x[k][l][(long)T - (long)A] = 1.0; + break; + + case 'D': /* A, G or T */ + treenode[i]->x[k][l][0] = 1.0; + treenode[i]->x[k][l][(long)G - (long)A] = 1.0; + treenode[i]->x[k][l][(long)T - (long)A] = 1.0; + break; + + case 'H': /* A, C or T */ + treenode[i]->x[k][l][0] = 1.0; + treenode[i]->x[k][l][(long)C - (long)A] = 1.0; + treenode[i]->x[k][l][(long)T - (long)A] = 1.0; + break; + + case 'V': /* A, C or G */ + treenode[i]->x[k][l][0] = 1.0; + treenode[i]->x[k][l][(long)C - (long)A] = 1.0; + treenode[i]->x[k][l][(long)G - (long)A] = 1.0; + break; + + case 'N': /* any base */ + for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) + treenode[i]->x[k][l][(long)b - (long)A] = 1.0; + break; + + case 'X': /* any base */ + for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) + treenode[i]->x[k][l][(long)b - 
(long)A] = 1.0; + break; + + case '?': /* unknown: all four bases allowed */ + for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) + treenode[i]->x[k][l][(long)b - (long)A] = 1.0; + break; + + case 'O': /* handled exactly like an unknown base here */ + for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) + treenode[i]->x[k][l][(long)b - (long)A] = 1.0; + break; + + case '-': /* a gap is also handled like an unknown base */ + for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) + treenode[i]->x[k][l][(long)b - (long)A] = 1.0; + break; + } + } + } + } +} /* makevalues2 */ + + +void fillin(node *p, node *left, node *rt) +{ + /* sets up for each node in the tree the base sequence at that point and counts the changes. With only one child present, p copies that child. With both, each site gets the intersection of the two base sets, or their union plus weight[i] extra steps when the intersection is empty; when the global transvp is set the extra steps are skipped if the union is exactly the purine set or the pyrimidine set. Finally numnuc is rebuilt to count, per site and state, how many of the children contain that state. */ + long i, j, k, n, purset, pyrset; + node *q; + + purset = (1 << (long)A) + (1 << (long)G); + pyrset = (1 << (long)C) + (1 << (long)T); + if (!left) { + memcpy(p->base, rt->base, endsite*sizeof(long)); + memcpy(p->numsteps, rt->numsteps, endsite*sizeof(long)); + q = rt; + } else if (!rt) { + memcpy(p->base, left->base, endsite*sizeof(long)); + memcpy(p->numsteps, left->numsteps, endsite*sizeof(long)); + q = left; + } else { + for (i = 0; i < endsite; i++) { + p->base[i] = left->base[i] & rt->base[i]; + p->numsteps[i] = left->numsteps[i] + rt->numsteps[i]; + if (p->base[i] == 0) { + p->base[i] = left->base[i] | rt->base[i]; + if (transvp) { + if (!((p->base[i] == purset) || (p->base[i] == pyrset))) + p->numsteps[i] += weight[i]; + } + else p->numsteps[i] += weight[i]; + } + } + q = rt; + } + if (left && rt) n = 2; + else n = 1; + for (i = 0; i < endsite; i++) + for (j = (long)A; j <= (long)O; j++) + p->numnuc[i][j] = 0; + for (k = 1; k <= n; k++) { /* first pass counts q as chosen above; the second pass, taken only with two children, counts left */ + if (k == 2) q = left; + for (i = 0; i < endsite; i++) { + for (j = (long)A; j <= (long)O; j++) { + if (q->base[i] & (1 << j)) + p->numnuc[i][j]++; + } + } + } +} /* fillin */ + + +long getlargest(long *numnuc) +{ + /* find the largest in array numnuc: scans indices A through O and returns the maximum, never less than 0 because the running maximum starts at 0 */ + long i, largest; + + largest = 0; + for (i = (long)A; i <= (long)O; i++) + if (numnuc[i] > largest) + largest = numnuc[i]; + return largest; +} /* getlargest */ + + +void 
multifillin(node *p, node *q, long dnumdesc) +{ + /* sets up for each node in the tree the base sequence at that point and counts the changes according to the changes in the base of q. The current base and numsteps of p are first saved into oldbase and oldnumsteps. dnumdesc selects how the steps of q enter the total: -1 subtracts the old steps of q, 0 applies the difference between new and old steps of q, any other value adds the current steps of q. */ + long i, j, b, largest, descsteps, purset, pyrset; + + memcpy(p->oldbase, p->base, endsite*sizeof(long)); + memcpy(p->oldnumsteps, p->numsteps, endsite*sizeof(long)); + purset = (1 << (long)A) + (1 << (long)G); + pyrset = (1 << (long)C) + (1 << (long)T); + for (i = 0; i < endsite; i++) { + descsteps = 0; + for (j = (long)A; j <= (long)O; j++) { /* descsteps is taken from the first state present in p->base[i] that yields a nonzero value */ + b = 1 << j; + if ((descsteps == 0) && (p->base[i] & b)) + descsteps = p->numsteps[i] + - (p->numdesc - dnumdesc - p->numnuc[i][j]) * weight[i]; + } + if (dnumdesc == -1) + descsteps -= q->oldnumsteps[i]; + else if (dnumdesc == 0) + descsteps += (q->numsteps[i] - q->oldnumsteps[i]); + else + descsteps += q->numsteps[i]; + if (q->oldbase[i] != q->base[i]) { /* the state set of q changed at this site: adjust the per-state counts of p by what q lost and gained */ + for (j = (long)A; j <= (long)O; j++) { + b = 1 << j; + if (transvp) { /* under transvp a purine bit stands for the whole purine set and a pyrimidine bit for the whole pyrimidine set */ + if (b & purset) b = purset; + if (b & pyrset) b = pyrset; + } + if ((q->oldbase[i] & b) && !(q->base[i] & b)) + p->numnuc[i][j]--; + else if (!(q->oldbase[i] & b) && (q->base[i] & b)) + p->numnuc[i][j]++; + } + } + largest = getlargest(p->numnuc[i]); + if (q->oldbase[i] != q->base[i]) { /* rebuild the base set of p from the states that reach the largest count */ + p->base[i] = 0; + for (j = (long)A; j <= (long)O; j++) { + if (p->numnuc[i][j] == largest) + p->base[i] |= (1 << j); + } + } + p->numsteps[i] = (p->numdesc - largest) * weight[i] + descsteps; + } +} /* multifillin */ + + +void sumnsteps(node *p, node *left, node *rt, long a, long b) +{ + /* sets up for each node in the tree the base sequence + at that point and counts the changes. 
*/ + /* with one child missing, numsteps and base are copied whole (all endsite entries) from the other child; with both present only sites a up to but not including b are combined, using the same intersection, union and transvp rule as fillin */ + long i; + long ns, rs, ls, purset, pyrset; + + if (!left) { + memcpy(p->numsteps, rt->numsteps, endsite*sizeof(long)); + memcpy(p->base, rt->base, endsite*sizeof(long)); + } else if (!rt) { + memcpy(p->numsteps, left->numsteps, endsite*sizeof(long)); + memcpy(p->base, left->base, endsite*sizeof(long)); + } else { + purset = (1 << (long)A) + (1 << (long)G); + pyrset = (1 << (long)C) + (1 << (long)T); + for (i = a; i < b; i++) { + ls = left->base[i]; + rs = rt->base[i]; + ns = ls & rs; + p->numsteps[i] = left->numsteps[i] + rt->numsteps[i]; + if (ns == 0) { + ns = ls | rs; + if (transvp) { + if (!((ns == purset) || (ns == pyrset))) + p->numsteps[i] += weight[i]; + } + else p->numsteps[i] += weight[i]; + } + p->base[i] = ns; + } + } +} /* sumnsteps */ + + +void sumnsteps2(node *p,node *left,node *rt,long a,long b,long *threshwt) +{ + /* counts the changes at each node: computes numsteps for sites a up to but not including b as sumnsteps does, but leaves p->base untouched, then adds to p->sumsteps the per-site steps capped at threshwt[i]. p->sumsteps is reset to zero only when a is 0, so a caller covering the sites in several ranges accumulates one total. */ + long i, steps; + long ns, rs, ls, purset, pyrset; + long term; + + if (a == 0) p->sumsteps = 0.0; + if (!left) + memcpy(p->numsteps, rt->numsteps, endsite*sizeof(long)); + else if (!rt) + memcpy(p->numsteps, left->numsteps, endsite*sizeof(long)); + else { + purset = (1 << (long)A) + (1 << (long)G); + pyrset = (1 << (long)C) + (1 << (long)T); + for (i = a; i < b; i++) { + ls = left->base[i]; + rs = rt->base[i]; + ns = ls & rs; + p->numsteps[i] = left->numsteps[i] + rt->numsteps[i]; + if (ns == 0) { + ns = ls | rs; + if (transvp) { + if (!((ns == purset) || (ns == pyrset))) + p->numsteps[i] += weight[i]; + } + else p->numsteps[i] += weight[i]; + } + } + } + for (i = a; i < b; i++) { /* each site contributes min(steps, threshwt[i]) */ + steps = p->numsteps[i]; + if ((long)steps <= threshwt[i]) + term = steps; + else + term = threshwt[i]; + p->sumsteps += (double)term; + } +} /* sumnsteps2 */ + + +void multisumnsteps(node *p, node *q, long a, long b, long *threshwt) +{ + /* computes the number of steps between p and q */ + long i, j, steps, largest, descsteps, purset, pyrset, b1; + long term; + + if (a == 0) p->sumsteps = 0.0; + purset = (1 << 
(long)A) + (1 << (long)G); + pyrset = (1 << (long)C) + (1 << (long)T); + for (i = a; i < b; i++) { + descsteps = 0; + for (j = (long)A; j <= (long)O; j++) { + if ((descsteps == 0) && (p->base[i] & (1 << j))) + descsteps = p->numsteps[i] - + (p->numdesc - 1 - p->numnuc[i][j]) * weight[i]; + } + descsteps += q->numsteps[i]; + largest = 0; + for (j = (long)A; j <= (long)O; j++) { + b1 = (1 << j); + if (transvp) { + if (b1 & purset) b1 = purset; + if (b1 & pyrset) b1 = pyrset; + } + if (q->base[i] & b1) + p->numnuc[i][j]++; + if (p->numnuc[i][j] > largest) + largest = p->numnuc[i][j]; + } + steps = (p->numdesc - largest) * weight[i] + descsteps; + if ((long)steps <= threshwt[i]) + term = steps; + else + term = threshwt[i]; + p->sumsteps += (double)term; + } +} /* multisumnsteps */ + + +void multisumnsteps2(node *p) +{ + /* counts the changes at each multi-way node. Sums up + steps of all descendants */ + long i, j, largest, purset, pyrset, b1; + node *q; + baseptr b; + + purset = (1 << (long)A) + (1 << (long)G); + pyrset = (1 << (long)C) + (1 << (long)T); + for (i = 0; i < endsite; i++) { + p->numsteps[i] = 0; + q = p->next; + while (q != p) { + if (q->back) { + p->numsteps[i] += q->back->numsteps[i]; + b = q->back->base; + for (j = (long)A; j <= (long)O; j++) { + b1 = (1 << j); + if (transvp) { + if (b1 & purset) b1 = purset; + if (b1 & pyrset) b1 = pyrset; + } + if (b[i] & b1) p->numnuc[i][j]++; + } + } + q = q->next; + } + largest = getlargest(p->numnuc[i]); + p->base[i] = 0; + for (j = (long)A; j <= (long)O; j++) { + if (p->numnuc[i][j] == largest) + p->base[i] |= (1 << j); + } + p->numsteps[i] += ((p->numdesc - largest) * weight[i]); + } +} /* multisumnsteps2 */ + +boolean alltips(node *forknode, node *p) +{ + /* returns true if all descendants of forknode except p are tips; + false otherwise. 
*/ + node *q, *r; + boolean tips; + + tips = true; + r = forknode; + q = forknode->next; + do { + if (q->back && q->back != p && !q->back->tip) + tips = false; + q = q->next; + } while (tips && q != r); + return tips; +} /* alltips */ + + +void gdispose(node *p, node **grbg, pointarray treenode) +{ + /* go through tree throwing away nodes */ + node *q, *r; + + p->back = NULL; + if (p->tip) + return; + treenode[p->index - 1] = NULL; + q = p->next; + while (q != p) { + gdispose(q->back, grbg, treenode); + q->back = NULL; + r = q; + q = q->next; + chucktreenode(grbg, r); + } + chucktreenode(grbg, q); +} /* gdispose */ + + +void preorder(node *p, node *r, node *root, node *removing, node *adding, + node *changing, long dnumdesc) +{ + /* recompute number of steps in preorder taking both ancestoral and + descendent steps into account. removing points to a node being + removed, if any */ + node *q, *p1, *p2; + + if (p && !p->tip && p != adding) { + q = p; + do { + if (p->back != r) { + if (p->numdesc > 2) { + if (changing) + multifillin (p, r, dnumdesc); + else + multifillin (p, r, 0); + } else { + p1 = p->next; + if (!removing) + while (!p1->back) + p1 = p1->next; + else + while (!p1->back || p1->back == removing) + p1 = p1->next; + p2 = p1->next; + if (!removing) + while (!p2->back) + p2 = p2->next; + else + while (!p2->back || p2->back == removing) + p2 = p2->next; + p1 = p1->back; + p2 = p2->back; + if (p->back == p1) p1 = NULL; + else if (p->back == p2) p2 = NULL; + memcpy(p->oldbase, p->base, endsite*sizeof(long)); + memcpy(p->oldnumsteps, p->numsteps, endsite*sizeof(long)); + fillin(p, p1, p2); + } + } + p = p->next; + } while (p != q); + q = p; + do { + preorder(p->next->back, p->next, root, removing, adding, NULL, 0); + p = p->next; + } while (p->next != q); + } +} /* preorder */ + + +void updatenumdesc(node *p, node *root, long n) +{ + /* set p's numdesc to n. If p is the root, numdesc of p's + descendants are set to n-1. 
*/ + node *q; + + q = p; + if (p == root && n > 0) { + p->numdesc = n; + n--; + q = q->next; + } + do { + q->numdesc = n; + q = q->next; + } while (q != p); +} /* updatenumdesc */ + + +void add(node *below,node *newtip,node *newfork,node **root, + boolean recompute,pointarray treenode,node **grbg,long *zeros) +{ + /* inserts the nodes newfork and its left descendant, newtip, + to the tree. below becomes newfork's right descendant. + if newfork is NULL, newtip is added as below's sibling */ + /* used in dnacomp & dnapars */ + node *p; + + if (below != treenode[below->index - 1]) + below = treenode[below->index - 1]; + if (newfork) { + if (below->back != NULL) + below->back->back = newfork; + newfork->back = below->back; + below->back = newfork->next->next; + newfork->next->next->back = below; + newfork->next->back = newtip; + newtip->back = newfork->next; + if (*root == below) + *root = newfork; + updatenumdesc(newfork, *root, 2); + } else { + gnutreenode(grbg, &p, below->index, endsite, zeros); + p->back = newtip; + newtip->back = p; + p->next = below->next; + below->next = p; + updatenumdesc(below, *root, below->numdesc + 1); + } + if (!newtip->tip) + updatenumdesc(newtip, *root, newtip->numdesc); + (*root)->back = NULL; + if (!recompute) + return; + if (!newfork) { + memcpy(newtip->back->base, below->base, endsite*sizeof(long)); + memcpy(newtip->back->numsteps, below->numsteps, endsite*sizeof(long)); + memcpy(newtip->back->numnuc, below->numnuc, endsite*sizeof(nucarray)); + if (below != *root) { + memcpy(below->back->oldbase, zeros, endsite*sizeof(long)); + memcpy(below->back->oldnumsteps, zeros, endsite*sizeof(long)); + multifillin(newtip->back, below->back, 1); + } + if (!newtip->tip) { + memcpy(newtip->back->oldbase, zeros, endsite*sizeof(long)); + memcpy(newtip->back->oldnumsteps, zeros, endsite*sizeof(long)); + preorder(newtip, newtip->back, *root, NULL, NULL, below, 1); + } + memcpy(newtip->oldbase, zeros, endsite*sizeof(long)); + 
memcpy(newtip->oldnumsteps, zeros, endsite*sizeof(long)); + preorder(below, newtip, *root, NULL, newtip, below, 1); + if (below != *root) + preorder(below->back, below, *root, NULL, NULL, NULL, 0); + } else { + fillin(newtip->back, newtip->back->next->back, + newtip->back->next->next->back); + if (!newtip->tip) { + memcpy(newtip->back->oldbase, zeros, endsite*sizeof(long)); + memcpy(newtip->back->oldnumsteps, zeros, endsite*sizeof(long)); + preorder(newtip, newtip->back, *root, NULL, NULL, newfork, 1); + } + if (newfork != *root) { + memcpy(below->back->base, newfork->back->base, endsite*sizeof(long)); + memcpy(below->back->numsteps, newfork->back->numsteps, endsite*sizeof(long)); + preorder(newfork, newtip, *root, NULL, newtip, NULL, 0); + } else { + fillin(below->back, newtip, NULL); + fillin(newfork, newtip, below); + memcpy(below->back->oldbase, zeros, endsite*sizeof(long)); + memcpy(below->back->oldnumsteps, zeros, endsite*sizeof(long)); + preorder(below, below->back, *root, NULL, NULL, newfork, 1); + } + if (newfork != *root) { + memcpy(newfork->oldbase, below->base, endsite*sizeof(long)); + memcpy(newfork->oldnumsteps, below->numsteps, endsite*sizeof(long)); + preorder(newfork->back, newfork, *root, NULL, NULL, NULL, 0); + } + } +} /* add */ + + +void findbelow(node **below, node *item, node *fork) +{ + /* decide which of fork's binary children is below */ + + if (fork->next->back == item) + *below = fork->next->next->back; + else + *below = fork->next->back; +} /* findbelow */ + + +void re_move(node *item, node **fork, node **root, boolean recompute, + pointarray treenode, node **grbg, long *zeros) +{ + /* removes nodes item and its ancestor, fork, from the tree. + the new descendant of fork's ancestor is made to be + fork's second descendant (other than item). Also + returns pointers to the deleted nodes, item and fork. 
+ If item belongs to a node with more than 2 descendants, + fork will not be deleted */ + /* used in dnacomp & dnapars */ + node *p, *q, *other = NULL, *otherback = NULL; + + if (item->back == NULL) { + *fork = NULL; + return; + } + *fork = treenode[item->back->index - 1]; + if ((*fork)->numdesc == 2) { + updatenumdesc(*fork, *root, 0); + findbelow(&other, item, *fork); + otherback = other->back; + if (*root == *fork) { + *root = other; + if (!other->tip) + updatenumdesc(other, *root, other->numdesc); + } + p = item->back->next->back; + q = item->back->next->next->back; + if (p != NULL) + p->back = q; + if (q != NULL) + q->back = p; + (*fork)->back = NULL; + p = (*fork)->next; + while (p != *fork) { + p->back = NULL; + p = p->next; + } + } else { + updatenumdesc(*fork, *root, (*fork)->numdesc - 1); + p = *fork; + while (p->next != item->back) + p = p->next; + p->next = item->back->next; + } + if (!item->tip) { + updatenumdesc(item, item, item->numdesc); + if (recompute) { + memcpy(item->back->oldbase, item->back->base, endsite*sizeof(long)); + memcpy(item->back->oldnumsteps, item->back->numsteps, endsite*sizeof(long)); + memcpy(item->back->base, zeros, endsite*sizeof(long)); + memcpy(item->back->numsteps, zeros, endsite*sizeof(long)); + preorder(item, item->back, *root, item->back, NULL, item, -1); + } + } + if ((*fork)->numdesc >= 2) + chucktreenode(grbg, item->back); + item->back = NULL; + if (!recompute) + return; + if ((*fork)->numdesc == 0) { + memcpy(otherback->oldbase, otherback->base, endsite*sizeof(long)); + memcpy(otherback->oldnumsteps, otherback->numsteps, endsite*sizeof(long)); + if (other == *root) { + memcpy(otherback->base, zeros, endsite*sizeof(long)); + memcpy(otherback->numsteps, zeros, endsite*sizeof(long)); + } else { + memcpy(otherback->base, other->back->base, endsite*sizeof(long)); + memcpy(otherback->numsteps, other->back->numsteps, endsite*sizeof(long)); + } + p = other->back; + other->back = otherback; + if (other == *root) + 
preorder(other, otherback, *root, otherback, NULL, other, -1); + else + preorder(other, otherback, *root, NULL, NULL, NULL, 0); + other->back = p; + if (other != *root) { + memcpy(other->oldbase,(*fork)->base, endsite*sizeof(long)); + memcpy(other->oldnumsteps,(*fork)->numsteps, endsite*sizeof(long)); + preorder(other->back, other, *root, NULL, NULL, NULL, 0); + } + } else { + memcpy(item->oldbase, item->base, endsite*sizeof(long)); + memcpy(item->oldnumsteps, item->numsteps, endsite*sizeof(long)); + memcpy(item->base, zeros, endsite*sizeof(long)); + memcpy(item->numsteps, zeros, endsite*sizeof(long)); + preorder(*fork, item, *root, NULL, NULL, *fork, -1); + if (*fork != *root) + preorder((*fork)->back, *fork, *root, NULL, NULL, NULL, 0); + memcpy(item->base, item->oldbase, endsite*sizeof(long)); + memcpy(item->numsteps, item->oldnumsteps, endsite*sizeof(long)); + } +} /* remove */ + + +void postorder(node *p) +{ + /* traverses an n-ary tree, suming up steps at a node's descendants */ + /* used in dnacomp, dnapars, & dnapenny */ + node *q; + + if (p->tip) + return; + q = p->next; + while (q != p) { + postorder(q->back); + q = q->next; + } + zeronumnuc(p, endsite); + if (p->numdesc > 2) + multisumnsteps2(p); + else + fillin(p, p->next->back, p->next->next->back); +} /* postorder */ + + +void getnufork(node **nufork,node **grbg,pointarray treenode,long *zeros) +{ + /* find a fork not used currently */ + long i; + + i = spp; + while (treenode[i] && treenode[i]->numdesc > 0) i++; + if (!treenode[i]) + gnutreenode(grbg, &treenode[i], i, endsite, zeros); + *nufork = treenode[i]; +} /* getnufork */ + + +void reroot(node *outgroup, node *root) +{ + /* reorients tree, putting outgroup in desired position. used if + the root is binary. 
*/ + /* used in dnacomp & dnapars */ + node *p, *q; + + if (outgroup->back->index == root->index) + return; + p = root->next; + q = root->next->next; + p->back->back = q->back; + q->back->back = p->back; + p->back = outgroup; + q->back = outgroup->back; + outgroup->back->back = q; + outgroup->back = p; +} /* reroot */ + + +void reroot2(node *outgroup, node *root) +{ + /* reorients tree, putting outgroup in desired position. */ + /* used in dnacomp & dnapars */ + node *p; + + p = outgroup->back->next; + while (p->next != outgroup->back) + p = p->next; + root->next = outgroup->back; + p->next = root; +} /* reroot2 */ + + +void reroot3(node *outgroup, node *root, node *root2, node *lastdesc, + node **grbg) +{ + /* reorients tree, putting back outgroup in original position. */ + /* used in dnacomp & dnapars */ + node *p; + + p = root->next; + while (p->next != root) + p = p->next; + chucktreenode(grbg, root); + p->next = outgroup->back; + root2->next = lastdesc->next; + lastdesc->next = root2; +} /* reroot3 */ + + +void savetraverse(node *p) +{ + /* sets BOOLEANs that indicate which way is down */ + node *q; + + p->bottom = true; + if (p->tip) + return; + q = p->next; + while (q != p) { + q->bottom = false; + savetraverse(q->back); + q = q->next; + } +} /* savetraverse */ + + +void newindex(long i, node *p) +{ + /* assigns index i to node p */ + + while (p->index != i) { + p->index = i; + p = p->next; + } +} /* newindex */ + + +void flipindexes(long nextnode, pointarray treenode) +{ + /* flips index of nodes between nextnode and last node. 
*/ + long last; + node *temp; + + last = nonodes; + while (treenode[last - 1]->numdesc == 0) + last--; + if (last > nextnode) { + temp = treenode[nextnode - 1]; + treenode[nextnode - 1] = treenode[last - 1]; + treenode[last - 1] = temp; + newindex(nextnode, treenode[nextnode - 1]); + newindex(last, treenode[last - 1]); + } +} /* flipindexes */ + + +boolean parentinmulti(node *anode) +{ + /* sees if anode's parent has more than 2 children */ + node *p; + + while (!anode->bottom) anode = anode->next; + p = anode->back; + while (!p->bottom) + p = p->next; + return (p->numdesc > 2); +} /* parentinmulti */ + + +long sibsvisited(node *anode, long *place) +{ + /* computes the number of nodes which are visited earlier than anode among + its siblings */ + node *p; + long nvisited; + + while (!anode->bottom) anode = anode->next; + p = anode->back->next; + nvisited = 0; + do { + if (!p->bottom && place[p->back->index - 1] != 0) + nvisited++; + p = p->next; + } while (p != anode->back); + return nvisited; +} /* sibsvisited */ + + +long smallest(node *anode, long *place) +{ + /* finds the smallest index of sibling of anode */ + node *p; + long min; + + while (!anode->bottom) anode = anode->next; + p = anode->back->next; + if (p->bottom) p = p->next; + min = nonodes; + do { + if (p->back && place[p->back->index - 1] != 0) { + if (p->back->index <= spp) { + if (p->back->index < min) + min = p->back->index; + } else { + if (place[p->back->index - 1] < min) + min = place[p->back->index - 1]; + } + } + p = p->next; + if (p->bottom) p = p->next; + } while (p != anode->back); + return min; +} /* smallest */ + + +void bintomulti(node **root, node **binroot, node **grbg, long *zeros) +{ /* attaches root's left child to its right child and makes + the right child new root */ + node *left, *right, *newnode, *temp; + + right = (*root)->next->next->back; + left = (*root)->next->back; + if (right->tip) { + (*root)->next = right->back; + (*root)->next->next = left->back; + temp = left; + left 
= right; + right = temp; + right->back->next = *root; + } + gnutreenode(grbg, &newnode, right->index, endsite, zeros); + newnode->next = right->next; + newnode->back = left; + left->back = newnode; + right->next = newnode; + (*root)->next->back = (*root)->next->next->back = NULL; + *binroot = *root; + (*binroot)->numdesc = 0; + *root = right; + (*root)->numdesc++; + (*root)->back = NULL; +} /* bintomulti */ + + +void backtobinary(node **root, node *binroot, node **grbg) +{ /* restores binary root */ + node *p; + + binroot->next->back = (*root)->next->back; + (*root)->next->back->back = binroot->next; + p = (*root)->next; + (*root)->next = p->next; + binroot->next->next->back = *root; + (*root)->back = binroot->next->next; + chucktreenode(grbg, p); + (*root)->numdesc--; + *root = binroot; + (*root)->numdesc = 2; +} /* backtobinary */ + + +boolean outgrin(node *root, node *outgrnode) +{ /* checks if outgroup node is a child of root */ + node *p; + + p = root->next; + while (p != root) { + if (p->back == outgrnode) + return true; + p = p->next; + } + return false; +} /* outgrin */ + + +void flipnodes(node *nodea, node *nodeb) +{ /* flip nodes */ + node *backa, *backb; + + backa = nodea->back; + backb = nodeb->back; + backa->back = nodeb; + backb->back = nodea; + nodea->back = backb; + nodeb->back = backa; +} /* flipnodes */ + + +void moveleft(node *root, node *outgrnode, node **flipback) +{ /* makes outgroup node to leftmost child of root */ + node *p; + boolean done; + + p = root->next; + done = false; + while (p != root && !done) { + if (p->back == outgrnode) { + *flipback = p; + flipnodes(root->next->back, p->back); + done = true; + } + p = p->next; + } +} /* moveleft */ + + +void savetree(node *root, long *place, pointarray treenode, + node **grbg, long *zeros) +{ /* record in place where each species has to be + added to reconstruct this tree */ + /* used by dnacomp & dnapars */ + long i, j, nextnode, nvisited; + node *p, *q, *r = NULL, *root2, *lastdesc, + 
*outgrnode, *binroot, *flipback; + boolean done, newfork; + + binroot = NULL; + lastdesc = NULL; + root2 = NULL; + flipback = NULL; + outgrnode = treenode[outgrno - 1]; + if (root->numdesc == 2) + bintomulti(&root, &binroot, grbg, zeros); + if (outgrin(root, outgrnode)) { + if (outgrnode != root->next->back) + moveleft(root, outgrnode, &flipback); + } else { + root2 = root; + lastdesc = root->next; + while (lastdesc->next != root) + lastdesc = lastdesc->next; + lastdesc->next = root->next; + gnutreenode(grbg, &root, outgrnode->back->index, endsite, zeros); + root->numdesc = root2->numdesc; + reroot2(outgrnode, root); + } + savetraverse(root); + nextnode = spp + 1; + for (i = nextnode; i <= nonodes; i++) + if (treenode[i - 1]->numdesc == 0) + flipindexes(i, treenode); + for (i = 0; i < nonodes; i++) + place[i] = 0; + place[root->index - 1] = 1; + for (i = 1; i <= spp; i++) { + p = treenode[i - 1]; + while (place[p->index - 1] == 0) { + place[p->index - 1] = i; + while (!p->bottom) + p = p->next; + r = p; + p = p->back; + } + if (i > 1) { + q = treenode[i - 1]; + newfork = true; + nvisited = sibsvisited(q, place); + if (nvisited == 0) { + if (parentinmulti(r)) { + nvisited = sibsvisited(r, place); + if (nvisited == 0) + place[i - 1] = place[p->index - 1]; + else if (nvisited == 1) + place[i - 1] = smallest(r, place); + else { + place[i - 1] = -smallest(r, place); + newfork = false; + } + } else + place[i - 1] = place[p->index - 1]; + } else if (nvisited == 1) { + place[i - 1] = place[p->index - 1]; + } else { + place[i - 1] = -smallest(q, place); + newfork = false; + } + if (newfork) { + j = place[p->index - 1]; + done = false; + while (!done) { + place[p->index - 1] = nextnode; + while (!p->bottom) + p = p->next; + p = p->back; + done = (p == NULL); + if (!done) + done = (place[p->index - 1] != j); + if (done) { + nextnode++; + } + } + } + } + } + if (flipback) + flipnodes(outgrnode, flipback->back); + else { + if (root2) { + reroot3(outgrnode, root, root2, 
lastdesc, grbg); + root = root2; + } + } + if (binroot) + backtobinary(&root, binroot, grbg); +} /* savetree */ + + +void addnsave(node *p, node *item, node *nufork, node **root, node **grbg, + boolean multf, pointarray treenode, long *place, long *zeros) +{ /* adds item to tree and save it. Then removes item. */ + node *dummy; + + if (!multf) + add(p, item, nufork, root, false, treenode, grbg, zeros); + else + add(p, item, NULL, root, false, treenode, grbg, zeros); + savetree(*root, place, treenode, grbg, zeros); + if (!multf) + re_move(item, &nufork, root, false, treenode, grbg, zeros); + else + re_move(item, &dummy, root, false, treenode, grbg, zeros); +} /* addnsave */ + + +void addbestever(long *pos, long *nextree, long maxtrees, boolean collapse, + long *place, bestelm *bestrees) +{ /* adds first best tree */ + + *pos = 1; + *nextree = 1; + initbestrees(bestrees, maxtrees, true); + initbestrees(bestrees, maxtrees, false); + addtree(*pos, nextree, collapse, place, bestrees); +} /* addbestever */ + + +void addtiedtree(long pos, long *nextree, long maxtrees, boolean collapse, + long *place, bestelm *bestrees) +{ /* add tied tree */ + + if (*nextree <= maxtrees) + addtree(pos, nextree, collapse, place, bestrees); +} /* addtiedtree */ + + +void clearcollapse(pointarray treenode) +{ + /* clears collapse status at a node */ + long i; + node *p; + + for (i = 0; i < nonodes; i++) { + treenode[i]->collapse = undefined; + if (!treenode[i]->tip) { + p = treenode[i]->next; + while (p != treenode[i]) { + p->collapse = undefined; + p = p->next; + } + } + } +} /* clearcollapse */ + + +void clearbottom(pointarray treenode) +{ + /* clears boolean bottom at a node */ + long i; + node *p; + + for (i = 0; i < nonodes; i++) { + treenode[i]->bottom = false; + if (!treenode[i]->tip) { + p = treenode[i]->next; + while (p != treenode[i]) { + p->bottom = false; + p = p->next; + } + } + } +} /* clearbottom */ + + +void collabranch(node *collapfrom, node *tempfrom, node *tempto) +{ /* 
collapse branch from collapfrom */ + long i, j, b, largest, descsteps; + boolean done; + + for (i = 0; i < endsite; i++) { + descsteps = 0; + for (j = (long)A; j <= (long)O; j++) { + b = 1 << j; + if ((descsteps == 0) && (collapfrom->base[i] & b)) + descsteps = tempfrom->oldnumsteps[i] + - (collapfrom->numdesc - collapfrom->numnuc[i][j]) + * weight[i]; + } + done = false; + for (j = (long)A; j <= (long)O; j++) { + b = 1 << j; + if (!done && (tempto->base[i] & b)) { + descsteps += (tempto->numsteps[i] + - (tempto->numdesc - collapfrom->numdesc + - tempto->numnuc[i][j]) * weight[i]); + done = true; + } + } + for (j = (long)A; j <= (long)O; j++) + tempto->numnuc[i][j] += collapfrom->numnuc[i][j]; + largest = getlargest(tempto->numnuc[i]); + tempto->base[i] = 0; + for (j = (long)A; j <= (long)O; j++) { + if (tempto->numnuc[i][j] == largest) + tempto->base[i] |= (1 << j); + } + tempto->numsteps[i] = (tempto->numdesc - largest) * weight[i] + descsteps; + } +} /* collabranch */ + + +boolean allcommonbases(node *a, node *b, boolean *allsame) +{ /* see if bases are common at all sites for nodes a and b */ + long i; + boolean allcommon; + + allcommon = true; + *allsame = true; + for (i = 0; i < endsite; i++) { + if ((a->base[i] & b->base[i]) == 0) + allcommon = false; + else if (a->base[i] != b->base[i]) + *allsame = false; + } + return allcommon; +} /* allcommonbases */ + + +void findbottom(node *p, node **bottom) +{ /* find a node with field bottom set at node p */ + node *q; + + if (p->bottom) + *bottom = p; + else { + q = p->next; + while(!q->bottom && q != p) + q = q->next; + *bottom = q; + } +} /* findbottom */ + + +boolean moresteps(node *a, node *b) +{ /* see if numsteps of node a exceeds those of node b */ + long i; + + for (i = 0; i < endsite; i++) + if (a->numsteps[i] > b->numsteps[i]) + return true; + return false; +} /* moresteps */ + + +boolean passdown(node *desc, node *parent, node *start, node *below, + node *item, node *added, node *total, node *tempdsc, + 
node *tempprt, boolean multf) +{ /* track down to node start to see if an ancestor branch can be collapsed */ + node *temp; + boolean done, allsame; + + done = (parent == start); + while (!done) { + desc = parent; + findbottom(parent->back, &parent); + if (multf && start == below && parent == below) + parent = added; + memcpy(tempdsc->base, tempprt->base, endsite*sizeof(long)); + memcpy(tempdsc->numsteps, tempprt->numsteps, endsite*sizeof(long)); + memcpy(tempdsc->oldbase, desc->base, endsite*sizeof(long)); + memcpy(tempdsc->oldnumsteps, desc->numsteps, endsite*sizeof(long)); + memcpy(tempprt->base, parent->base, endsite*sizeof(long)); + memcpy(tempprt->numsteps, parent->numsteps, endsite*sizeof(long)); + memcpy(tempprt->numnuc, parent->numnuc, endsite*sizeof(nucarray)); + tempprt->numdesc = parent->numdesc; + multifillin(tempprt, tempdsc, 0); + if (!allcommonbases(tempprt, parent, &allsame)) + return false; + else if (moresteps(tempprt, parent)) + return false; + else if (allsame) + return true; + if (parent == added) + parent = below; + done = (parent == start); + if (done && ((start == item) || (!multf && start == below))) { + memcpy(tempdsc->base, tempprt->base, endsite*sizeof(long)); + memcpy(tempdsc->numsteps, tempprt->numsteps, endsite*sizeof(long)); + memcpy(tempdsc->oldbase, start->base, endsite*sizeof(long)); + memcpy(tempdsc->oldnumsteps, start->numsteps, endsite*sizeof(long)); + multifillin(added, tempdsc, 0); + tempprt = added; + } + } + temp = tempdsc; + if (start == below || start == item) + fillin(temp, tempprt, below->back); + else + fillin(temp, tempprt, added); + return !moresteps(temp, total); +} /* passdown */ + + +boolean trycollapdesc(node *desc, node *parent, node *start, + node *below, node *item, node *added, node *total, + node *tempdsc, node *tempprt, boolean multf, long *zeros) + { /* see if branch between nodes desc and parent can be collapsed */ + boolean allsame; + + if (desc->numdesc == 1) + return true; + if (multf && start == 
below && parent == below) + parent = added; + memcpy(tempdsc->base, zeros, endsite*sizeof(long)); + memcpy(tempdsc->numsteps, zeros, endsite*sizeof(long)); + memcpy(tempdsc->oldbase, desc->base, endsite*sizeof(long)); + memcpy(tempdsc->oldnumsteps, desc->numsteps, endsite*sizeof(long)); + memcpy(tempprt->base, parent->base, endsite*sizeof(long)); + memcpy(tempprt->numsteps, parent->numsteps, endsite*sizeof(long)); + memcpy(tempprt->numnuc, parent->numnuc, endsite*sizeof(nucarray)); + tempprt->numdesc = parent->numdesc - 1; + multifillin(tempprt, tempdsc, -1); + tempprt->numdesc += desc->numdesc; + collabranch(desc, tempdsc, tempprt); + if (!allcommonbases(tempprt, parent, &allsame) || + moresteps(tempprt, parent)) { + if (parent != added) { + desc->collapse = nocollap; + parent->collapse = nocollap; + } + return false; + } else if (allsame) { + if (parent != added) { + desc->collapse = tocollap; + parent->collapse = tocollap; + } + return true; + } + if (parent == added) + parent = below; + if ((start == item && parent == item) || + (!multf && start == below && parent == below)) { + memcpy(tempdsc->base, tempprt->base, endsite*sizeof(long)); + memcpy(tempdsc->numsteps, tempprt->numsteps, endsite*sizeof(long)); + memcpy(tempdsc->oldbase, start->base, endsite*sizeof(long)); + memcpy(tempdsc->oldnumsteps, start->numsteps, endsite*sizeof(long)); + memcpy(tempprt->base, added->base, endsite*sizeof(long)); + memcpy(tempprt->numsteps, added->numsteps, endsite*sizeof(long)); + memcpy(tempprt->numnuc, added->numnuc, endsite*sizeof(nucarray)); + tempprt->numdesc = added->numdesc; + multifillin(tempprt, tempdsc, 0); + if (!allcommonbases(tempprt, added, &allsame)) + return false; + else if (moresteps(tempprt, added)) + return false; + else if (allsame) + return true; + } + return passdown(desc, parent, start, below, item, added, total, tempdsc, + tempprt, multf); +} /* trycollapdesc */ + + +void setbottom(node *p) +{ /* set field bottom at node p */ + node *q; + + p->bottom = 
true; + q = p->next; + do { + q->bottom = false; + q = q->next; + } while (q != p); +} /* setbottom */ + +boolean zeroinsubtree(node *subtree, node *start, node *below, node *item, + node *added, node *total, node *tempdsc, node *tempprt, + boolean multf, node* root, long *zeros) +{ /* sees if subtree contains a zero length branch */ + node *p; + + if (!subtree->tip) { + setbottom(subtree); + p = subtree->next; + do { + if (p->back && !p->back->tip && + !((p->back->collapse == nocollap) && (subtree->collapse == nocollap)) + && (subtree->numdesc != 1)) { + if ((p->back->collapse == tocollap) && (subtree->collapse == tocollap) + && multf && (subtree != below)) + return true; + /* when root->numdesc == 2 + * there is no mandatory step at the root, + * instead of checking at the root we check around it + * we only need to check p because the first if + * statement already gets rid of it for the subtree */ + else if ((p->back->index != root->index || root->numdesc > 2) && + trycollapdesc(p->back, subtree, start, below, item, added, total, + tempdsc, tempprt, multf, zeros)) + return true; + else if ((p->back->index == root->index && root->numdesc == 2) && + !(root->next->back->tip) && !(root->next->next->back->tip) && + trycollapdesc(root->next->back, root->next->next->back, start, + below, item,added, total, tempdsc, tempprt, multf, zeros)) + return true; + } + p = p->next; + } while (p != subtree); + p = subtree->next; + do { + if (p->back && !p->back->tip) { + if (zeroinsubtree(p->back, start, below, item, added, total, + tempdsc, tempprt, multf, root, zeros)) + return true; + } + p = p->next; + } while (p != subtree); + } + return false; +} /* zeroinsubtree */ + + +boolean collapsible(node *item, node *below, node *temp, node *temp1, + node *tempdsc, node *tempprt, node *added, node *total, + boolean multf, node *root, long *zeros, pointarray treenode) +{ + /* sees if any branch can be collapsed */ + node *belowbk; + boolean allsame; + + if (multf) { + 
memcpy(tempdsc->base, item->base, endsite*sizeof(long)); + memcpy(tempdsc->numsteps, item->numsteps, endsite*sizeof(long)); + memcpy(tempdsc->oldbase, zeros, endsite*sizeof(long)); + memcpy(tempdsc->oldnumsteps, zeros, endsite*sizeof(long)); + memcpy(added->base, below->base, endsite*sizeof(long)); + memcpy(added->numsteps, below->numsteps, endsite*sizeof(long)); + memcpy(added->numnuc, below->numnuc, endsite*sizeof(nucarray)); + added->numdesc = below->numdesc + 1; + multifillin(added, tempdsc, 1); + } else { + fillin(added, item, below); + added->numdesc = 2; + } + fillin(total, added, below->back); + clearbottom(treenode); + if (below->back) { + if (zeroinsubtree(below->back, below->back, below, item, added, total, + tempdsc, tempprt, multf, root, zeros)) + return true; + } + if (multf) { + if (zeroinsubtree(below, below, below, item, added, total, + tempdsc, tempprt, multf, root, zeros)) + return true; + } else if (!below->tip) { + if (zeroinsubtree(below, below, below, item, added, total, + tempdsc, tempprt, multf, root, zeros)) + return true; + } + if (!item->tip) { + if (zeroinsubtree(item, item, below, item, added, total, + tempdsc, tempprt, multf, root, zeros)) + return true; + } + if (multf && below->back && !below->back->tip) { + memcpy(tempdsc->base, zeros, endsite*sizeof(long)); + memcpy(tempdsc->numsteps, zeros, endsite*sizeof(long)); + memcpy(tempdsc->oldbase, added->base, endsite*sizeof(long)); + memcpy(tempdsc->oldnumsteps, added->numsteps, endsite*sizeof(long)); + if (below->back == treenode[below->back->index - 1]) + belowbk = below->back->next; + else + belowbk = treenode[below->back->index - 1]; + memcpy(tempprt->base, belowbk->base, endsite*sizeof(long)); + memcpy(tempprt->numsteps, belowbk->numsteps, endsite*sizeof(long)); + memcpy(tempprt->numnuc, belowbk->numnuc, endsite*sizeof(nucarray)); + tempprt->numdesc = belowbk->numdesc - 1; + multifillin(tempprt, tempdsc, -1); + tempprt->numdesc += added->numdesc; + collabranch(added, tempdsc, 
tempprt); + if (!allcommonbases(tempprt, belowbk, &allsame)) + return false; + else if (allsame && !moresteps(tempprt, belowbk)) + return true; + else if (belowbk->back) { + fillin(temp, tempprt, belowbk->back); + fillin(temp1, belowbk, belowbk->back); + return !moresteps(temp, temp1); + } + } + return false; +} /* collapsible */ + + +void replaceback(node **oldback, node *item, node *forknode, + node **grbg, long *zeros) +{ /* replaces back node of item with another */ + node *p; + + p = forknode; + while (p->next->back != item) + p = p->next; + *oldback = p->next; + gnutreenode(grbg, &p->next, forknode->index, endsite, zeros); + p->next->next = (*oldback)->next; + p->next->back = (*oldback)->back; + p->next->back->back = p->next; + (*oldback)->next = (*oldback)->back = NULL; +} /* replaceback */ + + +void putback(node *oldback, node *item, node *forknode, node **grbg) +{ /* restores node to back of item */ + node *p, *q; + + p = forknode; + while (p->next != item->back) + p = p->next; + q = p->next; + oldback->next = p->next->next; + p->next = oldback; + oldback->back = item; + item->back = oldback; + oldback->index = forknode->index; + chucktreenode(grbg, q); +} /* putback */ + + +void savelocrearr(node *item, node *forknode, node *below, node *tmp, + node *tmp1, node *tmp2, node *tmp3, node *tmprm, node *tmpadd, + node **root, long maxtrees, long *nextree, boolean multf, + boolean bestever, boolean *saved, long *place, + bestelm *bestrees, pointarray treenode, node **grbg, + long *zeros) +{ /* saves tied or better trees during local rearrangements by removing + item from forknode and adding to below */ + node *other, *otherback = NULL, *oldfork, *nufork, *oldback; + long pos; + boolean found, collapse; + + if (forknode->numdesc == 2) { + findbelow(&other, item, forknode); + otherback = other->back; + oldback = NULL; + } else { + other = NULL; + replaceback(&oldback, item, forknode, grbg, zeros); + } + re_move(item, &oldfork, root, false, treenode, grbg, zeros); 
+  if (!multf)
+    getnufork(&nufork, grbg, treenode, zeros);
+  else
+    nufork = NULL;
+  addnsave(below, item, nufork, root, grbg, multf, treenode, place, zeros);
+  pos = 0;
+  findtree(&found, &pos, *nextree, place, bestrees);
+  if (other) {  /* two-descendant fork: re-add item next to other */
+    add(other, item, oldfork, root, false, treenode, grbg, zeros);
+    if (otherback->back != other)
+      flipnodes(item, other);
+  } else
+    add(forknode, item, NULL, root, false, treenode, grbg, zeros);
+  *saved = false;
+  if (found) {  /* findtree reported a match: only undo replaceback */
+    if (oldback)
+      putback(oldback, item, forknode, grbg);
+  } else {
+    if (oldback)
+      chucktreenode(grbg, oldback);
+    re_move(item, &oldfork, root, true, treenode, grbg, zeros);
+    collapse = collapsible(item, below, tmp, tmp1, tmp2, tmp3, tmprm,
+                     tmpadd, multf, *root, zeros, treenode);
+    if (!collapse) {  /* only non-collapsible trees are stored */
+      if (bestever)
+        addbestever(&pos, nextree, maxtrees, collapse, place, bestrees);
+      else
+        addtiedtree(pos, nextree, maxtrees, collapse, place, bestrees);
+    }
+    if (other)
+      add(other, item, oldfork, root, true, treenode, grbg, zeros);
+    else
+      add(forknode, item, NULL, root, true, treenode, grbg, zeros);
+    *saved = !collapse;
+  }
+} /* savelocrearr */
+
+
+void clearvisited(pointarray treenode)
+{
+  /* clears boolean visited at a node:
+     resets visited on treenode[0] .. treenode[nonodes - 1] and, for
+     every entry that is not a tip, on each other member of its
+     next-ring */
+  long i;
+  node *p;
+
+  for (i = 0; i < nonodes; i++) {
+    treenode[i]->visited = false;
+    if (!treenode[i]->tip) {
+      p = treenode[i]->next;
+      while (p != treenode[i]) {
+        p->visited = false;
+        p = p->next;
+      }
+    }
+  }
+} /* clearvisited */
+
+
+void hyprint(long b1, long b2, struct LOC_hyptrav *htrav,
+                        pointarray treenode, Char *basechar)
+{
+  /* print out states in sites b1 through b2 at node (htrav->r).
+     The row starts with the label of the node below (blank or "root"
+     when htrav->bottom is set), followed by this node's name for a tip
+     or its number otherwise */
+  long i, j, k, n;
+  boolean dot;
+  bases b;
+
+  if (htrav->bottom) {
+    if (!outgropt)
+      fprintf(outfile, " ");
+    else
+      fprintf(outfile, "root ");
+  } else
+    fprintf(outfile, "%4ld ", htrav->r->back->index - spp);
+  if (htrav->r->tip) {
+    for (i = 0; i < nmlngth; i++)
+      putc(nayme[htrav->r->index - 1][i], outfile);
+  } else
+    fprintf(outfile, "%4ld ", htrav->r->index - spp);
+  if (htrav->bottom)  /* "Any Steps?" column */
+    fprintf(outfile, " ");
+  else if (htrav->nonzero)
+    fprintf(outfile, " yes ");
+  else if (htrav->maybe)
+    fprintf(outfile, " maybe ");
+  else
+    fprintf(outfile, " no ");
+  for (i = b1; i <= b2; i++) {
+    j = location[ally[i - 1] - 1];
+    htrav->tempset = htrav->r->base[j - 1];
+    htrav->anc = htrav->hypset[j - 1];
+    if (!htrav->bottom)
+      htrav->anc = treenode[htrav->r->back->index - 1]->base[j - 1];
+    dot = dotdiff && (htrav->tempset == htrav->anc && !htrav->bottom);
+    if (dot)  /* same state set as the node below */
+      putc('.', outfile);
+    else if (htrav->tempset == (1 << A))
+      putc('A', outfile);
+    else if (htrav->tempset == (1 << C))
+      putc('C', outfile);
+    else if (htrav->tempset == (1 << G))
+      putc('G', outfile);
+    else if (htrav->tempset == (1 << T))
+      putc('T', outfile);
+    else if (htrav->tempset == (1 << O))
+      putc('-', outfile);
+    else {  /* several states: n is the set's bit pattern over A..O */
+      k = 1;
+      n = 0;
+      for (b = A; b <= O; b = b + 1) {
+        if (((1 << b) & htrav->tempset) != 0)
+          n += k;
+        k += k;
+      }
+      putc(basechar[n - 1], outfile);
+    }
+    if (i % 10 == 0)  /* blank after every tenth site */
+      putc(' ', outfile);
+  }
+  putc('\n', outfile);
+} /* hyprint */
+
+
+void gnubase(gbases **p, gbases **garbage, long endsite)
+{
+  /* this and the following are do-it-yourself garbage collectors. 
+     Make a new node or pull one off the garbage list */
+  if (*garbage != NULL) {
+    *p = *garbage;
+    *garbage = (*garbage)->next;
+  } else {
+    *p = (gbases *)Malloc(sizeof(gbases));
+    (*p)->base = (baseptr)Malloc(endsite*sizeof(long));
+  }
+  (*p)->next = NULL;
+} /* gnubase */
+
+
+void chuckbase(gbases *p, gbases **garbage)
+{
+  /* collect garbage on p -- put it on front of garbage list */
+  p->next = *garbage;
+  *garbage = p;
+} /* chuckbase */
+
+
+void hyptrav(node *r_, long *hypset_, long b1, long b2, boolean bottom_,
+                        pointarray treenode, gbases **garbage, Char *basechar)
+{
+  /* compute, print out states at one interior node
+   *
+   *   r_       node to report
+   *   hypset_  state sets assumed for the node below r_, indexed like
+   *            base[]
+   *   b1, b2   first and last site (1-based) of the block being printed
+   *   bottom_  true for the node hypstates() starts from
+   *   garbage  free list served by gnubase() / chuckbase()
+   *
+   * For a non-tip node the state set at each site is rebuilt from the
+   * counts in numnuc (the ancestral set plus every descendant's base),
+   * the row is printed with hyprint(), and the routine then recurses
+   * into each descendant, passing it the sets obtained after that
+   * descendant's own contribution has been subtracted from the counts.
+   *
+   * tempnuc is a scratch copy of numnuc owned by this call.  It used to
+   * be leaked on every call (once per node per block of sites); it is
+   * now released before returning. */
+  struct LOC_hyptrav Vars;
+  long i, j, k;
+  long largest;
+  gbases *ancset;
+  nucarray *tempnuc;
+  node *p, *q;
+
+  Vars.bottom = bottom_;
+  Vars.r = r_;
+  Vars.hypset = hypset_;
+  gnubase(&ancset, garbage, endsite);
+  tempnuc = (nucarray *)Malloc(endsite*sizeof(nucarray));
+  Vars.maybe = false;
+  Vars.nonzero = false;
+  if (!Vars.r->tip)
+    zeronumnuc(Vars.r, endsite);
+  for (i = b1 - 1; i < b2; i++) {
+    j = location[ally[i] - 1];
+    Vars.anc = Vars.hypset[j - 1];
+    if (!Vars.r->tip) {
+      p = Vars.r->next;
+      for (k = (long)A; k <= (long)O; k++)
+        if (Vars.anc & (1 << k))
+          Vars.r->numnuc[j - 1][k]++;
+      do {
+        for (k = (long)A; k <= (long)O; k++)
+          if (p->back->base[j - 1] & (1 << k))
+            Vars.r->numnuc[j - 1][k]++;
+        p = p->next;
+      } while (p != Vars.r);
+      largest = getlargest(Vars.r->numnuc[j - 1]);
+      Vars.tempset = 0;
+      for (k = (long)A; k <= (long)O; k++) {
+        if (Vars.r->numnuc[j - 1][k] == largest)
+          Vars.tempset |= (1 << k);
+      }
+      Vars.r->base[j - 1] = Vars.tempset;
+    }
+    if (!Vars.bottom)
+      Vars.anc = treenode[Vars.r->back->index - 1]->base[j - 1];
+    Vars.nonzero = (Vars.nonzero || (Vars.r->base[j - 1] & Vars.anc) == 0);
+    Vars.maybe = (Vars.maybe || Vars.r->base[j - 1] != Vars.anc);
+  }
+  hyprint(b1, b2, &Vars, treenode, basechar);
+  Vars.bottom = false;
+  if (!Vars.r->tip) {
+    /* keep the full counts so they can be restored for each descendant */
+    memcpy(tempnuc, Vars.r->numnuc, endsite*sizeof(nucarray));
+    q = Vars.r->next;
+    do {
+      memcpy(Vars.r->numnuc, tempnuc, endsite*sizeof(nucarray));
+      for (i = b1 - 1; i < b2; i++) {
+        j = location[ally[i] - 1];
+        for (k = (long)A; k <= (long)O; k++)
+          if (q->back->base[j - 1] & (1 << k))
+            Vars.r->numnuc[j - 1][k]--;
+        largest = getlargest(Vars.r->numnuc[j - 1]);
+        ancset->base[j - 1] = 0;
+        for (k = (long)A; k <= (long)O; k++)
+          if (Vars.r->numnuc[j - 1][k] == largest)
+            ancset->base[j - 1] |= (1 << k);
+        if (!Vars.bottom)
+          Vars.anc = ancset->base[j - 1];
+      }
+      hyptrav(q->back, ancset->base, b1, b2, Vars.bottom,
+                treenode, garbage, basechar);
+      q = q->next;
+    } while (q != Vars.r);
+  }
+  chuckbase(ancset, garbage);
+  free(tempnuc);  /* scratch copy is private to this call */
+} /* hyptrav */
+
+
+void hypstates(long chars, node *root, pointarray treenode,
+                        gbases **garbage, Char *basechar)
+{
+  /* fill in and describe states at interior nodes */
+  /* used in dnacomp, dnapars, & dnapenny */
+  /* sites are reported in blocks of 40, each block by one hyptrav()
+     pass from root with an all-zero set for the node below it */
+  long i, n;
+  baseptr nothing;
+
+  fprintf(outfile, "\nFrom To Any Steps? State at upper node\n");
+  fprintf(outfile, " ");
+  if (dotdiff)
+    fprintf(outfile, " ( . 
means same as in the node below it on tree)\n");
+  nothing = (baseptr)Malloc(endsite*sizeof(long));
+  for (i = 0; i < endsite; i++)
+    nothing[i] = 0;
+  for (i = 1; i <= ((chars - 1) / 40 + 1); i++) {  /* 40 sites per block */
+    putc('\n', outfile);
+    n = i * 40;
+    if (n > chars)
+      n = chars;
+    hyptrav(root, nothing, i * 40 - 39, n, true, treenode, garbage, basechar);
+  }
+  free(nothing);
+} /* hypstates */
+
+
+void initbranchlen(node *p)
+{ /* zeroes the branch length v on p and on p->back, recurses through
+     the back pointer of every other member of p's ring, then zeroes v
+     on those ring members as well; stops at tips */
+  node *q;
+
+  p->v = 0.0;
+  if (p->back)
+    p->back->v = 0.0;
+  if (p->tip)
+    return;
+  q = p->next;
+  while (q != p) {
+    initbranchlen(q->back);
+    q = q->next;
+  }
+  q = p->next;
+  while (q != p) {
+    q->v = 0.0;
+    q = q->next;
+  }
+} /* initbranchlen */
+
+
+void initmin(node *p, long sitei, boolean internal)
+{ /* initializes cumlengths and numreconst of p for states A..O.
+     With internal set every state gets length 0 and 1 reconstruction;
+     otherwise only the states present in p->base[sitei - 1] do, and the
+     absent ones get length -1 and 0 reconstructions */
+  long i;
+
+  if (internal) {
+    for (i = (long)A; i <= (long)O; i++) {
+      p->cumlengths[i] = 0;
+      p->numreconst[i] = 1;
+    }
+  } else {
+    for (i = (long)A; i <= (long)O; i++) {
+      if (p->base[sitei - 1] & (1 << i)) {
+        p->cumlengths[i] = 0;
+        p->numreconst[i] = 1;
+      } else {
+        p->cumlengths[i] = -1;
+        p->numreconst[i] = 0;
+      }
+    }
+  }
+} /* initmin */
+
+
+void initbase(node *p, long sitei)
+{
+  /* traverse tree to initialize base at internal nodes.
+     For each ring member q of p that has a back node, q->numnuc starts
+     as a copy of p->numnuc, the states of q->back are subtracted and
+     those of p->back (if any) added at site sitei, and q->base at that
+     site becomes the set of states holding the largest count.  The
+     routine then recurses into every q->back. */
+  node *q;
+  long i, largest;
+
+  if (p->tip)
+    return;
+  q = p->next;
+  while (q != p) {
+    if (q->back) {
+      memcpy(q->numnuc, p->numnuc, endsite*sizeof(nucarray));
+      for (i = (long)A; i <= (long)O; i++) {
+        if (q->back->base[sitei - 1] & (1 << i))
+          q->numnuc[sitei - 1][i]--;
+      }
+      if (p->back) {
+        for (i = (long)A; i <= (long)O; i++) {
+          if (p->back->base[sitei - 1] & (1 << i))
+            q->numnuc[sitei - 1][i]++;
+        }
+      }
+      largest = getlargest(q->numnuc[sitei - 1]);
+      q->base[sitei - 1] = 0;
+      for (i = (long)A; i <= (long)O; i++) {
+        if (q->numnuc[sitei - 1][i] == largest)
+          q->base[sitei - 1] |= (1 << i);
+      }
+    }
+    q = q->next;
+  }
+  q = p->next;
+  while (q != p) {
+    initbase(q->back, sitei);
+    q = q->next;
+  }
+} /* initbase */
+
+
+void inittreetrav(node *p, long sitei)
+{
+  /* traverse tree to clear 
boolean initialized and set up base */
+  node *q;
+
+  if (p->tip) {  /* tips are set from their observed states and marked done */
+    initmin(p, sitei, false);
+    p->initialized = true;
+    return;
+  }
+  q = p->next;
+  while (q != p) {
+    inittreetrav(q->back, sitei);
+    q = q->next;
+  }
+  initmin(p, sitei, true);
+  p->initialized = false;
+  q = p->next;
+  while (q != p) {  /* same reset for the other members of the ring */
+    initmin(q, sitei, true);
+    q->initialized = false;
+    q = q->next;
+  }
+} /* inittreetrav */
+
+
+void compmin(node *p, node *desc)
+{
+  /* computes minimum lengths up to p.
+     For every state i at p the cheapest state j at desc is found, where
+     a change costs 1 and no change costs 0 (with transvp set, changes
+     within {A,G} or within {C,T} also cost 0).  A cumlengths entry of
+     -1 in desc is replaced by maxx = 10 * spp.  The minimum is added to
+     p->cumlengths[i] and p->numreconst[i] is multiplied by the summed
+     numreconst of the states of desc that reach it. */
+  long i, j, minn, cost, desclen, descrecon=0, maxx;
+
+  maxx = 10 * spp;
+  for (i = (long)A; i <= (long)O; i++) {
+    minn = maxx;
+    for (j = (long)A; j <= (long)O; j++) {
+      if (transvp) {
+        if (
+               (
+                ((i == (long)A) || (i == (long)G))
+             && ((j == (long)A) || (j == (long)G))
+               )
+            || (
+                ((j == (long)C) || (j == (long)T))
+             && ((i == (long)C) || (i == (long)T))
+               )
+           )
+          cost = 0;
+        else
+          cost = 1;
+      } else {
+        if (i == j)
+          cost = 0;
+        else
+          cost = 1;
+      }
+      if (desc->cumlengths[j] == -1) {
+        desclen = maxx;
+      } else {
+        desclen = desc->cumlengths[j];
+      }
+      if (minn > cost + desclen) {  /* new minimum: restart the count */
+        minn = cost + desclen;
+        descrecon = 0;
+      }
+      if (minn == cost + desclen) {
+        descrecon += desc->numreconst[j];
+      }
+    }
+    p->cumlengths[i] += minn;
+    p->numreconst[i] *= descrecon;
+  }
+  p->initialized = true;
+} /* compmin */
+
+
+void minpostorder(node *p, pointarray treenode)
+{
+  /* traverses an n-ary tree, computing minimum steps at each node.
+     Descendants are visited first; compmin is then applied once per
+     attached descendant unless p is already marked initialized */
+  node *q;
+
+  if (p->tip) {
+    return;
+  }
+  q = p->next;
+  while (q != p) {
+    if (q->back)
+      minpostorder(q->back, treenode);
+    q = q->next;
+  }
+  if (!p->initialized) {
+    q = p->next;
+    while (q != p) {
+      if (q->back)
+        compmin(p, q->back);
+      q = q->next;
+    }
+  }
+} /* minpostorder */
+
+
+void branchlength(node *subtr1, node *subtr2, double *brlen,
+                        pointarray treenode)
+{
+  /* computes a branch length between two subtrees for a given site;
+     the result is stored in *brlen */
+  long i, j, minn, cost, nom, denom;
+  node *temp;
+
+  if (subtr1->tip) {  /* swap the two subtrees when subtr1 is a tip */
+    temp = subtr1;
+    subtr1 = subtr2;
+    
subtr2 = temp;
+  }
+  if (subtr1->index == outgrno) {  /* and again when subtr1 is the outgroup */
+    temp = subtr1;
+    subtr1 = subtr2;
+    subtr2 = temp;
+  }
+  minpostorder(subtr1, treenode);
+  minpostorder(subtr2, treenode);
+  minn = 10 * spp;
+  nom = 0;
+  denom = 0;
+  /* over all state pairs (i, j) with defined lengths on both sides, keep
+     those whose total length is minimal; nom counts the reconstructions
+     that put a change on this branch, denom counts all of them */
+  for (i = (long)A; i <= (long)O; i++) {
+    for (j = (long)A; j <= (long)O; j++) {
+      if (transvp) {
+        if (
+               (
+                ((i == (long)A) || (i == (long)G))
+             && ((j == (long)A) || (j == (long)G))
+               )
+            || (
+                ((j == (long)C) || (j == (long)T))
+             && ((i == (long)C) || (i == (long)T))
+               )
+           )
+          cost = 0;
+        else
+          cost = 1;
+      } else {
+        if (i == j)
+          cost = 0;
+        else
+          cost = 1;
+      }
+      if (subtr1->cumlengths[i] != -1 && (subtr2->cumlengths[j] != -1)) {
+        if (subtr1->cumlengths[i] + cost + subtr2->cumlengths[j] < minn) {
+          minn = subtr1->cumlengths[i] + cost + subtr2->cumlengths[j];
+          nom = 0;
+          denom = 0;
+        }
+        if (subtr1->cumlengths[i] + cost + subtr2->cumlengths[j] == minn) {
+          nom += subtr1->numreconst[i] * subtr2->numreconst[j] * cost;
+          denom += subtr1->numreconst[i] * subtr2->numreconst[j];
+        }
+      }
+    }
+  }
+  *brlen = (double)nom/(double)denom;
+} /* branchlength */
+
+
+void printbranchlengths(node *p)
+{ /* prints one row per branch above p -- the number of the fork, the
+     name (tip) or number (fork) of the node at the far end, and the
+     branch length v -- and recurses into each far-end node.
+     The original tested q->back for NULL only after it had already
+     read q->back->tip; a ring member with nothing attached is now
+     skipped before anything is dereferenced. */
+  node *q;
+  long i;
+
+  if (p->tip)
+    return;
+  q = p->next;
+  do {
+    if (q->back) {
+      fprintf(outfile, "%6ld ",q->index - spp);
+      if (q->back->tip) {
+        for (i = 0; i < nmlngth; i++)
+          putc(nayme[q->back->index - 1][i], outfile);
+      } else
+        fprintf(outfile, "%6ld ", q->back->index - spp);
+      fprintf(outfile, " %f\n",q->v);
+      printbranchlengths(q->back);
+    }
+    q = q->next;
+  } while (q != p);
+} /* printbranchlengths */
+
+
+void branchlentrav(node *p, node *root, long sitei, long chars,
+                        double *brlen, pointarray treenode)
+  {
+  /* traverses the tree computing tree length at each branch:
+     for every attached branch of p the per-site length from
+     branchlength() is weighted by weight[sitei - 1] / 10.0, divided by
+     chars and added to v on both ends of the branch */
+  node *q;
+
+  if (p->tip)
+    return;
+  if (p->index == outgrno)
+    p = p->back;
+  q = p->next;
+  do {
+    if (q->back) {
+      branchlength(q, q->back, brlen, treenode);
+      q->v += ((weight[sitei - 1] / 10.0) * (*brlen)/chars);
+      q->back->v += ((weight[sitei - 1] / 10.0) * 
(*brlen)/chars);
+      if (!q->back->tip)
+        branchlentrav(q->back, root, sitei, chars, brlen, treenode);
+    }
+    q = q->next;
+  } while (q != p);
+} /* branchlentrav */
+
+
+void treelength(node *root, long chars, pointarray treenode)
+  {
+  /* calls branchlentrav at each site, after zeroing all branch lengths
+     with initbranchlen; initbase and inittreetrav are rerun for every
+     site before the traversal */
+  long sitei;
+  double trlen;
+
+  initbranchlen(root);
+  for (sitei = 1; sitei <= endsite; sitei++) {
+    trlen = 0.0;
+    initbase(root, sitei);
+    inittreetrav(root, sitei);
+    branchlentrav(root, root, sitei, chars, &trlen, treenode);
+  }
+} /* treelength */
+
+
+void coordinates(node *p, long *tipy, double f, long *fartemp)
+{
+  /* establishes coordinates of nodes for display without lengths.
+     A tip gets x = 0 and the current row *tipy, which is then advanced
+     by down.  A fork gets x from the row span of its first and last
+     descendants scaled by f, and y midway between its middle
+     descendant(s).  *fartemp is raised to the largest x seen. */
+  node *q, *first, *last;
+  node *mid1 = NULL, *mid2 = NULL;
+  long numbranches, numb2;
+
+  if (p->tip) {
+    p->xcoord = 0;
+    p->ycoord = *tipy;
+    p->ymin = *tipy;
+    p->ymax = *tipy;
+    (*tipy) += down;
+    return;
+  }
+  numbranches = 0;
+  q = p->next;
+  do {
+    coordinates(q->back, tipy, f, fartemp);
+    numbranches += 1;
+    q = q->next;
+  } while (p != q);
+  first = p->next->back;
+  q = p->next;
+  while (q->next != p)
+    q = q->next;
+  last = q->back;
+  numb2 = 1;
+  q = p->next;
+  while (q != p) {  /* pick the middle descendant (two if the count is even) */
+    if (numb2 == (long)(numbranches + 1)/2)
+      mid1 = q->back;
+    if (numb2 == (long)(numbranches/2 + 1))
+      mid2 = q->back;
+    numb2 += 1;
+    q = q->next;
+  }
+  p->xcoord = (long)((double)(last->ymax - first->ymin) * f);
+  p->ycoord = (long)((mid1->ycoord + mid2->ycoord) / 2);
+  p->ymin = first->ymin;
+  p->ymax = last->ymax;
+  if (p->xcoord > *fartemp)
+    *fartemp = p->xcoord;
+} /* coordinates */
+
+
+void drawline(long i, double scale, node *root)
+{
+  /* draws one row of the tree diagram by moving up tree:
+     starting at root, repeatedly steps to the descendant whose
+     [ymin, ymax] range contains row i and prints the characters for
+     that segment */
+  node *p, *q, *r, *first =NULL, *last =NULL;
+  long n, j;
+  boolean extra, done, noplus;
+
+  p = root;
+  q = root;
+  extra = false;
+  noplus = false;
+  if (i == (long)p->ycoord && p == root) {
+    if (p->index - spp >= 10)
+      fprintf(outfile, " %2ld", p->index - spp);
+    else
+      fprintf(outfile, " %ld", p->index - spp);
+    
extra = true;
+    noplus = true;
+  } else
+    fprintf(outfile, " ");
+  do {
+    if (!p->tip) {
+      r = p->next;
+      done = false;
+      do {  /* find the descendant of p whose row range contains i */
+        if (i >= r->back->ymin && i <= r->back->ymax) {
+          q = r->back;
+          done = true;
+        }
+        r = r->next;
+      } while (!(done || r == p));
+      first = p->next->back;
+      r = p->next;
+      while (r->next != p)
+        r = r->next;
+      last = r->back;
+    }
+    done = (p == q);
+    n = (long)(scale * (p->xcoord - q->xcoord) + 0.5);
+    if (n < 3 && !q->tip)
+      n = 3;
+    if (extra) {  /* one column was already used by a node number */
+      n--;
+      extra = false;
+    }
+    if ((long)q->ycoord == i && !done) {  /* q sits on this row: draw its branch */
+      if (noplus) {
+        putc('-', outfile);
+        noplus = false;
+      }
+      else
+        putc('+', outfile);
+      if (!q->tip) {
+        for (j = 1; j <= n - 2; j++)
+          putc('-', outfile);
+        if (q->index - spp >= 10)
+          fprintf(outfile, "%2ld", q->index - spp);
+        else
+          fprintf(outfile, "-%ld", q->index - spp);
+        extra = true;
+        noplus = true;
+      } else {
+        for (j = 1; j < n; j++)
+          putc('-', outfile);
+      }
+    } else if (!p->tip) {
+      if ((long)last->ycoord > i && (long)first->ycoord < i
+            && i != (long)p->ycoord) {  /* row crosses p's vertical bar */
+        putc('!', outfile);
+        for (j = 1; j < n; j++)
+          putc(' ', outfile);
+      } else {
+        for (j = 1; j <= n; j++)
+          putc(' ', outfile);
+      }
+      noplus = false;
+    } else {
+      for (j = 1; j <= n; j++)
+        putc(' ', outfile);
+      noplus = false;
+    }
+    if (p != q)
+      p = q;
+  } while (!done);
+  if ((long)p->ycoord == i && p->tip) {  /* row ends at a tip: print its name */
+    for (j = 0; j < nmlngth; j++)
+      putc(nayme[p->index - 1][j], outfile);
+  }
+  putc('\n', outfile);
+} /* drawline */
+
+
+void printree(node *root, double f)
+{
+  /* prints out diagram of the tree */
+  /* used in dnacomp, dnapars, & dnapenny */
+  /* nothing but a newline is written unless treeprint is set; rows
+     1 .. tipy - down are drawn with drawline at a fixed scale of 1.5 */
+  long i, tipy, dummy;
+  double scale;
+
+  putc('\n', outfile);
+  if (!treeprint)
+    return;
+  putc('\n', outfile);
+  tipy = 1;
+  dummy = 0;
+  coordinates(root, &tipy, f, &dummy);
+  scale = 1.5;
+  putc('\n', outfile);
+  for (i = 1; i <= (tipy - down); i++)
+    drawline(i, scale, root);
+  fprintf(outfile, "\n remember:");
+  if (outgropt)
+    fprintf(outfile, " (although rooted by outgroup)");
+  
fprintf(outfile, " this is an unrooted tree!\n\n");
+} /* printree */
+
+
+void writesteps(long chars, boolean weights, steptr oldweight, node *root)
+{
+  /* used in dnacomp, dnapars, & dnapenny */
+  /* prints a table, ten sites per row, of the steps at each site:
+     oldweight[k - 1] * (root->numsteps / weight) for the site's
+     location, or 0 when oldweight[k - 1] is not positive */
+  long i, j, k, l;
+
+  putc('\n', outfile);
+  if (weights)
+    fprintf(outfile, "weighted ");
+  fprintf(outfile, "steps in each site:\n");
+  fprintf(outfile, " ");
+  for (i = 0; i <= 9; i++)
+    fprintf(outfile, "%4ld", i);
+  fprintf(outfile, "\n *------------------------------------");
+  fprintf(outfile, "-----\n");
+  for (i = 0; i <= (chars / 10); i++) {
+    fprintf(outfile, "%5ld", i * 10);
+    putc('|', outfile);
+    for (j = 0; j <= 9; j++) {
+      k = i * 10 + j;
+      if (k == 0 || k > chars)  /* no site 0, nothing past the last site */
+        fprintf(outfile, " ");
+      else {
+        l = location[ally[k - 1] - 1];
+        if (oldweight[k - 1] > 0)
+          fprintf(outfile, "%4ld",
+                  oldweight[k - 1] *
+                  (root->numsteps[l - 1] / weight[l - 1]));
+        else
+          fprintf(outfile, " 0");
+      }
+    }
+    putc('\n', outfile);
+  }
+} /* writesteps */
+
+
+void treeout(node *p, long nextree, long *col, node *root)
+{
+  /* write out file with representation of final tree */
+  /* used in dnacomp, dnamove, dnapars, & dnapenny */
+  /* writes the subtree at p to outtree in parenthesis notation; *col
+     tracks the output column and a newline is inserted after a comma
+     once it exceeds 60.  At the root the tree is closed with ';',
+     preceded by a bracketed 1/(nextree - 1) when nextree > 2 */
+  node *q;
+  long i, n;
+  Char c;
+
+  if (p->tip) {
+    n = 0;
+    for (i = 1; i <= nmlngth; i++) {  /* n = length without trailing blanks */
+      if (nayme[p->index - 1][i - 1] != ' ')
+        n = i;
+    }
+    for (i = 0; i < n; i++) {
+      c = nayme[p->index - 1][i];
+      if (c == ' ')  /* embedded blanks are written as underscores */
+        c = '_';
+      putc(c, outtree);
+    }
+    *col += n;
+  } else {
+    putc('(', outtree);
+    (*col)++;
+    q = p->next;
+    while (q != p) {
+      treeout(q->back, nextree, col, root);
+      q = q->next;
+      if (q == p)
+        break;
+      putc(',', outtree);
+      (*col)++;
+      if (*col > 60) {
+        putc('\n', outtree);
+        *col = 0;
+      }
+    }
+    putc(')', outtree);
+    (*col)++;
+  }
+  if (p != root)
+    return;
+  if (nextree > 2)
+    fprintf(outtree, "[%6.4f];\n", 1.0 / (nextree - 1));
+  else
+    fprintf(outtree, ";\n");
+} /* treeout */
+
+
+void treeout3(node *p, long nextree, long *col, node *root)
+{
+  /* write out file with representation of final tree */
+  /* used in dnapars 
-- writes branch lengths */
+  node *q;
+  long i, n, w;
+  double x;
+  Char c;
+
+  if (p->tip) {
+    n = 0;
+    for (i = 1; i <= nmlngth; i++) {  /* n = length without trailing blanks */
+      if (nayme[p->index - 1][i - 1] != ' ')
+        n = i;
+    }
+    for (i = 0; i < n; i++) {
+      c = nayme[p->index - 1][i];
+      if (c == ' ')
+        c = '_';
+      putc(c, outtree);
+    }
+    *col += n;
+  } else {
+    putc('(', outtree);
+    (*col)++;
+    q = p->next;
+    while (q != p) {
+      treeout3(q->back, nextree, col, root);
+      q = q->next;
+      if (q == p)
+        break;
+      putc(',', outtree);
+      (*col)++;
+      if (*col > 60) {
+        putc('\n', outtree);
+        *col = 0;
+      }
+    }
+    putc(')', outtree);
+    (*col)++;
+  }
+  x = p->v;
+  /* w: extra field width, taken from log10 of the branch length
+     (0.43429448222 * log(x)); never negative */
+  if (x > 0.0)
+    w = (long)(0.43429448222 * log(x));
+  else if (x == 0.0)
+    w = 0;
+  else
+    w = (long)(0.43429448222 * log(-x)) + 1;
+  if (w < 0)
+    w = 0;
+  if (p != root) {  /* every node except the root carries ":length" */
+    fprintf(outtree, ":%*.5f", (int)(w + 7), x);
+    *col += w + 8;
+  }
+  if (p != root)
+    return;
+  if (nextree > 2)
+    fprintf(outtree, "[%6.4f];\n", 1.0 / (nextree - 1));
+  else
+    fprintf(outtree, ";\n");
+} /* treeout3 */
+
+
+void drawline2(long i, double scale, tree curtree)
+{
+  /* draws one row of the tree diagram by moving up tree */
+  /* used in dnaml, proml, & restml */
+  /* starts at curtree.start, where the ring is scanned from p->next
+     all the way round to p->next again and last is taken from p->back */
+  node *p, *q;
+  long n, j;
+  boolean extra;
+  node *r, *first =NULL, *last =NULL;
+  boolean done;
+
+  p = curtree.start;
+  q = curtree.start;
+  extra = false;
+  if (i == (long)p->ycoord && p == curtree.start) {
+    if (p->index - spp >= 10)
+      fprintf(outfile, " %2ld", p->index - spp);
+    else
+      fprintf(outfile, " %ld", p->index - spp);
+    extra = true;
+  } else
+    fprintf(outfile, " ");
+  do {
+    if (!p->tip) {
+      r = p->next;
+      done = false;
+      do {  /* find the neighbour whose row range contains i */
+        if (i >= r->back->ymin && i <= r->back->ymax) {
+          q = r->back;
+          done = true;
+        }
+        r = r->next;
+      } while (!(done || (p != curtree.start && r == p) ||
+                 (p == curtree.start && r == p->next)));
+      first = p->next->back;
+      r = p;
+      while (r->next != p)
+        r = r->next;
+      last = r->back;
+      if (p == curtree.start)
+        last = p->back;
+    }
+    done = (p->tip || p == q);
+    n = 
(long)(scale * (q->xcoord - p->xcoord) + 0.5);
+    if (n < 3 && !q->tip)
+      n = 3;
+    if (extra) {  /* one column was already used by a node number */
+      n--;
+      extra = false;
+    }
+    if ((long)q->ycoord == i && !done) {  /* q sits on this row: draw its branch */
+      if ((long)p->ycoord != (long)q->ycoord)
+        putc('+', outfile);
+      else
+        putc('-', outfile);
+      if (!q->tip) {
+        for (j = 1; j <= n - 2; j++)
+          putc('-', outfile);
+        if (q->index - spp >= 10)
+          fprintf(outfile, "%2ld", q->index - spp);
+        else
+          fprintf(outfile, "-%ld", q->index - spp);
+        extra = true;
+      } else {
+        for (j = 1; j < n; j++)
+          putc('-', outfile);
+      }
+    } else if (!p->tip) {
+      if ((long)last->ycoord > i && (long)first->ycoord < i &&
+          (i != (long)p->ycoord || p == curtree.start)) {
+        putc('|', outfile);
+        for (j = 1; j < n; j++)
+          putc(' ', outfile);
+      } else {
+        for (j = 1; j <= n; j++)
+          putc(' ', outfile);
+      }
+    } else {
+      for (j = 1; j <= n; j++)
+        putc(' ', outfile);
+    }
+    if (q != p)
+      p = q;
+  } while (!done);
+  if ((long)p->ycoord == i && p->tip) {  /* row ends at a tip: print its name */
+    for (j = 0; j < nmlngth; j++)
+      putc(nayme[p->index-1][j], outfile);
+  }
+  putc('\n', outfile);
+} /* drawline2 */
+
+
+void drawline3(long i, double scale, node *start)
+{
+  /* draws one row of the tree diagram by moving up tree */
+  /* used in dnapars */
+  /* same walk as drawline2, but from a node pointer and with the ring
+     scan at every fork ending when it returns to p */
+  node *p, *q;
+  long n, j;
+  boolean extra;
+  node *r, *first =NULL, *last =NULL;
+  boolean done;
+
+  p = start;
+  q = start;
+  extra = false;
+  if (i == (long)p->ycoord) {
+    if (p->index - spp >= 10)
+      fprintf(outfile, " %2ld", p->index - spp);
+    else
+      fprintf(outfile, " %ld", p->index - spp);
+    extra = true;
+  } else
+    fprintf(outfile, " ");
+  do {
+    if (!p->tip) {
+      r = p->next;
+      done = false;
+      do {  /* find the descendant whose row range contains i */
+        if (i >= r->back->ymin && i <= r->back->ymax) {
+          q = r->back;
+          done = true;
+        }
+        r = r->next;
+      } while (!(done || (r == p)));
+      first = p->next->back;
+      r = p;
+      while (r->next != p)
+        r = r->next;
+      last = r->back;
+    }
+    done = (p->tip || p == q);
+    n = (long)(scale * (q->xcoord - p->xcoord) + 0.5);
+    if (n < 3 && !q->tip)
+      n = 3;
+    if (extra) {
+      n--;
+      extra = false;
+    }
+    if 
((long)q->ycoord == i && !done) {
+      if ((long)p->ycoord != (long)q->ycoord)
+        putc('+', outfile);
+      else
+        putc('-', outfile);
+      if (!q->tip) {
+        for (j = 1; j <= n - 2; j++)
+          putc('-', outfile);
+        if (q->index - spp >= 10)
+          fprintf(outfile, "%2ld", q->index - spp);
+        else
+          fprintf(outfile, "-%ld", q->index - spp);
+        extra = true;
+      } else {
+        for (j = 1; j < n; j++)
+          putc('-', outfile);
+      }
+    } else if (!p->tip) {
+      if ((long)last->ycoord > i && (long)first->ycoord < i &&
+          (i != (long)p->ycoord || p == start)) {
+        putc('|', outfile);
+        for (j = 1; j < n; j++)
+          putc(' ', outfile);
+      } else {
+        for (j = 1; j <= n; j++)
+          putc(' ', outfile);
+      }
+    } else {
+      for (j = 1; j <= n; j++)
+        putc(' ', outfile);
+    }
+    if (q != p)
+      p = q;
+  } while (!done);
+  if ((long)p->ycoord == i && p->tip) {  /* row ends at a tip: print its name */
+    for (j = 0; j < nmlngth; j++)
+      putc(nayme[p->index-1][j], outfile);
+  }
+  putc('\n', outfile);
+} /* drawline3 */
+
+
+void copynode(node *c, node *d, long categs)
+{ /* copies node c into node d: the x entries for every site and each of
+     categs categories, the underflows array, and the scalar fields
+     listed below.  next and back are not touched. */
+  long i, j;
+
+  for (i = 0; i < endsite; i++)
+    for (j = 0; j < categs; j++)
+      memcpy(d->x[i][j], c->x[i][j], sizeof(sitelike));
+  memcpy(d->underflows,c->underflows,sizeof(double) * endsite);
+  d->tyme = c->tyme;
+  d->v = c->v;
+  d->xcoord = c->xcoord;
+  d->ycoord = c->ycoord;
+  d->ymin = c->ymin;
+  d->ymax = c->ymax;
+  d->iter = c->iter; /* iter used in dnaml only */
+  d->haslength = c->haslength; /* haslength used in dnamlk only */
+  d->initialized = c->initialized; /* initialized used in dnamlk only */
+} /* copynode */
+
+
+void prot_copynode(node *c, node *d, long categs)
+{
+  /* a version of copynode for proml: copies protx (psitelike entries)
+     instead of x; everything else is copied exactly as in copynode */
+  long i, j;
+
+  for (i = 0; i < endsite; i++)
+    for (j = 0; j < categs; j++)
+      memcpy(d->protx[i][j], c->protx[i][j], sizeof(psitelike));
+  memcpy(d->underflows,c->underflows,sizeof(double) * endsite);
+  d->tyme = c->tyme;
+  d->v = c->v;
+  d->xcoord = c->xcoord;
+  d->ycoord = c->ycoord;
+  d->ymin = c->ymin;
+  d->ymax = c->ymax;
+  d->iter = c->iter; /* iter used in dnaml only */
+  d->haslength 
= c->haslength; /* haslength used in dnamlk only */
+  d->initialized = c->initialized; /* initialized used in dnamlk only */
+} /* prot_copynode */
+
+
+void copy_(tree *a, tree *b, long nonodes, long categs)
+{
+  /* used in dnamlk */
+  /* copies tree a into the already allocated tree b.  Node contents go
+     through copynode; each back pointer in b is set to the node of b
+     that sits at the same index and ring position as the node a's back
+     pointer refers to.  likelihood, start and root are copied as they
+     are, so b->start and b->root hold the same pointer values as in a. */
+  long i;
+  node *p, *q, *r, *s, *t;
+
+  for (i = 0; i < spp; i++) {  /* tips */
+    copynode(a->nodep[i], b->nodep[i], categs);
+    if (a->nodep[i]->back) {
+      /* the back node is the first, second or (by default) third member
+         of its ring */
+      if (a->nodep[i]->back == a->nodep[a->nodep[i]->back->index - 1])
+        b->nodep[i]->back = b->nodep[a->nodep[i]->back->index - 1];
+      else if (a->nodep[i]->back == a->nodep[a->nodep[i]->back->index - 1]->next)
+        b->nodep[i]->back = b->nodep[a->nodep[i]->back->index - 1]->next;
+      else
+        b->nodep[i]->back = b->nodep[a->nodep[i]->back->index - 1]->next->next;
+    }
+    else b->nodep[i]->back = NULL;
+  }
+  for (i = spp; i < nonodes; i++) {  /* forks: copy every ring member */
+    if (a->nodep[i]) {
+      p = a->nodep[i];
+      q = b->nodep[i];
+      r = p;
+      do {
+        copynode(p, q, categs);
+        if (p->back) {
+          s = a->nodep[p->back->index - 1];
+          t = b->nodep[p->back->index - 1];
+          if (s->tip) {
+            if(p->back == s)
+              q->back = t;
+          } else {
+            do {  /* walk both rings in step until s matches p->back */
+              if (p->back == s)
+                q->back = t;
+              s = s->next;
+              t = t->next;
+            } while (s != a->nodep[p->back->index - 1]);
+          }
+        }
+        else
+          q->back = NULL;
+        p = p->next;
+        q = q->next;
+      } while (p != r);
+    }
+  }
+  b->likelihood = a->likelihood;
+  b->start = a->start; /* start used in dnaml only */
+  b->root = a->root; /* root used in dnamlk only */
+} /* copy_ */
+
+
+void prot_copy_(tree *a, tree *b, long nonodes, long categs)
+{
+  /* used in promlk */
+  /* identical to copy_() except for calls to prot_copynode rather */
+  /* than copynode. 
*/
+  long i;
+  node *p, *q, *r, *s, *t;
+
+  for (i = 0; i < spp; i++) {  /* tips */
+    prot_copynode(a->nodep[i], b->nodep[i], categs);
+    if (a->nodep[i]->back) {
+      if (a->nodep[i]->back == a->nodep[a->nodep[i]->back->index - 1])
+        b->nodep[i]->back = b->nodep[a->nodep[i]->back->index - 1];
+      else if (a->nodep[i]->back == a->nodep[a->nodep[i]->back->index - 1]->next
+)
+        b->nodep[i]->back = b->nodep[a->nodep[i]->back->index - 1]->next;
+      else
+        b->nodep[i]->back = b->nodep[a->nodep[i]->back->index - 1]->next->next;
+    }
+    else b->nodep[i]->back = NULL;
+  }
+  for (i = spp; i < nonodes; i++) {  /* forks: copy every ring member */
+    if (a->nodep[i]) {
+      p = a->nodep[i];
+      q = b->nodep[i];
+      r = p;
+      do {
+        prot_copynode(p, q, categs);
+        if (p->back) {
+          s = a->nodep[p->back->index - 1];
+          t = b->nodep[p->back->index - 1];
+          if (s->tip)
+            {
+              if(p->back == s)
+                q->back = t;
+          } else {
+            do {  /* walk both rings in step until s matches p->back */
+              if (p->back == s)
+                q->back = t;
+              s = s->next;
+              t = t->next;
+            } while (s != a->nodep[p->back->index - 1]);
+          }
+        }
+        else
+          q->back = NULL;
+        p = p->next;
+        q = q->next;
+      } while (p != r);
+    }
+  }
+  b->likelihood = a->likelihood;
+  b->start = a->start; /* start used in dnaml only */
+  b->root = a->root; /* root used in dnamlk only */
+} /* prot_copy_ */
+
+
+void standev(long chars, long numtrees, long minwhich, double minsteps,
+                        double *nsteps, long **fsteps, longer seed)
+{ /* do paired sites test (KHT or SH test) on user-defined trees */
+  /* used in dnapars & protpars */
+  /* with exactly two trees the Kishino-Hasegawa-Templeton table is
+     printed; otherwise the Shimodaira-Hasegawa branch below runs.
+     nsteps and fsteps are divided by 10 wherever they are used. */
+  long i, j, k;
+  double wt, sumw, sum, sum2, sd;
+  double temp;
+  double **covar, *P, *f, *r;
+
+#define SAMPLES 1000
+  if (numtrees == 2) {
+    fprintf(outfile, "Kishino-Hasegawa-Templeton test\n\n");
+    fprintf(outfile, "Tree Steps Diff Steps Its S.D.");
+    fprintf(outfile, " Significantly worse?\n\n");
+    which = 1;
+    while (which <= numtrees) {
+      fprintf(outfile, "%3ld%10.1f", which, nsteps[which - 1] / 10);
+      if (minwhich == which)
+        fprintf(outfile, " <------ best\n");
+      else {
+        sumw = 0.0;
+        sum = 0.0;
+        sum2 = 0.0;
+        for (i = 0; i < endsite; i++) {
+          if (weight[i] > 0) {
+            wt = weight[i] / 10.0;
+            sumw += wt;
+            temp = (fsteps[which - 1][i] - fsteps[minwhich - 1][i]) / 10.0;
+            sum += wt * temp;
+            sum2 += wt * temp * temp;
+          }
+        }
+        /* weighted standard deviation of the per-site step differences */
+        sd = sqrt(sumw / (sumw - 1.0) * (sum2 - sum * sum / sumw));
+        fprintf(outfile, "%10.1f%12.4f",
+                (nsteps[which - 1] - minsteps) / 10, sd);
+        if ((sum > 0.0) && (sum > 1.95996 * sd))
+          fprintf(outfile, " Yes\n");
+        else
+          fprintf(outfile, " No\n");
+      }
+      which++;
+    }
+    fprintf(outfile, "\n\n");
+  } else { /* Shimodaira-Hasegawa test using normal approximation */
+    if(numtrees > MAXSHIMOTREES){
+      fprintf(outfile, "Shimodaira-Hasegawa test on first %d of %ld trees\n\n"
+              , MAXSHIMOTREES, numtrees);
+      numtrees = MAXSHIMOTREES;
+    } else {
+      fprintf(outfile, "Shimodaira-Hasegawa test\n\n");
+    }
+    covar = (double **)Malloc(numtrees*sizeof(double *));
+    sumw = 0.0;
+    for (i = 0; i < endsite; i++)
+      sumw += weight[i] / 10.0;
+    for (i = 0; i < numtrees; i++)
+      covar[i] = (double *)Malloc(numtrees*sizeof(double));
+    for (i = 0; i < numtrees; i++) { /* compute covariances of trees */
+      sum = nsteps[i]/(10.0*sumw);
+      for (j = 0; j <=i; j++) {
+        sum2 = nsteps[j]/(10.0*sumw);
+        temp = 0.0;
+        for (k = 0; k < endsite; k++) {
+          if (weight[k] > 0) {
+            wt = weight[k]/10.0;
+            temp = temp + wt*(fsteps[i][k]/10.0-sum)
+                            *(fsteps[j][k]/10.0-sum2);
+          }
+        }
+        covar[i][j] = temp;
+        if (i != j)
+          covar[j][i] = temp;
+      }
+    }
+    for (i = 0; i < numtrees; i++) { /* in-place Cholesky decomposition
+                                        of trees x trees covariance matrix */
+      sum = 0.0;
+      for (j = 0; j <= i-1; j++)
+        sum = sum + covar[i][j] * covar[i][j];
+      if (covar[i][i] <= sum)
+        temp = 0.0;
+      else
+        temp = sqrt(covar[i][i] - sum);
+      covar[i][i] = temp;
+      for (j = i+1; j < numtrees; j++) {
+        sum = 0.0;
+        for (k = 0; k < i; k++)
+          sum = sum + covar[i][k] * covar[j][k];
+        if (fabs(temp) < 1.0E-12)
+          covar[j][i] = 0.0;
+        else
+          covar[j][i] = (covar[j][i] - sum)/temp;
+      }
+    }
+    f = (double *)Malloc(numtrees*sizeof(double)); /* resampled sums */
+    P = (double *)Malloc(numtrees*sizeof(double)); /* vector of P's of trees */
+    r = (double *)Malloc(numtrees*sizeof(double)); /* store Normal variates */
+    for (i = 0; i < numtrees; i++)
+      P[i] = 0.0;
+    sum2 = nsteps[0]/10.0; /* sum2 will be smallest # of steps */
+    for (i = 1; i < numtrees; i++)
+      if (sum2 > nsteps[i]/10.0)
+        sum2 = nsteps[i]/10.0;
+    for (i = 1; i <= SAMPLES; i++) { /* loop over resampled trees */
+      for (j = 0; j < numtrees; j++) /* draw Normal variates */
+        r[j] = normrand(seed);
+      for (j = 0; j < numtrees; j++) { /* compute vectors */
+        sum = 0.0;
+        for (k = 0; k <= j; k++)
+          sum += covar[j][k]*r[k];
+        f[j] = sum;
+      }
+      /* The minimum has to start from the first tree.  Seeding it with
+         f[1] left f[0] out of the minimum and read past the end of f
+         when only one tree was present. */
+      sum = f[0];
+      for (j = 1; j < numtrees; j++) /* get min of vector */
+        if (f[j] < sum)
+          sum = f[j];
+      for (j = 0; j < numtrees; j++) /* accumulate P's */
+        if (nsteps[j]/10.0-sum2 <= f[j] - sum)
+          P[j] += 1.0/SAMPLES;
+    }
+    fprintf(outfile, "Tree Steps Diff Steps P value");
+    fprintf(outfile, " Significantly worse?\n\n");
+    for (i = 0; i < numtrees; i++) {
+      fprintf(outfile, "%3ld%10.1f", i+1, nsteps[i]/10);
+      if ((minwhich-1) == i)
+        fprintf(outfile, " <------ best\n");
+      else {
+        fprintf(outfile, " %9.1f %10.3f", nsteps[i]/10.0-sum2, P[i]);
+        if (P[i] < 0.05)
+          fprintf(outfile, " Yes\n");
+        else
+          fprintf(outfile, " No\n");
+      }
+    }
+    fprintf(outfile, "\n");
+    free(P); /* free the variables we Malloc'ed */
+    free(f);
+    free(r);
+    for (i = 0; i < numtrees; i++)
+      free(covar[i]);
+    free(covar);
+  }
+} /* standev */
+
+
+void standev2(long numtrees, long maxwhich, long a, long b, double maxlogl,
+                        double *l0gl, double **l0gf, steptr aliasweight, longer seed)
+{ /* do paired sites test (KHT or SH) for user-defined trees */
+  /* used in dnaml, dnamlk, proml, promlk, and restml */
+  /* likelihood counterpart of standev: sites a..b are weighted by
+     aliasweight, l0gl holds each tree's log likelihood and l0gf the
+     per-site values; the best tree is the one with the largest value */
+  double **covar, *P, *f, *r;
+  long i, j, k;
+  double wt, sumw, sum, sum2, sd;
+  double temp;
+
+#define SAMPLES 1000
+  if (numtrees == 2) {
+    fprintf(outfile, "Kishino-Hasegawa-Templeton test\n\n");
+    fprintf(outfile, "Tree logL Diff logL Its S.D.");
+    fprintf(outfile, " Significantly worse?\n\n");
+    which = 1;
+    while (which <= numtrees) {
+      fprintf(outfile, "%3ld %9.1f", which, l0gl[which - 1]);
+      if (maxwhich == which)
+        fprintf(outfile, " <------ best\n");
+      else {
+        sumw = 0.0;
+        sum = 0.0;
+        sum2 = 0.0;
+        for (i = a; i <= b; i++) {
+          if (aliasweight[i] > 0) {
+            wt = aliasweight[i];
+            sumw += wt;
+            temp = l0gf[which - 1][i] - l0gf[maxwhich - 1][i];
+            sum += temp * wt;
+            sum2 += wt * temp * temp;
+          }
+        }
+        temp = sum / sumw;
+        /* weighted standard deviation of the per-site differences */
+        sd = sqrt(sumw / (sumw - 1.0) * (sum2 - sum * sum / sumw ));
+        fprintf(outfile, "%10.1f %11.4f", (l0gl[which - 1])-maxlogl, sd);
+        if ((sum < 0.0) && ((-sum) > 1.95996 * sd))
+          fprintf(outfile, " Yes\n");
+        else
+          fprintf(outfile, " No\n");
+      }
+      which++;
+    }
+    fprintf(outfile, "\n\n");
+  } else { /* Shimodaira-Hasegawa test using normal approximation */
+    if(numtrees > MAXSHIMOTREES){
+      fprintf(outfile, "Shimodaira-Hasegawa test on first %d of %ld trees\n\n"
+              , MAXSHIMOTREES, numtrees);
+      numtrees = MAXSHIMOTREES;
+    } else {
+      fprintf(outfile, "Shimodaira-Hasegawa test\n\n");
+    }
+    covar = (double **)Malloc(numtrees*sizeof(double *));
+    sumw = 0.0;
+    for (i = a; i <= b; i++)
+      sumw += aliasweight[i];
+    for (i = 0; i < numtrees; i++)
+      covar[i] = (double *)Malloc(numtrees*sizeof(double));
+    for (i = 0; i < numtrees; i++) { /* compute covariances of trees */
+      sum = l0gl[i]/sumw;
+      for (j = 0; j <=i; j++) {
+        sum2 = l0gl[j]/sumw;
+        temp = 0.0;
+        for (k = a; k <= b ; k++) {
+          if (aliasweight[k] > 0) {
+            wt = aliasweight[k];
+            temp = temp + wt*(l0gf[i][k]-sum)
+                            *(l0gf[j][k]-sum2);
+          }
+        }
+        covar[i][j] = temp;
+        if (i != j)
+          covar[j][i] = temp;
+      }
+    }
+    for (i = 0; i < numtrees; i++) { /* in-place Cholesky decomposition
+                                        of trees x trees covariance matrix */
+      sum = 0.0;
+      for (j = 0; j <= i-1; j++)
+        sum = sum + covar[i][j] * covar[i][j];
+      if (covar[i][i] <= sum)
+        temp = 0.0;
+      else
+        temp = sqrt(covar[i][i] - sum);
+      covar[i][i] = temp;
+      for (j = i+1; j < numtrees; j++) {
+        sum = 0.0;
+        for (k = 0; k < i; k++)
+          sum = sum + covar[i][k] * covar[j][k];
+        if (fabs(temp) < 1.0E-12)
+          covar[j][i] = 0.0;
+        else
+          covar[j][i] = (covar[j][i] - sum)/temp;
+      }
+    }
+    f = (double *)Malloc(numtrees*sizeof(double)); /* resampled likelihoods */
+    P = (double *)Malloc(numtrees*sizeof(double)); /* vector of P's of trees */
+    r = (double *)Malloc(numtrees*sizeof(double)); /* store Normal variates */
+    for (i = 0; i < numtrees; i++)
+      P[i] = 0.0;
+    for (i = 1; i <= SAMPLES; i++) { /* loop over resampled trees */
+      for (j = 0; j < numtrees; j++) /* draw Normal variates */
+        r[j] = normrand(seed);
+      for (j = 0; j < numtrees; j++) { /* compute vectors */
+        sum = 0.0;
+        for (k = 0; k <= j; k++)
+          sum += covar[j][k]*r[k];
+        f[j] = sum;
+      }
+      /* The maximum has to start from the first tree.  Seeding it with
+         f[1] left f[0] out of the maximum and read past the end of f
+         when only one tree was present. */
+      sum = f[0];
+      for (j = 1; j < numtrees; j++) /* get max of vector */
+        if (f[j] > sum)
+          sum = f[j];
+      for (j = 0; j < numtrees; j++) /* accumulate P's */
+        if (maxlogl-l0gl[j] <= sum-f[j])
+          P[j] += 1.0/SAMPLES;
+    }
+    fprintf(outfile, "Tree logL Diff logL P value");
+    fprintf(outfile, " Significantly worse?\n\n");
+    for (i = 0; i < numtrees; i++) {
+      fprintf(outfile, "%3ld%10.1f", i+1, l0gl[i]);
+      if ((maxwhich-1) == i)
+        fprintf(outfile, " <------ best\n");
+      else {
+        fprintf(outfile, " %9.1f %10.3f", l0gl[i]-maxlogl, P[i]);
+        if (P[i] < 0.05)
+          fprintf(outfile, " Yes\n");
+        else
+          fprintf(outfile, " No\n");
+      }
+    }
+    fprintf(outfile, "\n");
+    free(P); /* free the variables we Malloc'ed */
+    free(f);
+    free(r);
+    for (i = 0; i < numtrees; i++)
+      free(covar[i]);
+    free(covar);
+  }
+} /* standev2 */
+
+
+void freetip(node *anode)
+{
+  /* used in dnacomp, dnapars, & dnapenny */
+  /* frees the numsteps, oldnumsteps, base and oldbase arrays of a node;
+     the node itself is not freed */
+
+  free(anode->numsteps);
+  free(anode->oldnumsteps);
+  free(anode->base);
+  free(anode->oldbase);
+} /* freetip */
+
+
+void freenontip(node *anode)
+{
+  /* used in dnacomp, dnapars, & dnapenny */
+  /* as freetip, and additionally frees numnuc */
+
+  free(anode->numsteps);
+  free(anode->oldnumsteps);
+  free(anode->base);
+  free(anode->oldbase);
+  free(anode->numnuc);
+} /* freenontip */
+
+
+void 
freenodes(long nonodes, pointarray treenode) +{ + /* used in dnacomp, dnapars, & dnapenny */ + long i; + node *p; + + for (i = 0; i < spp; i++) + freetip(treenode[i]); + for (i = spp; i < nonodes; i++) { + if (treenode[i] != NULL) { + p = treenode[i]->next; + do { + freenontip(p); + p = p->next; + } while (p != treenode[i]); + freenontip(p); + } + } +} /* freenodes */ + + +void freenode(node **anode) +{ + /* used in dnacomp, dnapars, & dnapenny */ + + freenontip(*anode); + free(*anode); +} /* freenode */ + + +void freetree(long nonodes, pointarray treenode) +{ + /* used in dnacomp, dnapars, & dnapenny */ + long i; + node *p, *q; + + for (i = 0; i < spp; i++) + free(treenode[i]); + for (i = spp; i < nonodes; i++) { + if (treenode[i] != NULL) { + p = treenode[i]->next; + do { + q = p->next; + free(p); + p = q; + } while (p != treenode[i]); + free(p); + } + } + free(treenode); +} /* freetree */ + + +void prot_freex_notip(long nonodes, pointarray treenode) +{ + /* used in proml */ + long i, j; + node *p; + + for (i = spp; i < nonodes; i++) { + p = treenode[i]; + if ( p == NULL ) continue; + do { + for (j = 0; j < endsite; j++){ + free(p->protx[j]); + p->protx[j] = NULL; + } + free(p->protx); + p->protx = NULL; + p = p->next; + } while (p != treenode[i]); + } +} /* prot_freex_notip */ + + +void prot_freex(long nonodes, pointarray treenode) +{ + /* used in proml */ + long i, j; + node *p; + + for (i = 0; i < spp; i++) { + for (j = 0; j < endsite; j++) + free(treenode[i]->protx[j]); + free(treenode[i]->protx); + } + for (i = spp; i < nonodes; i++) { + p = treenode[i]; + do { + for (j = 0; j < endsite; j++) + free(p->protx[j]); + free(p->protx); + p = p->next; + } while (p != treenode[i]); + } +} /* prot_freex */ + + +void freex_notip(long nonodes, pointarray treenode) +{ + /* used in dnaml & dnamlk */ + long i, j; + node *p; + + for (i = spp; i < nonodes; i++) { + p = treenode[i]; + if ( p == NULL ) continue; + do { + for (j = 0; j < endsite; j++) + free(p->x[j]); + 
free(p->x); + p = p->next; + } while (p != treenode[i]); + } +} /* freex_notip */ + + +void freex(long nonodes, pointarray treenode) +{ + /* used in dnaml & dnamlk */ + long i, j; + node *p; + + for (i = 0; i < spp; i++) { + for (j = 0; j < endsite; j++) + free(treenode[i]->x[j]); + free(treenode[i]->x); + } + for (i = spp; i < nonodes; i++) { + if(treenode[i]){ + p = treenode[i]; + do { + for (j = 0; j < endsite; j++) + free(p->x[j]); + free(p->x); + p = p->next; + } while (p != treenode[i]); + } + } +} /* freex */ + + +void freex2(long nonodes, pointarray treenode) +{ + /* used in restml */ + long i, j; + node *p; + + for (i = 0; i < spp; i++) + free(treenode[i]->x2); + for (i = spp; i < nonodes; i++) { + p = treenode[i]; + for (j = 1; j <= 3; j++) { + free(p->x2); + p = p->next; + } + } +} /* freex2 */ + + +void freegarbage(gbases **garbage) +{ + /* used in dnacomp, dnapars, & dnapenny */ + gbases *p; + + while (*garbage) { + p = *garbage; + *garbage = (*garbage)->next; + free(p->base); + free(p); + } +} /*freegarbage */ + + +void freegrbg(node **grbg) +{ + /* used in dnacomp, dnapars, & dnapenny */ + node *p; + + while (*grbg) { + p = *grbg; + *grbg = (*grbg)->next; + freenontip(p); + free(p); + } +} /*freegrbg */ + + +void collapsetree(node *p, node *root, node **grbg, pointarray treenode, + long *zeros) +{ + /* Recurse through tree searching for zero length brances between */ + /* nodes (not to tips). If one exists, collapse the nodes together, */ + /* removing the branch. */ + node *q, *x1, *y1, *x2, *y2; + long i, j, index, index2, numd; + if (p->tip) + return; + q = p->next; + do { + if (!q->back->tip && q->v == 0.000000) { + /* merge the two nodes. 
*/ + x1 = y2 = q->next; + x2 = y1 = q->back->next; + while(x1->next != q) + x1 = x1-> next; + while(y1->next != q->back) + y1 = y1-> next; + x1->next = x2; + y1->next = y2; + + index = q->index; + index2 = q->back->index; + numd = treenode[index-1]->numdesc + q->back->numdesc -1; + chucktreenode(grbg, q->back); + chucktreenode(grbg, q); + q = x2; + + /* update the indicies around the node circle */ + do{ + if(q->index != index){ + q->index = index; + } + q = q-> next; + }while(x2 != q); + updatenumdesc(treenode[index-1], root, numd); + + /* Alter treenode to point to real nodes, and update indicies */ + /* acordingly. */ + j = 0; i=0; + for(i = (index2-1); i < nonodes-1 && treenode[i+1]; i++){ + treenode[i]=treenode[i+1]; + treenode[i+1] = NULL; + x1=x2=treenode[i]; + do{ + x1->index = i+1; + x1 = x1 -> next; + } while(x1 != x2); + } + + /* Create a new empty fork in the blank spot of treenode */ + x1=NULL; + for(i=1; i <=3 ; i++){ + gnutreenode(grbg, &x2, index2, endsite, zeros); + x2->next = x1; + x1 = x2; + } + x2->next->next->next = x2; + treenode[nonodes-1]=x2; + if (q->back) + collapsetree(q->back, root, grbg, treenode, zeros); + } else { + if (q->back) + collapsetree(q->back, root, grbg, treenode, zeros); + q = q->next; + } + } while (q != p); +} /* collapsetree */ + + +void collapsebestrees(node **root, node **grbg, pointarray treenode, + bestelm *bestrees, long *place, long *zeros, + long chars, boolean recompute, boolean progress) + +{ + /* Goes through all best trees, collapsing trees where possible, and */ + /* deleting trees that are not unique. 
*/ + long i,j, k, pos, nextnode, oldnextree; + boolean found; + node *dummy; + + oldnextree = nextree; + for(i = 0 ; i < (oldnextree - 1) ; i++){ + bestrees[i].collapse = true; + } + + if(progress) + printf("Collapsing best trees\n "); + k = 0; + for(i = 0 ; i < (oldnextree - 1) ; i++){ + if(progress){ + if(i % (((oldnextree-1) / 72) + 1) == 0) + putchar('.'); + fflush(stdout); + } + while(!bestrees[k].collapse) + k++; + /* Reconstruct tree. */ + *root = treenode[0]; + add(treenode[0], treenode[1], treenode[spp], root, recompute, + treenode, grbg, zeros); + nextnode = spp + 2; + for (j = 3; j <= spp; j++) { + if (bestrees[k].btree[j - 1] > 0) + add(treenode[bestrees[k].btree[j - 1] - 1], treenode[j - 1], + treenode[nextnode++ - 1], root, recompute, treenode, grbg, + zeros); + else + add(treenode[treenode[-bestrees[k].btree[j - 1]-1]->back->index-1], + treenode[j - 1], NULL, root, recompute, treenode, grbg, zeros); + } + reroot(treenode[outgrno - 1], *root); + + treelength(*root, chars, treenode); + collapsetree(*root, *root, grbg, treenode, zeros); + savetree(*root, place, treenode, grbg, zeros); + /* move everything down in the bestree list */ + for(j = k ; j < (nextree - 2) ; j++){ + memcpy(bestrees[j].btree, bestrees[j + 1].btree, spp * sizeof(long)); + bestrees[j].gloreange = bestrees[j + 1].gloreange; + bestrees[j + 1].gloreange = false; + bestrees[j].locreange = bestrees[j + 1].locreange; + bestrees[j + 1].locreange = false; + bestrees[j].collapse = bestrees[j + 1].collapse; + } + pos=0; + findtree(&found, &pos, nextree-1, place, bestrees); + + /* put the new tree at the end of the list if it wasn't found */ + nextree--; + if(!found) + addtree(pos, &nextree, false, place, bestrees); + + /* Deconstruct the tree */ + for (j = 1; j < spp; j++){ + re_move(treenode[j], &dummy, root, recompute, treenode, + grbg, zeros); + } + } + if (progress) { + putchar('\n'); +#ifdef WIN32 + phyFillScreenColor(); +#endif + } +} diff --git 
a/forester/archive/RIO/others/phylip_mod/src/seq.h b/forester/archive/RIO/others/phylip_mod/src/seq.h new file mode 100644 index 0000000..e70a07a --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/seq.h @@ -0,0 +1,216 @@ +/*Modified by Christian Zmasek. Use at your own risk.*/ + +/* version 3.6. (c) Copyright 1993-2000 by the University of Washington. + Written by Joseph Felsenstein, Akiko Fuseki, Sean Lamont, and Andrew Keeffe. + Permission is granted to copy and use this program provided no fee is + charged for it and provided that this copyright notice is not removed. */ + +/* + seq.h: included in dnacomp, dnadist, dnainvar, dnaml, dnamlk, dnamove, + dnapars, dnapenny, protdist, protpars, & restml +*/ + +/* move */ +/* All the below moved here in the Great TreeRead Migration of '96 */ + +#define ebcdic EBCDIC +#define MAXNCH 26 /*changed from to 20 to 26 by CZ 2006-07-28 */ + +/* All of this came over from cons.h -plc*/ +#define OVER 7 +#define ADJACENT_PAIRS 1 +#define CORR_IN_1_AND_2 2 +#define ALL_IN_1_AND_2 3 +#define NO_PAIRING 4 +#define ALL_IN_FIRST 5 +#define TREE1 8 +#define TREE2 9 + +#define FULL_MATRIX 11 +#define VERBOSE 22 +#define SPARSE 33 + +/* Used in proml, promlk, dnaml, dnamlk for undefined bestyet*/ +#define UNDEFINED 1.0 + + +/* Number of columns per block in a matrix output */ +#define COLUMNS_PER_BLOCK 10 + + +/*end move*/ + + +typedef struct gbases { + baseptr base; + struct gbases *next; +} gbases; + +typedef struct nuview_data { + /* A big 'ol collection of pointers used in nuview */ + double *yy, *wwzz, *vvzz, *vzsumr, *vzsumy, *sum, *sumr, *sumy; + sitelike *xx; +} nuview_data; + +struct LOC_hyptrav { + boolean bottom; + node *r; + long *hypset; + boolean maybe, nonzero; + long tempset, anc; +} ; + + +extern long nonodes, endsite, outgrno, nextree, which; + +extern boolean interleaved, printdata, outgropt, treeprint, dotdiff, transvp; +extern steptr weight, category, alias, location, ally; +extern sequence y; + +#ifndef 
OLDC +/* function prototypes */ +void alloctemp(node **, long *, long); +void freetemp(node **); +void freetree2 (pointarray, long); +void inputdata(long); +void alloctree(pointarray *, long, boolean); +void allocx(long, long, pointarray, boolean); + +void prot_allocx(long, long, pointarray, boolean); +void allocx2(long, long, long, pointarray, boolean); +void setuptree(pointarray, long, boolean); +void setuptree2(tree); +void alloctip(node *, long *); +void freetrans(transptr *, long ,long ); +void getbasefreqs(double, double, double, double, double *, double *, + double *, double *, double *, double *, double *, + double *xi, double *, double *, boolean, boolean); +void empiricalfreqs(double *,double *,double *,double *,steptr,pointarray); +void sitesort(long, steptr); +void sitecombine(long); + +void sitescrunch(long); +void sitesort2(long, steptr); +void sitecombine2(long, steptr); +void sitescrunch2(long, long, long, steptr); +void makevalues(pointarray, long *, boolean); +void makevalues2(long, pointarray, long, long, sequence, steptr); +void fillin(node *, node *, node *); +long getlargest(long *); +void multifillin(node *, node *, long); +void sumnsteps(node *, node *, node *, long, long); + +void sumnsteps2(node *, node *, node *, long, long, long *); +void multisumnsteps(node *, node *, long, long, long *); +void multisumnsteps2(node *); +boolean alltips(node *, node *); +void gdispose(node *, node **, pointarray); +void preorder(node *, node *, node *, node *, node *, node *, long); +void updatenumdesc(node *, node *, long); +void add(node *,node *,node *,node **,boolean,pointarray,node **,long *); +void findbelow(node **below, node *item, node *fork); + +void re_move(node *item, node **fork, node **root, boolean recompute, + pointarray, node **, long *); +void postorder(node *p); +void getnufork(node **, node **, pointarray, long *); +void reroot(node *, node *); +void reroot2(node *, node *); +void reroot3(node *, node *, node *, node *, node **); 
+void savetraverse(node *); +void newindex(long, node *); +void flipindexes(long, pointarray); +boolean parentinmulti(node *); + +long sibsvisited(node *, long *); +long smallest(node *, long *); +void bintomulti(node **, node **, node **, long *); +void backtobinary(node **, node *, node **); +boolean outgrin(node *, node *); +void flipnodes(node *, node *); +void moveleft(node *, node *, node **); +void savetree(node *, long *, pointarray, node **, long *); +void addnsave(node *, node *, node *, node **, node **,boolean, + pointarray, long *, long *); +void addbestever(long *, long *, long, boolean, long *, bestelm *); + +void addtiedtree(long, long *, long, boolean,long *, bestelm *); +void clearcollapse(pointarray); +void clearbottom(pointarray); +void collabranch(node *, node *, node *); +boolean allcommonbases(node *, node *, boolean *); +void findbottom(node *, node **); +boolean moresteps(node *, node *); +boolean passdown(node *, node *, node *, node *, node *, node *, + node *, node *, node *, boolean); +boolean trycollapdesc(node *, node *, node *, node *, node *, + node *, node *, node *, node *, boolean , long *); +void setbottom(node *); + +boolean zeroinsubtree(node *, node *, node *, node *, node *, + node *, node *, node *, boolean, node *, long *); +boolean collapsible(node *, node *, node *, node *, node *, + node *, node *, node *, boolean, node *, long *, pointarray); +void replaceback(node **, node *, node *, node **, long *); +void putback(node *, node *, node *, node **); +void savelocrearr(node *, node *, node *, node *, node *, node *, + node *, node *, node *, node **, long, long *, boolean, + boolean , boolean *, long *, bestelm *, pointarray , + node **, long *); +void clearvisited(pointarray); +void hyprint(long, long, struct LOC_hyptrav *,pointarray, Char *); +void gnubase(gbases **, gbases **, long); +void chuckbase(gbases *, gbases **); +void hyptrav(node *, long *, long, long, boolean,pointarray, + gbases **, Char *); + +void 
hypstates(long , node *, pointarray, gbases **, Char *); +void initbranchlen(node *p); +void initmin(node *, long, boolean); +void initbase(node *, long); +void inittreetrav(node *, long); +void compmin(node *, node *); +void minpostorder(node *, pointarray); +void branchlength(node *,node *,double *,pointarray); +void printbranchlengths(node *); +void branchlentrav(node *,node *,long,long,double *,pointarray); + +void treelength(node *, long, pointarray); +void coordinates(node *, long *, double, long *); +void drawline(long, double, node *); +void printree(node *, double); +void writesteps(long, boolean, steptr, node *); +void treeout(node *, long, long *, node *); +void treeout3(node *, long, long *, node *); +void drawline2(long, double, tree); +void drawline3(long, double, node *); +void copynode(node *, node *, long); + +void prot_copynode(node *, node *, long); +void copy_(tree *, tree *, long, long); +void prot_copy_(tree *, tree *, long, long); +void standev(long, long, long, double, double *, long **, longer); +void standev2(long, long, long, long, double, double *, double **, + steptr, longer); +void freetip(node *); +void freenontip(node *); +void freenodes(long, pointarray); +void freenode(node **); +void freetree(long, pointarray); + +void freex(long, pointarray); +void freex_notip(long, pointarray); +void freex2(long, pointarray); +void prot_freex_notip(long nonodes, pointarray treenode); +void prot_freex(long nonodes, pointarray treenode); +void freegarbage(gbases **); +void freegrbg(node **); + +void collapsetree(node *, node *, node **, pointarray, long *); +void collapsebestrees(node **, node **, pointarray, bestelm *, long *, + long *, long, boolean, boolean); +void fix_x(node* p,long site, double maxx, long rcategs); +void fix_protx(node* p,long site,double maxx, long rcategs); +/*function prototypes*/ +#endif + diff --git a/forester/archive/RIO/others/phylip_mod/src/seqboot.c b/forester/archive/RIO/others/phylip_mod/src/seqboot.c new file mode 
100644 index 0000000..c5b9c29 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/seqboot.c @@ -0,0 +1,1419 @@ +#include "phylip.h" +#include "seq.h" + +/* version 3.6. (c) Copyright 1993-2005 by the University of Washington. + Written by Joseph Felsenstein, Akiko Fuseki, Sean Lamont, Andrew Keeffe, + and Doug Buxton. + Permission is granted to copy and use this program provided no fee is + charged for it and provided that this copyright notice is not removed. */ + +typedef enum { + seqs, morphology, restsites, genefreqs +} datatype; + +typedef enum { + dna, rna, protein +} seqtype; + + +#ifndef OLDC +/* function prototypes */ +void getoptions(void); +void seqboot_inputnumbers(void); +void seqboot_inputfactors(void); +void inputoptions(void); +void seqboot_inputdata(void); +void allocrest(void); +void allocnew(void); +void doinput(int argc, Char *argv[]); +void bootweights(void); +void sppermute(long); +void charpermute(long, long); +void writedata(void); +void writeweights(void); +void writecategories(void); +void writeauxdata(steptr, FILE*); +void writefactors(void); +void bootwrite(void); +void seqboot_inputaux(steptr, FILE*); +/* function prototypes */ +#endif + + +FILE *outcatfile, *outweightfile, *outmixfile, *outancfile, *outfactfile; +Char infilename[FNMLNGTH], outfilename[FNMLNGTH], catfilename[FNMLNGTH], outcatfilename[FNMLNGTH], + weightfilename[FNMLNGTH], outweightfilename[FNMLNGTH], mixfilename[FNMLNGTH], outmixfilename[FNMLNGTH], ancfilename[FNMLNGTH], outancfilename[FNMLNGTH], + factfilename[FNMLNGTH], outfactfilename[FNMLNGTH]; +long sites, loci, maxalleles, groups, newsites, newersites, + newgroups, newergroups, nenzymes, reps, ws, blocksize, categs, maxnewsites; +boolean bootstrap, permute, ild, lockhart, jackknife, regular, xml, nexus, + weights, categories, factors, enzymes, all, justwts, progress, mixture, + firstrep, ancvar; +double fracsample; +datatype data; +seqtype seq; +steptr oldweight, where, how_many, newwhere, newhowmany, + 
newerwhere, newerhowmany, factorr, newerfactor, mixdata, ancdata; +steptr *charorder; +Char *factor; +long *alleles; +Char **nodep; +double **nodef; +long **sppord; +longer seed; + + +void getoptions() +{ + /* interactively set options */ + long reps0; + long inseed, inseed0, loopcount, loopcount2; + Char ch; + boolean done1; + + data = seqs; + seq = dna; + bootstrap = true; + jackknife = false; + permute = false; + ild = false; + lockhart = false; + blocksize = 1; + regular = true; + fracsample = 1.0; + all = false; + reps = 100; + weights = false; + mixture = false; + ancvar = false; + categories = false; + justwts = false; + printdata = false; + dotdiff = true; + progress = true; + interleaved = true; + xml = false; + nexus = false; + factors = false; + loopcount = 0; + for (;;) { + cleerhome(); + printf("\nBootstrapping algorithm, version %s\n\n",VERSION); + printf("Settings for this run:\n"); + printf(" D Sequence, Morph, Rest., Gene Freqs? %s\n", + (data == seqs ) ? "Molecular sequences" : + (data == morphology ) ? "Discrete Morphology" : + (data == restsites) ? "Restriction Sites" : + (data == genefreqs) ? "Gene Frequencies" : ""); + if (data == restsites) + printf(" E Number of enzymes? %s\n", + enzymes ? "Present in input file" : + "Not present in input file"); + if (data == genefreqs) + printf(" A All alleles present at each locus? %s\n", + all ? "Yes" : "No, one absent at each locus"); + if (data == morphology) + printf(" F Use factors information? %s\n", + factors ? "Yes" : "No"); + + printf(" J Bootstrap, Jackknife, Permute, Rewrite? %s\n", + regular && jackknife ? "Delete-half jackknife" : + (!regular) && jackknife ? "Delete-fraction jackknife" : + permute ? "Permute species for each character" : + ild ? "Permute character order" : + lockhart ? "Permute within species" : + regular && bootstrap ? "Bootstrap" : + (!regular) && bootstrap ? 
"Partial bootstrap" : + "Rewrite data"); + if (bootstrap || jackknife) { + printf(" "); + putchar('%'); + printf(" Regular or altered sampling fraction? "); + if (regular) + printf("regular\n"); + else { + if (fabs(fracsample*100 - (int)(fracsample*100)) > 0.01) { + if (fracsample < 1) + printf("%2.1lf", 100.0*fracsample); + else + printf("%3.1lf", 100.0*fracsample); + } else { if (fracsample < 1) + printf("%2.0lf", 100.0*fracsample); + else + printf("%3.0lf", 100.0*fracsample); + } + putchar('%'); + printf(" sampled\n"); + } + } + if ((data == seqs) + && !(jackknife || permute || bootstrap || ild || lockhart)) { + printf(" P PHYLIP, NEXUS, or XML output format? %s\n", + nexus ? "NEXUS" : xml ? "XML" : "PHYLIP"); + if (xml || ((data == seqs) && nexus)) { + printf(" S Type of molecular sequences? " ); + switch (seq) { + case (dna) : printf("DNA\n"); break; + case (rna) : printf("RNA\n"); break; + case (protein) : printf("Protein\n"); break; + } + } + } + if ((data == morphology) && !(jackknife || permute || ild + || lockhart || bootstrap)) + printf(" P PHYLIP or NEXUS output format? %s\n", + nexus ? "NEXUS" : "PHYLIP"); + if (bootstrap) { + if (blocksize > 1) + printf(" B Block size for block-bootstrapping? %ld\n", blocksize); + else + printf(" B Block size for block-bootstrapping? %ld (regular bootstrap)\n", blocksize); + } + if (bootstrap || jackknife || permute || ild || lockhart) + printf(" R How many replicates? %ld\n", reps); + if (jackknife || bootstrap || permute) { + printf(" W Read weights of characters? %s\n", + (weights ? "Yes" : "No")); + if(data == morphology){ + printf(" X Read mixture file? %s\n", + (mixture ? "Yes" : "No")); + printf(" N Read ancestors file? %s\n", + (ancvar ? "Yes" : "No")); + } + if (data == seqs) + printf(" C Read categories of sites? %s\n", + (categories ? "Yes" : "No")); + if ((!permute)) { + printf(" S Write out data sets or just weights? %s\n", + (justwts ? 
"Just weights" : "Data sets")); + } + } + if (data == seqs || data == restsites) + printf(" I Input sequences interleaved? %s\n", + interleaved ? "Yes" : "No, sequential"); + printf(" 0 Terminal type (IBM PC, ANSI, none)? %s\n", + ibmpc ? "IBM PC" : ansi ? "ANSI" : "(none)"); + printf(" 1 Print out the data at start of run %s\n", + printdata ? "Yes" : "No"); + if (printdata) + printf(" . Use dot-differencing to display them %s\n", + dotdiff ? "Yes" : "No"); + printf(" 2 Print indications of progress of run %s\n", + progress ? "Yes" : "No"); + printf("\n Y to accept these or type the letter for one to change\n"); +#ifdef WIN32 + phyFillScreenColor(); +#endif + scanf("%c%*[^\n]", &ch); + getchar(); + uppercase(&ch); + if (ch == 'Y') + break; + if ( + (bootstrap && (strchr("ABCDEFSJPRWXNI%1.20",ch) != NULL)) || + (jackknife && (strchr("ACDEFSJPRWXNI%1.20",ch) != NULL)) || + ((permute || ild || lockhart) + && (strchr("ACDEFSJPRXNI%1.20",ch) != NULL)) || + ((!(bootstrap || jackknife || permute || ild || lockhart)) && + ((!xml) && (strchr("ADEFJPI1.20",ch) != NULL))) || + (((data == morphology) || (data == seqs)) + && (nexus || xml) && (strchr("ADEFJPSI1.20",ch) != NULL)) + ) { + switch (ch) { + + case 'D': + if (data == genefreqs) + data = seqs; + else + data = (datatype)((long)data + 1); + break; + + case 'A': + all = !all; + break; + + case 'E': + enzymes = !enzymes; + break; + + case 'J': + if (permute) { + permute = false; + ild = true; + } else if (ild) { + ild = false; + lockhart = true; + } else if (lockhart) + lockhart = false; + else if (jackknife) { + jackknife = false; + permute = true; + } else if (bootstrap) { + bootstrap = false; + jackknife = true; + } else + bootstrap = true; + break; + + case '%': + regular = !regular; + if (!regular) { + loopcount2 = 0; + do { + printf("Samples as percentage of"); + if ((data == seqs) || (data == restsites)) + printf(" sites?\n"); + if (data == morphology) + printf(" characters?\n"); + if (data == genefreqs) + printf(" 
loci?\n"); + scanf("%lf%*[^\n]", &fracsample); + getchar(); + done1 = (fracsample > 0.0); + if (!done1) { + printf("BAD NUMBER: must be positive\n"); + } + fracsample = fracsample/100.0; + countup(&loopcount2, 10); + } while (done1 != true); + } + break; + + case 'P': + if (data == seqs) { + if (!xml && !nexus) + nexus = true; + else { + if (nexus) { + nexus = false; + xml = true; + } + else xml = false; + } + } + if (data == morphology) { + nexus = !nexus; + xml = false; + } + break; + + case 'S': + if(jackknife || permute || bootstrap || ild || lockhart){ + justwts = !justwts; + } else { + switch (seq) { + case (dna): seq = rna; break; + case (rna): seq = protein; break; + case (protein): seq = dna; break; + } + } + break; + + case 'B': + loopcount2 = 0; + do { + printf("Block size?\n"); +#ifdef WIN32 + phyFillScreenColor(); +#endif + scanf("%ld%*[^\n]", &blocksize); + getchar(); + done1 = (blocksize > 0); + if (!done1) { + printf("BAD NUMBER: must be positive\n"); + } + countup(&loopcount2, 10); + } while (done1 != true); + break; + + case 'R': + reps0 = reps; + loopcount2 = 0; + do { + printf("Number of replicates?\n"); +#ifdef WIN32 + phyFillScreenColor(); +#endif + scanf("%ld%*[^\n]", &reps); + getchar(); + done1 = (reps > 0); + if (!done1) { + printf("BAD NUMBER: must be positive\n"); + reps = reps0; + } + countup(&loopcount2, 10); + } while (done1 != true); + break; + + case 'W': + weights = !weights; + break; + + case 'X': + mixture = !mixture; + break; + + case 'N': + ancvar = !ancvar; + break; + + case 'C': + categories = !categories; + break; + + case 'F': + factors = !factors; + break; + + case 'I': + interleaved = !interleaved; + break; + + case '0': + initterminal(&ibmpc, &ansi); + break; + + case '1': + printdata = !printdata; + break; + + case '.': + dotdiff = !dotdiff; + break; + + case '2': + progress = !progress; + break; + } + } else + printf("Not a possible option!\n"); + countup(&loopcount, 100); + } + if (bootstrap || jackknife) { + if 
(jackknife && regular) + fracsample = 0.5; + if (bootstrap && regular) + fracsample = 1.0; + } + if (bootstrap || jackknife || permute || ild || lockhart) + initseed(&inseed, &inseed0, seed); + xml = xml && (data == seqs); + categories = categories && (data == seqs); + mixture = mixture && (data == morphology); + ancvar = ancvar && (data == morphology); +} /* getoptions */ + + +void seqboot_inputnumbers() +{ + /* read numbers of species and of sites */ + long i; + + fscanf(infile, "%ld%ld", &spp, &sites); + loci = sites; + maxalleles = 1; + if (data == restsites && enzymes) + fscanf(infile, "%ld", &nenzymes); + if (data == genefreqs) { + alleles = (long *)Malloc(sites*sizeof(long)); + scan_eoln(infile); + sites = 0; + for (i = 0; i < (loci); i++) { + if (eoln(infile)) + scan_eoln(infile); + fscanf(infile, "%ld", &alleles[i]); + if (alleles[i] > maxalleles) + maxalleles = alleles[i]; + if (all) + sites += alleles[i]; + else + sites += alleles[i] - 1; + } + if (!all) + maxalleles--; + scan_eoln(infile); + } +} /* seqboot_inputnumbers */ + + +void seqboot_inputfactors() +{ + long i, j; + Char ch, prevch; + + prevch = ' '; + j = 0; + for (i = 0; i < (sites); i++) { + do { + if (eoln(factfile)) + scan_eoln(factfile); + ch = gettc(factfile); + } while (ch == ' '); + if (ch != prevch) + j++; + prevch = ch; + factorr[i] = j; + } + scan_eoln(factfile); +} /* seqboot_inputfactors */ + + +void inputoptions() +{ + /* input the information on the options */ + long weightsum, maxfactsize, i, j, k, l, m; + + if (data == genefreqs) { + k = 0; + l = 0; + for (i = 0; i < (loci); i++) { + if (all) + m = alleles[i]; + else + m = alleles[i] - 1; + k++; + for (j = 1; j <= m; j++) { + l++; + factorr[l - 1] = k; + } + } + } else { + for (i = 1; i <= (sites); i++) + factorr[i - 1] = i; + } + if(factors){ + seqboot_inputfactors(); + } + for (i = 0; i < (sites); i++) + oldweight[i] = 1; + if (weights) + inputweights2(0, sites, &weightsum, oldweight, &weights, "seqboot"); + if (factors && 
printdata) { + for(i = 0; i < sites; i++) + factor[i] = (char)('0' + (factorr[i]%10)); + printfactors(outfile, sites, factor, " (least significant digit)"); + } + if (weights && printdata) + printweights(outfile, 0, sites, oldweight, "Sites"); + for (i = 0; i < (loci); i++) + how_many[i] = 0; + for (i = 0; i < (loci); i++) + where[i] = 0; + for (i = 1; i <= (sites); i++) { + how_many[factorr[i - 1] - 1]++; + if (where[factorr[i - 1] - 1] == 0) + where[factorr[i - 1] - 1] = i; + } + groups = factorr[sites - 1]; + newgroups = 0; + newsites = 0; + maxfactsize = 0; + for(i = 0 ; i < loci ; i++){ + if(how_many[i] > maxfactsize){ + maxfactsize = how_many[i]; + } + } + maxnewsites = groups * maxfactsize; + allocnew(); + for (i = 0; i < (groups); i++) { + if (oldweight[where[i] - 1] > 0) { + newgroups++; + newsites += how_many[i]; + newwhere[newgroups - 1] = where[i]; + newhowmany[newgroups - 1] = how_many[i]; + } + } +} /* inputoptions */ + + +void seqboot_inputdata() +{ + /* input the names and sequences for each species */ + long i, j, k, l, m, n, basesread, basesnew=0; + double x; + Char charstate; + boolean allread, done; + + if (data == genefreqs) { + nodef = (double **)Malloc(spp*sizeof(double *)); + for (i = 0; i < (spp); i++) + nodef[i] = (double *)Malloc(sites*sizeof(double)); + } else { + nodep = (Char **)Malloc(spp*sizeof(Char *)); + for (i = 0; i < (spp); i++) + nodep[i] = (Char *)Malloc(sites*sizeof(Char)); + } + j = nmlngth + (sites + (sites - 1) / 10) / 2 - 5; + if (j < nmlngth - 1) + j = nmlngth - 1; + if (j > 37) + j = 37; + if (printdata) { + fprintf(outfile, "\nBootstrapping algorithm, version %s\n\n\n",VERSION); + if (bootstrap) { + if (blocksize > 1) { + if (regular) + fprintf(outfile, "Block-bootstrap with block size %ld\n\n", blocksize); + else + fprintf(outfile, "Partial (%2.0f%%) block-bootstrap with block size %ld\n\n", + 100*fracsample, blocksize); + } else { + if (regular) + fprintf(outfile, "Bootstrap\n\n"); + else + fprintf(outfile, "Partial 
(%2.0f%%) bootstrap\n\n", 100*fracsample); + } + } else { + if (jackknife) { + if (regular) + fprintf(outfile, "Delete-half Jackknife\n\n"); + else + fprintf(outfile, "Delete-%2.0f%% Jackknife\n\n", 100*(1.0-fracsample)); + } else { + if (permute) { + fprintf(outfile, "Species order permuted separately for each"); + if (data == genefreqs) + fprintf(outfile, " locus\n\n"); + if (data == seqs) + fprintf(outfile, " site\n\n"); + if (data == morphology) + fprintf(outfile, " character\n\n"); + if (data == restsites) + fprintf(outfile, " site\n\n"); + } + else { + if (ild) { + if (data == genefreqs) + fprintf(outfile, "Locus"); + if (data == seqs) + fprintf(outfile, "Site"); + if (data == morphology) + fprintf(outfile, "Character"); + if (data == restsites) + fprintf(outfile, "Site"); + fprintf(outfile, " order permuted\n\n"); + } else { + if (lockhart) + if (data == genefreqs) + fprintf(outfile, "Locus"); + if (data == seqs) + fprintf(outfile, "Site"); + if (data == morphology) + fprintf(outfile, "Character"); + if (data == restsites) + fprintf(outfile, "Site"); + fprintf(outfile, " order permuted separately for each species\n\n"); + } + } + } + } + if (data == genefreqs) + fprintf(outfile, "%3ld species, %3ld loci\n\n", spp, loci); + else { + fprintf(outfile, "%3ld species, ", spp); + if (data == seqs) + fprintf(outfile, "%3ld sites\n\n", sites); + else if (data == morphology) + fprintf(outfile, "%3ld characters\n\n", sites); + else if (data == restsites) + fprintf(outfile, "%3ld sites\n\n", sites); + } + fprintf(outfile, "Name"); + for (i = 1; i <= j; i++) + putc(' ', outfile); + fprintf(outfile, "Data\n"); + fprintf(outfile, "----"); + for (i = 1; i <= j; i++) + putc(' ', outfile); + fprintf(outfile, "----\n\n"); + } + interleaved = (interleaved && ((data == seqs) || (data == restsites))); + if (data == genefreqs) { + for (i = 1; i <= (spp); i++) { + initname(i - 1); + j = 1; + while (j <= sites && !eoff(infile)) { + if (eoln(infile)) + scan_eoln(infile); + 
fscanf(infile, "%lf", &x); + if ((unsigned)x > 1.0) { + printf("GENE FREQ OUTSIDE [0,1] in species %ld\n", i); + exxit(-1); + } else { + nodef[i - 1][j - 1] = x; + j++; + } + } + scan_eoln(infile); + } + return; + } + basesread = 0; + allread = false; + while (!allread) { + /* eat white space -- if the separator line has spaces on it*/ + do { + charstate = gettc(infile); + } while (charstate == ' ' || charstate == '\t'); + ungetc(charstate, infile); + if (eoln(infile)) + scan_eoln(infile); + i = 1; + while (i <= spp) { + if ((interleaved && basesread == 0) || !interleaved) + initname(i-1); + j = interleaved ? basesread : 0; + done = false; + while (!done && !eoff(infile)) { + if (interleaved) + done = true; + while (j < sites && !(eoln(infile) ||eoff(infile))) { + charstate = gettc(infile); + if (charstate == '\n' || charstate == '\t') + charstate = ' '; + if (charstate == ' ' || + (data == seqs && charstate >= '0' && charstate <= '9')) + continue; + uppercase(&charstate); + j++; + if (charstate == '.') + charstate = nodep[0][j-1]; + nodep[i-1][j-1] = charstate; + } + if (interleaved) + continue; + if (j < sites) + scan_eoln(infile); + else if (j == sites) + done = true; + } + if (interleaved && i == 1) + basesnew = j; + scan_eoln(infile); + if ((interleaved && j != basesnew) || ((!interleaved) && j != sites)){ + printf("\n\nERROR: sequences out of alignment at site %ld", j+1); + printf(" of species %ld\n\n", i); + exxit(-1);} + i++; + } + if (interleaved) { + basesread = basesnew; + allread = (basesread == sites); + } else + allread = (i > spp); + } + if (!printdata) + return; + if (data == genefreqs) + m = (sites - 1) / 8 + 1; + else + m = (sites - 1) / 60 + 1; + for (i = 1; i <= m; i++) { + for (j = 0; j < spp; j++) { + for (k = 0; k < nmlngth; k++) + putc(nayme[j][k], outfile); + fprintf(outfile, " "); + if (data == genefreqs) + l = i * 8; + else + l = i * 60; + if (l > sites) + l = sites; + if (data == genefreqs) + n = (i - 1) * 8; + else + n = (i - 1) * 60; + 
for (k = n; k < l; k++) { + if (data == genefreqs) + fprintf(outfile, "%8.5f", nodef[j][k]); + else { + if (j + 1 > 1 && nodep[j][k] == nodep[0][k]) + charstate = '.'; + else + charstate = nodep[j][k]; + putc(charstate, outfile); + if ((k + 1) % 10 == 0 && (k + 1) % 60 != 0) + putc(' ', outfile); + + } + } + putc('\n', outfile); + } + putc('\n', outfile); + } + putc('\n', outfile); +} /* seqboot_inputdata */ + + +void allocrest() +{ /* allocate memory for bookkeeping arrays */ + + oldweight = (steptr)Malloc(sites*sizeof(long)); + weight = (steptr)Malloc(sites*sizeof(long)); + if (categories) + category = (steptr)Malloc(sites*sizeof(long)); + if (mixture) + mixdata = (steptr)Malloc(sites*sizeof(long)); + if (ancvar) + ancdata = (steptr)Malloc(sites*sizeof(long)); + where = (steptr)Malloc(loci*sizeof(long)); + how_many = (steptr)Malloc(loci*sizeof(long)); + factor = (Char *)Malloc(sites*sizeof(Char)); + factorr = (steptr)Malloc(sites*sizeof(long)); + nayme = (naym *)Malloc(spp*sizeof(naym)); +} /* allocrest */ + +void allocnew(void) +{ /* allocate memory for arrays that depend on the lenght of the + output sequence*/ + long i; + + newwhere = (steptr)Malloc(loci*sizeof(long)); + newhowmany = (steptr)Malloc(loci*sizeof(long)); + newerwhere = (steptr)Malloc(loci*sizeof(long)); + newerhowmany = (steptr)Malloc(loci*sizeof(long)); + newerfactor = (steptr)Malloc(maxnewsites*maxalleles*sizeof(long)); + charorder = (steptr *)Malloc(spp*sizeof(steptr)); + for (i = 0; i < spp; i++) + charorder[i] = (steptr)Malloc(maxnewsites*sizeof(long)); +} + +void doinput(int argc, Char *argv[]) +{ /* reads the input data */ + getoptions(); + seqboot_inputnumbers(); + allocrest(); + if (weights) + openfile(&weightfile,WEIGHTFILE,"input weight file", + "r",argv[0],weightfilename); + if (mixture){ + openfile(&mixfile,MIXFILE,"mixture file", "r",argv[0],mixfilename); + openfile(&outmixfile,"outmixture","output mixtures file","w",argv[0], + outmixfilename); + seqboot_inputaux(mixdata, mixfile); 
+ } + if (ancvar){ + openfile(&ancfile,ANCFILE,"ancestor file", "r",argv[0],ancfilename); + openfile(&outancfile,"outancestors","output ancestors file","w",argv[0], + outancfilename); + seqboot_inputaux(ancdata, ancfile); + } + if (categories) { + openfile(&catfile,CATFILE,"input category file","r",argv[0],catfilename); + openfile(&outcatfile,"outcategories","output category file","w",argv[0], + outcatfilename); + inputcategs(0, sites, category, 9, "SeqBoot"); + } + if (factors){ + openfile(&factfile,FACTFILE,"factors file","r",argv[0],factfilename); + openfile(&outfactfile,"outfactors","output factors file","w",argv[0], + outfactfilename); + } + if (justwts && !permute) + openfile(&outweightfile,"outweights","output weight file", + "w",argv[0],outweightfilename); + else { + openfile(&outfile,OUTFILE,"output data file","w",argv[0],outfilename); + } + inputoptions(); + seqboot_inputdata(); +} /* doinput */ + + +void bootweights() +{ /* sets up weights by resampling data */ + long i, j, k, blocks; + double p, q, r; + + ws = newgroups; + for (i = 0; i < (ws); i++) + weight[i] = 0; + if (jackknife) { + if (fabs(newgroups*fracsample - (long)(newgroups*fracsample+0.5)) + > 0.00001) { + if (randum(seed) + < (newgroups*fracsample - (long)(newgroups*fracsample)) + /((long)(newgroups*fracsample+1.0)-(long)(newgroups*fracsample))) + q = (long)(newgroups*fracsample)+1; + else + q = (long)(newgroups*fracsample); + } else + q = (long)(newgroups*fracsample+0.5); + r = newgroups; + p = q / r; + ws = 0; + for (i = 0; i < (newgroups); i++) { + if (randum(seed) < p) { + weight[i]++; + ws++; + q--; + } + r--; + if (i + 1 < newgroups) + p = q / r; + } + } else if (permute) { + for (i = 0; i < (newgroups); i++) + weight[i] = 1; + } else if (bootstrap) { + blocks = fracsample * newgroups / blocksize; + for (i = 1; i <= (blocks); i++) { + j = (long)(newgroups * randum(seed)) + 1; + for (k = 0; k < blocksize; k++) { + weight[j - 1]++; + j++; + if (j > newgroups) + j = 1; + } + } + } else 
/* case of rewriting data */ + for (i = 0; i < (newgroups); i++) + weight[i] = 1; + for (i = 0; i < (newgroups); i++) + newerwhere[i] = 0; + for (i = 0; i < (newgroups); i++) + newerhowmany[i] = 0; + newergroups = 0; + newersites = 0; + for (i = 0; i < (newgroups); i++) { + for (j = 1; j <= (weight[i]); j++) { + newergroups++; + for (k = 1; k <= (newhowmany[i]); k++) { + newersites++; + newerfactor[newersites - 1] = newergroups; + } + newerwhere[newergroups - 1] = newwhere[i]; + newerhowmany[newergroups - 1] = newhowmany[i]; + } + } +} /* bootweights */ + + +void sppermute(long n) +{ /* permute the species order as given in array sppord */ + long i, j, k; + + for (i = 1; i <= (spp - 1); i++) { + k = (long)((i+1) * randum(seed)); + j = sppord[n - 1][i]; + sppord[n - 1][i] = sppord[n - 1][k]; + sppord[n - 1][k] = j; + } +} /* sppermute */ + + +void charpermute(long m, long n) +{ /* permute the n+1 characters of species m+1 */ + long i, j, k; + + for (i = 1; i <= (n-1); i++) { + k = (long)((i+1) * randum(seed)); + j = charorder[m][i]; + charorder[m][i] = charorder[m][k]; + charorder[m][k] = j; + } +} /* charpermute */ + + +void writedata() +{ + /* write out one set of bootstrapped sequences */ + long i, j, k, l, m, n, n2; + double x; + Char charstate; + + sppord = (long **)Malloc(newergroups*sizeof(long *)); + for (i = 0; i < (newergroups); i++) + sppord[i] = (long *)Malloc(spp*sizeof(long)); + for (j = 1; j <= spp; j++) + sppord[0][j - 1] = j; + for (i = 1; i < newergroups; i++) { + for (j = 1; j <= (spp); j++) + sppord[i][j - 1] = sppord[i - 1][j - 1]; + } + if (!justwts || permute) { + if (data == restsites && enzymes) + fprintf(outfile, "%5ld %5ld% 4ld\n", spp, newergroups, nenzymes); + else if (data == genefreqs) + fprintf(outfile, "%5ld %5ld\n", spp, newergroups); + else { + if ((data == seqs) + && !(bootstrap || jackknife || permute || ild || lockhart) && xml) + fprintf(outfile, "\n"); + else + if (!(bootstrap || jackknife || permute || ild || lockhart) && 
nexus) { + fprintf(outfile, "#NEXUS\n"); + fprintf(outfile, "BEGIN DATA;\n"); + fprintf(outfile, " DIMENSIONS NTAX=%ld NCHAR=%ld;\n", + spp, newersites); + fprintf(outfile, " FORMAT"); + if (interleaved) + fprintf(outfile, " interleave=yes"); + else + fprintf(outfile, " interleave=no"); + fprintf(outfile, " DATATYPE="); + if (data == seqs) { + switch (seq) { + case (dna): fprintf(outfile, "DNA missing=N gap=-"); break; + case (rna): fprintf(outfile, "RNA missing=N gap=-"); break; + case (protein): + fprintf(outfile, "protein missing=? gap=-"); + break; + } + } + if (data == morphology) + fprintf(outfile, "STANDARD"); + fprintf(outfile, ";\n MATRIX\n"); + } + else fprintf(outfile, "%5ld %5ld\n", spp, newersites); + } + if (data == genefreqs) { + for (i = 0; i < (newergroups); i++) + fprintf(outfile, " %3ld", alleles[factorr[newerwhere[i] - 1] - 1]); + putc('\n', outfile); + } + } + l = 1; + if ((!(bootstrap || jackknife || permute || ild || lockhart | nexus)) + && ((data == seqs) || (data == restsites))) { + interleaved = !interleaved; + if (!(bootstrap || jackknife || permute || ild || lockhart) && xml) + interleaved = false; + } + if (interleaved) + m = 60; + else + m = newergroups; + do { + if (m > newergroups) + m = newergroups; + for (j = 0; j < spp; j++) { + n = 0; + if ((l == 1) || (interleaved && nexus)) { + if (!(bootstrap || jackknife || permute || ild || lockhart) && xml) { + fprintf(outfile, " \n"); + fprintf(outfile, " "); + } + n2 = nmlngth; + if (!(bootstrap || jackknife || permute || ild || lockhart) + && (xml || nexus)) { + while (nayme[j][n2-1] == ' ') + n2--; + } + if (nexus) + fprintf(outfile, " "); + for (k = 0; k < n2; k++) + if (nexus && (nayme[j][k] == ' ') && (k < n2)) + putc('_', outfile); + else + putc(nayme[j][k], outfile); + if (!(bootstrap || jackknife || permute || ild || lockhart) && xml) + fprintf(outfile, "\n "); + } else { + if (!(bootstrap || jackknife || permute || ild || lockhart) && xml) { + fprintf(outfile, " "); + } + else { 
+ for (k = 1; k <= nmlngth; k++) + putc(' ', outfile); + } + } + if (!xml) { + for (k = 0; k < nmlngth-n2; k++) + fprintf(outfile, " "); + fprintf(outfile, " "); + } + for (k = l - 1; k < m; k++) { + if (permute && j + 1 == 1) + sppermute(newerfactor[n]); /* we can assume chars not permuted */ + for (n2 = -1; n2 <= (newerhowmany[k] - 2); n2++) { + n++; + if (data == genefreqs) { + if (n > 1 && (n & 7) == 1) + fprintf(outfile, "\n "); + x = nodef[sppord[newerfactor[charorder[j][n - 1]] - 1][j] - 1] + [newerwhere[charorder[j][k]] + n2]; + fprintf(outfile, "%8.5f", x); + } else { + if (!(bootstrap || jackknife || permute || ild || lockhart) && xml + && (n > 1) && (n % 60 == 1)) + fprintf(outfile, "\n "); + else if (!nexus && !interleaved && (n > 1) && (n % 60 == 1)) + fprintf(outfile, "\n "); + charstate = nodep[sppord[newerfactor[charorder[j][n - 1]] - 1] + [j] - 1][newerwhere[charorder[j][k]] + n2]; + putc(charstate, outfile); + if (n % 10 == 0 && n % 60 != 0) + putc(' ', outfile); + } + } + } + if (!(bootstrap || jackknife || permute || ild || lockhart ) && xml) { + fprintf(outfile, "\n \n"); + } + putc('\n', outfile); + } + if (interleaved) { + if ((m <= newersites) && (newersites > 60)) + putc('\n', outfile); + l += 60; + m += 60; + } + } while (interleaved && l <= newersites); + if ((data == seqs) && + (!(bootstrap || jackknife || permute || ild || lockhart) && xml)) + fprintf(outfile, "\n"); + if (!(bootstrap || jackknife || permute || ild || lockhart) && nexus) + fprintf(outfile, " ;\nEND;\n"); + for (i = 0; i < (newergroups); i++) + free(sppord[i]); + free(sppord); +} /* writedata */ + + +void writeweights() +{ /* write out one set of post-bootstrapping weights */ + long j, k, l, m, n, o; + + j = 0; + l = 1; + if (interleaved) + m = 60; + else + m = sites; + do { + if(m > sites) + m = sites; + n = 0; + for (k = l - 1; k < m; k++) { + for(o = 0 ; o < how_many[k] ; o++){ + if(oldweight[k]==0){ + fprintf(outweightfile, "0"); + j++; + } + else{ + if (weight[k-j] 
< 10) + fprintf(outweightfile, "%c", (char)('0'+weight[k-j])); + else + fprintf(outweightfile, "%c", (char)('A'+weight[k-j]-10)); + n++; + if (!interleaved && n > 1 && n % 60 == 1) { + fprintf(outweightfile, "\n"); + if (n % 10 == 0 && n % 60 != 0) + putc(' ', outweightfile); + } + } + } + } + putc('\n', outweightfile); + if (interleaved) { + l += 60; + m += 60; + } + } while (interleaved && l <= sites); +} /* writeweights */ + + +void writecategories() +{ + /* write out categories for the bootstrapped sequences */ + long k, l, m, n, n2; + Char charstate; + if(justwts){ + if (interleaved) + m = 60; + else + m = sites; + l=1; + do { + if(m > sites) + m = sites; + n=0; + for(k=l-1 ; k < m ; k++){ + n++; + if (!interleaved && n > 1 && n % 60 == 1) + fprintf(outcatfile, "\n "); + charstate = '0' + category[k]; + putc(charstate, outcatfile); + } + if (interleaved) { + l += 60; + m += 60; + } + }while(interleaved && l <= sites); + fprintf(outcatfile, "\n"); + return; + } + + l = 1; + if (interleaved) + m = 60; + else + m = newergroups; + do { + if (m > newergroups) + m = newergroups; + n = 0; + for (k = l - 1; k < m; k++) { + for (n2 = -1; n2 <= (newerhowmany[k] - 2); n2++) { + n++; + if (!interleaved && n > 1 && n % 60 == 1) + fprintf(outcatfile, "\n "); + charstate = '0' + category[newerwhere[k] + n2]; + putc(charstate, outcatfile); + if (n % 10 == 0 && n % 60 != 0) + putc(' ', outcatfile); + } + } + if (interleaved) { + l += 60; + m += 60; + } + } while (interleaved && l <= newersites); + fprintf(outcatfile, "\n"); +} /* writecategories */ + + +void writeauxdata(steptr auxdata, FILE *outauxfile) +{ + /* write out auxiliary option data (mixtures, ancestors, ect) to + appropriate file. 
Samples parralel to data, or just gives one + output entry if justwts is true */ + long k, l, m, n, n2; + Char charstate; + + /* if we just output weights (justwts), and this is first set + just output the data unsampled */ + if(justwts){ + if(firstrep){ + if (interleaved) + m = 60; + else + m = sites; + l=1; + do { + if(m > sites) + m = sites; + n = 0; + for(k=l-1 ; k < m ; k++){ + n++; + if (!interleaved && n > 1 && n % 60 == 1) + fprintf(outauxfile, "\n "); + charstate = auxdata[k]; + putc(charstate, outauxfile); + } + if (interleaved) { + l += 60; + m += 60; + } + }while(interleaved && l <= sites); + fprintf(outauxfile, "\n"); + } + return; + } + + l = 1; + if (interleaved) + m = 60; + else + m = newergroups; + do { + if (m > newergroups) + m = newergroups; + n = 0; + for (k = l - 1; k < m; k++) { + for (n2 = -1; n2 <= (newerhowmany[k] - 2); n2++) { + n++; + if (!interleaved && n > 1 && n % 60 == 1) + fprintf(outauxfile, "\n "); + charstate = auxdata[newerwhere[k] + n2]; + putc(charstate, outauxfile); + if (n % 10 == 0 && n % 60 != 0) + putc(' ', outauxfile); + } + } + if (interleaved) { + l += 60; + m += 60; + } + } while (interleaved && l <= newersites); + fprintf(outauxfile, "\n"); +} /* writeauxdata */ + +void writefactors(void) +{ + long k, l, m, n, prevfact, writesites; + char symbol; + steptr wfactor; + + if(!justwts || firstrep){ + if(justwts){ + writesites = sites; + wfactor = factorr; + } else { + writesites = newersites; + wfactor = newerfactor; + } + prevfact = wfactor[0]; + symbol = '+'; + if (interleaved) + m = 60; + else + m = writesites; + l=1; + do { + if(m > writesites) + m = writesites; + n = 0; + for(k=l-1 ; k < m ; k++){ + n++; + if (!interleaved && n > 1 && n % 60 == 1) + fprintf(outfactfile, "\n "); + if(prevfact != wfactor[k]){ + symbol = (symbol == '+') ? 
'-' : '+'; + prevfact = wfactor[k]; + } + putc(symbol, outfactfile); + if (n % 10 == 0 && n % 60 != 0) + putc(' ', outfactfile); + } + if (interleaved) { + l += 60; + m += 60; + } + }while(interleaved && l <= writesites); + fprintf(outfactfile, "\n"); + } +} /* writefactors */ + + +void bootwrite() +{ /* does bootstrapping and writes out data sets */ + long i, j, rr, repdiv10; + + if (!(bootstrap || jackknife || permute || ild || lockhart)) + reps = 1; + repdiv10 = reps / 10; + if (repdiv10 < 1) + repdiv10 = 1; + if (progress) + putchar('\n'); + for (rr = 1; rr <= (reps); rr++) { + for (i = 0; i < spp; i++) + for (j = 0; j < maxnewsites; j++) + charorder[i][j] = j; + if(rr==1) + firstrep = true; + else + firstrep = false; + if (ild) { + charpermute(0, maxnewsites); + for (i = 1; i < spp; i++) + for (j = 0; j < maxnewsites; j++) + charorder[i][j] = charorder[0][j]; + } + if (lockhart) + for (i = 0; i < spp; i++) + charpermute(i, maxnewsites); + bootweights(); + if (!justwts || permute || ild || lockhart) + writedata(); + if (justwts && !(permute || ild || lockhart)) + writeweights(); + if (categories) + writecategories(); + if (factors) + writefactors(); + if (mixture) + writeauxdata(mixdata, outmixfile); + if (ancvar) + writeauxdata(ancdata, outancfile); + if (progress && (bootstrap || jackknife || permute || ild || lockhart) + && ((reps < 10) || rr % repdiv10 == 0)) { + printf("completed replicate number %4ld\n", rr); +#ifdef WIN32 + phyFillScreenColor(); +#endif + } + } + if (progress) { + if (justwts) + printf("\nOutput weights written to file \"%s\"\n\n", outweightfilename); + else + printf("\nOutput written to file \"%s\"\n\n", outfilename); + } +} /* bootwrite */ + + +void seqboot_inputaux(steptr dataptr, FILE* auxfile) +{ /* input auxiliary option data (mixtures, ancestors, ect) for + new style input, assumes that data is correctly formated + in input files*/ + long i, j, k; + Char ch; + + j = 0; + k = 1; + for (i = 0; i < (sites); i++) { + do { + if 
(eoln(auxfile)) + scan_eoln(auxfile); + ch = gettc(auxfile); + if (ch == '\n') + ch = ' '; + } while (ch == ' '); + dataptr[i] = ch; + } + scan_eoln(auxfile); +} /* seqboot_inputaux */ + + +int main(int argc, Char *argv[]) +{ /* Read in sequences or frequencies and bootstrap or jackknife them */ +#ifdef MAC + argc = 1; /* macsetup("SeqBoot",""); */ + argv[0] = "SeqBoot"; +#endif + init(argc,argv); + openfile(&infile, INFILE, "input file", "r", argv[0], infilename); + ibmpc = IBMCRT; + ansi = ANSICRT; + doinput(argc, argv); + bootwrite(); + FClose(infile); + if (weights) + FClose(weightfile); + if (categories) { + FClose(catfile); + FClose(outcatfile); + } + if(mixture) + FClose(outmixfile); + if(ancvar) + FClose(outancfile); + if (justwts && !permute) { + FClose(outweightfile); + } + else + FClose(outfile); +#ifdef MAC + fixmacfile(outfilename); + if (justwts && !permute) + fixmacfile(outweightfilename); + if (categories) + fixmacfile(outcatfilename); + if (mixture) + fixmacfile(outmixfilename); +#endif + printf("Done.\n\n"); +#ifdef WIN32 + phyRestoreConsoleAttributes(); +#endif + return 0; +} diff --git a/forester/archive/RIO/others/phylip_mod/src/test_infile_fitch b/forester/archive/RIO/others/phylip_mod/src/test_infile_fitch new file mode 100644 index 0000000..3c129f1 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/test_infile_fitch @@ -0,0 +1,8 @@ +7 +Bovine_sequence 0.0000 1.6866 1.7198 1.6606 1.5243 1.6043 1.5905 +Mouse_sequence 1.6866 0.0000 1.5232 1.4841 1.4465 1.4389 1.4629 +Gibbon_sequence 1.7198 1.5232 0.0000 0.7115 0.5958 0.6179 0.5583 +Orang_sequence 1.6606 1.4841 0.7115 0.0000 0.4631 0.5061 0.4710 +Gorilla_sequence 1.5243 1.4465 0.5958 0.4631 0.0000 0.3484 0.3083 +Chimp_sequence 1.6043 1.4389 0.6179 0.5061 0.3484 0.0000 0.2692 +Human_sequence 1.5905 1.4629 0.5583 0.4710 0.3083 0.2692 0.0000 diff --git a/forester/archive/RIO/others/phylip_mod/src/test_infile_protdist b/forester/archive/RIO/others/phylip_mod/src/test_infile_protdist new 
file mode 100644 index 0000000..d263f1b --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/test_infile_protdist @@ -0,0 +1,6 @@ + 5 13 +Alpha AACGTGGCCACAT +Beta AAGGTCGCCACAC +Gamma CAGTTCGCCACAA +Delta GAGATTTCCGCCT +Epsilon GAGATCTCCGCCC diff --git a/forester/archive/RIO/others/phylip_mod/src/test_infile_protml b/forester/archive/RIO/others/phylip_mod/src/test_infile_protml new file mode 100644 index 0000000..67a49d6 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/test_infile_protml @@ -0,0 +1,6 @@ + 5 13 +Alpha_sequence AACGTGGCCAAAT +Beta_sequence AAGGTCGCCAAAC +Gamma_sequence CATTTCGTCACAA +Delta_sequence GGTATTTCGGCCT +Epsilon_sequence GGGATCTCGGCCC diff --git a/forester/archive/RIO/others/phylip_mod/src/test_infile_protmlk b/forester/archive/RIO/others/phylip_mod/src/test_infile_protmlk new file mode 100644 index 0000000..d825652 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/test_infile_protmlk @@ -0,0 +1,6 @@ + 5 13 +Alpha AACGTGGCCAAAT +Beta AAGGTCGCCAAAC +Gamma CATTTCGTCACAA +Delta GGTATTTCGGCCT +Epsilon GGGATCTCGGCCC diff --git a/forester/archive/RIO/others/phylip_mod/src/test_infile_protpars b/forester/archive/RIO/others/phylip_mod/src/test_infile_protpars new file mode 100644 index 0000000..30d58b2 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/test_infile_protpars @@ -0,0 +1,6 @@ + 5 10 +Alpha ABCDEFGHIK +Beta AB--EFGHIK +Gamma ?BCDSFG*?? 
+Delta CIKDEFGHIK +Epsilon DIKDEFGHIK diff --git a/forester/archive/RIO/others/phylip_mod/src/test_infile_seqbboot b/forester/archive/RIO/others/phylip_mod/src/test_infile_seqbboot new file mode 100644 index 0000000..19766b0 --- /dev/null +++ b/forester/archive/RIO/others/phylip_mod/src/test_infile_seqbboot @@ -0,0 +1,6 @@ + 5 6 +Alpha_sequence AACAAC +Beta_sequence AACCCC +Gamma_sequence ACCAAC +Delta_sequence CCACCA +Epsilon_sequence CCAAAC diff --git a/forester/archive/RIO/others/puzzle_dqo/AUTHORS b/forester/archive/RIO/others/puzzle_dqo/AUTHORS new file mode 100644 index 0000000..cbef439 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/AUTHORS @@ -0,0 +1,45 @@ +since 1999 by Heiko A. Schmidt, Korbinian Strimmer, + Martin Vingron, Arndt von Haeseler + +1995-1999 by Korbinian Strimmer and Arndt von Haeseler + + + +Heiko A. Schmidt + Theoretical Bioinformatics + Deutsches Krebsforschungszentrum (DKFZ) + Im Neuenheimer Feld 280 + D-69124 Heidelberg + Germany + + email: h.schmidt@dkfz-heidelberg.de, + http://www.dkfz-heidelberg.de/tbi/ + +Korbinian Strimmer + Department of Zoology + University of Oxford + South Parks Road + Oxford OX1 3PS, UK + + email: korbinian.strimmer@zoo.ox.ac.uk + http://www.zoo.ox.ac.uk/ + +Martin Vingron + Theoretical Bioinformatics + Deutsches Krebsforschungszentrum (DKFZ) + Im Neuenheimer Feld 280 + D-69124 Heidelberg + Germany + + email: vingron@dkfz-heidelberg.de + http://www.dkfz-heidelberg.de/tbi/ + +Arndt von Haeseler + Max-Planck-Institute for Evolutionary Anthropology + Inselstr. 22 + D-04103 Leipzig + Germany + + email: haeseler@eva.mpg.de, + http://www.eva.mpg.de/ + diff --git a/forester/archive/RIO/others/puzzle_dqo/COPYING b/forester/archive/RIO/others/puzzle_dqo/COPYING new file mode 100644 index 0000000..d60c31a --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. 
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. 
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/forester/archive/RIO/others/puzzle_dqo/ChangeLog b/forester/archive/RIO/others/puzzle_dqo/ChangeLog new file mode 100644 index 0000000..824b296 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/ChangeLog @@ -0,0 +1,347 @@ + +Version date what has been changed + +5.0 26.08.2000 - changes to manual, Makefile.in + - cpREV hidden by -DCPREV flag + - chi2test, quartio included into source code files + - generic scr/Makefile.generic + - src/makefile.com for VAX + - AUTHORS, README, ChangeLog updated + - INSTALL checked + 27.08.2000 - test code excluded + - '-randseed#' added for debugging purposes + - ./data added to autoconf/automake + - warning output if cmdline parameter unknown + 11.10.2000 - fixed output of rate categories of sites before + computing them + - check whether rate categories were computed by + 1st user tree or NJ tree fixed in the output + 12.10.2000 - invariant site model normalization fixed + + +CODE FREEZE +=========== + +5.0.a33 15.08.2000 - changes for autoconf/automake + +5.0.a32 01.08.2000 - a FPE error fixed (badq == 0) + - small error in 
-bestq fixed + - fflush's added at several places + +5.0.a31 01.08.2000 - comments added to tree structure sorting puzzle2.c + - changes in configure.in, Makefile.in + +5.0.a30 23.07.2000 - some debugging in checkquart + - changed to autoconf + +5.0.a29 13.07.2000 - some debugging in checkquart + +5.0.a28 13.07.2000 - use best quartet topology option (-bestq) implemented + +5.0.a27 13.07.2000 - further developement to checkquart + - ascii/binary quartet values (-wqla/-wqlb) + - typo correction + +5.0.a26 11.07.2000 - fflush at all checktimer + - further developement at checkquart + - possibility to write quartet values to file (-wqlh) + +5.0.a25 06.07.2000 - fflush at checktimer + +5.0.a24 02.07.2000 - further debugging of checkquart + +5.0.a23 02.07.2000 - further developement to checkquart + +5.0.a22 29.06.2000 - checkquart added to makefile + - bad quartet stats added after reading in *.allquarts + +5.0.a21 27.06.2000 - site pattern statistics implemented and added to + SEQUENCE ALIGNMENT section in puzzle report + +5.0.a20 26.06.2000 - cpREV45 implemented + +5.0.a19 26.06.2000 - for debugging purposes: typo "MPE" changed to "FPE" + - fflush(stdout) added in chi2test + +5.0.a18 20.06.2000 - checkquart implemented + +5.0.a17 19.06.2000 - FPRINTF(STDOUTFILE and STDOUT definition changed + and moved; fputid/fputid10 writes to STDOUT instead + of stdout + - ppuzzle checks slaves enough slave-processes + - numquarts, num2quart, quart2num moved from ppuzzle.c + to puzzle1.c + - read/writeallquart implemented (undocumented feature) + to be used by -wqf/-rqf at comandline + -wqf = write quartet file (infilename.allquart) after + quartet evaluation + -rqf = read quartet file (infilename.allquart), no + quartet evaluation, unless -wqf is used as + well, then quartets are written and read in + - '-h' option at comandline -> printusage + +5.0.a16 31.05.2000 - chi2test bug fixed + - WAG matrix added, model choice adopted + 13.06.2000 - date set to June 2000 + - author order 
changed to Schmidt, Strimmer, Vingron, + v.Haeseler + - CPU time output stopped, due to overflow errors + 16.06.2000 - sequence composition chi2test moved before + parameter output. + - output of chi2test and bad quartet statistics split, + to do the chi2test output earlier. + +5.0.a15 02.05.2000 - Names changed back from TREE-PUZZLE to PUZZLE + 09.05.2000 - and to TREE-PUZZLE again ;-) + +5.0.a14 13.03.2000 - Changes to the manual. + - Executable names changed to (p)treepuzzle. + (changes in the makefiles) + 15.03.2000 - Output of parameters after estimation added. + +5.0.a13 18.02.2000 - ALPHA version number removed from the code + +5.0.a12 18.02.2000 - CPU time measurement problems fixed for case where + clock_t is an unsigned type. + +5.0.a11 17.02.2000 - time measure problems (CPU/wallclock) fixed + not all features in addtimes are used at the moment. + - unnecessary and unused routines removed from source + code. + +5.0.a10 20.01.2000 - Name changes from PUZZLE to TREE-PUZZLE + - Chi2-fit model guessing for VT model added + - little model printing bug fixed + +5.0.a9 22.12.1999 - VT Model incorporated (Mueller, Vingron (2000) + JCB, to appear). + - TODO: Chi2-fit model guessing for VT model + +5.0.a8 21.12.1999 - 'sys/times.h' and 'sys/types.h' removed from + puzzle.h. They were neither ANSI conforming nor + necessary, but occurred in the SUN man pages. + - Definition and call of writetimesstat eliminated + from the sequential version by compiler switches, + and not just the function body as before. + - '-O4' changed to '-O' to be more generic. 
+ +5.0.a7 21.12.1999 - Macro constants introduced for data_optn + (NUCLEOTIDE, AMINOACID, BINARY) + - round robbing of datatype and AA model option changed + in menu to make adjustment of the model possible by a + determined sequence of letters: + 'd': Auto -> Nucleotides + -> Amino acids + -> Binary states + -> Auto + ('m' && data_optn == AMINOACID): + Auto -> Dayhoff + -> JTT + -> mtREV24 + -> BLOSUM62 + -> Auto + - manual.html adjusted + +5.0.a6 20.12.1999 - new manual.html added + +5.0.a5 07.12.1999 - output bug fixed (bestrates were written before they + were computed) + +5.0.a4 02.12.1999 - header file inclusion ajusted: + added: #include + changed from: #include "ppuzzle.h" + to: #ifdef PARALLEL + # include "ppuzzle.h" + #endif + +5.0.a3 27.11.1999 - '-h' comandline option removed, because of problems + with MPICH under LINUX + - new memory leaks of 5.0.a2 closed in PP_Finalize + +5.0.a2 27.11.1999 - Cleanup of the source code + - Measurement of CPU time added + - Parallel load statistics added (quartets, trees, time) + to puzzle report. + - Cleanup debug messages + - Comments "[...]" are removed from usertrees now. + - single quotes will only be printed arount species + names if -DUSEQUOTES is set at compiletime. + - tree likelihood is printed infront of a tree as a + comment, [ lh=-xx.xxxxx ](...); + +5.0.a1 26.11.1999 - Cleanup of the directories + - Copyright changes + - Version changes + + +VERSION CHANGE +============== + +4.1.a26 25.11.1999 - Makefile made universal for pauzzle and ppuzzle + - lines not needed removed from puzzle.h + +4.1.a25 19.11.1999 - Output file prefixes for distances, trees, and + puzzlereport changed in user trees analysis case + to user tree file name + - Temporary output of likelihood to treefile added + +4.1.a24 11.11.1999 - Output of puzzling step trees changed + ptorder: [ orderno # % ID #UniqTopos #Steps ]PHYLIP + pstep: chunk #InChunk sum ID #UniqTopos #Steps + - preliminary leap frog RNG implemented, i.e. 
uses + the rand4 in the usual way in the sequential case. + If run in parallel all rand4 are initialized with + the same seed and started with PP_Myid-th random + number. After that each process uses every + PP_NumProcs-th random number to make sure that these + are unique. + +4.1.a23 08.11.1999 - output of sequential and parallel version to *.pstep + made identical + +4.1.a22 05.11.1999 - two different puzzle step tree outputs introduced + and added to the menu ("[ 1. 35 ](...);": + - ordered unique tree list -> *.ptorder + Format: "[ 1. 35 ]" (Ordernumber, Amount) + - chronological tree list -> *.pstep + Format: "[ 1. 35 ]" (Chunknumber, Amount in chunk) + (the last is a problem in parallel, because they come + in chunks, as scheduled) + - debugged the output +4.1.a21 04.11.1999 - Makefile adjustments for other Platforms + - pstep tree output changed. unique treestructures + printed to *.pstep file with a leading comment + containing an order number and the amount padded + with blanks (e.g. "[ 1. 356 ]('mouse'..."). + output is done right before writing the puzzle file. + - controlled MPI finish to the Quit menu option added + +4.1.a20 03.11.1999 - some garbage collection (free) added + - makefile adjusted, OFLAGS for optimization added + (ppuzzle/MPICH has problems with -O, so the + ppuzzle is created without optimization) + Some minor changes in the makefiles + - still to do: garbage collection from 'internalnode' + in master process + +4.1.a19 13.10.1999 - adding the output of standardized (i.e. sorted) + puzzling step trees. Those are printed to the + standard output at the moment. (Routines to sort + and print the trees implemented) + 14.10.1999 - routines for printing the sorted trees to a string. + needed to send them between Master and Worker, and + to have a unique key to sort and count the trees. 
+ 21.10.1999 - counting of sorted trees implemented by doubly linked + list, sort routine, print to stdout + 25.10.1999 - change place of writing distances to file right after + distances have been computed. + - output of puzzling step trees now with true name, + not numbers + 02.11.1999 - parallel counting and sending of puzzling step trees + - some parallel sending bugs fixed + +4.1.a18 14.09.1999 - adding possibility to specify input file at + command line, this specifies also the output + filenames (puzzle output: *.puzzle; treefile: + *.tree; distances: *.dist; Triangel EPS: *.eps; + unresolved: *.qlist; puzzling step trees: *.pstep) + If an unexisting name is given, one has to reenter + the right name, but the wrong one is used as prefix. + 15.09.1999 - sending back of bad quartets from slaves added + - bug in quart2num fixed (not used before; was shifted + by 1) + - first version of a README added ;-) + +4.1.a17 03.08.1999 - Recv-Error in receiving DoPuzzleBlock fixed + - double freeing of same MPI_Datatype fixed + - changing of scheduling algorithm to smaller chunks + in gss -> sgss + 13.09.1999 - bug fixed in optimization routine in ml2.c: + boundary check added + +4.1.a16 12.07.1999 - slight changes in verbosity levels + - changed all printf to FPRINTF(STDOUTFILE to + change easily from stdout to a file. + +4.1.a15 08.07.1999 - scheduler for both parallel parts + - several small changes + +4.1.a14 25.06.1999 - computation of tree parallel, scheduler dependent, + sending all biparts in one message instead of one + by one + - several small changes since a13 in sched.c, et al. 
+ +4.1.a13 10.06.1999 - computation of tree parallel (chunk = #trees/#slaves) + - scheduling schemes implemented for minimum chunk sizes + +4.1.a12 07.06.1999 - computation of quartets properly parallel + - scheduling implemented + - counting of quartets by slave ajusted + - TODO: sending of bad quartets (array + list) + - distinction between '1st user tree' and 'NJ tree' + in result output removed again + +4.1.a11 28.05.1999 - PP_SendDoQuartBlock, PP_RecvDoQuartBlock, + PP_SendQuartBlock, PP_RecvQuartBlock + - mallocquartets() changed from global to local + variables to be more flexible + - Quartet computation moved to slave (badquartet + handling missing: output, badquartet vector); + - distinction between '1st user tree' and 'NJ tree' + added in result output (puzzle1.c around l.1756) + +4.1.a10 20.05.1999 - num2quart, numquarts, quart2num introduced + - parallel init/finalize, quartets computed on + master and slave, compared -> equal -> all necessary + parameter exported + +4.1.a9 19.05.1999 - 'dvector forg' removed from onepamratematrix + cmdline, because it's not used in the function. + +4.1.a8 18.05.1999 - add _GAMMA_ (not necessary) to gamma.h and _PUZZLE_ + to puzzle.h to avoid dublicate includes, possible + due to ppuzzle.h + - ppuzzle added to makefile and to check + - 1st parallel version but no slave computations + only sending parameters and done signals. + +4.1.a7 18.05.1999 - export reevaluation of tree and evaluation of + usertrees to evaluatetree. 
+ +4.1.a6 17.05.1999 - -DNEWFORLOOP added to fixed.src, because the changed + for loop structure changes the sequence of randomized + quartets during likelihood mapping + - change 'int main()' to 'int main(argc, argv)' + - export more functionalities from main: + memcleanup(), inputandinit(&argc, &argv) + - grouping if's (excluding eachother) together in + switch() + - split treereavaluation and 1st usertree, + evaluate all usertrees together (TODO: both, + treereavaluation and usertrees in one loop) + - MAKE CHECK added to ./makefile + +4.1.a5 16.05.1999 - adding ´dvector Brnlength´ to lslength cmdline to + reduce globality of Brnlength. (Later better to *Tree) + +4.1.a4 11.05.1999 - structure of for loops changed in computeallquartets + and recon_tree, so that the quarted addresses are in + one contigous sequence (for a /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done + for subdir in $(SUBDIRS); do \ + if test "$$subdir" = .; then :; else \ + test -d $(distdir)/$$subdir \ + || mkdir $(distdir)/$$subdir \ + || exit 1; \ + chmod 777 $(distdir)/$$subdir; \ + (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir=../$(distdir) distdir=../$(distdir)/$$subdir distdir) \ + || exit 1; \ + fi; \ + done +info-am: +info: info-recursive +dvi-am: +dvi: dvi-recursive +check-am: all-am +check: check-recursive +installcheck-am: +installcheck: installcheck-recursive +install-exec-am: +install-exec: install-exec-recursive + +install-data-am: +install-data: install-data-recursive + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-recursive +uninstall-am: +uninstall: uninstall-recursive +all-am: Makefile +all-redirect: all-recursive +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: installdirs-recursive +installdirs-am: + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h 
stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-tags mostlyclean-generic + +mostlyclean: mostlyclean-recursive + +clean-am: clean-tags clean-generic mostlyclean-am + +clean: clean-recursive + +distclean-am: distclean-tags distclean-generic clean-am + +distclean: distclean-recursive + -rm -f config.status + +maintainer-clean-am: maintainer-clean-tags maintainer-clean-generic \ + distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." + +maintainer-clean: maintainer-clean-recursive + -rm -f config.status + +.PHONY: install-data-recursive uninstall-data-recursive \ +install-exec-recursive uninstall-exec-recursive installdirs-recursive \ +uninstalldirs-recursive all-recursive check-recursive \ +installcheck-recursive info-recursive dvi-recursive \ +mostlyclean-recursive distclean-recursive clean-recursive \ +maintainer-clean-recursive tags tags-recursive mostlyclean-tags \ +distclean-tags clean-tags maintainer-clean-tags distdir info-am info \ +dvi-am dvi check check-am installcheck-am installcheck install-exec-am \ +install-exec install-data-am install-data install-am install \ +uninstall-am uninstall all-redirect all-am all installdirs-am \ +installdirs mostlyclean-generic distclean-generic clean-generic \ +maintainer-clean-generic clean mostlyclean distclean maintainer-clean + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. 
+.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_dqo/Makefile.am b/forester/archive/RIO/others/puzzle_dqo/Makefile.am new file mode 100644 index 0000000..2a0bac6 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/Makefile.am @@ -0,0 +1,2 @@ +EXTRA_DIST = +SUBDIRS = src doc data diff --git a/forester/archive/RIO/others/puzzle_dqo/Makefile.in b/forester/archive/RIO/others/puzzle_dqo/Makefile.in new file mode 100644 index 0000000..38b4d60 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/Makefile.in @@ -0,0 +1,327 @@ +# Makefile.in generated automatically by automake 1.4 from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + +SHELL = @SHELL@ + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ + +bindir = @bindir@ +sbindir = @sbindir@ +libexecdir = @libexecdir@ +datadir = @datadir@ +sysconfdir = @sysconfdir@ +sharedstatedir = @sharedstatedir@ +localstatedir = @localstatedir@ +libdir = @libdir@ +infodir = @infodir@ +mandir = @mandir@ +includedir = @includedir@ +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ + +top_builddir = . 
+ +ACLOCAL = @ACLOCAL@ +AUTOCONF = @AUTOCONF@ +AUTOMAKE = @AUTOMAKE@ +AUTOHEADER = @AUTOHEADER@ + +INSTALL = @INSTALL@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +transform = @program_transform_name@ + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +CC = @CC@ +MAKEINFO = @MAKEINFO@ +MPICC = @MPICC@ +MPICC0 = @MPICC0@ +MPICC1 = @MPICC1@ +MPICC2 = @MPICC2@ +MPICC3 = @MPICC3@ +MPICC4 = @MPICC4@ +MPICC5 = @MPICC5@ +MPICFLAGS = @MPICFLAGS@ +MPIDEFS = @MPIDEFS@ +MPILIBS = @MPILIBS@ +PACKAGE = @PACKAGE@ +PPUZZLE = @PPUZZLE@ +VERSION = @VERSION@ + +EXTRA_DIST = +SUBDIRS = src doc data +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_CLEAN_FILES = +DIST_COMMON = README AUTHORS COPYING ChangeLog INSTALL Makefile.am \ +Makefile.in NEWS aclocal.m4 configure configure.in install-sh missing \ +mkinstalldirs + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = tar +GZIP_ENV = --best +all: all-redirect +.SUFFIXES: +$(srcdir)/Makefile.in: Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu --include-deps Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(ACLOCAL_M4): configure.in + cd $(srcdir) && $(ACLOCAL) + +config.status: $(srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + $(SHELL) ./config.status --recheck +$(srcdir)/configure: $(srcdir)/configure.in $(ACLOCAL_M4) $(CONFIGURE_DEPENDENCIES) + cd $(srcdir) && $(AUTOCONF) + +# This directory's subdirectories are mostly independent; you can cd +# into them and run `make' without going through this Makefile. 
+# To change the values of `make' variables: instead of editing Makefiles, +# (1) if the variable is set in `config.status', edit `config.status' +# (which will cause the Makefiles to be regenerated when you run `make'); +# (2) otherwise, pass the desired values on the `make' command line. + +@SET_MAKE@ + +all-recursive install-data-recursive install-exec-recursive \ +installdirs-recursive install-recursive uninstall-recursive \ +check-recursive installcheck-recursive info-recursive dvi-recursive: + @set fnord $(MAKEFLAGS); amf=$$2; \ + dot_seen=no; \ + target=`echo $@ | sed s/-recursive//`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + echo "Making $$target in $$subdir"; \ + if test "$$subdir" = "."; then \ + dot_seen=yes; \ + local_target="$$target-am"; \ + else \ + local_target="$$target"; \ + fi; \ + (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ + || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \ + done; \ + if test "$$dot_seen" = "no"; then \ + $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ + fi; test -z "$$fail" + +mostlyclean-recursive clean-recursive distclean-recursive \ +maintainer-clean-recursive: + @set fnord $(MAKEFLAGS); amf=$$2; \ + dot_seen=no; \ + rev=''; list='$(SUBDIRS)'; for subdir in $$list; do \ + rev="$$subdir $$rev"; \ + test "$$subdir" = "." && dot_seen=yes; \ + done; \ + test "$$dot_seen" = "no" && rev=". $$rev"; \ + target=`echo $@ | sed s/-recursive//`; \ + for subdir in $$rev; do \ + echo "Making $$target in $$subdir"; \ + if test "$$subdir" = "."; then \ + local_target="$$target-am"; \ + else \ + local_target="$$target"; \ + fi; \ + (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ + || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \ + done && test -z "$$fail" +tags-recursive: + list='$(SUBDIRS)'; for subdir in $$list; do \ + test "$$subdir" = . 
|| (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ + done + +tags: TAGS + +ID: $(HEADERS) $(SOURCES) $(LISP) + list='$(SOURCES) $(HEADERS)'; \ + unique=`for i in $$list; do echo $$i; done | \ + awk ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + here=`pwd` && cd $(srcdir) \ + && mkid -f$$here/ID $$unique $(LISP) + +TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + if test "$$subdir" = .; then :; else \ + test -f $$subdir/TAGS && tags="$$tags -i $$here/$$subdir/TAGS"; \ + fi; \ + done; \ + list='$(SOURCES) $(HEADERS)'; \ + unique=`for i in $$list; do echo $$i; done | \ + awk ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(ETAGS_ARGS)$$unique$(LISP)$$tags" \ + || (cd $(srcdir) && etags $(ETAGS_ARGS) $$tags $$unique $(LISP) -o $$here/TAGS) + +mostlyclean-tags: + +clean-tags: + +distclean-tags: + -rm -f TAGS ID + +maintainer-clean-tags: + +distdir = $(PACKAGE)-$(VERSION) +top_distdir = $(distdir) + +# This target untars the dist file and tries a VPATH configuration. Then +# it guarantees that the distribution is self-contained by making another +# tarfile. +distcheck: dist + -rm -rf $(distdir) + GZIP=$(GZIP_ENV) $(TAR) zxf $(distdir).tar.gz + mkdir $(distdir)/=build + mkdir $(distdir)/=inst + dc_install_base=`cd $(distdir)/=inst && pwd`; \ + cd $(distdir)/=build \ + && ../configure --srcdir=.. 
--prefix=$$dc_install_base \ + && $(MAKE) $(AM_MAKEFLAGS) \ + && $(MAKE) $(AM_MAKEFLAGS) dvi \ + && $(MAKE) $(AM_MAKEFLAGS) check \ + && $(MAKE) $(AM_MAKEFLAGS) install \ + && $(MAKE) $(AM_MAKEFLAGS) installcheck \ + && $(MAKE) $(AM_MAKEFLAGS) dist + -rm -rf $(distdir) + @banner="$(distdir).tar.gz is ready for distribution"; \ + dashes=`echo "$$banner" | sed s/./=/g`; \ + echo "$$dashes"; \ + echo "$$banner"; \ + echo "$$dashes" +dist: distdir + -chmod -R a+r $(distdir) + GZIP=$(GZIP_ENV) $(TAR) chozf $(distdir).tar.gz $(distdir) + -rm -rf $(distdir) +dist-all: distdir + -chmod -R a+r $(distdir) + GZIP=$(GZIP_ENV) $(TAR) chozf $(distdir).tar.gz $(distdir) + -rm -rf $(distdir) +distdir: $(DISTFILES) + -rm -rf $(distdir) + mkdir $(distdir) + -chmod 777 $(distdir) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done + for subdir in $(SUBDIRS); do \ + if test "$$subdir" = .; then :; else \ + test -d $(distdir)/$$subdir \ + || mkdir $(distdir)/$$subdir \ + || exit 1; \ + chmod 777 $(distdir)/$$subdir; \ + (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir=../$(distdir) distdir=../$(distdir)/$$subdir distdir) \ + || exit 1; \ + fi; \ + done +info-am: +info: info-recursive +dvi-am: +dvi: dvi-recursive +check-am: all-am +check: check-recursive +installcheck-am: +installcheck: installcheck-recursive +install-exec-am: +install-exec: install-exec-recursive + +install-data-am: +install-data: install-data-recursive + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-recursive +uninstall-am: +uninstall: uninstall-recursive +all-am: Makefile +all-redirect: all-recursive +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: installdirs-recursive +installdirs-am: + + 
+mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-tags mostlyclean-generic + +mostlyclean: mostlyclean-recursive + +clean-am: clean-tags clean-generic mostlyclean-am + +clean: clean-recursive + +distclean-am: distclean-tags distclean-generic clean-am + +distclean: distclean-recursive + -rm -f config.status + +maintainer-clean-am: maintainer-clean-tags maintainer-clean-generic \ + distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." + +maintainer-clean: maintainer-clean-recursive + -rm -f config.status + +.PHONY: install-data-recursive uninstall-data-recursive \ +install-exec-recursive uninstall-exec-recursive installdirs-recursive \ +uninstalldirs-recursive all-recursive check-recursive \ +installcheck-recursive info-recursive dvi-recursive \ +mostlyclean-recursive distclean-recursive clean-recursive \ +maintainer-clean-recursive tags tags-recursive mostlyclean-tags \ +distclean-tags clean-tags maintainer-clean-tags distdir info-am info \ +dvi-am dvi check check-am installcheck-am installcheck install-exec-am \ +install-exec install-data-am install-data install-am install \ +uninstall-am uninstall all-redirect all-am all installdirs-am \ +installdirs mostlyclean-generic distclean-generic clean-generic \ +maintainer-clean-generic clean mostlyclean distclean maintainer-clean + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. 
+.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_dqo/aclocal.m4 b/forester/archive/RIO/others/puzzle_dqo/aclocal.m4 new file mode 100644 index 0000000..9f8add8 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/aclocal.m4 @@ -0,0 +1,104 @@ +dnl aclocal.m4 generated automatically by aclocal 1.4 + +dnl Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. +dnl This file is free software; the Free Software Foundation +dnl gives unlimited permission to copy and/or distribute it, +dnl with or without modifications, as long as this notice is preserved. + +dnl This program is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY, to the extent permitted by law; without +dnl even the implied warranty of MERCHANTABILITY or FITNESS FOR A +dnl PARTICULAR PURPOSE. + +# Do all the work for Automake. This macro actually does too much -- +# some checks are only needed if your package does certain things. +# But this isn't really a big deal. + +# serial 1 + +dnl Usage: +dnl AM_INIT_AUTOMAKE(package,version, [no-define]) + +AC_DEFUN(AM_INIT_AUTOMAKE, +[AC_REQUIRE([AC_PROG_INSTALL]) +PACKAGE=[$1] +AC_SUBST(PACKAGE) +VERSION=[$2] +AC_SUBST(VERSION) +dnl test to see if srcdir already configured +if test "`cd $srcdir && pwd`" != "`pwd`" && test -f $srcdir/config.status; then + AC_MSG_ERROR([source directory already configured; run "make distclean" there first]) +fi +ifelse([$3],, +AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package]) +AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package])) +AC_REQUIRE([AM_SANITY_CHECK]) +AC_REQUIRE([AC_ARG_PROGRAM]) +dnl FIXME This is truly gross. 
+missing_dir=`cd $ac_aux_dir && pwd` +AM_MISSING_PROG(ACLOCAL, aclocal, $missing_dir) +AM_MISSING_PROG(AUTOCONF, autoconf, $missing_dir) +AM_MISSING_PROG(AUTOMAKE, automake, $missing_dir) +AM_MISSING_PROG(AUTOHEADER, autoheader, $missing_dir) +AM_MISSING_PROG(MAKEINFO, makeinfo, $missing_dir) +AC_REQUIRE([AC_PROG_MAKE_SET])]) + +# +# Check to make sure that the build environment is sane. +# + +AC_DEFUN(AM_SANITY_CHECK, +[AC_MSG_CHECKING([whether build environment is sane]) +# Just in case +sleep 1 +echo timestamp > conftestfile +# Do `set' in a subshell so we don't clobber the current shell's +# arguments. Must try -L first in case configure is actually a +# symlink; some systems play weird games with the mod time of symlinks +# (eg FreeBSD returns the mod time of the symlink's containing +# directory). +if ( + set X `ls -Lt $srcdir/configure conftestfile 2> /dev/null` + if test "[$]*" = "X"; then + # -L didn't work. + set X `ls -t $srcdir/configure conftestfile` + fi + if test "[$]*" != "X $srcdir/configure conftestfile" \ + && test "[$]*" != "X conftestfile $srcdir/configure"; then + + # If neither matched, then we have a broken ls. This can happen + # if, for instance, CONFIG_SHELL is bash and it inherits a + # broken ls alias from the environment. This has actually + # happened. Such a system could not be considered "sane". + AC_MSG_ERROR([ls -t appears to fail. Make sure there is not a broken +alias in your environment]) + fi + + test "[$]2" = conftestfile + ) +then + # Ok. + : +else + AC_MSG_ERROR([newly created file is older than distributed files! +Check your system clock]) +fi +rm -f conftest* +AC_MSG_RESULT(yes)]) + +dnl AM_MISSING_PROG(NAME, PROGRAM, DIRECTORY) +dnl The program must properly implement --version. +AC_DEFUN(AM_MISSING_PROG, +[AC_MSG_CHECKING(for working $2) +# Run test in a subshell; some versions of sh will print an error if +# an executable is not found, even if stderr is redirected. 
+# Redirect stdin to placate older versions of autoconf. Sigh. +if ($2 --version) < /dev/null > /dev/null 2>&1; then + $1=$2 + AC_MSG_RESULT(found) +else + $1="$3/missing $2" + AC_MSG_RESULT(missing) +fi +AC_SUBST($1)]) + diff --git a/forester/archive/RIO/others/puzzle_dqo/config.status b/forester/archive/RIO/others/puzzle_dqo/config.status new file mode 100755 index 0000000..da58b56 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/config.status @@ -0,0 +1,179 @@ +#! /bin/sh +# Generated automatically by configure. +# Run this file to recreate the current configuration. +# This directory was configured as follows, +# on host forester.wustl.edu: +# +# ./configure +# +# Compiler output produced by configure, useful for debugging +# configure, is in ./config.log if it exists. + +ac_cs_usage="Usage: ./config.status [--recheck] [--version] [--help]" +for ac_option +do + case "$ac_option" in + -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + echo "running ${CONFIG_SHELL-/bin/sh} ./configure --no-create --no-recursion" + exec ${CONFIG_SHELL-/bin/sh} ./configure --no-create --no-recursion ;; + -version | --version | --versio | --versi | --vers | --ver | --ve | --v) + echo "./config.status generated by autoconf version 2.13" + exit 0 ;; + -help | --help | --hel | --he | --h) + echo "$ac_cs_usage"; exit 0 ;; + *) echo "$ac_cs_usage"; exit 1 ;; + esac +done + +ac_given_srcdir=. +ac_given_INSTALL="/usr/bin/install -c" + +trap 'rm -fr Makefile src/Makefile src/test doc/Makefile data/Makefile conftest*; exit 1' 1 2 15 + +# Protect against being on the right side of a sed subst in config.status. 
+sed 's/%@/@@/; s/@%/@@/; s/%g$/@g/; /@g$/s/[\\&%]/\\&/g; + s/@@/%@/; s/@@/@%/; s/@g$/%g/' > conftest.subs <<\CEOF +/^[ ]*VPATH[ ]*=[^:]*$/d + +s%@SHELL@%/bin/sh%g +s%@CFLAGS@%-g -O2%g +s%@CPPFLAGS@%%g +s%@CXXFLAGS@%%g +s%@FFLAGS@%%g +s%@DEFS@% -DPACKAGE=\"tree-puzzle\" -DVERSION=\"5.0\" -DHAVE_LIBM=1 -DSTDC_HEADERS=1 -DHAVE_LIMITS_H=1 %g +s%@LDFLAGS@%%g +s%@LIBS@%-lm %g +s%@exec_prefix@%${prefix}%g +s%@prefix@%/usr/local%g +s%@program_transform_name@%s,x,x,%g +s%@bindir@%${exec_prefix}/bin%g +s%@sbindir@%${exec_prefix}/sbin%g +s%@libexecdir@%${exec_prefix}/libexec%g +s%@datadir@%${prefix}/share%g +s%@sysconfdir@%${prefix}/etc%g +s%@sharedstatedir@%${prefix}/com%g +s%@localstatedir@%${prefix}/var%g +s%@libdir@%${exec_prefix}/lib%g +s%@includedir@%${prefix}/include%g +s%@oldincludedir@%/usr/include%g +s%@infodir@%${prefix}/info%g +s%@mandir@%${prefix}/man%g +s%@INSTALL_PROGRAM@%${INSTALL}%g +s%@INSTALL_SCRIPT@%${INSTALL_PROGRAM}%g +s%@INSTALL_DATA@%${INSTALL} -m 644%g +s%@PACKAGE@%tree-puzzle%g +s%@VERSION@%5.0%g +s%@ACLOCAL@%aclocal%g +s%@AUTOCONF@%autoconf%g +s%@AUTOMAKE@%automake%g +s%@AUTOHEADER@%autoheader%g +s%@MAKEINFO@%makeinfo%g +s%@SET_MAKE@%%g +s%@CC@%gcc%g +s%@MPICC0@%%g +s%@MPICC1@%%g +s%@MPICC2@%%g +s%@MPICC3@%%g +s%@MPICC4@%%g +s%@MPICC5@%%g +s%@MPICC@%%g +s%@MPILIBS@%%g +s%@MPIDEFS@%%g +s%@MPICFLAGS@%%g +s%@PPUZZLE@%%g +s%@CPP@%gcc -E%g + +CEOF + +# Split the substitutions into bite-sized pieces for seds with +# small command number limits, like on Digital OSF/1 and HP-UX. +ac_max_sed_cmds=90 # Maximum number of lines to put in a sed script. +ac_file=1 # Number of current file. +ac_beg=1 # First line for current file. +ac_end=$ac_max_sed_cmds # Line after last line for current file. +ac_more_lines=: +ac_sed_cmds="" +while $ac_more_lines; do + if test $ac_beg -gt 1; then + sed "1,${ac_beg}d; ${ac_end}q" conftest.subs > conftest.s$ac_file + else + sed "${ac_end}q" conftest.subs > conftest.s$ac_file + fi + if test ! 
-s conftest.s$ac_file; then + ac_more_lines=false + rm -f conftest.s$ac_file + else + if test -z "$ac_sed_cmds"; then + ac_sed_cmds="sed -f conftest.s$ac_file" + else + ac_sed_cmds="$ac_sed_cmds | sed -f conftest.s$ac_file" + fi + ac_file=`expr $ac_file + 1` + ac_beg=$ac_end + ac_end=`expr $ac_end + $ac_max_sed_cmds` + fi +done +if test -z "$ac_sed_cmds"; then + ac_sed_cmds=cat +fi + +CONFIG_FILES=${CONFIG_FILES-"Makefile src/Makefile src/test doc/Makefile data/Makefile"} +for ac_file in .. $CONFIG_FILES; do if test "x$ac_file" != x..; then + # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in". + case "$ac_file" in + *:*) ac_file_in=`echo "$ac_file"|sed 's%[^:]*:%%'` + ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;; + *) ac_file_in="${ac_file}.in" ;; + esac + + # Adjust a relative srcdir, top_srcdir, and INSTALL for subdirectories. + + # Remove last slash and all that follows it. Not all systems have dirname. + ac_dir=`echo $ac_file|sed 's%/[^/][^/]*$%%'` + if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then + # The file is in a subdirectory. + test ! -d "$ac_dir" && mkdir "$ac_dir" + ac_dir_suffix="/`echo $ac_dir|sed 's%^\./%%'`" + # A "../" for each directory in $ac_dir_suffix. + ac_dots=`echo $ac_dir_suffix|sed 's%/[^/]*%../%g'` + else + ac_dir_suffix= ac_dots= + fi + + case "$ac_given_srcdir" in + .) srcdir=. + if test -z "$ac_dots"; then top_srcdir=. + else top_srcdir=`echo $ac_dots|sed 's%/$%%'`; fi ;; + /*) srcdir="$ac_given_srcdir$ac_dir_suffix"; top_srcdir="$ac_given_srcdir" ;; + *) # Relative path. + srcdir="$ac_dots$ac_given_srcdir$ac_dir_suffix" + top_srcdir="$ac_dots$ac_given_srcdir" ;; + esac + + case "$ac_given_INSTALL" in + [/$]*) INSTALL="$ac_given_INSTALL" ;; + *) INSTALL="$ac_dots$ac_given_INSTALL" ;; + esac + + echo creating "$ac_file" + rm -f "$ac_file" + configure_input="Generated automatically from `echo $ac_file_in|sed 's%.*/%%'` by configure." 
+ case "$ac_file" in + *Makefile*) ac_comsub="1i\\ +# $configure_input" ;; + *) ac_comsub= ;; + esac + + ac_file_inputs=`echo $ac_file_in|sed -e "s%^%$ac_given_srcdir/%" -e "s%:% $ac_given_srcdir/%g"` + sed -e "$ac_comsub +s%@configure_input@%$configure_input%g +s%@srcdir@%$srcdir%g +s%@top_srcdir@%$top_srcdir%g +s%@INSTALL@%$INSTALL%g +" $ac_file_inputs | (eval "$ac_sed_cmds") > $ac_file +fi; done +rm -f conftest.s* + + + +exit 0 diff --git a/forester/archive/RIO/others/puzzle_dqo/configure b/forester/archive/RIO/others/puzzle_dqo/configure new file mode 100755 index 0000000..5d4db41 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/configure @@ -0,0 +1,2265 @@ +#! /bin/sh + +# Guess values for system-dependent variables and create Makefiles. +# Generated automatically using autoconf version 2.13 +# Copyright (C) 1992, 93, 94, 95, 96 Free Software Foundation, Inc. +# +# This configure script is free software; the Free Software Foundation +# gives unlimited permission to copy, distribute and modify it. + +# Defaults: +ac_help= +ac_default_prefix=/usr/local +# Any additions from configure.in: + +# Initialize some variables set by options. +# The variables have the same names as the options, with +# dashes changed to underlines. +build=NONE +cache_file=./config.cache +exec_prefix=NONE +host=NONE +no_create= +nonopt=NONE +no_recursion= +prefix=NONE +program_prefix=NONE +program_suffix=NONE +program_transform_name=s,x,x, +silent= +site= +srcdir= +target=NONE +verbose= +x_includes=NONE +x_libraries=NONE +bindir='${exec_prefix}/bin' +sbindir='${exec_prefix}/sbin' +libexecdir='${exec_prefix}/libexec' +datadir='${prefix}/share' +sysconfdir='${prefix}/etc' +sharedstatedir='${prefix}/com' +localstatedir='${prefix}/var' +libdir='${exec_prefix}/lib' +includedir='${prefix}/include' +oldincludedir='/usr/include' +infodir='${prefix}/info' +mandir='${prefix}/man' + +# Initialize some other variables. 
+subdirs= +MFLAGS= MAKEFLAGS= +SHELL=${CONFIG_SHELL-/bin/sh} +# Maximum number of lines to put in a shell here document. +ac_max_here_lines=12 + +ac_prev= +for ac_option +do + + # If the previous option needs an argument, assign it. + if test -n "$ac_prev"; then + eval "$ac_prev=\$ac_option" + ac_prev= + continue + fi + + case "$ac_option" in + -*=*) ac_optarg=`echo "$ac_option" | sed 's/[-_a-zA-Z0-9]*=//'` ;; + *) ac_optarg= ;; + esac + + # Accept the important Cygnus configure options, so we can diagnose typos. + + case "$ac_option" in + + -bindir | --bindir | --bindi | --bind | --bin | --bi) + ac_prev=bindir ;; + -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) + bindir="$ac_optarg" ;; + + -build | --build | --buil | --bui | --bu) + ac_prev=build ;; + -build=* | --build=* | --buil=* | --bui=* | --bu=*) + build="$ac_optarg" ;; + + -cache-file | --cache-file | --cache-fil | --cache-fi \ + | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) + ac_prev=cache_file ;; + -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ + | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) + cache_file="$ac_optarg" ;; + + -datadir | --datadir | --datadi | --datad | --data | --dat | --da) + ac_prev=datadir ;; + -datadir=* | --datadir=* | --datadi=* | --datad=* | --data=* | --dat=* \ + | --da=*) + datadir="$ac_optarg" ;; + + -disable-* | --disable-*) + ac_feature=`echo $ac_option|sed -e 's/-*disable-//'` + # Reject names that are not valid shell variable names. + if test -n "`echo $ac_feature| sed 's/[-a-zA-Z0-9_]//g'`"; then + { echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; } + fi + ac_feature=`echo $ac_feature| sed 's/-/_/g'` + eval "enable_${ac_feature}=no" ;; + + -enable-* | --enable-*) + ac_feature=`echo $ac_option|sed -e 's/-*enable-//' -e 's/=.*//'` + # Reject names that are not valid shell variable names. 
+ if test -n "`echo $ac_feature| sed 's/[-_a-zA-Z0-9]//g'`"; then + { echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; } + fi + ac_feature=`echo $ac_feature| sed 's/-/_/g'` + case "$ac_option" in + *=*) ;; + *) ac_optarg=yes ;; + esac + eval "enable_${ac_feature}='$ac_optarg'" ;; + + -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ + | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ + | --exec | --exe | --ex) + ac_prev=exec_prefix ;; + -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ + | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ + | --exec=* | --exe=* | --ex=*) + exec_prefix="$ac_optarg" ;; + + -gas | --gas | --ga | --g) + # Obsolete; use --with-gas. + with_gas=yes ;; + + -help | --help | --hel | --he) + # Omit some internal or obsolete options to make the list less imposing. + # This message is too long to be a string in the A/UX 3.1 sh. + cat << EOF +Usage: configure [options] [host] +Options: [defaults in brackets after descriptions] +Configuration: + --cache-file=FILE cache test results in FILE + --help print this message + --no-create do not create output files + --quiet, --silent do not print \`checking...' 
messages + --version print the version of autoconf that created configure +Directory and file names: + --prefix=PREFIX install architecture-independent files in PREFIX + [$ac_default_prefix] + --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX + [same as prefix] + --bindir=DIR user executables in DIR [EPREFIX/bin] + --sbindir=DIR system admin executables in DIR [EPREFIX/sbin] + --libexecdir=DIR program executables in DIR [EPREFIX/libexec] + --datadir=DIR read-only architecture-independent data in DIR + [PREFIX/share] + --sysconfdir=DIR read-only single-machine data in DIR [PREFIX/etc] + --sharedstatedir=DIR modifiable architecture-independent data in DIR + [PREFIX/com] + --localstatedir=DIR modifiable single-machine data in DIR [PREFIX/var] + --libdir=DIR object code libraries in DIR [EPREFIX/lib] + --includedir=DIR C header files in DIR [PREFIX/include] + --oldincludedir=DIR C header files for non-gcc in DIR [/usr/include] + --infodir=DIR info documentation in DIR [PREFIX/info] + --mandir=DIR man documentation in DIR [PREFIX/man] + --srcdir=DIR find the sources in DIR [configure dir or ..] 
+ --program-prefix=PREFIX prepend PREFIX to installed program names + --program-suffix=SUFFIX append SUFFIX to installed program names + --program-transform-name=PROGRAM + run sed PROGRAM on installed program names +EOF + cat << EOF +Host type: + --build=BUILD configure for building on BUILD [BUILD=HOST] + --host=HOST configure for HOST [guessed] + --target=TARGET configure for TARGET [TARGET=HOST] +Features and packages: + --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no) + --enable-FEATURE[=ARG] include FEATURE [ARG=yes] + --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] + --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) + --x-includes=DIR X include files are in DIR + --x-libraries=DIR X library files are in DIR +EOF + if test -n "$ac_help"; then + echo "--enable and --with options recognized:$ac_help" + fi + exit 0 ;; + + -host | --host | --hos | --ho) + ac_prev=host ;; + -host=* | --host=* | --hos=* | --ho=*) + host="$ac_optarg" ;; + + -includedir | --includedir | --includedi | --included | --include \ + | --includ | --inclu | --incl | --inc) + ac_prev=includedir ;; + -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ + | --includ=* | --inclu=* | --incl=* | --inc=*) + includedir="$ac_optarg" ;; + + -infodir | --infodir | --infodi | --infod | --info | --inf) + ac_prev=infodir ;; + -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) + infodir="$ac_optarg" ;; + + -libdir | --libdir | --libdi | --libd) + ac_prev=libdir ;; + -libdir=* | --libdir=* | --libdi=* | --libd=*) + libdir="$ac_optarg" ;; + + -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ + | --libexe | --libex | --libe) + ac_prev=libexecdir ;; + -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ + | --libexe=* | --libex=* | --libe=*) + libexecdir="$ac_optarg" ;; + + -localstatedir | --localstatedir | --localstatedi | --localstated \ + | --localstate | --localstat | --localsta | 
--localst \ + | --locals | --local | --loca | --loc | --lo) + ac_prev=localstatedir ;; + -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ + | --localstate=* | --localstat=* | --localsta=* | --localst=* \ + | --locals=* | --local=* | --loca=* | --loc=* | --lo=*) + localstatedir="$ac_optarg" ;; + + -mandir | --mandir | --mandi | --mand | --man | --ma | --m) + ac_prev=mandir ;; + -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) + mandir="$ac_optarg" ;; + + -nfp | --nfp | --nf) + # Obsolete; use --without-fp. + with_fp=no ;; + + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c) + no_create=yes ;; + + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) + no_recursion=yes ;; + + -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ + | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ + | --oldin | --oldi | --old | --ol | --o) + ac_prev=oldincludedir ;; + -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ + | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ + | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) + oldincludedir="$ac_optarg" ;; + + -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) + ac_prev=prefix ;; + -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) + prefix="$ac_optarg" ;; + + -program-prefix | --program-prefix | --program-prefi | --program-pref \ + | --program-pre | --program-pr | --program-p) + ac_prev=program_prefix ;; + -program-prefix=* | --program-prefix=* | --program-prefi=* \ + | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) + program_prefix="$ac_optarg" ;; + + -program-suffix | --program-suffix | --program-suffi | --program-suff \ + | --program-suf | --program-su | --program-s) + ac_prev=program_suffix ;; + -program-suffix=* | 
--program-suffix=* | --program-suffi=* \ + | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) + program_suffix="$ac_optarg" ;; + + -program-transform-name | --program-transform-name \ + | --program-transform-nam | --program-transform-na \ + | --program-transform-n | --program-transform- \ + | --program-transform | --program-transfor \ + | --program-transfo | --program-transf \ + | --program-trans | --program-tran \ + | --progr-tra | --program-tr | --program-t) + ac_prev=program_transform_name ;; + -program-transform-name=* | --program-transform-name=* \ + | --program-transform-nam=* | --program-transform-na=* \ + | --program-transform-n=* | --program-transform-=* \ + | --program-transform=* | --program-transfor=* \ + | --program-transfo=* | --program-transf=* \ + | --program-trans=* | --program-tran=* \ + | --progr-tra=* | --program-tr=* | --program-t=*) + program_transform_name="$ac_optarg" ;; + + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + silent=yes ;; + + -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) + ac_prev=sbindir ;; + -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ + | --sbi=* | --sb=*) + sbindir="$ac_optarg" ;; + + -sharedstatedir | --sharedstatedir | --sharedstatedi \ + | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ + | --sharedst | --shareds | --shared | --share | --shar \ + | --sha | --sh) + ac_prev=sharedstatedir ;; + -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ + | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ + | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ + | --sha=* | --sh=*) + sharedstatedir="$ac_optarg" ;; + + -site | --site | --sit) + ac_prev=site ;; + -site=* | --site=* | --sit=*) + site="$ac_optarg" ;; + + -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) + ac_prev=srcdir ;; + -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) + 
srcdir="$ac_optarg" ;; + + -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ + | --syscon | --sysco | --sysc | --sys | --sy) + ac_prev=sysconfdir ;; + -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ + | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*) + sysconfdir="$ac_optarg" ;; + + -target | --target | --targe | --targ | --tar | --ta | --t) + ac_prev=target ;; + -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) + target="$ac_optarg" ;; + + -v | -verbose | --verbose | --verbos | --verbo | --verb) + verbose=yes ;; + + -version | --version | --versio | --versi | --vers) + echo "configure generated by autoconf version 2.13" + exit 0 ;; + + -with-* | --with-*) + ac_package=`echo $ac_option|sed -e 's/-*with-//' -e 's/=.*//'` + # Reject names that are not valid shell variable names. + if test -n "`echo $ac_package| sed 's/[-_a-zA-Z0-9]//g'`"; then + { echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; } + fi + ac_package=`echo $ac_package| sed 's/-/_/g'` + case "$ac_option" in + *=*) ;; + *) ac_optarg=yes ;; + esac + eval "with_${ac_package}='$ac_optarg'" ;; + + -without-* | --without-*) + ac_package=`echo $ac_option|sed -e 's/-*without-//'` + # Reject names that are not valid shell variable names. + if test -n "`echo $ac_package| sed 's/[-a-zA-Z0-9_]//g'`"; then + { echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; } + fi + ac_package=`echo $ac_package| sed 's/-/_/g'` + eval "with_${ac_package}=no" ;; + + --x) + # Obsolete; use --with-x. 
+ with_x=yes ;; + + -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ + | --x-incl | --x-inc | --x-in | --x-i) + ac_prev=x_includes ;; + -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ + | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) + x_includes="$ac_optarg" ;; + + -x-libraries | --x-libraries | --x-librarie | --x-librari \ + | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) + ac_prev=x_libraries ;; + -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ + | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) + x_libraries="$ac_optarg" ;; + + -*) { echo "configure: error: $ac_option: invalid option; use --help to show usage" 1>&2; exit 1; } + ;; + + *) + if test -n "`echo $ac_option| sed 's/[-a-z0-9.]//g'`"; then + echo "configure: warning: $ac_option: invalid host type" 1>&2 + fi + if test "x$nonopt" != xNONE; then + { echo "configure: error: can only configure for one host and one target at a time" 1>&2; exit 1; } + fi + nonopt="$ac_option" + ;; + + esac +done + +if test -n "$ac_prev"; then + { echo "configure: error: missing argument to --`echo $ac_prev | sed 's/_/-/g'`" 1>&2; exit 1; } +fi + +trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15 + +# File descriptor usage: +# 0 standard input +# 1 file creation +# 2 errors and warnings +# 3 some systems may open it to /dev/tty +# 4 used on the Kubota Titan +# 6 checking for... messages and results +# 5 compiler messages saved in config.log +if test "$silent" = yes; then + exec 6>/dev/null +else + exec 6>&1 +fi +exec 5>./config.log + +echo "\ +This file contains any messages produced by compilers while +running configure, to aid debugging if configure makes a mistake. +" 1>&5 + +# Strip out --no-create and --no-recursion so they do not pile up. +# Also quote any args containing shell metacharacters. 
+ac_configure_args= +for ac_arg +do + case "$ac_arg" in + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c) ;; + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) ;; + *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*) + ac_configure_args="$ac_configure_args '$ac_arg'" ;; + *) ac_configure_args="$ac_configure_args $ac_arg" ;; + esac +done + +# NLS nuisances. +# Only set these to C if already set. These must not be set unconditionally +# because not all systems understand e.g. LANG=C (notably SCO). +# Fixing LC_MESSAGES prevents Solaris sh from translating var values in `set'! +# Non-C LC_CTYPE values break the ctype check. +if test "${LANG+set}" = set; then LANG=C; export LANG; fi +if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi +if test "${LC_MESSAGES+set}" = set; then LC_MESSAGES=C; export LC_MESSAGES; fi +if test "${LC_CTYPE+set}" = set; then LC_CTYPE=C; export LC_CTYPE; fi + +# confdefs.h avoids OS command line length limits that DEFS can exceed. +rm -rf conftest* confdefs.h +# AIX cpp loses on an empty file, so make sure it contains at least a newline. +echo > confdefs.h + +# A filename unique to this package, relative to the directory that +# configure is in, which we can look for to find out if srcdir is correct. +ac_unique_file=src/ml.h + +# Find the source files, if location was not specified. +if test -z "$srcdir"; then + ac_srcdir_defaulted=yes + # Try the directory containing this script, then its parent. + ac_prog=$0 + ac_confdir=`echo $ac_prog|sed 's%/[^/][^/]*$%%'` + test "x$ac_confdir" = "x$ac_prog" && ac_confdir=. + srcdir=$ac_confdir + if test ! -r $srcdir/$ac_unique_file; then + srcdir=.. + fi +else + ac_srcdir_defaulted=no +fi +if test ! -r $srcdir/$ac_unique_file; then + if test "$ac_srcdir_defaulted" = yes; then + { echo "configure: error: can not find sources in $ac_confdir or .." 
1>&2; exit 1; } + else + { echo "configure: error: can not find sources in $srcdir" 1>&2; exit 1; } + fi +fi +srcdir=`echo "${srcdir}" | sed 's%\([^/]\)/*$%\1%'` + +# Prefer explicitly selected file to automatically selected ones. +if test -z "$CONFIG_SITE"; then + if test "x$prefix" != xNONE; then + CONFIG_SITE="$prefix/share/config.site $prefix/etc/config.site" + else + CONFIG_SITE="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site" + fi +fi +for ac_site_file in $CONFIG_SITE; do + if test -r "$ac_site_file"; then + echo "loading site script $ac_site_file" + . "$ac_site_file" + fi +done + + +ac_ext=c +# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. +ac_cpp='$CPP $CPPFLAGS' +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +cross_compiling=$ac_cv_prog_cc_cross + +ac_exeext= +ac_objext=o +if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then + # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu. + if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then + ac_n= ac_c=' +' ac_t=' ' + else + ac_n=-n ac_c= ac_t= + fi +else + ac_n= ac_c='\c' ac_t= +fi + + + +ac_aux_dir= +for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do + if test -f $ac_dir/install-sh; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install-sh -c" + break + elif test -f $ac_dir/install.sh; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install.sh -c" + break + fi +done +if test -z "$ac_aux_dir"; then + { echo "configure: error: can not find install-sh or install.sh in $srcdir $srcdir/.. $srcdir/../.." 1>&2; exit 1; } +fi +ac_config_guess=$ac_aux_dir/config.guess +ac_config_sub=$ac_aux_dir/config.sub +ac_configure=$ac_aux_dir/configure # This should be Cygnus configure. + +# Find a good install program. We prefer a C program (faster), +# so one script is as good as another. 
But avoid the broken or +# incompatible versions: +# SysV /etc/install, /usr/sbin/install +# SunOS /usr/etc/install +# IRIX /sbin/install +# AIX /bin/install +# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag +# AFS /usr/afsws/bin/install, which mishandles nonexistent args +# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff" +# ./install, which can be erroneously created by make from ./install.sh. +echo $ac_n "checking for a BSD compatible install""... $ac_c" 1>&6 +echo "configure:550: checking for a BSD compatible install" >&5 +if test -z "$INSTALL"; then +if eval "test \"`echo '$''{'ac_cv_path_install'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + IFS="${IFS= }"; ac_save_IFS="$IFS"; IFS=":" + for ac_dir in $PATH; do + # Account for people who put trailing slashes in PATH elements. + case "$ac_dir/" in + /|./|.//|/etc/*|/usr/sbin/*|/usr/etc/*|/sbin/*|/usr/afsws/bin/*|/usr/ucb/*) ;; + *) + # OSF1 and SCO ODT 3.0 have their own names for install. + # Don't use installbsd from OSF since it installs stuff as root + # by default. + for ac_prog in ginstall scoinst install; do + if test -f $ac_dir/$ac_prog; then + if test $ac_prog = install && + grep dspmsg $ac_dir/$ac_prog >/dev/null 2>&1; then + # AIX install. It has an incompatible calling convention. + : + else + ac_cv_path_install="$ac_dir/$ac_prog -c" + break 2 + fi + fi + done + ;; + esac + done + IFS="$ac_save_IFS" + +fi + if test "${ac_cv_path_install+set}" = set; then + INSTALL="$ac_cv_path_install" + else + # As a last resort, use the slow shell script. We don't cache a + # path for INSTALL within a source directory, because that will + # break other packages using the cache if that directory is + # removed, or if the path is relative. + INSTALL="$ac_install_sh" + fi +fi +echo "$ac_t""$INSTALL" 1>&6 + +# Use test -z because SunOS4 sh mishandles braces in ${var-val}. +# It thinks the first close brace ends the variable substitution. 
+test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}' + +test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL_PROGRAM}' + +test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644' + +echo $ac_n "checking whether build environment is sane""... $ac_c" 1>&6 +echo "configure:603: checking whether build environment is sane" >&5 +# Just in case +sleep 1 +echo timestamp > conftestfile +# Do `set' in a subshell so we don't clobber the current shell's +# arguments. Must try -L first in case configure is actually a +# symlink; some systems play weird games with the mod time of symlinks +# (eg FreeBSD returns the mod time of the symlink's containing +# directory). +if ( + set X `ls -Lt $srcdir/configure conftestfile 2> /dev/null` + if test "$*" = "X"; then + # -L didn't work. + set X `ls -t $srcdir/configure conftestfile` + fi + if test "$*" != "X $srcdir/configure conftestfile" \ + && test "$*" != "X conftestfile $srcdir/configure"; then + + # If neither matched, then we have a broken ls. This can happen + # if, for instance, CONFIG_SHELL is bash and it inherits a + # broken ls alias from the environment. This has actually + # happened. Such a system could not be considered "sane". + { echo "configure: error: ls -t appears to fail. Make sure there is not a broken +alias in your environment" 1>&2; exit 1; } + fi + + test "$2" = conftestfile + ) +then + # Ok. + : +else + { echo "configure: error: newly created file is older than distributed files! +Check your system clock" 1>&2; exit 1; } +fi +rm -f conftest* +echo "$ac_t""yes" 1>&6 +if test "$program_transform_name" = s,x,x,; then + program_transform_name= +else + # Double any \ or $. echo might interpret backslashes. 
+ cat <<\EOF_SED > conftestsed +s,\\,\\\\,g; s,\$,$$,g +EOF_SED + program_transform_name="`echo $program_transform_name|sed -f conftestsed`" + rm -f conftestsed +fi +test "$program_prefix" != NONE && + program_transform_name="s,^,${program_prefix},; $program_transform_name" +# Use a double $ so make ignores it. +test "$program_suffix" != NONE && + program_transform_name="s,\$\$,${program_suffix},; $program_transform_name" + +# sed with no file args requires a program. +test "$program_transform_name" = "" && program_transform_name="s,x,x," + +echo $ac_n "checking whether ${MAKE-make} sets \${MAKE}""... $ac_c" 1>&6 +echo "configure:660: checking whether ${MAKE-make} sets \${MAKE}" >&5 +set dummy ${MAKE-make}; ac_make=`echo "$2" | sed 'y%./+-%__p_%'` +if eval "test \"`echo '$''{'ac_cv_prog_make_${ac_make}_set'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftestmake <<\EOF +all: + @echo 'ac_maketemp="${MAKE}"' +EOF +# GNU make sometimes prints "make[1]: Entering...", which would confuse us. +eval `${MAKE-make} -f conftestmake 2>/dev/null | grep temp=` +if test -n "$ac_maketemp"; then + eval ac_cv_prog_make_${ac_make}_set=yes +else + eval ac_cv_prog_make_${ac_make}_set=no +fi +rm -f conftestmake +fi +if eval "test \"`echo '$ac_cv_prog_make_'${ac_make}_set`\" = yes"; then + echo "$ac_t""yes" 1>&6 + SET_MAKE= +else + echo "$ac_t""no" 1>&6 + SET_MAKE="MAKE=${MAKE-make}" +fi + + +PACKAGE=tree-puzzle + +VERSION=5.0 + +if test "`cd $srcdir && pwd`" != "`pwd`" && test -f $srcdir/config.status; then + { echo "configure: error: source directory already configured; run "make distclean" there first" 1>&2; exit 1; } +fi +cat >> confdefs.h <> confdefs.h <&6 +echo "configure:706: checking for working aclocal" >&5 +# Run test in a subshell; some versions of sh will print an error if +# an executable is not found, even if stderr is redirected. +# Redirect stdin to placate older versions of autoconf. Sigh. 
+if (aclocal --version) < /dev/null > /dev/null 2>&1; then + ACLOCAL=aclocal + echo "$ac_t""found" 1>&6 +else + ACLOCAL="$missing_dir/missing aclocal" + echo "$ac_t""missing" 1>&6 +fi + +echo $ac_n "checking for working autoconf""... $ac_c" 1>&6 +echo "configure:719: checking for working autoconf" >&5 +# Run test in a subshell; some versions of sh will print an error if +# an executable is not found, even if stderr is redirected. +# Redirect stdin to placate older versions of autoconf. Sigh. +if (autoconf --version) < /dev/null > /dev/null 2>&1; then + AUTOCONF=autoconf + echo "$ac_t""found" 1>&6 +else + AUTOCONF="$missing_dir/missing autoconf" + echo "$ac_t""missing" 1>&6 +fi + +echo $ac_n "checking for working automake""... $ac_c" 1>&6 +echo "configure:732: checking for working automake" >&5 +# Run test in a subshell; some versions of sh will print an error if +# an executable is not found, even if stderr is redirected. +# Redirect stdin to placate older versions of autoconf. Sigh. +if (automake --version) < /dev/null > /dev/null 2>&1; then + AUTOMAKE=automake + echo "$ac_t""found" 1>&6 +else + AUTOMAKE="$missing_dir/missing automake" + echo "$ac_t""missing" 1>&6 +fi + +echo $ac_n "checking for working autoheader""... $ac_c" 1>&6 +echo "configure:745: checking for working autoheader" >&5 +# Run test in a subshell; some versions of sh will print an error if +# an executable is not found, even if stderr is redirected. +# Redirect stdin to placate older versions of autoconf. Sigh. +if (autoheader --version) < /dev/null > /dev/null 2>&1; then + AUTOHEADER=autoheader + echo "$ac_t""found" 1>&6 +else + AUTOHEADER="$missing_dir/missing autoheader" + echo "$ac_t""missing" 1>&6 +fi + +echo $ac_n "checking for working makeinfo""... $ac_c" 1>&6 +echo "configure:758: checking for working makeinfo" >&5 +# Run test in a subshell; some versions of sh will print an error if +# an executable is not found, even if stderr is redirected. 
+# Redirect stdin to placate older versions of autoconf. Sigh. +if (makeinfo --version) < /dev/null > /dev/null 2>&1; then + MAKEINFO=makeinfo + echo "$ac_t""found" 1>&6 +else + MAKEINFO="$missing_dir/missing makeinfo" + echo "$ac_t""missing" 1>&6 +fi + + + +# Extract the first word of "gcc", so it can be a program name with args. +set dummy gcc; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:775: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_prog_CC="gcc" + break + fi + done + IFS="$ac_save_ifs" +fi +fi +CC="$ac_cv_prog_CC" +if test -n "$CC"; then + echo "$ac_t""$CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +if test -z "$CC"; then + # Extract the first word of "cc", so it can be a program name with args. +set dummy cc; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:805: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_prog_rejected=no + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + if test "$ac_dir/$ac_word" = "/usr/ucb/cc"; then + ac_prog_rejected=yes + continue + fi + ac_cv_prog_CC="cc" + break + fi + done + IFS="$ac_save_ifs" +if test $ac_prog_rejected = yes; then + # We found a bogon in the path, so make sure we never use it. + set dummy $ac_cv_prog_CC + shift + if test $# -gt 0; then + # We chose a different compiler from the bogus one. 
+ # However, it has the same basename, so the bogon will be chosen + # first if we set CC to just the basename; use the full file name. + shift + set dummy "$ac_dir/$ac_word" "$@" + shift + ac_cv_prog_CC="$@" + fi +fi +fi +fi +CC="$ac_cv_prog_CC" +if test -n "$CC"; then + echo "$ac_t""$CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + + if test -z "$CC"; then + case "`uname -s`" in + *win32* | *WIN32*) + # Extract the first word of "cl", so it can be a program name with args. +set dummy cl; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:856: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_prog_CC="cl" + break + fi + done + IFS="$ac_save_ifs" +fi +fi +CC="$ac_cv_prog_CC" +if test -n "$CC"; then + echo "$ac_t""$CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + ;; + esac + fi + test -z "$CC" && { echo "configure: error: no acceptable cc found in \$PATH" 1>&2; exit 1; } +fi + +echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works""... $ac_c" 1>&6 +echo "configure:888: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5 + +ac_ext=c +# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. 
+ac_cpp='$CPP $CPPFLAGS' +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +cross_compiling=$ac_cv_prog_cc_cross + +cat > conftest.$ac_ext << EOF + +#line 899 "configure" +#include "confdefs.h" + +main(){return(0);} +EOF +if { (eval echo configure:904: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + ac_cv_prog_cc_works=yes + # If we can't run a trivial program, we are probably using a cross compiler. + if (./conftest; exit) 2>/dev/null; then + ac_cv_prog_cc_cross=no + else + ac_cv_prog_cc_cross=yes + fi +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + ac_cv_prog_cc_works=no +fi +rm -fr conftest* +ac_ext=c +# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. +ac_cpp='$CPP $CPPFLAGS' +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +cross_compiling=$ac_cv_prog_cc_cross + +echo "$ac_t""$ac_cv_prog_cc_works" 1>&6 +if test $ac_cv_prog_cc_works = no; then + { echo "configure: error: installation or configuration problem: C compiler cannot create executables." 1>&2; exit 1; } +fi +echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler""... $ac_c" 1>&6 +echo "configure:930: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5 +echo "$ac_t""$ac_cv_prog_cc_cross" 1>&6 +cross_compiling=$ac_cv_prog_cc_cross + +echo $ac_n "checking whether we are using GNU C""... 
$ac_c" 1>&6 +echo "configure:935: checking whether we are using GNU C" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_gcc'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.c <&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then + ac_cv_prog_gcc=yes +else + ac_cv_prog_gcc=no +fi +fi + +echo "$ac_t""$ac_cv_prog_gcc" 1>&6 + +if test $ac_cv_prog_gcc = yes; then + GCC=yes +else + GCC= +fi + +ac_test_CFLAGS="${CFLAGS+set}" +ac_save_CFLAGS="$CFLAGS" +CFLAGS= +echo $ac_n "checking whether ${CC-cc} accepts -g""... $ac_c" 1>&6 +echo "configure:963: checking whether ${CC-cc} accepts -g" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_cc_g'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + echo 'void f(){}' > conftest.c +if test -z "`${CC-cc} -g -c conftest.c 2>&1`"; then + ac_cv_prog_cc_g=yes +else + ac_cv_prog_cc_g=no +fi +rm -f conftest* + +fi + +echo "$ac_t""$ac_cv_prog_cc_g" 1>&6 +if test "$ac_test_CFLAGS" = set; then + CFLAGS="$ac_save_CFLAGS" +elif test $ac_cv_prog_cc_g = yes; then + if test "$GCC" = yes; then + CFLAGS="-g -O2" + else + CFLAGS="-g" + fi +else + if test "$GCC" = yes; then + CFLAGS="-O2" + else + CFLAGS= + fi +fi + +if test "x$CC" != xcc; then + echo $ac_n "checking whether $CC and cc understand -c and -o together""... $ac_c" 1>&6 +echo "configure:996: checking whether $CC and cc understand -c and -o together" >&5 +else + echo $ac_n "checking whether cc understands -c and -o together""... $ac_c" 1>&6 +echo "configure:999: checking whether cc understands -c and -o together" >&5 +fi +set dummy $CC; ac_cc="`echo $2 | + sed -e 's/[^a-zA-Z0-9_]/_/g' -e 's/^[0-9]/_/'`" +if eval "test \"`echo '$''{'ac_cv_prog_cc_${ac_cc}_c_o'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + echo 'foo(){}' > conftest.c +# Make sure it works both with $CC and with simple cc. +# We do the test twice because some compilers refuse to overwrite an +# existing .o file with -o, though they will create one. 
+ac_try='${CC-cc} -c conftest.c -o conftest.o 1>&5' +if { (eval echo configure:1011: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } && + test -f conftest.o && { (eval echo configure:1012: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; +then + eval ac_cv_prog_cc_${ac_cc}_c_o=yes + if test "x$CC" != xcc; then + # Test first that cc exists at all. + if { ac_try='cc -c conftest.c 1>&5'; { (eval echo configure:1017: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; }; then + ac_try='cc -c conftest.c -o conftest.o 1>&5' + if { (eval echo configure:1019: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } && + test -f conftest.o && { (eval echo configure:1020: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; + then + # cc works too. + : + else + # cc exists but doesn't like -o. + eval ac_cv_prog_cc_${ac_cc}_c_o=no + fi + fi + fi +else + eval ac_cv_prog_cc_${ac_cc}_c_o=no +fi +rm -f conftest* + +fi +if eval "test \"`echo '$ac_cv_prog_cc_'${ac_cc}_c_o`\" = yes"; then + echo "$ac_t""yes" 1>&6 +else + echo "$ac_t""no" 1>&6 + cat >> confdefs.h <<\EOF +#define NO_MINUS_C_MINUS_O 1 +EOF + +fi + +# Find a good install program. We prefer a C program (faster), +# so one script is as good as another. But avoid the broken or +# incompatible versions: +# SysV /etc/install, /usr/sbin/install +# SunOS /usr/etc/install +# IRIX /sbin/install +# AIX /bin/install +# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag +# AFS /usr/afsws/bin/install, which mishandles nonexistent args +# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff" +# ./install, which can be erroneously created by make from ./install.sh. +echo $ac_n "checking for a BSD compatible install""... 
$ac_c" 1>&6 +echo "configure:1058: checking for a BSD compatible install" >&5 +if test -z "$INSTALL"; then +if eval "test \"`echo '$''{'ac_cv_path_install'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + IFS="${IFS= }"; ac_save_IFS="$IFS"; IFS=":" + for ac_dir in $PATH; do + # Account for people who put trailing slashes in PATH elements. + case "$ac_dir/" in + /|./|.//|/etc/*|/usr/sbin/*|/usr/etc/*|/sbin/*|/usr/afsws/bin/*|/usr/ucb/*) ;; + *) + # OSF1 and SCO ODT 3.0 have their own names for install. + # Don't use installbsd from OSF since it installs stuff as root + # by default. + for ac_prog in ginstall scoinst install; do + if test -f $ac_dir/$ac_prog; then + if test $ac_prog = install && + grep dspmsg $ac_dir/$ac_prog >/dev/null 2>&1; then + # AIX install. It has an incompatible calling convention. + : + else + ac_cv_path_install="$ac_dir/$ac_prog -c" + break 2 + fi + fi + done + ;; + esac + done + IFS="$ac_save_IFS" + +fi + if test "${ac_cv_path_install+set}" = set; then + INSTALL="$ac_cv_path_install" + else + # As a last resort, use the slow shell script. We don't cache a + # path for INSTALL within a source directory, because that will + # break other packages using the cache if that directory is + # removed, or if the path is relative. + INSTALL="$ac_install_sh" + fi +fi +echo "$ac_t""$INSTALL" 1>&6 + +# Use test -z because SunOS4 sh mishandles braces in ${var-val}. +# It thinks the first close brace ends the variable substitution. +test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}' + +test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL_PROGRAM}' + +test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644' + +echo $ac_n "checking whether ${MAKE-make} sets \${MAKE}""... 
$ac_c" 1>&6 +echo "configure:1111: checking whether ${MAKE-make} sets \${MAKE}" >&5 +set dummy ${MAKE-make}; ac_make=`echo "$2" | sed 'y%./+-%__p_%'` +if eval "test \"`echo '$''{'ac_cv_prog_make_${ac_make}_set'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftestmake <<\EOF +all: + @echo 'ac_maketemp="${MAKE}"' +EOF +# GNU make sometimes prints "make[1]: Entering...", which would confuse us. +eval `${MAKE-make} -f conftestmake 2>/dev/null | grep temp=` +if test -n "$ac_maketemp"; then + eval ac_cv_prog_make_${ac_make}_set=yes +else + eval ac_cv_prog_make_${ac_make}_set=no +fi +rm -f conftestmake +fi +if eval "test \"`echo '$ac_cv_prog_make_'${ac_make}_set`\" = yes"; then + echo "$ac_t""yes" 1>&6 + SET_MAKE= +else + echo "$ac_t""no" 1>&6 + SET_MAKE="MAKE=${MAKE-make}" +fi + + + + + +if test "$MPICC" != "" ; then + # Extract the first word of "$MPICC", so it can be a program name with args. +set dummy $MPICC; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:1145: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_path_MPICC0'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + case "$MPICC0" in + /*) + ac_cv_path_MPICC0="$MPICC0" # Let the user override the test with a path. + ;; + ?:/*) + ac_cv_path_MPICC0="$MPICC0" # Let the user override the test with a dos path. + ;; + *) + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_path_MPICC0="$ac_dir/$ac_word" + break + fi + done + IFS="$ac_save_ifs" + ;; +esac +fi +MPICC0="$ac_cv_path_MPICC0" +if test -n "$MPICC0"; then + echo "$ac_t""$MPICC0" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +fi +# Extract the first word of "mpcc", so it can be a program name with args. +set dummy mpcc; ac_word=$2 +echo $ac_n "checking for $ac_word""... 
$ac_c" 1>&6 +echo "configure:1181: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_path_MPICC1'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + case "$MPICC1" in + /*) + ac_cv_path_MPICC1="$MPICC1" # Let the user override the test with a path. + ;; + ?:/*) + ac_cv_path_MPICC1="$MPICC1" # Let the user override the test with a dos path. + ;; + *) + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_path_MPICC1="$ac_dir/$ac_word" + break + fi + done + IFS="$ac_save_ifs" + ;; +esac +fi +MPICC1="$ac_cv_path_MPICC1" +if test -n "$MPICC1"; then + echo "$ac_t""$MPICC1" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +# Extract the first word of "hcc", so it can be a program name with args. +set dummy hcc; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:1216: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_path_MPICC2'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + case "$MPICC2" in + /*) + ac_cv_path_MPICC2="$MPICC2" # Let the user override the test with a path. + ;; + ?:/*) + ac_cv_path_MPICC2="$MPICC2" # Let the user override the test with a dos path. + ;; + *) + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_path_MPICC2="$ac_dir/$ac_word" + break + fi + done + IFS="$ac_save_ifs" + ;; +esac +fi +MPICC2="$ac_cv_path_MPICC2" +if test -n "$MPICC2"; then + echo "$ac_t""$MPICC2" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +# Extract the first word of "mpicc", so it can be a program name with args. +set dummy mpicc; ac_word=$2 +echo $ac_n "checking for $ac_word""... 
$ac_c" 1>&6 +echo "configure:1251: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_path_MPICC3'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + case "$MPICC3" in + /*) + ac_cv_path_MPICC3="$MPICC3" # Let the user override the test with a path. + ;; + ?:/*) + ac_cv_path_MPICC3="$MPICC3" # Let the user override the test with a dos path. + ;; + *) + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_path_MPICC3="$ac_dir/$ac_word" + break + fi + done + IFS="$ac_save_ifs" + ;; +esac +fi +MPICC3="$ac_cv_path_MPICC3" +if test -n "$MPICC3"; then + echo "$ac_t""$MPICC3" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +# Extract the first word of "mpicc_lam", so it can be a program name with args. +set dummy mpicc_lam; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:1286: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_path_MPICC4'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + case "$MPICC4" in + /*) + ac_cv_path_MPICC4="$MPICC4" # Let the user override the test with a path. + ;; + ?:/*) + ac_cv_path_MPICC4="$MPICC4" # Let the user override the test with a dos path. + ;; + *) + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_path_MPICC4="$ac_dir/$ac_word" + break + fi + done + IFS="$ac_save_ifs" + ;; +esac +fi +MPICC4="$ac_cv_path_MPICC4" +if test -n "$MPICC4"; then + echo "$ac_t""$MPICC4" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +# Extract the first word of "mpicc_mpich", so it can be a program name with args. +set dummy mpicc_mpich; ac_word=$2 +echo $ac_n "checking for $ac_word""... 
$ac_c" 1>&6 +echo "configure:1321: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_path_MPICC5'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + case "$MPICC5" in + /*) + ac_cv_path_MPICC5="$MPICC5" # Let the user override the test with a path. + ;; + ?:/*) + ac_cv_path_MPICC5="$MPICC5" # Let the user override the test with a dos path. + ;; + *) + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_path_MPICC5="$ac_dir/$ac_word" + break + fi + done + IFS="$ac_save_ifs" + ;; +esac +fi +MPICC5="$ac_cv_path_MPICC5" +if test -n "$MPICC5"; then + echo "$ac_t""$MPICC5" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + + + if test "$MPICC0" != "" ; then + if test "$MPICCSET" = "" ; then +cat > conftest.c < +int main (int argc, char **argv) +{ +MPI_Init(&argc,&argv); +MPI_Finalize(); +exit(0); +} +EOF + + +MPICC=$MPICC0 + + if test "$MPICC" != "" ; then + echo $ac_n "checking whether $MPICC works as MPI compiler""... $ac_c" 1>&6 +echo "configure:1371: checking whether $MPICC works as MPI compiler" >&5 + $MPICC conftest.c -o conftest > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$MPICC + MPILIBS= + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + echo $ac_n "checking whether $MPICC needs -lmpi""... $ac_c" 1>&6 +echo "configure:1382: checking whether $MPICC needs -lmpi" >&5 + $MPICC conftest.c -o conftest -lmpi > /dev/null 2>&1 + if test $? 
= 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$PCC + MPILIBS=-lmpi + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + MPICC= + MPILIBS= + MPICCSET= + PPUZZLE= + fi + fi + fi + rm -f conftest* + fi + fi + if test "$MPICC1" != "" ; then + if test "$MPICCSET" = "" ; then +cat > conftest.c < +int main (int argc, char **argv) +{ +MPI_Init(&argc,&argv); +MPI_Finalize(); +exit(0); +} +EOF + + +MPICC=$MPICC1 + + if test "$MPICC" != "" ; then + echo $ac_n "checking whether $MPICC works as MPI compiler""... $ac_c" 1>&6 +echo "configure:1419: checking whether $MPICC works as MPI compiler" >&5 + $MPICC conftest.c -o conftest > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$MPICC + MPILIBS= + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + echo $ac_n "checking whether $MPICC needs -lmpi""... $ac_c" 1>&6 +echo "configure:1430: checking whether $MPICC needs -lmpi" >&5 + $MPICC conftest.c -o conftest -lmpi > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$PCC + MPILIBS=-lmpi + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + MPICC= + MPILIBS= + MPICCSET= + PPUZZLE= + fi + fi + fi + rm -f conftest* + fi + fi + if test "$MPICC2" != "" ; then + if test "$MPICCSET" = "" ; then +cat > conftest.c < +int main (int argc, char **argv) +{ +MPI_Init(&argc,&argv); +MPI_Finalize(); +exit(0); +} +EOF + + +MPICC=$MPICC2 + + if test "$MPICC" != "" ; then + echo $ac_n "checking whether $MPICC works as MPI compiler""... $ac_c" 1>&6 +echo "configure:1467: checking whether $MPICC works as MPI compiler" >&5 + $MPICC conftest.c -o conftest > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$MPICC + MPILIBS= + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + echo $ac_n "checking whether $MPICC needs -lmpi""... $ac_c" 1>&6 +echo "configure:1478: checking whether $MPICC needs -lmpi" >&5 + $MPICC conftest.c -o conftest -lmpi > /dev/null 2>&1 + if test $? 
= 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$PCC + MPILIBS=-lmpi + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + MPICC= + MPILIBS= + MPICCSET= + PPUZZLE= + fi + fi + fi + rm -f conftest* + fi + fi + if test "$MPICC3" != "" ; then + if test "$MPICCSET" = "" ; then +cat > conftest.c < +int main (int argc, char **argv) +{ +MPI_Init(&argc,&argv); +MPI_Finalize(); +exit(0); +} +EOF + + +MPICC=$MPICC3 + + if test "$MPICC" != "" ; then + echo $ac_n "checking whether $MPICC works as MPI compiler""... $ac_c" 1>&6 +echo "configure:1515: checking whether $MPICC works as MPI compiler" >&5 + $MPICC conftest.c -o conftest > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$MPICC + MPILIBS= + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + echo $ac_n "checking whether $MPICC needs -lmpi""... $ac_c" 1>&6 +echo "configure:1526: checking whether $MPICC needs -lmpi" >&5 + $MPICC conftest.c -o conftest -lmpi > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$PCC + MPILIBS=-lmpi + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + MPICC= + MPILIBS= + MPICCSET= + PPUZZLE= + fi + fi + fi + rm -f conftest* + fi + fi + if test "$MPICC4" != "" ; then + if test "$MPICCSET" = "" ; then +cat > conftest.c < +int main (int argc, char **argv) +{ +MPI_Init(&argc,&argv); +MPI_Finalize(); +exit(0); +} +EOF + + +MPICC=$MPICC4 + + if test "$MPICC" != "" ; then + echo $ac_n "checking whether $MPICC works as MPI compiler""... $ac_c" 1>&6 +echo "configure:1563: checking whether $MPICC works as MPI compiler" >&5 + $MPICC conftest.c -o conftest > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$MPICC + MPILIBS= + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + echo $ac_n "checking whether $MPICC needs -lmpi""... $ac_c" 1>&6 +echo "configure:1574: checking whether $MPICC needs -lmpi" >&5 + $MPICC conftest.c -o conftest -lmpi > /dev/null 2>&1 + if test $? 
= 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$PCC + MPILIBS=-lmpi + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + MPICC= + MPILIBS= + MPICCSET= + PPUZZLE= + fi + fi + fi + rm -f conftest* + fi + fi + if test "$MPICC5" != "" ; then + if test "$MPICCSET" = "" ; then +cat > conftest.c < +int main (int argc, char **argv) +{ +MPI_Init(&argc,&argv); +MPI_Finalize(); +exit(0); +} +EOF + + +MPICC=$MPICC5 + + if test "$MPICC" != "" ; then + echo $ac_n "checking whether $MPICC works as MPI compiler""... $ac_c" 1>&6 +echo "configure:1611: checking whether $MPICC works as MPI compiler" >&5 + $MPICC conftest.c -o conftest > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$MPICC + MPILIBS= + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + echo $ac_n "checking whether $MPICC needs -lmpi""... $ac_c" 1>&6 +echo "configure:1622: checking whether $MPICC needs -lmpi" >&5 + $MPICC conftest.c -o conftest -lmpi > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$PCC + MPILIBS=-lmpi + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + MPICC= + MPILIBS= + MPICCSET= + PPUZZLE= + fi + fi + fi + rm -f conftest* + fi + fi + +ac_cv_prog_MPICC=$MPICC + + + + + + + +echo $ac_n "checking for main in -lm""... 
$ac_c" 1>&6 +echo "configure:1652: checking for main in -lm" >&5 +ac_lib_var=`echo m'_'main | sed 'y%./+-%__p_%'` +if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_save_LIBS="$LIBS" +LIBS="-lm $LIBS" +cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=no" +fi +rm -f conftest* +LIBS="$ac_save_LIBS" + +fi +if eval "test \"`echo '$ac_cv_lib_'$ac_lib_var`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_lib=HAVE_LIB`echo m | sed -e 's/[^a-zA-Z0-9_]/_/g' \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/'` + cat >> confdefs.h <&6 +fi + + +echo $ac_n "checking how to run the C preprocessor""... $ac_c" 1>&6 +echo "configure:1696: checking how to run the C preprocessor" >&5 +# On Suns, sometimes $CPP names a directory. +if test -n "$CPP" && test -d "$CPP"; then + CPP= +fi +if test -z "$CPP"; then +if eval "test \"`echo '$''{'ac_cv_prog_CPP'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + # This must be in double quotes, not single quotes, because CPP may get + # substituted into the Makefile and "${CC-cc}" will confuse make. + CPP="${CC-cc} -E" + # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. 
+ cat > conftest.$ac_ext < +Syntax Error +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1717: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + : +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + CPP="${CC-cc} -E -traditional-cpp" + cat > conftest.$ac_ext < +Syntax Error +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1734: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + : +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + CPP="${CC-cc} -nologo -E" + cat > conftest.$ac_ext < +Syntax Error +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1751: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + : +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + CPP=/lib/cpp +fi +rm -f conftest* +fi +rm -f conftest* +fi +rm -f conftest* + ac_cv_prog_CPP="$CPP" +fi + CPP="$ac_cv_prog_CPP" +else + ac_cv_prog_CPP="$CPP" +fi +echo "$ac_t""$CPP" 1>&6 + +echo $ac_n "checking for ANSI C header files""... 
$ac_c" 1>&6 +echo "configure:1776: checking for ANSI C header files" >&5 +if eval "test \"`echo '$''{'ac_cv_header_stdc'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +#include +#include +#include +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1789: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + rm -rf conftest* + ac_cv_header_stdc=yes +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ac_cv_header_stdc=no +fi +rm -f conftest* + +if test $ac_cv_header_stdc = yes; then + # SunOS 4.x string.h does not declare mem*, contrary to ANSI. +cat > conftest.$ac_ext < +EOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + egrep "memchr" >/dev/null 2>&1; then + : +else + rm -rf conftest* + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. +cat > conftest.$ac_ext < +EOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + egrep "free" >/dev/null 2>&1; then + : +else + rm -rf conftest* + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. +if test "$cross_compiling" = yes; then + : +else + cat > conftest.$ac_ext < +#define ISLOWER(c) ('a' <= (c) && (c) <= 'z') +#define TOUPPER(c) (ISLOWER(c) ? 
'A' + ((c) - 'a') : (c)) +#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) +int main () { int i; for (i = 0; i < 256; i++) +if (XOR (islower (i), ISLOWER (i)) || toupper (i) != TOUPPER (i)) exit(2); +exit (0); } + +EOF +if { (eval echo configure:1856: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null +then + : +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + ac_cv_header_stdc=no +fi +rm -fr conftest* +fi + +fi +fi + +echo "$ac_t""$ac_cv_header_stdc" 1>&6 +if test $ac_cv_header_stdc = yes; then + cat >> confdefs.h <<\EOF +#define STDC_HEADERS 1 +EOF + +fi + +for ac_hdr in limits.h +do +ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'` +echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6 +echo "configure:1883: checking for $ac_hdr" >&5 +if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1893: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + rm -rf conftest* + eval "ac_cv_header_$ac_safe=yes" +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_header_$ac_safe=no" +fi +rm -f conftest* +fi +if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_hdr=HAVE_`echo $ac_hdr | sed 'y%abcdefghijklmnopqrstuvwxyz./-%ABCDEFGHIJKLMNOPQRSTUVWXYZ___%'` + cat >> confdefs.h <&6 +fi +done + + + + +echo $ac_n "checking for working const""... 
$ac_c" 1>&6 +echo "configure:1923: checking for working const" >&5 +if eval "test \"`echo '$''{'ac_cv_c_const'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext <j = 5; +} +{ /* ULTRIX-32 V3.1 (Rev 9) vcc rejects this */ + const int foo = 10; +} + +; return 0; } +EOF +if { (eval echo configure:1977: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then + rm -rf conftest* + ac_cv_c_const=yes +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ac_cv_c_const=no +fi +rm -f conftest* +fi + +echo "$ac_t""$ac_cv_c_const" 1>&6 +if test $ac_cv_c_const = no; then + cat >> confdefs.h <<\EOF +#define const +EOF + +fi + +echo $ac_n "checking for size_t""... $ac_c" 1>&6 +echo "configure:1998: checking for size_t" >&5 +if eval "test \"`echo '$''{'ac_cv_type_size_t'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +#if STDC_HEADERS +#include +#include +#endif +EOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + egrep "(^|[^a-zA-Z_0-9])size_t[^a-zA-Z_0-9]" >/dev/null 2>&1; then + rm -rf conftest* + ac_cv_type_size_t=yes +else + rm -rf conftest* + ac_cv_type_size_t=no +fi +rm -f conftest* + +fi +echo "$ac_t""$ac_cv_type_size_t" 1>&6 +if test $ac_cv_type_size_t = no; then + cat >> confdefs.h <<\EOF +#define size_t unsigned +EOF + +fi + + + +trap '' 1 2 15 + +trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15 + +test "x$prefix" = xNONE && prefix=$ac_default_prefix +# Let make expand exec_prefix. +test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' + +# Any assignment to VPATH causes Sun make to only execute +# the first set of double-colon rules, so remove it if not needed. +# If there is a colon in the path, we need to keep it. +if test "x$srcdir" = x.; then + ac_vpsub='/^[ ]*VPATH[ ]*=[^:]*$/d' +fi + +trap 'rm -f $CONFIG_STATUS conftest*; exit 1' 1 2 15 + +# Transform confdefs.h into DEFS. 
+# Protect against shell expansion while executing Makefile rules. +# Protect against Makefile macro expansion. +cat > conftest.defs <<\EOF +s%#define \([A-Za-z_][A-Za-z0-9_]*\) *\(.*\)%-D\1=\2%g +s%[ `~#$^&*(){}\\|;'"<>?]%\\&%g +s%\[%\\&%g +s%\]%\\&%g +s%\$%$$%g +EOF +DEFS=`sed -f conftest.defs confdefs.h | tr '\012' ' '` +rm -f conftest.defs + + +# Without the "./", some shells look in PATH for config.status. +: ${CONFIG_STATUS=./config.status} + +echo creating $CONFIG_STATUS +rm -f $CONFIG_STATUS +cat > $CONFIG_STATUS </dev/null | sed 1q`: +# +# $0 $ac_configure_args +# +# Compiler output produced by configure, useful for debugging +# configure, is in ./config.log if it exists. + +ac_cs_usage="Usage: $CONFIG_STATUS [--recheck] [--version] [--help]" +for ac_option +do + case "\$ac_option" in + -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + echo "running \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion" + exec \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion ;; + -version | --version | --versio | --versi | --vers | --ver | --ve | --v) + echo "$CONFIG_STATUS generated by autoconf version 2.13" + exit 0 ;; + -help | --help | --hel | --he | --h) + echo "\$ac_cs_usage"; exit 0 ;; + *) echo "\$ac_cs_usage"; exit 1 ;; + esac +done + +ac_given_srcdir=$srcdir +ac_given_INSTALL="$INSTALL" + +trap 'rm -fr `echo "Makefile src/Makefile src/test doc/Makefile data/Makefile" | sed "s/:[^ ]*//g"` conftest*; exit 1' 1 2 15 +EOF +cat >> $CONFIG_STATUS < conftest.subs <<\\CEOF +$ac_vpsub +$extrasub +s%@SHELL@%$SHELL%g +s%@CFLAGS@%$CFLAGS%g +s%@CPPFLAGS@%$CPPFLAGS%g +s%@CXXFLAGS@%$CXXFLAGS%g +s%@FFLAGS@%$FFLAGS%g +s%@DEFS@%$DEFS%g +s%@LDFLAGS@%$LDFLAGS%g +s%@LIBS@%$LIBS%g +s%@exec_prefix@%$exec_prefix%g +s%@prefix@%$prefix%g +s%@program_transform_name@%$program_transform_name%g +s%@bindir@%$bindir%g +s%@sbindir@%$sbindir%g +s%@libexecdir@%$libexecdir%g +s%@datadir@%$datadir%g +s%@sysconfdir@%$sysconfdir%g 
+s%@sharedstatedir@%$sharedstatedir%g +s%@localstatedir@%$localstatedir%g +s%@libdir@%$libdir%g +s%@includedir@%$includedir%g +s%@oldincludedir@%$oldincludedir%g +s%@infodir@%$infodir%g +s%@mandir@%$mandir%g +s%@INSTALL_PROGRAM@%$INSTALL_PROGRAM%g +s%@INSTALL_SCRIPT@%$INSTALL_SCRIPT%g +s%@INSTALL_DATA@%$INSTALL_DATA%g +s%@PACKAGE@%$PACKAGE%g +s%@VERSION@%$VERSION%g +s%@ACLOCAL@%$ACLOCAL%g +s%@AUTOCONF@%$AUTOCONF%g +s%@AUTOMAKE@%$AUTOMAKE%g +s%@AUTOHEADER@%$AUTOHEADER%g +s%@MAKEINFO@%$MAKEINFO%g +s%@SET_MAKE@%$SET_MAKE%g +s%@CC@%$CC%g +s%@MPICC0@%$MPICC0%g +s%@MPICC1@%$MPICC1%g +s%@MPICC2@%$MPICC2%g +s%@MPICC3@%$MPICC3%g +s%@MPICC4@%$MPICC4%g +s%@MPICC5@%$MPICC5%g +s%@MPICC@%$MPICC%g +s%@MPILIBS@%$MPILIBS%g +s%@MPIDEFS@%$MPIDEFS%g +s%@MPICFLAGS@%$MPICFLAGS%g +s%@PPUZZLE@%$PPUZZLE%g +s%@CPP@%$CPP%g + +CEOF +EOF + +cat >> $CONFIG_STATUS <<\EOF + +# Split the substitutions into bite-sized pieces for seds with +# small command number limits, like on Digital OSF/1 and HP-UX. +ac_max_sed_cmds=90 # Maximum number of lines to put in a sed script. +ac_file=1 # Number of current file. +ac_beg=1 # First line for current file. +ac_end=$ac_max_sed_cmds # Line after last line for current file. +ac_more_lines=: +ac_sed_cmds="" +while $ac_more_lines; do + if test $ac_beg -gt 1; then + sed "1,${ac_beg}d; ${ac_end}q" conftest.subs > conftest.s$ac_file + else + sed "${ac_end}q" conftest.subs > conftest.s$ac_file + fi + if test ! -s conftest.s$ac_file; then + ac_more_lines=false + rm -f conftest.s$ac_file + else + if test -z "$ac_sed_cmds"; then + ac_sed_cmds="sed -f conftest.s$ac_file" + else + ac_sed_cmds="$ac_sed_cmds | sed -f conftest.s$ac_file" + fi + ac_file=`expr $ac_file + 1` + ac_beg=$ac_end + ac_end=`expr $ac_end + $ac_max_sed_cmds` + fi +done +if test -z "$ac_sed_cmds"; then + ac_sed_cmds=cat +fi +EOF + +cat >> $CONFIG_STATUS <> $CONFIG_STATUS <<\EOF +for ac_file in .. 
$CONFIG_FILES; do if test "x$ac_file" != x..; then + # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in". + case "$ac_file" in + *:*) ac_file_in=`echo "$ac_file"|sed 's%[^:]*:%%'` + ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;; + *) ac_file_in="${ac_file}.in" ;; + esac + + # Adjust a relative srcdir, top_srcdir, and INSTALL for subdirectories. + + # Remove last slash and all that follows it. Not all systems have dirname. + ac_dir=`echo $ac_file|sed 's%/[^/][^/]*$%%'` + if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then + # The file is in a subdirectory. + test ! -d "$ac_dir" && mkdir "$ac_dir" + ac_dir_suffix="/`echo $ac_dir|sed 's%^\./%%'`" + # A "../" for each directory in $ac_dir_suffix. + ac_dots=`echo $ac_dir_suffix|sed 's%/[^/]*%../%g'` + else + ac_dir_suffix= ac_dots= + fi + + case "$ac_given_srcdir" in + .) srcdir=. + if test -z "$ac_dots"; then top_srcdir=. + else top_srcdir=`echo $ac_dots|sed 's%/$%%'`; fi ;; + /*) srcdir="$ac_given_srcdir$ac_dir_suffix"; top_srcdir="$ac_given_srcdir" ;; + *) # Relative path. + srcdir="$ac_dots$ac_given_srcdir$ac_dir_suffix" + top_srcdir="$ac_dots$ac_given_srcdir" ;; + esac + + case "$ac_given_INSTALL" in + [/$]*) INSTALL="$ac_given_INSTALL" ;; + *) INSTALL="$ac_dots$ac_given_INSTALL" ;; + esac + + echo creating "$ac_file" + rm -f "$ac_file" + configure_input="Generated automatically from `echo $ac_file_in|sed 's%.*/%%'` by configure." 
+ case "$ac_file" in + *Makefile*) ac_comsub="1i\\ +# $configure_input" ;; + *) ac_comsub= ;; + esac + + ac_file_inputs=`echo $ac_file_in|sed -e "s%^%$ac_given_srcdir/%" -e "s%:% $ac_given_srcdir/%g"` + sed -e "$ac_comsub +s%@configure_input@%$configure_input%g +s%@srcdir@%$srcdir%g +s%@top_srcdir@%$top_srcdir%g +s%@INSTALL@%$INSTALL%g +" $ac_file_inputs | (eval "$ac_sed_cmds") > $ac_file +fi; done +rm -f conftest.s* + +EOF +cat >> $CONFIG_STATUS <> $CONFIG_STATUS <<\EOF + +exit 0 +EOF +chmod +x $CONFIG_STATUS +rm -fr confdefs* $ac_clean_files +test "$no_create" = yes || ${CONFIG_SHELL-/bin/sh} $CONFIG_STATUS || exit 1 + diff --git a/forester/archive/RIO/others/puzzle_dqo/configure.in b/forester/archive/RIO/others/puzzle_dqo/configure.in new file mode 100644 index 0000000..57f0e27 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/configure.in @@ -0,0 +1,117 @@ + +dnl Disable caching. +define([AC_CACHE_LOAD], )dnl +define([AC_CACHE_SAVE], )dnl + +dnl Process this file with autoconf to produce a configure script. +AC_INIT(src/ml.h) + +AM_INIT_AUTOMAKE(tree-puzzle, 5.0) + +dnl Checks for programs. +AC_PROG_CC +AC_PROG_CC_C_O +AC_PROG_INSTALL +AC_PROG_MAKE_SET + + +AC_DEFUN(AC_TEST_MPICC,[dnl + if test "$1" != "" ; then + if test "$MPICCSET" = "" ; then +cat > conftest.c < +int main (int argc, char **argv) +{ +MPI_Init(&argc,&argv); +MPI_Finalize(); +exit(0); +} +EOF + + +MPICC=$1 +dnl if test "$MPICC" != "$CC" ; then +dnl +dnl fi + + if test "$MPICC" != "" ; then + AC_MSG_CHECKING(whether $MPICC works as MPI compiler) + $MPICC conftest.c -o conftest > /dev/null 2>&1 + if test $? = 0 ; then + AC_MSG_RESULT(yes) + #MPICC=$MPICC + MPILIBS= + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + AC_MSG_RESULT(no) + AC_MSG_CHECKING(whether $MPICC needs -lmpi) + $MPICC conftest.c -o conftest -lmpi > /dev/null 2>&1 + if test $? 
= 0 ; then + AC_MSG_RESULT(yes) + #MPICC=$PCC + MPILIBS=-lmpi + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + AC_MSG_RESULT(no) + MPICC= + MPILIBS= + MPICCSET= + PPUZZLE= + fi + fi + fi + rm -f conftest* + fi + fi ]) + +if test "$MPICC" != "" ; then + AC_PATH_PROG(MPICC0, $MPICC) +fi +AC_PATH_PROG(MPICC1, mpcc) +AC_PATH_PROG(MPICC2, hcc) +AC_PATH_PROG(MPICC3, mpicc) +AC_PATH_PROG(MPICC4, mpicc_lam) +AC_PATH_PROG(MPICC5, mpicc_mpich) + +AC_TEST_MPICC($MPICC0) +AC_TEST_MPICC($MPICC1) +AC_TEST_MPICC($MPICC2) +AC_TEST_MPICC($MPICC3) +AC_TEST_MPICC($MPICC4) +AC_TEST_MPICC($MPICC5) + +ac_cv_prog_MPICC=$MPICC + +AC_SUBST(MPICC) +AC_SUBST(MPILIBS) +AC_SUBST(MPIDEFS) +AC_SUBST(MPICFLAGS) +AC_SUBST(PPUZZLE) + +dnl Checks for libraries. +dnl Replace `main' with a function in -lm: +AC_CHECK_LIB(m, main) +dnl AC_CHECK_LIB(mpi, main) + +dnl Checks for header files. +AC_HEADER_STDC +AC_CHECK_HEADERS(limits.h) +dnl AC_HAVE_HEADERS(mpi.h) + +dnl AC_HAVE_HEADERS(rpc/xdr.h) + + +dnl Checks for typedefs, structures, and compiler characteristics. +AC_C_CONST +AC_TYPE_SIZE_T + +dnl Checks for library functions. +dnl AC_CHECK_FUNCS(xdr_u_char) +dnl AC_CHECK_FUNCS(xdr_double) +dnl AC_CHECK_FUNCS(xdrstdio_create) +dnl AC_CHECK_FUNCS(xdr_destroy) +dnl AC_CHECK_FUNCS(xdr_inline) + +AC_OUTPUT(Makefile src/Makefile src/test doc/Makefile data/Makefile) diff --git a/forester/archive/RIO/others/puzzle_dqo/data/Makefile b/forester/archive/RIO/others/puzzle_dqo/data/Makefile new file mode 100644 index 0000000..6f5f672 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/data/Makefile @@ -0,0 +1,177 @@ +# Generated automatically from Makefile.in by configure. +# Makefile.in generated automatically by automake 1.4 from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + +SHELL = /bin/sh + +srcdir = . +top_srcdir = .. +prefix = /usr/local +exec_prefix = ${prefix} + +bindir = ${exec_prefix}/bin +sbindir = ${exec_prefix}/sbin +libexecdir = ${exec_prefix}/libexec +datadir = ${prefix}/share +sysconfdir = ${prefix}/etc +sharedstatedir = ${prefix}/com +localstatedir = ${prefix}/var +libdir = ${exec_prefix}/lib +infodir = ${prefix}/info +mandir = ${prefix}/man +includedir = ${prefix}/include +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/tree-puzzle +pkglibdir = $(libdir)/tree-puzzle +pkgincludedir = $(includedir)/tree-puzzle + +top_builddir = .. + +ACLOCAL = aclocal +AUTOCONF = autoconf +AUTOMAKE = automake +AUTOHEADER = autoheader + +INSTALL = /usr/bin/install -c +INSTALL_PROGRAM = ${INSTALL} $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = ${INSTALL} -m 644 +INSTALL_SCRIPT = ${INSTALL_PROGRAM} +transform = s,x,x, + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +CC = gcc +MAKEINFO = makeinfo +MPICC = +MPICC0 = +MPICC1 = +MPICC2 = +MPICC3 = +MPICC4 = +MPICC5 = +MPICFLAGS = +MPIDEFS = +MPILIBS = +PACKAGE = tree-puzzle +PPUZZLE = +VERSION = 5.0 + +EXTRA_DIST = atp6.a globin.a marswolf.n primates.b +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_CLEAN_FILES = +DIST_COMMON = Makefile.am Makefile.in + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = tar +GZIP_ENV = --best +all: all-redirect +.SUFFIXES: +$(srcdir)/Makefile.in: Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu --include-deps data/Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= 
$(SHELL) ./config.status + +tags: TAGS +TAGS: + + +distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir) + +subdir = data + +# Populate $(distdir) with every entry of DISTFILES taken from $(srcdir): +# directories are copied recursively; a file already present is kept, +# otherwise it is hard-linked, falling back to a copy (errors ignored). +distdir: $(DISTFILES) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done +info-am: +info: info-am +dvi-am: +dvi: dvi-am +check-am: all-am +check: check-am +installcheck-am: +installcheck: installcheck-am +install-exec-am: +install-exec: install-exec-am + +install-data-am: +install-data: install-data-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-am +uninstall-am: +uninstall: uninstall-am +all-am: Makefile +all-redirect: all-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-generic + +mostlyclean: mostlyclean-am + +clean-am: clean-generic mostlyclean-am + +clean: clean-am + +distclean-am: distclean-generic clean-am + +distclean: distclean-am + +maintainer-clean-am: maintainer-clean-generic distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." + +maintainer-clean: maintainer-clean-am + +.PHONY: tags distdir info-am info dvi-am dvi check check-am \ +installcheck-am installcheck install-exec-am install-exec \ +install-data-am install-data install-am install uninstall-am uninstall \ +all-redirect all-am all installdirs mostlyclean-generic \ +distclean-generic clean-generic maintainer-clean-generic clean \ +mostlyclean distclean maintainer-clean + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. 
+# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_dqo/data/Makefile.am b/forester/archive/RIO/others/puzzle_dqo/data/Makefile.am new file mode 100644 index 0000000..9589f1e --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/data/Makefile.am @@ -0,0 +1 @@ +EXTRA_DIST = atp6.a globin.a marswolf.n primates.b diff --git a/forester/archive/RIO/others/puzzle_dqo/data/Makefile.in b/forester/archive/RIO/others/puzzle_dqo/data/Makefile.in new file mode 100644 index 0000000..f844e6e --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/data/Makefile.in @@ -0,0 +1,177 @@ +# Makefile.in generated automatically by automake 1.4 from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + +SHELL = @SHELL@ + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ + +bindir = @bindir@ +sbindir = @sbindir@ +libexecdir = @libexecdir@ +datadir = @datadir@ +sysconfdir = @sysconfdir@ +sharedstatedir = @sharedstatedir@ +localstatedir = @localstatedir@ +libdir = @libdir@ +infodir = @infodir@ +mandir = @mandir@ +includedir = @includedir@ +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ + +top_builddir = .. 
+ +ACLOCAL = @ACLOCAL@ +AUTOCONF = @AUTOCONF@ +AUTOMAKE = @AUTOMAKE@ +AUTOHEADER = @AUTOHEADER@ + +INSTALL = @INSTALL@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +transform = @program_transform_name@ + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +CC = @CC@ +MAKEINFO = @MAKEINFO@ +MPICC = @MPICC@ +MPICC0 = @MPICC0@ +MPICC1 = @MPICC1@ +MPICC2 = @MPICC2@ +MPICC3 = @MPICC3@ +MPICC4 = @MPICC4@ +MPICC5 = @MPICC5@ +MPICFLAGS = @MPICFLAGS@ +MPIDEFS = @MPIDEFS@ +MPILIBS = @MPILIBS@ +PACKAGE = @PACKAGE@ +PPUZZLE = @PPUZZLE@ +VERSION = @VERSION@ + +EXTRA_DIST = atp6.a globin.a marswolf.n primates.b +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_CLEAN_FILES = +DIST_COMMON = Makefile.am Makefile.in + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = tar +GZIP_ENV = --best +all: all-redirect +.SUFFIXES: +$(srcdir)/Makefile.in: Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu --include-deps data/Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +tags: TAGS +TAGS: + + +distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir) + +subdir = data + +# Populate $(distdir) with every entry of DISTFILES taken from $(srcdir): +# directories are copied recursively; a file already present is kept, +# otherwise it is hard-linked, falling back to a copy (errors ignored). +distdir: $(DISTFILES) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done +info-am: +info: info-am +dvi-am: +dvi: dvi-am +check-am: all-am +check: check-am +installcheck-am: +installcheck: installcheck-am +install-exec-am: +install-exec: install-exec-am + +install-data-am: +install-data: install-data-am + +install-am: all-am + @$(MAKE) 
$(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-am +uninstall-am: +uninstall: uninstall-am +all-am: Makefile +all-redirect: all-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-generic + +mostlyclean: mostlyclean-am + +clean-am: clean-generic mostlyclean-am + +clean: clean-am + +distclean-am: distclean-generic clean-am + +distclean: distclean-am + +maintainer-clean-am: maintainer-clean-generic distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." + +maintainer-clean: maintainer-clean-am + +.PHONY: tags distdir info-am info dvi-am dvi check check-am \ +installcheck-am installcheck install-exec-am install-exec \ +install-data-am install-data install-am install uninstall-am uninstall \ +all-redirect all-am all installdirs mostlyclean-generic \ +distclean-generic clean-generic maintainer-clean-generic clean \ +mostlyclean distclean maintainer-clean + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_dqo/doc/Makefile b/forester/archive/RIO/others/puzzle_dqo/doc/Makefile new file mode 100644 index 0000000..0f281d4 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/doc/Makefile @@ -0,0 +1,177 @@ +# Generated automatically from Makefile.in by configure. +# Makefile.in generated automatically by automake 1.4 from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. 
+# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + +SHELL = /bin/sh + +srcdir = . +top_srcdir = .. +prefix = /usr/local +exec_prefix = ${prefix} + +bindir = ${exec_prefix}/bin +sbindir = ${exec_prefix}/sbin +libexecdir = ${exec_prefix}/libexec +datadir = ${prefix}/share +sysconfdir = ${prefix}/etc +sharedstatedir = ${prefix}/com +localstatedir = ${prefix}/var +libdir = ${exec_prefix}/lib +infodir = ${prefix}/info +mandir = ${prefix}/man +includedir = ${prefix}/include +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/tree-puzzle +pkglibdir = $(libdir)/tree-puzzle +pkgincludedir = $(includedir)/tree-puzzle + +top_builddir = .. 
+ +ACLOCAL = aclocal +AUTOCONF = autoconf +AUTOMAKE = automake +AUTOHEADER = autoheader + +INSTALL = /usr/bin/install -c +INSTALL_PROGRAM = ${INSTALL} $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = ${INSTALL} -m 644 +INSTALL_SCRIPT = ${INSTALL_PROGRAM} +transform = s,x,x, + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +CC = gcc +MAKEINFO = makeinfo +MPICC = +MPICC0 = +MPICC1 = +MPICC2 = +MPICC3 = +MPICC4 = +MPICC5 = +MPICFLAGS = +MPIDEFS = +MPILIBS = +PACKAGE = tree-puzzle +PPUZZLE = +VERSION = 5.0 + +EXTRA_DIST = manual.html ppuzzle.gif puzzle.gif +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_CLEAN_FILES = +DIST_COMMON = Makefile.am Makefile.in + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = tar +GZIP_ENV = --best +all: all-redirect +.SUFFIXES: +$(srcdir)/Makefile.in: Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu --include-deps doc/Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +tags: TAGS +TAGS: + + +distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir) + +subdir = doc + +# Populate $(distdir) with every entry of DISTFILES taken from $(srcdir): +# directories are copied recursively; a file already present is kept, +# otherwise it is hard-linked, falling back to a copy (errors ignored). +distdir: $(DISTFILES) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done +info-am: +info: info-am +dvi-am: +dvi: dvi-am +check-am: all-am +check: check-am +installcheck-am: +installcheck: installcheck-am +install-exec-am: +install-exec: install-exec-am + +install-data-am: +install-data: install-data-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-am +uninstall-am: +uninstall: uninstall-am +all-am: Makefile +all-redirect: 
all-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-generic + +mostlyclean: mostlyclean-am + +clean-am: clean-generic mostlyclean-am + +clean: clean-am + +distclean-am: distclean-generic clean-am + +distclean: distclean-am + +maintainer-clean-am: maintainer-clean-generic distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." + +maintainer-clean: maintainer-clean-am + +.PHONY: tags distdir info-am info dvi-am dvi check check-am \ +installcheck-am installcheck install-exec-am install-exec \ +install-data-am install-data install-am install uninstall-am uninstall \ +all-redirect all-am all installdirs mostlyclean-generic \ +distclean-generic clean-generic maintainer-clean-generic clean \ +mostlyclean distclean maintainer-clean + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_dqo/doc/Makefile.am b/forester/archive/RIO/others/puzzle_dqo/doc/Makefile.am new file mode 100644 index 0000000..3cb95e6 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/doc/Makefile.am @@ -0,0 +1 @@ +EXTRA_DIST = manual.html ppuzzle.gif puzzle.gif diff --git a/forester/archive/RIO/others/puzzle_dqo/doc/Makefile.in b/forester/archive/RIO/others/puzzle_dqo/doc/Makefile.in new file mode 100644 index 0000000..e48590c --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/doc/Makefile.in @@ -0,0 +1,177 @@ +# Makefile.in generated automatically by automake 1.4 from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. 
+# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + +SHELL = @SHELL@ + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ + +bindir = @bindir@ +sbindir = @sbindir@ +libexecdir = @libexecdir@ +datadir = @datadir@ +sysconfdir = @sysconfdir@ +sharedstatedir = @sharedstatedir@ +localstatedir = @localstatedir@ +libdir = @libdir@ +infodir = @infodir@ +mandir = @mandir@ +includedir = @includedir@ +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ + +top_builddir = .. 
+ +ACLOCAL = @ACLOCAL@ +AUTOCONF = @AUTOCONF@ +AUTOMAKE = @AUTOMAKE@ +AUTOHEADER = @AUTOHEADER@ + +INSTALL = @INSTALL@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +transform = @program_transform_name@ + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +CC = @CC@ +MAKEINFO = @MAKEINFO@ +MPICC = @MPICC@ +MPICC0 = @MPICC0@ +MPICC1 = @MPICC1@ +MPICC2 = @MPICC2@ +MPICC3 = @MPICC3@ +MPICC4 = @MPICC4@ +MPICC5 = @MPICC5@ +MPICFLAGS = @MPICFLAGS@ +MPIDEFS = @MPIDEFS@ +MPILIBS = @MPILIBS@ +PACKAGE = @PACKAGE@ +PPUZZLE = @PPUZZLE@ +VERSION = @VERSION@ + +EXTRA_DIST = manual.html ppuzzle.gif puzzle.gif +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_CLEAN_FILES = +DIST_COMMON = Makefile.am Makefile.in + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = tar +GZIP_ENV = --best +all: all-redirect +.SUFFIXES: +$(srcdir)/Makefile.in: Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu --include-deps doc/Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +tags: TAGS +TAGS: + + +distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir) + +subdir = doc + +# Populate $(distdir) with every entry of DISTFILES taken from $(srcdir): +# directories are copied recursively; a file already present is kept, +# otherwise it is hard-linked, falling back to a copy (errors ignored). +distdir: $(DISTFILES) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done +info-am: +info: info-am +dvi-am: +dvi: dvi-am +check-am: all-am +check: check-am +installcheck-am: +installcheck: installcheck-am +install-exec-am: +install-exec: install-exec-am + +install-data-am: +install-data: install-data-am + +install-am: all-am + @$(MAKE) 
install-exec-am install-data-am +install: install-am +uninstall-am: +uninstall: uninstall-am +all-am: Makefile +all-redirect: all-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-generic + +mostlyclean: mostlyclean-am + +clean-am: clean-generic mostlyclean-am + +clean: clean-am + +distclean-am: distclean-generic clean-am + +distclean: distclean-am + +maintainer-clean-am: maintainer-clean-generic distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." + +maintainer-clean: maintainer-clean-am + +.PHONY: tags distdir info-am info dvi-am dvi check check-am \ +installcheck-am installcheck install-exec-am install-exec \ +install-data-am install-data install-am install uninstall-am uninstall \ +all-redirect all-am all installdirs mostlyclean-generic \ +distclean-generic clean-generic maintainer-clean-generic clean \ +mostlyclean distclean maintainer-clean + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_dqo/install-sh b/forester/archive/RIO/others/puzzle_dqo/install-sh new file mode 100755 index 0000000..e9de238 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/install-sh @@ -0,0 +1,251 @@ +#!/bin/sh +# +# install - install a program, script, or datafile +# This comes from X11R5 (mit/util/scripts/install.sh). 
+# +# Copyright 1991 by the Massachusetts Institute of Technology +# +# Permission to use, copy, modify, distribute, and sell this software and its +# documentation for any purpose is hereby granted without fee, provided that +# the above copyright notice appear in all copies and that both that +# copyright notice and this permission notice appear in supporting +# documentation, and that the name of M.I.T. not be used in advertising or +# publicity pertaining to distribution of the software without specific, +# written prior permission. M.I.T. makes no representations about the +# suitability of this software for any purpose. It is provided "as is" +# without express or implied warranty. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. It can only install one file at a time, a restriction +# shared with many OS's install programs. + + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit="${DOITPROG-}" + + +# put in absolute paths if you don't have them in your path; or use env. vars. 
+ +mvprog="${MVPROG-mv}" +cpprog="${CPPROG-cp}" +chmodprog="${CHMODPROG-chmod}" +chownprog="${CHOWNPROG-chown}" +chgrpprog="${CHGRPPROG-chgrp}" +stripprog="${STRIPPROG-strip}" +rmprog="${RMPROG-rm}" +mkdirprog="${MKDIRPROG-mkdir}" + +transformbasename="" +transform_arg="" +instcmd="$mvprog" +chmodcmd="$chmodprog 0755" +chowncmd="" +chgrpcmd="" +stripcmd="" +rmcmd="$rmprog -f" +mvcmd="$mvprog" +src="" +dst="" +dir_arg="" + +while [ x"$1" != x ]; do + case $1 in + -c) instcmd="$cpprog" + shift + continue;; + + -d) dir_arg=true + shift + continue;; + + -m) chmodcmd="$chmodprog $2" + shift + shift + continue;; + + -o) chowncmd="$chownprog $2" + shift + shift + continue;; + + -g) chgrpcmd="$chgrpprog $2" + shift + shift + continue;; + + -s) stripcmd="$stripprog" + shift + continue;; + + -t=*) transformarg=`echo $1 | sed 's/-t=//'` + shift + continue;; + + -b=*) transformbasename=`echo $1 | sed 's/-b=//'` + shift + continue;; + + *) if [ x"$src" = x ] + then + src=$1 + else + # this colon is to work around a 386BSD /bin/sh bug + : + dst=$1 + fi + shift + continue;; + esac +done + +if [ x"$src" = x ] +then + echo "install: no input file specified" + exit 1 +else + true +fi + +if [ x"$dir_arg" != x ]; then + dst=$src + src="" + + if [ -d $dst ]; then + instcmd=: + chmodcmd="" + else + instcmd=mkdir + fi +else + +# Waiting for this to be detected by the "$instcmd $src $dsttmp" command +# might cause directories to be created, which would be especially bad +# if $src (and thus $dsttmp) contains '*'. 
+ + if [ -f $src -o -d $src ] + then + true + else + echo "install: $src does not exist" + exit 1 + fi + + if [ x"$dst" = x ] + then + echo "install: no destination specified" + exit 1 + else + true + fi + +# If destination is a directory, append the input filename; if your system +# does not like double slashes in filenames, you may need to add some logic + + if [ -d $dst ] + then + dst="$dst"/`basename $src` + else + true + fi +fi + +## this sed command emulates the dirname command +dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` + +# Make sure that the destination directory exists. +# this part is taken from Noah Friedman's mkinstalldirs script + +# Skip lots of stat calls in the usual case. +if [ ! -d "$dstdir" ]; then +defaultIFS=' +' +IFS="${IFS-${defaultIFS}}" + +oIFS="${IFS}" +# Some sh's can't handle IFS=/ for some reason. +IFS='%' +set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'` +IFS="${oIFS}" + +pathcomp='' + +while [ $# -ne 0 ] ; do + pathcomp="${pathcomp}${1}" + shift + + if [ ! -d "${pathcomp}" ] ; + then + $mkdirprog "${pathcomp}" + else + true + fi + + pathcomp="${pathcomp}/" +done +fi + +if [ x"$dir_arg" != x ] +then + $doit $instcmd $dst && + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi +else + +# If we're going to rename the final executable, determine the name now. + + if [ x"$transformarg" = x ] + then + dstfile=`basename $dst` + else + dstfile=`basename $dst $transformbasename | + sed $transformarg`$transformbasename + fi + +# don't allow the sed command to completely eliminate the filename + + if [ x"$dstfile" = x ] + then + dstfile=`basename $dst` + else + true + fi + +# Make a temp file name in the proper directory. 
+ + dsttmp=$dstdir/#inst.$$# + +# Move or copy the file name to the temp name + + $doit $instcmd $src $dsttmp && + + trap "rm -f ${dsttmp}" 0 && + +# and set any options; do chmod last to preserve setuid bits + +# If any of these fail, we abort the whole thing. If we want to +# ignore errors from any of these, just make sure not to ignore +# errors from the above "$doit $instcmd $src $dsttmp" command. + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi && + +# Now rename the file to the real destination. + + $doit $rmcmd -f $dstdir/$dstfile && + $doit $mvcmd $dsttmp $dstdir/$dstfile + +fi && + + +exit 0 diff --git a/forester/archive/RIO/others/puzzle_dqo/missing b/forester/archive/RIO/others/puzzle_dqo/missing new file mode 100755 index 0000000..7789652 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/missing @@ -0,0 +1,190 @@ +#! /bin/sh +# Common stub for a few missing GNU programs while installing. +# Copyright (C) 1996, 1997 Free Software Foundation, Inc. +# Franc,ois Pinard , 1996. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +# 02111-1307, USA. 
+ +if test $# -eq 0; then + echo 1>&2 "Try \`$0 --help' for more information" + exit 1 +fi + +case "$1" in + + -h|--h|--he|--hel|--help) + echo "\ +$0 [OPTION]... PROGRAM [ARGUMENT]... + +Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an +error status if there is no known handling for PROGRAM. + +Options: + -h, --help display this help and exit + -v, --version output version information and exit + +Supported PROGRAM values: + aclocal touch file \`aclocal.m4' + autoconf touch file \`configure' + autoheader touch file \`config.h.in' + automake touch all \`Makefile.in' files + bison create \`y.tab.[ch]', if possible, from existing .[ch] + flex create \`lex.yy.c', if possible, from existing .c + lex create \`lex.yy.c', if possible, from existing .c + makeinfo touch the output file + yacc create \`y.tab.[ch]', if possible, from existing .[ch]" + ;; + + -v|--v|--ve|--ver|--vers|--versi|--versio|--version) + echo "missing - GNU libit 0.0" + ;; + + -*) + echo 1>&2 "$0: Unknown \`$1' option" + echo 1>&2 "Try \`$0 --help' for more information" + exit 1 + ;; + + aclocal) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`acinclude.m4' or \`configure.in'. You might want + to install the \`Automake' and \`Perl' packages. Grab them from + any GNU archive site." + touch aclocal.m4 + ;; + + autoconf) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`configure.in'. You might want to install the + \`Autoconf' and \`GNU m4' packages. Grab them from any GNU + archive site." + touch configure + ;; + + autoheader) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`acconfig.h' or \`configure.in'. You might want + to install the \`Autoconf' and \`GNU m4' packages. Grab them + from any GNU archive site." 
+ files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' configure.in` + test -z "$files" && files="config.h" + touch_files= + for f in $files; do + case "$f" in + *:*) touch_files="$touch_files "`echo "$f" | + sed -e 's/^[^:]*://' -e 's/:.*//'`;; + *) touch_files="$touch_files $f.in";; + esac + done + touch $touch_files + ;; + + automake) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`Makefile.am', \`acinclude.m4' or \`configure.in'. + You might want to install the \`Automake' and \`Perl' packages. + Grab them from any GNU archive site." + find . -type f -name Makefile.am -print | + sed 's/\.am$/.in/' | + while read f; do touch "$f"; done + ;; + + bison|yacc) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified a \`.y' file. You may need the \`Bison' package + in order for those modifications to take effect. You can get + \`Bison' from any GNU archive site." + rm -f y.tab.c y.tab.h + if [ $# -ne 1 ]; then + eval LASTARG="\${$#}" + case "$LASTARG" in + *.y) + SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'` + if [ -f "$SRCFILE" ]; then + cp "$SRCFILE" y.tab.c + fi + SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'` + if [ -f "$SRCFILE" ]; then + cp "$SRCFILE" y.tab.h + fi + ;; + esac + fi + if [ ! -f y.tab.h ]; then + echo >y.tab.h + fi + if [ ! -f y.tab.c ]; then + echo 'main() { return 0; }' >y.tab.c + fi + ;; + + lex|flex) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified a \`.l' file. You may need the \`Flex' package + in order for those modifications to take effect. You can get + \`Flex' from any GNU archive site." + rm -f lex.yy.c + if [ $# -ne 1 ]; then + eval LASTARG="\${$#}" + case "$LASTARG" in + *.l) + SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'` + if [ -f "$SRCFILE" ]; then + cp "$SRCFILE" lex.yy.c + fi + ;; + esac + fi + if [ ! 
-f lex.yy.c ]; then + echo 'main() { return 0; }' >lex.yy.c + fi + ;; + + makeinfo) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified a \`.texi' or \`.texinfo' file, or any other file + indirectly affecting the aspect of the manual. The spurious + call might also be the consequence of using a buggy \`make' (AIX, + DU, IRIX). You might want to install the \`Texinfo' package or + the \`GNU make' package. Grab either from any GNU archive site." + file=`echo "$*" | sed -n 's/.*-o \([^ ]*\).*/\1/p'` + if test -z "$file"; then + file=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'` + file=`sed -n '/^@setfilename/ { s/.* \([^ ]*\) *$/\1/; p; q; }' $file` + fi + touch $file + ;; + + *) + echo 1>&2 "\ +WARNING: \`$1' is needed, and you do not seem to have it handy on your + system. You might have modified some files without having the + proper tools for further handling them. Check the \`README' file, + it often tells you about the needed prerequirements for installing + this package. You may also peek at any GNU archive site, in case + some other package would contain this missing \`$1' program." + exit 1 + ;; +esac + +exit 0 diff --git a/forester/archive/RIO/others/puzzle_dqo/mkinstalldirs b/forester/archive/RIO/others/puzzle_dqo/mkinstalldirs new file mode 100755 index 0000000..bff4a66 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/mkinstalldirs @@ -0,0 +1,40 @@ +#! /bin/sh +# mkinstalldirs --- make directory hierarchy +# Author: Noah Friedman +# Created: 1993-05-16 +# Public domain + +# $Id: mkinstalldirs,v 1.1.1.1 2005/03/22 08:34:59 cmzmasek Exp $ + +errstatus=0 + +for file +do + set fnord `echo ":$file" | sed -ne 's/^:\//#/;s/^://;s/\// /g;s/^#/\//;p'` + shift + + pathcomp= + for d + do + pathcomp="$pathcomp$d" + case "$pathcomp" in + -* ) pathcomp=./$pathcomp ;; + esac + + if test ! -d "$pathcomp"; then + echo "mkdir $pathcomp" + + mkdir "$pathcomp" || lasterr=$? + + if test ! 
-d "$pathcomp"; then + errstatus=$lasterr + fi + fi + + pathcomp="$pathcomp/" + done +done + +exit $errstatus + +# mkinstalldirs ends here diff --git a/forester/archive/RIO/others/puzzle_dqo/src/Makefile b/forester/archive/RIO/others/puzzle_dqo/src/Makefile new file mode 100644 index 0000000..8a90eaa --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/Makefile @@ -0,0 +1,317 @@ +# Generated automatically from Makefile.in by configure. +# Makefile.in generated automatically by automake 1.4 from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + +SHELL = /bin/sh + +srcdir = . +top_srcdir = .. +prefix = /usr/local +exec_prefix = ${prefix} + +bindir = ${exec_prefix}/bin +sbindir = ${exec_prefix}/sbin +libexecdir = ${exec_prefix}/libexec +datadir = ${prefix}/share +sysconfdir = ${prefix}/etc +sharedstatedir = ${prefix}/com +localstatedir = ${prefix}/var +libdir = ${exec_prefix}/lib +infodir = ${prefix}/info +mandir = ${prefix}/man +includedir = ${prefix}/include +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/tree-puzzle +pkglibdir = $(libdir)/tree-puzzle +pkgincludedir = $(includedir)/tree-puzzle + +top_builddir = .. 
+ +ACLOCAL = aclocal +AUTOCONF = autoconf +AUTOMAKE = automake +AUTOHEADER = autoheader + +INSTALL = /usr/bin/install -c +INSTALL_PROGRAM = ${INSTALL} $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = ${INSTALL} -m 644 +INSTALL_SCRIPT = ${INSTALL_PROGRAM} +transform = s,x,x, + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +CC = gcc +MAKEINFO = makeinfo +MPICC = +MPICC0 = +MPICC1 = +MPICC2 = +MPICC3 = +MPICC4 = +MPICC5 = +MPICFLAGS = +MPIDEFS = +MPILIBS = +PACKAGE = tree-puzzle +PPUZZLE = +VERSION = 5.0 + +bin_PROGRAMS = puzzle + +puzzle_SOURCES = gamma.c ml1.c ml2.c ml3.c model1.c model2.c puzzle1.c puzzle2.c util.c ml.h util.h puzzle.h gamma.h +puzzle_LDADD = sgamma.o sml1.o sml2.o sml3.o smodel1.o smodel2.o spuzzle1.o spuzzle2.o sutil.o + +SDEFS = +SCFLAGS = +SLDFLAGS = -lm + +SCOMPILE = $(CC) $(SDEFS) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(SCFLAGS) $(CFLAGS) +SCCLD = $(CC) +SLINK = $(SCCLD) $(AM_CFLAGS) $(CFLAGS) $(SLDFLAGS) $(LDFLAGS) + +PCC = +PDEFS = -DPARALLEL +PCFLAGS = +PLDFLAGS = -lm +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_CLEAN_FILES = test +PROGRAMS = $(bin_PROGRAMS) + + +DEFS = -DPACKAGE=\"tree-puzzle\" -DVERSION=\"5.0\" -DHAVE_LIBM=1 -DSTDC_HEADERS=1 -DHAVE_LIMITS_H=1 -I. 
-I$(srcdir) +CPPFLAGS = +LDFLAGS = +LIBS = -lm +puzzle_OBJECTS = gamma.o ml1.o ml2.o ml3.o model1.o model2.o puzzle1.o \ +puzzle2.o util.o +puzzle_DEPENDENCIES = sgamma.o sml1.o sml2.o sml3.o smodel1.o smodel2.o \ +spuzzle1.o spuzzle2.o sutil.o +puzzle_LDFLAGS = +CFLAGS = -g -O2 +COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +CCLD = $(CC) +LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(LDFLAGS) -o $@ +DIST_COMMON = README Makefile.am Makefile.in test.in + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = gtar +GZIP_ENV = --best +SOURCES = $(puzzle_SOURCES) +OBJECTS = $(puzzle_OBJECTS) + +all: all-redirect +.SUFFIXES: +.SUFFIXES: .S .c .o .s +$(srcdir)/Makefile.in: Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu --include-deps src/Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +test: $(top_builddir)/config.status test.in + cd $(top_builddir) && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +mostlyclean-binPROGRAMS: + +clean-binPROGRAMS: + -test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS) + +distclean-binPROGRAMS: + +maintainer-clean-binPROGRAMS: + +install-binPROGRAMS: $(bin_PROGRAMS) + @$(NORMAL_INSTALL) + $(mkinstalldirs) $(DESTDIR)$(bindir) + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + if test -f $$p; then \ + echo " $(INSTALL_PROGRAM) $$p $(DESTDIR)$(bindir)/`echo $$p|sed 's/$(EXEEXT)$$//'|sed '$(transform)'|sed 's/$$/$(EXEEXT)/'`"; \ + $(INSTALL_PROGRAM) $$p $(DESTDIR)$(bindir)/`echo $$p|sed 's/$(EXEEXT)$$//'|sed '$(transform)'|sed 's/$$/$(EXEEXT)/'`; \ + else :; fi; \ + done + +uninstall-binPROGRAMS: + @$(NORMAL_UNINSTALL) + list='$(bin_PROGRAMS)'; for p in $$list; do \ + rm -f $(DESTDIR)$(bindir)/`echo $$p|sed 's/$(EXEEXT)$$//'|sed '$(transform)'|sed 's/$$/$(EXEEXT)/'`; \ + done + +.c.o: + 
$(COMPILE) -c $< + +.s.o: + $(COMPILE) -c $< + +.S.o: + $(COMPILE) -c $< + +mostlyclean-compile: + -rm -f *.o core *.core + +clean-compile: + +distclean-compile: + -rm -f *.tab.c + +maintainer-clean-compile: + +tags: TAGS + +ID: $(HEADERS) $(SOURCES) $(LISP) + list='$(SOURCES) $(HEADERS)'; \ + unique=`for i in $$list; do echo $$i; done | \ + awk ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + here=`pwd` && cd $(srcdir) \ + && mkid -f$$here/ID $$unique $(LISP) + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS)'; \ + unique=`for i in $$list; do echo $$i; done | \ + awk ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(ETAGS_ARGS)$$unique$(LISP)$$tags" \ + || (cd $(srcdir) && etags $(ETAGS_ARGS) $$tags $$unique $(LISP) -o $$here/TAGS) + +mostlyclean-tags: + +clean-tags: + +distclean-tags: + -rm -f TAGS ID + +maintainer-clean-tags: + +distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir) + +subdir = src + +distdir: $(DISTFILES) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done +info-am: +info: info-am +dvi-am: +dvi: dvi-am +check-am: all-am +check: check-am +installcheck-am: +installcheck: installcheck-am +install-exec-am: install-binPROGRAMS +install-exec: install-exec-am + +install-data-am: +install-data: install-data-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-am +uninstall-am: uninstall-binPROGRAMS +uninstall: uninstall-am +all-am: Makefile $(PROGRAMS) +all-redirect: all-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: + $(mkinstalldirs) $(DESTDIR)$(bindir) + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + 
-rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-binPROGRAMS mostlyclean-compile \ + mostlyclean-tags mostlyclean-generic + +mostlyclean: mostlyclean-am + +clean-am: clean-binPROGRAMS clean-compile clean-tags clean-generic \ + mostlyclean-am + +clean: clean-am + +distclean-am: distclean-binPROGRAMS distclean-compile distclean-tags \ + distclean-generic clean-am + +distclean: distclean-am + +maintainer-clean-am: maintainer-clean-binPROGRAMS \ + maintainer-clean-compile maintainer-clean-tags \ + maintainer-clean-generic distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." + +maintainer-clean: maintainer-clean-am + +.PHONY: mostlyclean-binPROGRAMS distclean-binPROGRAMS clean-binPROGRAMS \ +maintainer-clean-binPROGRAMS uninstall-binPROGRAMS install-binPROGRAMS \ +mostlyclean-compile distclean-compile clean-compile \ +maintainer-clean-compile tags mostlyclean-tags distclean-tags \ +clean-tags maintainer-clean-tags distdir info-am info dvi-am dvi check \ +check-am installcheck-am installcheck install-exec-am install-exec \ +install-data-am install-data install-am install uninstall-am uninstall \ +all-redirect all-am all installdirs mostlyclean-generic \ +distclean-generic clean-generic maintainer-clean-generic clean \ +mostlyclean distclean maintainer-clean + + +puzzle: $(puzzle_LDADD) $(puzzle_SOURCES) + $(SLINK) $(puzzle_LDADD) -o $@ + +sml1.o: ml1.c ml.h util.h + $(SCOMPILE) -c ml1.c && mv ml1.o $@ +sml2.o: ml2.c ml.h util.h + $(SCOMPILE) -c ml2.c && mv ml2.o $@ +sml3.o: ml3.c ml.h util.h gamma.h + $(SCOMPILE) -c ml3.c && mv ml3.o $@ +smodel1.o: model1.c ml.h util.h + $(SCOMPILE) -c model1.c && mv model1.o $@ +smodel2.o: model2.c ml.h util.h + $(SCOMPILE) -c model2.c && mv model2.o $@ +spuzzle1.o: puzzle1.c ml.h util.h puzzle.h gamma.h ppuzzle.h + $(SCOMPILE) -c puzzle1.c && mv 
puzzle1.o $@ +spuzzle2.o: puzzle2.c ml.h util.h puzzle.h ppuzzle.h + $(SCOMPILE) -c puzzle2.c && mv puzzle2.o $@ +sutil.o: util.c util.h + $(SCOMPILE) -c util.c && mv util.o $@ +sgamma.o: gamma.c gamma.h util.h + $(SCOMPILE) -c gamma.c && mv gamma.o $@ + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_dqo/src/Makefile.am b/forester/archive/RIO/others/puzzle_dqo/src/Makefile.am new file mode 100644 index 0000000..e28c498 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/Makefile.am @@ -0,0 +1,49 @@ +bin_PROGRAMS = puzzle + +puzzle_SOURCES = gamma.c ml1.c ml2.c ml3.c model1.c model2.c puzzle1.c puzzle2.c util.c ml.h util.h puzzle.h gamma.h +puzzle_LDADD = sgamma.o sml1.o sml2.o sml3.o smodel1.o smodel2.o spuzzle1.o spuzzle2.o sutil.o + +SDEFS = +SCFLAGS = +SLDFLAGS = @LIBS@ + +SCOMPILE = $(CC) $(SDEFS) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(SCFLAGS) $(CFLAGS) +SCCLD = $(CC) +SLINK = $(SCCLD) $(AM_CFLAGS) $(CFLAGS) $(SLDFLAGS) $(LDFLAGS) + + + +PCC = @MPICC@ +PDEFS = -DPARALLEL +PCFLAGS = +PLDFLAGS = @LIBS@ @MPILIBS@ + + + + +puzzle: $(puzzle_LDADD) $(puzzle_SOURCES) + $(SLINK) $(puzzle_LDADD) -o $@ + +sml1.o: ml1.c ml.h util.h + $(SCOMPILE) -c ml1.c && mv ml1.o $@ +sml2.o: ml2.c ml.h util.h + $(SCOMPILE) -c ml2.c && mv ml2.o $@ +sml3.o: ml3.c ml.h util.h gamma.h + $(SCOMPILE) -c ml3.c && mv ml3.o $@ +smodel1.o: model1.c ml.h util.h + $(SCOMPILE) -c model1.c && mv model1.o $@ +smodel2.o: model2.c ml.h util.h + $(SCOMPILE) -c model2.c && mv model2.o $@ +spuzzle1.o: puzzle1.c ml.h util.h puzzle.h gamma.h ppuzzle.h + $(SCOMPILE) -c puzzle1.c && mv puzzle1.o $@ +spuzzle2.o: puzzle2.c ml.h util.h puzzle.h ppuzzle.h + $(SCOMPILE) -c puzzle2.c && mv puzzle2.o $@ +sutil.o: util.c util.h + $(SCOMPILE) -c util.c && mv util.o $@ +sgamma.o: gamma.c gamma.h util.h + $(SCOMPILE) -c gamma.c && mv 
gamma.o $@ + + + + + diff --git a/forester/archive/RIO/others/puzzle_dqo/src/Makefile.in b/forester/archive/RIO/others/puzzle_dqo/src/Makefile.in new file mode 100644 index 0000000..a8fb19d --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/Makefile.in @@ -0,0 +1,317 @@ +# Makefile.in generated automatically by automake 1.4 from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + +SHELL = @SHELL@ + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ + +bindir = @bindir@ +sbindir = @sbindir@ +libexecdir = @libexecdir@ +datadir = @datadir@ +sysconfdir = @sysconfdir@ +sharedstatedir = @sharedstatedir@ +localstatedir = @localstatedir@ +libdir = @libdir@ +infodir = @infodir@ +mandir = @mandir@ +includedir = @includedir@ +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ + +top_builddir = .. 
+ +ACLOCAL = @ACLOCAL@ +AUTOCONF = @AUTOCONF@ +AUTOMAKE = @AUTOMAKE@ +AUTOHEADER = @AUTOHEADER@ + +INSTALL = @INSTALL@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +transform = @program_transform_name@ + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +CC = @CC@ +MAKEINFO = @MAKEINFO@ +MPICC = @MPICC@ +MPICC0 = @MPICC0@ +MPICC1 = @MPICC1@ +MPICC2 = @MPICC2@ +MPICC3 = @MPICC3@ +MPICC4 = @MPICC4@ +MPICC5 = @MPICC5@ +MPICFLAGS = @MPICFLAGS@ +MPIDEFS = @MPIDEFS@ +MPILIBS = @MPILIBS@ +PACKAGE = @PACKAGE@ +PPUZZLE = @PPUZZLE@ +VERSION = @VERSION@ + +bin_PROGRAMS = puzzle + +puzzle_SOURCES = gamma.c ml1.c ml2.c ml3.c model1.c model2.c puzzle1.c puzzle2.c util.c ml.h util.h puzzle.h gamma.h +puzzle_LDADD = sgamma.o sml1.o sml2.o sml3.o smodel1.o smodel2.o spuzzle1.o spuzzle2.o sutil.o + +SDEFS = +SCFLAGS = +SLDFLAGS = @LIBS@ + +SCOMPILE = $(CC) $(SDEFS) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(SCFLAGS) $(CFLAGS) +SCCLD = $(CC) +SLINK = $(SCCLD) $(AM_CFLAGS) $(CFLAGS) $(SLDFLAGS) $(LDFLAGS) + +PCC = @MPICC@ +PDEFS = -DPARALLEL +PCFLAGS = +PLDFLAGS = @LIBS@ @MPILIBS@ +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_CLEAN_FILES = test +PROGRAMS = $(bin_PROGRAMS) + + +DEFS = @DEFS@ -I. 
-I$(srcdir) +CPPFLAGS = @CPPFLAGS@ +LDFLAGS = @LDFLAGS@ +LIBS = @LIBS@ +puzzle_OBJECTS = gamma.o ml1.o ml2.o ml3.o model1.o model2.o puzzle1.o \ +puzzle2.o util.o +puzzle_DEPENDENCIES = sgamma.o sml1.o sml2.o sml3.o smodel1.o smodel2.o \ +spuzzle1.o spuzzle2.o sutil.o +puzzle_LDFLAGS = +CFLAGS = @CFLAGS@ +COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +CCLD = $(CC) +LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(LDFLAGS) -o $@ +DIST_COMMON = README Makefile.am Makefile.in test.in + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = gtar +GZIP_ENV = --best +SOURCES = $(puzzle_SOURCES) +OBJECTS = $(puzzle_OBJECTS) + +all: all-redirect +.SUFFIXES: +.SUFFIXES: .S .c .o .s +$(srcdir)/Makefile.in: Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu --include-deps src/Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +test: $(top_builddir)/config.status test.in + cd $(top_builddir) && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +mostlyclean-binPROGRAMS: + +clean-binPROGRAMS: + -test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS) + +distclean-binPROGRAMS: + +maintainer-clean-binPROGRAMS: + +install-binPROGRAMS: $(bin_PROGRAMS) + @$(NORMAL_INSTALL) + $(mkinstalldirs) $(DESTDIR)$(bindir) + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + if test -f $$p; then \ + echo " $(INSTALL_PROGRAM) $$p $(DESTDIR)$(bindir)/`echo $$p|sed 's/$(EXEEXT)$$//'|sed '$(transform)'|sed 's/$$/$(EXEEXT)/'`"; \ + $(INSTALL_PROGRAM) $$p $(DESTDIR)$(bindir)/`echo $$p|sed 's/$(EXEEXT)$$//'|sed '$(transform)'|sed 's/$$/$(EXEEXT)/'`; \ + else :; fi; \ + done + +uninstall-binPROGRAMS: + @$(NORMAL_UNINSTALL) + list='$(bin_PROGRAMS)'; for p in $$list; do \ + rm -f $(DESTDIR)$(bindir)/`echo $$p|sed 's/$(EXEEXT)$$//'|sed '$(transform)'|sed 's/$$/$(EXEEXT)/'`; 
\ + done + +.c.o: + $(COMPILE) -c $< + +.s.o: + $(COMPILE) -c $< + +.S.o: + $(COMPILE) -c $< + +mostlyclean-compile: + -rm -f *.o core *.core + +clean-compile: + +distclean-compile: + -rm -f *.tab.c + +maintainer-clean-compile: + +tags: TAGS + +ID: $(HEADERS) $(SOURCES) $(LISP) + list='$(SOURCES) $(HEADERS)'; \ + unique=`for i in $$list; do echo $$i; done | \ + awk ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + here=`pwd` && cd $(srcdir) \ + && mkid -f$$here/ID $$unique $(LISP) + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS)'; \ + unique=`for i in $$list; do echo $$i; done | \ + awk ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(ETAGS_ARGS)$$unique$(LISP)$$tags" \ + || (cd $(srcdir) && etags $(ETAGS_ARGS) $$tags $$unique $(LISP) -o $$here/TAGS) + +mostlyclean-tags: + +clean-tags: + +distclean-tags: + -rm -f TAGS ID + +maintainer-clean-tags: + +distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir) + +subdir = src + +distdir: $(DISTFILES) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done +info-am: +info: info-am +dvi-am: +dvi: dvi-am +check-am: all-am +check: check-am +installcheck-am: +installcheck: installcheck-am +install-exec-am: install-binPROGRAMS +install-exec: install-exec-am + +install-data-am: +install-data: install-data-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-am +uninstall-am: uninstall-binPROGRAMS +uninstall: uninstall-am +all-am: Makefile $(PROGRAMS) +all-redirect: all-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: + $(mkinstalldirs) $(DESTDIR)$(bindir) + + +mostlyclean-generic: + +clean-generic: + 
+distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-binPROGRAMS mostlyclean-compile \ + mostlyclean-tags mostlyclean-generic + +mostlyclean: mostlyclean-am + +clean-am: clean-binPROGRAMS clean-compile clean-tags clean-generic \ + mostlyclean-am + +clean: clean-am + +distclean-am: distclean-binPROGRAMS distclean-compile distclean-tags \ + distclean-generic clean-am + +distclean: distclean-am + +maintainer-clean-am: maintainer-clean-binPROGRAMS \ + maintainer-clean-compile maintainer-clean-tags \ + maintainer-clean-generic distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." + +maintainer-clean: maintainer-clean-am + +.PHONY: mostlyclean-binPROGRAMS distclean-binPROGRAMS clean-binPROGRAMS \ +maintainer-clean-binPROGRAMS uninstall-binPROGRAMS install-binPROGRAMS \ +mostlyclean-compile distclean-compile clean-compile \ +maintainer-clean-compile tags mostlyclean-tags distclean-tags \ +clean-tags maintainer-clean-tags distdir info-am info dvi-am dvi check \ +check-am installcheck-am installcheck install-exec-am install-exec \ +install-data-am install-data install-am install uninstall-am uninstall \ +all-redirect all-am all installdirs mostlyclean-generic \ +distclean-generic clean-generic maintainer-clean-generic clean \ +mostlyclean distclean maintainer-clean + + +puzzle: $(puzzle_LDADD) $(puzzle_SOURCES) + $(SLINK) $(puzzle_LDADD) -o $@ + +sml1.o: ml1.c ml.h util.h + $(SCOMPILE) -c ml1.c && mv ml1.o $@ +sml2.o: ml2.c ml.h util.h + $(SCOMPILE) -c ml2.c && mv ml2.o $@ +sml3.o: ml3.c ml.h util.h gamma.h + $(SCOMPILE) -c ml3.c && mv ml3.o $@ +smodel1.o: model1.c ml.h util.h + $(SCOMPILE) -c model1.c && mv model1.o $@ +smodel2.o: model2.c ml.h util.h + $(SCOMPILE) -c model2.c && mv model2.o $@ +spuzzle1.o: puzzle1.c ml.h util.h puzzle.h gamma.h ppuzzle.h + $(SCOMPILE) 
-c puzzle1.c && mv puzzle1.o $@ +spuzzle2.o: puzzle2.c ml.h util.h puzzle.h ppuzzle.h + $(SCOMPILE) -c puzzle2.c && mv puzzle2.o $@ +sutil.o: util.c util.h + $(SCOMPILE) -c util.c && mv util.o $@ +sgamma.o: gamma.c gamma.h util.h + $(SCOMPILE) -c gamma.c && mv gamma.o $@ + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_dqo/src/README b/forester/archive/RIO/others/puzzle_dqo/src/README new file mode 100644 index 0000000..9c89883 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/README @@ -0,0 +1 @@ +Sources of the TREE-PUZZLE package diff --git a/forester/archive/RIO/others/puzzle_dqo/src/gamma.c b/forester/archive/RIO/others/puzzle_dqo/src/gamma.c new file mode 100644 index 0000000..ee1f6df --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/gamma.c @@ -0,0 +1,346 @@ +/* + * gamma.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. 
+ */ + +#include <math.h> +#include "util.h" +#include "gamma.h" + +/* private prototypes */ +static double IncompleteGamma (double x, double alpha, double ln_gamma_alpha); +static double PointNormal (double prob); +static double PointChi2 (double prob, double v); + +/* Gamma density function */ +double densityGamma (double x, double shape) +{ + return pow (shape, shape) * pow (x, shape-1) / + exp (shape*x + LnGamma(shape)); +} + +/* Gamma cdf */ +double cdfGamma (double x, double shape) +{ + double result; + + result = IncompleteGamma (shape*x, shape, LnGamma(shape)); + + return result; +} + +/* Gamma inverse cdf */ +double icdfGamma (double y, double shape) +{ + double result; + + result = PointChi2 (y, 2.0*shape)/(2.0*shape); + + /* to avoid -1.0 */ + if (result < 0.0) + { + result = 0.0; + } + + return result; +} + +/* Gamma n-th moment */ +double momentGamma (int n, double shape) +{ + int i; + double tmp = 1.0; + + for (i = 1; i < n; i++) + { + tmp *= (shape + i)/shape; + } + + return tmp; +} + +/* The following code comes from tools.c in Yang's PAML package */ + +double LnGamma (double alpha) +{ +/* returns ln(gamma(alpha)) for alpha>0, accurate to 10 decimal places. + Stirling's formula is used for the central polynomial part of the procedure. + Pike MC & Hill ID (1966) Algorithm 291: Logarithm of the gamma function. + Communications of the Association for Computing Machinery, 9:684 +*/ + double x=alpha, f=0, z; + + if (x<7) { + f=1; z=x-1; + while (++z<7) f*=z; + x=z; f=-log(f); + } + z = 1/(x*x); + return f + (x-0.5)*log(x) - x + .918938533204673 + + (((-.000595238095238*z+.000793650793651)*z-.002777777777778)*z + +.083333333333333)/x; +} + +static double IncompleteGamma (double x, double alpha, double ln_gamma_alpha) +{ +/* returns the incomplete gamma ratio I(x,alpha) where x is the upper + limit of the integration and alpha is the shape parameter. 
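+ returns (-1) if in error + (1) series expansion if (alpha>x || x<=1) + (2) continued fraction otherwise + RATNEST FORTRAN by + Bhattacharjee GP (1970) The incomplete gamma integral. Applied Statistics, + 19: 285-287 (AS32) +*/ + int i; + double p=alpha, g=ln_gamma_alpha; + double accurate=1e-8, overflow=1e30; + double factor, gin=0, rn=0, a=0,b=0,an=0,dif=0, term=0, pn[6]; + + if (x==0) return (0); + if (x<0 || p<=0) return (-1); + + factor=exp(p*log(x)-x-g); + if (x>1 && x>=p) goto l30; + /* (1) series expansion */ + gin=1; term=1; rn=p; + l20: + rn++; + term*=x/rn; gin+=term; + + if (term > accurate) goto l20; + gin*=factor/p; + goto l50; + l30: + /* (2) continued fraction */ + a=1-p; b=a+x+1; term=0; + pn[0]=1; pn[1]=x; pn[2]=x+1; pn[3]=x*b; + gin=pn[2]/pn[3]; + l32: + a++; b+=2; term++; an=a*term; + for (i=0; i<2; i++) pn[i+4]=b*pn[i+2]-an*pn[i]; + if (pn[5] == 0) goto l35; + rn=pn[4]/pn[5]; dif=fabs(gin-rn); + if (dif>accurate) goto l34; + if (dif<=accurate*rn) goto l42; + l34: + gin=rn; + l35: + for (i=0; i<4; i++) pn[i]=pn[i+2]; + if (fabs(pn[4]) < overflow) goto l32; + for (i=0; i<4; i++) pn[i]/=overflow; + goto l32; + l42: + gin=1-factor*gin; + + l50: + return (gin); +} + + +/* functions concerning the CDF and percentage points of the gamma and + Chi2 distribution +*/ +static double PointNormal (double prob) +{ +/* returns z so that Prob{x<z}=prob where x ~ N(0,1) and (1e-12)<prob<1-(1e-12) + returns (-9999) if in error + Odeh RE & Evans JO (1974) The percentage points of the normal distribution. + Applied Statistics 22: 96-97 (AS70) + + Newer methods: + Wichura MJ (1988) Algorithm AS 241: the percentage points of the + normal distribution. 37: 477-484. + Beasley JD & Springer SG (1977). Algorithm AS 111: the percentage + points of the normal distribution. 26: 118-121. + +*/ + double a0=-.322232431088, a1=-1, a2=-.342242088547, a3=-.0204231210245; + double a4=-.453642210148e-4, b0=.0993484626060, b1=.588581570495; + double b2=.531103462366, b3=.103537752850, b4=.0038560700634; + double y, z=0, p=prob, p1; + + p1 = (p<0.5 ? p : 1-p); + if (p1<1e-20) return (-9999); + + y = sqrt (log(1/(p1*p1))); + z = y + ((((y*a4+a3)*y+a2)*y+a1)*y+a0) / ((((y*b4+b3)*y+b2)*y+b1)*y+b0); + return (p<0.5 ? -z : z); +} + + +static double PointChi2 (double prob, double v) +{ +/* returns z so that Prob{x<z}=prob where x is Chi2 distributed with df=v + returns -1 if in error. 0.000002<prob<0.999998 + RATNEST FORTRAN by + Best DJ & Roberts DE (1975) The percentage points of the + Chi2 distribution. Applied Statistics 24: 385-388. (AS91) + Converted into C by Ziheng Yang, Oct. 1993. +*/ + double e=.5e-6, aa=.6931471805, p=prob, g; + double xx, c, ch, a=0,q=0,p1=0,p2=0,t=0,x=0,b=0,s1,s2,s3,s4,s5,s6; + + if (p<.000002 || p>.999998 || v<=0) return (-1); + + g = LnGamma (v/2); + xx=v/2; c=xx-1; + if (v >= -1.24*log(p)) goto l1; + + ch=pow((p*xx*exp(g+xx*aa)), 1/xx); + if (ch-e<0) return (ch); + goto l4; +l1: + if (v>.32) goto l3; + ch=0.4; a=log(1-p); +l2: + q=ch; p1=1+ch*(4.67+ch); p2=ch*(6.73+ch*(6.66+ch)); + t=-0.5+(4.67+2*ch)/p1 - (6.73+ch*(13.32+3*ch))/p2; + ch-=(1-exp(a+g+.5*ch+c*aa)*p2/p1)/t; + if (fabs(q/ch-1)-.01 <= 0) goto l4; + else goto l2; + +l3: + x=PointNormal (p); + p1=0.222222/v; ch=v*pow((x*sqrt(p1)+1-p1), 3.0); + if (ch>2.2*v+6) ch=-2*(log(1-p)-c*log(.5*ch)+g); +l4: + + do + { + q=ch; p1=.5*ch; + if ((t=IncompleteGamma 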
(p1, xx, g))<0) { + return (-1); + } + p2=p-t; + t=p2*exp(xx*aa+g+p1-c*log(ch)); + b=t/ch; a=0.5*t-b*c; + + s1=(210+a*(140+a*(105+a*(84+a*(70+60*a))))) / 420; + s2=(420+a*(735+a*(966+a*(1141+1278*a))))/2520; + s3=(210+a*(462+a*(707+932*a)))/2520; + s4=(252+a*(672+1182*a)+c*(294+a*(889+1740*a)))/5040; + s5=(84+264*a+c*(175+606*a))/2520; + s6=(120+c*(346+127*c))/5040; + ch+=t*(1+0.5*t*s1-b*c*(s1-b*(s2-b*(s3-b*(s4-b*(s5-b*s6)))))); + } + while (fabs(q/ch-1) > e); + + return (ch); +} + + +/* Incomplete Gamma function Q(a,x) + - this is a cleanroom implementation of NRs gammq(a,x) +*/ +double IncompleteGammaQ (double a, double x) +{ + return 1.0-IncompleteGamma (x, a, LnGamma(a)); +} + + +/* probability that the observed chi-square + exceeds chi2 even if model is correct */ +double chi2prob (int deg, double chi2) +{ + return IncompleteGammaQ (0.5*deg, 0.5*chi2); +} + + + +/* chi square test + ef expected frequencies (sum up to 1 !!) + of observed frequencies (sum up to the number of samples) + numcat number of categories + returns critical significance level */ +double chi2test(double *ef, int *of, int numcat, int *chi2fail) +{ + double chi2, criticals, efn; + int i, below1, below5, reducedcat; + int samples; + + *chi2fail = FALSE; + reducedcat = numcat; + below1 = 0; + below5 = 0; + + /* compute number of samples */ + samples = 0; + for (i = 0; i < numcat; i++) + samples = samples + of[i]; + + /* compute chi square */ + chi2 = 0; + for (i = 0; i < numcat; i++) { + efn = ef[i]*((double) samples); + if (efn < 1.0) below1++; + if (efn < 5.0) below5++; + if (efn == 0.0) { + reducedcat--; + fprintf(stdout, "FPE error: samples=%d, ef[%d]=%f, of[%d]=%d, efn=%f, nc=%d, rc=%d\n", + samples, i, ef[i], i, of[i], efn, numcat, reducedcat); + fprintf(stdout, "PLEASE REPORT THIS ERROR TO DEVELOPERS !!!\n"); + fflush(stdout); + } else chi2 = chi2 + ((double) of[i]-efn)*((double) of[i]-efn)/efn; + } + + /* compute significance */ + criticals = chi2prob (numcat-1, chi2); + + /* no 
expected frequency category (sum up to # samples) below 1.0 */ + if (below1 > 0) *chi2fail = TRUE; + /* no more than 1/5 of the frequency categories below 5.0 */ + if (below5 > (int) floor(samples/5.0)) *chi2fail = TRUE; + + return criticals; +} + + +/* chi square test + ef expected frequencies (sum up to 1 !!) + of observed frequencies (sum up to the number of samples) + numcat number of categories + returns critical significance level */ +double altchi2test(double *ef, int *of, int numcat, int *chi2fail) +{ + double chi2, criticals, efn; + int i, below1, below5; + int samples; + + *chi2fail = FALSE; + below1 = 0; + below5 = 0; + + /* compute number of samples */ + samples = 0; + for (i = 0; i < numcat; i++) + samples = samples + of[i]; + + /* compute chi square */ + chi2 = 0; + for (i = 0; i < numcat; i++) { + efn = ef[i]*((double) samples); + if (efn < 1.0) below1++; + if (efn < 5.0) below5++; + chi2 = chi2 + ((double) of[i]-efn)*((double) of[i]-efn)/efn; + } + + /* compute significance */ + criticals = chi2prob (numcat-1, chi2); + + /* no expected frequency category (sum up to # samples) below 1.0 */ + if (below1 > 0) *chi2fail = TRUE; + /* no more than 1/5 of the frequency categories below 5.0 */ + if (below5 > (int) floor(samples/5.0)) *chi2fail = TRUE; + + return criticals; +} diff --git a/forester/archive/RIO/others/puzzle_dqo/src/gamma.h b/forester/archive/RIO/others/puzzle_dqo/src/gamma.h new file mode 100644 index 0000000..975f4ee --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/gamma.h @@ -0,0 +1,30 @@ +/* + * gamma.h + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. 
+ */ + +#ifndef _GAMMA_ +#define _GAMMA_ + +double densityGamma (double, double); +double cdfGamma (double, double); +double icdfGamma (double, double); +double momentGamma (int, double); + +double LnGamma (double); +double IncompleteGammaQ (double, double); + +double chi2prob (int, double); +double chi2test (double *, int *, int , int *); + + +#endif /* _GAMMA_ */ diff --git a/forester/archive/RIO/others/puzzle_dqo/src/ml.h b/forester/archive/RIO/others/puzzle_dqo/src/ml.h new file mode 100644 index 0000000..7dfd2b0 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/ml.h @@ -0,0 +1,279 @@ +/* + * ml.h + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +#ifndef _ML_ +#define _ML_ + +/* definitions */ + +#define MINTS 0.20 /* Ts/Tv parameter */ +#define MAXTS 30.0 +#define MINYR 0.10 /* Y/R Ts parameter */ +#define MAXYR 6.00 +#define MINFI 0.00 /* fraction invariable sites */ +#define MAXFI 0.99 /* only for input */ +#define MINGE 0.01 /* rate heterogeneity parameter */ +#define MAXGE 0.99 +#define MINCAT 4 /* discrete Gamma categories */ +#define MAXCAT 16 + +#define RMHROOT 5.0 /* upper relative bound for height of root */ +#define MAXARC 900.0 /* upper limit on branch length (PAM) = 6.0 */ +#define MINARC 0.001 /* lower limit on branch length (PAM) = 0.00001 */ +#define EPSILON 0.0001 /* error in branch length (PAM) = 0.000001 */ +#define HEPSILON 0.0001 /* error in node and root heights */ +#define MAXIT 100 /* maximum number of iterates of smoothing */ +#define MINFDIFF 0.00002 /* lower limit on base frequency differences */ +#define MINFREQ 0.0001 /* lower limit on base frequencies = 0.01% */ +#define NUMQBRNCH 5 /* number of branches 
in a quartet */ +#define NUMQIBRNCH 1 /* number of internal branches in a quartet */ +#define NUMQSPC 4 /* number of sequences in a quartet */ + +/* 2D minimisation */ +#define PEPS1 0.01 /* epsilon substitution process estimation */ +#define PEPS2 0.01 /* epsilon rate heterogeneity estimation */ + +/* quartet series */ +#define MINPERTAXUM 2 +#define MAXPERTAXUM 6 +#define TSDIFF 0.20 +#define YRDIFF 0.10 + +/* type definitions */ + +typedef struct node +{ + struct node *isop; + struct node *kinp; + int descen; + int number; + double length; + double lengthc; + double varlen; + double height; + double varheight; + ivector paths; + cvector eprob; + dcube partials; /* partial likelihoods */ + char *label; /* internal labels */ +} Node; + +typedef struct tree +{ + Node *rootp; + Node **ebrnchp; /* list of pointers to external branches */ + Node **ibrnchp; /* list of pointers to internal branches */ + double lklhd; /* total log-likelihood */ + double lklhdc; /* total log-likelihood clock */ + dmatrix condlkl; /* likelihoods for each pattern and non-zero rate */ + double rssleast; +} Tree; + + +/* global variables */ + +EXTERN Node *chep; /* pointer to current height node */ +EXTERN Node *rootbr; /* pointer to root branch */ +EXTERN Node **heights; /* pointer to height nodes in unrooted tree */ +EXTERN int Numhts; /* number of height nodes in unrooted tree */ +EXTERN double hroot; /* height of root */ +EXTERN double varhroot; /* variance of height of root */ +EXTERN double maxhroot; /* maximal height of root */ +EXTERN int locroot; /* location of root */ +EXTERN int numbestroot; /* number of best locations for root */ +EXTERN int clockmode; /* clocklike vs. 
nonclocklike computation */ +EXTERN cmatrix Identif; /* sequence names */ +EXTERN cmatrix Seqchar; /* ML sequence data */ +EXTERN cmatrix Seqpat; /* ordered site patterns */ +EXTERN ivector constpat; /* indicates constant site patterns */ +EXTERN cvector seqchi; +EXTERN cvector seqchj; +EXTERN dcube partiali; +EXTERN dcube partialj; +EXTERN dcube ltprobr; /* transition probabilities (for all non-zero rates) */ +EXTERN dvector Distanmat; /* vector with maximum likelihood distances CZ 05/16/01 */ +EXTERN dmatrix Evec; /* Eigenvectors */ +EXTERN dmatrix Ievc; /* Inverse eigenvectors */ +EXTERN double TSparam; /* Ts/Tv parameter */ +EXTERN double tsmean, yrmean; +EXTERN double YRparam; /* Y/R Ts parameter */ +EXTERN double geerr; /* estimated error of rate heterogeneity */ +EXTERN double Geta; /* rate heterogeneity parameter */ +EXTERN double fracconst; /* fraction of constant sites */ +EXTERN double fracconstpat;/* fraction of constant patterns */ +EXTERN double Proportion; /* for tree drawing */ +EXTERN double tserr; /* estimated error of TSparam */ +EXTERN double yrerr; /* estimated error of YRparam */ +EXTERN double fracinv; /* fraction of invariable sites */ +EXTERN double fierr; /* estimated error of fracinv */ +EXTERN dvector Brnlength; +EXTERN dvector Distanvec; +EXTERN dvector Eval; /* Eigenvalues of 1 PAM rate matrix */ +EXTERN dvector Freqtpm; /* base frequencies */ +EXTERN dvector Rates; /* rate of each of the categories */ +EXTERN dmatrix iexp; +EXTERN imatrix Basecomp; /* base composition of each taxon */ +EXTERN ivector usedtaxa; /* list needed in the input treefile procedure */ +EXTERN int numtc; /* auxiliary variable for printing rooted tree */ +EXTERN int qcalg_optn; /* use quartet subsampling algorithm */ +EXTERN int approxp_optn; /* approximate parameter estimation */ +EXTERN int chi2fail; /* flag for chi2 test */ +EXTERN int Converg; /* flag for ML convergence (no clock) */ +EXTERN int Convergc; /* flag for ML convergence (clock) */ +EXTERN int 
data_optn; /* type of sequence input data */ +EXTERN int Dayhf_optn; /* Dayhoff model */ +EXTERN int HKY_optn; /* use HKY model */ +EXTERN int Jtt_optn; /* JTT model */ +EXTERN int blosum62_optn; /* BLOSUM 62 model */ +EXTERN int mtrev_optn; /* mtREV model */ +EXTERN int cprev_optn; /* cpREV model */ +EXTERN int vtmv_optn; /* VT model */ +EXTERN int wag_optn; /* WAG model */ +EXTERN int Maxsite; /* number of ML characters per taxon */ +EXTERN int Maxspc; /* number of sequences */ +EXTERN int mlmode; /* quartet ML or user defined tree ML */ +EXTERN int nuc_optn; /* nucleotide (4x4) models */ +EXTERN int Numbrnch; /* number of branches of current tree */ +EXTERN int numcats; /* number of rate categories */ +EXTERN int Numconst; /* number of constant sites */ +EXTERN int Numconstpat; /* number of constant patterns */ +EXTERN int Numibrnch; /* number of internal branches of current tree */ +EXTERN int Numitc; /* number of ML iterations assuming clock */ +EXTERN int Numit; /* number of ML iterations if there is convergence */ +EXTERN int Numptrn; /* number of site patterns */ +EXTERN int Numspc; /* number of sequences of current tree */ +EXTERN int optim_optn; /* optimize model parameters */ +EXTERN int grate_optim; /* optimize Gamma rate heterogeneity parameter */ +EXTERN int SH_optn; /* SH nucleotide (16x16) model */ +EXTERN int TN_optn; /* use TN model */ +EXTERN int tpmradix; /* number of different states */ +EXTERN int fracinv_optim; /* optimize fraction of invariable sites */ +EXTERN int typ_optn; /* type of PUZZLE analysis */ +EXTERN ivector Weight; /* weight of each site pattern */ +EXTERN Tree *Ctree; /* pointer to current tree */ +EXTERN ulivector badtaxon; /* involvement of each taxon in a bad quartet */ +EXTERN int qca, qcb, qcc, qcd; /* quartet currently optimized */ +EXTERN ivector Alias; /* link site -> corresponding site pattern */ +EXTERN ivector bestrate; /* optimal assignment of rates to sequence sites */ + +EXTERN int bestratefound; + +/* function 
prototypes of all ml function */ + +void convfreq(dvector); +void radixsort(cmatrix, ivector, int, int, int *); +void condenceseq(cmatrix, ivector, cmatrix, ivector, int, int, int); +void countconstantsites(cmatrix, ivector, int, int, int *, int*); +void evaluateseqs(void); +void elmhes(dmatrix, ivector, int); +void eltran(dmatrix, dmatrix, ivector, int); +void mcdiv(double, double, double, double, double *, double *); +void hqr2(int, int, int, dmatrix, dmatrix, dvector, dvector); +void onepamratematrix(dmatrix); +void eigensystem(dvector, dmatrix); +void luinverse(dmatrix, dmatrix, int); +void checkevector(dmatrix, dmatrix, int); +void tranprobmat(void); +void tprobmtrx(double, dmatrix); +double comptotloglkl(dmatrix); +void allsitelkl(dmatrix, dvector); +double pairlkl(double); +double mldistance(int); +void initdistan(void); +void computedistan(void); +void productpartials(Node *); +void partialsinternal(Node *); +void partialsexternal(Node *); +void initpartials(Tree *); +double intlkl(double); +void optinternalbranch(Node *); +double extlkl(double); +void optexternalbranch(Node *); +void finishlkl(Node *); +double optlkl(Tree *); +double treelkl(Tree *); +void luequation(dmatrix, dvector, int); +void lslength(Tree *, dvector, int, int, dvector); + +void getusertree(FILE *, cvector, int); +Node *internalnode(Tree *, char **, int *); +void constructtree(Tree *, cvector); +void removebasalbif(cvector); +void makeusertree(FILE *); +Tree *new_tree(int, int, cmatrix); +Tree *new_quartet(int, cmatrix); +void free_tree(Tree *, int); +void make_quartet(int, int, int, int); +void changedistan(dmatrix, dvector, int); +double quartet_lklhd(int, int, int, int); +double quartet_alklhd(int, int, int, int); +void readusertree(FILE *); +double usertree_lklhd(void); +double usertree_alklhd(void); +void mlstart(void); +void distupdate(int, int, int, int); +void mlfinish(void); +void prbranch(Node *, int, int, int, ivector, ivector, FILE *); +void getproportion(double *, dvector, 
int); +void prtopology(FILE *); +void fputphylogeny(FILE *); +void resulttree(FILE *); +void njtree(FILE *); +void njdistantree(Tree *); +void findbestratecombination(void); +void printbestratecombination(FILE *); +int checkedge(int); +void fputsubstree(FILE *, Node *); +void fputrooted(FILE *, int); +void findheights(Node *); +void initclock(int); +double clock_alklhd(int); +double heightlkl(double); +void optheight(void); +double rheightlkl(double); +void optrheight(void); +double clock_lklhd(int); +int findrootedge(void); +void resultheights(FILE *); + +double homogentest(int); +void YangDiscreteGamma(double, int, double *); +void updaterates(void); +void computestat(double *, int, double *, double *); +double quartetml(int, int, int, int); +double opttsq(double); +double optyrq(double); +void optimseqevolparamsq(void); +double opttst(double); +double optyrt(double); +void optimseqevolparamst(void); +double optfi(double); +double optge(double); +void optimrateparams(void); + +int gettpmradix(void); +void rtfdata(dmatrix, double *); +int code2int(cvector); +char *int2code(int); + +void jttdata(dmatrix, double *); +void dyhfdata(dmatrix, double *); +void mtrevdata(dmatrix, double *); +void cprev45data(dmatrix, double *); +void blosum62data(dmatrix, double *); +void vtmvdata(dmatrix, double *); +void wagdata(dmatrix, double *); + +#endif diff --git a/forester/archive/RIO/others/puzzle_dqo/src/ml1.c b/forester/archive/RIO/others/puzzle_dqo/src/ml1.c new file mode 100644 index 0000000..a3a561f --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/ml1.c @@ -0,0 +1,1743 @@ +/* + * ml1.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. 
+ */ + + +/******************************************************************************/ +/* definitions and prototypes */ +/******************************************************************************/ + +#define EXTERN extern + +/* prototypes */ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <ctype.h> +#include "util.h" +#include "ml.h" + +#define STDOUT stdout +#ifndef PARALLEL /* because printf() runs significantly faster */ + /* than fprintf(stdout) on an Apple Macintosh */ + /* (HS) */ +# define FPRINTF printf +# define STDOUTFILE +#else +# define FPRINTF fprintf +# define STDOUTFILE STDOUT, +#endif + + +/******************************************************************************/ +/* compacting sequence data information */ +/******************************************************************************/ + + +/***************************** internal functions *****************************/ + + +/* make all frequencies a little different */ +void convfreq(dvector freqemp) +{ + int i, j, maxi=0; + double freq, maxfreq, sum; + + + sum = 0.0; + maxfreq = 0.0; + for (i = 0; i < tpmradix; i++) { + freq = freqemp[i]; + if (freq < MINFREQ) freqemp[i] = MINFREQ; + if (freq > maxfreq) { + maxfreq = freq; + maxi = i; + } + sum += freqemp[i]; + } + freqemp[maxi] += 1.0 - sum; + + for (i = 0; i < tpmradix - 1; i++) { + for (j = i + 1; j < tpmradix; j++) { + if (freqemp[i] == freqemp[j]) { + freqemp[i] += MINFDIFF/2.0; + freqemp[j] -= MINFDIFF/2.0; + } + } + } +} + +/* sort site patterns of original input data */ +void radixsort(cmatrix seqchar, ivector ali, int maxspc, int maxsite, + int *numptrn) +{ + int i, j, k, l, n, pass; + int *awork; + int *count; + + + awork = new_ivector(maxsite); + count = new_ivector(tpmradix+1); + for (i = 0; i < maxsite; i++) + ali[i] = i; + for (pass = maxspc - 1; pass >= 0; pass--) { + for (j = 0; j < tpmradix+1; j++) + count[j] = 0; + for (i = 0; i < maxsite; i++) + count[(int) seqchar[pass][ali[i]]]++; + for (j = 1; j < tpmradix+1; j++) + count[j] += 
count[j-1]; + for (i = maxsite-1; i >= 0; i--) + awork[ --count[(int) seqchar[pass][ali[i]]] ] = ali[i]; + for (i = 0; i < maxsite; i++) + ali[i] = awork[i]; + } + free_ivector(awork); + free_ivector(count); + n = 1; + for (j = 1; j < maxsite; j++) { + k = ali[j]; + l = ali[j-1]; + for (i = 0; i < maxspc; i++) { + if (seqchar[i][l] != seqchar[i][k]) { + n++; + break; + } + } + } + *numptrn = n; +} + + +void condenceseq(cmatrix seqchar, ivector ali, cmatrix seqconint, + ivector weight, int maxspc, int maxsite, int numptrn) +{ + int i, j, k, n; + int agree_flag; /* boolean */ + + + n = 0; + k = ali[n]; + for (i = 0; i < maxspc; i++) { + seqconint[i][n] = seqchar[i][k]; + } + weight[n] = 1; + Alias[k] = 0; + for (j = 1; j < maxsite; j++) { + k = ali[j]; + agree_flag = TRUE; + for (i = 0; i < maxspc; i++) { + if (seqconint[i][n] != seqchar[i][k]) { + agree_flag = FALSE; + break; + } + } + if (agree_flag == FALSE) { + n++; + for (i = 0; i < maxspc; i++) { + seqconint[i][n] = seqchar[i][k]; + } + weight[n] = 1; + Alias[k] = n; + } else { + weight[n]++; + Alias[k] = n; + } + } + n++; + if (numptrn != n) { + /* Problem in condenceseq */ + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR A TO DEVELOPERS\n\n\n"); + exit(1); + } +} + +void countconstantsites(cmatrix seqpat, ivector weight, int maxspc, int numptrn, + int *numconst, int *numconstpat) +{ + int character, s, i, constflag; + + *numconst = 0; + *numconstpat = 0; + for (s = 0; s < numptrn; s++) { /* check all patterns */ + constpat[s] = FALSE; + constflag = TRUE; + character = seqpat[0][s]; + for (i = 1; i < maxspc; i++) { + if (seqpat[i][s] != character) { + constflag = FALSE; + break; + } + } + if (character != tpmradix && constflag) { + (*numconst) = (*numconst) + weight[s]; + (*numconstpat)++; + constpat[s] = TRUE; + } + } +} + +/***************************** exported functions *****************************/ + + +void evaluateseqs() +{ + ivector ali; + + convfreq(Freqtpm); /* make all frequencies slightly 
different */ + ali = new_ivector(Maxsite); + radixsort(Seqchar, ali, Maxspc, Maxsite, &Numptrn); + Seqpat = new_cmatrix(Maxspc, Numptrn); + constpat = new_ivector(Numptrn); + Weight = new_ivector(Numptrn); + condenceseq(Seqchar, ali, Seqpat, Weight, Maxspc, Maxsite, Numptrn); + free_ivector(ali); + countconstantsites(Seqpat, Weight, Maxspc, Numptrn, &Numconst, &Numconstpat); + fracconstpat = (double) Numconstpat / (double) Numptrn; + fracconst = (double) Numconst / (double) Maxsite; +} + + +/******************************************************************************/ +/* computation of Pij(t) */ +/******************************************************************************/ + + +/***************************** internal functions *****************************/ + + +void elmhes(dmatrix a, ivector ordr, int n) +{ + int m, j, i; + double y, x; + + + for (i = 0; i < n; i++) + ordr[i] = 0; + for (m = 2; m < n; m++) { + x = 0.0; + i = m; + for (j = m; j <= n; j++) { + if (fabs(a[j - 1][m - 2]) > fabs(x)) { + x = a[j - 1][m - 2]; + i = j; + } + } + ordr[m - 1] = i; /* vector */ + if (i != m) { + for (j = m - 2; j < n; j++) { + y = a[i - 1][j]; + a[i - 1][j] = a[m - 1][j]; + a[m - 1][j] = y; + } + for (j = 0; j < n; j++) { + y = a[j][i - 1]; + a[j][i - 1] = a[j][m - 1]; + a[j][m - 1] = y; + } + } + if (x != 0.0) { + for (i = m; i < n; i++) { + y = a[i][m - 2]; + if (y != 0.0) { + y /= x; + a[i][m - 2] = y; + for (j = m - 1; j < n; j++) + a[i][j] -= y * a[m - 1][j]; + for (j = 0; j < n; j++) + a[j][m - 1] += y * a[j][i]; + } + } + } + } +} + + +void eltran(dmatrix a, dmatrix zz, ivector ordr, int n) +{ + int i, j, m; + + + for (i = 0; i < n; i++) { + for (j = i + 1; j < n; j++) { + zz[i][j] = 0.0; + zz[j][i] = 0.0; + } + zz[i][i] = 1.0; + } + if (n <= 2) + return; + for (m = n - 1; m >= 2; m--) { + for (i = m; i < n; i++) + zz[i][m - 1] = a[i][m - 2]; + i = ordr[m - 1]; + if (i != m) { + for (j = m - 1; j < n; j++) { + zz[m - 1][j] = zz[i - 1][j]; + zz[i - 1][j] = 0.0; + 
} + zz[i - 1][m - 1] = 1.0; + } + } +} + + +void mcdiv(double ar, double ai, double br, double bi, + double *cr, double *ci) +{ + double s, ars, ais, brs, bis; + + + s = fabs(br) + fabs(bi); + ars = ar / s; + ais = ai / s; + brs = br / s; + bis = bi / s; + s = brs * brs + bis * bis; + *cr = (ars * brs + ais * bis) / s; + *ci = (ais * brs - ars * bis) / s; +} + + +void hqr2(int n, int low, int hgh, dmatrix h, + dmatrix zz, dvector wr, dvector wi) +{ + int i, j, k, l=0, m, en, na, itn, its; + double p=0, q=0, r=0, s=0, t, w, x=0, y, ra, sa, vi, vr, z=0, norm, tst1, tst2; + int notlas; /* boolean */ + + + norm = 0.0; + k = 1; + /* store isolated roots and compute matrix norm */ + for (i = 0; i < n; i++) { + for (j = k - 1; j < n; j++) + norm += fabs(h[i][j]); + k = i + 1; + if (i + 1 < low || i + 1 > hgh) { + wr[i] = h[i][i]; + wi[i] = 0.0; + } + } + en = hgh; + t = 0.0; + itn = n * 30; + while (en >= low) { /* search for next eigenvalues */ + its = 0; + na = en - 1; + while (en >= 1) { + /* look for single small sub-diagonal element */ + for (l = en; l > low; l--) { + s = fabs(h[l - 2][l - 2]) + fabs(h[l - 1][l - 1]); + if (s == 0.0) + s = norm; + tst1 = s; + tst2 = tst1 + fabs(h[l - 1][l - 2]); + if (tst2 == tst1) + goto L100; + } + l = low; + L100: + x = h[en - 1][en - 1]; /* form shift */ + if (l == en || l == na) + break; + if (itn == 0) { + /* all eigenvalues have not converged */ + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR B TO DEVELOPERS\n\n\n"); + exit(1); + } + y = h[na - 1][na - 1]; + w = h[en - 1][na - 1] * h[na - 1][en - 1]; + /* form exceptional shift */ + if (its == 10 || its == 20) { + t += x; + for (i = low - 1; i < en; i++) + h[i][i] -= x; + s = fabs(h[en - 1][na - 1]) + fabs(h[na - 1][en - 3]); + x = 0.75 * s; + y = x; + w = -0.4375 * s * s; + } + its++; + itn--; + /* look for two consecutive small sub-diagonal elements */ + for (m = en - 2; m >= l; m--) { + z = h[m - 1][m - 1]; + r = x - z; + s = y - z; + p = (r * s - w) / h[m][m - 1] + 
h[m - 1][m]; + q = h[m][m] - z - r - s; + r = h[m + 1][m]; + s = fabs(p) + fabs(q) + fabs(r); + p /= s; + q /= s; + r /= s; + if (m == l) + break; + tst1 = fabs(p) * + (fabs(h[m - 2][m - 2]) + fabs(z) + fabs(h[m][m])); + tst2 = tst1 + fabs(h[m - 1][m - 2]) * (fabs(q) + fabs(r)); + if (tst2 == tst1) + break; + } + for (i = m + 2; i <= en; i++) { + h[i - 1][i - 3] = 0.0; + if (i != m + 2) + h[i - 1][i - 4] = 0.0; + } + for (k = m; k <= na; k++) { + notlas = (k != na); + if (k != m) { + p = h[k - 1][k - 2]; + q = h[k][k - 2]; + r = 0.0; + if (notlas) + r = h[k + 1][k - 2]; + x = fabs(p) + fabs(q) + fabs(r); + if (x != 0.0) { + p /= x; + q /= x; + r /= x; + } + } + if (x != 0.0) { + if (p < 0.0) /* sign */ + s = - sqrt(p * p + q * q + r * r); + else + s = sqrt(p * p + q * q + r * r); + if (k != m) + h[k - 1][k - 2] = -s * x; + else { + if (l != m) + h[k - 1][k - 2] = -h[k - 1][k - 2]; + } + p += s; + x = p / s; + y = q / s; + z = r / s; + q /= p; + r /= p; + if (!notlas) { + for (j = k - 1; j < n; j++) { /* row modification */ + p = h[k - 1][j] + q * h[k][j]; + h[k - 1][j] -= p * x; + h[k][j] -= p * y; + } + j = (en < (k + 3)) ? en : (k + 3); /* min */ + for (i = 0; i < j; i++) { /* column modification */ + p = x * h[i][k - 1] + y * h[i][k]; + h[i][k - 1] -= p; + h[i][k] -= p * q; + } + /* accumulate transformations */ + for (i = low - 1; i < hgh; i++) { + p = x * zz[i][k - 1] + y * zz[i][k]; + zz[i][k - 1] -= p; + zz[i][k] -= p * q; + } + } else { + for (j = k - 1; j < n; j++) { /* row modification */ + p = h[k - 1][j] + q * h[k][j] + r * h[k + 1][j]; + h[k - 1][j] -= p * x; + h[k][j] -= p * y; + h[k + 1][j] -= p * z; + } + j = (en < (k + 3)) ? 
en : (k + 3); /* min */ + for (i = 0; i < j; i++) { /* column modification */ + p = x * h[i][k - 1] + y * h[i][k] + z * h[i][k + 1]; + h[i][k - 1] -= p; + h[i][k] -= p * q; + h[i][k + 1] -= p * r; + } + /* accumulate transformations */ + for (i = low - 1; i < hgh; i++) { + p = x * zz[i][k - 1] + y * zz[i][k] + + z * zz[i][k + 1]; + zz[i][k - 1] -= p; + zz[i][k] -= p * q; + zz[i][k + 1] -= p * r; + } + } + } + } /* for k */ + } /* while infinite loop */ + if (l == en) { /* one root found */ + h[en - 1][en - 1] = x + t; + wr[en - 1] = h[en - 1][en - 1]; + wi[en - 1] = 0.0; + en = na; + continue; + } + y = h[na - 1][na - 1]; + w = h[en - 1][na - 1] * h[na - 1][en - 1]; + p = (y - x) / 2.0; + q = p * p + w; + z = sqrt(fabs(q)); + h[en - 1][en - 1] = x + t; + x = h[en - 1][en - 1]; + h[na - 1][na - 1] = y + t; + if (q >= 0.0) { /* real pair */ + if (p < 0.0) /* sign */ + z = p - fabs(z); + else + z = p + fabs(z); + wr[na - 1] = x + z; + wr[en - 1] = wr[na - 1]; + if (z != 0.0) + wr[en - 1] = x - w / z; + wi[na - 1] = 0.0; + wi[en - 1] = 0.0; + x = h[en - 1][na - 1]; + s = fabs(x) + fabs(z); + p = x / s; + q = z / s; + r = sqrt(p * p + q * q); + p /= r; + q /= r; + for (j = na - 1; j < n; j++) { /* row modification */ + z = h[na - 1][j]; + h[na - 1][j] = q * z + p * h[en - 1][j]; + h[en - 1][j] = q * h[en - 1][j] - p * z; + } + for (i = 0; i < en; i++) { /* column modification */ + z = h[i][na - 1]; + h[i][na - 1] = q * z + p * h[i][en - 1]; + h[i][en - 1] = q * h[i][en - 1] - p * z; + } + /* accumulate transformations */ + for (i = low - 1; i < hgh; i++) { + z = zz[i][na - 1]; + zz[i][na - 1] = q * z + p * zz[i][en - 1]; + zz[i][en - 1] = q * zz[i][en - 1] - p * z; + } + } else { /* complex pair */ + wr[na - 1] = x + p; + wr[en - 1] = x + p; + wi[na - 1] = z; + wi[en - 1] = -z; + } + en -= 2; + } /* while en >= low */ + /* backsubstitute to find vectors of upper triangular form */ + if (norm != 0.0) { + for (en = n; en >= 1; en--) { + p = wr[en - 1]; + q = wi[en - 1]; + 
na = en - 1; + if (q == 0.0) {/* real vector */ + m = en; + h[en - 1][en - 1] = 1.0; + if (na != 0) { + for (i = en - 2; i >= 0; i--) { + w = h[i][i] - p; + r = 0.0; + for (j = m - 1; j < en; j++) + r += h[i][j] * h[j][en - 1]; + if (wi[i] < 0.0) { + z = w; + s = r; + } else { + m = i + 1; + if (wi[i] == 0.0) { + t = w; + if (t == 0.0) { + tst1 = norm; + t = tst1; + do { + t = 0.01 * t; + tst2 = norm + t; + } while (tst2 > tst1); + } + h[i][en - 1] = -(r / t); + } else { /* solve real equations */ + x = h[i][i + 1]; + y = h[i + 1][i]; + q = (wr[i] - p) * (wr[i] - p) + wi[i] * wi[i]; + t = (x * s - z * r) / q; + h[i][en - 1] = t; + if (fabs(x) > fabs(z)) + h[i + 1][en - 1] = (-r - w * t) / x; + else + h[i + 1][en - 1] = (-s - y * t) / z; + } + /* overflow control */ + t = fabs(h[i][en - 1]); + if (t != 0.0) { + tst1 = t; + tst2 = tst1 + 1.0 / tst1; + if (tst2 <= tst1) { + for (j = i; j < en; j++) + h[j][en - 1] /= t; + } + } + } + } + } + } else if (q > 0.0) { + m = na; + if (fabs(h[en - 1][na - 1]) > fabs(h[na - 1][en - 1])) { + h[na - 1][na - 1] = q / h[en - 1][na - 1]; + h[na - 1][en - 1] = (p - h[en - 1][en - 1]) / + h[en - 1][na - 1]; + } else + mcdiv(0.0, -h[na - 1][en - 1], h[na - 1][na - 1] - p, q, + &h[na - 1][na - 1], &h[na - 1][en - 1]); + h[en - 1][na - 1] = 0.0; + h[en - 1][en - 1] = 1.0; + if (en != 2) { + for (i = en - 3; i >= 0; i--) { + w = h[i][i] - p; + ra = 0.0; + sa = 0.0; + for (j = m - 1; j < en; j++) { + ra += h[i][j] * h[j][na - 1]; + sa += h[i][j] * h[j][en - 1]; + } + if (wi[i] < 0.0) { + z = w; + r = ra; + s = sa; + } else { + m = i + 1; + if (wi[i] == 0.0) + mcdiv(-ra, -sa, w, q, &h[i][na - 1], + &h[i][en - 1]); + else { /* solve complex equations */ + x = h[i][i + 1]; + y = h[i + 1][i]; + vr = (wr[i] - p) * (wr[i] - p); + vr = vr + wi[i] * wi[i] - q * q; + vi = (wr[i] - p) * 2.0 * q; + if (vr == 0.0 && vi == 0.0) { + tst1 = norm * (fabs(w) + fabs(q) + fabs(x) + + fabs(y) + fabs(z)); + vr = tst1; + do { + vr = 0.01 * vr; + tst2 = tst1 + 
vr; + } while (tst2 > tst1); + } + mcdiv(x * r - z * ra + q * sa, + x * s - z * sa - q * ra, vr, vi, + &h[i][na - 1], &h[i][en - 1]); + if (fabs(x) > fabs(z) + fabs(q)) { + h[i + 1] + [na - 1] = (q * h[i][en - 1] - + w * h[i][na - 1] - ra) / x; + h[i + 1][en - 1] = (-sa - w * h[i][en - 1] - + q * h[i][na - 1]) / x; + } else + mcdiv(-r - y * h[i][na - 1], + -s - y * h[i][en - 1], z, q, + &h[i + 1][na - 1], &h[i + 1][en - 1]); + } + /* overflow control */ + t = (fabs(h[i][na - 1]) > fabs(h[i][en - 1])) ? + fabs(h[i][na - 1]) : fabs(h[i][en - 1]); + if (t != 0.0) { + tst1 = t; + tst2 = tst1 + 1.0 / tst1; + if (tst2 <= tst1) { + for (j = i; j < en; j++) { + h[j][na - 1] /= t; + h[j][en - 1] /= t; + } + } + } + } + } + } + } + } + /* end back substitution. vectors of isolated roots */ + for (i = 0; i < n; i++) { + if (i + 1 < low || i + 1 > hgh) { + for (j = i; j < n; j++) + zz[i][j] = h[i][j]; + } + } + /* multiply by transformation matrix to give vectors of + * original full matrix. */ + for (j = n - 1; j >= low - 1; j--) { + m = ((j + 1) < hgh) ? (j + 1) : hgh; /* min */ + for (i = low - 1; i < hgh; i++) { + z = 0.0; + for (k = low - 1; k < m; k++) + z += zz[i][k] * h[k][j]; + zz[i][j] = z; + } + } + } + return; +} + + +/* make rate matrix with 0.01 expected substitutions per unit time */ +void onepamratematrix(dmatrix a) +{ + int i, j; + double delta, temp, sum; + dvector m; + + for (i = 0; i < tpmradix; i++) + { + for (j = 0; j < tpmradix; j++) + { + a[i][j] = Freqtpm[j]*a[i][j]; + } + } + + m = new_dvector(tpmradix); + for (i = 0, sum = 0.0; i < tpmradix; i++) + { + for (j = 0, temp = 0.0; j < tpmradix; j++) + temp += a[i][j]; + m[i] = temp; /* row sum */ + sum += temp*Freqtpm[i]; /* exp. rate */ + } + delta = 0.01 / sum; /* 0.01 subst. 
per unit time */ + for (i = 0; i < tpmradix; i++) { + for (j = 0; j < tpmradix; j++) { + if (i != j) + a[i][j] = delta * a[i][j]; + else + a[i][j] = delta * (-m[i]); + } + } + free_dvector(m); +} + + +void eigensystem(dvector eval, dmatrix evec) +{ + dvector evali, forg; + dmatrix a, b; + ivector ordr; + int i, j, k, error; + double zero; + + + ordr = new_ivector(tpmradix); + evali = new_dvector(tpmradix); + forg = new_dvector(tpmradix); + a = new_dmatrix(tpmradix,tpmradix); + b = new_dmatrix(tpmradix,tpmradix); + + rtfdata(a, forg); /* get relative transition matrix and frequencies */ + + onepamratematrix(a); /* make 1 PAM rate matrix */ + + /* copy a to b */ + for (i = 0; i < tpmradix; i++) + for (j = 0; j < tpmradix; j++) + b[i][j] = a[i][j]; + + elmhes(a, ordr, tpmradix); /* compute eigenvalues and eigenvectors */ + eltran(a, evec, ordr, tpmradix); + hqr2(tpmradix, 1, tpmradix, a, evec, eval, evali); + + /* check eigenvalue equation */ + error = FALSE; + for (j = 0; j < tpmradix; j++) { + for (i = 0, zero = 0.0; i < tpmradix; i++) { + for (k = 0; k < tpmradix; k++) zero += b[i][k] * evec[k][j]; + zero -= eval[j] * evec[i][j]; + if (fabs(zero) > 1.0e-5) + error = TRUE; + } + } + if (error) + FPRINTF(STDOUTFILE "\nWARNING: Eigensystem doesn't satisfy eigenvalue equation!\n"); + + free_ivector(ordr); + free_dvector(evali); + free_dvector(forg); + free_dmatrix(a); + free_dmatrix(b); +} + + +void luinverse(dmatrix inmat, dmatrix imtrx, int size) +{ + double eps = 1.0e-20; /* ! 
*/ + int i, j, k, l, maxi=0, idx, ix, jx; + double sum, tmp, maxb, aw; + ivector index; + double *wk; + dmatrix omtrx; + + + index = new_ivector(tpmradix); + omtrx = new_dmatrix(tpmradix,tpmradix); + + /* copy inmat to omtrx */ + for (i = 0; i < tpmradix; i++) + for (j = 0; j < tpmradix; j++) + omtrx[i][j] = inmat[i][j]; + + wk = (double *) malloc((unsigned)size * sizeof(double)); + aw = 1.0; + for (i = 0; i < size; i++) { + maxb = 0.0; + for (j = 0; j < size; j++) { + if (fabs(omtrx[i][j]) > maxb) + maxb = fabs(omtrx[i][j]); + } + if (maxb == 0.0) { + /* Singular matrix */ + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR C TO DEVELOPERS\n\n\n"); + exit(1); + } + wk[i] = 1.0 / maxb; + } + for (j = 0; j < size; j++) { + for (i = 0; i < j; i++) { + sum = omtrx[i][j]; + for (k = 0; k < i; k++) + sum -= omtrx[i][k] * omtrx[k][j]; + omtrx[i][j] = sum; + } + maxb = 0.0; + for (i = j; i < size; i++) { + sum = omtrx[i][j]; + for (k = 0; k < j; k++) + sum -= omtrx[i][k] * omtrx[k][j]; + omtrx[i][j] = sum; + tmp = wk[i] * fabs(sum); + if (tmp >= maxb) { + maxb = tmp; + maxi = i; + } + } + if (j != maxi) { + for (k = 0; k < size; k++) { + tmp = omtrx[maxi][k]; + omtrx[maxi][k] = omtrx[j][k]; + omtrx[j][k] = tmp; + } + aw = -aw; + wk[maxi] = wk[j]; + } + index[j] = maxi; + if (omtrx[j][j] == 0.0) + omtrx[j][j] = eps; + if (j != size - 1) { + tmp = 1.0 / omtrx[j][j]; + for (i = j + 1; i < size; i++) + omtrx[i][j] *= tmp; + } + } + for (jx = 0; jx < size; jx++) { + for (ix = 0; ix < size; ix++) + wk[ix] = 0.0; + wk[jx] = 1.0; + l = -1; + for (i = 0; i < size; i++) { + idx = index[i]; + sum = wk[idx]; + wk[idx] = wk[i]; + if (l != -1) { + for (j = l; j < i; j++) + sum -= omtrx[i][j] * wk[j]; + } else if (sum != 0.0) + l = i; + wk[i] = sum; + } + for (i = size - 1; i >= 0; i--) { + sum = wk[i]; + for (j = i + 1; j < size; j++) + sum -= omtrx[i][j] * wk[j]; + wk[i] = sum / omtrx[i][i]; + } + for (ix = 0; ix < size; ix++) + imtrx[ix][jx] = wk[ix]; + } + free((char *)wk); + wk = 
NULL; + free_ivector(index); + free_dmatrix(omtrx); +} + + +void checkevector(dmatrix evec, dmatrix ivec, int nn) +{ + int i, j, ia, ib, ic, error; + dmatrix matx; + double sum; + + + matx = new_dmatrix(nn, nn); + /* multiply matrix of eigenvectors and its inverse */ + for (ia = 0; ia < nn; ia++) { + for (ic = 0; ic < nn; ic++) { + sum = 0.0; + for (ib = 0; ib < nn; ib++) sum += evec[ia][ib] * ivec[ib][ic]; + matx[ia][ic] = sum; + } + } + /* check whether the unitary matrix is obtained */ + error = FALSE; + for (i = 0; i < nn; i++) { + for (j = 0; j < nn; j++) { + if (i == j) { + if (fabs(matx[i][j] - 1.0) > 1.0e-5) + error = TRUE; + } else { + if (fabs(matx[i][j]) > 1.0e-5) + error = TRUE; + } + } + } + if (error) { + FPRINTF(STDOUTFILE "\nWARNING: Inversion of eigenvector matrix not perfect!\n"); + } + free_dmatrix(matx); +} + + +/***************************** exported functions *****************************/ + + +/* compute 1 PAM rate matrix, its eigensystem, and the inverse matrix thereof */ +void tranprobmat() +{ + eigensystem(Eval, Evec); /* eigensystem of 1 PAM rate matrix */ + luinverse(Evec, Ievc, tpmradix); /* inverse eigenvectors are in Ievc */ + checkevector(Evec, Ievc, tpmradix); /* check whether inversion was OK */ +} + + +/* compute P(t) */ +void tprobmtrx(double arc, dmatrix tpr) +{ + register int i, j, k; + register double temp; + + + for (k = 0; k < tpmradix; k++) { + temp = exp(arc * Eval[k]); + for (j = 0; j < tpmradix; j++) + iexp[k][j] = Ievc[k][j] * temp; + } + for (i = 0; i < tpmradix; i++) { + for (j = 0; j < tpmradix; j++) { + temp = 0.0; + for (k = 0; k < tpmradix; k++) + temp += Evec[i][k] * iexp[k][j]; + tpr[i][j] = fabs(temp); + } + } +} + + +/******************************************************************************/ +/* estimation of maximum likelihood distances */ +/******************************************************************************/ + +/* compute total log-likelihood + input: likelihoods for each site and non-zero 
rate + output: total log-likelihood (incl. zero rate category) */ +double comptotloglkl(dmatrix cdl) +{ + int k, r; + double loglkl, fv, fv2, sitelkl; + + loglkl = 0.0; + fv = 1.0-fracinv; + fv2 = (1.0-fracinv)/(double) numcats; + + if (numcats == 1) { + + for (k = 0; k < Numptrn; k++) { + + /* compute likelihood for pattern k */ + sitelkl = cdl[0][k]*fv; + if (constpat[k] == TRUE) + sitelkl += fracinv*Freqtpm[(int) Seqpat[0][k]]; + + /* total log-likelihood */ + loglkl += log(sitelkl)*Weight[k]; + + } + + } else { + + for (k = 0; k < Numptrn; k++) { + + /* this general routine works always but it's better + to run it only when it's really necessary */ + + /* compute likelihood for pattern k */ + sitelkl = 0.0; + for (r = 0; r < numcats; r++) + sitelkl += cdl[r][k]; + sitelkl = fv2*sitelkl; + if (constpat[k] == TRUE) + sitelkl += fracinv*Freqtpm[(int) Seqpat[0][k]]; + + /* total log-likelihood */ + loglkl += log(sitelkl)*Weight[k]; + + } + + } + + return loglkl; +} + + +/* computes the site log-likelihoods + input: likelihoods for each site and non-zero rate + output: log-likelihood for each site */ +void allsitelkl(dmatrix cdl, dvector aslkl) +{ + int k, r; + double fv, fv2, sitelkl; + + fv = 1.0-fracinv; + fv2 = (1.0-fracinv)/(double) numcats; + + if (numcats == 1) { + + for (k = 0; k < Numptrn; k++) { + + /* compute likelihood for pattern k */ + sitelkl = cdl[0][k]*fv; + if (constpat[k] == TRUE) + sitelkl += fracinv*Freqtpm[(int) Seqpat[0][k]]; + + /* site log-likelihood */ + aslkl[k] = log(sitelkl); + } + + } else { + + for (k = 0; k < Numptrn; k++) { + + /* this general routine works always but it's better + to run it only when it's really necessary */ + + /* compute likelihood for pattern k */ + sitelkl = 0.0; + for (r = 0; r < numcats; r++) + sitelkl += cdl[r][k]; + sitelkl = fv2*sitelkl; + if (constpat[k] == TRUE) + sitelkl += fracinv*Freqtpm[(int) Seqpat[0][k]]; + + /* total log-likelihood */ + aslkl[k] = log(sitelkl); + + } + } +} + + 
+/***************************** internal functions *****************************/ + +/* compute negative log-likelihood of distance arc between sequences seqchi/j */ +double pairlkl(double arc) +{ + int k, r, ci, cj; + double loglkl, fv, sitelkl; + + + /* compute tpms */ + for (r = 0; r < numcats; r++) + /* compute tpm for rate category r */ + tprobmtrx(arc*Rates[r], ltprobr[r]); + + loglkl = 0.0; + fv = 1.0-fracinv; + + if (numcats == 1) { + + for (k = 0; k < Numptrn; k++) { + + /* compute likelihood for site k */ + ci = seqchi[k]; + cj = seqchj[k]; + if (ci != tpmradix && cj != tpmradix) + sitelkl = ltprobr[0][ci][cj]*fv; + else + sitelkl = fv; + if (ci == cj && ci != tpmradix) + sitelkl += fracinv*Freqtpm[ci]; + + /* total log-likelihood */ + loglkl += log(sitelkl)*Weight[k]; + + } + + } else { + + for (k = 0; k < Numptrn; k++) { + + /* this general routine works always but it's better + to run it only when it's really necessary */ + + /* compute likelihood for site k */ + ci = seqchi[k]; + cj = seqchj[k]; + if (ci != tpmradix && cj != tpmradix) { + sitelkl = 0.0; + for (r = 0; r < numcats; r++) + sitelkl += ltprobr[r][ci][cj]; + sitelkl = fv*sitelkl/(double) numcats; + } else + sitelkl = fv; + if (ci == cj && ci != tpmradix) + sitelkl += fracinv*Freqtpm[ci]; + + /* total log-likelihood */ + loglkl += log(sitelkl)*Weight[k]; + + } + + } + + /* return negative log-likelihood as we use a minimizing procedure */ + return -loglkl; +} + + +/***************************** exported functions *****************************/ + + + +/******************************************************************************/ + +/* maximum likelihood distance between sequence i and j */ +/* CZ changed 05/16/01 */ +double mldistance( int i ) { + double dist, fx, f2x; + + /* use old distance as start value */ + dist = Distanmat[ i ]; + + if ( dist == 0.0 ) { + return 0.0; + } + + seqchi = Seqpat[ Maxspc - 1 ]; + seqchj = Seqpat[ i ]; + + if (dist <= MINARC) dist = MINARC+1.0; + if (dist >= 
MAXARC) dist = MAXARC-1.0; + + dist = onedimenmin(MINARC, dist, MAXARC, pairlkl, EPSILON, &fx, &f2x); + + return dist; +} + + + +/* initialize distance matrix */ +/* CZ changed 05/16/01 */ +void initdistan() { + int i, k, diff, x, y; + double obs, temp; + + for (i = 0; i < Maxspc - 1 ; i++) { + + seqchi = Seqpat[i]; + seqchj = Seqpat[Maxspc - 1]; + + /* count observed differences */ + diff = 0; + for (k = 0; k < Numptrn; k++) { + x = seqchi[k]; + y = seqchj[k]; + if (x != y && + x != tpmradix && + y != tpmradix) + diff += Weight[k]; + } + if (diff == 0) + Distanmat[i] = 0.0; + else { + /* use generalized JC correction to get first estimate + (for the SH model the observed distance is used) */ + /* observed distance */ + obs = (double) diff / (double) Maxsite; + temp = 1.0 - (double) obs*tpmradix/(tpmradix-1.0); + if (temp > 0.0 && !(data_optn == 0 && SH_optn)) + /* use JC corrected distance */ + Distanmat[i] = -100.0*(tpmradix-1.0)/tpmradix * log(temp); + else + /* use observed distance */ + Distanmat[i] = obs * 100.0; + if (Distanmat[i] < MINARC) Distanmat[i] = MINARC; + if (Distanmat[i] > MAXARC) Distanmat[i] = MAXARC; + } + } + +} + + + + +/* compute distance matrix */ +/* CZ changed 05/16/01 */ +void computedistan() { + int i; + + for ( i = 0; i < Maxspc - 1; i++ ) { + Distanmat[ i ] = mldistance( i ); + } +} + + +/******************************************************************************/ + + + + + +/******************************************************************************/ +/* computation of maximum likelihood edge lengths for a given tree */ +/******************************************************************************/ + + +/***************************** internal functions *****************************/ + + +/* multiply partial likelihoods */ +void productpartials(Node *op) +{ + Node *cp; + int i, j, r; + dcube opc, cpc; + + cp = op; + opc = op->partials; + while (cp->isop->isop != op) { + cp = cp->isop; + cpc = cp->partials; + for (r = 0; r < 
numcats; r++) + for (i = 0; i < Numptrn; i++) + for (j = 0; j < tpmradix; j++) + opc[r][i][j] *= cpc[r][i][j]; + } +} + + +/* compute internal partial likelihoods */ +void partialsinternal(Node *op) +{ + int i, j, k, r; + double sum; + dcube oprob, cprob; + + if (clockmode == 1) { /* clocklike branch lengths */ + for (r = 0; r < numcats; r++) { + tprobmtrx((op->lengthc)*Rates[r], ltprobr[r]); + } + } else { /* non-clocklike branch lengths */ + for (r = 0; r < numcats; r++) { + tprobmtrx((op->length)*Rates[r], ltprobr[r]); + } + } + + oprob = op->partials; + cprob = op->kinp->isop->partials; + for (r = 0; r < numcats; r++) { + for (k = 0; k < Numptrn; k++) { + for (i = 0; i < tpmradix; i++) { + sum = 0.0; + for (j = 0; j < tpmradix; j++) + sum += ltprobr[r][i][j] * cprob[r][k][j]; + oprob[r][k][i] = sum; + } + } + } +} + + +/* compute external partial likelihoods */ +void partialsexternal(Node *op) +{ + int i, j, k, r; + dcube oprob; + cvector dseqi; + + if (clockmode == 1) { /* clocklike branch lengths */ + for (r = 0; r < numcats; r++) { + tprobmtrx((op->lengthc)*Rates[r], ltprobr[r]); + } + } else { /* nonclocklike branch lengths */ + for (r = 0; r < numcats; r++) { + tprobmtrx((op->length)*Rates[r], ltprobr[r]); + } + } + + oprob = op->partials; + dseqi = op->kinp->eprob; + for (r = 0; r < numcats; r++) { + for (k = 0; k < Numptrn; k++) { + if ((j = dseqi[k]) == tpmradix) { + for (i = 0; i < tpmradix; i++) + oprob[r][k][i] = 1.0; + } else { + for (i = 0; i < tpmradix; i++) + oprob[r][k][i] = ltprobr[r][i][j]; + } + } + } +} + + +/* compute all partial likelihoods */ +void initpartials(Tree *tr) +{ + Node *cp, *rp; + + cp = rp = tr->rootp; + do { + cp = cp->isop->kinp; + if (cp->isop == NULL) { /* external node */ + cp = cp->kinp; /* not descen */ + partialsexternal(cp); + } else { /* internal node */ + if (!cp->descen) { + productpartials(cp->kinp->isop); + partialsinternal(cp); + } + } + } while (cp != rp); +} + + +/* compute log-likelihood given internal 
branch with length arc + between partials partiali and partials partialj */ +double intlkl(double arc) +{ + double sumlk, slk; + int r, s, i, j; + dmatrix cdl; + + cdl = Ctree->condlkl; + for (r = 0; r < numcats; r++) { + tprobmtrx(arc*Rates[r], ltprobr[r]); + } + for (r = 0; r < numcats; r++) { + for (s = 0; s < Numptrn; s++) { + sumlk = 0.0; + for (i = 0; i < tpmradix; i++) { + slk = 0.0; + for (j = 0; j < tpmradix; j++) + slk += partialj[r][s][j] * ltprobr[r][i][j]; + sumlk += Freqtpm[i] * partiali[r][s][i] * slk; + } + cdl[r][s] = sumlk; + } + } + + /* compute total log-likelihood for current tree */ + Ctree->lklhd = comptotloglkl(cdl); + + return -(Ctree->lklhd); /* we use a minimizing procedure */ +} + + +/* optimize internal branch */ +void optinternalbranch(Node *op) +{ + double arc, fx, f2x; + + partiali = op->isop->partials; + partialj = op->kinp->isop->partials; + arc = op->length; /* nonclocklike branch lengths */ + if (arc <= MINARC) arc = MINARC+1.0; + if (arc >= MAXARC) arc = MAXARC-1.0; + arc = onedimenmin(MINARC, arc, MAXARC, intlkl, EPSILON, &fx, &f2x); + op->kinp->length = arc; + op->length = arc; + + /* variance of branch length */ + f2x = fabs(f2x); + if (1.0/(MAXARC*MAXARC) < f2x) + op->varlen = 1.0/f2x; + else + op->varlen = MAXARC*MAXARC; +} + + +/* compute log-likelihood given external branch with length arc + between partials partiali and sequence seqchi */ +double extlkl(double arc) +{ + double sumlk; + int r, s, i, j; + dvector opb; + dmatrix cdl; + + cdl = Ctree->condlkl; + for (r = 0; r < numcats; r++) { + tprobmtrx(arc*Rates[r], ltprobr[r]); + } + for (r = 0; r < numcats; r++) { + for (s = 0; s < Numptrn; s++) { + opb = partiali[r][s]; + sumlk = 0.0; + if ((j = seqchi[s]) != tpmradix) { + for (i = 0; i < tpmradix; i++) + sumlk += (Freqtpm[i] * (opb[i] * ltprobr[r][i][j])); + } else { + for (i = 0; i < tpmradix; i++) + sumlk += Freqtpm[i] * opb[i]; + } + cdl[r][s] = sumlk; + } + } + + /* compute total log-likelihood for current tree */ 
+ Ctree->lklhd = comptotloglkl(cdl); + + return -(Ctree->lklhd); /* we use a minimizing procedure */ +} + +/* optimize external branch */ +void optexternalbranch(Node *op) +{ + double arc, fx, f2x; + + partiali = op->isop->partials; + seqchi = op->kinp->eprob; + arc = op->length; /* nonclocklike branch lengths */ + if (arc <= MINARC) arc = MINARC+1.0; + if (arc >= MAXARC) arc = MAXARC-1.0; + arc = onedimenmin(MINARC, arc, MAXARC, extlkl, EPSILON, &fx, &f2x); + op->kinp->length = arc; + op->length = arc; + + /* variance of branch length */ + f2x = fabs(f2x); + if (1.0/(MAXARC*MAXARC) < f2x) + op->varlen = 1.0/f2x; + else + op->varlen = MAXARC*MAXARC; +} + + +/* finish likelihoods for each rate and site */ +void finishlkl(Node *op) +{ + int r, k, i, j; + double arc, sumlk, slk; + dmatrix cdl; + + partiali = op->isop->partials; + partialj = op->kinp->isop->partials; + cdl = Ctree->condlkl; + arc = op->length; /* nonclocklike branch lengths */ + for (r = 0; r < numcats; r++) { + tprobmtrx(arc*Rates[r], ltprobr[r]); + } + for (r = 0; r < numcats; r++) { + for (k = 0; k < Numptrn; k++) { + sumlk = 0.0; + for (i = 0; i < tpmradix; i++) { + slk = 0.0; + for (j = 0; j < tpmradix; j++) + slk += partialj[r][k][j] * ltprobr[r][i][j]; + sumlk += Freqtpm[i] * partiali[r][k][i] * slk; + } + cdl[r][k] = sumlk; + } + } +} + + +/***************************** exported functions *****************************/ + + +/* optimize branch lengths to get maximum likelihood (nonclocklike branchs) */ +double optlkl(Tree *tr) +{ + Node *cp, *rp; + int nconv; + double lendiff; + + clockmode = 0; /* nonclocklike branch lengths */ + nconv = 0; + Converg = FALSE; + initpartials(tr); + for (Numit = 1; (Numit <= MAXIT) && (!Converg); Numit++) { + + cp = rp = tr->rootp; + do { + cp = cp->isop->kinp; + productpartials(cp->kinp->isop); + if (cp->isop == NULL) { /* external node */ + cp = cp->kinp; /* not descen */ + + lendiff = cp->length; + optexternalbranch(cp); + lendiff = fabs(lendiff - cp->length); 
+ if (lendiff < EPSILON) nconv++; + else nconv = 0; + + partialsexternal(cp); + } else { /* internal node */ + if (cp->descen) { + partialsinternal(cp); + } else { + + lendiff = cp->length; + optinternalbranch(cp); + lendiff = fabs(lendiff - cp->length); + if (lendiff < EPSILON) nconv++; + else nconv = 0; + + /* eventually compute likelihoods for each site */ + if ((cp->number == Numibrnch-1 && lendiff < EPSILON) || + Numit == MAXIT-1) finishlkl(cp); + + partialsinternal(cp); + } + } + if (nconv >= Numbrnch) { /* convergence */ + Converg = TRUE; + cp = rp; /* get out of here */ + } + } while (cp != rp); + } + + /* compute total log-likelihood for current tree */ + return comptotloglkl(tr->condlkl); +} + + +/* compute likelihood of tree for given branch lengths */ +double treelkl(Tree *tr) +{ + int i, k, r; + Node *cp; + dmatrix cdl; + dcube prob1, prob2; + double sumlk; + + /* compute for each site and rate log-likelihoods */ + initpartials(tr); + cp = tr->rootp; + productpartials(cp->isop); + prob1 = cp->partials; + prob2 = cp->isop->partials; + cdl = tr->condlkl; + for (r = 0; r < numcats; r++) { + for (k = 0; k < Numptrn; k++) { + sumlk = 0.0; + for (i = 0; i < tpmradix; i++) + sumlk += Freqtpm[i] * (prob1[r][k][i] * prob2[r][k][i]); + cdl[r][k] = sumlk; + } + } + + /* return total log-likelihood for current tree */ + return comptotloglkl(cdl); +} + + +/******************************************************************************/ +/* least-squares estimate of branch lengths */ +/******************************************************************************/ + + +/***************************** internal functions *****************************/ + + +void luequation(dmatrix amat, dvector yvec, int size) +{ + double eps = 1.0e-20; /* ! 
*/
	int i, j, k, l, maxi=0, idx;
	double sum, tmp, maxb, aw;
	dvector wk;
	ivector index;


	wk = new_dvector(size);
	index = new_ivector(size);
	aw = 1.0;
	/* wk[i] = 1 / (largest absolute entry of row i); a row consisting of
	   zeros only halts the program */
	for (i = 0; i < size; i++) {
		maxb = 0.0;
		for (j = 0; j < size; j++) {
			if (fabs(amat[i][j]) > maxb)
				maxb = fabs(amat[i][j]);
		}
		if (maxb == 0.0) {
			/* Singular matrix */
			FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR D TO DEVELOPERS\n\n\n");
			exit(1);
		}
		wk[i] = 1.0 / maxb;
	}
	/* LU decomposition of amat in place, column by column */
	for (j = 0; j < size; j++) {
		for (i = 0; i < j; i++) {
			sum = amat[i][j];
			for (k = 0; k < i; k++)
				sum -= amat[i][k] * amat[k][j];
			amat[i][j] = sum;
		}
		maxb = 0.0;
		for (i = j; i < size; i++) {
			sum = amat[i][j];
			for (k = 0; k < j; k++)
				sum -= amat[i][k] * amat[k][j];
			amat[i][j] = sum;
			tmp = wk[i] * fabs(sum);	/* scaled pivot candidate */
			if (tmp >= maxb) {
				maxb = tmp;
				maxi = i;
			}
		}
		if (j != maxi) {	/* swap rows j and maxi */
			for (k = 0; k < size; k++) {
				tmp = amat[maxi][k];
				amat[maxi][k] = amat[j][k];
				amat[j][k] = tmp;
			}
			aw = -aw;	/* aw is never read in this function */
			wk[maxi] = wk[j];
		}
		index[j] = maxi;	/* remember the row exchange for the solve step */
		if (amat[j][j] == 0.0)
			amat[j][j] = eps;	/* keep the divisions below defined */
		if (j != size - 1) {
			tmp = 1.0 / amat[j][j];
			for (i = j + 1; i < size; i++)
				amat[i][j] *= tmp;
		}
	}
	/* forward substitution on yvec, applying the recorded row exchanges;
	   l is the index of the first non-zero entry seen so far */
	l = -1;
	for (i = 0; i < size; i++) {
		idx = index[i];
		sum = yvec[idx];
		yvec[idx] = yvec[i];
		if (l != -1) {
			for (j = l; j < i; j++)
				sum -= amat[i][j] * yvec[j];
		} else if (sum != 0.0)
			l = i;
		yvec[i] = sum;
	}
	/* back substitution; yvec holds the solution afterwards */
	for (i = size - 1; i >= 0; i--) {
		sum = yvec[i];
		for (j = i + 1; j < size; j++)
			sum -= amat[i][j] * yvec[j];
		yvec[i] = sum / amat[i][i];
	}
	free_ivector(index);
	free_dvector(wk);
}


/* least square estimation of branch lengths
   used for the approximate ML and as starting point
   in the calculation of the exact value of the ML

   tr:         tree whose branch lengths and rssleast are set
   distanvec:  pairwise distances, pair (j1, j2) with j2 < j1 stored in
               the order j1 = 1..numspc-1, j2 = 0..j1-1
   numspc:     number of external branches
   numibrnch:  number of internal branches
   Brnlength:  receives numspc + numibrnch branch lengths, external
               branches first, each clamped to [MINARC, MAXARC] */
void lslength(Tree *tr, dvector distanvec, int numspc, int numibrnch, dvector Brnlength)
{
	int i, i1, j, j1, j2, k, numbrnch, numpair;
	double sum, leng, alllen, rss;
	ivector pths;
	dmatrix atmt, atamt;
	Node **ebp, **ibp;

	numbrnch = numspc + numibrnch;
	numpair = (numspc * (numspc - 1)) / 2;
	atmt = new_dmatrix(numbrnch, numpair);	/* branch x pair matrix of 0/1 entries */
	atamt = new_dmatrix(numbrnch, numbrnch);	/* atmt times its transpose */
	ebp = tr->ebrnchp;
	ibp = tr->ibrnchp;
	/* external branch i gets a 1 for every pair that contains species i */
	for (i = 0; i < numspc; i++) {
		for (j1 = 1, j = 0; j1 < numspc; j1++) {
			if (j1 == i) {
				for (j2 = 0; j2 < j1; j2++, j++) {
					atmt[i][j] = 1.0;
				}
			} else {
				for (j2 = 0; j2 < j1; j2++, j++) {
					if (j2 == i)
						atmt[i][j] = 1.0;
					else
						atmt[i][j] = 0.0;
				}
			}
		}
	}
	/* internal branch i1 gets a 1 for every pair whose two species have
	   different entries in the branch's paths vector */
	for (i1 = 0, i = numspc; i1 < numibrnch; i1++, i++) {
		pths = ibp[i1]->paths;
		for (j1 = 1, j = 0; j1 < numspc; j1++) {
			for (j2 = 0; j2 < j1; j2++, j++) {
				if (pths[j1] != pths[j2])
					atmt[i][j] = 1.0;
				else
					atmt[i][j] = 0.0;
			}
		}
	}
	/* atamt = atmt * transpose(atmt); symmetric, so only i >= j is computed */
	for (i = 0; i < numbrnch; i++) {
		for (j = 0; j <= i; j++) {
			for (k = 0, sum = 0.0; k < numpair; k++)
				sum += atmt[i][k] * atmt[j][k];
			atamt[i][j] = sum;
			atamt[j][i] = sum;
		}
	}
	/* right-hand side: atmt * distanvec */
	for (i = 0; i < numbrnch; i++) {
		for (k = 0, sum = 0.0; k < numpair; k++)
			sum += atmt[i][k] * distanvec[k];
		Brnlength[i] = sum;
	}
	/* solve atamt * x = Brnlength; the solution overwrites Brnlength */
	luequation(atamt, Brnlength, numbrnch);
	/* residual sum of squares; only positive branch lengths are subtracted */
	for (i = 0, rss = 0.0; i < numpair; i++) {
		sum = distanvec[i];
		for (j = 0; j < numbrnch; j++) {
			if (atmt[j][i] == 1.0 && Brnlength[j] > 0.0)
				sum -= Brnlength[j];
		}
		rss += sum * sum;
	}
	tr->rssleast = sqrt(rss);
	alllen = 0.0;	/* sum of the unclamped lengths */
	/* external branches: clamp and store in both directions of the branch */
	for (i = 0; i < numspc; i++) {
		leng = Brnlength[i];
		alllen += leng;
		if (leng < MINARC) leng = MINARC;
		if (leng > MAXARC) leng = MAXARC;
		if (clockmode) { /* clock */
			ebp[i]->lengthc = leng;
			ebp[i]->kinp->lengthc = leng;
		} else { /* no clock */
			ebp[i]->length = leng;
			ebp[i]->kinp->length = leng;
		}
		Brnlength[i] = leng;
	}
	/* internal branches follow the external ones in Brnlength */
	for (i = 0, j = numspc; i < numibrnch; i++, j++) {
		leng = Brnlength[j];
		alllen += leng;
		if (leng < MINARC) leng = MINARC;
		if (leng > MAXARC) leng = MAXARC;
		if (clockmode) { /* clock */
			ibp[i]->lengthc = leng;
			ibp[i]->kinp->lengthc = leng;
		} else { /* no clock */
ibp[i]->length = leng; + ibp[i]->kinp->length = leng; + } + Brnlength[j] = leng; + } + free_dmatrix(atmt); + free_dmatrix(atamt); +} diff --git a/forester/archive/RIO/others/puzzle_dqo/src/ml2.c b/forester/archive/RIO/others/puzzle_dqo/src/ml2.c new file mode 100644 index 0000000..7e1b3db --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/ml2.c @@ -0,0 +1,1637 @@ +/* + * ml2.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +#define EXTERN extern + +/* prototypes */ +#include +#include +#include +#include +#include +#include "util.h" +#include "ml.h" + +#define STDOUT stdout +#ifndef PARALLEL /* because printf() runs significantly faster */ + /* than fprintf(stdout) on an Apple McIntosh */ + /* (HS) */ +# define FPRINTF printf +# define STDOUTFILE +#else +# define FPRINTF fprintf +# define STDOUTFILE STDOUT, +#endif + +/* prototypes for two functions of puzzle2.c */ +void fputid10(FILE *, int); +int fputid(FILE *, int); + + +/******************************************************************************/ +/* user tree input */ +/******************************************************************************/ + +/* read user tree, drop all blanks, tabs, and newlines. + Drop edgelengths (after :) but keep internal + labels. Check whether all pairs of brackets match. 
*/ +void getusertree(FILE *itfp, cvector tr, int maxlen) +{ + int n, brac, ci; + int comment = 0; + + /* look for opening bracket */ + n = 0; + brac = 0; + do { + ci = fgetc(itfp); + if (ci == EOF) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (missing start bracket in tree)\n\n\n"); + exit(1); + } + if (ci == '[') comment = 1; + if ((ci == ']') && comment) { + comment = 0; + ci = fgetc(itfp); + } + } while (comment || ((char) ci != '(')); + tr[n] = (char) ci; + brac++; + + do { + /* get next character (skip blanks, newlines, and tabs) */ + do { + ci = fgetc(itfp); + if (ci == EOF) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (no more characters in tree)\n\n\n"); + exit(1); + } + if (ci == '[') comment = 1; + if ((ci == ']') && comment) { + comment = 0; + ci = fgetc(itfp); + } + } while (comment || (char) ci == ' ' || (char) ci == '\n' || (char) ci == '\t'); + + if ((char) ci == ':') { /* skip characters until a ,) appears */ + do { + ci = fgetc(itfp); + if (ci == EOF) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (missing ';' or ',' in tree)\n\n\n"); + exit(1); + } + if (ci == '[') comment = 1; + if ((ci == ']') && comment) { + comment = 0; + ci = fgetc(itfp); + } + } while (comment || ((char) ci != ',' && (char) ci != ')') ); + } + + if ((char) ci == '(') { + brac++; + } + if ((char) ci == ')') { + brac--; + } + + n++; + tr[n] = (char) ci; + + } while (((char) ci != ';') && (n != maxlen-2)); + + if (n == maxlen-2) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (tree description too long)\n\n\n"); + exit(1); + } + + if (brac != 0) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (brackets don't match in tree)\n\n\n"); + exit(1); + } + + n++; + tr[n] = '\0'; +} + + +Node *internalnode(Tree *tr, char **chpp, int *ninode) +{ + Node *xp, *np, *rp; + int i, j, dvg, ff, stop, numc; + char ident[100], idcomp[27]; /*CZ*/ + char *idp; + + (*chpp)++; + if (**chpp == '(') { /* process subgroup */ + + xp = internalnode(tr, chpp, ninode); + xp->isop = xp; + dvg = 1; + 
while (**chpp != ')') { + if (**chpp == '\0') { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (unexpected end of tree)\n\n\n"); + exit(1); + } + dvg++; + /* insert edges around node */ + np = internalnode(tr, chpp, ninode); + np->isop = xp->isop; + xp->isop = np; + xp = np; + } + /* closing bracket reached */ + + (*chpp)++; + if (dvg < 2) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (only one OTU inside pair of brackets)\n\n\n"); + exit(1); + } + + if ((*ninode) >= Maxspc-3) { /* all internal nodes already used */ + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (no unrooted tree)\n\n\n"); + exit(1); + } + + rp = tr->ibrnchp[*ninode]; + rp->isop = xp->isop; + xp->isop = rp; + + for (j = 0; j < Numspc; j++) + rp->paths[j] = 0; + xp = rp->isop; + while (xp != rp) { + for (j = 0; j < Numspc; j++) { + if (xp->paths[j] == 1) + rp->paths[j] = 1; + } + xp = xp->isop; + } + (*ninode)++; + + if ((**chpp) == ',' || (**chpp) == ')') return rp->kinp; + if ((**chpp) == '\0') { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (unexpected end of tree)\n\n\n"); + exit(1); + } + + /* read internal label into rp->label (max. 
20 characters) */ + rp->label = new_cvector(21); + (rp->label)[0] = **chpp; + (rp->label)[1] = '\0'; + for (numc = 1; numc < 20; numc++) { + (*chpp)++; + if ((**chpp) == ',' || (**chpp) == ')') return rp->kinp; + if ((**chpp) == '\0') { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (unexpected end of tree)\n\n\n"); + exit(1); + } + (rp->label)[numc] = **chpp; + (rp->label)[numc+1] = '\0'; + } + do { /* skip the rest of the internal label */ + (*chpp)++; + if ((**chpp) == '\0') { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (unexpected end of tree)\n\n\n"); + exit(1); + } + } while (((**chpp) != ',' && (**chpp) != ')')); + + return rp->kinp; + + } else { /* process species names */ + /* read species name */ + for (idp = ident; **chpp != ',' && + **chpp != ')' && **chpp != '\0'; (*chpp)++) { + *idp++ = **chpp; + } + *idp = '\0'; + /* look for internal number */ + idcomp[26] = '\0'; /*CZ*/ + + for (i = 0; i < Maxspc; i++) { + ff = 0; + stop = FALSE; + do { + idcomp[ff] = Identif[i][ff]; + ff++; + if (idcomp[ff-1] == ' ') stop = TRUE; + } while (!stop && (ff != 26)); /*CZ*/ + if (stop) idcomp[ff-1] = '\0'; + + if (!strcmp(ident, idcomp)) { + if (usedtaxa[i]) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (multiple occurence of sequence '"); + FPRINTF(STDOUTFILE "%s' in tree)\n\n\n", ident); + exit(1); + } + usedtaxa[i] = TRUE; + return tr->ebrnchp[i]->kinp; + } + } + + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (unknown sequence '%s' in tree)\n\n\n", ident); + exit(1); + } + return NULL; /* never returned but without some compilers complain */ +} + +/* make tree structure, the tree description may contain internal + labels but no edge lengths */ +void constructtree(Tree *tr, cvector strtree) +{ + char *chp; + int ninode, i; + int dvg, numc; + Node *xp, *np; + + ninode = 0; + chp = strtree; + usedtaxa = new_ivector(Maxspc); + for (i = 0; i < Maxspc; i++) usedtaxa[i] = FALSE; + + xp = internalnode(tr, &chp, &ninode); + xp->isop = xp; + dvg = 1; + while (*chp != ')') { 
/* look for closing bracket */ + if (*chp == '\0') { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (unexpected end of tree)\n\n\n"); + exit(1); + } + dvg++; + /* insert edges around node */ + np = internalnode(tr, &chp, &ninode); + np->isop = xp->isop; + xp->isop = np; + xp = np; + } + + for (i = 0; i < Maxspc; i++) + if (usedtaxa[i] == FALSE) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (sequences missing in tree)\n\n\n"); + exit(1); + } + + /* closing bracket reached */ + if (dvg < 3) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (no unrooted tree)\n\n\n"); + exit(1); + } + tr->rootp = xp; + Numibrnch = ninode; + Numbrnch = Numspc + ninode; + + chp++; + if (*chp == ';' || *chp == '\0') { + free_ivector(usedtaxa); + return; + } + + /* copy last internal label (max. 20 characters) */ + xp->label = new_cvector(21); + (xp->label)[0] = *chp; + (xp->label)[1] = '\0'; + for (numc = 1; numc < 20; numc++) { + chp++; + if (*chp == ';' || *chp == '\0') { + free_ivector(usedtaxa); + return; + } else { + (xp->label)[numc] = *chp; + (xp->label)[numc+1] = '\0'; + } + } + free_ivector(usedtaxa); + return; +} + + +/* remove possible basal bifurcation */ +void removebasalbif(cvector strtree) +{ + int n, c, brak, cutflag, h; + + /* check how many OTUs on basal level */ + n = 0; + c = 0; + brak = 0; + do { + if (strtree[n] == '(') brak++; + if (strtree[n] == ')') brak--; + + if (strtree[n] == ',' && brak == 1) c++; /* number of commas in outer bracket */ + + n++; + } while (strtree[n] != '\0'); + + /* if only 1 OTU inside outer bracket stop now */ + if (c == 0) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (Only 1 OTU inside outer bracket in tree)\n\n\n"); + exit(1); + } + + /* if only 2 OTUs inside outer bracket delete second pair of + brackets from the right to remove basal bifurcation */ + + if (c == 1) { + + n = 0; + brak = 0; + cutflag = 0; /* not yet cutted */ + h = 0; + do { + if (strtree[n] == '(') brak++; + if (strtree[n] == ')') brak--; + + if (brak == 2 && cutflag 
== 0) cutflag = 1; /* cutting */ + if (brak == 1 && cutflag == 1) { + cutflag = 2; /* cutted */ + /* leave out internal label */ + do { + h++; + } while (strtree[n+h] != ')' && strtree[n+h] != ','); + + } + + if (cutflag == 1) strtree[n] = strtree[n+1]; + if (cutflag == 2) strtree[n-1] = strtree[n+h]; + + n++; + } while (strtree[n] != '\0'); + } +} + + +void makeusertree(FILE *itfp) +{ + cvector strtree; + + strtree = new_cvector(23*Maxspc); /* for treefile */ + getusertree(itfp, strtree, 23*Maxspc); + removebasalbif(strtree); + constructtree(Ctree, strtree); + free_cvector(strtree); +} + + +/******************************************************************************/ +/* memory organisation for maximum likelihood tree */ +/******************************************************************************/ + +/* initialise new tree */ +Tree *new_tree(int maxspc, int numptrn, cmatrix seqconint) +{ + int n, i, maxibrnch; + Tree *tr; + Node *dp, *up; + + maxibrnch = maxspc - 3; + heights = (Node **) malloc((unsigned)(maxspc-2) * sizeof(Node *)); + if (heights == NULL) maerror("heights in new_tree"); + tr = (Tree *) malloc(sizeof(Tree)); + if (tr == NULL) maerror("tr in new_tree"); + tr->ebrnchp = (Node **) malloc((unsigned)maxspc * sizeof(Node *)); + if (tr->ebrnchp == NULL) maerror("ebrnchp in new_tree"); + tr->ibrnchp = (Node **) malloc((unsigned)maxibrnch * sizeof(Node *)); + if (tr->ibrnchp == NULL) maerror("ibrnchp in new_tree"); + tr->condlkl = new_dmatrix(numcats, numptrn); + for (n = 0; n < maxspc; n++) { + dp = (Node *) malloc(sizeof(Node)); + if (dp == NULL) maerror("dp in new_tree"); + up = (Node *) malloc(sizeof(Node)); + if (up == NULL) maerror("up in new_tree"); + dp->isop = NULL; + up->isop = NULL; + dp->kinp = up; + up->kinp = dp; + dp->descen = TRUE; + up->descen = FALSE; + dp->number = n; + up->number = n; + dp->length = 0.0; + up->length = 0.0; + dp->lengthc = 0.0; + up->lengthc = 0.0; + dp->varlen = 0.0; + up->varlen = 0.0; + dp->paths = 
new_ivector(maxspc); + up->paths = dp->paths; + for (i = 0; i < maxspc; i++) dp->paths[i] = 0; + dp->paths[n] = 1; + dp->eprob = seqconint[n]; + up->eprob = NULL; + dp->partials = NULL; + up->partials = new_dcube(numcats, numptrn, tpmradix); + tr->ebrnchp[n] = dp; + up->label = NULL; + dp->label = NULL; + } + for (n = 0; n < maxibrnch; n++) { + dp = (Node *) malloc(sizeof(Node)); + if (dp == NULL) maerror("dp in new_tree"); + up = (Node *) malloc(sizeof(Node)); + if (up == NULL) maerror("up in new_tree"); + dp->isop = NULL; + up->isop = NULL; + dp->kinp = up; + up->kinp = dp; + dp->descen = TRUE; + up->descen = FALSE; + dp->number = n; + up->number = n; + dp->length = 0.0; + up->length = 0.0; + dp->lengthc = 0.0; + up->lengthc = 0.0; + dp->varlen = 0.0; + up->varlen = 0.0; + dp->paths = new_ivector(maxspc); + up->paths = dp->paths; + for (i = 0; i < maxspc; i++) dp->paths[i] = 0; + dp->eprob = NULL; + up->eprob = NULL; + dp->partials = new_dcube(numcats, numptrn, tpmradix); + up->partials = new_dcube(numcats, numptrn, tpmradix); + tr->ibrnchp[n] = dp; + up->label = NULL; + dp->label = NULL; + } + tr->rootp = NULL; + + /* + * reserve memory for lengths of the tree branches + * and for the distance matrix as a vector + * (needed for LS estimation of tree branch lengths) + */ + + Brnlength = new_dvector(2 * maxspc - 3); + Distanvec = new_dvector((maxspc * (maxspc - 1)) / 2); + + return tr; +} + + +/* initialise quartet tree */ +Tree *new_quartet(int numptrn, cmatrix seqconint) +{ + int n, i; + Tree *tr; + Node *dp, *up; + + heights = (Node **) malloc((unsigned)2 * sizeof(Node *)); + if (heights == NULL) maerror("heights in new_quartet"); + /* reserve memory for tree */ + tr = (Tree *) malloc(sizeof(Tree)); + if (tr == NULL) maerror("tr in new_quartet"); + tr->ebrnchp = (Node **) malloc((unsigned) 4 * sizeof(Node *)); + if (tr->ebrnchp == NULL) maerror("ebrnchp in new_quartet"); + tr->ibrnchp = (Node **) malloc((unsigned) sizeof(Node *)); + if (tr->ibrnchp == NULL) 
maerror("ibrnchp in new_quartet"); + tr->condlkl = new_dmatrix(numcats, numptrn); + /* reserve memory for nodes */ + for (n = 0; n < 4; n++) { + dp = (Node *) malloc(sizeof(Node)); + if (dp == NULL) maerror("dp in new_quartet"); + up = (Node *) malloc(sizeof(Node)); + if (up == NULL) maerror("dp in new_quartet"); + dp->isop = NULL; + dp->kinp = up; + up->kinp = dp; + dp->descen = TRUE; + up->descen = FALSE; + dp->number = n; + up->number = n; + dp->length = 0.0; + up->length = 0.0; + dp->lengthc = 0.0; + up->lengthc = 0.0; + dp->varlen = 0.0; + up->varlen = 0.0; + dp->paths = new_ivector(4); + up->paths = dp->paths; + for (i = 0; i < 4; i++) dp->paths[i] = 0; + dp->paths[n] = 1; + dp->eprob = seqconint[n]; /* make quartet (0,1)-(2,3) as default */ + up->eprob = NULL; + dp->partials = NULL; + up->partials = new_dcube(numcats, numptrn, tpmradix); + tr->ebrnchp[n] = dp; + } + + /* reserve memory for internal branch */ + dp = (Node *) malloc(sizeof(Node)); + if (dp == NULL) maerror("dp in new_quartet"); + up = (Node *) malloc(sizeof(Node)); + if (up == NULL) maerror("dp in new_quartet"); + dp->isop = tr->ebrnchp[3]->kinp; /* connect internal branch */ + up->isop = tr->ebrnchp[0]->kinp; + dp->kinp = up; + up->kinp = dp; + dp->descen = TRUE; + up->descen = FALSE; + dp->number = 0; + up->number = 0; + dp->length = 0.0; + up->length = 0.0; + dp->lengthc = 0.0; + up->lengthc = 0.0; + dp->varlen = 0.0; + up->varlen = 0.0; + dp->paths = new_ivector(4); + up->paths = dp->paths; + up->paths[0] = 0; + up->paths[1] = 0; + up->paths[2] = 1; + up->paths[3] = 1; + dp->eprob = NULL; + up->eprob = NULL; + dp->partials = new_dcube(numcats, numptrn, tpmradix); + up->partials = new_dcube(numcats, numptrn, tpmradix); + tr->ibrnchp[0] = dp; + + /* place root */ + tr->rootp = up; + + /* connect external branches */ + tr->ebrnchp[0]->kinp->isop = tr->ebrnchp[1]->kinp; + tr->ebrnchp[1]->kinp->isop = tr->rootp; + tr->ebrnchp[3]->kinp->isop = tr->ebrnchp[2]->kinp; + tr->ebrnchp[2]->kinp->isop = 
tr->rootp->kinp; + + /* + * reserve memory for lengths of the five branches + * of a quartet and for the six possible distances + * (needed for LS estimation of branch lengths) + */ + Brnlength = new_dvector(NUMQBRNCH); + Distanvec = new_dvector(NUMQSPC*(NUMQSPC-1)/2); + + return tr; +} + + +/* free tree memory */ +void free_tree(Tree *tr, int taxa) +{ + int n; + Node *dp, *up; + + free(heights); + free_dmatrix(tr->condlkl); + for (n = 0; n < taxa; n++) { + dp = tr->ebrnchp[n]; + up = dp->kinp; + free_ivector(dp->paths); + free_dcube(up->partials); + free(dp); + free(up); + } + free(tr->ebrnchp); + for (n = 0; n < (taxa-3); n++) { + dp = tr->ibrnchp[n]; + up = dp->kinp; + free_dcube(dp->partials); + free_dcube(up->partials); + free_ivector(dp->paths); + free(dp); + free(up); + } + free(tr->ibrnchp); + free(tr); + free_dvector(Brnlength); /* branch lengths (for LS estimation) */ + free_dvector(Distanvec); /* distances (for LS estimation) */ +} + + +/* make (a,b)-(c,d) quartet + + a ---+ +--- c + +-----+ + b ---+ +--- d + + species numbers range from 0 to Maxspc - 1 */ + +void make_quartet(int a, int b, int c, int d) +{ + /* place sequences */ + /*Ctree->ebrnchp[0]->eprob = Seqpat[a]; + Ctree->ebrnchp[1]->eprob = Seqpat[b]; + Ctree->ebrnchp[2]->eprob = Seqpat[c]; + Ctree->ebrnchp[3]->eprob = Seqpat[d]; + CZ */ + /* make distance vector */ + /*Distanvec[0] = Distanmat[b][a]; + Distanvec[1] = Distanmat[c][a]; + Distanvec[2] = Distanmat[c][b]; + Distanvec[3] = Distanmat[d][a]; + Distanvec[4] = Distanmat[d][b]; + Distanvec[5] = Distanmat[d][c]; + CZ */ +} + +/* write distance matrix as vector */ +void changedistan(dmatrix distanmat, dvector distanvec, int numspc) +{ + int i, j, k; + + for (k = 0, i = 1; i < numspc; i++) { + for (j = 0; j < i; j++, k++) + distanvec[k] = distanmat[i][j]; + } +} + + +/******************************************************************************/ +/* computation of maximum likelihood tree */ 
+/******************************************************************************/ + + +/* compute the likelihood for (a,b)-(c,d) quartet */ +double quartet_lklhd(int a, int b, int c, int d) +{ + /* reserve memory for quartet if necessary */ + if (mlmode != 1) { /* no quartet tree */ + if (Ctree != NULL) + free_tree(Ctree, Numspc); + Ctree = new_quartet(Numptrn, Seqpat); + Numbrnch = NUMQBRNCH; + Numibrnch = NUMQIBRNCH; + Numspc = NUMQSPC; + mlmode = 1; + } + + /* make (a,b)-(c,d) quartet */ + make_quartet(a,b,c,d); + + clockmode = 0; /* nonclocklike branch lengths */ + + /* least square estimate for branch length */ + lslength(Ctree, Distanvec, Numspc, Numibrnch, Brnlength); + + /* compute likelihood */ + Ctree->lklhd = optlkl(Ctree); + + return Ctree->lklhd; +} + + +/* compute the approximate likelihood for (a,b)-(c,d) quartet */ +double quartet_alklhd(int a, int b, int c, int d) +{ + /* reserve memory for quartet if necessary */ + if (mlmode != 1) { /* no quartet tree */ + if (Ctree != NULL) + free_tree(Ctree, Numspc); + Ctree = new_quartet(Numptrn, Seqpat); + Numbrnch = NUMQBRNCH; + Numibrnch = NUMQIBRNCH; + Numspc = NUMQSPC; + mlmode = 1; + } + + /* make (a,b)-(c,d) quartet */ + make_quartet(a,b,c,d); + + clockmode = 0; /* nonclocklike branch lengths */ + + /* least square estimate for branch length */ + lslength(Ctree, Distanvec, Numspc, Numibrnch, Brnlength); + + /* compute likelihood */ + Ctree->lklhd = treelkl(Ctree); + + return Ctree->lklhd; +} + + +/* read usertree from file to memory */ +void readusertree(FILE *ifp) +{ + /* reserve memory for tree if necessary */ + if (mlmode != 2) { /* no tree */ + if (Ctree != NULL) + free_tree(Ctree, Numspc); + Ctree = new_tree(Maxspc, Numptrn, Seqpat); + Numbrnch = 2*Maxspc-3; + Numibrnch = Maxspc-3; + Numspc = Maxspc; + mlmode = 2; + } + + /* read tree */ + makeusertree(ifp); +} + + +/* compute the likelihood of a usertree */ +double usertree_lklhd() +{ + + /* CZ 05/16/01 */ + + return 6.66; +} + + +/* compute the 
approximate likelihood of a usertree */ +double usertree_alklhd() +{ + /* CZ 05/16/01 */ + + return 6.66; +} + + +/* preparation for ML analysis */ +void mlstart() +{ + /* number of states and code length */ + tpmradix = gettpmradix(); + + /* declare variables */ + Eval = new_dvector(tpmradix); + Evec = new_dmatrix(tpmradix,tpmradix); + Ievc = new_dmatrix(tpmradix,tpmradix); + iexp = new_dmatrix(tpmradix,tpmradix); + Alias = new_ivector(Maxsite); + + /* process sequence information */ + evaluateseqs(); + bestrate = new_ivector(Numptrn); + + /* compute transition probability matrix */ + tranprobmat(); + + /* non-zero rate categories */ + Rates = new_dvector(numcats); + updaterates(); + ltprobr = new_dcube(numcats, tpmradix,tpmradix); + + /* compute distance matrix */ + Distanmat = new_dvector( Maxspc - 1 ); + initdistan(); + + /* initialize tree pointer for quartet tree */ + /*mlmode = 1; + Ctree = new_quartet(Numptrn, Seqpat); + Numbrnch = NUMQBRNCH; + Numibrnch = NUMQIBRNCH; + Numspc = NUMQSPC; + CZ, */ + /* computing ML distances */ + computedistan(); +} + + +/* recompute ml distances for quartet only */ +void distupdate(int a, int b, int c, int d) +{ + /* update distance matrix */ + /* consider only entries relevant to quartet */ + /* + Distanmat[a][b] = mldistance(a, b); + Distanmat[b][a] = Distanmat[a][b]; + Distanmat[a][c] = mldistance(a, c); + Distanmat[c][a] = Distanmat[a][c]; + Distanmat[a][d] = mldistance(a, d); + Distanmat[d][a] = Distanmat[a][d]; + Distanmat[b][c] = mldistance(b, c); + Distanmat[c][b] = Distanmat[b][c]; + Distanmat[b][d] = mldistance(b, d); + Distanmat[d][b] = Distanmat[b][d]; + Distanmat[c][d] = mldistance(c, d); + Distanmat[d][c] = Distanmat[c][d]; + CZ */ +} + + +/* cleanup after ML analysis */ +void mlfinish() +{ + if (Ctree != NULL) + free_tree(Ctree, Numspc); + free_ivector(bestrate); + free_ivector(Alias); + free_cmatrix(Seqpat); + free_ivector(constpat); + free_ivector(Weight); + free_dvector(Distanmat); /* CZ */ + 
free_dvector(Eval); + free_dmatrix(Evec); + free_dmatrix(Ievc); + free_dvector(Rates); + free_dcube(ltprobr); + free_dmatrix(iexp); +} + + +/******************************************************************************/ +/* tree output */ +/******************************************************************************/ + + +#define MAXOVER 50 +#define MAXLENG 30 +#define MAXCOLUMN 80 + + +void prbranch(Node *up, int depth, int m, int maxm, + ivector umbrella, ivector column, FILE *outfp) +{ + int i, num, n, maxn, lim; + Node *cp; + char bch; + + if ((int)((clockmode ? up->lengthc : up->length) * Proportion) >= MAXOVER) { + column[depth] = MAXLENG; + bch = '+'; + } else { + column[depth] = (int)((clockmode ? up->lengthc : up->length) * Proportion) + 3; + bch = '-'; + } + + if (up->isop == NULL) { /* external branch */ + num = up->number + 1; /* offset */ + if (m == 1) umbrella[depth - 1] = TRUE; + for (i = 0; i < depth; i++) { + if (umbrella[i]) + fprintf(outfp, "%*c", column[i], ':'); + else + fprintf(outfp, "%*c", column[i], ' '); + } + if (m == maxm) + umbrella[depth - 1] = FALSE; + for (i = 0, lim = column[depth] - 3; i < lim; i++) + fputc(bch, outfp); + fprintf(outfp, "-%d ", num); + + fputid(outfp, up->number); + + + fputc('\n', outfp); + fputc(' ', outfp); + return; + } + + num = up->number + 1 + Numspc; /* offset, internal branch */ + for (cp = up->isop, maxn = 0; cp != up; cp = cp->isop, maxn++) + ; + for (cp = up->isop, n = 1; cp != up; cp = cp->isop, n++) { + prbranch(cp->kinp, depth + 1, n, maxn, umbrella, column, outfp); + if (m == 1 && n == maxn / 2) umbrella[depth - 1] = TRUE; + if (n != maxn) { + for (i = 0; i < depth; i++) { + if (umbrella[i]) + fprintf(outfp, "%*c", column[i], ':'); + else + fprintf(outfp, "%*c", column[i], ' '); + } + if (n == maxn / 2) { /* internal branch */ + for (i = 0, lim = column[depth] - 3; i < lim; i++) + fputc(bch, outfp); + if (num < 10) + fprintf(outfp, "--%d", num); + else if (num < 100) + fprintf(outfp, "-%2d", 
num); + else + fprintf(outfp, "%3d", num); + } else { + if (umbrella[depth]) + fprintf(outfp, "%*c", column[depth], ':'); + else + fprintf(outfp, "%*c", column[depth], ' '); + } + fputc('\n', outfp); + fputc(' ', outfp); + } + if (m == maxm) umbrella[depth - 1] = FALSE; + } + return; +} + + +void getproportion(double *proportion, dvector distanvec, int numspc) +{ + int i, maxpair; + double maxdis; + + maxpair = (numspc*(numspc-1))/2; + + maxdis = 0.0; + for (i = 0; i < maxpair; i++) { + if (distanvec[i] > maxdis) { + maxdis = distanvec[i]; + } + } + *proportion = (double) MAXCOLUMN / (maxdis * 3.0); + if (*proportion > 1.0) *proportion = 1.0; +} + + +void prtopology(FILE *outfp) +{ + int n, maxn, depth; + ivector umbrella; + ivector column; + Node *cp, *rp; + + getproportion(&Proportion, Distanvec, Numspc); + + umbrella = new_ivector(Numspc); + column = new_ivector(Numspc); + + for (n = 0; n < Numspc; n++) { + umbrella[n] = FALSE; + column[n] = 3; + } + column[0] = 1; + + fputc(' ', outfp); + + /* original code: rp = Ctree->rootp */ + /* but we want to print the first group in the + trichotomy as outgroup at the bottom! */ + rp = Ctree->rootp->isop; + + for (maxn = 1, cp = rp->isop; cp != rp; cp = cp->isop, maxn++) + ; + depth = 1; + n = 0; + + cp = rp; + do { + cp = cp->isop; + n++; + prbranch(cp->kinp, depth, n, maxn, umbrella, column, outfp); + if (cp != rp) fprintf(outfp, "%*c\n ", column[0], ':'); + } while (cp != rp); + + free_ivector(umbrella); + free_ivector(column); +} + + +/* print unrooted tree file with branch lengths */ +void fputphylogeny(FILE *fp) +{ + Node *cp, *rp; + int n; + + cp = rp = Ctree->rootp; + putc('(', fp); + n = 1; + do { + cp = cp->isop->kinp; + if (cp->isop == NULL) { /* external node */ + if (n > 60) { + fprintf(fp, "\n"); + n = 2; + } + n += fputid(fp, cp->number); + fprintf(fp, ":%.5f", ((clockmode ? 
cp->lengthc : cp->length))*0.01); + n += 7; + cp = cp->kinp; + } else { /* internal node */ + if (cp->descen) { + if (n > 60) { + fprintf(fp, "\n"); + n = 1; + } + putc('(', fp); + n++; + } else { + putc(')', fp); + n++; + if (n > 60) { + fprintf(fp, "\n"); + n = 1; + } + /* internal label */ + if (cp->kinp->label != NULL) { + fprintf(fp, "%s", cp->kinp->label); + n += strlen(cp->kinp->label); + } + fprintf(fp, ":%.5f", ((clockmode ? cp->lengthc : cp->length))*0.01); + n += 7; + } + } + if (!cp->descen && !cp->isop->descen && cp != rp) { + putc(',', fp); /* not last subtree */ + n++; + } + } while (cp != rp); + fprintf(fp, ")"); + /* internal label */ + if (cp->label != NULL) + fprintf(fp, "%s", cp->label); + fprintf(fp, ";\n"); +} + + +void resulttree(FILE *outfp) +{ + int n, ne, closeflag; + Node *ep, *ip; + double blen; + + closeflag = FALSE; + + if (clockmode) { + fprintf(outfp, "\n branch length nc/c"); + fprintf(outfp, " branch length nc/c (= non-clock/clock)\n"); + } else { + fprintf(outfp, "\n branch length S.E."); + fprintf(outfp, " branch length S.E.\n"); + } + for (n = 0; n < Numspc; n++) { + ep = Ctree->ebrnchp[n]; + ne = ep->number; + fputid10(outfp, ne); + fputs(" ", outfp); + fprintf(outfp, "%3d", ne + 1); + blen = (clockmode ? ep->lengthc : ep->length); + fprintf(outfp, "%9.5f", blen*0.01); + if (blen < 5.0*MINARC || blen > 0.95*MAXARC) closeflag = TRUE; + if (clockmode) + fprintf(outfp, "%9.3f", (ep->length)/(ep->lengthc)); + else + fprintf(outfp, "%9.5f", 0.01*sqrt(ep->kinp->varlen)); + if (n < Numibrnch) { + ip = Ctree->ibrnchp[n]; + fprintf(outfp, "%8d", n + 1 + Numspc); + blen = (clockmode ? 
ip->lengthc : ip->length); + fprintf(outfp, "%9.5f", blen*0.01); + if (blen < 5.0*MINARC || blen > 0.95*MAXARC) closeflag = TRUE; + if (clockmode) + fprintf(outfp, "%9.3f", (ip->length)/(ip->lengthc)); + else + fprintf(outfp, "%9.5f", 0.01*sqrt(ip->kinp->varlen)); + fputc('\n', outfp); + } else { + if (n == Numspc - 3) { + fputc('\n', outfp); + } else if (n == Numspc - 2) { + if (clockmode) { + if (!Convergc) + fprintf(outfp, " No convergence after %d iterations!\n", Numitc); + else + fprintf(outfp, " %d iterations until convergence\n", Numitc); + } else { + if (!Converg) + fprintf(outfp, " No convergence after %d iterations!\n", Numit); + else + fprintf(outfp, " %d iterations until convergence\n", Numit); + } + } else if (n == Numspc - 1) { + fprintf(outfp, " log L: %.2f\n", (clockmode ? Ctree->lklhdc : Ctree->lklhd)); + } else { + fputc('\n', outfp); + } + } + } + if(closeflag) + fprintf(outfp, "\nWARNING --- at least one branch length is close to an internal boundary!\n"); +} + + +/******************************************************************************/ +/* Neighbor-joining tree */ +/******************************************************************************/ + + +/* compute NJ tree and write to file */ +void njtree(FILE *fp) +{ + /* reserve memory for tree if necessary */ + if (mlmode != 3) { /* no tree */ + if (Ctree != NULL) + free_tree(Ctree, Numspc); + Ctree = new_tree(Maxspc, Numptrn, Seqpat); + Numbrnch = 2*Maxspc-3; + Numibrnch = Maxspc-3; + Numspc = Maxspc; + mlmode = 3; + } + + /* construct NJ tree from distance matrix */ + njdistantree(Ctree); + + fputphylogeny(fp); +} + + +/* construct NJ tree from distance matrix */ +void njdistantree(Tree *tr) +{ + /* removed, CZ, 05/16/01 */ +} + +/******************************************************************************/ +/* find best assignment of rate categories */ +/******************************************************************************/ + +/* find best assignment of rate categories */ 
+void findbestratecombination() +{ + int k, u; + double bestvalue, fv2; + dvector catprob; + dmatrix cdl; + + cdl = Ctree->condlkl; + catprob = new_dvector(numcats+1); + fv2 = (1.0-fracinv)/(double) numcats; + + for (k = 0; k < Numptrn; k++) { + /* zero rate */ + if (constpat[k] == TRUE) + catprob[0] = fracinv*Freqtpm[(int) Seqpat[0][k]]; + else + catprob[0] = 0.0; + /* non-zero-rates */ + for (u = 1; u < numcats+1; u++) + catprob[u] = fv2*cdl[u-1][k]; + /* find best */ + bestvalue = catprob[0]; + bestrate[k] = 0; + for (u = 1; u < numcats+1; u++) + if (catprob[u] >= bestvalue) { + bestvalue = catprob[u]; + bestrate[k] = u; + } + } + free_dvector(catprob); + bestratefound = 1; +} + +/* print best assignment of rate categories */ +void printbestratecombination(FILE *fp) +{ + int s, k; + + for (s = 0; s < Maxsite; s++) { + k = Alias[s]; + fprintf(fp, "%2d", bestrate[k]); + if ((s+1) % 30 == 0) + fprintf(fp, "\n"); + else if ((s+1) % 10 == 0) + fprintf(fp, " "); + } + if (s % 70 != 0) + fprintf(fp, "\n"); +} + + +/******************************************************************************/ +/* computation of clocklike branch lengths */ +/******************************************************************************/ + +/* checks wether e is a valid edge specification */ +int checkedge(int e) +{ + /* there are Numspc external branches: + 0 - Numspc-1 + there are Numibrnch internal branches: + Numspc - Numspc+Numibrnch-1 + */ + + if (e < 0) return FALSE; + if (e < Numspc+Numibrnch) return TRUE; + else return FALSE; +} + +/* print topology of subtree */ +void fputsubstree(FILE *fp, Node *ip) +{ + Node *cp; + + if (ip->isop == NULL) { /* terminal nodes */ + numtc += fputid(fp, ip->number); + } else { + cp = ip; + fprintf(fp, "("); + numtc += 1; + do { + cp = cp->isop->kinp; + if (cp->isop == NULL) { /* external node */ + numtc += fputid(fp, cp->number); + fprintf(fp, ":%.5f", (cp->lengthc)*0.01); + numtc += 7; + cp = cp->kinp; + } else { /* internal node */ + if 
(cp->height > 0.0) { + fprintf(fp, "("); + numtc += 1; + } else if (cp->height < 0.0) { + fprintf(fp, ")"); + numtc += 1; + if (numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + /* internal label */ + if (cp->kinp->label != NULL) { + fprintf(fp, "%s", cp->kinp->label); + numtc += strlen(cp->kinp->label); + } + if (numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + fprintf(fp, ":%.5f", (cp->lengthc)*0.01); + numtc += 6; + if (numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + } + } + if (cp->height <= 0.0 && cp->isop->height <= 0.0 && + cp->isop != ip) { + putc(',', fp); /* not last subtree */ + numtc += 1; + if (numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + } + } while (cp->isop != ip); + fprintf(fp, ")"); + numtc += 1; + } + if (numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + +} + +/* print rooted tree file */ +void fputrooted(FILE *fp, int e) +{ + Node *rootbr; + + /* to be called only after clocklike branch + lengths have been computed */ + + /* pointer to root branch */ + if (e < Numspc) rootbr = Ctree->ebrnchp[e]; + else rootbr = Ctree->ibrnchp[e - Numspc]; + + fprintf(fp, "("); + numtc = 2; + fputsubstree(fp, rootbr); + /* internal label */ + if (rootbr->label != NULL) { + fprintf(fp, "%s", rootbr->label); + numtc += strlen(rootbr->label); + } + if (numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + fprintf(fp, ":%.5f,", (hroot - rootbr->height)*0.01); + numtc += 7; + if (numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + fputsubstree(fp, rootbr->kinp); + /* internal label */ + if (rootbr->kinp->label != NULL) { + fprintf(fp, "%s", rootbr->kinp->label); + numtc += strlen(rootbr->kinp->label); + } + if (numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + fprintf(fp, ":%.5f);\n", (hroot - rootbr->kinp->height)*0.01); +} + +/* finds heights in subtree */ +void findheights(Node *ip) +{ + Node *cp, *rp; + + if (ip->isop != NULL) { /* forget terminal nodes */ + + cp = ip; + + /* initialise node */ + cp->height = 1.0; /* up */ + rp = 
cp; + while (rp->isop != cp) { + rp = rp->isop; + rp->height = -1.0; /* down */ + } + + do { + cp = cp->isop->kinp; + if (cp->isop == NULL) { /* external node */ + cp = cp->kinp; + } else { /* internal node */ + if (cp->height == 0.0) { /* node not yet visited */ + cp->height = 1.0; /* up */ + rp = cp; + while (rp->isop != cp) { + rp = rp->isop; + rp->height = -1.0; /* down */ + } + } else if (cp->kinp->height == 1.0) { + /* cp->kinp is next height pointer */ + heights[Numhts] = cp->kinp; + Numhts++; + } + } + } while (cp->isop != ip); + /* ip is last height pointer */ + heights[Numhts] = ip; + Numhts++; + } +} + + +/* initialise clocklike branch lengths (with root on edge e) */ +void initclock(int e) +{ + /* CZ */ +} + +/* approximate likelihood under the constaining assumption of + clocklike branch lengths (with root on edge e) */ +double clock_alklhd(int e) +{ + initclock(e); + Ctree->lklhdc = treelkl(Ctree); + + return Ctree->lklhdc; +} + +/* log-likelihood given height ht at node pointed to by chep */ +double heightlkl(double ht) +{ + Node *rp; + double len; + + /* adjust branch lengths */ + chep->height = ht; + /* descendent branches */ + rp = chep; + while (rp->isop != chep) { + rp = rp->isop; + len = chep->height - rp->kinp->height; + rp->kinp->lengthc = len; + rp->lengthc = len; + } + /* upward branch */ + if (chep == rootbr || chep->kinp == rootbr) { + len = (hroot - chep->height) + (hroot - chep->kinp->height); + chep->lengthc = len; + chep->kinp->lengthc = len; + } else { + rp = chep->kinp; + while (rp->isop->height <= 0.0) + rp = rp->isop; + chep->lengthc = rp->isop->height - chep->height; + chep->kinp->lengthc = rp->isop->height - chep->height; + } + + /* compute likelihood */ + Ctree->lklhdc = treelkl(Ctree); + + return -(Ctree->lklhdc); /* we use a minimizing procedure */ +} + +/* optimize current height */ +void optheight(void) +{ + double he, fx, f2x, minh, maxh, len; + Node *rp; + + /* current height */ + he = chep->height; + + /* minimum */ + 
minh = 0.0; + rp = chep; + while (rp->isop != chep) { + rp = rp->isop; + if (rp->kinp->height > minh) + minh = rp->kinp->height; + } + minh += MINARC; + + /* maximum */ + if (chep == rootbr || chep->kinp == rootbr) { + maxh = hroot; + } else { + rp = chep->kinp; + while (rp->isop->height <= 0.0) + rp = rp->isop; + maxh = rp->isop->height; + } + maxh -= MINARC; + + /* check borders for height */ + if (he < minh) he = minh; + if (he > maxh) he = maxh; + + /* optimization */ + if (!(he == minh && he == maxh)) + he = onedimenmin(minh, he, maxh, heightlkl, HEPSILON, &fx, &f2x); + + /* variance of height */ + f2x = fabs(f2x); + if (1.0/(maxhroot*maxhroot) < f2x) + chep->varheight = 1.0/f2x; + else + chep->varheight = maxhroot*maxhroot; + + /* adjust branch lengths */ + chep->height = he; + /* descendent branches */ + rp = chep; + while (rp->isop != chep) { + rp = rp->isop; + len = chep->height - rp->kinp->height; + rp->kinp->lengthc = len; + rp->lengthc = len; + } + /* upward branch */ + if (chep == rootbr || chep->kinp == rootbr) { + len = (hroot - chep->height) + (hroot - chep->kinp->height); + chep->lengthc = len; + chep->kinp->lengthc = len; + } else { + rp = chep->kinp; + while (rp->isop->height <= 0.0) + rp = rp->isop; + chep->lengthc = rp->isop->height - chep->height; + chep->kinp->lengthc = rp->isop->height - chep->height; + } +} + +/* log-likelihood given height ht at root */ +double rheightlkl(double ht) +{ + double len; + + /* adjust branch lengths */ + hroot = ht; + len = (hroot - rootbr->height) + (hroot - rootbr->kinp->height); + rootbr->lengthc = len; + rootbr->kinp->lengthc = len; + + /* compute likelihood */ + Ctree->lklhdc = treelkl(Ctree); + + return -(Ctree->lklhdc); /* we use a minimizing procedure */ +} + +/* optimize height of root */ +void optrheight(void) +{ + double he, fx, f2x, minh, len; + + /* current height */ + he = hroot; + + /* minimum */ + if (rootbr->height > rootbr->kinp->height) + minh = rootbr->height; + else + minh = 
rootbr->kinp->height; + minh += MINARC; + + /* check borders for height */ + if (he < minh) he = minh; + if (he > maxhroot) he = maxhroot; + + /* optimization */ + he = onedimenmin(minh, he, maxhroot, rheightlkl, HEPSILON, &fx, &f2x); + + /* variance of height of root */ + f2x = fabs(f2x); + if (1.0/(maxhroot*maxhroot) < f2x) + varhroot = 1.0/f2x; + else + varhroot = maxhroot*maxhroot; + + /* adjust branch lengths */ + hroot = he; + len = (hroot - rootbr->height) + (hroot - rootbr->kinp->height); + rootbr->lengthc = len; + rootbr->kinp->lengthc = len; +} + +/* exact likelihood under the constaining assumption of + clocklike branch lengths (with root on edge e) */ +double clock_lklhd(int e) +{ + int h, nconv; + double old; + + Numitc = 0; + Convergc = FALSE; + + initclock(e); + + do { + + Numitc++; + nconv = 0; + + /* optimize height of root */ + old = hroot; + optrheight(); + if (fabs(old - hroot) < HEPSILON) nconv++; + + /* optimize height of nodes */ + for (h = Numhts-1; h >= 0; h--) { + + /* pointer chep to current height node */ + chep = heights[h]; + + /* store old value */ + old = chep->height; + + /* find better height */ + optheight(); + + /* converged ? 
*/ + if (fabs(old - chep->height) < HEPSILON) nconv++; + } + + if (nconv == Numhts+1) Convergc = TRUE; + + } while (Numitc < MAXIT && !Convergc); + + /* compute final likelihood */ + Ctree->lklhdc = treelkl(Ctree); + + return Ctree->lklhdc; +} + +/* find out the edge containing the root */ +int findrootedge() +{ + int e, ebest; + double logbest, logtest; + + /* compute the likelihood for all edges and take the edge with + best likelihood (using approximate ML) */ + + ebest = 0; + logbest = clock_alklhd(0); + numbestroot = 1; + for (e = 1; e < Numspc+Numibrnch; e++) { + logtest = clock_alklhd(e); + if (logtest > logbest) { + ebest = e; + logbest = logtest; + numbestroot = 1; + } else if (logtest == logbest) { + numbestroot++; + } + } + + return ebest; +} + +/* show heights and corresponding standard errors */ +void resultheights(FILE *fp) +{ + int h, num; + Node *cp; + + fprintf(fp, " height S.E. of node common to branches\n"); + for (h = 0; h < Numhts; h++) { + fprintf(fp, "%.5f %.5f ", (heights[h]->height)*0.01, + sqrt(heights[h]->varheight)*0.01); + cp = heights[h]; + do { + num = (cp->number) + 1; + if (cp->kinp->isop != NULL) num += Numspc; /* internal branch */ + fprintf(fp, "%d ", num); + cp = cp->isop; + } while (cp != heights[h]); + fprintf(fp, "\n"); + + } + fprintf(fp, "%.5f %.5f of root at branch %d\n", + hroot*0.01, sqrt(varhroot)*0.01, locroot+1); +} + diff --git a/forester/archive/RIO/others/puzzle_dqo/src/ml3.c b/forester/archive/RIO/others/puzzle_dqo/src/ml3.c new file mode 100644 index 0000000..a68a054 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/ml3.c @@ -0,0 +1,350 @@ +/* + * ml3.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. 
See http://www.opensource.org for details. + */ + + +#define EXTERN extern + + +/* prototypes */ +#include +#include +#include +#include "util.h" +#include "ml.h" +#include "gamma.h" + + + +/******************************************************************************/ +/* discrete Gamma-distribution and related stuff */ +/******************************************************************************/ + +/* compare general base frequencies with frequencies of taxon i with chi square */ +double homogentest(int taxon) +{ + return chi2test(Freqtpm, Basecomp[taxon], gettpmradix(), &chi2fail); +} + + +/* discrete Gamma according to Yang 1994 (JME 39:306-314) */ +void YangDiscreteGamma (double shape, int c, dvector x) +{ + double twoc, mu; + int i; + + twoc = 2.0*c; + mu = 0.0; + for (i = 0; i < c; i++) + { + /* corresponding rates */ + x[i] = icdfGamma ( (2.0*i+1.0)/twoc, shape); + mu += x[i]; + } + mu = mu/c; + + /* rescale for avarage rate of 1.0 */ + for (i = 0; i < c; i++) + { + x[i] /= mu; + } +} + +/* compute rates of each category when rates are Gamma-distributed */ +void updaterates() +{ + int i; + double alpha; + + if (numcats == 1) + { + Rates[0] = 1.0; + return; + } + if (Geta == 0.0) + { + for (i = 0; i < numcats; i++) + Rates[i] = 1.0; + return; + } + alpha = (1.0 - Geta)/Geta; + + YangDiscreteGamma (alpha, numcats, Rates); + + /* if invariable sites are present */ + for (i = 0; i < numcats; i++) + Rates[i] = Rates[i]/(1.0-fracinv); + + /* check for very small rates */ + for (i = 0; i < numcats; i++) + if (Rates[i] < 0.000001) Rates[i] = 0.000001; +} + + + +/******************************************************************************/ +/* parameter estimation */ +/******************************************************************************/ + +/* compute sample mean and standard deviation of sample mean */ +void computestat(double *data, int n, double *mean, double *err) +{ + int i; + double sum; + + sum = 0; + for (i = 0; i < n; i++) sum += data[i]; + 
(*mean) = sum/(double) n; + + sum = 0; + for (i = 0; i < n; i++) sum += (data[i] - (*mean))*(data[i] - (*mean)); + if (n != 1) + (*err) = sqrt(sum)/sqrt((double)(n-1)*n); /* unbiased estimator */ + else + (*err) = 0.0; /* if n == 1 */ +} + +/* compute ML value of quartet (a,b,c,d) */ +double quartetml(int a, int b, int c, int d) +{ + double d1, d2, d3; + + /* compute ML for all topologies */ + if (approxp_optn) { /* approximate parameter mode */ + d1 = quartet_alklhd(a,b,c,d); /* (a,b)-(c,d) */ + d2 = quartet_alklhd(a,c,b,d); /* (a,c)-(b,d) */ + d3 = quartet_alklhd(a,d,b,c); /* (a,d)-(b,c) */ + } else { + d1 = quartet_lklhd(a,b,c,d); /* (a,b)-(c,d) */ + d2 = quartet_lklhd(a,c,b,d); /* (a,c)-(b,d) */ + d3 = quartet_lklhd(a,d,b,c); /* (a,d)-(b,c) */ + } + + /* looking for max(d1, d2, d3) */ + if (d1 < d2) { /* d2 > d1 */ + if (d2 < d3) { /* d3 > d2 > d1 */ + /* d3 maximum */ + return d3; + } else { /* d2 >= d3 > d1 */ + /* d2 maximum */ + return d2; + } + } else { /* d1 >= d2 */ + if (d1 < d3) { /* d3 > d1 >= d2 */ + /* d3 maximum */ + return d3; + } else { /* d1 >= d2 && d1 >= d3 */ + /* d1 maximum */ + return d1; + } + } +} + +/* optimization function TSparam - quartets */ +double opttsq(double x) +{ + if (x < MINTS) TSparam = MINTS; + else if (x > MAXTS) TSparam = MAXTS; + else TSparam = x; + tranprobmat(); + distupdate(qca, qcb, qcc, qcd); + return (-quartetml(qca, qcb, qcc, qcd)); +} + +/* optimization function YRparam - quartets */ +double optyrq(double x) +{ + if (x < MINYR) YRparam = MINYR; + else if (x > MAXYR) YRparam = MAXYR; + else YRparam = x; + tranprobmat(); + distupdate(qca, qcb, qcc, qcd); + return (-quartetml(qca, qcb, qcc, qcd)); +} + +/* estimate substitution process parameters - random quartets */ +void optimseqevolparamsq() +{ + double tsmeanold, yrmeanold; + dvector tslist, yrlist; + int fin; + ivector taxon; + uli minqts, maxqts, n; + + + taxon = new_ivector(4); + + /* number of quartets to be investigated */ + minqts = (uli) floor(0.25 * 
MINPERTAXUM * Maxspc) + 1; + maxqts = (uli) floor(0.25 * MAXPERTAXUM * Maxspc) + 1; + if (Maxspc == 4) { + minqts = (uli) 1; + maxqts = (uli) 1; + } + + tslist = new_dvector(maxqts); + yrlist = new_dvector(maxqts); + + /* initialize averages */ + tsmean = TSparam; + yrmean = YRparam; + + fin = FALSE; + + /* investigate maxqts random quartets */ + for (n = 0; n < maxqts; n++) { + + /* choose random quartet */ + chooser(Maxspc, 4, taxon); + + /* + * optimize parameters on this quartet + */ + + qca = taxon[0]; + qcb = taxon[1]; + qcc = taxon[2]; + qcd = taxon[3]; + + /* initialize start values with average value */ + if ((SH_optn || nuc_optn) && optim_optn && (data_optn == 0)) TSparam = tsmean; + if ((nuc_optn && TN_optn) && optim_optn && (data_optn == 0)) YRparam = yrmean; + + /* estimation */ + twodimenmin(PEPS1, + (SH_optn || nuc_optn) && optim_optn && (data_optn == 0), + MINTS, &TSparam, MAXTS, opttsq, &tserr, + (nuc_optn && TN_optn) && optim_optn && (data_optn == 0), + MINYR, &YRparam, MAXYR, optyrq, &yrerr); + + + tsmeanold = tsmean; + yrmeanold = yrmean; + tslist[n] = TSparam; + yrlist[n] = YRparam; + computestat(tslist, n+1 , &tsmean, &tserr); + computestat(yrlist, n+1 , &yrmean, &yrerr); + + /* check whether the means are converging */ + if (n > minqts-2) { + if ((fabs(tsmean-tsmeanold) < TSDIFF) && + (fabs(yrmean-yrmeanold) < YRDIFF)) + fin = TRUE; + } + + /* investigate at least minqts quartets */ + if (n > minqts-2 && (fin || n > maxqts-2)) break; + } + + /* round estimated numbers to 2 digits after the decimal point */ + if (tserr != 0.0) tsmean = floor(100.0*tsmean+0.5)/100.0; + if (yrerr != 0.0) yrmean = floor(100.0*yrmean+0.5)/100.0; + + /* update ML engine */ + TSparam = tsmean; + YRparam = yrmean; + tranprobmat(); + + free_ivector(taxon); +} + +/* optimization function TSparam - tree */ +double opttst(double x) +{ + double result; + + if (x < MINTS) TSparam = MINTS; + else if (x > MAXTS) TSparam = MAXTS; + else TSparam = x; + tranprobmat(); + 
computedistan(); + if (approxp_optn) result = usertree_alklhd(); + else result = usertree_lklhd(); + + return (-result); +} + +/* optimization function YRparam - tree */ +double optyrt(double x) +{ + double result; + + if (x < MINYR) YRparam = MINYR; + else if (x > MAXYR) YRparam = MAXYR; + else YRparam = x; + tranprobmat(); + computedistan(); + if (approxp_optn) result = usertree_alklhd(); + else result = usertree_lklhd(); + + return (-result); +} + + +/* optimize substitution process parameters - tree */ +void optimseqevolparamst() +{ + twodimenmin(PEPS1, + (SH_optn || nuc_optn) && optim_optn && (data_optn == 0), + MINTS, &TSparam, MAXTS, opttst, &tserr, + (nuc_optn && TN_optn) && optim_optn && (data_optn == 0), + MINYR, &YRparam, MAXYR, optyrt, &yrerr); +} + + +/* optimization function fracinv */ +double optfi(double x) +{ + double result; + + if (x < MINFI) fracinv = MINFI; + else if (x > MAXFI) fracinv = MAXFI; + else fracinv = x; + + computedistan(); + if (approxp_optn) result = usertree_alklhd(); + else result = usertree_lklhd(); + + return (-result); +} + + +/* optimization function Geta */ +double optge(double x) +{ + double result; + + if (x < MINGE) Geta = MINGE; + else if (x > MAXGE) Geta = MAXGE; + else Geta = x; + + updaterates(); + + computedistan(); + if (approxp_optn) result = usertree_alklhd(); + else result = usertree_lklhd(); + + return (-result); +} + + +/* optimize rate heterogeneity parameters */ +void optimrateparams() +{ + twodimenmin(PEPS2, + fracinv_optim, + MINFI, &fracinv, fracconst, optfi, &fierr, + grate_optim, + MINGE, &Geta, MAXGE, optge, &geerr); + +} diff --git a/forester/archive/RIO/others/puzzle_dqo/src/model1.c b/forester/archive/RIO/others/puzzle_dqo/src/model1.c new file mode 100644 index 0000000..54fb889 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/model1.c @@ -0,0 +1,326 @@ +/* + * model1.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. 
Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +/* definitions */ +#define EXTERN extern + +/* prototypes */ +#include +#include "util.h" +#include "ml.h" + +/* number of states of the selected model */ +int gettpmradix() +{ + if (data_optn == 0) { /* nucleotides */ + if (nuc_optn) return 4; + if (SH_optn) return 16; + } else if (data_optn == 1) { /* amino acids */ + return 20; + } else { /* two-state model */ + return 2; + } + return 1; +} + +/* relative transition frequencies */ +void rtfdata(dmatrix q, double *f) +{ + double alp, alpy, alpr; + int i, j; + + if (data_optn == 0) + { /* nucleotides */ + + if (nuc_optn) + { /* 4x4 nucleotides */ + alp = 2.0*TSparam; + alpr = (alp * 2.0) / (YRparam + 1.0); + alpy = YRparam * alpr; + + q[0][1] = 1; q[0][2] = alpr; q[0][3] = 1; + q[1][2] = 1; q[1][3] = alpy; + q[2][3] = 1; + + f[0] = 0.25; f[1] = 0.25; f[2] = 0.25; f[3] = 0.25; + } + + if (SH_optn) + { /* 16x16 nucleotides */ + + alp = 2.0*TSparam; + + q[0][1] = 1; q[0][2] = alp; q[0][3] = 1; q[0][4] = 1; + q[0][5] = 0; q[0][6] = 0; q[0][7] = 0; q[0][8] = alp; + q[0][9] = 0; q[0][10] = 0; q[0][11] = 0; q[0][12] = 1; + q[0][13] = 0; q[0][14] = 0; q[0][15] = 0; + + q[1][2] = 1; q[1][3] = alp; q[1][4] = 0; q[1][5] = 1; + q[1][6] = 0; q[1][7] = 0; q[1][8] = 0; q[1][9] = alp; + q[1][10] = 0; q[1][11] = 0; q[1][12] = 0; q[1][13] = 1; + q[1][14] = 0; q[1][15] = 0; + + q[2][3] = 1; q[2][4] = 0; q[2][5] = 0; q[2][6] = 1; + q[2][7] = 0; q[2][8] = 0; q[2][9] = 0; q[2][10] = alp; + q[2][11] = 0; q[2][12] = 0; q[2][13] = 0; q[2][14] = 1; + q[2][15] = 0; + + q[3][4] = 0; q[3][5] = 0; q[3][6] = 0; q[3][7] = 1; + q[3][8] = 0; q[3][9] = 0; q[3][10] = 0; q[3][11] = alp; + q[3][12] = 0; q[3][13] = 0; q[3][14] = 0; q[3][15] = 1; + + q[4][5] = 1; q[4][6] = alp; q[4][7] = 1; q[4][8] = 1; + 
q[4][9] = 0; q[4][10] = 0; q[4][11] = 0; q[4][12] = alp; + q[4][13] = 0; q[4][14] = 0; q[4][15] = 0; + + q[5][6] = 1; q[5][7] = alp; q[5][8] = 0; q[5][9] = 1; + q[5][10] = 0; q[5][11] = 0; q[5][12] = 0; q[5][13] = alp; + q[5][14] = 0; q[5][15] = 0; + + q[6][7] = 1; q[6][8] = 0; q[6][9] = 0; q[6][10] = 1; + q[6][11] = 0; q[6][12] = 0; q[6][13] = 0; q[6][14] = alp; + q[6][15] = 0; + + q[7][8] = 0; q[7][9] = 0; q[7][10] = 0; q[7][11] = 1; + q[7][12] = 0; q[7][13] = 0; q[7][14] = 0; q[7][15] = alp; + + q[8][9] = 1; q[8][10] = alp; q[8][11] = 1; q[8][12] = 1; + q[8][13] = 0; q[8][14] = 0; q[8][15] = 0; + + q[9][10] = 1; q[9][11] = alp; q[9][12] = 0; q[9][13] = 1; + q[9][14] = 0; q[9][15] = 0; + + q[10][11] = 1; q[10][12] = 0; q[10][13] = 0; q[10][14] = 1; + q[10][15] = 0; + + q[11][12] = 0; q[11][13] = 0; q[11][14] = 0; q[11][15] = 1; + + q[12][13] = 1; q[12][14] = alp; q[12][15] = 1; + + q[13][14] = 1; q[13][15] = alp; + + q[14][15] = 1; + + + for (i = 0; i < 16; i++) f[i] = 0.0625; + } + } + else if (data_optn == 1) + { /* amino acids */ + if (Dayhf_optn) /* Dayhoff model */ + { + dyhfdata(q, f); + } + else if (Jtt_optn) /* JTT model */ + { + jttdata(q, f); + } + else if (blosum62_optn) /* BLOSUM 62 model */ + { + blosum62data(q, f); + } + else if (mtrev_optn) /* mtREV model */ + { + mtrevdata(q, f); + } + else if (cprev_optn) /* cpREV model */ + { + cprev45data(q, f); + } + else if (vtmv_optn) /* VT model */ + { + vtmvdata(q, f); + } + else /* if (wag_optn) */ /* WAG model */ + { + wagdata(q, f); + } + + } + else /* two-state model */ + { + q[0][1] = 1.0; + + f[0] = 0.5; f[1] = 0.5; + } + + /* fill matrix from upper triangle */ + for (i = 0; i < tpmradix; i++) + { + q[i][i] = 0.0; + for (j = i+1; j < tpmradix; j++) + { + q[j][i] = q[i][j]; + } + } +} + +/* transform letter codes to state numbers */ +int code2int(cvector c) +{ if (data_optn == 0) { /* nucleotides */ + if (nuc_optn) { /* 4x4 */ + switch (c[0]) { + case 'A': return 0; + case 'C': return 1; + case 'G': 
return 2; + case 'T': return 3; + case 'U': return 3; + default : return 4; + } + } + if (SH_optn) { /* 16x16 */ + if (c[0] == 'A') { + switch (c[1]) { + case 'A': return 0; /* AA */ + case 'C': return 1; /* AC */ + case 'G': return 2; /* AG */ + case 'T': return 3; /* AT */ + case 'U': return 3; /* AT */ + default: return 16; + } + } + if (c[0] == 'C') { + switch (c[1]) { + case 'A': return 4; /* CA */ + case 'C': return 5; /* CC */ + case 'G': return 6; /* CG */ + case 'T': return 7; /* CT */ + case 'U': return 7; /* CT */ + default: return 16; + } + } + if (c[0] == 'G') { + switch (c[1]) { + case 'A': return 8; /* GA */ + case 'C': return 9; /* GC */ + case 'G': return 10; /* GG */ + case 'T': return 11; /* GT */ + case 'U': return 11; /* GT */ + default: return 16; + } + } + if (c[0] == 'T' || c[0] == 'U') { + switch (c[1]) { + case 'A': return 12; /* TA */ + case 'C': return 13; /* TC */ + case 'G': return 14; /* TG */ + case 'T': return 15; /* TT */ + case 'U': return 15; /* TT */ + default: return 16; + } + } + return 16; + } + } else if (data_optn == 1) { /* amino acids */ + switch (c[0]) { + case 'A': return 0; + case 'C': return 4; + case 'D': return 3; + case 'E': return 6; + case 'F': return 13; + case 'G': return 7; + case 'H': return 8; + case 'I': return 9; + case 'K': return 11; + case 'L': return 10; + case 'M': return 12; + case 'N': return 2; + case 'P': return 14; + case 'Q': return 5; + case 'R': return 1; + case 'S': return 15; + case 'T': return 16; + case 'V': return 19; + case 'W': return 17; + case 'Y': return 18; + default : return 20; + } + } else { /* two-state model */ + switch (c[0]) { + case '0': return 0; + case '1': return 1; + default : return 2; + } + } + return 0; +} + +/* return letter code belonging to state number */ +char *int2code(int s) +{ + if (data_optn == 0) { /* nucleotides */ + if (nuc_optn) { /* 4x4 */ + switch (s) { + case 0: return "A"; + case 1: return "C"; + case 2: return "G"; + case 3: return "T"; + default : 
return "?"; + } + } + if (SH_optn) { /* 16x16 */ + switch (s) { + case 0: return "AA"; + case 1: return "AC"; + case 2: return "AG"; + case 3: return "AT"; + case 4: return "CA"; + case 5: return "CC"; + case 6: return "CG"; + case 7: return "CT"; + case 8: return "GA"; + case 9: return "GC"; + case 10: return "GG"; + case 11: return "GT"; + case 12: return "TA"; + case 13: return "TC"; + case 14: return "TG"; + case 15: return "TT"; + default : return "??"; + } + } + } else if (data_optn == 1) { /* amino acids */ + switch (s) { + case 0: return "A"; + case 1: return "R"; + case 2: return "N"; + case 3: return "D"; + case 4: return "C"; + case 5: return "Q"; + case 6: return "E"; + case 7: return "G"; + case 8: return "H"; + case 9: return "I"; + case 10: return "L"; + case 11: return "K"; + case 12: return "M"; + case 13: return "F"; + case 14: return "P"; + case 15: return "S"; + case 16: return "T"; + case 17: return "W"; + case 18: return "Y"; + case 19: return "V"; + default : return "?"; + } + } else { /* two-state model */ + switch (s) { + case 0: return "0"; + case 1: return "1"; + default : return "?"; + } + } + return "?"; +} diff --git a/forester/archive/RIO/others/puzzle_dqo/src/model2.c b/forester/archive/RIO/others/puzzle_dqo/src/model2.c new file mode 100644 index 0000000..9e2197f --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/model2.c @@ -0,0 +1,1125 @@ +/* + * model2.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +/* definitions */ +#define EXTERN extern + +/* prototypes */ +#include +#include "util.h" +#include "ml.h" + + +void jttdata(dmatrix q, double *f) +{ + /* + * JTT model for amino acid evolution + * D.T. 
Jones, W.R. Taylor, and J.M. Thornton + * "The rapid generation of mutation data matrices from protein sequences" + * CABIOS vol. 8 no. 3 1992 pp. 275-282 + */ + + q[0][1]=3.1628651460584e+00; q[0][2]=3.2804935927860e+00; + q[0][3]=4.8477237048666e+00; q[0][4]=3.4612244897959e+00; + q[0][5]=3.3130910900946e+00; q[0][6]=6.3199473337722e+00; + q[0][7]=1.0440154440154e+01; q[0][8]=1.3061224489796e+00; + q[0][9]=2.1726844583987e+00; q[0][10]=1.8443597219107e+00; + q[0][11]=2.2137668626773e+00; q[0][12]=2.7210884353741e+00; + q[0][13]=8.3265306122449e-01; q[0][14]=1.1537414965986e+01; + q[0][15]=2.2838213546288e+01; q[0][16]=2.7007955724663e+01; + q[0][17]=5.1311953352770e-01; q[0][18]=8.3673469387755e-01; + q[0][19]=1.7474335188621e+01; + + q[1][2]=2.6598918637222e+00; q[1][3]=9.1014867485456e-01; + q[1][4]=6.1624649859944e+00; q[1][5]=1.8036482885837e+01; + q[1][6]=1.8924731182796e+00; q[1][7]=8.1810886516769e+00; + q[1][8]=1.9119717452198e+01; q[1][9]=1.4410687351864e+00; + q[1][10]=2.2211961707760e+00; q[1][11]=3.9239234676922e+01; + q[1][12]=2.5060690943044e+00; q[1][13]=3.9439775910364e-01; + q[1][14]=4.1953094963476e+00; q[1][15]=5.9016766126741e+00; + q[1][16]=3.8437069743152e+00; q[1][17]=7.6766706682673e+00; + q[1][18]=1.4173669467787e+00; q[1][19]=1.0308123249300e+00; + + q[2][3]=3.2226935854843e+01; q[2][4]=1.8710963455150e+00; + q[2][5]=4.5351268130622e+00; q[2][6]=3.3951344979102e+00; + q[2][7]=4.5987249708180e+00; q[2][8]=2.3693774375271e+01; + q[2][9]=2.9235880398671e+00; q[2][10]=8.0960899565551e-01; + q[2][11]=1.5024269384537e+01; q[2][12]=1.9003322259136e+00; + q[2][13]=4.3853820598007e-01; q[2][14]=7.1083317047749e-01; + q[2][15]=2.9456208772690e+01; q[2][16]=1.3735908553410e+01; + q[2][17]=1.6706217370669e-01; q[2][18]=4.1661129568106e+00; + q[2][19]=9.7452934662237e-01; + + q[3][4]=6.2857142857143e-01; q[3][5]=3.0662020905923e+00; + q[3][6]=4.5450549450549e+01; q[3][7]=7.5402435402435e+00; + q[3][8]=6.0544672718586e+00; q[3][9]=6.8808114961961e-01; 
+ q[3][10]=3.6130902064968e-01; q[3][11]=1.6718197057180e+00; + q[3][12]=1.0879120879121e+00; q[3][13]=1.9340659340659e-01; + q[3][14]=7.3949579831933e-01; q[3][15]=3.4196528109572e+00; + q[3][16]=2.4749487800335e+00; q[3][17]=3.4536891679749e-01; + q[3][18]=2.6895604395604e+00; q[3][19]=1.8608058608059e+00; + + q[4][5]=5.5191637630662e-01; q[4][6]=3.2442396313364e-01; + q[4][7]=3.3297297297297e+00; q[4][8]=4.3726708074534e+00; + q[4][9]=9.1868131868132e-01; q[4][10]=9.9466248037677e-01; + q[4][11]=2.9830508474576e-01; q[4][12]=2.4095238095238e+00; + q[4][13]=4.1485714285714e+00; q[4][14]=7.3949579831933e-01; + q[4][15]=1.2862939958592e+01; q[4][16]=2.8125907990315e+00; + q[4][17]=6.8244897959184e+00; q[4][18]=1.2885714285714e+01; + q[4][19]=3.7714285714286e+00; + + q[5][6]=2.0316061593796e+01; q[5][7]=1.3922214897825e+00; + q[5][8]=3.3861536130889e+01; q[5][9]=4.7172339855267e-01; + q[5][10]=4.2320327755868e+00; q[5][11]=1.7835941652395e+01; + q[5][12]=2.6573751451800e+00; q[5][13]=2.7595818815331e-01; + q[5][14]=9.4992143198743e+00; q[5][15]=3.2350653941322e+00; + q[5][16]=3.0973838067678e+00; q[5][17]=1.0512692882031e+00; + q[5][18]=1.5331010452962e+00; q[5][19]=1.0778164924506e+00; + + q[6][7]=6.6857641051189e+00; q[6][8]=1.4458024443999e+00; + q[6][9]=6.7068415455512e-01; q[6][10]=5.7932850559579e-01; + q[6][11]=1.0365070686558e+01; q[6][12]=1.0138248847926e+00; + q[6][13]=2.6359447004608e-01; q[6][14]=1.1291226167887e+00; + q[6][15]=1.8337006611901e+00; q[6][16]=1.9520424900414e+00; + q[6][17]=6.9519420671494e-01; q[6][18]=3.8018433179723e-01; + q[6][19]=2.7772657450077e+00; + + q[7][8]=1.2113479939567e+00; q[7][9]=3.2670032670033e-01; + q[7][10]=4.1817641817642e-01; q[7][11]=1.6354950592239e+00; + q[7][12]=7.6447876447876e-01; q[7][13]=3.0579150579151e-01; + q[7][14]=1.2391551215081e+00; q[7][15]=1.1138492529797e+01; + q[7][16]=1.8888816176952e+00; q[7][17]=3.3491450634308e+00; + q[7][18]=3.1853281853282e-01; q[7][19]=2.8416988416988e+00; + + 
q[8][9]=1.0931677018634e+00; q[8][10]=3.2194389461470e+00; + q[8][11]=3.1498052426571e+00; q[8][12]=1.9130434782609e+00; + q[8][13]=2.7329192546584e+00; q[8][14]=6.7304834977469e+00; + q[8][15]=4.3726708074534e+00; q[8][16]=2.8162964522581e+00; + q[8][17]=7.8083407275954e-01; q[8][18]=3.5118012422360e+01; + q[8][19]=7.2877846790890e-01; + + q[9][10]=1.4069798333535e+01; q[9][11]=1.2292791953809e+00; + q[9][12]=2.8366300366300e+01; q[9][13]=4.7384615384615e+00; + q[9][14]=5.8780435251023e-01; q[9][15]=2.4105749323141e+00; + q[9][16]=1.5243062022723e+01; q[9][17]=8.2888540031397e-01; + q[9][18]=1.8434065934066e+00; q[9][19]=5.7699633699634e+01; + + q[10][11]=8.8039805231089e-01; q[10][12]=2.2425954997384e+01; + q[10][13]=1.5099529042386e+01; q[10][14]=6.2626896912611e+00; + q[10][15]=3.4917298022888e+00; q[10][16]=1.6109411169944e+00; + q[10][17]=3.2366001345593e+00; q[10][18]=1.4505494505495e+00; + q[10][19]=1.0557823129252e+01; + + q[11][12]=3.6577885391445e+00; q[11][13]=1.4915254237288e-01; + q[11][14]=1.2868062479229e+00; q[11][15]=2.8162964522581e+00; + q[11][16]=5.7494151926786e+00; q[11][17]=5.4790729851263e-01; + q[11][18]=5.3268765133172e-01; q[11][19]=7.4899112187248e-01; + + q[12][13]=2.5666666666667e+00; q[12][14]=9.4491129785247e-01; + q[12][15]=1.6397515527950e+00; q[12][16]=1.2180790960452e+01; + q[12][17]=1.1972789115646e+00; q[12][18]=1.1130952380952e+00; + q[12][19]=1.7746031746032e+01; + + q[13][14]=8.8739495798319e-01; q[13][15]=5.6298136645963e+00; + q[13][16]=8.3099273607748e-01; q[13][17]=3.3224489795918e+00; + q[13][18]=3.3392857142857e+01; q[13][19]=3.6000000000000e+00; + + q[14][15]=1.6261762676085e+01; q[14][16]=6.8852490148602e+00; + q[14][17]=4.2256902761104e-01; q[14][18]=6.7787114845938e-01; + q[14][19]=1.2549019607843e+00; + + q[15][16]=2.7891216619293e+01; q[15][17]=1.8740017746229e+00; + q[15][18]=3.7349896480331e+00; q[15][19]=2.4182194616977e+00; + + q[16][17]=4.8702870978900e-01; q[16][18]=1.1985472154964e+00; + 
q[16][19]=6.7925746569814e+00; + + q[17][18]=4.6020408163265e+00; q[17][19]=1.4693877551020e+00; + + q[18][19]=1.0000000000000e+00; + + + f[0] = 0.077; f[1] = 0.051; f[2] = 0.043; f[3] = 0.052; + f[4] = 0.02; f[5] = 0.041; f[6] = 0.062; f[7] = 0.074; + f[8] = 0.023; f[9] = 0.052; f[10] = 0.091; f[11] = 0.059; + f[12] = 0.024; f[13] = 0.04; f[14] = 0.051; f[15] = 0.069; + f[16] = 0.059; f[17] = 0.014; f[18] = 0.032; f[19] = 0.066; +} + +void dyhfdata(dmatrix q, double *f) +{ + /* + * Dayhoff model for amino acid evolution + * Dayhoff, M.O., Schwartz, R.M., Orcutt, B.C. (1978) + * "A model of evolutionary change in proteins." + * Dayhoff, M.O. (ed.) Atlas of Protein Sequence Structur., Vol5, Suppl. 3, + * National Biomedical Research Foundation, Washington DC, pp. 345-352. + */ + + q[0][1]=9.6472567159749e-01; q[0][2]=3.5927991886410e+00; + q[0][3]=4.3200552414656e+00; q[0][4]=1.3184584178499e+00; + q[0][5]=3.2267534963169e+00; q[0][6]=7.0141987829615e+00; + q[0][7]=8.5773867857875e+00; q[0][8]=8.1434196396611e-01; + q[0][9]=2.3518447453539e+00; q[0][10]=1.4735711728911e+00; + q[0][11]=9.3940162271805e-01; q[0][12]=2.5490196078431e+00; + q[0][13]=6.5922920892495e-01; q[0][14]=8.9189834148670e+00; + q[0][15]=1.4540712836859e+01; q[0][16]=1.3411904595370e+01; + q[0][17]=3.8517964118027e-02; q[0][18]=8.7897227856660e-01; + q[0][19]=7.4036511156187e+00; + + q[1][2]=1.1890243902439e+00; q[1][3]=5.9525626545377e-02; + q[1][4]=8.4778922655537e-01; q[1][5]=8.8348561504191e+00; + q[1][6]=5.5954088952654e-02; q[1][7]=3.1434881434075e-01; + q[1][8]=8.4753987678285e+00; q[1][9]=2.2684090115941e+00; + q[1][10]=5.5954088952654e-01; q[1][11]=1.6681312769010e+01; + q[1][12]=3.1707317073171e+00; q[1][13]=4.8959827833572e-01; + q[1][14]=3.6754156468900e+00; q[1][15]=5.4755072760812e+00; + q[1][16]=9.6472567159749e-01; q[1][17]=7.5538020086083e+00; + q[1][18]=2.7977044476327e-01; q[1][19]=8.6083213773314e-01; + + q[2][3]=3.2459324155194e+01; q[2][4]=7.3852625416383e-02; + 
q[2][5]=3.7732198142415e+00; q[2][6]=5.3911764705882e+00; + q[2][7]=5.0264375413087e+00; q[2][8]=1.9061418685121e+01; + q[2][9]=2.7901430842607e+00; q[2][10]=1.2482698961938e+00; + q[2][11]=1.1542279411765e+01; q[2][12]=1.9117647058824e-01; + q[2][13]=5.0183823529412e-01; q[2][14]=1.5181660899654e+00; + q[2][15]=1.7697478991597e+01; q[2][16]=8.3557302231237e+00; + q[2][17]=8.6029411764706e-01; q[2][18]=3.4411764705882e+00; + q[2][19]=5.7352941176471e-01; + + q[3][4]=2.5534152404601e-02; q[3][5]=4.8811013767209e+00; + q[3][6]=4.0561952440551e+01; q[3][7]=4.4423506911730e+00; + q[3][8]=3.0865788117500e+00; q[3][9]=8.5749078239692e-01; + q[3][10]=2.5926985518518e-02; q[3][11]=2.5930851063830e+00; + q[3][12]=1.1667143483333e-01; q[3][13]=1.2963492759259e-02; + q[3][14]=4.7853935065891e-01; q[3][15]=3.4167709637046e+00; + q[3][16]=2.3984722282163e+00; q[3][17]=3.2408731898147e-02; + q[3][18]=8.1351689612015e-02; q[3][19]=6.3829787234043e-01; + + q[4][5]=2.1864264103535e-02; q[4][6]=1.4770525083277e-02; + q[4][7]=3.9055458751427e-01; q[4][8]=1.0223340673168e+00; + q[4][9]=1.5970515970516e+00; q[4][10]=3.9098448749850e-02; + q[4][11]=8.0776309049169e-03; q[4][12]=1.4155086538140e-01; + q[4][13]=8.6898395721925e-02; q[4][14]=6.8155604487784e-01; + q[4][15]=5.8097784568373e+00; q[4][16]=5.9929928084086e-01; + q[4][17]=3.4759358288770e-01; q[4][18]=3.4759358288770e+00; + q[4][19]=1.7647058823529e+00; + + q[5][6]=2.5476780185759e+01; q[5][7]=1.0174974779977e+00; + q[5][8]=2.1573939173192e+01; q[5][9]=6.5266504894988e-01; + q[5][10]=2.6634492806410e+00; q[5][11]=5.5466331269350e+00; + q[5][12]=4.0247678018576e+00; q[5][13]=1.8038017885416e-02; + q[5][14]=5.5044618466582e+00; q[5][15]=2.0267580716497e+00; + q[5][16]=1.9256432155439e+00; q[5][17]=9.6202762055552e-02; + q[5][18]=1.0061919504644e-01; q[5][19]=1.2538699690402e+00; + + q[6][7]=2.8869795109055e+00; q[6][8]=1.5519031141869e+00; + q[6][9]=2.1701112877583e+00; q[6][10]=4.0484429065744e-01; + 
q[6][11]=2.9823529411765e+00; q[6][12]=1.0705882352941e+00; + q[6][13]=1.9801735189768e-02; q[6][14]=1.7993079584775e+00; + q[6][15]=2.8184873949580e+00; q[6][16]=1.2261663286004e+00; + q[6][17]=7.3114099162219e-02; q[6][18]=7.6470588235294e-01; + q[6][19]=1.3058823529412e+00; + + q[7][8]=3.7906768788150e-01; q[7][9]=2.3128004846840e-02; + q[7][10]=2.5776602775942e-01; q[7][11]=9.6662260409782e-01; + q[7][12]=6.0145406477198e-01; q[7][13]=5.4775280898876e-01; + q[7][14]=1.2382877804129e+00; q[7][15]=8.2853366065527e+00; + q[7][16]=1.1110604644803e+00; q[7][17]=1.2888301387971e-01; + q[7][18]=1.7114723586662e-02; q[7][19]=1.9233311302049e+00; + + q[8][9]=2.7354343963341e-01; q[8][10]=1.5876246692449e+00; + q[8][11]=9.6993944636678e-01; q[8][12]=1.2544085640577e-01; + q[8][13]=1.6868512110727e+00; q[8][14]=3.3075513942601e+00; + q[8][15]=1.2530894710826e+00; q[8][16]=8.1434196396611e-01; + q[8][17]=1.0121107266436e+00; q[8][18]=4.4982698961938e+00; + q[8][19]=1.5570934256055e+00; + + q[9][10]=9.2275320303002e+00; q[9][11]=1.6663354531002e+00; + q[9][12]=1.1780604133545e+01; q[9][13]=6.9753577106518e+00; + q[9][14]=4.2551201720752e-01; q[9][15]=8.8575970928912e-01; + q[9][16]=6.8951811852420e+00; q[9][17]=9.8802836705702e-02; + q[9][18]=1.3434022257552e+00; q[9][19]=3.1526232114467e+01; + + q[10][11]=6.5787197231834e-01; q[10][12]=1.8622837370242e+01; + q[10][13]=5.6340830449827e+00; q[10][14]=1.1377976796255e+00; + q[10][15]=6.1690558576372e-01; q[10][16]=1.2098794893211e+00; + q[10][17]=1.7543252595156e+00; q[10][18]=1.0346020761246e+00; + q[10][19]=6.2906574394464e+00; + + q[11][12]=8.6029411764706e+00; q[11][13]=6.6640454965565e-03; + q[11][14]=1.2089100346021e+00; q[11][15]=3.4411764705882e+00; + q[11][16]=4.9442190669371e+00; q[11][17]=3.4272233982290e-02; + q[11][18]=4.7794117647059e-01; q[11][19]=3.7500000000000e-01; + + q[12][13]=3.2500000000000e+00; q[12][14]=5.9976931949250e-01; + q[12][15]=2.1848739495798e+00; q[12][16]=3.6916835699797e+00; + 
q[12][17]=1.6247577591604e-01; q[12][18]=1.1508700794053e-01; + q[12][19]=9.0588235294118e+00; + + q[13][14]=3.9359861591695e-01; q[13][15]=1.6386554621849e+00; + q[13][16]=4.9442190669371e-01; q[13][17]=2.8676470588235e+00; + q[13][18]=2.4852941176471e+01; q[13][19]=4.4117647058824e-01; + + q[14][15]=8.6431043005437e+00; q[14][16]=2.8308077795013e+00; + q[14][17]=3.5840244687362e-02; q[14][18]=4.3804743506776e-02; + q[14][19]=1.7301038062284e+00; + + q[15][16]=1.9663865546218e+01; q[15][17]=2.7857142857143e+00; + q[15][18]=1.2016806722689e+00; q[15][19]=1.0840336134454e+00; + + q[16][17]=4.2019597219666e-02; q[16][18]=1.5162271805274e+00; + q[16][19]=5.6592292089249e+00; + + q[17][18]=2.2941176470588e+00; q[17][19]=1.2654363316538e-01; + + q[18][19]=1.0000000000000e+00; + + + f[0] = 0.087; f[1] = 0.041; f[2] = 0.040; f[3] = 0.047; + f[4] = 0.033; f[5] = 0.038; f[6] = 0.05; f[7] = 0.089; + f[8] = 0.034; f[9] = 0.037; f[10] = 0.085; f[11] = 0.08; + f[12] = 0.015; f[13] = 0.04; f[14] = 0.051; f[15] = 0.07; + f[16] = 0.058; f[17] = 0.01; f[18] = 0.03; f[19] = 0.065; +} + +void mtrevdata(dmatrix q, double *f) +{ + /* + * mtREV24 model of amino acid evolution + * (complete sequence data of mtDNA from 24 vertebrate species) + * Adachi, J. and Hasegawa, M. 
(1996) + */ + + q[0][1]=1.2199217606346e+01; q[0][2]=1.4182139942122e+01; + q[0][3]=9.2985091873208e+00; q[0][4]=3.1542792981957e+01; + q[0][5]=1.0025852846688e+00; q[0][6]=5.1418866803338e+00; + q[0][7]=6.3531246495131e+01; q[0][8]=7.3137132861715e+00; + q[0][9]=5.0782382656186e+01; q[0][10]=1.3399741808481e+01; + q[0][11]=4.4021672780560e+00; q[0][12]=7.4673480520104e+01; + q[0][13]=3.3513021631978e+00; q[0][14]=2.8582502221773e+01; + q[0][15]=2.0413623195312e+02; q[0][16]=2.5301305153906e+02; + q[0][17]=1.0000000000000e+00; q[0][18]=3.4084158197615e+00; + q[0][19]=1.0266468401249e+02; + + q[1][2]=6.9661274444534e+00; q[1][3]=1.0000000000000e+00; + q[1][4]=5.4384584796568e+01; q[1][5]=1.1631134513343e+02; + q[1][6]=1.0000000000000e+00; q[1][7]=1.2122831341194e+01; + q[1][8]=8.6961067087353e+01; q[1][9]=1.0000000000000e+00; + q[1][10]=8.1976829394538e+00; q[1][11]=7.4423215395318e+01; + q[1][12]=1.0000000000000e+00; q[1][13]=2.4659158338099e+00; + q[1][14]=1.2439947713615e+01; q[1][15]=3.1791814866372e+00; + q[1][16]=1.0935327216119e+00; q[1][17]=1.1550775790126e+01; + q[1][18]=1.0000000000000e+00; q[1][19]=4.0211417480338e+00; + + q[2][3]=4.1809325468160e+02; q[2][4]=3.1020979842967e+01; + q[2][5]=9.1349622725361e+01; q[2][6]=3.3185663516310e+01; + q[2][7]=2.8052324651124e+01; q[2][8]=2.6112087577885e+02; + q[2][9]=1.4261453863336e+01; q[2][10]=7.9775653461977e+00; + q[2][11]=3.2036829276162e+02; q[2][12]=3.4424354918739e+01; + q[2][13]=7.9996445145608e+00; q[2][14]=3.8586541461044e+01; + q[2][15]=2.6020426225852e+02; q[2][16]=1.2550758780474e+02; + q[2][17]=5.6207759736659e+00; q[2][18]=1.0071406219571e+02; + q[2][19]=1.0000000000000e+00; + + q[3][4]=1.0000000000000e+00; q[3][5]=2.9097352675564e+01; + q[3][6]=3.0713149855302e+02; q[3][7]=2.9877072751897e+01; + q[3][8]=5.9995408885817e+01; q[3][9]=2.2827096245105e+00; + q[3][10]=1.0000000000000e+00; q[3][11]=1.2183938185384e+00; + q[3][12]=1.0000000000000e+00; q[3][13]=2.6221929413096e+00; + 
q[3][14]=7.0708004204733e+00; q[3][15]=3.6327934317139e+01; + q[3][16]=1.4743408713748e+01; q[3][17]=1.0453246057102e+01; + q[3][18]=1.1165627147496e+01; q[3][19]=1.0000000000000e+00; + + q[4][5]=3.9599394038972e+01; q[4][6]=1.0000000000000e+00; + q[4][7]=1.6163581056674e+01; q[4][8]=7.4467985406234e+01; + q[4][9]=3.3018175376623e+01; q[4][10]=1.3500725995091e+01; + q[4][11]=1.0000000000000e+00; q[4][12]=3.2504095376923e+00; + q[4][13]=3.7264767083096e+01; q[4][14]=1.6454136037822e+01; + q[4][15]=1.4581783243113e+02; q[4][16]=9.4720031458442e+01; + q[4][17]=1.7684087896962e+01; q[4][18]=1.3409157685926e+02; + q[4][19]=1.0000000000000e+00; + + q[5][6]=1.6503249008836e+02; q[5][7]=3.5530760735494e+00; + q[5][8]=3.0652523140859e+02; q[5][9]=4.3905393139325e+00; + q[5][10]=2.0895470525345e+01; q[5][11]=2.4504076430724e+02; + q[5][12]=2.4931300477797e+01; q[5][13]=1.0059428264289e+01; + q[5][14]=7.2256314165467e+01; q[5][15]=2.8480937892158e+01; + q[5][16]=4.9962974409828e+01; q[5][17]=1.0000000000000e+00; + q[5][18]=2.0430790980529e+01; q[5][19]=9.9986289000676e+00; + + q[6][7]=1.4884496769963e+01; q[6][8]=2.5853576435567e+01; + q[6][9]=1.7418201388328e+00; q[6][10]=1.0000000000000e+00; + q[6][11]=1.6519126809071e+02; q[6][12]=1.0000000000000e+00; + q[6][13]=1.4067850525292e+00; q[6][14]=6.7547121641947e+00; + q[6][15]=2.8794794140840e+01; q[6][16]=7.8001372062558e+00; + q[6][17]=1.0000000000000e+00; q[6][18]=6.9067239183061e+00; + q[6][19]=1.1127702362585e+01; + + q[7][8]=1.0000000000000e+00; q[7][9]=3.1466649021550e+00; + q[7][10]=1.2699794194865e+00; q[7][11]=1.1962111069278e+01; + q[7][12]=1.0000000000000e+00; q[7][13]=1.0000000000000e+00; + q[7][14]=1.0000000000000e+00; q[7][15]=6.6277950574411e+01; + q[7][16]=5.8800079133028e+00; q[7][17]=5.7494182626674e+00; + q[7][18]=1.6887657206208e+00; q[7][19]=1.3320553471351e+00; + + q[8][9]=6.4536986087271e+00; q[8][10]=6.0472584534958e+00; + q[8][11]=6.7197196398961e+01; q[8][12]=6.2977633277779e+00; + 
q[8][13]=2.5347805183364e+01; q[8][14]=3.2089868698728e+01; + q[8][15]=4.0766987134407e+01; q[8][16]=2.3570850628539e+01; + q[8][17]=3.7286635325194e+00; q[8][18]=3.5270764890474e+02; + q[8][19]=1.0000000000000e+00; + + q[9][10]=1.7320653206333e+02; q[9][11]=1.0298655619743e+01; + q[9][12]=2.7262244199514e+02; q[9][13]=4.4561065036310e+01; + q[9][14]=1.0856482766156e+01; q[9][15]=2.5107659603898e+01; + q[9][16]=1.9391167162525e+02; q[9][17]=1.0000000000000e+00; + q[9][18]=1.3161329199391e+01; q[9][19]=6.4365086389428e+02; + + q[10][11]=7.8314019154706e+00; q[10][12]=2.8290920517725e+02; + q[10][13]=1.1371735519833e+02; q[10][14]=2.1105885757279e+01; + q[10][15]=3.8741359395934e+01; q[10][16]=6.6524559321657e+01; + q[10][17]=1.7071378554833e+01; q[10][18]=2.3234516108847e+01; + q[10][19]=4.8247261078055e+01; + + q[11][12]=4.8092094826036e+01; q[11][13]=3.3887559483420e+00; + q[11][14]=2.6368577564199e+01; q[11][15]=5.5679895711418e+01; + q[11][16]=7.1750284708933e+01; q[11][17]=1.2631893872825e+01; + q[11][18]=2.6932728996777e+01; q[11][19]=1.0000000000000e+00; + + q[12][13]=4.7798798034572e+01; q[12][14]=9.9165053447429e+00; + q[12][15]=5.8505442466161e+01; q[12][16]=2.7798190504760e+02; + q[12][17]=1.1427000119701e+01; q[12][18]=2.1029990530586e+01; + q[12][19]=2.0397078683768e+02; + + q[13][14]=9.1089574817139e+00; q[13][15]=3.3835737720574e+01; + q[13][16]=1.7815549567056e+01; q[13][17]=4.1272404968214e+00; + q[13][18]=2.4504156395152e+02; q[13][19]=3.3435675442163e+00; + + q[14][15]=8.9421193040709e+01; q[14][16]=6.7485067008375e+01; + q[14][17]=2.2161693733113e+00; q[14][18]=8.5338209390745e+00; + q[14][19]=4.3342126659660e+00; + + q[15][16]=3.1432036618746e+02; q[15][17]=2.0305343047059e+01; + q[15][18]=3.4167877957799e+01; q[15][19]=1.0000000000000e+00; + + q[16][17]=5.2559565123081e+00; q[16][18]=2.0382362288681e+01; + q[16][19]=1.0765527137500e+02; + + q[17][18]=1.3814733274637e+01; q[17][19]=2.8259139240676e+00; + + q[18][19]=1.0000000000000e+00; + + + /* 
amino acid frequencies */ + f[0]=0.072; f[1]=0.019; f[2]=0.039; f[3]=0.019; f[4]=0.006; + f[5]=0.025; f[6]=0.024; f[7]=0.056; f[8]=0.028; f[9]=0.088; + f[10]=0.168; f[11]=0.023; f[12]=0.054; f[13]=0.061; f[14]=0.054; + f[15]=0.072; f[16]=0.086; f[17]=0.029; f[18]=0.033; f[19]=0.043; +} + +void blosum62data(dmatrix q, double *f) +{ + /* + * BLOSUM62 model of amino acid evolution + * + * S. Henikoff and J. G. Henikoff. 1992. PNAS USA 89:10915-10919. + * + */ + + q[0][1]=7.3579038969751e-01; q[0][2]=4.8539105546575e-01; + q[0][3]=5.4316182089867e-01; q[0][4]=1.4599953104700e+00; + q[0][5]=1.1997057046020e+00; q[0][6]=1.1709490427999e+00; + q[0][7]=1.9558835749595e+00; q[0][8]=7.1624144499779e-01; + q[0][9]=6.0589900368677e-01; q[0][10]=8.0001653051838e-01; + q[0][11]=1.2952012667833e+00; q[0][12]=1.2537582666635e+00; + q[0][13]=4.9296467974759e-01; q[0][14]=1.1732759009239e+00; + q[0][15]=4.3250926870566e+00; q[0][16]=1.7291780194850e+00; + q[0][17]=4.6583936772479e-01; q[0][18]=7.1820669758623e-01; + q[0][19]=2.1877745220045e+00; + + q[1][2]=1.2974467051337e+00; q[1][3]=5.0096440855513e-01; + q[1][4]=2.2782657420895e-01; q[1][5]=3.0208336100636e+00; + q[1][6]=1.3605741904203e+00; q[1][7]=4.1876330851753e-01; + q[1][8]=1.4561411663360e+00; q[1][9]=2.3203644514174e-01; + q[1][10]=6.2271166969249e-01; q[1][11]=5.4111151414889e+00; + q[1][12]=9.8369298745695e-01; q[1][13]=3.7164469320875e-01; + q[1][14]=4.4813366171831e-01; q[1][15]=1.1227831042096e+00; + q[1][16]=9.1466595456337e-01; q[1][17]=4.2638231012175e-01; + q[1][18]=7.2051744121611e-01; q[1][19]=4.3838834377202e-01; + + q[2][3]=3.1801000482161e+00; q[2][4]=3.9735894989702e-01; + q[2][5]=1.8392161469920e+00; q[2][6]=1.2404885086396e+00; + q[2][7]=1.3558723444845e+00; q[2][8]=2.4145014342081e+00; + q[2][9]=2.8301732627800e-01; q[2][10]=2.1188815961519e-01; + q[2][11]=1.5931370434574e+00; q[2][12]=6.4844127878707e-01; + q[2][13]=3.5486124922252e-01; q[2][14]=4.9488704370192e-01; + q[2][15]=2.9041016564560e+00; 
q[2][16]=1.8981736345332e+00; + q[2][17]=1.9148204624678e-01; q[2][18]=5.3822251903674e-01; + q[2][19]=3.1285879799342e-01; + + q[3][4]=2.4083661480204e-01; q[3][5]=1.1909457033960e+00; + q[3][6]=3.7616252083685e+00; q[3][7]=7.9847324896839e-01; + q[3][8]=7.7814266402188e-01; q[3][9]=4.1855573246161e-01; + q[3][10]=2.1813157759360e-01; q[3][11]=1.0324479249521e+00; + q[3][12]=2.2262189795786e-01; q[3][13]=2.8173069420651e-01; + q[3][14]=7.3062827299842e-01; q[3][15]=1.5827541420653e+00; + q[3][16]=9.3418750943056e-01; q[3][17]=1.4534504627853e-01; + q[3][18]=2.6142220896504e-01; q[3][19]=2.5812928941763e-01; + + q[4][5]=3.2980150463028e-01; q[4][6]=1.4074889181440e-01; + q[4][7]=4.1820319228376e-01; q[4][8]=3.5405810983129e-01; + q[4][9]=7.7489402279418e-01; q[4][10]=8.3184264014158e-01; + q[4][11]=2.8507880090648e-01; q[4][12]=7.6768882347954e-01; + q[4][13]=4.4133747118660e-01; q[4][14]=3.5600849876863e-01; + q[4][15]=1.1971884150942e+00; q[4][16]=1.1198313585160e+00; + q[4][17]=5.2766441887169e-01; q[4][18]=4.7023773369610e-01; + q[4][19]=1.1163524786062e+00; + + q[5][6]=5.5289191779282e+00; q[5][7]=6.0984630538281e-01; + q[5][8]=2.4353411311401e+00; q[5][9]=2.3620245120365e-01; + q[5][10]=5.8073709318144e-01; q[5][11]=3.9452776745146e+00; + q[5][12]=2.4948960771127e+00; q[5][13]=1.4435695975031e-01; + q[5][14]=8.5857057567418e-01; q[5][15]=1.9348709245965e+00; + q[5][16]=1.2774802945956e+00; q[5][17]=7.5865380864172e-01; + q[5][18]=9.5898974285014e-01; q[5][19]=5.3078579012486e-01; + + q[6][7]=4.2357999217628e-01; q[6][8]=1.6268910569817e+00; + q[6][9]=1.8684804693170e-01; q[6][10]=3.7262517508685e-01; + q[6][11]=2.8024271516787e+00; q[6][12]=5.5541539747043e-01; + q[6][13]=2.9140908416530e-01; q[6][14]=9.2656393484598e-01; + q[6][15]=1.7698932389373e+00; q[6][16]=1.0710972360073e+00; + q[6][17]=4.0763564893830e-01; q[6][18]=5.9671930034577e-01; + q[6][19]=5.2425384633796e-01; + + q[7][8]=5.3985912495418e-01; q[7][9]=1.8929629237636e-01; + 
q[7][10]=2.1772115923623e-01; q[7][11]=7.5204244030271e-01; + q[7][12]=4.5943617357855e-01; q[7][13]=3.6816646445253e-01; + q[7][14]=5.0408659952683e-01; q[7][15]=1.5093262532236e+00; + q[7][16]=6.4143601140497e-01; q[7][17]=5.0835892463812e-01; + q[7][18]=3.0805573703500e-01; q[7][19]=2.5334079019018e-01; + + q[8][9]=2.5271844788492e-01; q[8][10]=3.4807220979697e-01; + q[8][11]=1.0225070358890e+00; q[8][12]=9.8431152535870e-01; + q[8][13]=7.1453370392764e-01; q[8][14]=5.2700733915060e-01; + q[8][15]=1.1170297629105e+00; q[8][16]=5.8540709022472e-01; + q[8][17]=3.0124860078016e-01; q[8][18]=4.2189539693890e+00; + q[8][19]=2.0155597175031e-01; + + q[9][10]=3.8909637733035e+00; q[9][11]=4.0619358664202e-01; + q[9][12]=3.3647977631042e+00; q[9][13]=1.5173593259539e+00; + q[9][14]=3.8835540920564e-01; q[9][15]=3.5754441245967e-01; + q[9][16]=1.1790911972601e+00; q[9][17]=3.4198578754023e-01; + q[9][18]=6.7461709322842e-01; q[9][19]=8.3118394054582e+00; + + q[10][11]=4.4557027426059e-01; q[10][12]=6.0305593795716e+00; + q[10][13]=2.0648397032375e+00; q[10][14]=3.7455568747097e-01; + q[10][15]=3.5296918452729e-01; q[10][16]=9.1525985769421e-01; + q[10][17]=6.9147463459998e-01; q[10][18]=8.1124585632307e-01; + q[10][19]=2.2314056889131e+00; + + q[11][12]=1.0730611843319e+00; q[11][13]=2.6692475051102e-01; + q[11][14]=1.0473834507215e+00; q[11][15]=1.7521659178195e+00; + q[11][16]=1.3038752007987e+00; q[11][17]=3.3224304063396e-01; + q[11][18]=7.1799348690032e-01; q[11][19]=4.9813847530407e-01; + + q[12][13]=1.7738551688305e+00; q[12][14]=4.5412362510273e-01; + q[12][15]=9.1872341574605e-01; q[12][16]=1.4885480537218e+00; + q[12][17]=8.8810109815193e-01; q[12][18]=9.5168216224591e-01; + q[12][19]=2.5758507553153e+00; + + q[13][14]=2.3359790962888e-01; q[13][15]=5.4002764482413e-01; + q[13][16]=4.8820611879305e-01; q[13][17]=2.0743248934965e+00; + q[13][18]=6.7472604308008e+00; q[13][19]=8.3811961017754e-01; + + q[14][15]=1.1691295777157e+00; q[14][16]=1.0054516831488e+00; 
+ q[14][17]=2.5221483002727e-01; q[14][18]=3.6940531935451e-01; + q[14][19]=4.9690841067567e-01; + + q[15][16]=5.1515562922704e+00; q[15][17]=3.8792562209837e-01; + q[15][18]=7.9675152076106e-01; q[15][19]=5.6192545744165e-01; + + q[16][17]=5.1312812689059e-01; q[16][18]=8.0101024319939e-01; + q[16][19]=2.2530740511763e+00; + + q[17][18]=4.0544190065580e+00; q[17][19]=2.6650873142646e-01; + + q[18][19]=1.0000000000000e+00; + + + f[0]=0.074; f[1]=0.052; f[2]=0.045; f[3]=0.054; + f[4]=0.025; f[5]=0.034; f[6]=0.054; f[7]=0.074; + f[8]=0.026; f[9]=0.068; f[10]=0.099; f[11]=0.058; + f[12]=0.025; f[13]=0.047; f[14]=0.039; f[15]=0.057; + f[16]=0.051; f[17]=0.013; f[18]=0.032; f[19]=0.073; +} + + + +void vtmvdata(dmatrix q, double *f) +{ + /* + * variable time (VT) model for amino acid evolution + * Mueller, T. and Vingron, M. (1999) + * "Modeling Amino Acid Replacement" + * Journal of Comp. Biology + */ + +/* amino acid frequencies */ + +f[0]=0.078837 ; +f[1]=0.051238 ; +f[2]=0.042313 ; +f[3]=0.053066 ; +f[4]=0.015175 ; +f[5]=0.036713 ; +f[6]=0.061924 ; +f[7]=0.070852 ; +f[8]=0.023082 ; +f[9]=0.062056 ; +f[10]=0.096371 ; +f[11]=0.057324 ; +f[12]=0.023771 ; +f[13]=0.043296 ; +f[14]=0.043911 ; +f[15]=0.063403 ; +f[16]=0.055897 ; +f[17]=0.013272 ; +f[18]=0.034399 ; +f[19]=0.073101 ; + + +q[0][1] = 0.233108 ; +q[0][2] = 0.199097 ; +q[0][3] = 0.265145 ; +q[0][4] = 0.227333 ; +q[0][5] = 0.310084 ; +q[0][6] = 0.567957 ; +q[0][7] = 0.876213 ; +q[0][8] = 0.078692 ; +q[0][9] = 0.222972 ; +q[0][10] = 0.424630 ; +q[0][11] = 0.393245 ; +q[0][12] = 0.211550 ; +q[0][13] = 0.116646 ; +q[0][14] = 0.399143 ; +q[0][15] = 1.817198 ; +q[0][16] = 0.877877 ; +q[0][17] = 0.030309 ; +q[0][18] = 0.087061 ; +q[0][19] = 1.230985 ; + +q[1][2] = 0.210797 ; +q[1][3] = 0.105191 ; +q[1][4] = 0.031726 ; +q[1][5] = 0.493763 ; +q[1][6] = 0.255240 ; +q[1][7] = 0.156945 ; +q[1][8] = 0.213164 ; +q[1][9] = 0.081510 ; +q[1][10] = 0.192364 ; +q[1][11] = 1.755838 ; +q[1][12] = 0.087930 ; +q[1][13] = 0.042569 ; 
+q[1][14] = 0.128480 ; +q[1][15] = 0.292327 ; +q[1][16] = 0.204109 ; +q[1][17] = 0.046417 ; +q[1][18] = 0.097010 ; +q[1][19] = 0.113146 ; + +q[2][3] = 0.883422 ; +q[2][4] = 0.027495 ; +q[2][5] = 0.275700 ; +q[2][6] = 0.270417 ; +q[2][7] = 0.362028 ; +q[2][8] = 0.290006 ; +q[2][9] = 0.087225 ; +q[2][10] = 0.069245 ; +q[2][11] = 0.503060 ; +q[2][12] = 0.057420 ; +q[2][13] = 0.039769 ; +q[2][14] = 0.083956 ; +q[2][15] = 0.847049 ; +q[2][16] = 0.471268 ; +q[2][17] = 0.010459 ; +q[2][18] = 0.093268 ; +q[2][19] = 0.049824 ; + +q[3][4] = 0.010313 ; +q[3][5] = 0.205842 ; +q[3][6] = 1.599461 ; +q[3][7] = 0.311718 ; +q[3][8] = 0.134252 ; +q[3][9] = 0.011720 ; +q[3][10] = 0.060863 ; +q[3][11] = 0.261101 ; +q[3][12] = 0.012182 ; +q[3][13] = 0.016577 ; +q[3][14] = 0.160063 ; +q[3][15] = 0.461519 ; +q[3][16] = 0.178197 ; +q[3][17] = 0.011393 ; +q[3][18] = 0.051664 ; +q[3][19] = 0.048769 ; + +q[4][5] = 0.004315 ; +q[4][6] = 0.005321 ; +q[4][7] = 0.050876 ; +q[4][8] = 0.016695 ; +q[4][9] = 0.046398 ; +q[4][10] = 0.091709 ; +q[4][11] = 0.004067 ; +q[4][12] = 0.023690 ; +q[4][13] = 0.051127 ; +q[4][14] = 0.011137 ; +q[4][15] = 0.175270 ; +q[4][16] = 0.079511 ; +q[4][17] = 0.007732 ; +q[4][18] = 0.042823 ; +q[4][19] = 0.163831 ; + +q[5][6] = 0.960976 ; +q[5][7] = 0.128660 ; +q[5][8] = 0.315521 ; +q[5][9] = 0.054602 ; +q[5][10] = 0.243530 ; +q[5][11] = 0.738208 ; +q[5][12] = 0.120801 ; +q[5][13] = 0.026235 ; +q[5][14] = 0.156570 ; +q[5][15] = 0.358017 ; +q[5][16] = 0.248992 ; +q[5][17] = 0.021248 ; +q[5][18] = 0.062544 ; +q[5][19] = 0.112027 ; + +q[6][7] = 0.250447 ; +q[6][8] = 0.104458 ; +q[6][9] = 0.046589 ; +q[6][10] = 0.151924 ; +q[6][11] = 0.888630 ; +q[6][12] = 0.058643 ; +q[6][13] = 0.028168 ; +q[6][14] = 0.205134 ; +q[6][15] = 0.406035 ; +q[6][16] = 0.321028 ; +q[6][17] = 0.018844 ; +q[6][18] = 0.055200 ; +q[6][19] = 0.205868 ; + +q[7][8] = 0.058131 ; +q[7][9] = 0.051089 ; +q[7][10] = 0.087056 ; +q[7][11] = 0.193243 ; +q[7][12] = 0.046560 ; +q[7][13] = 0.050143 ; +q[7][14] = 
0.124492 ; +q[7][15] = 0.612843 ; +q[7][16] = 0.136266 ; +q[7][17] = 0.023990 ; +q[7][18] = 0.037568 ; +q[7][19] = 0.082579 ; + +q[8][9] = 0.020039 ; +q[8][10] = 0.103552 ; +q[8][11] = 0.153323 ; +q[8][12] = 0.021157 ; +q[8][13] = 0.079807 ; +q[8][14] = 0.078892 ; +q[8][15] = 0.167406 ; +q[8][16] = 0.101117 ; +q[8][17] = 0.020009 ; +q[8][18] = 0.286027 ; +q[8][19] = 0.068575 ; + +q[9][10] = 2.089890 ; +q[9][11] = 0.093181 ; +q[9][12] = 0.493845 ; +q[9][13] = 0.321020 ; +q[9][14] = 0.054797 ; +q[9][15] = 0.081567 ; +q[9][16] = 0.376588 ; +q[9][17] = 0.034954 ; +q[9][18] = 0.086237 ; +q[9][19] = 3.654430 ; + +q[10][11] = 0.201204 ; +q[10][12] = 1.105667 ; +q[10][13] = 0.946499 ; +q[10][14] = 0.169784 ; +q[10][15] = 0.214977 ; +q[10][16] = 0.243227 ; +q[10][17] = 0.083439 ; +q[10][18] = 0.189842 ; +q[10][19] = 1.337571 ; + +q[11][12] = 0.096474 ; +q[11][13] = 0.038261 ; +q[11][14] = 0.212302 ; +q[11][15] = 0.400072 ; +q[11][16] = 0.446646 ; +q[11][17] = 0.023321 ; +q[11][18] = 0.068689 ; +q[11][19] = 0.144587 ; + +q[12][13] = 0.173052 ; +q[12][14] = 0.010363 ; +q[12][15] = 0.090515 ; +q[12][16] = 0.184609 ; +q[12][17] = 0.022019 ; +q[12][18] = 0.073223 ; +q[12][19] = 0.307309 ; + +q[13][14] = 0.042564 ; +q[13][15] = 0.138119 ; +q[13][16] = 0.085870 ; +q[13][17] = 0.128050 ; +q[13][18] = 0.898663 ; +q[13][19] = 0.247329 ; + +q[14][15] = 0.430431 ; +q[14][16] = 0.207143 ; +q[14][17] = 0.014584 ; +q[14][18] = 0.032043 ; +q[14][19] = 0.129315 ; + +q[15][16] = 1.767766 ; +q[15][17] = 0.035933 ; +q[15][18] = 0.121979 ; +q[15][19] = 0.127700 ; + +q[16][17] = 0.020437 ; +q[16][18] = 0.094617 ; +q[16][19] = 0.740372 ; + +q[17][18] = 0.124746 ; +q[17][19] = 0.022134 ; + +q[18][19] = 0.125733 ; + +} + + +/* + * WAG matrix: Simon Whelan and Nick Goldman + * + */ + +void wagdata(dmatrix q, double *f) +{ + /* + * WAG model of amino acid evolution + * + * S. Whelan and N. Goldman. 2000. In prep. + * + * presented at the MASAMB-X workshop in Cambridge + * + * Whelan, S., and N. 
Goldman. 2000. + * The WAG amino acid rate matrix. + * Manuscript in prep. + */ + + /* Q matrix */ + q[0][1] = 0.610810; q[0][2] = 0.569079; + q[0][3] = 0.821500; q[0][4] = 1.141050; + q[0][5] = 1.011980; q[0][6] = 1.756410; + q[0][7] = 1.572160; q[0][8] = 0.354813; + q[0][9] = 0.219023; q[0][10] = 0.443935; + q[0][11] = 1.005440; q[0][12] = 0.989475; + q[0][13] = 0.233492; q[0][14] = 1.594890; + q[0][15] = 3.733380; q[0][16] = 2.349220; + q[0][17] = 0.125227; q[0][18] = 0.268987; + q[0][19] = 2.221870; + + q[1][2] = 0.711690; q[1][3] = 0.165074; + q[1][4] = 0.585809; q[1][5] = 3.360330; + q[1][6] = 0.488649; q[1][7] = 0.650469; + q[1][8] = 2.362040; q[1][9] = 0.206722; + q[1][10] = 0.551450; q[1][11] = 5.925170; + q[1][12] = 0.758446; q[1][13] = 0.116821; + q[1][14] = 0.753467; q[1][15] = 1.357640; + q[1][16] = 0.613776; q[1][17] = 1.294610; + q[1][18] = 0.423612; q[1][19] = 0.280336; + + q[2][3] = 6.013660; q[2][4] = 0.296524; + q[2][5] = 1.716740; q[2][6] = 1.056790; + q[2][7] = 1.253910; q[2][8] = 4.378930; + q[2][9] = 0.615636; q[2][10] = 0.147156; + q[2][11] = 3.334390; q[2][12] = 0.224747; + q[2][13] = 0.110793; q[2][14] = 0.217538; + q[2][15] = 4.394450; q[2][16] = 2.257930; + q[2][17] = 0.078463; q[2][18] = 1.208560; + q[2][19] = 0.221176; + + q[3][4] = 0.033379; q[3][5] = 0.691268; + q[3][6] = 6.833400; q[3][7] = 0.961142; + q[3][8] = 1.032910; q[3][9] = 0.043523; + q[3][10] = 0.093930; q[3][11] = 0.533362; + q[3][12] = 0.116813; q[3][13] = 0.052004; + q[3][14] = 0.472601; q[3][15] = 1.192810; + q[3][16] = 0.417372; q[3][17] = 0.146348; + q[3][18] = 0.363243; q[3][19] = 0.169417; + + q[4][5] = 0.109261; q[4][6] = 0.023920; + q[4][7] = 0.341086; q[4][8] = 0.275403; + q[4][9] = 0.189890; q[4][10] = 0.428414; + q[4][11] = 0.083649; q[4][12] = 0.437393; + q[4][13] = 0.441300; q[4][14] = 0.122303; + q[4][15] = 1.560590; q[4][16] = 0.570186; + q[4][17] = 0.795736; q[4][18] = 0.604634; + q[4][19] = 1.114570; + + q[5][6] = 6.048790; q[5][7] = 0.366510; + q[5][8] 
= 4.749460; q[5][9] = 0.131046; + q[5][10] = 0.964886; q[5][11] = 4.308310; + q[5][12] = 1.705070; q[5][13] = 0.110744; + q[5][14] = 1.036370; q[5][15] = 1.141210; + q[5][16] = 0.954144; q[5][17] = 0.243615; + q[5][18] = 0.252457; q[5][19] = 0.333890; + + q[6][7] = 0.630832; q[6][8] = 0.635025; + q[6][9] = 0.141320; q[6][10] = 0.172579; + q[6][11] = 2.867580; q[6][12] = 0.353912; + q[6][13] = 0.092310; q[6][14] = 0.755791; + q[6][15] = 0.782467; q[6][16] = 0.914814; + q[6][17] = 0.172682; q[6][18] = 0.217549; + q[6][19] = 0.655045; + + q[7][8] = 0.276379; q[7][9] = 0.034151; + q[7][10] = 0.068651; q[7][11] = 0.415992; + q[7][12] = 0.194220; q[7][13] = 0.055288; + q[7][14] = 0.273149; q[7][15] = 1.486700; + q[7][16] = 0.251477; q[7][17] = 0.374321; + q[7][18] = 0.114187; q[7][19] = 0.209108; + + q[8][9] = 0.152215; q[8][10] = 0.555096; + q[8][11] = 0.992083; q[8][12] = 0.450867; + q[8][13] = 0.756080; q[8][14] = 0.771387; + q[8][15] = 0.822459; q[8][16] = 0.525511; + q[8][17] = 0.289998; q[8][18] = 4.290350; + q[8][19] = 0.131869; + + q[9][10] = 3.517820; q[9][11] = 0.360574; + q[9][12] = 4.714220; q[9][13] = 1.177640; + q[9][14] = 0.111502; q[9][15] = 0.353443; + q[9][16] = 1.615050; q[9][17] = 0.234326; + q[9][18] = 0.468951; q[9][19] = 8.659740; + + q[10][11] = 0.287583; q[10][12] = 5.375250; + q[10][13] = 2.348200; q[10][14] = 0.462018; + q[10][15] = 0.382421; q[10][16] = 0.364222; + q[10][17] = 0.740259; q[10][18] = 0.443205; + q[10][19] = 1.997370; + + q[11][12] = 1.032220; q[11][13] = 0.098843; + q[11][14] = 0.619503; q[11][15] = 1.073780; + q[11][16] = 1.537920; q[11][17] = 0.152232; + q[11][18] = 0.147411; q[11][19] = 0.342012; + + q[12][13] = 1.320870; q[12][14] = 0.194864; + q[12][15] = 0.556353; q[12][16] = 1.681970; + q[12][17] = 0.570369; q[12][18] = 0.473810; + q[12][19] = 2.282020; + + q[13][14] = 0.179896; q[13][15] = 0.606814; + q[13][16] = 0.191467; q[13][17] = 1.699780; + q[13][18] = 7.154480; q[13][19] = 0.725096; + + q[14][15] = 1.786490; 
q[14][16] = 0.885349; + q[14][17] = 0.156619; q[14][18] = 0.239607; + q[14][19] = 0.351250; + + q[15][16] = 4.847130; q[15][17] = 0.578784; + q[15][18] = 0.872519; q[15][19] = 0.258861; + + q[16][17] = 0.126678; q[16][18] = 0.325490; + q[16][19] = 1.547670; + + q[17][18] = 2.763540; q[17][19] = 0.409817; + + q[18][19] = 0.347826; + + /* original frequencies */ + f[ 0] = 0.0866; + f[ 1] = 0.0440; + f[ 2] = 0.0391; + f[ 3] = 0.0570; + f[ 4] = 0.0193; + f[ 5] = 0.0367; + f[ 6] = 0.0581; + f[ 7] = 0.0833; + f[ 8] = 0.0244; + f[ 9] = 0.0485; + f[10] = 0.0862; + f[11] = 0.0620; + f[12] = 0.0195; + f[13] = 0.0384; + f[14] = 0.0458; + f[15] = 0.0695; + f[16] = 0.0610; + f[17] = 0.0144; + f[18] = 0.0353; + f[19] = 0.0709; +} + +void cprev45data(dmatrix q, double *f) +{ + /* cpREV45 model of amino acid evolution + * Adachi, J., P.J. Waddell, W. Martin, and M. Hasegawa. 2000. + * J. Mol. Evol. 50:348-358 + * (reconstructed from 45 chloroplast genomes) + */ + q[0][1] = 105; q[0][2] = 227; + q[0][3] = 175; q[0][4] = 669; + q[0][5] = 157; q[0][6] = 499; + q[0][7] = 665; q[0][8] = 66; + q[0][9] = 145; q[0][10] = 197; + q[0][11] = 236; q[0][12] = 185; + q[0][13] = 68; q[0][14] = 490; + q[0][15] = 2440; q[0][16] = 1340; + q[0][17] = 14; q[0][18] = 56; + q[0][19] = 968; + + q[1][2] = 357; q[1][3] = 43; + q[1][4] = 823; q[1][5] = 1745; + q[1][6] = 152; q[1][7] = 243; + q[1][8] = 715; q[1][9] = 136; + q[1][10] = 203; q[1][11] = 4482; + q[1][12] = 125; q[1][13] = 53; + q[1][14] = 87; q[1][15] = 385; + q[1][16] = 314; q[1][17] = 230; + q[1][18] = 323; q[1][19] = 92; + + q[2][3] = 4435; q[2][4] = 538; + q[2][5] = 768; q[2][6] = 1055; + q[2][7] = 653; q[2][8] = 1405; + q[2][9] = 168; q[2][10] = 113; + q[2][11] = 2430; q[2][12] = 61; + q[2][13] = 97; q[2][14] = 173; + q[2][15] = 2085; q[2][16] = 1393; + q[2][17] = 40; q[2][18] = 754; + q[2][19] = 83; + + q[3][4] = 10; q[3][5] = 400; + q[3][6] = 3691; q[3][7] = 431; + q[3][8] = 331; q[3][9] = 10; + q[3][10] = 10; q[3][11] = 412; + q[3][12] 
= 47; q[3][13] = 22; + q[3][14] = 170; q[3][15] = 590; + q[3][16] = 266; q[3][17] = 18; + q[3][18] = 281; q[3][19] = 75; + + q[4][5] = 10; q[4][6] = 10; + q[4][7] = 303; q[4][8] = 441; + q[4][9] = 280; q[4][10] = 396; + q[4][11] = 48; q[4][12] = 159; + q[4][13] = 726; q[4][14] = 285; + q[4][15] = 2331; q[4][16] = 576; + q[4][17] = 435; q[4][18] = 1466; + q[4][19] = 592; + + q[5][6] = 3122; q[5][7] = 133; + q[5][8] = 1269; q[5][9] = 92; + q[5][10] = 286; q[5][11] = 3313; + q[5][12] = 202; q[5][13] = 10; + q[5][14] = 323; q[5][15] = 396; + q[5][16] = 241; q[5][17] = 53; + q[5][18] = 391; q[5][19] = 54; + + q[6][7] = 379; q[6][8] = 162; + q[6][9] = 148; q[6][10] = 82; + q[6][11] = 2629; q[6][12] = 113; + q[6][13] = 145; q[6][14] = 185; + q[6][15] = 568; q[6][16] = 369; + q[6][17] = 63; q[6][18] = 142; + q[6][19] = 200; + + q[7][8] = 19; q[7][9] = 40; + q[7][10] = 20; q[7][11] = 263; + q[7][12] = 21; q[7][13] = 25; + q[7][14] = 28; q[7][15] = 691; + q[7][16] = 92; q[7][17] = 82; + q[7][18] = 10; q[7][19] = 91; + + q[8][9] = 29; q[8][10] = 66; + q[8][11] = 305; q[8][12] = 10; + q[8][13] = 127; q[8][14] = 152; + q[8][15] = 303; q[8][16] = 32; + q[8][17] = 69; q[8][18] = 1971; + q[8][19] = 25; + + q[9][10] = 1745; q[9][11] = 345; + q[9][12] = 1772; q[9][13] = 454; + q[9][14] = 117; q[9][15] = 216; + q[9][16] = 1040; q[9][17] = 42; + q[9][18] = 89; q[9][19] = 4797; + + q[10][11] = 218; q[10][12] = 1351; + q[10][13] = 1268; q[10][14] = 219; + q[10][15] = 516; q[10][16] = 156; + q[10][17] = 159; q[10][18] = 189; + q[10][19] = 865; + + q[11][12] = 193; q[11][13] = 72; + q[11][14] = 302; q[11][15] = 868; + q[11][16] = 918; q[11][17] = 10; + q[11][18] = 247; q[11][19] = 249; + + q[12][13] = 327; q[12][14] = 100; + q[12][15] = 93; q[12][16] = 645; + q[12][17] = 86; q[12][18] = 215; + q[12][19] = 475; + + q[13][14] = 43; q[13][15] = 487; + q[13][16] = 148; q[13][17] = 468; + q[13][18] = 2370; q[13][19] = 317; + + q[14][15] = 1202; q[14][16] = 260; + q[14][17] = 49; q[14][18] = 
97; + q[14][19] = 122; + + q[15][16] = 2151; q[15][17] = 73; + q[15][18] = 522; q[15][19] = 167; + + q[16][17] = 29; q[16][18] = 71; + q[16][19] = 760; + + q[17][18] = 346; q[17][19] = 10; + + q[18][19] = 119; + + f[0] = 0.076; + f[1] = 0.062; + f[2] = 0.041; + f[3] = 0.037; + f[4] = 0.009; + f[5] = 0.038; + f[6] = 0.049; + f[7] = 0.084; + f[8] = 0.025; + f[9] = 0.081; + f[10] = 0.101; + f[11] = 0.050; + f[12] = 0.022; + f[13] = 0.051; + f[14] = 0.043; + f[15] = 0.062; + f[16] = 0.054; + f[17] = 0.018; + f[18] = 0.031; + f[19] = 0.066; +} + diff --git a/forester/archive/RIO/others/puzzle_dqo/src/outdist b/forester/archive/RIO/others/puzzle_dqo/src/outdist new file mode 100644 index 0000000..f5728a1 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/outdist @@ -0,0 +1,4 @@ +RECA_NEIMU 0.01095 +O86384/1-2 8.99866 +RECA_NEIPH 0.02202 + diff --git a/forester/archive/RIO/others/puzzle_dqo/src/ppuzzle.h b/forester/archive/RIO/others/puzzle_dqo/src/ppuzzle.h new file mode 100644 index 0000000..2007467 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/ppuzzle.h @@ -0,0 +1,274 @@ +/* + * ppuzzle.h + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. 
+ */ + + +#ifndef _PPUZZLE_ +#define _PPUZZLE_ + +#include "puzzle.h" +#include "util.h" +#include "ml.h" +#include "sched.h" + +extern int PP_IamSlave; +extern int PP_IamMaster; + +#ifdef PARALLEL +# ifdef SEQUENTIAL +# undef SEQUENTIAL +# endif +# define SEQUENTIAL 0 +# undef PARALLEL +# define PARALLEL 1 +# include "mpi.h" +#else +# ifdef SEQUENTIAL +# undef SEQUENTIAL +# endif +# define SEQUENTIAL 1 +# define PARALLEL 0 +# undef PVERBOSE +# undef PVERBOSE1 +# undef PVERBOSE2 +# undef PVERBOSE3 +#endif + +/* PVERBOSE3 includes PVERBOSE2 includes PVERBOSE1 */ +/* PVERBOSE1 is default (PVERBOSE) */ + +#ifdef PVERBOSE +# undef PVERBOSE1 +# define PVERBOSE1 +#endif +#ifdef PVERBOSE3 +# undef PVERBOSE2 +# define PVERBOSE2 +#endif +#ifdef PVERBOSE2 +# undef PVERBOSE1 +# define PVERBOSE1 +#endif + +#if PARALLEL +# define PP_DONE 0 /* Finished M->S */ +# define PP_SIZES 1 /* Array sizes needed M->S */ +# define PP_DATA 2 /* Data Arrays M->S */ + +# define PP_ALLQUARTS 3 /* All Quartets M->S */ + +# define PP_DOQUART 4 /* do 4Specs M->S */ +# define PP_DOQUARTX2 5 /* do 4Specs + X^2 M->S */ +# define PP_QUART 6 /* quartet back S->M */ +# define PP_QUARTX2 7 /* quartet + X^2 back S->M */ + +# define PP_DOQUARTBLOCKSPECS 8 /* do block Specs M->S */ +# define PP_DOQUARTBLOCK 9 /* do block of Quarts M->S */ +# define PP_QUARTBLOCKSPECS 10 /* block Specs S->M */ +# define PP_QUARTBLOCK 11 /* block of Quarts S->M */ + +# define PP_DOPUZZLE 12 /* do Puzzling step M->S */ +# define PP_PUZZLE 13 /* Puzzling tree back S->M */ +# define PP_DOPUZZLEBLOCK 14 /* do Puzzling block M->S */ +# define PP_DOPUZZLEBLOCKSPECS 15 /* do Puzzling block M->S */ +# define PP_PUZZLEBLOCK 16 /* Puzzling block S->M */ +# define PP_PUZZLEBLOCKSPECS 17 /* Puzzling block S->M */ + +# define PP_STATS 18 /* Slave Statistics S->M */ + +# define PP_WAIT 18 /* waiting for work S->M */ +# define PP_TEST 100 /* testing */ + +# define PERMUTQUEUESIZE 100 +# define QUARTQUEUESIZE 100 + + extern int 
PP_IamMaster; + extern int PP_IamSlave; + extern int PP_Myid; + extern int PP_MyMaster; + extern int PP_NumProcs; + extern MPI_Comm PP_Comm; +#endif /* PARALLEL */ + +extern int *permutsent, + *permutrecved, + *quartsent, + *quartrecved, + *doquartsent, + *doquartrecved, + *splitsent, + *splitrecved, + *permutsentn, + *permutrecvedn, + *quartsentn, + *quartrecvedn, + *doquartsentn, + *doquartrecvedn, + *splitsentn, + *splitrecvedn; +extern double *walltimes, + *cputimes; +extern double *fullwalltimes, + *fullcputimes; +extern double *altwalltimes, + *altcputimes; + +extern int PP_permutsent, + PP_permutrecved, + PP_quartsent, + PP_quartrecved, + PP_doquartsent, + PP_doquartrecved, + PP_splitsent, + PP_splitrecved, + PP_permutsentn, + PP_permutrecvedn, + PP_quartsentn, + PP_quartrecvedn, + PP_doquartsentn, + PP_doquartrecvedn, + PP_splitsentn, + PP_splitrecvedn; + +extern double PP_starttime, + PP_stoptime, + PP_inittime, + PP_paramcomptime, + PP_paramsendtime, + PP_quartcomptime, + PP_quartsendtime, + PP_puzzletime, + PP_treetime; + +void num2quart(uli qnum, int *a, int *b, int *c, int *d); +uli numquarts(int maxspc); +uli quart2num (int a, int b, int c, int d); + +int slave_main(int argc, char *argv[]); +void PP_Init(int *argc, char **argv[]); +void PP_Finalize(); +void PP_Printerror(FILE *of, int id, int err); +void PP_do_puzzling(ivector trueID); + +void PP_RecvDoQuart(int *a, + int *b, + int *c, + int *d, + int *approx); +void PP_SendDoQuart(int dest, + int a, + int b, + int c, + int d, + int approx); +void PP_RecvQuart(int *a, + int *b, + int *c, + int *d, + double *d1, + double *d2, + double *d3, + int *approx); +void PP_SendQuart(int a, + int b, + int c, + int d, + double d1, + double d2, + double d3, + int approx); +void PP_SendSizes(int mspc, + int msite, + int ncats, + int nptrn, + int rad, + int outgr, + double frconst, + int rseed); +void PP_RecvSizes(int *mspc, + int *msite, + int *ncats, + int *nptrn, + int *rad, + int *outgr, + double *frconst, + int 
*rseed); +void PP_RecvData( + cmatrix Seqpat, /* cmatrix (Maxspc x Numptrn) */ + ivector Alias, /* ivector (Maxsite) */ + ivector Weight, /* ivector (Numptrn) */ + ivector constpat, + dvector Rates, /* dvector (numcats) */ + dvector Eval, /* dvector (tpmradix) */ + dvector Freqtpm, + dmatrix Evec, /* dmatrix (tpmradix x tpmradix) */ + dmatrix Ievc, + dmatrix iexp, + dmatrix Distanmat, /* dmatrix (Maxspc x Maxspc) */ + dcube ltprobr); /* dcube (numcats x tpmradix x tpmradix) */ +void PP_SendData( + cmatrix Seqpat, /* cmatrix (Maxspc x Numptrn) */ + ivector Alias, /* ivector (Maxsite) */ + ivector Weight, /* ivector (Numptrn) */ + ivector constpat, + dvector Rates, /* dvector (numcats) */ + dvector Eval, /* dvector (tpmradix) */ + dvector Freqtpm, + dmatrix Evec, /* dmatrix (tpmradix x tpmradix) */ + dmatrix Ievc, + dmatrix iexp, + dmatrix Distanmat, /* dmatrix (Maxspc x Maxspc) */ + dcube ltprobr); /* dcube (numcats x tpmradix x tpmradix) */ +void PP_SendAllQuarts(unsigned long Numquartets, + unsigned char *quartetinfo); +void PP_RecvAllQuarts(int taxa, + unsigned long *Numquartets, + unsigned char *quartetinfo); + +void PP_SendDoQuartBlock(int dest, uli firstq, uli amount, int approx); +void PP_RecvDoQuartBlock(uli *firstq, uli *amount, uli **bq, int *approx); +void PP_SendQuartBlock(uli startq, + uli numofq, + unsigned char *quartetinfo, + uli numofbq, + uli *bq, + int approx); +void PP_RecvQuartBlock(int slave, + uli *startq, + uli *numofq, + unsigned char *quartetinfo, + int *approx); + +void PP_SendPermut(int dest, + int taxa, + ivector permut); +void PP_RecvPermut(int taxa, + ivector permut); +void PP_SendDoPermutBlock(uli puzzlings); +void PP_RecvDoPermutBlock(uli *taxa); + +void PP_SendSplits(int taxa, + cmatrix biparts); +void PP_RecvSplits(int taxa, + cmatrix biparts); +void PP_SendDone(); +void PP_RecvDone(); + +int PP_emptyslave(); +void PP_putslave(int sl); +int PP_getslave(); + +void PP_cmpd(int rank, double a, double b); +void PP_cmpi(int rank, int a, 
int b); + +#endif /* _PPUZZLE_ */ diff --git a/forester/archive/RIO/others/puzzle_dqo/src/puzzle.h b/forester/archive/RIO/others/puzzle_dqo/src/puzzle.h new file mode 100644 index 0000000..8165b1a --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/puzzle.h @@ -0,0 +1,493 @@ +/* + * puzzle.h + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +#ifndef _PUZZLE_ +#define _PUZZLE_ + +#ifndef PACKAGE +# define PACKAGE "tree-puzzle" +#endif +#ifndef VERSION +# define VERSION "5.0" +#endif +#define DATE "October 2000" + +/* prototypes */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" +#include "ml.h" +#ifdef PARALLEL +# include "ppuzzle.h" +#endif + +#define STDOUT stdout +#ifndef PARALLEL /* because printf() runs significantly faster */ + /* than fprintf(stdout) on an Apple McIntosh */ + /* (HS) */ +# define FPRINTF printf +# define STDOUTFILE +#else +# define FPRINTF fprintf +# define STDOUTFILE STDOUT, +#endif + +/* filenames */ +# define FILENAMELENTH 2048 + + +# define INFILEDEFAULT "infile" +# define OUTFILEDEFAULT "outfile" +# define TREEFILEDEFAULT "outtree" +# define INTREEDEFAULT "intree" +# define DISTANCESDEFAULT "outdist" +# define TRIANGLEDEFAULT "outlm.eps" +# define UNRESOLVEDDEFAULT "outqlist" +# define ALLQUARTDEFAULT "outallquart" +# define ALLQUARTLHDEFAULT "outallquartlh" +# define OUTPTLISTDEFAULT "outpstep" +# define OUTPTORDERDEFAULT "outptorder" + +# define INFILE infilename +# define OUTFILE outfilename +# define TREEFILE outtreename +# define INTREE intreename +# define DISTANCES outdistname +# define TRIANGLE outlmname +# define UNRESOLVED outqlistname +# define 
ALLQUART outallquartname +# define ALLQUARTLH outallquartlhname +# define OUTPTLIST outpstepname +# define OUTPTORDER outptordername + +EXTERN char infilename [FILENAMELENTH]; +EXTERN char outfilename [FILENAMELENTH]; +EXTERN char outtreename [FILENAMELENTH]; +EXTERN char intreename [FILENAMELENTH]; +EXTERN char outdistname [FILENAMELENTH]; +EXTERN char outlmname [FILENAMELENTH]; +EXTERN char outqlistname [FILENAMELENTH]; +EXTERN char outallquartname [FILENAMELENTH]; +EXTERN char outallquartlhname [FILENAMELENTH]; +EXTERN char outpstepname [FILENAMELENTH]; +EXTERN char outptordername [FILENAMELENTH]; + +#define OUTFILEEXT "puzzle" +#define TREEFILEEXT "tree" +#define DISTANCESEXT "dist" +#define TRIANGLEEXT "eps" +#define UNRESOLVEDEXT "qlist" +#define ALLQUARTEXT "allquart" +#define ALLQUARTLHEXT "allquartlh" +#define OUTPTLISTEXT "pstep" +#define OUTPTORDEREXT "ptorder" + +#ifndef PARALLEL /* because printf() runs significantly faster */ + /* than fprintf(stdout) on an Apple McIntosh */ + /* (HS) */ +# define FPRINTF printf +# define STDOUTFILE +#else +# define FPRINTF fprintf +# define STDOUT stdout +# define STDOUTFILE STDOUT, +#endif + + +/* auto_aamodel/auto_datatype values (xxx) */ +#define AUTO_OFF 0 +#define AUTO_GUESS 1 +#define AUTO_DEFAULT 2 + + +/* qptlist values (xxx) */ +#define PSTOUT_NONE 0 +#define PSTOUT_ORDER 1 +#define PSTOUT_LISTORDER 2 +#define PSTOUT_LIST 3 + +/* dtat_optn values (xxx) */ +#define NUCLEOTIDE 0 +#define AMINOACID 1 +#define BINARY 2 + +/* typ_optn values (xxx) */ +#define LIKMAPING_OPTN 1 +#define TREERECON_OPTN 0 + +/* puzzlemodes (xxx) */ +#define QUARTPUZ 0 +#define USERTREE 1 +#define PAIRDIST 2 + +/* rhetmodes (xxx) Modes of rate heterogeneity */ +#define UNIFORMRATE 0 +#define GAMMARATE 1 +#define TWORATE 2 +#define MIXEDRATE 3 + +/* defines for types of quartet likelihood computation (xxx) */ +#define EXACT 0 +#define APPROX 1 + +/* tree structure */ +typedef struct oneedge { + /* pointer to other three edges */ + 
struct oneedge *up; + struct oneedge *downleft; + struct oneedge *downright; + int numedge; /* number of edge */ + uli edgeinfo; /* value of this edge */ + int *edgemap; /* pointer to the local edgemap */ +} ONEEDGE; + + +/* variables */ +EXTERN cmatrix biparts; /* bipartitions of tree of current puzzling step */ +EXTERN cmatrix consbiparts; /* bipartitions of majority rule consensus tree */ +EXTERN cmatrix seqchars; /* characters contained in data set */ +EXTERN cmatrix treepict; /* picture of consensus tree */ +EXTERN double minscore; /* value of edgescore on minedge */ +EXTERN double tstvf84; /* F84 transition/transversion ratio */ +EXTERN double tstvratio; /* expected transition/transversion ratio */ +EXTERN double yrtsratio; /* expected pyrimidine/purine transition ratio */ +EXTERN dvector ulkl; /* log L of user trees */ +EXTERN dmatrix allsites; /* log L per sites of user trees */ +EXTERN dvector ulklc; /* log L of user trees (clock) */ +EXTERN dmatrix allsitesc; /* log L per sites of user trees (clock) */ +EXTERN FILE *utfp; /* pointer to user tree file */ +EXTERN FILE *ofp; /* pointer to output file */ +EXTERN FILE *seqfp; /* pointer to sequence input file */ +EXTERN FILE *tfp; /* pointer to tree file */ +EXTERN FILE *dfp; /* pointer to distance file */ +EXTERN FILE *trifp; /* pointer to triangle file */ +EXTERN FILE *unresfp; /* pointer to file with unresolved quartets */ +EXTERN FILE *tmpfp; /* pointer to temporary file */ +EXTERN FILE *qptlist; /* pointer to file with puzzling step trees */ +EXTERN FILE *qptorder; /* pointer to file with unique puzzling step trees */ +EXTERN int SHcodon; /* whether SH should be applied to 1st, 2nd codon positions */ +EXTERN int utree_optn; /* use first user tree for estimation */ +EXTERN int listqptrees; /* list puzzling step trees */ +EXTERN int approxqp; /* approximate QP quartets */ +EXTERN int *edgeofleaf; /* vector with edge number of all leaves */ +EXTERN int codon_optn; /* declares what positions in a codon should 
be used */ +EXTERN int compclock; /* computation of clocklike branch lengths */ +EXTERN int chooseA; /* leaf variable */ +EXTERN int chooseB; /* leaf variable */ +EXTERN int clustA, clustB, clustC, clustD; /* number of members of LM clusters */ +EXTERN int column; /* used for breaking lines (writing tree to treefile) */ +EXTERN int Frequ_optn; /* use empirical base frequencies */ +EXTERN int Maxbrnch; /* 2*Maxspc - 3 */ +EXTERN int Maxseqc; /* number of sequence characters per taxum */ +EXTERN int mflag; /* flag used for correct printing of runtime messages */ +EXTERN int minedge; /* edge with minimum edgeinfo */ +EXTERN int nextedge; /* number of edges in the current tree */ +EXTERN int nextleaf; /* next leaf to add to tree */ +EXTERN int numclust; /* number of clusters in LM analysis */ +EXTERN int outgroup; /* outgroup */ +EXTERN int puzzlemode; /* computation of QP tree and/or ML distances */ +EXTERN int rootsearch; /* how location of root is found */ +EXTERN int rhetmode; /* model of rate heterogeneity */ +EXTERN int splitlength; /* length of one entry in splitpatterns */ +EXTERN int *splitsizes; /* size of all different splits of all trees */ +EXTERN int usebestq_optn; /* use only best quartet topology, no bayesian weights */ +EXTERN int show_optn; /* show unresolved quartets */ +EXTERN int savequart_optn; /* save memory block which quartets to file */ +EXTERN int savequartlh_optn; /* save quartet likelihoods to file */ +EXTERN int saveqlhbin_optn; /* save quartet likelihoods binary */ +EXTERN int readquart_optn; /* read memory block which quartets from file */ +EXTERN int sym_optn; /* symmetrize doublet frequencies */ +EXTERN int xsize; /* depth of consensus tree picture */ +EXTERN int ytaxcounter; /* counter for establishing y-coordinates of all taxa */ +EXTERN int numutrees; /* number of users trees in input tree file */ +EXTERN ivector clusterA, clusterB, clusterC, clusterD; /* clusters for LM analysis */ +EXTERN ivector consconfid; /* confidence values 
of majority rule consensus tree */ +EXTERN ivector conssizes; /* partition sizes of majority rule consensus tree */ +EXTERN ivector trueID; /* leaf -> taxon on this leaf */ +EXTERN ivector xcor; /* x-coordinates of consensus tree nodes */ +EXTERN ivector ycor; /* y-coordinates of consensus tree nodes */ +EXTERN ivector ycormax; /* maximal y-coordinates of consensus tree nodes */ +EXTERN ivector ycormin; /* minimal y-coordinates of consensus tree nodes */ +EXTERN ivector ycortax; /* y-coordinates of all taxa */ +EXTERN ONEEDGE *edge; /* vector with all the edges of the tree */ +EXTERN uli *splitcomp; /* bipartition storage */ +EXTERN uli *splitfreqs; /* frequencies of all different splits of all trees */ +EXTERN uli *splitpatterns; /* all different splits of all trees */ +EXTERN uli badqs; /* number of bad quartets */ +EXTERN uli consincluded; /* number of included biparts in the consensus tree */ +EXTERN uli Currtrial; /* counter for puzzling steps */ +EXTERN uli maxbiparts; /* space is reserved for that many bipartitions */ +EXTERN uli mininfo; /* value of edgeinfo on minedge */ +EXTERN uli numbiparts; /* number of different bipartitions */ +EXTERN uli Numquartets; /* number of quartets */ +EXTERN uli Numtrial; /* number of puzzling steps */ +EXTERN uli lmqts; /* quartets investigated in LM analysis (0 = ALL) */ + +EXTERN int auto_datatype; /* guess datatype ? */ +EXTERN int guessdata_optn; /* guessed datatype */ + +EXTERN int auto_aamodel; /* guess amino acid modell ? */ +EXTERN int guessauto_aamodel; /* guessed amino acid modell ? 
*/ +EXTERN int guessDayhf_optn; /* guessed Dayhoff model option */ +EXTERN int guessJtt_optn; /* guessed JTT model option */ +EXTERN int guessblosum62_optn; /* guessed BLOSUM 62 model option */ +EXTERN int guessmtrev_optn; /* guessed mtREV model option */ +EXTERN int guesscprev_optn; /* guessed cpREV model option */ +EXTERN int guessvtmv_optn; /* guessed VT model option */ +EXTERN int guesswag_optn; /* guessed WAG model option */ + +/* counter variables needed in likelihood mapping analysis */ +EXTERN uli ar1, ar2, ar3; +EXTERN uli reg1, reg2, reg3, reg4, reg5, reg6, reg7; +EXTERN uli reg1l, reg1r, reg2u, reg2d, reg3u, reg3d, + reg4u, reg4d, reg5l, reg5r, reg6u, reg6d; +EXTERN unsigned char *quartetinfo; /* place where quartets are stored */ +EXTERN dvector qweight; /* for use in QP and LM analysis */ +EXTERN dvector sqdiff; +EXTERN ivector qworder; +EXTERN ivector sqorder; + +EXTERN int randseed; +EXTERN int psteptreestrlen; + +typedef struct treelistitemtypedummy { + struct treelistitemtypedummy *pred; + struct treelistitemtypedummy *succ; + struct treelistitemtypedummy *sortnext; + struct treelistitemtypedummy *sortlast; + char *tree; + int count; + int id; + int idx; +} treelistitemtype; + +EXTERN treelistitemtype *psteptreelist; +EXTERN treelistitemtype *psteptreesortlist; +EXTERN int psteptreenum; +EXTERN int psteptreesum; + + +/* prototypes */ +void makeF84model(void); +void compnumqts(void); +void setoptions(void); +void openfiletoread(FILE **, char[], char[]); +void openfiletowrite(FILE **, char[], char[]); +void openfiletoappend(FILE **, char[], char[]); +void closefile(FILE *); +void symdoublets(void); +void computeexpectations(void); +void putdistance(FILE *); +void findidenticals(FILE *); +double averagedist(void); +void initps(FILE *); +void plotlmpoint(FILE *, double, double); +void finishps(FILE *); +void makelmpoint(FILE *, double, double, double); +void printtreestats(FILE *); +void timestamp(FILE *); +void writeoutputfile(FILE *, int); + +/* 
definitions for writing output */ +#define WRITEALL 0 +#define WRITEPARAMS 1 +#define WRITEREST 2 + +void writetimesstat(FILE *ofp); +void writecutree(FILE *, int); +void starttimer(void); +void checktimer(uli); +void estimateparametersnotree(void); +void estimateparameterstree(void); +int main(int, char *[]); +int ulicmp(const void *, const void *); +int intcmp(const void *, const void *); + +void readid(FILE *, int); +char readnextcharacter(FILE *, int, int); +void skiprestofline(FILE *, int, int); +void skipcntrl(FILE *, int, int); +void getseqs(FILE *); +void initid(int); +void fputid10(FILE *, int); +int fputid(FILE *, int); +void getsizesites(FILE *); +void getdataset(FILE *); +int guessdatatype(void); +void translatedataset(void); +void estimatebasefreqs(void); +void guessmodel(void); +void inittree(void); +void addnextleaf(int); +void freetree(void); +void writeOTU(FILE *, int); +void writetree(FILE *); +int *initctree(); +void copytree(int *ctree); +void freectree(int **snodes); +void printctree(int *ctree); +char *sprintfctree(int *ctree, int strlen); +void fprintffullpstree(FILE *outf, char *treestr); +int printfsortctree(int *ctree); +int sortctree(int *ctree); +int ct_1stedge(int node); +int ct_2ndedge(int node); +int ct_3rdedge(int node); + +void printfpstrees(treelistitemtype *list); +void printfsortedpstrees(treelistitemtype *list); +void fprintfsortedpstrees(FILE *output, treelistitemtype *list, int itemnum, int itemsum, int comment, float cutoff); + +void sortbynum(treelistitemtype *list, treelistitemtype **sortlist); +treelistitemtype *addtree2list(char **tree, + int numtrees, + treelistitemtype **list, + int *numitems, + int *numsum); +void freetreelist(treelistitemtype **list, + int *numitems, + int *numsum); +void resetedgeinfo(void); +void incrementedgeinfo(int, int); +void minimumedgeinfo(void); +void initconsensus(void); +void makepart(int, int); +void computebiparts(void); +void printsplit(FILE *, uli); +void makenewsplitentries(void); 
+void copysplit(uli, int); +void makeconsensus(void); +void writenode(FILE *, int); +void writeconsensustree(FILE *); +void nodecoordinates(int); +void drawnode(int, int); +void plotconsensustree(FILE *); +unsigned char *mallocquartets(int); +void freequartets(void); +unsigned char readquartet(int, int, int, int); +void writequartet(int, int, int, int, unsigned char); +void sort3doubles(dvector, ivector); +void computeallquartets(void); +void checkquartet(int, int, int, int); +void num2quart(uli qnum, int *a, int *b, int *c, int *d); +uli numquarts(int maxspc); +uli quart2num (int a, int b, int c, int d); + +void writetpqfheader(int nspec, FILE *ofp, int flag); + + +/* extracted from main (xxx) */ +void compute_quartlklhds(int a, int b, int c, int d, double *d1, double *d2, double *d3, int approx); + + +/* definitions for timing */ + +#define OVERALL 0 +#define GENERAL 1 +#define OPTIONS 2 +#define PARAMEST 3 +#define QUARTETS 4 +#define PUZZLING 5 +#define TREEEVAL 6 + +typedef struct { + int currentjob; + clock_t tempcpu; + clock_t tempfullcpu; + clock_t tempcpustart; + time_t temptime; + time_t tempfulltime; + time_t temptimestart; + + clock_t maxcpu; + clock_t mincpu; + time_t maxtime; + time_t mintime; + + double maxcpublock; + double mincpublock; + double mincputick; + double mincputicktime; + double maxtimeblock; + double mintimeblock; + + double generalcpu; + double optionscpu; + double paramestcpu; + double quartcpu; + double quartblockcpu; + double quartmaxcpu; + double quartmincpu; + double puzzcpu; + double puzzblockcpu; + double puzzmaxcpu; + double puzzmincpu; + double treecpu; + double treeblockcpu; + double treemaxcpu; + double treemincpu; + double cpu; + double fullcpu; + + double generaltime; + double optionstime; + double paramesttime; + double quarttime; + double quartblocktime; + double quartmaxtime; + double quartmintime; + double puzztime; + double puzzblocktime; + double puzzmaxtime; + double puzzmintime; + double treetime; + double 
treeblocktime; + double treemaxtime; + double treemintime; + double time; + double fulltime; +} timearray_t; + +EXTERN double cputime, walltime; +EXTERN double fullcpu, fulltime; +EXTERN double fullcputime, fullwalltime; +EXTERN double altcputime, altwalltime; +EXTERN clock_t cputimestart, cputimestop, cputimedummy; +EXTERN time_t walltimestart, walltimestop, walltimedummy; +EXTERN clock_t Startcpu; /* start cpu time */ +EXTERN clock_t Stopcpu; /* stop cpu time */ +EXTERN time_t Starttime; /* start time */ +EXTERN time_t Stoptime; /* stop time */ +EXTERN time_t time0; /* timer variable */ +EXTERN time_t time1; /* yet another timer */ +EXTERN time_t time2; /* yet another timer */ +EXTERN timearray_t tarr; + +void resetqblocktime(timearray_t *ta); +void resetpblocktime(timearray_t *ta); +void inittimearr(timearray_t *ta); +void addtimes(int jobtype, timearray_t *ta); +#ifdef TIMEDEBUG + void printtimearr(timearray_t *ta); +#endif /* TIMEDEBUG */ + +#endif /* _PUZZLE_ */ + diff --git a/forester/archive/RIO/others/puzzle_dqo/src/puzzle1.c b/forester/archive/RIO/others/puzzle_dqo/src/puzzle1.c new file mode 100644 index 0000000..a012cb4 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/puzzle1.c @@ -0,0 +1,2864 @@ +/* + * puzzle1.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. 
+ */ + + +#define EXTERN + +#include "puzzle.h" +#include "gamma.h" + +void num2quart(uli qnum, int *a, int *b, int *c, int *d) +{ + double temp; + uli aa, bb, cc, dd; + uli lowval=0, highval=0; + + aa=0; bb=1; cc=2; dd=3; + + temp = (double)(24 * qnum); + temp = sqrt(temp); + temp = sqrt(temp); + /* temp = pow(temp, (double)(1/4)); */ + dd = (uli) floor(temp) + 1; + if (dd < 3) dd = 3; + lowval = (uli) dd*(dd-1)*(dd-2)*(dd-3)/24; + highval = (uli) (dd+1)*dd*(dd-1)*(dd-2)/24; + if (lowval >= qnum) + while ((lowval > qnum)) { + dd -= 1; lowval = (uli) dd*(dd-1)*(dd-2)*(dd-3)/24; + } + else { + while (highval <= qnum) { + dd += 1; highval = (uli) (dd+1)*dd*(dd-1)*(dd-2)/24; + } + lowval = (uli) dd*(dd-1)*(dd-2)*(dd-3)/24; + } + qnum -= lowval; + if (qnum > 0) { + temp = (double)(6 * qnum); + temp = pow(temp, (double)(1/3)); + cc = (uli) floor(temp); + if (cc < 2) cc= 2; + lowval = (uli) cc*(cc-1)*(cc-2)/6; + highval = (uli) (cc+1)*cc*(cc-1)/6; + if (lowval >= qnum) + while ((lowval > qnum)) { + cc -= 1; lowval = (uli) cc*(cc-1)*(cc-2)/6; + } + else { + while (highval <= qnum) { + cc += 1; highval = (uli) (cc+1)*cc*(cc-1)/6; + } + lowval = (uli) cc*(cc-1)*(cc-2)/6; + } + qnum -= lowval; + if (qnum > 0) { + temp = (double)(2 * qnum); + temp = sqrt(temp); + bb = (uli) floor(temp); + if (bb < 1) bb= 1; + lowval = (uli) bb*(bb-1)/2; + highval = (uli) (bb+1)*bb/2; + if (lowval >= qnum) + while ((lowval > qnum)) { + bb -= 1; lowval = (uli) bb*(bb-1)/2; + } + else { + while (highval <= qnum) { + bb += 1; highval = (uli) (bb+1)*bb/2; + } + lowval = (uli) bb*(bb-1)/2; + } + qnum -= lowval; + if (qnum > 0) { + aa = (uli) qnum; + if (aa < 0) aa= 0; + } + } + } + *d = (int)dd; + *c = (int)cc; + *b = (int)bb; + *a = (int)aa; +} /* num2quart */ + +/******************/ + +uli numquarts(int maxspc) +{ + uli tmp; + int a, b, c, d; + + if (maxspc < 4) + return (uli)0; + else { + maxspc--; + a = maxspc-3; + b = maxspc-2; + c = maxspc-1; + d = maxspc; + + tmp = (uli) 1 + a + + (uli) b * 
(b-1) / 2 + + (uli) c * (c-1) * (c-2) / 6 + + (uli) d * (d-1) * (d-2) * (d-3) / 24; + return (tmp); + } +} /* numquarts */ + +/******************/ + +uli quart2num (int a, int b, int c, int d) +{ + uli tmp; + if ((a>b) || (b>c) || (c>d)) { + fprintf(stderr, "Error PP5 not (%d <= %d <= %d <= %d) !!!\n", a, b, c, +d); + exit (1); + } + tmp = (uli) a + + (uli) b * (b-1) / 2 + + (uli) c * (c-1) * (c-2) / 6 + + (uli) d * (d-1) * (d-2) * (d-3) / 24; + return (tmp); +} /* quart2num */ + +/******************/ + + + +/* flag=0 old allquart binary */ +/* flag=1 allquart binary */ +/* flag=2 allquart ACSII */ +/* flag=3 quartlh binary */ +/* flag=4 quartlh ASCII */ + +void writetpqfheader(int nspec, + FILE *ofp, + int flag) +{ int currspec; + + if (flag == 0) { + unsigned long nquart; + unsigned long blocklen; + + nquart = numquarts(nspec); + /* compute number of bytes */ + if (nquart % 2 == 0) { /* even number */ + blocklen = (nquart)/2; + } else { /* odd number */ + blocklen = (nquart + 1)/2; + } + /* FPRINTF(STDOUTFILE "Writing quartet file: %s\n", filename); */ + fprintf(ofp, "TREE-PUZZLE\n%s\n\n", VERSION); + fprintf(ofp, "species: %d\n", nspec); + fprintf(ofp, "quartets: %lu\n", nquart); + fprintf(ofp, "bytes: %lu\n\n", blocklen); + + + /* fwrite(&(quartetinfo[0]), sizeof(char), blocklen, ofp); */ + } + + if (flag == 1) fprintf(ofp, "##TPQF-BB (TREE-PUZZLE %s)\n%d\n", VERSION, nspec); + if (flag == 2) fprintf(ofp, "##TPQF-BA (TREE-PUZZLE %s)\n%d\n", VERSION, nspec); + if (flag == 3) fprintf(ofp, "##TPQF-LB (TREE-PUZZLE %s)\n%d\n", VERSION, nspec); + if (flag == 4) fprintf(ofp, "##TPQF-LA (TREE-PUZZLE %s)\n%d\n", VERSION, nspec); + + for (currspec=0; currspec MAXTS) { + FPRINTF(STDOUTFILE "\n\n\nF84 model not possible "); + FPRINTF(STDOUTFILE "(bad Ts/Tv parameter)\n"); + tstvf84 = 0.0; + return; + } + if (yr < MINYR || yr > MAXYR) { + FPRINTF(STDOUTFILE "\n\n\nF84 model not possible "); + FPRINTF(STDOUTFILE "(bad Y/R transition parameter)\n"); + tstvf84 = 0.0; + return; 
+ } + TSparam = ts; + YRparam = yr; + optim_optn = FALSE; +} + +/* compute number of quartets used in LM analysis */ +void compnumqts() +{ + if (lmqts == 0) { + if (numclust == 4) + Numquartets = (uli) clustA*clustB*clustC*clustD; + if (numclust == 3) + Numquartets = (uli) clustA*clustB*clustC*(clustC-1)/2; + if (numclust == 2) + Numquartets = (uli) clustA*(clustA-1)/2 * clustB*(clustB-1)/2; + if (numclust == 1) + Numquartets = (uli) Maxspc*(Maxspc-1)*(Maxspc-2)*(Maxspc-3)/24; + } else { + Numquartets = lmqts; + } +} + +/* set options interactively */ +void setoptions() +{ + int i, valid; + double sumfreq; + char ch; + + puzzlemode = PAIRDIST; /*Only do pairwise dist. CZ, 05/16/01*/ + + /* defaults */ + rhetmode = UNIFORMRATE; /* assume rate homogeneity */ + numcats = 1; + Geta = 0.05; + grate_optim = FALSE; + fracinv = 0.0; + fracinv_optim = FALSE; + + compclock = FALSE; /* compute clocklike branch lengths */ + locroot = -1; /* search for optimal place of root */ + qcalg_optn = FALSE; /* don't use sampling of quartets */ + approxp_optn = TRUE; /* approximate parameter estimates */ + listqptrees = PSTOUT_NONE; /* list puzzling step trees */ + + /* approximate QP quartets? 
*/ + if (Maxspc <= 6) approxqp = FALSE; + else approxqp = TRUE; + + codon_optn = 0; /* use all positions in a codon */ + + /* number of puzzling steps */ + if (Maxspc <= 25) Numtrial = 1000; + else if (Maxspc <= 50) Numtrial = 10000; + else if (Maxspc <= 75) Numtrial = 25000; + else Numtrial = 50000; + + utree_optn = TRUE; /* use first user tree for estimation */ + outgroup = 0; /* use first taxon as outgroup */ + sym_optn = FALSE; /* symmetrize doublet frequencies */ + tstvf84 = 0.0; /* disable F84 model */ + show_optn = FALSE; /* show unresolved quartets */ + typ_optn = TREERECON_OPTN; /* tree reconstruction */ + numclust = 1; /* one clusters in LM analysis */ + lmqts = 0; /* all quartets in LM analysis */ + compnumqts(); + if (Numquartets > 10000) { + lmqts = 10000; /* 10000 quartets in LM analysis */ + compnumqts(); + } + + do { + FPRINTF(STDOUTFILE "\n\n\nGENERAL OPTIONS\n"); + FPRINTF(STDOUTFILE " b Type of analysis? "); + if (typ_optn == TREERECON_OPTN) FPRINTF(STDOUTFILE "Tree reconstruction\n"); + if (typ_optn == LIKMAPING_OPTN) FPRINTF(STDOUTFILE "Likelihood mapping\n"); + if (typ_optn == TREERECON_OPTN) { + FPRINTF(STDOUTFILE " k Tree search procedure? "); + if (puzzlemode == QUARTPUZ) FPRINTF(STDOUTFILE "Quartet puzzling\n"); + if (puzzlemode == USERTREE) FPRINTF(STDOUTFILE "User defined trees\n"); + if (puzzlemode == PAIRDIST) FPRINTF(STDOUTFILE "Pairwise distances only (no tree)\n"); + if (puzzlemode == QUARTPUZ) { + FPRINTF(STDOUTFILE " v Approximate quartet likelihood? %s\n", + (approxqp ? "Yes" : "No")); + FPRINTF(STDOUTFILE " u List unresolved quartets? %s\n", + (show_optn ? "Yes" : "No")); + FPRINTF(STDOUTFILE " n Number of puzzling steps? %lu\n", + Numtrial); + FPRINTF(STDOUTFILE " j List puzzling step trees? 
"); + switch (listqptrees) { + case PSTOUT_NONE: FPRINTF(STDOUTFILE "No\n"); break; + case PSTOUT_ORDER: FPRINTF(STDOUTFILE "Unique topologies\n"); break; + case PSTOUT_LISTORDER: FPRINTF(STDOUTFILE "Unique topologies & Chronological list\n"); break; + case PSTOUT_LIST: FPRINTF(STDOUTFILE "Chronological list only\n"); break; + } + + FPRINTF(STDOUTFILE " o Display as outgroup? "); + fputid(STDOUT, outgroup); + FPRINTF(STDOUTFILE "\n"); + } + if (puzzlemode == QUARTPUZ || puzzlemode == USERTREE) { + FPRINTF(STDOUTFILE " z Compute clocklike branch lengths? "); + if (compclock) FPRINTF(STDOUTFILE "Yes\n"); + else FPRINTF(STDOUTFILE "No\n"); + } + if (compclock) + if (puzzlemode == QUARTPUZ || puzzlemode == USERTREE) { + FPRINTF(STDOUTFILE " l Location of root? "); + if (locroot < 0) FPRINTF(STDOUTFILE "Best place (automatic search)\n"); + else if (locroot < Maxspc) { + FPRINTF(STDOUTFILE "Branch %d (", locroot + 1); + fputid(STDOUT, locroot); + FPRINTF(STDOUTFILE ")\n"); + } else FPRINTF(STDOUTFILE "Branch %d (internal branch)\n", locroot + 1); + } + } + if (typ_optn == LIKMAPING_OPTN) { + FPRINTF(STDOUTFILE " g Group sequences in clusters? "); + if (numclust == 1) FPRINTF(STDOUTFILE "No\n"); + else FPRINTF(STDOUTFILE "Yes (%d clusters as specified)\n", numclust); + FPRINTF(STDOUTFILE " n Number of quartets? "); + if (lmqts == 0) FPRINTF(STDOUTFILE "%lu (all possible)\n", Numquartets); + else FPRINTF(STDOUTFILE "%lu (random choice)\n", lmqts); + } + FPRINTF(STDOUTFILE " e Parameter estimates? "); + if (approxp_optn) FPRINTF(STDOUTFILE "Approximate (faster)\n"); + else FPRINTF(STDOUTFILE "Exact (slow)\n"); + if (!(puzzlemode == USERTREE && typ_optn == TREERECON_OPTN)) { + FPRINTF(STDOUTFILE " x Parameter estimation uses? "); + if (qcalg_optn) FPRINTF(STDOUTFILE "Quartet sampling + NJ tree\n"); + else FPRINTF(STDOUTFILE "Neighbor-joining tree\n"); + + } else { + FPRINTF(STDOUTFILE " x Parameter estimation uses? 
"); + if (utree_optn) + FPRINTF(STDOUTFILE "1st input tree\n"); + else if (qcalg_optn) FPRINTF(STDOUTFILE "Quartet sampling + NJ tree\n"); + else FPRINTF(STDOUTFILE "Neighbor-joining tree\n"); + } + FPRINTF(STDOUTFILE "SUBSTITUTION PROCESS\n"); + FPRINTF(STDOUTFILE " d Type of sequence input data? "); + if (auto_datatype == AUTO_GUESS) FPRINTF(STDOUTFILE "Auto: "); + if (data_optn == NUCLEOTIDE) FPRINTF(STDOUTFILE "Nucleotides\n"); + if (data_optn == AMINOACID) FPRINTF(STDOUTFILE "Amino acids\n"); + if (data_optn == BINARY) FPRINTF(STDOUTFILE "Binary states\n"); + if (data_optn == NUCLEOTIDE && (Maxseqc % 3) == 0 && !SH_optn) { + FPRINTF(STDOUTFILE " h Codon positions selected? "); + if (codon_optn == 0) FPRINTF(STDOUTFILE "Use all positions\n"); + if (codon_optn == 1) FPRINTF(STDOUTFILE "Use only 1st positions\n"); + if (codon_optn == 2) FPRINTF(STDOUTFILE "Use only 2nd positions\n"); + if (codon_optn == 3) FPRINTF(STDOUTFILE "Use only 3rd positions\n"); + if (codon_optn == 4) FPRINTF(STDOUTFILE "Use 1st and 2nd positions\n"); + } + FPRINTF(STDOUTFILE " m Model of substitution? "); + if (data_optn == NUCLEOTIDE) { /* nucleotides */ + if (nuc_optn) { + if(HKY_optn) + FPRINTF(STDOUTFILE "HKY (Hasegawa et al. 1985)\n"); + else { + FPRINTF(STDOUTFILE "TN (Tamura-Nei 1993)\n"); + FPRINTF(STDOUTFILE " p Constrain TN model to F84 model? "); + if (tstvf84 == 0.0) + FPRINTF(STDOUTFILE "No\n"); + else FPRINTF(STDOUTFILE "Yes (Ts/Tv ratio = %.2f)\n", tstvf84); + } + FPRINTF(STDOUTFILE " t Transition/transversion parameter? "); + if (optim_optn) + FPRINTF(STDOUTFILE "Estimate from data set\n"); + else + FPRINTF(STDOUTFILE "%.2f\n", TSparam); + if (TN_optn) { + FPRINTF(STDOUTFILE " r Y/R transition parameter? "); + if (optim_optn) + FPRINTF(STDOUTFILE "Estimate from data set\n"); + else + FPRINTF(STDOUTFILE "%.2f\n", YRparam); + } + } + if (SH_optn) { + FPRINTF(STDOUTFILE "SH (Schoeniger-von Haeseler 1994)\n"); + FPRINTF(STDOUTFILE " t Transition/transversion parameter? 
"); + if (optim_optn) + FPRINTF(STDOUTFILE "Estimate from data set\n"); + else + FPRINTF(STDOUTFILE "%.2f\n", TSparam); + } + } + if (data_optn == NUCLEOTIDE && SH_optn) { + FPRINTF(STDOUTFILE " h Doublets defined by? "); + if (SHcodon) + FPRINTF(STDOUTFILE "1st and 2nd codon positions\n"); + else + FPRINTF(STDOUTFILE "1st+2nd, 3rd+4th, etc. site\n"); + } + if (data_optn == AMINOACID) { /* amino acids */ + switch (auto_aamodel) { + case AUTO_GUESS: + FPRINTF(STDOUTFILE "Auto: "); + break; + case AUTO_DEFAULT: + FPRINTF(STDOUTFILE "Def.: "); + break; + } + if (Dayhf_optn) FPRINTF(STDOUTFILE "Dayhoff (Dayhoff et al. 1978)\n"); + if (Jtt_optn) FPRINTF(STDOUTFILE "JTT (Jones et al. 1992)\n"); + if (mtrev_optn) FPRINTF(STDOUTFILE "mtREV24 (Adachi-Hasegawa 1996)\n"); + if (cprev_optn) FPRINTF(STDOUTFILE "cpREV45 (Adachi et al. 2000)\n"); + if (blosum62_optn) FPRINTF(STDOUTFILE "BLOSUM62 (Henikoff-Henikoff 92)\n"); + if (vtmv_optn) FPRINTF(STDOUTFILE "VT (Mueller-Vingron 2000)\n"); + if (wag_optn) FPRINTF(STDOUTFILE "WAG (Whelan-Goldman 2000)\n"); + } + if (data_optn == BINARY) { /* binary states */ + FPRINTF(STDOUTFILE "Two-state model (Felsenstein 1981)\n"); + } + if (data_optn == AMINOACID) + FPRINTF(STDOUTFILE " f Amino acid frequencies? "); + else if (data_optn == NUCLEOTIDE && SH_optn) + FPRINTF(STDOUTFILE " f Doublet frequencies? "); + else if (data_optn == NUCLEOTIDE && nuc_optn) + FPRINTF(STDOUTFILE " f Nucleotide frequencies? "); + else if (data_optn == BINARY) + FPRINTF(STDOUTFILE " f Binary state frequencies? "); + FPRINTF(STDOUTFILE "%s\n", (Frequ_optn ? "Estimate from data set" : + "Use specified values")); + if (data_optn == NUCLEOTIDE && SH_optn) + FPRINTF(STDOUTFILE " s Symmetrize doublet frequencies? %s\n", + (sym_optn ? "Yes" : "No")); + + FPRINTF(STDOUTFILE "RATE HETEROGENEITY\n"); + FPRINTF(STDOUTFILE " w Model of rate heterogeneity? 
"); + if (rhetmode == UNIFORMRATE) FPRINTF(STDOUTFILE "Uniform rate\n"); + if (rhetmode == GAMMARATE ) FPRINTF(STDOUTFILE "Gamma distributed rates\n"); + if (rhetmode == TWORATE ) FPRINTF(STDOUTFILE "Two rates (1 invariable + 1 variable)\n"); + if (rhetmode == MIXEDRATE ) FPRINTF(STDOUTFILE "Mixed (1 invariable + %d Gamma rates)\n", numcats); + + if (rhetmode == TWORATE || rhetmode == MIXEDRATE) { + FPRINTF(STDOUTFILE " i Fraction of invariable sites? "); + if (fracinv_optim) FPRINTF(STDOUTFILE "Estimate from data set"); + else FPRINTF(STDOUTFILE "%.2f", fracinv); + if (fracinv == 0.0 && !fracinv_optim) FPRINTF(STDOUTFILE " (all sites variable)"); + FPRINTF(STDOUTFILE "\n"); + } + if (rhetmode == GAMMARATE || rhetmode == MIXEDRATE) { + FPRINTF(STDOUTFILE " a Gamma distribution parameter alpha? "); + if (grate_optim) + FPRINTF(STDOUTFILE "Estimate from data set\n"); + else if (Geta > 0.5) + FPRINTF(STDOUTFILE "%.2f (strong rate heterogeneity)\n", (1.0-Geta)/Geta); + else FPRINTF(STDOUTFILE "%.2f (weak rate heterogeneity)\n", (1.0-Geta)/Geta); + FPRINTF(STDOUTFILE " c Number of Gamma rate categories? 
%d\n", numcats); + } + + FPRINTF(STDOUTFILE "\nQuit [q], confirm [y], or change [menu] settings: "); + + /* read one char */ + ch = getchar(); + if (ch != '\n') { + do ; + while (getchar() != '\n'); + } + ch = (char) tolower((int) ch); + + /* letters in use: d m */ + /* letters not in use: */ + + switch (ch) { + + case '\n': break; + + + + case 'd': if (auto_datatype == AUTO_GUESS) { + auto_datatype = AUTO_OFF; + guessdata_optn = data_optn; + data_optn = 0; + } else { + data_optn = data_optn + 1; + if (data_optn == 3) { + auto_datatype = AUTO_GUESS; + data_optn = guessdata_optn; + } + } + /* translate characters into format used by ML engine */ + translatedataset(); + estimatebasefreqs(); + break; + + + + case 'm': if (data_optn == NUCLEOTIDE) { /* nucleotide data */ + if(HKY_optn && nuc_optn) { + /* HKY -> TN */ + tstvf84 = 0.0; + TSparam = 2.0; + YRparam = 0.9; + HKY_optn = FALSE; + TN_optn = TRUE; + optim_optn = TRUE; + nuc_optn = TRUE; + SH_optn = FALSE; + break; + } + if(TN_optn && nuc_optn) { + if (Maxseqc % 2 == 0 || Maxseqc % 3 == 0) { + /* number of chars needs to be a multiple 2 or 3 */ + /* TN -> SH */ + if (Maxseqc % 2 != 0 && Maxseqc % 3 == 0) + SHcodon = TRUE; + else + SHcodon = FALSE; + tstvf84 = 0.0; + TSparam = 2.0; + YRparam = 1.0; + HKY_optn = TRUE; + TN_optn = FALSE; + optim_optn = TRUE; + nuc_optn = FALSE; + SH_optn = TRUE; + /* translate characters into format */ + /* used by ML engine */ + translatedataset(); + estimatebasefreqs(); + } else { + FPRINTF(STDOUTFILE "\n\n\nSH model not "); + FPRINTF(STDOUTFILE "available for the data set!\n"); + /* TN -> HKY */ + tstvf84 = 0.0; + TSparam = 2.0; + YRparam = 1.0; + HKY_optn = TRUE; + TN_optn = FALSE; + optim_optn = TRUE; + nuc_optn = TRUE; + SH_optn = FALSE; + } + break; + } + if(SH_optn) { + /* SH -> HKY */ + tstvf84 = 0.0; + TSparam = 2.0; + YRparam = 1.0; + HKY_optn = TRUE; + TN_optn = FALSE; + optim_optn = TRUE; + nuc_optn = TRUE; + SH_optn = FALSE; + /* translate characters into format */ + 
/* used by ML engine */ + translatedataset(); + estimatebasefreqs(); + break; + } + break; + } + if (data_optn == AMINOACID) { /* amino acid data */ + if (auto_aamodel) { + /* AUTO -> Dayhoff */ + Dayhf_optn = TRUE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + blosum62_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + auto_aamodel = AUTO_OFF; + break; + } + if (Dayhf_optn) { + /* Dayhoff -> JTT */ + Dayhf_optn = FALSE; + Jtt_optn = TRUE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + blosum62_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + auto_aamodel = AUTO_OFF; + break; + } + if (Jtt_optn) { + /* JTT -> mtREV */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = TRUE; + cprev_optn = FALSE; + blosum62_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + auto_aamodel = AUTO_OFF; + break; + } +#ifdef CPREV + if (mtrev_optn) { + /* mtREV -> cpREV */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = TRUE; + blosum62_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + auto_aamodel = AUTO_OFF; + break; + } +#else /* ! CPREV */ + if (mtrev_optn) { + /* mtREV -> BLOSUM 62 */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + blosum62_optn = TRUE; + vtmv_optn = FALSE; + wag_optn = FALSE; + auto_aamodel = AUTO_OFF; + break; + } +#endif /* ! 
CPREV */ + +#ifdef CPREV + if (cprev_optn) { + /* cpREV -> BLOSUM 62 */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + blosum62_optn = TRUE; + vtmv_optn = FALSE; + wag_optn = FALSE; + auto_aamodel = AUTO_OFF; + break; + } +#endif + if (blosum62_optn) { + /* BLOSUM 62 -> VT model */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + blosum62_optn = FALSE; + vtmv_optn = TRUE; + wag_optn = FALSE; + auto_aamodel = AUTO_OFF; + break; + } + if (vtmv_optn) { + /* VT model -> WAG model */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + blosum62_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = TRUE; + auto_aamodel = AUTO_OFF; + break; + } + if (wag_optn) { + /* WAG model -> AUTO */ + Dayhf_optn = guessDayhf_optn; + Jtt_optn = guessJtt_optn; + mtrev_optn = guessmtrev_optn; + cprev_optn = guesscprev_optn; + blosum62_optn = guessblosum62_optn; + vtmv_optn = guessvtmv_optn; + wag_optn = guesswag_optn; + auto_aamodel = guessauto_aamodel; + break; + } + break; + } + if (data_optn == BINARY) { + FPRINTF(STDOUTFILE "\n\n\nNo other model available!\n"); + } + break; + + + + case 'y': break; + + default: FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + break; + } + } while (ch != 'y'); + + FPRINTF(STDOUTFILE "\n\n\n"); +} + +/* open file for reading */ +void openfiletoread(FILE **fp, char name[], char descr[]) +{ + int count = 0; + cvector str; + + if ((*fp = fopen(name, "r")) == NULL) { + FPRINTF(STDOUTFILE "\n\n\nPlease enter a file name for the %s: ", descr); + str = mygets(); + while ((*fp = fopen(str, "r")) == NULL) + { + count++; + if (count > 10) + { + FPRINTF(STDOUTFILE "\n\n\nToo many trials - quitting ...\n"); + exit(1); + } + FPRINTF(STDOUTFILE "File '%s' not found, ", str); + FPRINTF(STDOUTFILE "please enter alternative name: "); + free_cvector(str); + str = mygets(); + } + free_cvector(str); + FPRINTF(STDOUTFILE "\n"); + } +} /* 
openfiletoread */ + + +/* open file for writing */ +void openfiletowrite(FILE **fp, char name[], char descr[]) +{ + int count = 0; + cvector str; + + if ((*fp = fopen(name, "w")) == NULL) { + FPRINTF(STDOUTFILE "\n\n\nPlease enter a file name for the %s: ", descr); + str = mygets(); + while ((*fp = fopen(str, "w")) == NULL) + { + count++; + if (count > 10) + { + FPRINTF(STDOUTFILE "\n\n\nToo many trials - quitting ...\n"); + exit(1); + } + FPRINTF(STDOUTFILE "File '%s' not created, ", str); + FPRINTF(STDOUTFILE "please enter other name: "); + free_cvector(str); + str = mygets(); + } + free_cvector(str); + FPRINTF(STDOUTFILE "\n"); + } +} /* openfiletowrite */ + + +/* open file for appending */ +void openfiletoappend(FILE **fp, char name[], char descr[]) +{ + int count = 0; + cvector str; + + if ((*fp = fopen(name, "a")) == NULL) { + FPRINTF(STDOUTFILE "\n\n\nPlease enter a file name for the %s: ", descr); + str = mygets(); + while ((*fp = fopen(str, "a")) == NULL) + { + count++; + if (count > 10) + { + FPRINTF(STDOUTFILE "\n\n\nToo many trials - quitting ...\n"); + exit(1); + } + FPRINTF(STDOUTFILE "File '%s' not created, ", str); + FPRINTF(STDOUTFILE "please enter other name: "); + free_cvector(str); + str = mygets(); + } + free_cvector(str); + FPRINTF(STDOUTFILE "\n"); + } +} /* openfiletowrite */ + + +/* close file */ +void closefile(FILE *fp) +{ + fclose(fp); +} /* closefile */ + +/* symmetrize doublet frequencies */ +void symdoublets() +{ + int i, imean; + double mean; + + if (data_optn == NUCLEOTIDE && SH_optn && sym_optn) { + /* ML frequencies */ + mean = (Freqtpm[1] + Freqtpm[4])/2.0; /* AC CA */ + Freqtpm[1] = mean; + Freqtpm[4] = mean; + mean = (Freqtpm[2] + Freqtpm[8])/2.0; /* AG GA */ + Freqtpm[2] = mean; + Freqtpm[8] = mean; + mean = (Freqtpm[3] + Freqtpm[12])/2.0; /* AT TA */ + Freqtpm[3] = mean; + Freqtpm[12] = mean; + mean = (Freqtpm[6] + Freqtpm[9])/2.0; /* CG GC */ + Freqtpm[6] = mean; + Freqtpm[9] = mean; + mean = (Freqtpm[7] + Freqtpm[13])/2.0; 
/* CT TC */ + Freqtpm[7] = mean; + Freqtpm[13] = mean; + mean = (Freqtpm[11] + Freqtpm[14])/2.0; /* GT TG */ + Freqtpm[11] = mean; + Freqtpm[14] = mean; + + /* base composition of each taxon */ + for (i = 0; i < Maxspc; i++) { + imean = (Basecomp[i][1] + Basecomp[i][4])/2; /* AC CA */ + Basecomp[i][1] = imean; + Basecomp[i][4] = imean; + imean = (Basecomp[i][2] + Basecomp[i][8])/2; /* AG GA */ + Basecomp[i][2] = imean; + Basecomp[i][8] = imean; + imean = (Basecomp[i][3] + Basecomp[i][12])/2; /* AT TA */ + Basecomp[i][3] = imean; + Basecomp[i][12] = imean; + imean = (Basecomp[i][6] + Basecomp[i][9])/2; /* CG GC */ + Basecomp[i][6] = imean; + Basecomp[i][9] = imean; + imean = (Basecomp[i][7] + Basecomp[i][13])/2; /* CT TC */ + Basecomp[i][7] = imean; + Basecomp[i][13] = imean; + imean = (Basecomp[i][11] + Basecomp[i][14])/2; /* GT TG */ + Basecomp[i][11] = imean; + Basecomp[i][14] = imean; + } + } +} + +/* show Ts/Tv ratio and Ts Y/R ratio */ +void computeexpectations() +{ + /* CZ */ +} + +/* write ML distance matrix to file. 
Modified CZ 05/29/01 */ +void putdistance(FILE *fp) +{ + /*int i;*/ + int i, j; + + for (i = 0; i < Maxspc - 1; i++) { + /*fprintf(fp, "%.5f ", Distanmat[i]/100.0);*/ + for ( j = 0; j < 26; j++ ) { + fputc( Identif[i][j], fp ); /*CZ*/ + } + fprintf(fp, "%.5f\n", Distanmat[i]/100.0); + } + fprintf(fp, "\n"); + +} + + + + +/* first lines of EPSF likelihood mapping file */ +void initps(FILE *ofp) +{ + /* CZ */ +} + +/* plot one point of likelihood mapping analysis */ +void plotlmpoint(FILE *ofp, double w1, double w2) +{ + /* CZ */ +} + +/* last lines of EPSF likelihood mapping file */ +void finishps(FILE *ofp) +{ + /* CZ */ +} + +/* computes LM point from the three log-likelihood values, + plots the point, and does some statistics */ +void makelmpoint(FILE *fp, double b1, double b2, double b3) +{ + double w1, w2, w3, temp; + unsigned char qpbranching; + double temp1, temp2, temp3, onethird; + unsigned char discreteweight[3], treebits[3]; + + onethird = 1.0/3.0; + treebits[0] = (unsigned char) 1; + treebits[1] = (unsigned char) 2; + treebits[2] = (unsigned char) 4; + + /* sort in descending order */ + qweight[0] = b1; + qweight[1] = b2; + qweight[2] = b3; + sort3doubles(qweight, qworder); + + /* compute Bayesian weights */ + qweight[qworder[1]] = exp(qweight[qworder[1]]-qweight[qworder[0]]); + qweight[qworder[2]] = exp(qweight[qworder[2]]-qweight[qworder[0]]); + qweight[qworder[0]] = 1.0; + temp = qweight[0] + qweight[1] + qweight[2]; + qweight[0] = qweight[0]/temp; + qweight[1] = qweight[1]/temp; + qweight[2] = qweight[2]/temp; + + /* plot one point in likelihood mapping triangle */ + w1 = qweight[0]; + w2 = qweight[1]; + w3 = qweight[2]; + plotlmpoint(fp, w1, w2); + + /* check areas 1,2,3 */ + if (treebits[qworder[0]] == 1) ar1++; + else if (treebits[qworder[0]] == 2) ar2++; + else ar3++; + + /* check out regions 1,2,3,4,5,6,7 */ + + /* 100 distribution */ + temp1 = 1.0 - qweight[qworder[0]]; + sqdiff[0] = temp1*temp1 + + qweight[qworder[1]]*qweight[qworder[1]] + + 
qweight[qworder[2]]*qweight[qworder[2]]; + discreteweight[0] = treebits[qworder[0]]; + + /* 110 distribution */ + temp1 = 0.5 - qweight[qworder[0]]; + temp2 = 0.5 - qweight[qworder[1]]; + sqdiff[1] = temp1*temp1 + temp2*temp2 + + qweight[qworder[2]]*qweight[qworder[2]]; + discreteweight[1] = treebits[qworder[0]] + treebits[qworder[1]]; + + /* 111 distribution */ + temp1 = onethird - qweight[qworder[0]]; + temp2 = onethird - qweight[qworder[1]]; + temp3 = onethird - qweight[qworder[2]]; + sqdiff[2] = temp1 * temp1 + temp2 * temp2 + temp3 * temp3; + discreteweight[2] = (unsigned char) 7; + + /* sort in descending order */ + sort3doubles(sqdiff, sqorder); + + qpbranching = (unsigned char) discreteweight[sqorder[2]]; + + if (qpbranching == 1) { + reg1++; + if (w2 < w3) reg1l++; + else reg1r++; + } + if (qpbranching == 2) { + reg2++; + if (w1 < w3) reg2d++; + else reg2u++; + } + if (qpbranching == 4) { + reg3++; + if (w1 < w2) reg3d++; + else reg3u++; + } + if (qpbranching == 3) { + reg4++; + if (w1 < w2) reg4d++; + else reg4u++; + } + if (qpbranching == 6) { + reg5++; + if (w2 < w3) reg5l++; + else reg5r++; + } + if (qpbranching == 5) { + reg6++; + if (w1 < w3) reg6d++; + else reg6u++; + } + if (qpbranching == 7) reg7++; +} + +/* print tree statistics */ +void printtreestats(FILE *ofp) +{ + int i, j, besttree; + double bestlkl, difflkl, difflklps, temp, sum; + + /* find best tree */ + besttree = 0; + bestlkl = ulkl[0]; + for (i = 1; i < numutrees; i++) + if (ulkl[i] > bestlkl) { + besttree = i; + bestlkl = ulkl[i]; + } + + fprintf(ofp, "\n\nCOMPARISON OF USER TREES (NO CLOCK)\n\n"); + fprintf(ofp, "Tree log L difference S.E. 
Significantly worse\n"); + fprintf(ofp, "--------------------------------------------------------\n"); + for (i = 0; i < numutrees; i++) { + difflkl = ulkl[besttree]-ulkl[i]; + fprintf(ofp, "%2d %10.2f %8.2f ", i+1, ulkl[i], difflkl); + if (i == besttree) { + fprintf(ofp, " <----------------- best tree"); + } else { + /* compute variance of Log L differences over sites */ + difflklps = difflkl/(double)Maxsite; + sum = 0.0; + for (j = 0; j < Numptrn; j++) { + temp = allsites[besttree][j] - allsites[i][j] - difflklps; + sum += temp*temp*Weight[j]; + } + sum = sqrt(fabs(sum/(Maxsite-1.0)*Maxsite)); + fprintf(ofp, "%11.2f ", sum); + if (difflkl > 1.96*sum) + fprintf(ofp, "yes"); + else + fprintf(ofp, "no"); + } + fprintf(ofp, "\n"); + } + fprintf(ofp, "\nThis test (5%% significance) follows Kishino and Hasegawa (1989).\n"); + + if (compclock) { + + /* find best tree */ + besttree = 0; + bestlkl = ulklc[0]; + for (i = 1; i < numutrees; i++) + if (ulklc[i] > bestlkl) { + besttree = i; + bestlkl = ulklc[i]; + } + + fprintf(ofp, "\n\nCOMPARISON OF USER TREES (WITH CLOCK)\n\n"); + fprintf(ofp, "Tree log L difference S.E. 
Significantly worse\n"); + fprintf(ofp, "--------------------------------------------------------\n"); + for (i = 0; i < numutrees; i++) { + difflkl = ulklc[besttree]-ulklc[i]; + fprintf(ofp, "%2d %10.2f %8.2f ", i+1, ulklc[i], difflkl); + if (i == besttree) { + fprintf(ofp, " <----------------- best tree"); + } else { + /* compute variance of Log L differences over sites */ + difflklps = difflkl/(double)Maxsite; + sum = 0.0; + for (j = 0; j < Numptrn; j++) { + temp = allsitesc[besttree][j] - allsitesc[i][j] - difflklps; + sum += temp*temp*Weight[j]; + } + sum = sqrt(fabs(sum/(Maxsite-1.0)*Maxsite)); + fprintf(ofp, "%11.2f ", sum); + if (difflkl > 1.96*sum) + fprintf(ofp, "yes"); + else + fprintf(ofp, "no"); + } + fprintf(ofp, "\n"); + } + fprintf(ofp, "\nThis test (5%% significance) follows Kishino and Hasegawa (1989).\n"); + } +} + +/* time stamp */ +void timestamp(FILE* ofp) +{ + double timespan; + double cpuspan; + timespan = difftime(Stoptime, Starttime); + cpuspan = ((double) (Stopcpu - Startcpu) / CLOCKS_PER_SEC); + fprintf(ofp, "\n\nTIME STAMP\n\n"); + fprintf(ofp, "Date and time: %s", asctime(localtime(&Starttime)) ); + fprintf(ofp, "Runtime (excl. input) : %.0f seconds (= %.1f minutes = %.1f hours)\n", + timespan, timespan/60., timespan/3600.); + fprintf(ofp, "Runtime (incl. input) : %.0f seconds (= %.1f minutes = %.1f hours)\n", + fulltime, fulltime/60., fulltime/3600.); +#ifdef TIMEDEBUG + fprintf(ofp, "CPU time (incl. 
input): %.0f seconds (= %.1f minutes = %.1f hours)\n\n", + fullcpu, fullcpu/60., fullcpu/3600.); +#endif /* TIMEDEBUG */ + +} + +/* extern int bestrfound; */ + +/* write output file */ +void writeoutputfile(FILE *ofp, int part) +{ + /* CZ */ +} + + +/******************************************************************************/ +/* timer routines */ +/******************************************************************************/ + +/* start timer */ +void starttimer() +{ + time(&time0); + time1 = time0; +} + +/* check remaining time and print message if necessary */ +void checktimer(uli numqts) +{ + double tc2, mintogo, minutes, hours; + + time(&time2); + if ( (time2 - time1) > 900) { /* generate message every 15 minutes */ + /* every 900 seconds */ + /* percentage of completed quartets */ + if (mflag == 0) { + mflag = 1; + FPRINTF(STDOUTFILE "\n"); + } + tc2 = 100.*numqts/Numquartets; + mintogo = (100.0-tc2) * + (double) (time2-time0)/60.0/tc2; + hours = floor(mintogo/60.0); + minutes = mintogo - 60.0*hours; + FPRINTF(STDOUTFILE "%.2f%%", tc2); + FPRINTF(STDOUTFILE " completed (remaining"); + FPRINTF(STDOUTFILE " time: %.0f", hours); + FPRINTF(STDOUTFILE " hours %.0f", minutes); + FPRINTF(STDOUTFILE " minutes)\n"); + fflush(STDOUT); + time1 = time2; + } + +} + +/* check remaining time and print message if necessary */ +void checktimer2(uli numqts, uli all, int flag) +{ + double tc2, mintogo, minutes, hours; + + static time_t tt1; + static time_t tt2; + + if (flag == 1) { + time(&tt1); + time(&tt2); + } else { + time(&tt2); + if ( (tt2 - tt1) > 900) { /* generate message every 15 minutes */ + /* every 900 seconds */ + /* percentage of completed quartets */ + if (mflag == 0) { + mflag = 1; + FPRINTF(STDOUTFILE "\n"); + } + tc2 = 100.*numqts/Numquartets; + mintogo = (100.0-tc2) * + (double) (tt2-time0)/60.0/tc2; + hours = floor(mintogo/60.0); + minutes = mintogo - 60.0*hours; + FPRINTF(STDOUTFILE "%.2f%%", tc2); + FPRINTF(STDOUTFILE " completed (remaining"); + 
FPRINTF(STDOUTFILE " time: %.0f", hours); + FPRINTF(STDOUTFILE " hours %.0f", minutes); + FPRINTF(STDOUTFILE " minutes)\n"); + fflush(STDOUT); + tt1 = tt2; + } + } +} + +void resetqblocktime(timearray_t *ta) +{ + ta->quartcpu += ta->quartblockcpu; + ta->quartblockcpu = 0.0; + ta->quarttime += ta->quartblocktime; + ta->quartblocktime = 0.0; +} /* resetqblocktime */ + + +void resetpblocktime(timearray_t *ta) +{ + ta->puzzcpu += ta->puzzblockcpu; + ta->puzzblockcpu = 0.0; + ta->puzztime += ta->puzzblocktime; + ta->puzzblocktime = 0.0; +} /* resetpblocktime */ + + +#ifdef TIMEDEBUG +void printtimearr(timearray_t *ta) +{ +# if ! PARALLEL + int PP_Myid; + PP_Myid = -1; +# endif + printf("(%2d) MMCPU: %11ld / %11ld \n", PP_Myid, ta->maxcpu, ta->mincpu); + printf("(%2d) CTick: %11.6f [tks] / %11.6f [s] \n", PP_Myid, ta->mincputick, ta->mincputicktime); + + printf("(%2d) MMTIM: %11ld / %11ld \n", PP_Myid, ta->maxtime, ta->mintime); + + printf("(%2d) Mxblk: %11.6e / %11.6e \n", PP_Myid, ta->maxcpublock, ta->maxtimeblock); + printf("(%2d) Mnblk: %11.6e / %11.6e \n", PP_Myid, ta->mincpublock, ta->mintimeblock); + + printf("(%2d) Gnrl: %11.6e / %11.6e \n", PP_Myid, ta->generalcpu, ta->generaltime); + printf("(%2d) Optn: %11.6e / %11.6e \n", PP_Myid, ta->optionscpu, ta->optionstime); + printf("(%2d) Estm: %11.6e / %11.6e \n", PP_Myid, ta->paramestcpu, ta->paramesttime); + printf("(%2d) Qurt: %11.6e / %11.6e \n", PP_Myid, ta->quartcpu, ta->quarttime); + printf("(%2d) QBlk: %11.6e / %11.6e \n", PP_Myid, ta->quartblockcpu, ta->quartblocktime); + printf("(%2d) QMax: %11.6e / %11.6e \n", PP_Myid, ta->quartmaxcpu, ta->quartmaxtime); + printf("(%2d) QMin: %11.6e / %11.6e \n", PP_Myid, ta->quartmincpu, ta->quartmintime); + + printf("(%2d) Puzz: %11.6e / %11.6e \n", PP_Myid, ta->puzzcpu, ta->puzztime); + printf("(%2d) PBlk: %11.6e / %11.6e \n", PP_Myid, ta->puzzblockcpu, ta->puzzblocktime); + printf("(%2d) PMax: %11.6e / %11.6e \n", PP_Myid, ta->puzzmaxcpu, ta->puzzmaxtime); + 
printf("(%2d) PMin: %11.6e / %11.6e \n", PP_Myid, ta->puzzmincpu, ta->puzzmintime); + + printf("(%2d) Tree: %11.6e / %11.6e \n", PP_Myid, ta->treecpu, ta->treetime); + printf("(%2d) TBlk: %11.6e / %11.6e \n", PP_Myid, ta->treeblockcpu, ta->treeblocktime); + printf("(%2d) TMax: %11.6e / %11.6e \n", PP_Myid, ta->treemaxcpu, ta->treemaxtime); + printf("(%2d) TMin: %11.6e / %11.6e \n", PP_Myid, ta->treemincpu, ta->treemintime); + + printf("(%2d) C/T : %11.6e / %11.6e \n", PP_Myid, + (ta->generalcpu + ta->optionscpu + ta->paramestcpu + ta->quartblockcpu + ta->puzzblockcpu + ta->treeblockcpu), + (ta->generaltime + ta->optionstime + ta->paramesttime + ta->quartblocktime + ta->puzzblocktime + ta->treeblocktime)); + printf("(%2d) CPU: %11.6e / Time: %11.6e \n", PP_Myid, ta->cpu, ta->time); + printf("(%2d) aCPU: %11.6e / aTime: %11.6e \n", PP_Myid, ta->fullcpu, ta->fulltime); + +} /* printtimearr */ +#endif /* TIMEDEBUG */ + +char *jtype [7]; + +void inittimearr(timearray_t *ta) +{ + clock_t c0, c1, c2; + + jtype[OVERALL] = "OVERALL"; + jtype[GENERAL] = "GENERAL"; + jtype[OPTIONS] = "OPTIONS"; + jtype[PARAMEST] = "PARAMeter ESTimation"; + jtype[QUARTETS] = "QUARTETS"; + jtype[PUZZLING] = "PUZZLING steps"; + jtype[TREEEVAL] = "TREE EVALuation"; + ta->currentjob = GENERAL; + + c1 = clock(); + c2 = clock(); + while (c1 == c2) + c2 = clock(); + ta->mincputick = (double)(c2 - c1); + ta->mincputicktime = ((double)(c2 - c1))/CLOCKS_PER_SEC; + + ta->tempcpu = clock(); + ta->tempcpustart = ta->tempcpu; + ta->tempfullcpu = ta->tempcpu; + time(&(ta->temptime)); + ta->temptimestart = ta->temptime; + ta->tempfulltime = ta->temptime; + + c0=0; c1=0; c2=(clock_t)((2 * c1) + 1);; + while (c1 < c2) { + c0 = c1; + c1 = c2; + c2 = (clock_t)((2 * c1) + 1); + } + if (c1 == c2) ta->maxcpu=c0; + if (c1 > c2) ta->maxcpu=c1; + + c0=0; c1=0; c2=(clock_t)((2 * c1) - 1); + while (c1 > c2) { + c0 = c1; + c1 = c2; + c2 = (clock_t)((2 * c1) - 1); + } + if (c1 == c2) ta->mincpu=c0; + if (c1 < c2) 
ta->mincpu=c1; + + + + ta->maxtime = 0; + ta->mintime = 0; + + ta->maxcpublock = 0; + ta->mincpublock = DBL_MAX; + ta->maxtimeblock = 0; + ta->mintimeblock = DBL_MAX; + + ta->cpu = 0.0; + ta->time = 0.0; + + ta->fullcpu = 0.0; + ta->fulltime = 0.0; + + ta->generalcpu = 0.0; + ta->optionscpu = 0.0; + ta->paramestcpu = 0.0; + ta->quartcpu = 0.0; + ta->quartblockcpu = 0.0; + ta->quartmaxcpu = 0.0; + ta->quartmincpu = ((double) ta->maxcpu)/CLOCKS_PER_SEC; + ta->puzzcpu = 0.0; + ta->puzzblockcpu = 0.0; + ta->puzzmaxcpu = 0.0; + ta->puzzmincpu = ((double) ta->maxcpu)/CLOCKS_PER_SEC; + ta->treecpu = 0.0; + ta->treeblockcpu = 0.0; + ta->treemaxcpu = 0.0; + ta->treemincpu = ((double) ta->maxcpu)/CLOCKS_PER_SEC; + + ta->generaltime = 0.0; + ta->optionstime = 0.0; + ta->paramesttime = 0.0; + ta->quarttime = 0.0; + ta->quartblocktime = 0.0; + ta->quartmaxtime = 0.0; + ta->quartmintime = DBL_MAX; + ta->puzztime = 0.0; + ta->puzzblocktime = 0.0; + ta->puzzmaxtime = 0.0; + ta->puzzmintime = DBL_MAX; + ta->treetime = 0.0; + ta->treeblocktime = 0.0; + ta->treemaxtime = 0.0; + ta->treemintime = DBL_MAX; +} /* inittimearr */ + + +/***************/ + +void addup(int jobtype, clock_t c1, clock_t c2, time_t t1, time_t t2, timearray_t *ta) +{ + double c, + t; + + if (t2 != t1) t = difftime(t2, t1); + else t = 0.0; + + if (c2 < c1) + c = ((double)(c2 - ta->mincpu))/CLOCKS_PER_SEC + + ((double)(ta->maxcpu - c1))/CLOCKS_PER_SEC; + else + c = ((double)(c2 - c1))/CLOCKS_PER_SEC; + + if (jobtype != OVERALL) { + + if (ta->mincpublock > c) ta->mincpublock = c; + if (ta->maxcpublock < c) ta->maxcpublock = c; + if (ta->mintimeblock > t) ta->mintimeblock = t; + if (ta->maxtimeblock < t) ta->maxtimeblock = t; + + switch (jobtype) { + case GENERAL: ta->generalcpu += c; + ta->generaltime += t; + break; + case OPTIONS: ta->optionscpu += c; + ta->optionstime += t; + break; + case PARAMEST: ta->paramestcpu += c; + ta->paramesttime += t; + break; + case QUARTETS: ta->quartblockcpu += c; + 
ta->quartblocktime += t; + if (ta->quartmincpu > c) ta->quartmincpu = c; + if (ta->quartmaxcpu < c) ta->quartmaxcpu = c; + if (ta->quartmintime > t) ta->quartmintime = t; + if (ta->quartmaxtime < t) ta->quartmaxtime = t; + break; + case PUZZLING: ta->puzzblockcpu += c; + ta->puzzblocktime += t; + if (ta->puzzmincpu > c) ta->puzzmincpu = c; + if (ta->puzzmaxcpu < c) ta->puzzmaxcpu = c; + if (ta->puzzmintime > t) ta->puzzmintime = t; + if (ta->puzzmaxtime < t) ta->puzzmaxtime = t; + break; + case TREEEVAL: ta->treeblockcpu += c; + ta->treeblocktime += t; + if (ta->treemincpu > c) ta->treemincpu = c; + if (ta->treemaxcpu < c) ta->treemaxcpu = c; + if (ta->treemintime > t) ta->treemintime = t; + if (ta->treemaxtime < t) ta->treemaxtime = t; + break; + } + ta->cpu += c; + ta->time += t; + + } else { + ta->fullcpu += c; + ta->fulltime += t; + } + +# ifdef TIMEDEBUG + { +# if ! PARALLEL + int PP_Myid = -1; +# endif /* !PARALLEL */ + printf("(%2d) CPU: +%10.6f / Time: +%10.6f (%s)\n", PP_Myid, c, t, jtype[jobtype]); + printf("(%2d) CPU: %11.6f / Time: %11.6f (%s)\n", PP_Myid, ta->cpu, ta->time, jtype[jobtype]); + printf("(%2d) CPU: %11.6f / Time: %11.6f (%s)\n", PP_Myid, ta->fullcpu, ta->fulltime, jtype[jobtype]); + } +# endif /* TIMEDEBUG */ +} /* addup */ + + +/***************/ + + +void addtimes(int jobtype, timearray_t *ta) +{ + clock_t tempc; + time_t tempt; + + time(&tempt); + tempc = clock(); + + if ((tempc < ta->tempfullcpu) || (jobtype == OVERALL)) { /* CPU counter overflow for overall time */ + addup(OVERALL, ta->tempfullcpu, tempc, ta->tempfulltime, tempt, ta); + ta->tempfullcpu = tempc; + ta->tempfulltime = tempt; + if (jobtype == OVERALL) { + addup(ta->currentjob, ta->tempcpustart, tempc, ta->temptimestart, tempt, ta); + ta->tempcpustart = ta->tempcpu; + ta->tempcpu = tempc; + ta->temptimestart = ta->temptime; + ta->temptime = tempt; + } + } + + if((jobtype != ta->currentjob) && (jobtype != OVERALL)) { /* change of job type */ + addup(ta->currentjob, 
ta->tempcpustart, ta->tempcpu, ta->temptimestart, ta->temptime, ta); + ta->tempcpustart = ta->tempcpu; + ta->tempcpu = tempc; + ta->temptimestart = ta->temptime; + ta->temptime = tempt; + ta->currentjob = jobtype; + } + + if (tempc < ta->tempcpustart) { /* CPU counter overflow */ + addup(jobtype, ta->tempcpustart, tempc, ta->temptimestart, tempt, ta); + ta->tempcpustart = ta->tempcpu; + ta->tempcpu = tempc; + ta->temptimestart = ta->temptime; + ta->temptime = tempt; + } + +} /* addtimes */ + + + +/******************************************************************************/ + +/* estimate parameters of substitution process and rate heterogeneity - no tree + n-taxon tree is not needed because of quartet method or NJ tree topology */ +void estimateparametersnotree() +{ + int it, nump, change; + double TSold, YRold, FIold, GEold; + + it = 0; + nump = 0; + + /* count number of parameters */ + if (data_optn == NUCLEOTIDE && optim_optn) nump++; + if (fracinv_optim || grate_optim) nump++; + + do { /* repeat until nothing changes any more */ + it++; + change = FALSE; + + /* optimize substitution parameters */ + if (data_optn == NUCLEOTIDE && optim_optn) { + + TSold = TSparam; + YRold = YRparam; + + + /* + * optimize + */ + + FPRINTF(STDOUTFILE "Optimizing missing substitution process parameters\n"); + fflush(STDOUT); + + if (qcalg_optn) { /* quartet sampling */ + optimseqevolparamsq(); + } else { /* NJ tree */ + tmpfp = tmpfile(); + njtree(tmpfp); + rewind(tmpfp); + readusertree(tmpfp); + closefile(tmpfp); + optimseqevolparamst(); + } + + computedistan(); /* update ML distances */ + + /* same tolerance as 1D minimization */ + if ((fabs(TSparam - TSold) > 3.3*PEPS1) || + (fabs(YRparam - YRold) > 3.3*PEPS1) + ) change = TRUE; + + } + + /* optimize rate heterogeneity variables */ + if (fracinv_optim || grate_optim) { + + FIold = fracinv; + GEold = Geta; + + + /* + * optimize + */ + + FPRINTF(STDOUTFILE "Optimizing missing rate heterogeneity parameters\n"); + fflush(STDOUT); 
+ /* compute NJ tree */ + tmpfp = tmpfile(); + njtree(tmpfp); + /* use NJ tree topology to estimate parameters */ + rewind(tmpfp); + readusertree(tmpfp); + closefile(tmpfp); + + optimrateparams(); + computedistan(); /* update ML distances */ + + + /* same tolerance as 1D minimization */ + if ((fabs(fracinv - FIold) > 3.3*PEPS2) || + (fabs(Geta - GEold) > 3.3*PEPS2) + ) change = TRUE; + + } + + if (nump == 1) return; + + } while (it != MAXITS && change); + + return; +} + + +/* estimate parameters of substitution process and rate heterogeneity - tree + same as above but here the n-taxon tree is already in memory */ +void estimateparameterstree() +{ + int it, nump, change; + double TSold, YRold, FIold, GEold; + + it = 0; + nump = 0; + + /* count number of parameters */ + if (data_optn == NUCLEOTIDE && optim_optn) nump++; + if (fracinv_optim || grate_optim) nump++; + + do { /* repeat until nothing changes any more */ + it++; + change = FALSE; + + /* optimize substitution process parameters */ + if (data_optn == NUCLEOTIDE && optim_optn) { + + TSold = TSparam; + YRold = YRparam; + + + /* + * optimize + */ + + FPRINTF(STDOUTFILE "Optimizing missing substitution process parameters\n"); + fflush(STDOUT); + optimseqevolparamst(); + computedistan(); /* update ML distances */ + + + /* same tolerance as 1D minimization */ + if ((fabs(TSparam - TSold) > 3.3*PEPS1) || + (fabs(YRparam - YRold) > 3.3*PEPS1) + ) change = TRUE; + + } + + /* optimize rate heterogeneity variables */ + if (fracinv_optim || grate_optim) { + + FIold = fracinv; + GEold = Geta; + + + /* + * optimize + */ + + FPRINTF(STDOUTFILE "Optimizing missing rate heterogeneity parameters\n"); + fflush(STDOUT); + optimrateparams(); + computedistan(); /* update ML distances */ + + + /* same tolerance as 1D minimization */ + if ((fabs(fracinv - FIold) > 3.3*PEPS2) || + (fabs(Geta - GEold) > 3.3*PEPS2) + ) change = TRUE; + + } + + if (nump == 1) return; + + } while (it != MAXITS && change); + + return; +} + + 
+/******************************************************************************/ +/* exported from main */ +/******************************************************************************/ + +void compute_quartlklhds(int a, int b, int c, int d, double *d1, double *d2, double *d3, int approx) +{ + if (approx == APPROX) { + + *d1 = quartet_alklhd(a,b, c,d); /* (a,b)-(c,d) */ + *d2 = quartet_alklhd(a,c, b,d); /* (a,c)-(b,d) */ + *d3 = quartet_alklhd(a,d, b,c); /* (a,d)-(b,c) */ + + } else /* approx == EXACT */ { + + *d1 = quartet_lklhd(a,b, c,d); /* (a,b)-(c,d) */ + *d2 = quartet_lklhd(a,c, b,d); /* (a,c)-(b,d) */ + *d3 = quartet_lklhd(a,d, b,c); /* (a,d)-(b,c) */ + + } +} + +/***************************************************************/ + +void recon_tree() +{ + int i; +# if ! PARALLEL + int a, b, c; + uli nq; + double tc2, mintogo, minutes, hours; +# endif + + /* allocate memory for taxon list of bad quartets */ + badtaxon = new_ulivector(Maxspc); + for (i = 0; i < Maxspc; i++) badtaxon[i] = 0; + + /* allocate variable used for randomizing input order */ + trueID = new_ivector(Maxspc); + + /* allocate memory for quartets */ + quartetinfo = mallocquartets(Maxspc); + + /* prepare for consensus tree analysis */ + initconsensus(); + + if (!(readquart_optn) || (readquart_optn && savequart_optn)) { + /* compute quartets */ + FPRINTF(STDOUTFILE "Computing quartet maximum likelihood trees\n"); + fflush(STDOUT); + computeallquartets(); + } + + if (savequart_optn) + writeallquarts(Maxspc, ALLQUART, quartetinfo); + if (readquart_optn) { + int xx1, xx2, xx3, xx4, count; + readallquarts (Maxspc, ALLQUART, quartetinfo); + if (show_optn) { /* list all unresolved quartets */ + openfiletowrite(&unresfp, UNRESOLVED, "unresolved quartet trees"); + fprintf(unresfp, "List of all completely unresolved quartets:\n\n"); + } + + /* initialize bad quartet memory */ + for (count = 0; count < Maxspc; count++) badtaxon[count] = 0; + badqs = 0; + + for (xx4 = 3; xx4 < Maxspc; xx4++) + for 
(xx3 = 2; xx3 < xx4; xx3++) + for (xx2 = 1; xx2 < xx3; xx2++) + for (xx1 = 0; xx1 < xx2; xx1++) { + if (readquartet(xx1, xx2, xx3, xx4) == 7) { + badqs++; + badtaxon[xx1]++; + badtaxon[xx2]++; + badtaxon[xx3]++; + badtaxon[xx4]++; + if (show_optn) { + fputid10(unresfp, xx1); + fprintf(unresfp, " "); + fputid10(unresfp, xx2); + fprintf(unresfp, " "); + fputid10(unresfp, xx3); + fprintf(unresfp, " "); + fputid (unresfp, xx4); + fprintf(unresfp, "\n"); + } + } + } /* end for xx4; for xx3; for xx2; for xx1 */ + if (show_optn) /* list all unresolved quartets */ + fclose(unresfp); + } /* readquart_optn */ + +# if PARALLEL + PP_SendAllQuarts(numquarts(Maxspc), quartetinfo); +# endif /* PARALLEL */ + + FPRINTF(STDOUTFILE "Computing quartet puzzling tree\n"); + fflush(STDOUT); + + /* start timer - percentage of completed trees */ + time(&time0); + time1 = time0; + mflag = 0; + + /* open file for chronological list of puzzling step trees */ + if((listqptrees == PSTOUT_LIST) || (listqptrees == PSTOUT_LISTORDER)) + openfiletowrite(&qptlist, OUTPTLIST, "puzzling step trees (chonological)"); + +# if PARALLEL + { + PP_SendDoPermutBlock(Numtrial); + } +# else + addtimes(GENERAL, &tarr); + for (Currtrial = 0; Currtrial < Numtrial; Currtrial++) { + + /* randomize input order */ + chooser(Maxspc, Maxspc, trueID); + + /* initialize tree */ + inittree(); + + /* adding all other leafs */ + for (i = 3; i < Maxspc; i++) { + + /* clear all edgeinfos */ + resetedgeinfo(); + + /* clear counter of quartets */ + nq = 0; + + /* + * core of quartet puzzling algorithm + */ + + for (a = 0; a < nextleaf - 2; a++) + for (b = a + 1; b < nextleaf - 1; b++) + for (c = b + 1; c < nextleaf; c++) { + + /* check which two _leaves_ out of a, b, c + are closer related to each other than + to leaf i according to a least squares + fit of the continous Baysian weights to the + seven trivial "attractive regions". 
We assign + a score of 1 to all edges between these two leaves + chooseA and chooseB */ + + checkquartet(a, b, c, i); + incrementedgeinfo(chooseA, chooseB); + + nq++; + + /* generate message every 15 minutes */ + + /* check timer */ + time(&time2); + if ( (time2 - time1) > 900) { + /* every 900 seconds */ + /* percentage of completed trees */ + if (mflag == 0) { + FPRINTF(STDOUTFILE "\n"); + mflag = 1; + } + tc2 = 100.0*Currtrial/Numtrial + + 100.0*nq/Numquartets/Numtrial; + mintogo = (100.0-tc2) * + (double) (time2-time0)/60.0/tc2; + hours = floor(mintogo/60.0); + minutes = mintogo - 60.0*hours; + FPRINTF(STDOUTFILE "%2.2f%%", tc2); + FPRINTF(STDOUTFILE " completed (remaining"); + FPRINTF(STDOUTFILE " time: %.0f", hours); + FPRINTF(STDOUTFILE " hours %.0f", minutes); + FPRINTF(STDOUTFILE " minutes)\n"); + fflush(STDOUT); + time1 = time2; + } + } + + /* find out which edge has the lowest edgeinfo */ + minimumedgeinfo(); + + /* add the next leaf on minedge */ + addnextleaf(minedge); + } + + /* compute bipartitions of current tree */ + computebiparts(); + makenewsplitentries(); + + { + int *ctree, startnode; + char *trstr; + treelistitemtype *treeitem; + ctree = initctree(); + copytree(ctree); + startnode = sortctree(ctree); + trstr=sprintfctree(ctree, psteptreestrlen); + + + treeitem = addtree2list(&trstr, 1, &psteptreelist, &psteptreenum, &psteptreesum); + + if((listqptrees == PSTOUT_LIST) + || (listqptrees == PSTOUT_LISTORDER)) { + /* print: order no/# topol per this id/tree id/sum of unique topologies/sum of trees so far */ + fprintf(qptlist, "%ld.\t1\t%d\t%d\t%d\t%d\n", + Currtrial + 1, (*treeitem).count, (*treeitem).id, psteptreenum, psteptreesum); + } + +# ifdef VERBOSE1 + printf("%s\n", trstr); + printfsortedpstrees(psteptreelist); +# endif + freectree(&ctree); + } + + + + /* free tree before building the next tree */ + freetree(); + + addtimes(PUZZLING, &tarr); + } +# endif /* PARALLEL */ + + /* close file for list of puzzling step trees */ + if((listqptrees 
== PSTOUT_LIST) || (listqptrees == PSTOUT_LISTORDER)) + closefile(qptlist); + + if (mflag == 1) FPRINTF(STDOUTFILE "\n"); + + /* garbage collection */ + free(splitcomp); + free_ivector(trueID); + +# if ! PARALLEL + free_cmatrix(biparts); +# endif /* PARALLEL */ + + freequartets(); + + /* compute majority rule consensus tree */ + makeconsensus(); + + /* write consensus tree to tmp file */ + tmpfp = tmpfile(); + writeconsensustree(tmpfp); +} /* recon_tree */ + +/***************************************************************/ + +void map_lklhd() +{ + int i, a, a1, a2, b, b1, b2, c, c1, c2, d; + uli nq; + double logs[3], d1, d2, d3, temp; + ivector qts, mlorder, gettwo; + /* reset variables */ + ar1 = ar2 = ar3 = 0; + reg1 = reg2 = reg3 = reg4 = reg5 = reg6 = reg7 = 0; + reg1l = reg1r = reg2u = reg2d = reg3u = reg3d = reg4u = + reg4d = reg5l = reg5r = reg6u = reg6d = 0; + + /* place for random quartet */ + qts = new_ivector(4); + + /* initialize output file */ + openfiletowrite(&trifp, TRIANGLE, "Postscript output"); + initps(trifp); + FPRINTF(STDOUTFILE "Performing likelihood mapping analysis\n"); + fflush(STDOUT); + + /* start timer */ + starttimer(); + nq = 0; + mflag = 0; + + addtimes(GENERAL, &tarr); + if (lmqts == 0) { /* all possible quartets */ + + if (numclust == 4) { /* four-cluster analysis */ + + for (a = 0; a < clustA; a++) + for (b = 0; b < clustB; b++) + for (c = 0; c < clustC; c++) + for (d = 0; d < clustD; d++) { + + nq++; + + /* check timer */ + checktimer(nq); + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(clusterA[a],clusterB[b],clusterC[c],clusterD[d],&d1,&d2,&d3, APPROX); + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, d2, d3); + addtimes(QUARTETS, &tarr); + + } + } + + if (numclust == 3) { /* three-cluster analysis */ + + gettwo = new_ivector(2); + + for (a = 0; a < clustA; a++) + for (b = 0; b < clustB; b++) + for (c1 = 0; c1 < clustC-1; c1++) + for (c2 = c1+1; c2 < clustC; c2++) { + 
+ nq++; + + /* check timer */ + checktimer(nq); + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(clusterA[a],clusterB[b],clusterC[c1],clusterC[c2],&d1,&d2,&d3, APPROX); + + /* randomize order of d2 and d3 */ + if (randominteger(2) == 1) { + temp = d3; + d3 = d2; + d2 = temp; + } + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, d2, d3); + addtimes(QUARTETS, &tarr); + + } + free_ivector(gettwo); + } + + if (numclust == 2) { /* two-cluster analysis */ + + gettwo = new_ivector(2); + + for (a1 = 0; a1 < clustA-1; a1++) + for (a2 = a1+1; a2 < clustA; a2++) + for (b1 = 0; b1 < clustB-1; b1++) + for (b2 = b1+1; b2 < clustB; b2++) { + + nq++; + + /* check timer */ + checktimer(nq); + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(clusterA[a1],clusterA[a2],clusterB[b1],clusterB[b2],&d1,&d2,&d3, APPROX); + + /* randomize order of d2 and d3 */ + if (randominteger(2) == 1) { + temp = d3; + d3 = d2; + d2 = temp; + } + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, d2, d3); + addtimes(QUARTETS, &tarr); + + } + + free_ivector(gettwo); + } + + if (numclust == 1) { /* normal likelihood mapping (one cluster) */ + + mlorder = new_ivector(3); + +#if 0 + for (i = 3; i < Maxspc; i++) + for (a = 0; a < i - 2; a++) + for (b = a + 1; b < i - 1; b++) + for (c = b + 1; c < i; c++) + for (d = 3; d < Maxspc; d++) + for (c = 2; c < d; c++) + for (b = 1; b < c; b++) + for (a = 0; a < b; a++) +#endif + + for (i = 3; i < Maxspc; i++) + for (c = 2; c < i; c++) + for (b = 1; b < c; b++) + for (a = 0; a < b; a++) { + + nq++; + + /* check timer */ + checktimer(nq); + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(a,b,c,i,&logs[0],&logs[1],&logs[2], APPROX); + + /* randomize order */ + chooser(3,3,mlorder); + d1 = logs[mlorder[0]]; + d2 = logs[mlorder[1]]; + d3 = logs[mlorder[2]]; + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, 
d2, d3); + addtimes(QUARTETS, &tarr); + + } + free_ivector(mlorder); + } + + } else { /* randomly selected quartets */ + + if (numclust == 4) { /* four-cluster analysis */ + + for (lmqts = 0; lmqts < Numquartets; lmqts++) { + + nq++; + + /* check timer */ + checktimer(nq); + + /* choose random quartet */ + qts[0] = clusterA[ randominteger(clustA) ]; + qts[1] = clusterB[ randominteger(clustB) ]; + qts[2] = clusterC[ randominteger(clustC) ]; + qts[3] = clusterD[ randominteger(clustD) ]; + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(qts[0],qts[1],qts[2],qts[3],&d1,&d2,&d3, APPROX); + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, d2, d3); + addtimes(QUARTETS, &tarr); + + } + } + + if (numclust == 3) { /* three-cluster analysis */ + + gettwo = new_ivector(2); + + for (lmqts = 0; lmqts < Numquartets; lmqts++) { + + nq++; + + /* check timer */ + checktimer(nq); + + /* choose random quartet */ + qts[0] = clusterA[ randominteger(clustA) ]; + qts[1] = clusterB[ randominteger(clustB) ]; + chooser(clustC, 2, gettwo); + qts[2] = clusterC[gettwo[0]]; + qts[3] = clusterC[gettwo[1]]; + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(qts[0],qts[1],qts[2],qts[3],&d1,&d2,&d3, APPROX); + + /* order of d2 and d3 is already randomized! 
*/ + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, d2, d3); + addtimes(QUARTETS, &tarr); + + } + + free_ivector(gettwo); + } + + if (numclust == 2) { /* two-cluster analysis */ + + gettwo = new_ivector(2); + + for (lmqts = 0; lmqts < Numquartets; lmqts++) { + + nq++; + + /* check timer */ + checktimer(nq); + + /* choose random quartet */ + chooser(clustA, 2, gettwo); + qts[0] = clusterA[gettwo[0]]; + qts[1] = clusterA[gettwo[1]]; + chooser(clustB, 2, gettwo); + qts[2] = clusterB[gettwo[0]]; + qts[3] = clusterB[gettwo[1]]; + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(qts[0],qts[1],qts[2],qts[3],&d1,&d2,&d3, APPROX); + + /* order of d2 and d3 is already randomized! */ + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, d2, d3); + addtimes(QUARTETS, &tarr); + + } + free_ivector(gettwo); + } + + if (numclust == 1) { /* normal likelihood mapping (one cluster) */ + + for (lmqts = 0; lmqts < Numquartets; lmqts++) { + + nq++; + + /* check timer */ + checktimer(nq); + + /* choose random quartet */ + chooser(Maxspc, 4, qts); + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(qts[0],qts[1],qts[2],qts[3],&d1,&d2,&d3, APPROX); + + /* order of d1, d2, and d3 is already randomized! 
*/ + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, d2, d3); + addtimes(QUARTETS, &tarr); + + } + } + } + + finishps(trifp); + closefile(trifp); + free_ivector(qts); + +} /* map_lklhd */ + +/***************************************************************/ + +void setdefaults() { + + strcpy(INFILE, INFILEDEFAULT); + strcpy(OUTFILE, OUTFILEDEFAULT); + strcpy(TREEFILE, TREEFILEDEFAULT); + strcpy(INTREE, INTREEDEFAULT); + strcpy(DISTANCES, DISTANCESDEFAULT); + strcpy(TRIANGLE, TRIANGLEDEFAULT); + strcpy(UNRESOLVED, UNRESOLVEDDEFAULT); + strcpy(ALLQUART, ALLQUARTDEFAULT); + strcpy(ALLQUARTLH, ALLQUARTLHDEFAULT); + strcpy(OUTPTLIST, OUTPTLISTDEFAULT); + strcpy(OUTPTORDER, OUTPTORDERDEFAULT); + + usebestq_optn = FALSE; + savequartlh_optn = FALSE; + savequart_optn = FALSE; + readquart_optn = FALSE; + + randseed = -1; /* to set random random seed */ + +} /* setdefaults */ + +/***************************************************************/ + +void printversion() +{ +# if ! PARALLEL + fprintf(stderr, "puzzle (%s) %s\n", PACKAGE, VERSION); +#else + fprintf(stderr, "ppuzzle (%s) %s\n", PACKAGE, VERSION); +# endif + exit (0); +} +/***************************************************************/ + +void printusage(char *fname) +{ + fprintf(stderr, "\n\nUsage: %s [-h] [ Infilename [ UserTreeFilename ] ]\n\n", fname); +# if PARALLEL + PP_SendDone(); + MPI_Finalize(); +# endif + exit (1); +} + +/***************************************************************/ + +#ifdef HHH +void printusagehhh(char *fname) +{ + fprintf(stderr, "\n\nUsage: %s [options] [ Infilename [ UserTreeFilename ] ]\n\n", fname); + fprintf(stderr, " -h - print usage\n"); + fprintf(stderr, " -wqf - write quartet file to Infilename.allquart\n"); + fprintf(stderr, " -rqf - read quartet file from Infilename.allquart\n"); + fprintf(stderr, " -wqlb - write quart lhs to Infilename.allquartlh (binary)\n"); + fprintf(stderr, " -wqla - write quart lhs to Infilename.allquartlh (ASCII)\n"); + fprintf(stderr, " 
-bestq - use best quart, no basian weights\n"); + fprintf(stderr, " -randseed<#> - use <#> as random number seed, for debug purposes only\n"); +# if PARALLEL + PP_SendDone(); + MPI_Finalize(); +# endif + exit (2); +} +#endif /* HHH */ + +/***************************************************************/ + + +void scancmdline(int *argc, char **argv[]) +{ + static short infileset = 0; + static short intreefileset = 0; + short flagused; + int n; + int count, dummyint; + + for (n = 1; n < *argc; n++) { +# ifdef VERBOSE1 + printf("argv[%d] = %s\n", n, (*argv)[n]); +# endif + + flagused = FALSE; + +# ifdef HHH + dummyint = 0; + count = sscanf((*argv)[n], "-wqlb%n", &dummyint); + if (dummyint == 5) { + savequartlh_optn = TRUE; + saveqlhbin_optn = TRUE; + flagused = TRUE; + } + + dummyint = 0; + count = sscanf((*argv)[n], "-wqla%n", &dummyint); + if (dummyint == 5) { + savequartlh_optn = TRUE; + saveqlhbin_optn = FALSE; + flagused = TRUE; + } + + dummyint = 0; + count = sscanf((*argv)[n], "-wqf%n", &dummyint); + if (dummyint == 4) { + savequart_optn = TRUE; + flagused = TRUE; + } + + dummyint = 0; + count = sscanf((*argv)[n],"-rqf%n", &dummyint); + if (dummyint == 4) { + readquart_optn = TRUE; + flagused = TRUE; + } + + dummyint = 0; + count = sscanf((*argv)[n],"-bestq%n", &dummyint); + if (dummyint == 6) { + usebestq_optn = TRUE; + flagused = TRUE; + } + + dummyint = 0; + count = sscanf((*argv)[n],"-hhh%n", &dummyint); + if (dummyint==4) { + printusagehhh((*argv)[0]); + flagused = TRUE; + } +# endif /* HHH */ + + dummyint = 0; + count = sscanf((*argv)[n],"-V%n", &dummyint); + if (dummyint==2) { + printversion((*argv)[0]); + flagused = TRUE; + } + + dummyint = 0; + count = sscanf((*argv)[n],"-version%n", &dummyint); + if (dummyint==8) { + printversion((*argv)[0]); + flagused = TRUE; + } + + dummyint = 0; + count = sscanf((*argv)[n],"--version%n", &dummyint); + if (dummyint>=4) { + printversion((*argv)[0]); + flagused = TRUE; + } + + dummyint = 0; + count = 
sscanf((*argv)[n],"-h%n", &dummyint); + if (dummyint==2) { + printusage((*argv)[0]); + flagused = TRUE; + } + + count = sscanf((*argv)[n],"-randseed%d", &dummyint); + if (count == 1) { + randseed = dummyint; + flagused = TRUE; + } + +#if 0 + count = sscanf((*argv)[n],"-h%n", &dummyint); + if ((count == 1) && (dummyint>=2)) printusage((*argv)[0]); + + count = sscanf((*argv)[n],"-writequarts%n", &dummyint); + if (count == 1) writequartstofile = 1;; + + count = sscanf((*argv)[n],"-ws%d", &dummyint); + if (count == 1) windowsize = dummyint; +#endif + + if ((*argv)[n][0] != '-') { + if (infileset == 0) { + strcpy(INFILE, (*argv)[n]); + infileset++; + sprintf(OUTFILE ,"%s.%s", INFILE, OUTFILEEXT); + sprintf(TREEFILE ,"%s.%s", INFILE, TREEFILEEXT); + sprintf(DISTANCES ,"%s.%s", INFILE, DISTANCESEXT); + sprintf(TRIANGLE ,"%s.%s", INFILE, TRIANGLEEXT); + sprintf(UNRESOLVED ,"%s.%s", INFILE, UNRESOLVEDEXT); + sprintf(ALLQUART ,"%s.%s", INFILE, ALLQUARTEXT); + sprintf(ALLQUARTLH ,"%s.%s", INFILE, ALLQUARTLHEXT); + sprintf(OUTPTLIST ,"%s.%s", INFILE, OUTPTLISTEXT); + sprintf(OUTPTORDER ,"%s.%s", INFILE, OUTPTORDEREXT); + FPRINTF(STDOUTFILE "Input file: %s\n", INFILE); + flagused = TRUE; + } else { + if (intreefileset == 0) { + strcpy(INTREE, (*argv)[n]); + intreefileset++; + sprintf(OUTFILE ,"%s.%s", INTREE, OUTFILEEXT); + sprintf(TREEFILE ,"%s.%s", INTREE, TREEFILEEXT); + sprintf(DISTANCES ,"%s.%s", INTREE, DISTANCESEXT); + FPRINTF(STDOUTFILE "Usertree file: %s\n", INTREE); + flagused = TRUE; + } + } + } + if (flagused == FALSE) { + fprintf(stderr, "WARNING: commandline parameter %d not recognized (\"%s\")\n", n, (*argv)[n]); + } + flagused = FALSE; + } + +} /* scancmdline */ + + +/***************************************************************/ + +void inputandinit(int *argc, char **argv[]) { + + int ci; + + /* vectors used in QP and LM analysis */ + qweight = new_dvector(3); + sqdiff = new_dvector(3); + qworder = new_ivector(3); + sqorder = new_ivector(3); + + /* 
Initialization and parsing of Commandline */ + setdefaults(); + scancmdline(argc, argv); + + /* initialize random numbers generator */ + if (randseed >= 0) + fprintf(stderr, "WARNING: random seed set to %d for debugging!\n", randseed); + randseed = initrandom(randseed); + + psteptreelist = NULL; + psteptreesum = 0; + bestratefound = 0; + +# ifndef ALPHA + FPRINTF(STDOUTFILE "\n\n\nWELCOME TO TREE-PUZZLE %s!\n\n\n", VERSION); +# else + FPRINTF(STDOUTFILE "\n\n\nWELCOME TO TREE-PUZZLE %s%s!\n\n\n", VERSION, ALPHA); +# endif + + + /* get sequences */ + openfiletoread(&seqfp, INFILE, "sequence data"); + getsizesites(seqfp); + FPRINTF(STDOUTFILE "\nInput data set contains %d sequences of length %d\n", Maxspc, Maxseqc); + getdataset(seqfp); + closefile(seqfp); + data_optn = guessdatatype(); + + /* translate characters into format used by ML engine */ + nuc_optn = TRUE; + SH_optn = FALSE; + Seqchar = NULL; + translatedataset(); + + /* estimate base frequencies from data set */ + Freqtpm = NULL; + Basecomp = NULL; + estimatebasefreqs(); + + /* guess model of substitution */ + guessmodel(); + + /* initialize guess variables */ + auto_datatype = AUTO_GUESS; + if (data_optn == AMINOACID) auto_aamodel = AUTO_GUESS; + else auto_aamodel = AUTO_DEFAULT; + /* save guessed amino acid options */ + guessDayhf_optn = Dayhf_optn; + guessJtt_optn = Jtt_optn; + guessmtrev_optn = mtrev_optn; + guesscprev_optn = cprev_optn; + guessblosum62_optn = blosum62_optn; + guessvtmv_optn = vtmv_optn; + guesswag_optn = wag_optn; + guessauto_aamodel = auto_aamodel; + + + /* check for user specified tree */ + if ((utfp = fopen(INTREE, "r")) != NULL) { + fclose(utfp); + puzzlemode = USERTREE; + } else { + puzzlemode = QUARTPUZ; + } + + /* reserve memory for cluster LM analysis */ + clusterA = new_ivector(Maxspc); + clusterB = new_ivector(Maxspc); + clusterC = new_ivector(Maxspc); + clusterD = new_ivector(Maxspc); + + /* set options interactively */ + setoptions(); + + /* open usertree file right after 
start */ + if (typ_optn == TREERECON_OPTN && puzzlemode == USERTREE) { + openfiletoread(&utfp, INTREE, "user trees"); + } + + /* start main timer */ + time(&Starttime); + Startcpu=clock(); + addtimes(OPTIONS, &tarr); + + /* symmetrize doublet frequencies if specified */ + symdoublets(); + + /* initialise ML */ + mlstart(); + + /* determine how many usertrees */ + if (typ_optn == TREERECON_OPTN && puzzlemode == USERTREE) { + numutrees = 0; + do { + ci = fgetc(utfp); + if ((char) ci == ';') numutrees++; + } while (ci != EOF); + rewind(utfp); + if (numutrees < 1) { + FPRINTF(STDOUTFILE "Unable to proceed (no tree in input tree file)\n\n\n"); + exit(1); + } + } + + /* check fraction of invariable sites */ + if ((rhetmode == TWORATE || rhetmode == MIXEDRATE) && !fracinv_optim) + /* fraction of invariable site was specified manually */ + if (fracinv > MAXFI) + fracinv = MAXFI; + + addtimes(GENERAL, &tarr); + /* estimate parameters */ + if (!(typ_optn == TREERECON_OPTN && puzzlemode == USERTREE)) { + /* no tree present */ + estimateparametersnotree(); + } else { + if (utree_optn) { + /* use 1st user tree */ + readusertree(utfp); + rewind(utfp); + estimateparameterstree(); + } else { + /* don't use first user tree */ + estimateparametersnotree(); + } + } + addtimes(PARAMEST, &tarr); + + /* compute expected Ts/Tv ratio */ + if (data_optn == NUCLEOTIDE) computeexpectations(); + +} /* inputandinit */ + + + +/***************************************************************/ + +void evaluatetree(FILE *intreefp, FILE *outtreefp, int pmode, int utreenum, int maxutree, int *oldlocroot) +{ + + switch (pmode) { + case QUARTPUZ: /* read QP tree */ + readusertree(intreefp); + FPRINTF(STDOUTFILE "Computing maximum likelihood branch lengths (without clock)\n"); + fflush(STDOUT); + usertree_lklhd(); + findbestratecombination(); + break; + case USERTREE: /* read user tree */ + readusertree(intreefp); + FPRINTF(STDOUTFILE "Computing maximum likelihood branch lengths (without clock) for tree 
# %d\n", utreenum+1); + fflush(STDOUT); + usertree_lklhd(); + if (maxutree > 1) { + ulkl[utreenum] = Ctree->lklhd; + allsitelkl(Ctree->condlkl, allsites[utreenum]); + } + if (utreenum==0) findbestratecombination(); + break; + } + + + if (compclock) { /* clocklike branch length */ + switch (pmode) { + case QUARTPUZ: + FPRINTF(STDOUTFILE "Computing maximum likelihood branch lengths (with clock)\n"); + fflush(STDOUT); + break; + case USERTREE: + FPRINTF(STDOUTFILE "Computing maximum likelihood branch lengths (with clock) for tree # %d\n", utreenum+1); + fflush(STDOUT); + break; + } + + /* find best place for root */ + rootsearch = 0; + + if (utreenum==0) locroot = *oldlocroot; + else *oldlocroot = locroot; + + if (locroot < 0) { + locroot = findrootedge(); + rootsearch = 1; + } + /* if user-specified edge for root does not exist use displayed outgroup */ + if (!checkedge(locroot)) { + locroot = outgroup; + rootsearch = 2; + } + /* compute likelihood */ + clock_lklhd(locroot); + if (maxutree > 1) { + ulklc[utreenum] = Ctree->lklhdc; + allsitelkl(Ctree->condlkl, allsitesc[utreenum]); + } + + } + + if (clockmode == 0) + fprintf(outtreefp, "[ lh=%.6f ]", Ctree->lklhd); + else + fprintf(outtreefp, "[ lh=%.6f ]", Ctree->lklhdc); + + /* write ML branch length tree to outree file */ + clockmode = 0; /* nonclocklike branch lengths */ + fputphylogeny(outtreefp); + + /* clocklike branch lengths */ + if (compclock) { + clockmode = 1; + fputrooted(outtreefp, locroot); + } +} /* evaluatetree */ + +/***************************************************************/ + +void memcleanup() { + if (puzzlemode == QUARTPUZ && typ_optn == TREERECON_OPTN) { + free(splitfreqs); + free(splitpatterns); + free(splitsizes); + free_ivector(consconfid); + free_ivector(conssizes); + free_cmatrix(consbiparts); + free_ulivector(badtaxon); + } + free_cmatrix(Identif); + free_dvector(Freqtpm); + free_imatrix(Basecomp); + free_ivector(clusterA); + free_ivector(clusterB); + free_ivector(clusterC); + 
free_ivector(clusterD); + free_dvector(qweight); + free_dvector(sqdiff); + free_ivector(qworder); + free_ivector(sqorder); + freetreelist(&psteptreelist, &psteptreenum, &psteptreesum); +} /* memcleanup */ + +/***************************************************************/ + + +/******************************************************************************/ +/* main part */ +/******************************************************************************/ + +int main(int argc, char *argv[]) +{ + int i, oldlocroot=0; + + /* start main timer */ + time(&walltimestart); + cputimestart = clock(); + inittimearr(&tarr); + + + + inputandinit(&argc, &argv); + + + + /* write distance matrix */ + FPRINTF(STDOUTFILE "Writing pairwise distances to file %s\n", DISTANCES); + openfiletowrite(&dfp, DISTANCES, "pairwise distances"); + putdistance(dfp); + closefile(dfp); + + + + free_cmatrix(Seqchar); + free_cmatrix(seqchars); + + + + + /* write CPU/Wallclock times and parallel statistics */ + time(&walltimestop); + cputimestop = clock(); + addtimes(OVERALL, &tarr); + + fullcpu = tarr.fullcpu; + fulltime = tarr.fulltime; + + + + /* stop timer */ + + time(&Stoptime); + Stopcpu=clock(); + /* + timestamp(ofp); + closefile(ofp); + CZ 05/16/01*/ + + + /* printbestratecombination(stderr); */ + mlfinish(); + + FPRINTF(STDOUTFILE "\nAll results written to disk:\n"); + /*FPRINTF(STDOUTFILE " Puzzle report file: %s\n", OUTFILE);*/ + FPRINTF(STDOUTFILE " Likelihood distances: %s\n", DISTANCES); + + if (typ_optn == TREERECON_OPTN && puzzlemode != PAIRDIST) + FPRINTF(STDOUTFILE " Phylip tree file: %s\n", TREEFILE); + if (typ_optn == TREERECON_OPTN && puzzlemode == QUARTPUZ) { + if ((listqptrees == PSTOUT_ORDER) ||(listqptrees == PSTOUT_LISTORDER)) + FPRINTF(STDOUTFILE " Unique puzzling step trees: %s\n", OUTPTORDER); + if ((listqptrees == PSTOUT_LIST) ||(listqptrees == PSTOUT_LISTORDER)) + FPRINTF(STDOUTFILE " Puzzling step tree list: %s\n", OUTPTLIST); + } + if (show_optn && typ_optn == 
TREERECON_OPTN && puzzlemode == QUARTPUZ) + FPRINTF(STDOUTFILE " Unresolved quartets: %s\n", UNRESOLVED); + if (typ_optn == LIKMAPING_OPTN) + FPRINTF(STDOUTFILE " Likelihood mapping diagram: %s\n", TRIANGLE); + FPRINTF(STDOUTFILE "\n"); + + /* runtime message */ + FPRINTF(STDOUTFILE + "The computation took %.0f seconds (= %.1f minutes = %.1f hours)\n", + difftime(Stoptime, Starttime), difftime(Stoptime, Starttime)/60., + difftime(Stoptime, Starttime)/3600.); + FPRINTF(STDOUTFILE + " including input %.0f seconds (= %.1f minutes = %.1f hours)\n", + fulltime, fulltime/60., fulltime/3600.); + + + /* free memory */ + memcleanup(); + + + + return 0; +} + + +/* compare function for uli - sort largest numbers first */ +int ulicmp(const void *ap, const void *bp) +{ + uli a, b; + + a = *((uli *) ap); + b = *((uli *) bp); + + if (a > b) return -1; + else if (a < b) return 1; + else return 0; +} + +/* compare function for int - sort smallest numbers first */ +int intcmp(const void *ap, const void *bp) +{ + int a, b; + + a = *((int *) ap); + b = *((int *) bp); + + if (a < b) return -1; + else if (a > b) return 1; + else return 0; +} diff --git a/forester/archive/RIO/others/puzzle_dqo/src/puzzle2.c b/forester/archive/RIO/others/puzzle_dqo/src/puzzle2.c new file mode 100644 index 0000000..ea53889 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/puzzle2.c @@ -0,0 +1,2651 @@ +/* + * puzzle2.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. 
+ */ + + +#define EXTERN extern + +#include "puzzle.h" +#include + +#if PARALLEL +# include "sched.h" +#endif /* PARALLEL */ + + +/******************************************************************************/ +/* sequences */ +/******************************************************************************/ + +/* read ten characters of current line as identifier */ +void readid(FILE *infp, int t) +{ + int i, j, flag, ci; + + for (i = 0; i < 26; i++) { /*CZ*/ + ci = fgetc(infp); + if (ci == EOF || !isprint(ci)) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (no name for sequence %d)\n\n\n", t+1); + exit(1); + } + Identif[t][i] = (char) ci; + } + /* convert leading blanks in taxon name to underscores */ + flag = FALSE; + for (i = 25; i > -1; i--) { /*CZ*/ + if (flag == FALSE) { + if (Identif[t][i] != ' ') flag = TRUE; + } else { + if (Identif[t][i] == ' ') Identif[t][i] = '_'; + } + } + /* check whether this name is already used */ + for (i = 0; i < t; i++) { /* compare with all other taxa */ + flag = TRUE; /* assume identity */ + for (j = 0; (j < 26) && (flag == TRUE); j++) /*CZ*/ + if (Identif[t][j] != Identif[i][j]) + flag = FALSE; + if (flag) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (multiple occurence of sequence name '"); + fputid(STDOUT, t); + FPRINTF(STDOUTFILE "')\n\n\n"); + exit(1); + } + } +} + +/* read next allowed character */ +char readnextcharacter(FILE *ifp, int notu, int nsite) +{ + char c; + + /* ignore blanks and control characters except newline */ + do { + if (fscanf(ifp, "%c", &c) != 1) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (missing character at position %d in sequence '", nsite + 1); + fputid(STDOUT, notu); + FPRINTF(STDOUTFILE "')\n\n\n"); + exit(1); + } + } while (c == ' ' || (iscntrl((int) c) && c != '\n')); + return c; +} + +/* skip rest of the line */ +void skiprestofline(FILE* ifp, int notu, int nsite) +{ + int ci; + + /* read chars until the first newline */ + do{ + ci = fgetc(ifp); + if (ci == EOF) { + 
FPRINTF(STDOUTFILE "Unable to proceed (missing newline at position %d in sequence '", nsite + 1); + fputid(STDOUT, notu); + FPRINTF(STDOUTFILE "')\n\n\n"); + exit(1); + } + } while ((char) ci != '\n'); +} + +/* skip control characters and blanks */ +void skipcntrl(FILE *ifp, int notu, int nsite) +{ + int ci; + + /* read over all control characters and blanks */ + do { + ci = fgetc(ifp); + if (ci == EOF) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (missing character at position %d in sequence '", nsite + 1); + fputid(STDOUT, notu); + FPRINTF(STDOUTFILE "')\n\n\n"); + exit(1); + } + } while (iscntrl(ci) || (char) ci == ' '); + /* go one character back */ + if (ungetc(ci, ifp) == EOF) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (positioning error at position %d in sequence '", nsite + 1); + fputid(STDOUT, notu); + FPRINTF(STDOUTFILE "')\n\n\n"); + exit(1); + } +} + +/* read sequences of one data set */ +void getseqs(FILE *ifp) +{ + int notu, nsite, endofline, linelength, i; + char c; + + seqchars = new_cmatrix(Maxspc, Maxseqc); + /* read all characters */ + nsite = 0; /* next site to be read */ + while (nsite < Maxseqc) { + /* read first taxon */ + notu = 0; + /* go to next true line */ + skiprestofline(ifp, notu, nsite); + skipcntrl(ifp, notu, nsite); + if (nsite == 0) readid(ifp, notu); + endofline = FALSE; + linelength = 0; + do { + c = readnextcharacter(ifp, notu, nsite + linelength); + if (c == '\n') endofline = TRUE; + else if (c == '.') { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (invalid character '.' 
at position "); + FPRINTF(STDOUTFILE "%d in first sequence)\n\n\n", nsite + linelength + 1); + exit(1); + } else if (nsite + linelength < Maxseqc) { + /* change to upper case */ + seqchars[notu][nsite + linelength] = (char) toupper((int) c); + linelength++; + } else { + endofline = TRUE; + skiprestofline(ifp, notu, nsite + linelength); + } + } while (!endofline); + if (linelength == 0) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (line with length 0 at position %d in sequence '", nsite + 1); + fputid(STDOUT, notu); + FPRINTF(STDOUTFILE "')\n\n\n"); + exit(1); + } + /* read other taxa */ + for (notu = 1; notu < Maxspc; notu++) { + /* go to next true line */ + if (notu != 1) skiprestofline(ifp, notu, nsite); + skipcntrl(ifp, notu, nsite); + if (nsite == 0) readid(ifp, notu); + for (i = nsite; i < nsite + linelength; i++) { + c = readnextcharacter(ifp, notu, i); + if (c == '\n') { /* too short */ + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (line to short at position %d in sequence '", i + 1); + fputid(STDOUT, notu); + FPRINTF(STDOUTFILE "')\n\n\n"); + exit(1); + } else if (c == '.') { + seqchars[notu][i] = seqchars[0][i]; + } else { + /* change to upper case */ + seqchars[notu][i] = (char) toupper((int) c); + } + } + } + nsite = nsite + linelength; + } +} +

/*
 * initid - allocate the identifier table Identif for `t' taxa, 26
 * characters each, and fill every position with a blank.
 */
void initid(int t)
{
	int i, j;

	Identif = new_cmatrix(t, 26); /*CZ*/
	for (i = 0; i < t; i++)
		for (j = 0; j < 26; j++) /*CZ*/
			Identif[i][j] = ' ';
}

/*
 * fputid10 - write all 26 characters of the identifier of taxon `t' to
 * `ofp', blanks included.  (The name dates from the 10-character
 * identifiers; the length was raised to 26, see the CZ marks.)
 */
void fputid10(FILE *ofp, int t)
{
	int i;

	for (i = 0; i < 26; i++) fputc(Identif[t][i], ofp); /*CZ*/
}

/*
 * fputid - write the identifier of taxon `t' to `ofp' up to, but not
 * including, its first blank; at most 26 characters.
 *
 * Returns the number of characters written.
 */
int fputid(FILE *ofp, int t)
{
	int i;

	i = 0;
	/* test the index first: a name without any blank must not make the
	   loop read Identif[t][26], which lies past the 26-character row */
	while (i < 26 && Identif[t][i] != ' ') { /*CZ*/
		fputc(Identif[t][i], ofp);
		i++;
	}
	return i;
}

/*
 * getsizesites - read the first line of the sequence data set: the number
 * of sequences (Maxspc) followed by the number of sites (Maxseqc).
 *
 * Exits with status 1 when either number is missing, when Maxspc is below
 * 4 or above 8000, or when Maxseqc is below 1.  On success Maxbrnch is set
 * to 2*Maxspc - 3.
 */
void getsizesites(FILE *ifp)
{
	if (fscanf(ifp, "%d", &Maxspc) != 1) {
		FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (missing number of sequences)\n\n\n");
		exit(1);
	}
	if (fscanf(ifp, "%d", &Maxseqc) != 1) {
		FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (missing number of sites)\n\n\n");
		exit(1);
	}

	if (Maxspc < 4) {
		FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (less than 4 sequences)\n\n\n");
		exit(1);
	}
	if (Maxspc > 8000) { /*CZ*/
		FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (more than 8000 sequences)\n\n\n");
		exit(1);
	}
	if (Maxseqc < 1) {
		FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (no sequence sites)\n\n\n");
		exit(1);
	}
	Maxbrnch = 2*Maxspc - 3;
}

/*
 * getdataset - read one data set (PHYLIP interleaved) from `ifp':
 * initialise the identifier table for Maxspc taxa, then read the
 * sequences.
 */
void getdataset(FILE *ifp)
{
	initid(Maxspc);
	getseqs(ifp);
}

/*
 * guessdatatype - guess the kind of data held in seqchars from its
 * character composition.
 *
 * Counted over all Maxspc x Maxseqc characters:
 *   numnucs   A, C, G, T, U or N
 *   numbins   0 or 1
 *   numchars  everything except '-' and '?' (forced to 1 if none)
 *
 * Returns 0 when numnucs/numchars exceeds 0.85, otherwise 2 when
 * numbins/numchars exceeds 0.2, otherwise 1.  guessmodel() reports these
 * codes as nucleotides (0), amino acids (1) and binary state data (other).
 */
int guessdatatype()
{
	uli numnucs, numchars, numbins;
	int notu, nsite;
	char c;

	/* count A, C, G, T, U, N */
	numnucs = 0;
	numchars = 0;
	numbins = 0;
	for (notu = 0; notu < Maxspc; notu++)
		for (nsite = 0; nsite < Maxseqc; nsite++) {
			c = seqchars[notu][nsite];
			if (c == 'A' || c == 'C' || c == 'G' ||
			    c == 'T' || c == 'U' || c == 'N') numnucs++;
			if (c != '-' && c != '?') numchars++;
			if (c == '0' || c == '1') numbins++;
		}
	if (numchars == 0) numchars = 1; /* avoid division by zero below */

	/* more than 85 % frequency means nucleotide data */
	if ((double) numnucs / (double) numchars > 0.85) return 0;
	else if ((double) numbins / (double) numchars > 0.2) return 2;
	else return 1;
}

+/* translate characters into format used by ML engine */ +void translatedataset() +{ + int notu, sn, co; + char c; + cvector code; + + + /* determine Maxsite - number of ML sites per taxon */ + if (data_optn == 0 && SH_optn) { + if (SHcodon) + Maxsite = Maxseqc / 3; + else + Maxsite = Maxseqc / 2; /* assume doublets */ + + } else + Maxsite = Maxseqc; + if (data_optn == 0 && (Maxsite % 3) == 0 && !SH_optn) { + if (codon_optn == 1 || codon_optn == 2 || codon_optn == 3) + Maxsite = Maxsite / 3; /* only one of the three codon positions */ + if 
(codon_optn == 4) + Maxsite = 2*(Maxsite / 3); /* 1st + 2nd codon positions */ + } + + /* reserve memory */ + if (Seqchar != NULL) free_cmatrix(Seqchar); + Seqchar = new_cmatrix(Maxspc, Maxsite); + + /* code length */ + if (data_optn == 0 && SH_optn) + code = new_cvector(2); + else + code = new_cvector(1); + + /* decode characters */ + if (data_optn == 0 && SH_optn) { /* SH doublets */ + + for (notu = 0; notu < Maxspc; notu++) { + for (sn = 0; sn < Maxsite; sn++) { + for (co = 0; co < 2; co++) { + if (SHcodon) + c = seqchars[notu][sn*3 + co]; + else + c = seqchars[notu][sn*2 + co]; + code[co] = c; + } + Seqchar[notu][sn] = code2int(code); + } + } + + } else if (!(data_optn == 0 && (Maxseqc % 3) == 0)) { /* use all */ + + for (notu = 0; notu < Maxspc; notu++) { + for (sn = 0; sn < Maxsite; sn++) { + code[0] = seqchars[notu][sn]; + Seqchar[notu][sn] = code2int(code); + } + } + + } else { /* codons */ + + for (notu = 0; notu < Maxspc; notu++) { + for (sn = 0; sn < Maxsite; sn++) { + if (codon_optn == 1 || codon_optn == 2 || codon_optn == 3) + code[0] = seqchars[notu][sn*3+codon_optn-1]; + else if (codon_optn == 4) { + if ((sn % 2) == 0) + code[0] = seqchars[notu][(sn/2)*3]; + else + code[0] = seqchars[notu][((sn-1)/2)*3+1]; + } else + code[0] = seqchars[notu][sn]; + Seqchar[notu][sn] = code2int(code); + } + } + + } + free_cvector(code); +} + +/* estimate mean base frequencies from translated data set */ +void estimatebasefreqs() +{ + int tpmradix, i, j; + uli all, *gene; + + tpmradix = gettpmradix(); + + if (Freqtpm != NULL) free_dvector(Freqtpm); + Freqtpm = new_dvector(tpmradix); + + if (Basecomp != NULL) free_imatrix(Basecomp); + Basecomp = new_imatrix(Maxspc, tpmradix); + + gene = (uli *) malloc((unsigned) ((tpmradix + 1) * sizeof(uli))); + if (gene == NULL) maerror("gene in estimatebasefreqs"); + + for (i = 0; i < tpmradix + 1; i++) gene[i] = 0; + for (i = 0; i < Maxspc; i++) + for (j = 0; j < tpmradix; j++) Basecomp[i][j] = 0; + for (i = 0; i < Maxspc; i++) + 
for (j = 0; j < Maxsite; j++) { + gene[(int) Seqchar[i][j]]++; + if (Seqchar[i][j] != tpmradix) Basecomp[i][(int) Seqchar[i][j]]++; + } + + all = Maxspc * Maxsite - gene[tpmradix]; + if (all != 0) { /* normal case */ + for (i = 0; i < tpmradix; i++) + Freqtpm[i] = (double) gene[i] / (double) all; + } else { /* pathological case with no unique character in data set */ + for (i = 0; i < tpmradix; i++) + Freqtpm[i] = 1.0 / (double) tpmradix; + } + + free(gene); + + Frequ_optn = TRUE; +} + +/* guess model of substitution */ +void guessmodel() +{ + double c1, c2, c3, c4, c5, c6; + dvector f; + dmatrix a; + int i; + + Dayhf_optn = FALSE; + Jtt_optn = TRUE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + blosum62_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + TSparam = 2.0; + YRparam = 1.0; + optim_optn = TRUE; + HKY_optn = TRUE; + TN_optn = FALSE; + + if (data_optn == 1) { /* amino acids */ + + /* chi2 fit to amino acid frequencies */ + + f = new_dvector(20); + a = new_dmatrix(20,20); + /* chi2 distance Dayhoff */ + dyhfdata(a, f); + c1 = 0; + for (i = 0; i < 20; i++) + c1 = c1 + (Freqtpm[i]-f[i])*(Freqtpm[i]-f[i]); + /* chi2 distance JTT */ + jttdata(a, f); + c2 = 0; + for (i = 0; i < 20; i++) + c2 = c2 + (Freqtpm[i]-f[i])*(Freqtpm[i]-f[i]); + /* chi2 distance mtREV */ + mtrevdata(a, f); + c3 = 0; + for (i = 0; i < 20; i++) + c3 = c3 + (Freqtpm[i]-f[i])*(Freqtpm[i]-f[i]); + /* chi2 distance VT */ + vtmvdata(a, f); + c4 = 0; + for (i = 0; i < 20; i++) + c4 = c4 + (Freqtpm[i]-f[i])*(Freqtpm[i]-f[i]); + /* chi2 distance WAG */ + wagdata(a, f); + c5 = 0; + for (i = 0; i < 20; i++) + c5 = c5 + (Freqtpm[i]-f[i])*(Freqtpm[i]-f[i]); + /* chi2 distance cpREV */ + cprev45data(a, f); + c6 = 0; + for (i = 0; i < 20; i++) + c6 = c6 + (Freqtpm[i]-f[i])*(Freqtpm[i]-f[i]); + + free_dvector(f); + free_dmatrix(a); + +#ifndef CPREV + if ((c1 < c2) && (c1 < c3) && (c1 < c4) && (c1 < c5)) { + /* c1 -> Dayhoff */ + Dayhf_optn = TRUE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + 
cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } else { + if ((c2 < c3) && (c2 < c4) && (c2 < c5)) { + /* c2 -> JTT */ + Dayhf_optn = FALSE; + Jtt_optn = TRUE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } else { + if ((c3 < c4) && (c3 < c5)) { + /* c3 -> mtREV */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = TRUE; + cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on mtDNA)\n"); + } else { + if ((c4 < c5)) { + /* c4 -> VT */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + vtmv_optn = TRUE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } else { + /* c5 -> WAG */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = TRUE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } /* if c4 else c5 */ + } /* if c3 else c4 */ + } /* if c2 */ + } /* if c1 */ + +#else /* CPREV */ + + if ((c1 < c2) && (c1 < c3) && (c1 < c4) && (c1 < c5) && (c1 < c6)) { + /* c1 -> Dayhoff */ + Dayhf_optn = TRUE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } else { + if ((c2 < c3) && (c2 < c4) && (c2 < c5) && (c2 < c6)) { + /* c2 -> JTT */ + Dayhf_optn = FALSE; + Jtt_optn = TRUE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } else { + if ((c3 < c4) && (c3 < c5) && (c3 < c6)) { + /* c3 -> mtREV */ 
+ Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = TRUE; + cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on mtDNA)\n"); + } else { + if ((c4 < c5) && (c4 < c6)) { + /* c4 -> VT */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + vtmv_optn = TRUE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } else { + if (c5 < c6) { + /* c5 -> WAG */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = TRUE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } else { + /* if (c6) */ + /* c6 -> cpREV */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = TRUE; + vtmv_optn = FALSE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on cpDNA)\n"); + } /* if c5 else c6 */ + } /* if c4 else c5 */ + } /* if c3 else c4 */ + } /* if c2 */ + } /* if c1 */ +#endif /* CPREV */ + + } else if (data_optn == 0) { + FPRINTF(STDOUTFILE "(consists very likely of nucleotides)\n"); + } else { + FPRINTF(STDOUTFILE "(consists very likely of binary state data)\n"); + } +} /* guessmodel */ + + +/******************************************************************************/ +/* functions for representing and building puzzling step trees */ +/******************************************************************************/ + +/* initialize tree with the following starting configuration + + 2 + 0 +------- C(=2) + A(=0) -----+ + +------- B(=1) + 1 + */ +void inittree() +{ + int i; + + /* allocate the memory for the whole tree */ + + /* allocate memory for vector with all the edges of the tree */ + edge = (ONEEDGE *) calloc(Maxbrnch, sizeof(ONEEDGE) ); + if (edge == NULL) maerror("edge in inittree"); + + /* allocate memory for vector with edge numbers of 
leaves */ + edgeofleaf = (int *) calloc(Maxspc, sizeof(int) ); + if (edgeofleaf == NULL) maerror("edgeofleaf in inittree"); + + /* allocate memory for all the edges the edge map */ + for (i = 0; i < Maxbrnch; i++) { + edge[i].edgemap = (int *) calloc(Maxbrnch, sizeof(int) ); + if (edge[i].edgemap == NULL) maerror("edgemap in inittree"); + } + + /* number all edges */ + for (i = 0; i < Maxbrnch; i++) edge[i].numedge = i; + + /* initialize tree */ + + nextedge = 3; + nextleaf = 3; + + /* edge maps */ + (edge[0].edgemap)[0] = 0; /* you are on the right edge */ + (edge[0].edgemap)[1] = 4; /* go down left for leaf 1 */ + (edge[0].edgemap)[2] = 5; /* go down right for leaf 2 */ + (edge[1].edgemap)[0] = 1; /* go up for leaf 0 */ + (edge[1].edgemap)[1] = 0; /* you are on the right edge */ + (edge[1].edgemap)[2] = 3; /* go up/down right for leaf 2 */ + (edge[2].edgemap)[0] = 1; /* go up for leaf 0 */ + (edge[2].edgemap)[1] = 2; /* go up/down left for leaf 1 */ + (edge[2].edgemap)[2] = 0; /* you are on the right edge */ + + /* interconnection */ + edge[0].up = NULL; + edge[0].downleft = &edge[1]; + edge[0].downright = &edge[2]; + edge[1].up = &edge[0]; + edge[1].downleft = NULL; + edge[1].downright = NULL; + edge[2].up = &edge[0]; + edge[2].downleft = NULL; + edge[2].downright = NULL; + + /* edges of leaves */ + edgeofleaf[0] = 0; + edgeofleaf[1] = 1; + edgeofleaf[2] = 2; +} /* inittree */ + +/* add next leaf on the specified edge */ +void addnextleaf(int dockedge) +{ + int i; + + if (dockedge >= nextedge) { + /* Trying to add leaf nextleaf to nonexisting edge dockedge */ + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR F TO DEVELOPERS\n\n\n"); + exit(1); + } + + if (nextleaf >= Maxspc) { + /* Trying to add leaf nextleaf to a tree with Maxspc leaves */ + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR G TO DEVELOPERS\n\n\n"); + exit(1); + } + + /* necessary change in edgeofleaf if dockedge == edgeofleaf[0] */ + if (edgeofleaf[0] == dockedge) edgeofleaf[0] = 
nextedge; + + /* adding nextedge to the tree */ + edge[nextedge].up = edge[dockedge].up; + edge[nextedge].downleft = &edge[dockedge]; + edge[nextedge].downright = &edge[nextedge+1]; + edge[dockedge].up = &edge[nextedge]; + + if (edge[nextedge].up != NULL) { + if ( ((edge[nextedge].up)->downleft) == &edge[dockedge] ) + (edge[nextedge].up)->downleft = &edge[nextedge]; + else + (edge[nextedge].up)->downright = &edge[nextedge]; + } + + /* adding nextedge + 1 to the tree */ + edge[nextedge+1].up = &edge[nextedge]; + edge[nextedge+1].downleft = NULL; + edge[nextedge+1].downright = NULL; + edgeofleaf[nextleaf] = nextedge+1; + + /* the two new edges get info about the old edges */ + /* nextedge */ + for (i = 0; i < nextedge; i++) { + switch ( (edge[dockedge].edgemap)[i] ) { + + /* down right changes to down left */ + case 5: (edge[nextedge].edgemap)[i] = 4; + break; + + /* null changes to down left */ + case 0: (edge[nextedge].edgemap)[i] = 4; + break; + + default: (edge[nextedge].edgemap)[i] = + (edge[dockedge].edgemap)[i]; + break; + } + } + + /* nextedge + 1 */ + for (i = 0; i < nextedge; i++) { + switch ( (edge[dockedge].edgemap)[i] ) { + + /* up/down left changes to up */ + case 2: (edge[nextedge+1].edgemap)[i] = 1; + break; + + /* up/down right changes to up */ + case 3: (edge[nextedge+1].edgemap)[i] = 1; + break; + + /* down left changes to up/down left */ + case 4: (edge[nextedge+1].edgemap)[i] = 2; + break; + + /* down right changes to up/down left */ + case 5: (edge[nextedge+1].edgemap)[i] = 2; + break; + + /* null changes to up/down left */ + case 0: (edge[nextedge+1].edgemap)[i] = 2; + break; + + /* up stays up */ + default: (edge[nextedge+1].edgemap)[i] = + (edge[dockedge].edgemap)[i]; + break; + } + } + + /* dockedge */ + for (i = 0; i < nextedge; i++) { + switch ( (edge[dockedge].edgemap)[i] ) { + + /* up/down right changes to up */ + case 3: (edge[dockedge].edgemap)[i] = 1; + break; + + /* up/down left changes to up */ + case 2: (edge[dockedge].edgemap)[i] 
= 1; + break; + + default: break; + } + } + + /* all edgemaps are updated for the two new edges */ + /* nextedge */ + (edge[nextedge].edgemap)[nextedge] = 0; + (edge[nextedge].edgemap)[nextedge+1] = 5; /* down right */ + + /* nextedge + 1 */ + (edge[nextedge+1].edgemap)[nextedge] = 1; /* up */ + (edge[nextedge+1].edgemap)[nextedge+1] = 0; + + /* all other edges */ + for (i = 0; i < nextedge; i++) { + (edge[i].edgemap)[nextedge] = (edge[i].edgemap)[dockedge]; + (edge[i].edgemap)[nextedge+1] = (edge[i].edgemap)[dockedge]; + } + + /* an extra for dockedge */ + (edge[dockedge].edgemap)[nextedge] = 1; /* up */ + (edge[dockedge].edgemap)[nextedge+1] = 3; /* up/down right */ + + nextleaf++; + nextedge = nextedge + 2; +} /* addnextleaf */ + + +/* free memory (to be called after inittree) */ +void freetree() +{ + int i; + + for (i = 0; i < 2 * Maxspc - 3; i++) free(edge[i].edgemap); + free(edge); + free(edgeofleaf); +} /* freetree */ + +/* writes OTU sitting on edge ed */ +void writeOTU(FILE *outfp, int ed) +{ + int i; + + /* test whether we are on a leaf */ + if (edge[ed].downright == NULL && edge[ed].downleft == NULL) { + for (i = 1; i < nextleaf; i++) { + if (edgeofleaf[i] == ed) { /* i is the leaf of ed */ + column += fputid(outfp, trueID[i]); + return; + } + } + } + + /* we are NOT on a leaf */ + fprintf(outfp, "("); + column++; + writeOTU(outfp, edge[ed].downleft->numedge); + fprintf(outfp, ","); + column++; + column++; + if (column > 55) { + column = 2; + fprintf(outfp, "\n "); + } + writeOTU(outfp, edge[ed].downright->numedge); + fprintf(outfp, ")"); + column++; +} /* writeOTU */ + +/* write tree */ +void writetree(FILE *outfp) +{ + column = 1; + fprintf(outfp, "("); + column += fputid(outfp, trueID[0]) + 3; + fprintf(outfp, ","); + writeOTU(outfp, edge[edgeofleaf[0]].downleft->numedge); + column++; + column++; + fprintf(outfp, ","); + writeOTU(outfp, edge[edgeofleaf[0]].downright->numedge); + fprintf(outfp, ");\n"); +} /* writetree */ + + +/* clear all edgeinfos */ 
+void resetedgeinfo() +{ + int i; + + for (i = 0; i < nextedge; i++) + edge[i].edgeinfo = 0; +} /* resetedgeinfo */ + +/* increment all edgeinfo between leaf A and B */ +void incrementedgeinfo(int A, int B) +{ + int curredge, finaledge, nextstep; + + if (A == B) return; + + finaledge = edgeofleaf[B]; + + curredge = edgeofleaf[A]; + edge[curredge].edgeinfo = edge[curredge].edgeinfo + 1; + + while (curredge != finaledge) { + nextstep = (edge[curredge].edgemap)[finaledge]; + switch (nextstep) { + + /* up */ + case 1: curredge = (edge[curredge].up)->numedge; + break; + + /* up/down left */ + case 2: curredge = ((edge[curredge].up)->downleft)->numedge; + break; + + /* up/down right */ + case 3: curredge = ((edge[curredge].up)->downright)->numedge; + break; + + /* down left */ + case 4: curredge = (edge[curredge].downleft)->numedge; + break; + + /* down right */ + case 5: curredge = (edge[curredge].downright)->numedge; + break; + + } + edge[curredge].edgeinfo = edge[curredge].edgeinfo + 1; + } +} /* incrementedgeinfo */ + +/* checks which edge has the lowest edgeinfo + if there are several edges with the same lowest edgeinfo, + one of them will be selected randomly */ +void minimumedgeinfo() +{ + int i, k, howmany, randomnum; + + howmany = 1; + minedge = 0; + mininfo = edge[0].edgeinfo; + for (i = 1; i < nextedge; i++) + if (edge[i].edgeinfo <= mininfo) { + if (edge[i].edgeinfo == mininfo) { + howmany++; + } else { + minedge = i; + mininfo = edge[i].edgeinfo; + howmany = 1; + } + } + + if (howmany > 1) { /* draw random edge */ + randomnum = randominteger(howmany) + 1; /* 1 to howmany */ + i = -1; + for (k = 0; k < randomnum; k++) { + do { + i++; + } while (edge[i].edgeinfo != mininfo); + minedge = i; + } + } +} /* minimumedgeinfo */ + + + + +/*******************************************/ +/* tree sorting */ +/*******************************************/ + +/* compute address of the 4 int (sort key) in the 4 int node */ +int ct_sortkeyaddr(int addr) +{ + int a, res; + a = 
/*
 * Address helpers for the sorting tree: the tree is a flat int array in
 * which every node occupies 4 consecutive ints, so node k starts at
 * address 4*k.
 */

/* address of the next edge pointer inside the same 4 int node;
   the offsets cycle 0->1->2->0 (any other offset simply yields addr + 1) */
int ct_nextedgeaddr(int addr)
{
	return (addr % 4 == 2) ? (addr - 2) : (addr + 1);
}


/**********/

/* address of the 1st edge of node 'node' */
int ct_1stedge(int node)
{
	return 4 * node;
}


/**********/

/* address of the 2nd edge of node 'node' */
int ct_2ndedge(int node)
{
	return 4 * node + 1;
}


/**********/

/* address of the 3rd edge of node 'node' */
int ct_3rdedge(int node)
{
	return 4 * node + 2;
}


/**********/

/* non-zero if node 'node' is a leaf, i.e. its 3rd edge slot in ctree
   holds a negative value */
int ct_isleaf(int node, int *ctree)
{
	int third = 4 * node + 2;	/* same address ct_3rdedge(node) yields */

	return ctree[third] < 0;
}
*/ +int ct_addr2node(int addr) +{ + int a, res; + a = addr % 4; + res = (int) ((addr - a) / 4); + return res; +} + + +/**********/ + +/* print graph pointers for checking */ +void printctree(int *ctree) +{ + int n; + for (n=0; n < 2*Maxspc; n++) { + printf("n[%3d] = (%3d.%2d, %3d.%2d, %3d.%2d | %3d)\n", n, + (int) ctree[ct_1stedge(n)]/4, + (int) ctree[ct_1stedge(n)]%4, + (int) ctree[ct_2ndedge(n)]/4, + (int) ctree[ct_2ndedge(n)]%4, + (int) ctree[ct_3rdedge(n)]/4, + (int) ctree[ct_3rdedge(n)]%4, + ctree[ct_3rdedge(n)+1]); + } + printf("\n"); +} /* printctree */ + + +/**********/ + +/* allocate memory for ctree 3 ints pointer plus 1 check byte */ +int *initctree() +{ + int *snodes; + int n; + + snodes = (int *) malloc(4 * 2 * Maxspc * sizeof(int)); + if (snodes == NULL) maerror("snodes in copytree"); + + for (n=0; n<(4 * 2 * Maxspc); n++) { + snodes[n]=-1; + } + return snodes; +} + + +/**********/ + +/* free memory of a tree for sorting */ +void freectree(int **snodes) +{ + free(*snodes); + *snodes = NULL; +} + + +/**********/ + +/* copy subtree recursively */ +void copyOTU(int *ctree, /* tree array struct */ + int *ct_nextnode, /* next free node */ + int ct_curredge, /* currende edge to add subtree */ + int *ct_nextleaf, /* next free leaf (0-maxspc) */ + int ed) /* edge in puzzling step tree */ +{ + int i, nextcurredge; + + /* test whether we are on a leaf */ + if (edge[ed].downright == NULL && edge[ed].downleft == NULL) { + for (i = 1; i < nextleaf; i++) { + if (edgeofleaf[i] == ed) { /* i is the leaf of ed */ + nextcurredge = ct_1stedge(*ct_nextleaf); + ctree[ct_curredge] = nextcurredge; + ctree[nextcurredge] = ct_curredge; + ctree[ct_sortkeyaddr(nextcurredge)] = trueID[i]; + (*ct_nextleaf)++; + return; + } + } + } + + /* we are NOT on a leaf */ + nextcurredge = ct_1stedge(*ct_nextnode); + ctree[ct_curredge] = nextcurredge; + ctree[nextcurredge] = ct_curredge; + (*ct_nextnode)++; + nextcurredge = ct_nextedgeaddr(nextcurredge); + copyOTU(ctree, ct_nextnode, 
/*
 * sortOTU - sort subtree from edge recursively by indices
 *
 * edge   address (index into ctree) of the slot through which the subtree
 *        is entered; the node it belongs to is edge / 4
 * ctree  tree array made of 4 int nodes
 *
 * Returns the sort key of the subtree: for a leaf the key already stored
 * in the node, for an inner node the smaller of the keys of its two
 * subtrees.  That key is also written into the node's sort key slot.
 *
 * NOTE(review): the parameter name 'edge' hides the file-level 'edge'
 * array inside this function; the array is not used here.
 */
int sortOTU(int edge, int *ctree)
{
	int key1, key2;
	int edge1, edge2;
	int tempedge;

	/* a negative 2nd edge slot marks a leaf: just report its key */
	if (ctree[ct_2ndedge((int) (edge / 4))] < 0)
		return ctree[ct_sortkeyaddr(edge)];

	/* contents of the two other edge slots of this node, taken in
	   ct_nextedgeaddr order starting after 'edge' */
	edge1 = ctree[ct_nextedgeaddr(edge)];
	edge2 = ctree[ct_nextedgeaddr(ct_nextedgeaddr(edge))];

	/* printf ("visiting [%5d] -> [%5d], [%5d]\n", edge, edge1, edge2); */
	/* printf ("visiting [%2d.%2d] -> [%2d.%2d], [%2d.%2d]\n",
	   (int)(edge/4), edge%4, (int)(edge1/4), edge1%4,
	   (int)(edge2/4), edge2%4); */

	/* sort both subtrees first and fetch their keys */
	key1 = sortOTU(edge1, ctree);
	key2 = sortOTU(edge2, ctree);

	if (key2 < key1) {
		/* second subtree has the smaller key: exchange the two.
		   First swap the contents of the slots that ctree[edge1] and
		   ctree[edge2] refer to (presumably this node's own two slots,
		   given the mutual links written by copyOTU - confirm), ... */
		tempedge = ctree[ctree[edge1]];
		ctree[ctree[edge1]] = ctree[ctree[edge2]];
		ctree[ctree[edge2]] = tempedge;
		/* ... then swap ctree[edge1] and ctree[edge2] themselves.
		   The order of these six statements must not be changed. */
		tempedge = ctree[edge1];
		ctree[edge1] = ctree[edge2];
		ctree[edge2] = tempedge;
		ctree[ct_sortkeyaddr(edge)] = key2;

	} else {
		/* already in order: only record the key */
		ctree[ct_sortkeyaddr(edge)] = key1;
	}
	return ctree[ct_sortkeyaddr(edge)];
}
/*
 * fprintffullpstree - print sorted puzzling step tree structure with names
 *
 * outf     stream to write to
 * treestr  NUL-terminated tree string in which taxa appear as decimal
 *          numbers
 *
 * Every maximal run of decimal digits in treestr is converted to an int
 * and written with fputid() (wrapped in single quotes when USEQUOTES is
 * defined); every other character is copied to outf unchanged.
 *
 * The scan never moves past the terminating NUL: a string that ends in a
 * digit is handled correctly, and no NUL byte is written to outf.
 */
void fprintffullpstree(FILE *outf, char *treestr)
{
	int idnum;
	int n = 0;

	while (treestr[n] != '\0') {
		/* <ctype.h> functions need an unsigned char value */
		if (isdigit((unsigned char) treestr[n])) {
			/* collect the whole number */
			idnum = 0;
			do {
				idnum = (10 * idnum) + (treestr[n] - '0');
				n++;
			} while (isdigit((unsigned char) treestr[n]));

#			ifdef USEQUOTES
				fprintf(outf, "'");
#			endif
			(void)fputid(outf, idnum);
#			ifdef USEQUOTES
				fprintf(outf, "'");
#			endif
		} else {
			/* structural character: copy it verbatim */
			fprintf(outf, "%c", treestr[n]);
			n++;
		}
	}
}
*/ + float cutoff) /* cutoff percentage */ +{ + treelistitemtype *tmpptr = NULL; + treelistitemtype *slist = NULL; + int num = 1; + float percent; + + if (list == NULL) fprintf(stderr, "Grrrrrrrrr>>>>\n"); + sortbynum(list, &slist); + + tmpptr = slist; + while (tmpptr != NULL) { + percent = (float)(100.0 * (*tmpptr).count / itemsum); + if ((cutoff == 0.0) || (cutoff <= percent)) { + if (comment) + fprintf (output, "[ %d. %d %.2f %d %d %d ]", num++, (*tmpptr).count, percent, (*tmpptr).id, itemnum, itemsum); + else { + if (num == 1){ + fprintf (output, "\n"); + fprintf (output, "The following tree(s) occured in more than %.2f%% of the %d puzzling steps.\n", cutoff, itemsum); + fprintf (output, "The trees are orderd descending by the number of occurences.\n"); + fprintf (output, "\n"); + fprintf (output, "\n occurences ID Phylip tree\n"); + } + fprintf (output, "%2d. %5d %6.2f%% %5d ", num++, (*tmpptr).count, percent, (*tmpptr).id); + } + fprintffullpstree(output, (*tmpptr).tree); + fprintf (output, "\n"); + } + tmpptr = (*tmpptr).sortnext; + } + + if (!comment) { + fprintf (output, "\n"); + switch(num) { + case 1: fprintf (output, "There were no tree topologies (out of %d) occuring with a percentage >= %.2f%% of the %d puzzling steps.\n", itemnum, cutoff, itemsum); break; + case 2: fprintf (output, "There was one tree topology (out of %d) occuring with a percentage >= %.2f%%.\n", itemnum, cutoff); break; + default: fprintf (output, "There were %d tree topologies (out of %d) occuring with a percentage >= %.2f%%.\n", num-1, itemnum, cutoff); break; + } + fprintf (output, "\n"); + fprintf (output, "\n"); + } + +} /* fprintfsortedpstrees */ + +/**********/ + +/* print sorted tree topologies for checking */ +void printfsortedpstrees(treelistitemtype *list) +{ + treelistitemtype *tmpptr = NULL; + treelistitemtype *slist = NULL; + + sortbynum(list, &slist); + + tmpptr = slist; + while (tmpptr != NULL) { + printf ("[%2d] %5d %s\n", (*tmpptr).idx, (*tmpptr).count, 
(*tmpptr).tree); + tmpptr = (*tmpptr).sortnext; + } +} /* printfsortedpstrees */ + + +/*******************************************/ +/* end of tree sorting */ +/*******************************************/ + + + +/******************************************************************************/ +/* functions for computing the consensus tree */ +/******************************************************************************/ + +/* prepare for consensus tree analysis */ +void initconsensus() +{ +# if ! PARALLEL + biparts = new_cmatrix(Maxspc-3, Maxspc); +# endif /* PARALLEL */ + + if (Maxspc % 32 == 0) + splitlength = Maxspc/32; + else splitlength = (Maxspc + 32 - (Maxspc % 32))/32; + numbiparts = 0; /* no pattern stored so far */ + maxbiparts = 0; /* no memory reserved so far */ + splitfreqs = NULL; + splitpatterns = NULL; + splitsizes = NULL; + splitcomp = (uli *) malloc(splitlength * sizeof(uli) ); + if (splitcomp == NULL) maerror("splitcomp in initconsensus"); +} + +/* prototype needed for recursive function */ +void makepart(int i, int curribrnch); + +/* recursive function to get bipartitions */ +void makepart(int i, int curribrnch) +{ + int j; + + if ( edge[i].downright == NULL || + edge[i].downleft == NULL) { /* if i is leaf */ + + /* check out what leaf j sits on this edge i */ + for (j = 1; j < Maxspc; j++) { + if (edgeofleaf[j] == i) { + biparts[curribrnch][trueID[j]] = '*'; + return; + } + } + } else { /* still on inner branch */ + makepart(edge[i].downleft->numedge, curribrnch); + makepart(edge[i].downright->numedge, curribrnch); + } +} + +/* compute bipartitions of tree of current puzzling step */ +void computebiparts() +{ + int i, j, curribrnch; + + curribrnch = -1; + + for (i = 0; i < Maxspc - 3; i++) + for (j = 0; j < Maxspc; j++) + biparts[i][j] = '.'; + + for (i = 0; i < Maxbrnch; i++) { + if (!( edgeofleaf[0] == i || + edge[i].downright == NULL || + edge[i].downleft == NULL) ) { /* check all inner branches */ + curribrnch++; + makepart(i, 
curribrnch); + + /* make sure that the root is always a '*' */ + if (biparts[curribrnch][outgroup] == '.') { + for (j = 0; j < Maxspc; j++) { + if (biparts[curribrnch][j] == '.') + biparts[curribrnch][j] = '*'; + else + biparts[curribrnch][j] = '.'; + } + } + } + } +} + +/* print out the bipartition n of all different splitpatterns */ +void printsplit(FILE *fp, uli n) +{ + int i, j, col; + uli z; + + col = 0; + for (i = 0; i < splitlength; i++) { + z = splitpatterns[n*splitlength + i]; + for (j = 0; j < 32 && col < Maxspc; j++) { + if (col % 10 == 0 && col != 0) fprintf(fp, " "); + if (z & 1) fprintf(fp, "."); + else fprintf(fp, "*"); + z = (z >> 1); + col++; + } + } +} + +/* make new entries for new different bipartitions and count frequencies */ +void makenewsplitentries() +{ + int i, j, bpc, identical, idflag, bpsize; + uli nextentry, obpc; + + /* where the next entry would be in splitpatterns */ + nextentry = numbiparts; + + for (bpc = 0; bpc < Maxspc - 3; bpc++) { /* for every new bipartition */ + /* convert bipartition into a more compact format */ + bpsize = 0; + for (i = 0; i < splitlength; i++) { + splitcomp[i] = 0; + for (j = 0; j < 32; j++) { + splitcomp[i] = splitcomp[i] >> 1; + if (i*32 + j < Maxspc) + if (biparts[bpc][i*32 + j] == '.') { + /* set highest bit */ + splitcomp[i] = (splitcomp[i] | 2147483648UL); + bpsize++; /* count the '.' 
*/ + } + } + } + /* compare to the *old* patterns */ + identical = FALSE; + for (obpc = 0; (obpc < numbiparts) && (!identical); obpc++) { + /* compare first partition size */ + if (splitsizes[obpc] == bpsize) idflag = TRUE; + else idflag = FALSE; + /* if size is identical compare whole partition */ + for (i = 0; (i < splitlength) && idflag; i++) + if (splitcomp[i] != splitpatterns[obpc*splitlength + i]) + idflag = FALSE; + if (idflag) identical = TRUE; + } + if (identical) { /* if identical increase frequency */ + splitfreqs[2*(obpc-1)]++; + } else { /* create new entry */ + if (nextentry == maxbiparts) { /* reserve more memory */ + maxbiparts = maxbiparts + 2*Maxspc; + splitfreqs = (uli *) myrealloc(splitfreqs, + 2*maxbiparts * sizeof(uli) ); + /* 2x: splitfreqs contains also an index (sorting!) */ + if (splitfreqs == NULL) maerror("splitfreqs in makenewsplitentries"); + splitpatterns = (uli *) myrealloc(splitpatterns, + splitlength*maxbiparts * sizeof(uli) ); + if (splitpatterns == NULL) maerror("splitpatterns in makenewsplitentries"); + splitsizes = (int *) myrealloc(splitsizes, + maxbiparts * sizeof(int) ); + if (splitsizes == NULL) maerror("splitsizes in makenewsplitentries"); + } + splitfreqs[2*nextentry] = 1; /* frequency */ + splitfreqs[2*nextentry+1] = nextentry; /* index for sorting */ + for (i = 0; i < splitlength; i++) + splitpatterns[nextentry*splitlength + i] = splitcomp[i]; + splitsizes[nextentry] = bpsize; + nextentry++; + } + } + numbiparts = nextentry; +} + +/* general remarks: + + - every entry in consbiparts is one node of the consensus tree + - for each node one has to know which taxa and which other nodes + are *directly* descending from it + - for every taxon/node number there is a flag that shows + whether it descends from the node or not + - '0' means that neither a taxon nor another node with the + corresponding number decends from the node + '1' means that the corresponding taxon descends from the node + '2' means that the corresponding 
node descends from the node + '3' means that the corresponding taxon and node descends from the node +*/ + +/* copy bipartition n of all different splitpatterns to consbiparts[k] */ +void copysplit(uli n, int k) +{ + int i, j, col; + uli z; + + col = 0; + for (i = 0; i < splitlength; i++) { + z = splitpatterns[n*splitlength + i]; + for (j = 0; j < 32 && col < Maxspc; j++) { + if (z & 1) consbiparts[k][col] = '1'; + else consbiparts[k][col] = '0'; + z = (z >> 1); + col++; + } + } +} + +/* compute majority rule consensus tree */ +void makeconsensus() +{ + int i, j, k, size, subnode; + char chari, charj; + + /* sort bipartition frequencies */ + qsort(splitfreqs, numbiparts, 2*sizeof(uli), ulicmp); + /* how many bipartitions are included in the consensus tree */ + consincluded = 0; + for (i = 0; i < numbiparts && i == consincluded; i++) { + if (2*splitfreqs[2*i] > Numtrial) consincluded = i + 1; + } + + /* collect all info about majority rule consensus tree */ + /* the +1 is due to the edge with the root */ + consconfid = new_ivector(consincluded + 1); + conssizes = new_ivector(2*consincluded + 2); + consbiparts = new_cmatrix(consincluded + 1, Maxspc); + + for (i = 0; i < consincluded; i++) { + /* copy partition to consbiparts */ + copysplit(splitfreqs[2*i+1], i); + /* frequency in percent (rounded to integer) */ + consconfid[i] = (int) floor(100.0*splitfreqs[2*i]/Numtrial + 0.5); + /* size of partition */ + conssizes[2*i] = splitsizes[splitfreqs[2*i+1]]; + conssizes[2*i+1] = i; + } + for (i = 0; i < Maxspc; i++) consbiparts[consincluded][i] = '1'; + consbiparts[consincluded][outgroup] = '0'; + consconfid[consincluded] = 100; + conssizes[2*consincluded] = Maxspc - 1; + conssizes[2*consincluded + 1] = consincluded; + + /* sort bipartitions according to cluster size */ + qsort(conssizes, consincluded + 1, 2*sizeof(int), intcmp); + + /* reconstruct consensus tree */ + for (i = 0; i < consincluded; i++) { /* try every node */ + size = conssizes[2*i]; /* size of current 
node */ + for (j = i + 1; j < consincluded + 1; j++) { + + /* compare only with nodes with more descendants */ + if (size == conssizes[2*j]) continue; + + /* check whether node i is a subnode of j */ + subnode = FALSE; + for (k = 0; k < Maxspc && !subnode; k++) { + chari = consbiparts[ conssizes[2*i+1] ][k]; + if (chari != '0') { + charj = consbiparts[ conssizes[2*j+1] ][k]; + if (chari == charj || charj == '3') subnode = TRUE; + } + } + + /* if i is a subnode of j change j accordingly */ + if (subnode) { + /* remove subnode i from j */ + for (k = 0; k < Maxspc; k++) { + chari = consbiparts[ conssizes[2*i+1] ][k]; + if (chari != '0') { + charj = consbiparts[ conssizes[2*j+1] ][k]; + if (chari == charj) + consbiparts[ conssizes[2*j+1] ][k] = '0'; + else if (charj == '3') { + if (chari == '1') + consbiparts[ conssizes[2*j+1] ][k] = '2'; + else if (chari == '2') + consbiparts[ conssizes[2*j+1] ][k] = '1'; + else { + /* Consensus tree [1] */ + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR H TO DEVELOPERS\n\n\n"); + exit(1); + } + } else { + /* Consensus tree [2] */ + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR I TO DEVELOPERS\n\n\n"); + exit(1); + } + } + } + /* add link to subnode i in node j */ + charj = consbiparts[ conssizes[2*j+1] ][ conssizes[2*i+1] ]; + if (charj == '0') + consbiparts[ conssizes[2*j+1] ][ conssizes[2*i+1] ] = '2'; + else if (charj == '1') + consbiparts[ conssizes[2*j+1] ][ conssizes[2*i+1] ] = '3'; + else { + /* Consensus tree [3] */ + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR J TO DEVELOPERS\n\n\n"); + exit(1); + } + } + } + } +} + +/* prototype for recursion */ +void writenode(FILE *treefile, int node); + +/* write node (writeconsensustree) */ +void writenode(FILE *treefile, int node) +{ + int i, first; + + fprintf(treefile, "("); + column++; + /* write descending nodes */ + first = TRUE; + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] == '2' || + consbiparts[node][i] == '3') { + if (first) first = FALSE; + 
else { + fprintf(treefile, ","); + column++; + } + if (column > 60) { + column = 2; + fprintf(treefile, "\n"); + } + /* write node i */ + writenode(treefile, i); + + /* reliability value as internal label */ + fprintf(treefile, "%d", consconfid[i]); + + column = column + 3; + } + } + /* write descending taxa */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] == '1' || + consbiparts[node][i] == '3') { + if (first) first = FALSE; + else { + fprintf(treefile, ","); + column++; + } + if (column > 60) { + column = 2; + fprintf(treefile, "\n"); + } + column += fputid(treefile, i); + } + } + fprintf(treefile, ")"); + column++; +} + +/* write consensus tree */ +void writeconsensustree(FILE *treefile) +{ + int i, first; + + column = 1; + fprintf(treefile, "("); + column += fputid(treefile, outgroup) + 2; + fprintf(treefile, ","); + /* write descending nodes */ + first = TRUE; + for (i = 0; i < Maxspc; i++) { + if (consbiparts[consincluded][i] == '2' || + consbiparts[consincluded][i] == '3') { + if (first) first = FALSE; + else { + fprintf(treefile, ","); + column++; + } + if (column > 60) { + column = 2; + fprintf(treefile, "\n"); + } + /* write node i */ + writenode(treefile, i); + + /* reliability value as internal label */ + fprintf(treefile, "%d", consconfid[i]); + + column = column + 3; + } + } + /* write descending taxa */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[consincluded][i] == '1' || + consbiparts[consincluded][i] == '3') { + if (first) first = FALSE; + else { + fprintf(treefile, ","); + column++; + } + if (column > 60) { + column = 2; + fprintf(treefile, "\n"); + } + column += fputid(treefile, i); + } + } + fprintf(treefile, ");\n"); +} + +/* prototype for recursion */ +void nodecoordinates(int node); + +/* establish node coordinates (plotconsensustree) */ +void nodecoordinates(int node) +{ + int i, ymin, ymax, xcoordinate; + + /* first establish coordinates of descending nodes */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] 
== '2' || + consbiparts[node][i] == '3') + nodecoordinates(i); + } + + /* then establish coordinates of descending taxa */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] == '1' || + consbiparts[node][i] == '3') { + /* y-coordinate of taxon i */ + ycortax[i] = ytaxcounter; + ytaxcounter = ytaxcounter - 2; + } + } + + /* then establish coordinates of this node */ + ymin = 2*Maxspc - 2; + ymax = 0; + xcoordinate = 0; + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] == '2' || + consbiparts[node][i] == '3') { + if (ycor[i] > ymax) ymax = ycor[i]; + if (ycor[i] < ymin) ymin = ycor[i]; + if (xcor[i] > xcoordinate) xcoordinate = xcor[i]; + } + } + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] == '1' || + consbiparts[node][i] == '3') { + if (ycortax[i] > ymax) ymax = ycortax[i]; + if (ycortax[i] < ymin) ymin = ycortax[i]; + } + } + ycormax[node] = ymax; + ycormin[node] = ymin; + ycor[node] = (int) floor(0.5*(ymax + ymin) + 0.5); + if (xcoordinate == 0) xcoordinate = 9; + xcor[node] = xcoordinate + 4; +} + +/* prototype for recursion */ +void drawnode(int node, int xold); + +/* drawnode (plotconsensustree) */ +void drawnode(int node, int xold) +{ + int i, j; + char buf[4]; + + /* first draw vertical line */ + for (i = ycormin[node] + 1; i < ycormax[node]; i++) + treepict[xcor[node]][i] = ':'; + + /* then draw descending nodes */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] == '2' || + consbiparts[node][i] == '3') + drawnode(i, xcor[node]); + } + + /* then draw descending taxa */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] == '1' || + consbiparts[node][i] == '3') { + treepict[xcor[node]][ycortax[i]] = ':'; + for (j = xcor[node] + 1; j < xsize-10; j++) + treepict[j][ycortax[i]] = '-'; + for (j = 0; j < 10; j++) + treepict[xsize-10+j][ycortax[i]] = Identif[i][j]; + } + } + + /* then draw internal edge with consensus value */ + treepict[xold][ycor[node]] = ':'; + treepict[xcor[node]][ycor[node]] = ':'; + for (i = 
/*
 * plotconsensustree - plot consensus tree
 *
 * plotfp  stream the character picture is written to
 *
 * Works in three phases:
 *   1. establish coordinates: nodecoordinates() for every inner node
 *      below the root entry consbiparts[consincluded], then a
 *      y-coordinate for every taxon hanging directly off the root;
 *   2. draw the root edge, the inner nodes (drawnode()) and the root's
 *      taxa into the character matrix treepict (xsize columns,
 *      2*Maxspc-1 rows), initially filled with blanks;
 *   3. write treepict to plotfp, highest row first, one text line per
 *      row.
 * All vectors and the matrix allocated here are released before
 * returning.
 */
void plotconsensustree(FILE *plotfp)
{
	int i, j, yroot, startree;

	/* star tree or no star tree */
	if (consincluded == 0) {
		startree = TRUE;
		consincluded = 1; /* avoids problems with malloc */
	} else
		startree = FALSE;

	/* memory for x-y-coordinates of each bipartition */
	xcor = new_ivector(consincluded);
	ycor = new_ivector(consincluded);
	ycormax = new_ivector(consincluded);
	ycormin = new_ivector(consincluded);
	/* undo the temporary change: consincluded is needed below as the
	   index of the root entry and must be 0 again for a star tree */
	if (startree) consincluded = 0; /* avoids problems with malloc */

	/* y-coordinates of each taxon */
	ycortax = new_ivector(Maxspc);
	ycortax[outgroup] = 0; /* the outgroup sits on the bottom row */

	/* establish coordinates */
	/* taxa are handed out every second row, starting at the top row */
	ytaxcounter = 2*Maxspc - 2;

	/* first establish coordinates of descending nodes */
	for (i = 0; i < Maxspc; i++) {
		if (consbiparts[consincluded][i] == '2' ||
			consbiparts[consincluded][i] == '3')
			nodecoordinates(i);
	}

	/* then establish coordinates of descending taxa */
	for (i = 0; i < Maxspc; i++) {
		if (consbiparts[consincluded][i] == '1' ||
			consbiparts[consincluded][i] == '3') {
			/* y-coordinate of taxon i */
			ycortax[i] = ytaxcounter;
			ytaxcounter = ytaxcounter - 2;
		}
	}

	/* then establish length of root edge and size of whole tree */
	yroot = 0;
	xsize = 0;
	for (i = 0; i < Maxspc; i++) {
		if (consbiparts[consincluded][i] == '2' ||
			consbiparts[consincluded][i] == '3') {
			if (ycor[i] > yroot) yroot = ycor[i];
			if (xcor[i] > xsize) xsize = xcor[i];
		}
	}
	for (i = 0; i < Maxspc; i++) {
		if (consbiparts[consincluded][i] == '1' ||
			consbiparts[consincluded][i] == '3') {
			if (ycortax[i] > yroot) yroot = ycortax[i];
		}
	}
	/* no inner node below the root: fall back to a fixed width */
	if (xsize == 0) xsize = 9;
	/* size in x direction inclusive one blank on the left */
	xsize = xsize + 6;

	/* change all x-labels so that (0,0) is down-left */
	for (i = 0; i < consincluded; i++)
		xcor[i] = xsize-1-xcor[i];

	/* draw tree */
	treepict = new_cmatrix(xsize, 2*Maxspc-1);
	for (i = 0; i < xsize; i++)
		for (j = 0; j < 2*Maxspc-1; j++)
			treepict[i][j] = ' ';

	/* draw root */
	/* vertical root line in column 1, from row 0 up to (not including) yroot */
	for (i = 1; i < yroot; i++)
		treepict[1][i] = ':';
	treepict[1][0] = ':';
	/* horizontal edge to the outgroup on row 0 */
	for (i = 2; i < xsize - 10; i++)
		treepict[i][0] = '-';
	/* the last 10 columns of a row hold the first 10 characters of the name */
	for (i = 0; i < 10; i++)
		treepict[xsize-10+i][0] = Identif[outgroup][i];

	/* then draw descending nodes */
	for (i = 0; i < Maxspc; i++) {
		if (consbiparts[consincluded][i] == '2' ||
			consbiparts[consincluded][i] == '3')
			drawnode(i, 1);
	}

	/* then draw descending taxa */
	for (i = 0; i < Maxspc; i++) {
		if (consbiparts[consincluded][i] == '1' ||
			consbiparts[consincluded][i] == '3') {
			treepict[1][ycortax[i]] = ':';
			for (j = 2; j < xsize-10; j++)
				treepict[j][ycortax[i]] = '-';
			for (j = 0; j < 10; j++)
				treepict[xsize-10+j][ycortax[i]] = Identif[i][j];
		}
	}

	/* plot tree */
	/* rows are written from the highest y down to row 0 */
	for (i = 2*Maxspc-2; i > -1; i--) {
		for (j = 0; j < xsize; j++)
			fputc(treepict[j][i], plotfp);
		fputc('\n', plotfp);
	}

	free_ivector(xcor);
	free_ivector(ycor);
	free_ivector(ycormax);
	free_ivector(ycormin);
	free_ivector(ycortax);
	free_cmatrix(treepict);
}
With the functions readquartet and writequartet + this information can be accessed. For every quartet (a,b,c,d) + with a < b < c < d (taxa) the branching information is encoded + using 4 bits: + + value 8 4 2 1 + +-------------+-------------+-------------+-------------+ + | not used | tree 3 | tree 2 | tree 1 | + +-------------+-------------+-------------+-------------+ + + If the branching structure of the taxa corresponds to one of the + three trees the corresponding bit is set. If the branching structure + is unclear because two of the three trees have the same maximum + likelihood value the corresponding two bits are set. If the branching + structure is completely unknown all the bits are set (the highest + bit is always cleared because it is not used). + +*/ + +/* allocate memory for quartets */ +unsigned char *mallocquartets(int taxa) +{ + uli nc, numch; + unsigned char *qinfo; + + /* compute number of quartets */ + Numquartets = (uli) taxa*(taxa-1)*(taxa-2)*(taxa-3)/24; + if (Numquartets % 2 == 0) { /* even number */ + numch = Numquartets/2; + } else { /* odd number */ + numch = (Numquartets + 1)/2; + } + /* allocate memory */ + qinfo = (unsigned char *) malloc(numch * sizeof(unsigned char) ); + if (qinfo == NULL) maerror("quartetinfo in mallocquartets"); + for (nc = 0; nc < numch; nc++) qinfo[nc] = 0; + return(qinfo); +} + +/* free quartet memory */ +void freequartets() +{ + free(quartetinfo); +} + +/* read quartet info - a < b < c < d */ +unsigned char readquartet(int a, int b, int c, int d) +{ + uli qnum; + + qnum = (uli) a + + (uli) b*(b-1)/2 + + (uli) c*(c-1)*(c-2)/6 + + (uli) d*(d-1)*(d-2)*(d-3)/24; + if (qnum % 2 == 0) { /* even number */ + /* bits 0 to 3 */ + return (quartetinfo[qnum/2] & (unsigned char) 15); + } else { /* odd number */ + /* bits 4 to 7 */ + return ((quartetinfo[(qnum-1)/2] & (unsigned char) 240)>>4); + } +} + +/* write quartet info - a < b < c < d, 0 <= info <= 15 */ +void writequartet(int a, int b, int c, int d, unsigned char info) 
+{ + uli qnum; + + qnum = (uli) a + + (uli) b*(b-1)/2 + + (uli) c*(c-1)*(c-2)/6 + + (uli) d*(d-1)*(d-2)*(d-3)/24; + if (qnum % 2 == 0) { /* even number */ + /* bits 0 to 3 */ + quartetinfo[qnum/2] = + ((quartetinfo[qnum/2] & (unsigned char) 240) | + (info & (unsigned char) 15)); + } else { /* odd number */ + /* bits 4 to 7 */ + quartetinfo[(qnum-1)/2] = + ((quartetinfo[(qnum-1)/2] & (unsigned char) 15) | + ((info & (unsigned char) 15)<<4)); + } +} + +/* prototypes */ +void openfiletowrite(FILE **, char[], char[]); +void closefile(FILE *); + +/* sorts three doubles in descending order */ +void sort3doubles(dvector num, ivector order) +{ + if (num[0] > num[1]) { + if(num[2] > num[0]) { + order[0] = 2; + order[1] = 0; + order[2] = 1; + } else if (num[2] < num[1]) { + order[0] = 0; + order[1] = 1; + order[2] = 2; + } else { + order[0] = 0; + order[1] = 2; + order[2] = 1; + } + } else { + if(num[2] > num[1]) { + order[0] = 2; + order[1] = 1; + order[2] = 0; + } else if (num[2] < num[0]) { + order[0] = 1; + order[1] = 0; + order[2] = 2; + } else { + order[0] = 1; + order[1] = 2; + order[2] = 0; + } + } +} + +/* checks out all possible quartets */ +void computeallquartets() +{ + double onethird; + uli nq; + unsigned char treebits[3]; + FILE *lhfp; +# if ! 
PARALLEL + int a, b, c, i; + double qc2, mintogo, minutes, hours, temp; + double temp1, temp2, temp3; + unsigned char discreteweight[3]; +# endif + + onethird = 1.0/3.0; + treebits[0] = (unsigned char) 1; + treebits[1] = (unsigned char) 2; + treebits[2] = (unsigned char) 4; + + if (show_optn) { /* list all unresolved quartets */ + openfiletowrite(&unresfp, UNRESOLVED, "unresolved quartet trees"); + fprintf(unresfp, "List of all completely unresolved quartets:\n\n"); + } + + nq = 0; + badqs = 0; + + /* start timer - percentage of completed quartets */ + time(&time0); + time1 = time0; + mflag = 0; + +# if PARALLEL + { + schedtype sched; + int flag; + MPI_Status stat; + int dest = 1; + uli qaddr =0; + uli qamount=0; + int qblocksent = 0; + int apr; + uli sq, noq; + initsched(&sched, numquarts(Maxspc), PP_NumProcs-1, 4); + qamount=sgss(&sched); + while (qamount > 0) { + if (PP_emptyslave()) { + PP_RecvQuartBlock(0, &sq, &noq, quartetinfo, &apr); + qblocksent -= noq; + } + dest = PP_getslave(); + PP_SendDoQuartBlock(dest, qaddr, qamount, (approxqp ? 
APPROX : EXACT)); + qblocksent += qamount; + qaddr += qamount; + qamount=sgss(&sched); + + MPI_Iprobe(MPI_ANY_SOURCE, PP_QUARTBLOCKSPECS, PP_Comm, &flag, &stat); + while (flag) { + PP_RecvQuartBlock(0, &sq, &noq, quartetinfo, &apr); + qblocksent -= noq; + MPI_Iprobe(MPI_ANY_SOURCE, PP_QUARTBLOCKSPECS, PP_Comm, &flag, &stat); + } + } + while (qblocksent > 0) { + PP_RecvQuartBlock(0, &sq, &noq, quartetinfo, &apr); + qblocksent -= noq; + } + } +# else /* PARALLEL */ + + addtimes(GENERAL, &tarr); + if (savequartlh_optn) { + openfiletowrite(&lhfp, ALLQUARTLH, "all quartet likelihoods"); + if (saveqlhbin_optn) writetpqfheader(Maxspc, lhfp, 3); + else writetpqfheader(Maxspc, lhfp, 4); + } + + for (i = 3; i < Maxspc; i++) + for (c = 2; c < i; c++) + for (b = 1; b < c; b++) + for (a = 0; a < b; a++) { + nq++; + + /* generate message every 15 minutes */ + /* check timer */ + time(&time2); + if ( (time2 - time1) > 900) { + /* every 900 seconds */ + /* percentage of completed quartets */ + if (mflag == 0) { + FPRINTF(STDOUTFILE "\n"); + mflag = 1; + } + qc2 = 100.*nq/Numquartets; + mintogo = (100.0-qc2) * + (double) (time2-time0)/60.0/qc2; + hours = floor(mintogo/60.0); + minutes = mintogo - 60.0*hours; + FPRINTF(STDOUTFILE "%.2f%%", qc2); + FPRINTF(STDOUTFILE " completed (remaining"); + FPRINTF(STDOUTFILE " time: %.0f", hours); + FPRINTF(STDOUTFILE " hours %.0f", minutes); + FPRINTF(STDOUTFILE " minutes)\n"); + fflush(STDOUT); + time1 = time2; + } + + /* maximum likelihood values */ + + /* exact or approximate maximum likelihood values */ + compute_quartlklhds(a,b,c,i,&qweight[0],&qweight[1],&qweight[2], (approxqp ? 
APPROX : EXACT)); + + if (savequartlh_optn) { + if (saveqlhbin_optn) + fwrite(qweight, sizeof(double), 3, lhfp); + else + fprintf(lhfp, "(%d,%d,%d,%d)\t%f\t%f\t%f\n", a, b, c, i, + qweight[0], qweight[1], qweight[2]); + } + + /* sort in descending order */ + sort3doubles(qweight, qworder); + + if (usebestq_optn) { + sqorder[2] = 2; + discreteweight[sqorder[2]] = treebits[qworder[0]]; + if (qweight[qworder[0]] == qweight[qworder[1]]) { + discreteweight[sqorder[2]] = discreteweight[sqorder[2]] || treebits[qworder[1]]; + if (qweight[qworder[1]] == qweight[qworder[2]]) { + discreteweight[sqorder[2]] = discreteweight[sqorder[2]] || treebits[qworder[2]]; + discreteweight[sqorder[2]] = 7; + } + } + } else { + + /* compute Bayesian weights */ + qweight[qworder[1]] = exp(qweight[qworder[1]]-qweight[qworder[0]]); + qweight[qworder[2]] = exp(qweight[qworder[2]]-qweight[qworder[0]]); + qweight[qworder[0]] = 1.0; + temp = qweight[0] + qweight[1] + qweight[2]; + qweight[0] = qweight[0]/temp; + qweight[1] = qweight[1]/temp; + qweight[2] = qweight[2]/temp; + + /* square deviations */ + temp1 = 1.0 - qweight[qworder[0]]; + sqdiff[0] = temp1 * temp1 + + qweight[qworder[1]] * qweight[qworder[1]] + + qweight[qworder[2]] * qweight[qworder[2]]; + discreteweight[0] = treebits[qworder[0]]; + + temp1 = 0.5 - qweight[qworder[0]]; + temp2 = 0.5 - qweight[qworder[1]]; + sqdiff[1] = temp1 * temp1 + temp2 * temp2 + + qweight[qworder[2]] * qweight[qworder[2]]; + discreteweight[1] = treebits[qworder[0]] + treebits[qworder[1]]; + + temp1 = onethird - qweight[qworder[0]]; + temp2 = onethird - qweight[qworder[1]]; + temp3 = onethird - qweight[qworder[2]]; + sqdiff[2] = temp1 * temp1 + temp2 * temp2 + temp3 * temp3; + discreteweight[2] = (unsigned char) 7; + + /* sort in descending order */ + sort3doubles(sqdiff, sqorder); + } + + /* determine best discrete weight */ + writequartet(a, b, c, i, discreteweight[sqorder[2]]); + + /* counting completely unresolved quartets */ + if 
(discreteweight[sqorder[2]] == 7) { + badqs++; + badtaxon[a]++; + badtaxon[b]++; + badtaxon[c]++; + badtaxon[i]++; + if (show_optn) { + fputid10(unresfp, a); + fprintf(unresfp, " "); + fputid10(unresfp, b); + fprintf(unresfp, " "); + fputid10(unresfp, c); + fprintf(unresfp, " "); + fputid(unresfp, i); + fprintf(unresfp, "\n"); + } + } + addtimes(QUARTETS, &tarr); + } + if (savequartlh_optn) { + closefile(lhfp); + } + if (show_optn) + closefile(unresfp); + if (mflag == 1) + FPRINTF(STDOUTFILE "\n"); +# endif /* PARALLEL */ + +} + +/* check the branching structure between the leaves (not the taxa!) + A, B, C, and I (A, B, C, I don't need to be ordered). As a result, + the two leaves that are closer related to each other than to leaf I + are found in chooseA and chooseB. If the branching structure is + not uniquely defined, ChooseA and ChooseB are chosen randomly + from the possible taxa */ +void checkquartet(int A, int B, int C, int I) +{ + int i, j, a, b, taxon[5], leaf[5], ipos; + unsigned char qresult; + int notunique = FALSE; + + /* The relationship between leaves and taxa is defined by trueID */ + taxon[1] = trueID[A]; /* taxon number */ + leaf[1] = A; /* leaf number */ + taxon[2] = trueID[B]; + leaf[2] = B; + taxon[3] = trueID[C]; + leaf[3] = C; + taxon[4] = trueID[I]; + leaf[4] = I; + + /* sort for taxa */ + /* Source: Numerical Recipes (PIKSR2.C) */ + for (j = 2; j <= 4; j++) { + a = taxon[j]; + b = leaf[j]; + i = j-1; + while (i > 0 && taxon[i] > a) { + taxon[i+1] = taxon[i]; + leaf[i+1] = leaf[i]; + i--; + } + taxon[i+1] = a; + leaf[i+1] = b; + } + + /* where is leaf I ? 
*/ + ipos = 1; + while (leaf[ipos] != I) ipos++; + + /* look at sequence quartet */ + qresult = readquartet(taxon[1], taxon[2], taxon[3], taxon[4]); + + /* chooseA and chooseB */ + do { + switch (qresult) { + + /* one single branching structure */ + + /* 001 */ + case 1: if (ipos == 1 || ipos == 2) { + chooseA = leaf[3]; + chooseB = leaf[4]; + } else { + chooseA = leaf[1]; + chooseB = leaf[2]; + } + notunique = FALSE; + break; + + /* 010 */ + case 2: if (ipos == 1 || ipos == 3) { + chooseA = leaf[2]; + chooseB = leaf[4]; + } else { + chooseA = leaf[1]; + chooseB = leaf[3]; + } + notunique = FALSE; + break; + + /* 100 */ + case 4: if (ipos == 1 || ipos == 4) { + chooseA = leaf[2]; + chooseB = leaf[3]; + } else { + chooseA = leaf[1]; + chooseB = leaf[4]; + } + notunique = FALSE; + break; + + /* two possible branching structures */ + + /* 011 */ + case 3: if (randominteger(2)) qresult = 1; + else qresult = 2; + notunique = TRUE; + break; + + /* 101 */ + case 5: if (randominteger(2)) qresult = 1; + else qresult = 4; + notunique = TRUE; + break; + + /* 110 */ + case 6: if (randominteger(2)) qresult = 2; + else qresult = 4; + notunique = TRUE; + break; + + /* three possible branching structures */ + + /* 111 */ + case 7: qresult = (1 << randominteger(3)); /* 1, 2, or 4 */ + notunique = TRUE; + break; + + default: /* Program error [checkquartet] */ +#if PARALLEL + FPRINTF(STDOUTFILE "\n\n\n(%2d)HALT: PLEASE REPORT ERROR K-PARALLEL TO DEVELOPERS (%d,%d,%d,%d) = %ld\n\n\n", + PP_Myid, taxon[1], taxon[2], taxon[3], taxon[4], + quart2num(taxon[1], taxon[2], taxon[3], taxon[4])); +#else + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR K TO DEVELOPERS\n\n\n"); +#endif + + } + } while (notunique); + + return; +} + diff --git a/forester/archive/RIO/others/puzzle_dqo/src/sched.c b/forester/archive/RIO/others/puzzle_dqo/src/sched.c new file mode 100644 index 0000000..3f1c0f6 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/sched.c @@ -0,0 +1,423 @@ +/* + * 
sched.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +#include +#include +#include +#include "sched.h" +/* #include "ppuzzle.h" */ + +#define STDOUT stdout +#ifndef PARALLEL /* because printf() runs significantly faster */ + /* than fprintf(stdout) on an Apple McIntosh */ + /* (HS) */ +# define FPRINTF printf +# define STDOUTFILE +#else +# define FPRINTF fprintf +# define STDOUTFILE STDOUT, +#endif + +int scinit; +int ssinit; +int fscinit; +int gssinit; +int tssinit; + +int n, chunksize; +int p; + +#ifdef SCHEDTEST + schedtype testsched; +#endif + +void printsched(schedtype sch) +{ + FPRINTF(STDOUTFILE "Current scheduling status:\n"); + FPRINTF(STDOUTFILE " truetasks=%5ld - alltasks=%5ld - numtasks=%5ld - numprocs=%5d\n", + sch.truetasks, sch.alltasks, sch.numtasks, sch.numprocs); + FPRINTF(STDOUTFILE " delta =%5d - overhead=%5d - rest =%5d - inited =%5d\n", + sch.delta, sch.overhead, sch.rest, sch.inited); + FPRINTF(STDOUTFILE " nconst =%5d - fconst =%5f - lconst =%5f - kconst =%5f\n", + sch.nconst, sch.fconst, sch.lconst, sch.kconst); +} + +void initsched(schedtype *sch, uli tasks, int procs, uli minchunk) +{ + if (minchunk < 1) minchunk = 1; + (*sch).minchunk = minchunk; + (*sch).truetasks = tasks; + (*sch).rest = (int)((*sch).truetasks % (*sch).minchunk); + (*sch).alltasks = (tasks - (*sch).rest); + (*sch).numtasks = (*sch).alltasks; + (*sch).numprocs = procs; + (*sch).delta = 0; + (*sch).overhead = 0; + (*sch).nconst = 0; + (*sch).fconst = 0; + (*sch).lconst = 0; + (*sch).kconst = 0; + (*sch).inited = 0; + +# ifdef PVERBOSE1 + printsched(*sch); +# endif /* PVERBOSE1 */ +} + +/************************************** +* Static Chunking 
+**************************************/ +uli sc(schedtype *sch) +{ + uli tmp; + + if ((*sch).inited == 0) { + (*sch).overhead = (*sch).alltasks % (*sch).numprocs; + (*sch).delta = ((*sch).alltasks - (*sch).overhead) / (*sch).numprocs; + (*sch).inited ++; + } + + if (!(*sch).overhead) { + if ((*sch).numtasks >= (*sch).delta) + tmp = (uli)(*sch).delta; + else + tmp = 0; + } else { + if ((*sch).numtasks >= ((*sch).delta + 1)) { + tmp = (uli)(*sch).delta + 1; + (*sch).overhead--; + } else + tmp = 0; + } + + /* correction */ + if ((tmp % (*sch).minchunk) > 0) { + tmp += (*sch).minchunk - (tmp % (*sch).minchunk); + } + + (*sch).numtasks -= tmp; + + if ((*sch).numtasks == 0) { + tmp += (uli)(*sch).rest; + (*sch).rest = 0; + } + return tmp; +} /* SC */ + + +/************************************** +* Self Scheduling +**************************************/ +uli ss(schedtype *sch) +{ + uli tmp; + + if ((*sch).inited == 0) { + (*sch).inited ++; + } + + if ((*sch).numtasks >= 1) + tmp = 1; + else + tmp = (*sch).numtasks; + + /* correction */ + if ((tmp % (*sch).minchunk) > 0) { + tmp += (*sch).minchunk - (tmp % (*sch).minchunk); + } + + (*sch).numtasks -= tmp; + + if ((*sch).numtasks == 0) { + tmp += (uli)(*sch).rest; + (*sch).rest = 0; + } + + return tmp; +} /* SS */ + + +/************************************** +* fixed-size chunking +**************************************/ +int fsc() +{ + static int R ; + static int delta ; + static int overhead; + + int tmp; + + if (fscinit == 0) { + R = n; + overhead = n % p; + delta = (n - overhead) / p; + fscinit ++; + } + + if (!overhead) { + if (R >= delta) + tmp = delta; + else + tmp = 0; + } else { + if (R >= (delta + 1)) { + tmp = delta + 1; + overhead--; + } else + tmp = 0; + } + + R -= tmp; + return tmp; +} /* FSC */ + + +/************************************** +* Guided Self Scheduling +**************************************/ +uli gss(schedtype *sch) +{ + uli tmp; + + if ((*sch).inited == 0) { + (*sch).inited ++; + } + + if 
((*sch).numtasks >= 1) { + tmp = (uli)ceil((*sch).numtasks / (*sch).numprocs); + if (tmp == 0) tmp = 1; + } else + tmp = 0; + + /* correction */ + if ((tmp % (*sch).minchunk) > 0) { + tmp += (*sch).minchunk - (tmp % (*sch).minchunk); + } + + (*sch).numtasks -= tmp; + + if ((*sch).numtasks == 0) { + tmp += (uli)(*sch).rest; + (*sch).rest = 0; + } + return tmp; +} /* GSS */ + +/************************************** +* Smooth Guided Self Scheduling +**************************************/ +uli sgss(schedtype *sch) +{ + uli tmp; + + if ((*sch).inited == 0) { + (*sch).inited ++; + } + + if ((*sch).numtasks >= 1) { + tmp = (uli)ceil(((*sch).numtasks / (*sch).numprocs) / 2); + if (tmp == 0) tmp = 1; + } else + tmp = 0; + + /* correction */ + if ((tmp % (*sch).minchunk) > 0) { + tmp += (*sch).minchunk - (tmp % (*sch).minchunk); + } + + (*sch).numtasks -= tmp; + + if ((*sch).numtasks == 0) { + tmp += (uli)(*sch).rest; + (*sch).rest = 0; + } + return tmp; +} /* SGSS */ + + +/************************************** +* Trapezoid Self Scheduling +**************************************/ +uli tss(schedtype *sch) +{ + uli tmp; + + if ((*sch).inited == 0) { + (*sch).fconst = ceil((*sch).numtasks / (2*(*sch).numprocs)); + if ((*sch).fconst == 0) (*sch).fconst = 1; + (*sch).lconst = 1; + (*sch).nconst = ceil( (2*n) / ((*sch).fconst + (*sch).lconst) ); + (*sch).ddelta = (((*sch).fconst - (*sch).lconst) / ((*sch).nconst - 1)); + (*sch).kconst = (*sch).fconst; + FPRINTF(STDOUTFILE "f = n/2p = %.2f ; l = %.2f\n", (*sch).fconst, (*sch).lconst); + FPRINTF(STDOUTFILE "N = 2n/(f+l) = %d ; delta = (f-l)/(N-1) = %.2f\n", (*sch).nconst, (*sch).ddelta); + (*sch).inited ++; + } + + if ((*sch).kconst <= (double) (*sch).numtasks) { + tmp = (uli)ceil((*sch).kconst); + (*sch).kconst -= (*sch).ddelta; + } else { + tmp = (uli)(*sch).numtasks; + (*sch).kconst = 0.0; + } + + /* correction */ + if ((tmp % (*sch).minchunk) > 0) { + tmp += (*sch).minchunk - (tmp % (*sch).minchunk); + } + + (*sch).numtasks 
-= tmp; + + if ((*sch).numtasks == 0) { + tmp += (uli)(*sch).rest; + (*sch).rest = 0; + } + return tmp; + +} /* TSS */ + + +/******************/ + + +#ifdef SCHEDTEST + uli numquarts(int maxspc) + { + uli tmp; + int a, b, c, d; + + if (maxspc < 4) + return (uli)0; + else { + maxspc--; + a = maxspc-3; + b = maxspc-2; + c = maxspc-1; + d = maxspc; + + tmp = (uli) 1 + a + + (uli) b * (b-1) / 2 + + (uli) c * (c-1) * (c-2) / 6 + + (uli) d * (d-1) * (d-2) * (d-3) / 24; + return (tmp); + } + } /* numquarts */ +#endif + + + + +/************************************** +* main +**************************************/ +#ifdef SCHEDTEST +int main(int argc, char *argv[]) +{ + int tcount, + count, + lastsize, + size; + if ((argc > 4) || (argc < 3)) { + FPRINTF(STDOUTFILE "\n\n Usage: %s <# species> <# processors> []\n\n", argv[0]); + exit(1); + } + + chunksize = 1; + + switch(argc) { + case 4: + chunksize = atoi(argv[3]); + case 3: + n = numquarts(atoi(argv[1])); + p = atoi(argv[2]); + } + + FPRINTF(STDOUTFILE "proc=%6d\n", p); + FPRINTF(STDOUTFILE "task=%6d\n", n); + + initsched(&testsched, n, p, chunksize); + printsched(testsched); + + count=1; tcount = 0; + FPRINTF(STDOUTFILE "\n\n---------------------------\n"); + FPRINTF(STDOUTFILE "SC(sched) - Static Chunking\n"); + FPRINTF(STDOUTFILE "---------------------------\n\n"); + do { size = sc(&testsched); + if (size > 0) {FPRINTF(STDOUTFILE "%6d. chunk = %6d %c\n", count++, size , (size%chunksize) ? '!' : ' '); + tcount+=size;} + else FPRINTF(STDOUTFILE "%d tasks in %d chunks\n", tcount, (count-1)); + } while (size > 0); + + + initsched(&testsched, n, p, chunksize); + printsched(testsched); + + count=1; tcount = 0; + FPRINTF(STDOUTFILE "\n\n---------------------------\n"); + FPRINTF(STDOUTFILE "SS(sched) - Self Scheduling\n"); + FPRINTF(STDOUTFILE "---------------------------\n\n"); + do { size = ss(&testsched); + if (size > 0) {if (count==1) FPRINTF(STDOUTFILE "%6d. chunk = %6d %c\n", count++, size , (size%chunksize) ? '!' 
: ' '); + count++; + tcount+=size; + lastsize = size;} + else {FPRINTF(STDOUTFILE " ...\n"); + FPRINTF(STDOUTFILE "%6d. chunk = %6d %c\n", count++, lastsize , (lastsize%chunksize) ? '!' : ' '); + FPRINTF(STDOUTFILE "%d tasks in %d chunks\n", tcount, (count-1));} + } while (size > 0); + + +/**/ + count=1; tcount = 0; + FPRINTF(STDOUTFILE "\n\n---------------------------\n"); + FPRINTF(STDOUTFILE "FSC() - Fixed-Size Chunking\n"); + FPRINTF(STDOUTFILE "---------------------------\n\n"); + do { size = fsc(); + if (size > 0) {FPRINTF(STDOUTFILE "%6d. chunk = %6d %c\n", count++, size , (size%chunksize) ? '!' : ' '); + tcount+=size;} + else FPRINTF(STDOUTFILE "%d tasks in %d chunks\n", tcount, (count-1)); + } while (size > 0); +/**/ + + initsched(&testsched, n, p, chunksize); + printsched(testsched); + + count=1; tcount = 0; + FPRINTF(STDOUTFILE "\n\n-----------------------------------\n"); + FPRINTF(STDOUTFILE "GSS(sched) - Guided Self Scheduling\n"); + FPRINTF(STDOUTFILE "-----------------------------------\n\n"); + do { size = gss(&testsched); + if (size > 0) {FPRINTF(STDOUTFILE "%6d. chunk = %6d %c\n", count++, size , (size%chunksize) ? '!' : ' '); + tcount+=size;} + else FPRINTF(STDOUTFILE "%d tasks in %d chunks\n", tcount, (count-1)); + } while (size > 0); + + initsched(&testsched, n, p, chunksize); + printsched(testsched); + + count=1; tcount = 0; + FPRINTF(STDOUTFILE "\n\n--------------------------------------\n"); + FPRINTF(STDOUTFILE "TSS(sched) - Trapezoid Self Scheduling\n"); + FPRINTF(STDOUTFILE "--------------------------------------\n\n"); + do { size = tss(&testsched); + if (size > 0) {FPRINTF(STDOUTFILE "%6d. chunk = %6d %c\n", count++, size , (size%chunksize) ? '!' 
: ' '); + tcount+=size;} + else FPRINTF(STDOUTFILE "%d tasks in %d chunks\n", tcount, (count-1)); + } while (size > 0); + return (0); +} +#endif diff --git a/forester/archive/RIO/others/puzzle_dqo/src/sched.h b/forester/archive/RIO/others/puzzle_dqo/src/sched.h new file mode 100644 index 0000000..e75bdd2 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/sched.h @@ -0,0 +1,53 @@ +/* + * sched.h + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +#ifndef SCHED_H +#define SCHED_H +#ifndef SCHEDTEST +# include "util.h" +#else + typedef unsigned long int uli; +#endif + + +typedef struct sched_t{ + uli truetasks; + uli alltasks; + uli numtasks; + uli minchunk; + int numprocs; + int delta; + double ddelta; + int overhead; + int rest; + int nconst; + double fconst; + double lconst; + double kconst; + int inited; +} schedtype; + +void num2quart(uli qnum, int *a, int *b, int *c, int *d); +uli numquarts(int maxspc); +uli quart2num (int a, int b, int c, int d); + +void printsched(schedtype sch); +void initsched(schedtype *sch, uli tasks, int procs, uli minchunk); +uli sc(schedtype *sch); +uli gss(schedtype *sch); +uli sgss(schedtype *sch); +uli tss(schedtype *sch); + +#endif /* SCHED_H */ diff --git a/forester/archive/RIO/others/puzzle_dqo/src/test b/forester/archive/RIO/others/puzzle_dqo/src/test new file mode 100644 index 0000000..a680df2 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/test @@ -0,0 +1,19 @@ +CC gcc +LIBS -lm +CFLAGS -g -O2 +DEFS -DPACKAGE=\"tree-puzzle\" -DVERSION=\"5.0\" -DHAVE_LIBM=1 -DSTDC_HEADERS=1 -DHAVE_LIMITS_H=1 +SET_MAKE + +HCC @HCC@ +MPICC +MPCC @MPCC@ + +MPICC +MPILIBS +MPIDEFS +MPICFLAGS + +PCC @PCC@ 
+PLIBS @PLIBS@ +PDEFS @PDEFS@ +PCFLAGS @PCFLAGS@ diff --git a/forester/archive/RIO/others/puzzle_dqo/src/test.in b/forester/archive/RIO/others/puzzle_dqo/src/test.in new file mode 100644 index 0000000..0dc7ddc --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/test.in @@ -0,0 +1,19 @@ +CC @CC@ +LIBS @LIBS@ +CFLAGS @CFLAGS@ +DEFS @DEFS@ +SET_MAKE @SET_MAKE@ + +HCC @HCC@ +MPICC @MPICC@ +MPCC @MPCC@ + +MPICC @MPICC@ +MPILIBS @MPILIBS@ +MPIDEFS @MPIDEFS@ +MPICFLAGS @MPICFLAGS@ + +PCC @PCC@ +PLIBS @PLIBS@ +PDEFS @PDEFS@ +PCFLAGS @PCFLAGS@ diff --git a/forester/archive/RIO/others/puzzle_dqo/src/util.c b/forester/archive/RIO/others/puzzle_dqo/src/util.c new file mode 100644 index 0000000..6a998dc --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/util.c @@ -0,0 +1,751 @@ +/* + * util.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. 
+ */ + + +#include "util.h" + +#define STDOUT stdout +#ifndef PARALLEL /* because printf() runs significantly faster */ + /* than fprintf(stdout) on an Apple McIntosh */ + /* (HS) */ +# define FPRINTF printf +# define STDOUTFILE +#else +# define FPRINTF fprintf +# define STDOUTFILE STDOUT, + extern int PP_NumProcs; + extern int PP_Myid; + long int PP_randn; + long int PP_rand; +#endif + + +/* + * memory allocation error handler + */ + +void maerror(char *message) +{ + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (lack of memory: %s)\n\n", message); + FPRINTF(STDOUTFILE "Hint for Macintosh users:\n"); + FPRINTF(STDOUTFILE "Use the command of the Finder to increase the memory partition!\n\n"); + exit(1); +} + + +/* + * memory allocate double vectors, matrices, and cubes + */ + +dvector new_dvector(int n) +{ + dvector v; + + v = (dvector) malloc((unsigned) (n * sizeof(double))); + if (v == NULL) maerror("step 1 in new_dvector"); + + return v; +} + +dmatrix new_dmatrix(int nrow, int ncol) +{ + int i; + dmatrix m; + + m = (dmatrix) malloc((unsigned) (nrow * sizeof(dvector))); + if (m == NULL) maerror("step 1 in in new_dmatrix"); + + *m = (dvector) malloc((unsigned) (nrow * ncol * sizeof(double))); + if (*m == NULL) maerror("step 2 in in new_dmatrix"); + + for (i = 1; i < nrow; i++) m[i] = m[i-1] + ncol; + + return m; +} + + + + +dcube new_dcube(int ntri, int nrow, int ncol) +{ + int i, j; + dcube c; + + c = (dcube) malloc((unsigned) (ntri * sizeof(dmatrix))); + if (c == NULL) maerror("step 1 in in new_dcube"); + + *c = (dmatrix) malloc((unsigned) (ntri * nrow * sizeof(dvector))); + if (*c == NULL) maerror("step 2 in in new_dcube"); + + **c = (dvector) malloc((unsigned) (ntri * nrow * ncol * sizeof(double))); + if (**c == NULL) maerror("step 3 in in new_dcube"); + + for (j = 1; j < nrow; j++) c[0][j] = c[0][j-1] + ncol; + + for (i = 1; i < ntri; i++) { + c[i] = c[i-1] + nrow; + c[i][0] = c[i-1][0] + nrow * ncol; + for (j = 1; j < nrow; j++) c[i][j] = c[i][j-1] + ncol; + } 
+ + return c; +} + +void free_dvector(dvector v) +{ + free((double *) v); +} + +void free_dmatrix(dmatrix m) +{ + free((double *) *m); + free((double *) m); +} + +void free_dcube(dcube c) +{ + free((double *) **c); + free((double *) *c); + free((double *) c); +} + + +/* + * memory allocate char vectors, matrices, and cubes + */ + +cvector new_cvector(int n) +{ + cvector v; + + v = (cvector) malloc((unsigned)n * sizeof(char)); + if (v == NULL) maerror("step1 in new_cvector"); + + return v; +} + +cmatrix new_cmatrix(int nrow, int ncol) +{ + int i; + cmatrix m; + + m = (cmatrix) malloc((unsigned) (nrow * sizeof(cvector))); + if (m == NULL) maerror("step 1 in new_cmatrix"); + + *m = (cvector) malloc((unsigned) (nrow * ncol * sizeof(char))); + if (*m == NULL) maerror("step 2 in new_cmatrix"); + + for (i = 1; i < nrow; i++) m[i] = m[i-1] + ncol; + + return m; +} + +ccube new_ccube(int ntri, int nrow, int ncol) +{ + int i, j; + ccube c; + + c = (ccube) malloc((unsigned) (ntri * sizeof(cmatrix))); + if (c == NULL) maerror("step 1 in new_ccube"); + + *c = (cmatrix) malloc((unsigned) (ntri * nrow * sizeof(cvector))); + if (*c == NULL) maerror("step 2 in new_ccube"); + + **c = (cvector) malloc((unsigned) (ntri * nrow * ncol * sizeof(char))); + if (**c == NULL) maerror("step 3 in new_ccube"); + + for (j = 1; j < nrow; j++) c[0][j] = c[0][j-1] + ncol; + + for (i = 1; i < ntri; i++) { + c[i] = c[i-1] + nrow; + c[i][0] = c[i-1][0] + nrow * ncol; + for (j = 1; j < nrow; j++) c[i][j] = c[i][j-1] + ncol; + } + + return c; +} + +void free_cvector(cvector v) +{ + free((char *) v); +} + +void free_cmatrix(cmatrix m) +{ + free((char *) *m); + free((char *) m); +} + +void free_ccube(ccube c) +{ + free((char *) **c); + free((char *) *c); + free((char *) c); +} + + +/* + * memory allocate int vectors, matrices, and cubes + */ + +ivector new_ivector(int n) +{ + ivector v; + + v = (ivector) malloc((unsigned) (n * sizeof(int))); + if (v == NULL) maerror("step 1 in new_ivector"); + + return v; 
+} + +imatrix new_imatrix(int nrow, int ncol) +{ + int i; + imatrix m; + + m = (imatrix) malloc((unsigned) (nrow * sizeof(ivector))); + if (m == NULL) maerror("step 1 in new_imatrix"); + + *m = (ivector) malloc((unsigned) (nrow * ncol * sizeof(int))); + if (*m == NULL) maerror("step 2 in new_imatrix"); + + for (i = 1; i < nrow; i++) m[i] = m[i-1] + ncol; + + return m; +} + +icube new_icube(int ntri, int nrow, int ncol) +{ + int i, j; + icube c; + + c = (icube) malloc((unsigned) (ntri * sizeof(imatrix))); + if (c == NULL) maerror("step 1 in new_icube"); + + *c = (imatrix) malloc((unsigned) (ntri * nrow * sizeof(ivector))); + if (*c == NULL) maerror("step 2 in new_icube"); + + **c = (ivector) malloc((unsigned) (ntri * nrow * ncol * sizeof(int))); + if (**c == NULL) maerror("step 3 in new_icube"); + + for (j = 1; j < nrow; j++) c[0][j] = c[0][j-1] + ncol; + + for (i = 1; i < ntri; i++) { + c[i] = c[i-1] + nrow; + c[i][0] = c[i-1][0] + nrow * ncol; + for (j = 1; j < nrow; j++) c[i][j] = c[i][j-1] + ncol; + } + + return c; +} + +void free_ivector(ivector v) +{ + free((int *) v); +} + +void free_imatrix(imatrix m) +{ + free((int *) *m); + free((int *) m); +} + +void free_icube(icube c) +{ + free((int *) **c); + free((int *) *c); + free((int *) c); +} + + +/* + * memory allocate uli vectors, matrices, and cubes + */ + +ulivector new_ulivector(int n) +{ + ulivector v; + + v = (ulivector) malloc((unsigned) (n * sizeof(uli))); + if (v == NULL) maerror("step 1 in new_ulivector"); + + return v; +} + +ulimatrix new_ulimatrix(int nrow, int ncol) +{ + int i; + ulimatrix m; + + m = (ulimatrix) malloc((unsigned) (nrow * sizeof(ulivector))); + if (m == NULL) maerror("step 1 in new_ulimatrix"); + + *m = (ulivector) malloc((unsigned) (nrow * ncol * sizeof(uli))); + if (*m == NULL) maerror("step 2 in new_ulimatrix"); + + for (i = 1; i < nrow; i++) m[i] = m[i-1] + ncol; + + return m; +} + +ulicube new_ulicube(int ntri, int nrow, int ncol) +{ + int i, j; + ulicube c; + + c = (ulicube) 
malloc((unsigned) (ntri * sizeof(ulimatrix))); + if (c == NULL) maerror("step 1 in new_ulicube"); + + *c = (ulimatrix) malloc((unsigned) (ntri * nrow * sizeof(ulivector))); + if (*c == NULL) maerror("step 2 in new_ulicube"); + + **c = (ulivector) malloc((unsigned) (ntri * nrow * ncol * sizeof(uli))); + if (**c == NULL) maerror("step 3 in new_ulicube"); + + for (j = 1; j < nrow; j++) c[0][j] = c[0][j-1] + ncol; + + for (i = 1; i < ntri; i++) { + c[i] = c[i-1] + nrow; + c[i][0] = c[i-1][0] + nrow * ncol; + for (j = 1; j < nrow; j++) c[i][j] = c[i][j-1] + ncol; + } + + return c; +} +/* Release a uli vector with a single free(). */ +void free_ulivector(ulivector v) +{ + free((uli *) v); +} +/* Release a matrix laid out as in new_ulimatrix: element block (*m) first, then row pointers (m). */ +void free_ulimatrix(ulimatrix m) +{ + free((uli *) *m); + free((uli *) m); +} +/* Release a cube laid out as in new_ulicube: elements (**c), row pointers (*c), plane pointers (c). */ +void free_ulicube(ulicube c) +{ + free((uli *) **c); + free((uli *) *c); + free((uli *) c); +} + + +/******************************************************************************/ +/* random numbers generator (Numerical recipes) */ +/******************************************************************************/ + +/* constants used only by randomunitintervall(); they are #undef'ed right after it */ +#define IM1 2147483563 +#define IM2 2147483399 +#define AM (1.0/IM1) +#define IMM1 (IM1-1) +#define IA1 40014 +#define IA2 40692 +#define IQ1 53668 +#define IQ2 52774 +#define IR1 12211 +#define IR2 3791 +#define NTAB 32 +#define NDIV (1+IMM1/NTAB) +#define EPS 1.2e-7 +#define RNMX (1.0-EPS) + +/* generator state: a value <= 0 makes the next randomunitintervall() call rebuild its + tables from it; initrandom() stores the negated seed here */ +long idum; + +double randomunitintervall() +/* Long period (> 2e18) random number generator. Returns a uniform random + deviate between 0.0 and 1.0 (exclusive of endpoint values).
+ + Source: + Press et al., "Numerical recipes in C", Cambridge University Press, 1992 + (chapter 7 "Random numbers", ran2 random number generator) */ +{ + int j; + long k; + static long idum2=123456789; + static long iy=0; + static long iv[NTAB]; + double temp; +/* (re)initialise when idum <= 0: make idum positive, copy it to idum2, then run the + first recurrence NTAB+8 times, keeping only the last NTAB values in iv[] */ + if (idum <= 0) { + if (-(idum) < 1) + idum=1; + else + idum=-(idum); + idum2=(idum); + for (j=NTAB+7;j>=0;j--) { + k=(idum)/IQ1; + idum=IA1*(idum-k*IQ1)-k*IR1; + if (idum < 0) + idum += IM1; + if (j < NTAB) + iv[j] = idum; + } + iy=iv[0]; + } + k=(idum)/IQ1; + idum=IA1*(idum-k*IQ1)-k*IR1; + if (idum < 0) + idum += IM1; + k=idum2/IQ2; + idum2=IA2*(idum2-k*IQ2)-k*IR2; + if (idum2 < 0) + idum2 += IM2; + j=iy/NDIV; + iy=iv[j]-idum2; + iv[j] = idum; + if (iy < 1) + iy += IMM1; + if ((temp=AM*iy) > RNMX) + return RNMX; + else + return temp; +} + +#undef IM1 +#undef IM2 +#undef AM +#undef IMM1 +#undef IA1 +#undef IA2 +#undef IQ1 +#undef IQ2 +#undef IR1 +#undef IR2 +#undef NTAB +#undef NDIV +#undef EPS +#undef RNMX +/* Seed the generator: srand() is seeded from the current time, a negative seed is + * replaced by rand(), and -seed is stored in idum so that the next + * randomunitintervall() call re-initialises. The rest of this function is not + * present in this text (see the note below). */ +int initrandom(int seed) +{ + srand((unsigned) time(NULL)); + if (seed < 0) + seed = rand(); + idum=-(long) seed; +# ifdef PARALLEL + { + int n; + for (n=0; n= 0.0 ? fabs(a) : -fabs(a)) +/* NOTE(review): text is missing just above - everything from the loop condition in + * initrandom up to the tail of what looks like the SIGN macro has been lost, + * presumably including the definitions of ITMAX, CGOLD, ZEPS and SHFT that brent + * uses. Restore it from the original util.c. */ +/* Brents method in one dimension + * + * Minimises f inside the interval spanned by ax and cx, starting from the trial + * point bx. fax, fbx and fcx are the already known values of f at ax, bx and cx, + * so f is only evaluated at new trial points. At most ITMAX iterations are done. + * Returns the best x found; *foptx receives f at that x and *f2optx the second + * derivative estimate computed from the points x, w and v. + */ +double brent(double ax, double bx, double cx, double (*f)(double), double tol, + double *foptx, double *f2optx, double fax, double fbx, double fcx) +{ + int iter; + double a,b,d=0,etemp,fu,fv,fw,fx,p,q,r,tol1,tol2,u,v,w,x,xm; + double xw,wv,vx; + double e=0.0; + + a=(ax < cx ? ax : cx); + b=(ax > cx ?
ax : cx); + x=bx; + fx=fbx; + if (fax < fcx) { + w=ax; + fw=fax; + v=cx; + fv=fcx; + } else { + w=cx; + fw=fcx; + v=ax; + fv=fax; + } + for (iter=1;iter<=ITMAX;iter++) { + xm=0.5*(a+b); + tol2=2.0*(tol1=tol*fabs(x)+ZEPS); + if (fabs(x-xm) <= (tol2-0.5*(b-a))) { + *foptx = fx; + xw = x-w; + wv = w-v; + vx = v-x; + *f2optx = 2.0*(fv*xw + fx*wv + fw*vx)/ + (v*v*xw + x*x*wv + w*w*vx); + return x; + } + if (fabs(e) > tol1) { + r=(x-w)*(fx-fv); + q=(x-v)*(fx-fw); + p=(x-v)*q-(x-w)*r; + q=2.0*(q-r); + if (q > 0.0) p = -p; + q=fabs(q); + etemp=e; + e=d; + if (fabs(p) >= fabs(0.5*q*etemp) || p <= q*(a-x) || p >= q*(b-x)) + d=CGOLD*(e=(x >= xm ? a-x : b-x)); + else { + d=p/q; + u=x+d; + if (u-a < tol2 || b-u < tol2) + d=SIGN(tol1,xm-x); + } + } else { + d=CGOLD*(e=(x >= xm ? a-x : b-x)); + } + u=(fabs(d) >= tol1 ? x+d : x+SIGN(tol1,d)); + fu=(*f)(u); + if (fu <= fx) { + if (u >= x) a=x; else b=x; + SHFT(v,w,x,u) + SHFT(fv,fw,fx,fu) + } else { + if (u < x) a=u; else b=u; + if (fu <= fw || w == x) { + v=w; + w=u; + fv=fw; + fw=fu; + } else if (fu <= fv || v == x || v == w) { + v=u; + fv=fu; + } + } + } + *foptx = fx; + xw = x-w; + wv = w-v; + vx = v-x; + *f2optx = 2.0*(fv*xw + fx*wv + fw*vx)/ + (v*v*xw + x*x*wv + w*w*vx); + return x; +} +#undef ITMAX +#undef CGOLD +#undef ZEPS +#undef SHFT +#undef SIGN +#undef GOLD +#undef GLIMIT +#undef TINY +/* NOTE(review): no definitions of GOLD, GLIMIT or TINY are visible in this text; + * they may sit in the part that is missing before brent - confirm. */ +/* one-dimensional minimization - as input a lower and an upper limit and a trial + value for the minimum are needed: xmin < xguess < xmax + the function f and a fractional tolerance tol have to be specified + onedimenmin returns the optimal x value, and stores the value of the function + in *fx and its second derivative at this point in *f2x (both filled in by brent) + */ +double onedimenmin(double xmin, double xguess, double xmax, double (*f)(double), + double tol, double *fx, double *f2x) +{ + double eps, optx, ax, bx, cx, fa, fb, fc; + + /* first attempt to bracket the minimum: xguess -/+ 50*tol*xguess, clipped to [xmin, xmax] */ + eps = xguess*tol*50.0; + ax = xguess - eps; + if (ax < xmin) ax = xmin; + bx = xguess; + cx = xguess + eps;
+ if (cx > xmax) cx = xmax; + + /* evaluate f at the three points of the narrow bracket */ + fa = (*f)(ax); + fb = (*f)(bx); + fc = (*f)(cx); + + /* if either end lies below the middle the narrow bracket does not enclose a minimum: + fall back to the full interval [xmin, xmax], re-evaluating f only at ends that moved */ + if ((fa < fb) || (fc < fb)) { + if (ax != xmin) fa = (*f)(xmin); + if (cx != xmax) fc = (*f)(xmax); + optx = brent(xmin, xguess, xmax, f, tol, fx, f2x, fa, fb, fc); + } else + optx = brent(ax, bx, cx, f, tol, fx, f2x, fa, fb, fc); + + return optx; /* return optimal x */ +} + +/* two-dimensional minimization with borders and calculations of standard errors */ +/* we optimize along basis vectors - not very optimal but it seems to work well + * + * tol fractional tolerance handed to onedimenmin + * activeN non-zero if variable N is to be optimized + * minN, maxN borders for variable N + * xN in: start value, out: optimized value (kept inside [minN, maxN]) + * funcN one-dimensional objective for variable N + * errN out: standard error estimate sqrt(1/|f2x|), or maxN if that would exceed maxN + * + * The two variables are optimized in turn until neither moves by more than 3.3*tol + * or MAXITS passes have been made; with only one active variable a single pass is done. + */ +void twodimenmin(double tol, + int active1, double min1, double *x1, double max1, double (*func1)(double), double *err1, + int active2, double min2, double *x2, double max2, double (*func2)(double), double *err2) +{ + int it, nump, change; + double x1old, x2old; + double fx, f2x; + + it = 0; + nump = 0; + + /* count number of parameters */ + if (active1) nump++; + if (active2) nump++; + + do { /* repeat until nothing changes any more */ + it++; + change = FALSE; + + /* optimize first variable */ + if (active1) { +/* a start value on or outside a border is first moved 20% of the range inside */ + if ((*x1) <= min1) (*x1) = min1 + 0.2*(max1-min1); + if ((*x1) >= max1) (*x1) = max1 - 0.2*(max1-min1); + x1old = (*x1); + (*x1) = onedimenmin(min1, (*x1), max1, func1, tol, &fx, &f2x); + if ((*x1) < min1) (*x1) = min1; + if ((*x1) > max1) (*x1) = max1; + /* same tolerance as 1D minimization */ + if (fabs((*x1) - x1old) > 3.3*tol) change = TRUE; + + /* standard error */ + f2x = fabs(f2x); + if (1.0/(max1*max1) < f2x) (*err1) = sqrt(1.0/f2x); + else (*err1) = max1; + + } + + /* optimize second variable */ + if (active2) { +/* a start value on or outside a border is first moved 20% of the range inside */ + if ((*x2) <= min2) (*x2) = min2 + 0.2*(max2-min2); + if ((*x2) >= max2) (*x2) = max2 - 0.2*(max2-min2); + x2old = (*x2); + (*x2) = onedimenmin(min2, (*x2), max2, func2, tol, &fx, &f2x); + if ((*x2) < min2) (*x2) = min2; + if ((*x2) > max2) (*x2) = max2; + /* same tolerance as 1D minimization */ + if (fabs((*x2) - x2old) >
3.3*tol) change = TRUE; + + /* standard error: sqrt(1/|f2x|), replaced by max2 when that would exceed max2 */ + f2x = fabs(f2x); + if (1.0/(max2*max2) < f2x) (*err2) = sqrt(1.0/f2x); + else (*err2) = max2; + + } +/* with a single active parameter one pass is enough */ + if (nump == 1) return; + + } while (it != MAXITS && change); + + return; +} + diff --git a/forester/archive/RIO/others/puzzle_dqo/src/util.h b/forester/archive/RIO/others/puzzle_dqo/src/util.h new file mode 100644 index 0000000..20f37e5 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_dqo/src/util.h @@ -0,0 +1,96 @@ +/* + * util.h + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +#ifndef _UTIL_ +#define _UTIL_ + +#include +#include +#include +#include +/* NOTE(review): the header names of the four #include directives above are missing + * in this text; restore them from the original util.h. */ + +/* + * general definitions + */ + +#define TRUE 1 +#define FALSE 0 + +#ifdef PARALLEL + extern long int PP_randn; + extern long int PP_rand; +#endif + +/* + * type definitions + */ + +typedef unsigned long int uli; +/* vector = T*, matrix = T**, cube = T*** for T = double, char, int and uli */ +typedef double *dvector, **dmatrix, ***dcube; +typedef char *cvector, **cmatrix, ***ccube; +typedef int *ivector, **imatrix, ***icube; +typedef uli *ulivector, **ulimatrix, ***ulicube; + + +/* + * prototypes of functions defined in util.c + */ + +void maerror(char *message); +/* allocation and release of double vectors, matrices and cubes */ +dvector new_dvector(int n); +dmatrix new_dmatrix(int nrow, int ncol); +dcube new_dcube(int ntri, int nrow, int ncol); +void free_dvector(dvector v); +void free_dmatrix(dmatrix m); +void free_dcube(dcube c); +/* allocation and release of char vectors, matrices and cubes */ +cvector new_cvector(int n); +cmatrix new_cmatrix(int nrow, int ncol); +ccube new_ccube(int ntri, int nrow, int ncol); +void free_cvector(cvector v); +void free_cmatrix(cmatrix m); +void free_ccube(ccube c); +/* allocation and release of int vectors, matrices and cubes */ +ivector new_ivector(int n); +imatrix new_imatrix(int nrow, int ncol); +icube new_icube(int ntri, int nrow, int ncol); +void
free_ivector(ivector v); +void free_imatrix(imatrix m); +void free_icube(icube c); +/* allocation and release of uli (unsigned long int) vectors, matrices and cubes */ +ulivector new_ulivector(int n); +ulimatrix new_ulimatrix(int nrow, int ncol); +ulicube new_ulicube(int ntri, int nrow, int ncol); +void free_ulivector(ulivector v); +void free_ulimatrix(ulimatrix m); +void free_ulicube(ulicube c); +/* random number generator interface and miscellaneous helpers */ +double randomunitintervall(void); +int initrandom(int seed); +int randominteger(int n); +void chooser(int t, int s, ivector slist); +void *myrealloc(void *, size_t); +cvector mygets(void); +/* one- and two-dimensional minimization */ +#define MAXITS 10 /* maximum number of iterations in twodimenmin */ +double onedimenmin(double, double, double, double (*f )(double ), double, double *, double *); +void twodimenmin(double, int, double, double *, double, double (*func1 )(double ), double *, int, double, double *, double, double (*func2 )(double ), double *); + + + +#endif diff --git a/forester/archive/RIO/others/puzzle_mod/AUTHORS b/forester/archive/RIO/others/puzzle_mod/AUTHORS new file mode 100644 index 0000000..cbef439 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/AUTHORS @@ -0,0 +1,45 @@ +since 1999 by Heiko A. Schmidt, Korbinian Strimmer, + Martin Vingron, Arndt von Haeseler + +1995-1999 by Korbinian Strimmer and Arndt von Haeseler + + + +Heiko A. Schmidt + Theoretical Bioinformatics + Deutsches Krebsforschungszentrum (DKFZ) + Im Neuenheimer Feld 280 + D-69124 Heidelberg + Germany + + email: h.schmidt@dkfz-heidelberg.de, + http://www.dkfz-heidelberg.de/tbi/ + +Korbinian Strimmer + Department of Zoology + University of Oxford + South Parks Road + Oxford OX1 3PS, UK + + email: korbinian.strimmer@zoo.ox.ac.uk + http://www.zoo.ox.ac.uk/ + +Martin Vingron + Theoretical Bioinformatics + Deutsches Krebsforschungszentrum (DKFZ) + Im Neuenheimer Feld 280 + D-69124 Heidelberg + Germany + + email: vingron@dkfz-heidelberg.de + http://www.dkfz-heidelberg.de/tbi/ + +Arndt von Haeseler + Max-Planck-Institute for Evolutionary Anthropology + Inselstr.
22 + D-04103 Leipzig + Germany + + email: haeseler@eva.mpg.de, + http://www.eva.mpg.de/ + diff --git a/forester/archive/RIO/others/puzzle_mod/COPYING b/forester/archive/RIO/others/puzzle_mod/COPYING new file mode 100644 index 0000000..d60c31a --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. 
+ + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) 
Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/forester/archive/RIO/others/puzzle_mod/ChangeLog b/forester/archive/RIO/others/puzzle_mod/ChangeLog new file mode 100644 index 0000000..824b296 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/ChangeLog @@ -0,0 +1,347 @@ + +Version date what has been changed + +5.0 26.08.2000 - changes to manual, Makefile.in + - cpREV hidden by -DCPREV flag + - chi2test, quartio included into source code files + - generic src/Makefile.generic + - src/makefile.com for VAX + - AUTHORS, README, ChangeLog updated + - INSTALL checked + 27.08.2000 - test code excluded + - '-randseed#' added for debugging purposes + - ./data added to autoconf/automake + - warning output if cmdline parameter unknown + 11.10.2000 - fixed output of rate categories of sites before + computing them + - check whether rate categories were computed by + 1st user tree or NJ tree fixed in the output + 12.10.2000 - invariant site model normalization fixed + + +CODE FREEZE +=========== + +5.0.a33 15.08.2000 - changes for autoconf/automake + +5.0.a32 01.08.2000 - an FPE error fixed (badq == 0) + - small error in -bestq fixed + - fflush's added at several places + +5.0.a31 01.08.2000 - comments added to tree structure sorting puzzle2.c + - changes in configure.in, Makefile.in + +5.0.a30 23.07.2000 - some debugging in checkquart + - changed to autoconf + +5.0.a29 13.07.2000 - some debugging in checkquart + +5.0.a28 13.07.2000 - use best quartet topology option (-bestq) implemented + +5.0.a27 13.07.2000 - further development to checkquart + - ascii/binary quartet values (-wqla/-wqlb) + - typo correction + +5.0.a26 11.07.2000 - fflush at all checktimer + - further development at checkquart + - possibility to write quartet values to file (-wqlh) + +5.0.a25 06.07.2000 - fflush at checktimer + +5.0.a24 02.07.2000 - further debugging of checkquart + +5.0.a23 02.07.2000 - further development to checkquart + +5.0.a22 29.06.2000 - checkquart added to makefile + - bad quartet stats added after reading in
*.allquarts + +5.0.a21 27.06.2000 - site pattern statistics implemented and added to + SEQUENCE ALIGNMENT section in puzzle report + +5.0.a20 26.06.2000 - cpREV45 implemented + +5.0.a19 26.06.2000 - for debugging purposes: typo "MPE" changed to "FPE" + - fflush(stdout) added in chi2test + +5.0.a18 20.06.2000 - checkquart implemented + +5.0.a17 19.06.2000 - FPRINTF(STDOUTFILE and STDOUT definition changed + and moved; fputid/fputid10 writes to STDOUT instead + of stdout + - ppuzzle checks slaves enough slave-processes + - numquarts, num2quart, quart2num moved from ppuzzle.c + to puzzle1.c + - read/writeallquart implemented (undocumented feature) + to be used by -wqf/-rqf at command line + -wqf = write quartet file (infilename.allquart) after + quartet evaluation + -rqf = read quartet file (infilename.allquart), no + quartet evaluation, unless -wqf is used as + well, then quartets are written and read in + - '-h' option at command line -> printusage + +5.0.a16 31.05.2000 - chi2test bug fixed + - WAG matrix added, model choice adopted + 13.06.2000 - date set to June 2000 + - author order changed to Schmidt, Strimmer, Vingron, + v.Haeseler + - CPU time output stopped, due to overflow errors + 16.06.2000 - sequence composition chi2test moved before + parameter output. + - output of chi2test and bad quartet statistics split, + to do the chi2test output earlier. + +5.0.a15 02.05.2000 - Names changed back from TREE-PUZZLE to PUZZLE + 09.05.2000 - and to TREE-PUZZLE again ;-) + +5.0.a14 13.03.2000 - Changes to the manual. + - Executable names changed to (p)treepuzzle. + (changes in the makefiles) + 15.03.2000 - Output of parameters after estimation added. + +5.0.a13 18.02.2000 - ALPHA version number removed from the code + +5.0.a12 18.02.2000 - CPU time measurement problems fixed for case where + clock_t is an unsigned type. + +5.0.a11 17.02.2000 - time measure problems (CPU/wallclock) fixed + not all features in addtimes are used at the moment.
+ - unnecessary and unused routines removed from source + code. + +5.0.a10 20.01.2000 - Name changes from PUZZLE to TREE-PUZZLE + - Chi2-fit model guessing for VT model added + - little model printing bug fixed + +5.0.a9 22.12.1999 - VT Model incorporated (Mueller, Vingron (2000) + JCB, to appear). + - TODO: Chi2-fit model guessing for VT model + +5.0.a8 21.12.1999 - 'sys/times.h' and 'sys/types.h' removed from + puzzle.h. They were neither ANSI-conformant nor + necessary, but occurred in the SUN man pages. + - Definition and call of writetimesstat eliminated + from the sequential version by compiler switches, + and not just the function body as before. + - '-O4' changed to '-O' to be more generic. + +5.0.a7 21.12.1999 - Macro constants introduced for data_optn + (NUCLEOTIDE, AMINOACID, BINARY) + - round robin of datatype and AA model option changed + in menu to make adjustment of the model possible by a + determined sequence of letters: + 'd': Auto -> Nucleotides + -> Amino acids + -> Binary states + -> Auto + ('m' && data_optn == AMINOACID): + Auto -> Dayhoff + -> JTT + -> mtREV24 + -> BLOSUM62 + -> Auto + - manual.html adjusted + +5.0.a6 20.12.1999 - new manual.html added + +5.0.a5 07.12.1999 - output bug fixed (bestrates were written before they + were computed) + +5.0.a4 02.12.1999 - header file inclusion adjusted: + added: #include + changed from: #include "ppuzzle.h" + to: #ifdef PARALLEL + # include "ppuzzle.h" + #endif + +5.0.a3 27.11.1999 - '-h' command line option removed, because of problems + with MPICH under LINUX + - new memory leaks of 5.0.a2 closed in PP_Finalize + +5.0.a2 27.11.1999 - Cleanup of the source code + - Measurement of CPU time added + - Parallel load statistics added (quartets, trees, time) + to puzzle report. + - Cleanup debug messages + - Comments "[...]" are removed from usertrees now. + - single quotes will only be printed around species + names if -DUSEQUOTES is set at compile time.
+ - tree likelihood is printed in front of a tree as a + comment, [ lh=-xx.xxxxx ](...); + +5.0.a1 26.11.1999 - Cleanup of the directories + - Copyright changes + - Version changes + + +VERSION CHANGE +============== + +4.1.a26 25.11.1999 - Makefile made universal for puzzle and ppuzzle + - lines not needed removed from puzzle.h + +4.1.a25 19.11.1999 - Output file prefixes for distances, trees, and + puzzlereport changed in user trees analysis case + to user tree file name + - Temporary output of likelihood to treefile added + +4.1.a24 11.11.1999 - Output of puzzling step trees changed + ptorder: [ orderno # % ID #UniqTopos #Steps ]PHYLIP + pstep: chunk #InChunk sum ID #UniqTopos #Steps + - preliminary leap frog RNG implemented, i.e. uses + the rand4 in the usual way in the sequential case. + If run in parallel all rand4 are initialized with + the same seed and started with PP_Myid-th random + number. After that each process uses every + PP_NumProcs-th random number to make sure that these + are unique. + +4.1.a23 08.11.1999 - output of sequential and parallel version to *.pstep + made identical + +4.1.a22 05.11.1999 - two different puzzle step tree outputs introduced + and added to the menu ("[ 1. 35 ](...);"): + - ordered unique tree list -> *.ptorder + Format: "[ 1. 35 ]" (Ordernumber, Amount) + - chronological tree list -> *.pstep + Format: "[ 1. 35 ]" (Chunknumber, Amount in chunk) + (the last is a problem in parallel, because they come + in chunks, as scheduled) + - debugged the output +4.1.a21 04.11.1999 - Makefile adjustments for other platforms + - pstep tree output changed. unique treestructures + printed to *.pstep file with a leading comment + containing an order number and the amount padded + with blanks (e.g. "[ 1. 356 ]('mouse'..."). + output is done right before writing the puzzle file.
+ - controlled MPI finish to the Quit menu option added + +4.1.a20 03.11.1999 - some garbage collection (free) added + - makefile adjusted, OFLAGS for optimization added + (ppuzzle/MPICH has problems with -O, so the + ppuzzle is created without optimization) + Some minor changes in the makefiles + - still to do: garbage collection from 'internalnode' + in master process + +4.1.a19 13.10.1999 - adding the output of standardized (i.e. sorted) + puzzling step trees. Those are printed to the + standard output at the moment. (Routines to sort + and print the trees implemented) + 14.10.1999 - routines for printing the sorted trees to a string. + needed to send them between Master and Worker, and + to have a unique key to sort and count the trees. + 21.10.1999 - counting of sorted trees implemented by doubly linked + list, sort routine, print to stdout + 25.10.1999 - change place of writing distances to file right after + distances have been computed. + - output of puzzling step trees now with true name, + not numbers + 02.11.1999 - parallel counting and sending of puzzling step trees + - some parallel sending bugs fixed + +4.1.a18 14.09.1999 - adding possibility to specify input file at + command line, this specifies also the output + filenames (puzzle output: *.puzzle; treefile: + *.tree; distances: *.dist; Triangel EPS: *.eps; + unresolved: *.qlist; puzzling step trees: *.pstep) + If an unexisting name is given, one has to reenter + the right name, but the wrong one is used as prefix. 
+ 15.09.1999 - sending back of bad quartets from slaves added + - bug in quart2num fixed (not used before; was shifted + by 1) + - first version of a README added ;-) + +4.1.a17 03.08.1999 - Recv-Error in receiving DoPuzzleBlock fixed + - double freeing of same MPI_Datatype fixed + - changing of scheduling algorithm to smaller chunks + in gss -> sgss + 13.09.1999 - bug fixed in optimization routine in ml2.c: + boundary check added + +4.1.a16 12.07.1999 - slight changes in verbosity levels + - changed all printf to FPRINTF(STDOUTFILE to + change easily from stdout to a file. + +4.1.a15 08.07.1999 - scheduler for both parallel parts + - several small changes + +4.1.a14 25.06.1999 - computation of tree parallel, scheduler dependent, + sending all biparts in one message instead of one + by one + - several small changes since a13 in sched.c, et al. + +4.1.a13 10.06.1999 - computation of tree parallel (chunk = #trees/#slaves) + - scheduling schemes implemented for minimum chunk sizes + +4.1.a12 07.06.1999 - computation of quartets properly parallel + - scheduling implemented + - counting of quartets by slave ajusted + - TODO: sending of bad quartets (array + list) + - distinction between '1st user tree' and 'NJ tree' + in result output removed again + +4.1.a11 28.05.1999 - PP_SendDoQuartBlock, PP_RecvDoQuartBlock, + PP_SendQuartBlock, PP_RecvQuartBlock + - mallocquartets() changed from global to local + variables to be more flexible + - Quartet computation moved to slave (badquartet + handling missing: output, badquartet vector); + - distinction between '1st user tree' and 'NJ tree' + added in result output (puzzle1.c around l.1756) + +4.1.a10 20.05.1999 - num2quart, numquarts, quart2num introduced + - parallel init/finalize, quartets computed on + master and slave, compared -> equal -> all necessary + parameter exported + +4.1.a9 19.05.1999 - 'dvector forg' removed from onepamratematrix + cmdline, because it's not used in the function. 
+ +4.1.a8 18.05.1999 - add _GAMMA_ (not necessary) to gamma.h and _PUZZLE_ + to puzzle.h to avoid dublicate includes, possible + due to ppuzzle.h + - ppuzzle added to makefile and to check + - 1st parallel version but no slave computations + only sending parameters and done signals. + +4.1.a7 18.05.1999 - export reevaluation of tree and evaluation of + usertrees to evaluatetree. + +4.1.a6 17.05.1999 - -DNEWFORLOOP added to fixed.src, because the changed + for loop structure changes the sequence of randomized + quartets during likelihood mapping + - change 'int main()' to 'int main(argc, argv)' + - export more functionalities from main: + memcleanup(), inputandinit(&argc, &argv) + - grouping if's (excluding eachother) together in + switch() + - split treereavaluation and 1st usertree, + evaluate all usertrees together (TODO: both, + treereavaluation and usertrees in one loop) + - MAKE CHECK added to ./makefile + +4.1.a5 16.05.1999 - adding ´dvector Brnlength´ to lslength cmdline to + reduce globality of Brnlength. 
(Later better to *Tree) + +4.1.a4 11.05.1999 - structure of for loops changed in computeallquartets + and recon_tree, so that the quarted addresses are in + one contigous sequence (for a /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done + for subdir in $(SUBDIRS); do \ + if test "$$subdir" = .; then :; else \ + test -d $(distdir)/$$subdir \ + || mkdir $(distdir)/$$subdir \ + || exit 1; \ + chmod 777 $(distdir)/$$subdir; \ + (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir=../$(distdir) distdir=../$(distdir)/$$subdir distdir) \ + || exit 1; \ + fi; \ + done +info-am: +info: info-recursive +dvi-am: +dvi: dvi-recursive +check-am: all-am +check: check-recursive +installcheck-am: +installcheck: installcheck-recursive +install-exec-am: +install-exec: install-exec-recursive + +install-data-am: +install-data: install-data-recursive + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-recursive +uninstall-am: +uninstall: uninstall-recursive +all-am: Makefile +all-redirect: all-recursive +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: installdirs-recursive +installdirs-am: + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-tags mostlyclean-generic + +mostlyclean: mostlyclean-recursive + +clean-am: clean-tags clean-generic mostlyclean-am + +clean: clean-recursive + +distclean-am: distclean-tags distclean-generic clean-am + +distclean: distclean-recursive + -rm -f config.status + +maintainer-clean-am: maintainer-clean-tags maintainer-clean-generic \ + distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." 
+ +maintainer-clean: maintainer-clean-recursive + -rm -f config.status + +.PHONY: install-data-recursive uninstall-data-recursive \ +install-exec-recursive uninstall-exec-recursive installdirs-recursive \ +uninstalldirs-recursive all-recursive check-recursive \ +installcheck-recursive info-recursive dvi-recursive \ +mostlyclean-recursive distclean-recursive clean-recursive \ +maintainer-clean-recursive tags tags-recursive mostlyclean-tags \ +distclean-tags clean-tags maintainer-clean-tags distdir info-am info \ +dvi-am dvi check check-am installcheck-am installcheck install-exec-am \ +install-exec install-data-am install-data install-am install \ +uninstall-am uninstall all-redirect all-am all installdirs-am \ +installdirs mostlyclean-generic distclean-generic clean-generic \ +maintainer-clean-generic clean mostlyclean distclean maintainer-clean + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_mod/Makefile.am b/forester/archive/RIO/others/puzzle_mod/Makefile.am new file mode 100644 index 0000000..2a0bac6 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/Makefile.am @@ -0,0 +1,2 @@ +EXTRA_DIST = +SUBDIRS = src doc data diff --git a/forester/archive/RIO/others/puzzle_mod/Makefile.in b/forester/archive/RIO/others/puzzle_mod/Makefile.in new file mode 100644 index 0000000..06043c6 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/Makefile.in @@ -0,0 +1,327 @@ +# Makefile.in generated automatically by automake 1.4 from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + +SHELL = @SHELL@ + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ + +bindir = @bindir@ +sbindir = @sbindir@ +libexecdir = @libexecdir@ +datadir = @datadir@ +sysconfdir = @sysconfdir@ +sharedstatedir = @sharedstatedir@ +localstatedir = @localstatedir@ +libdir = @libdir@ +infodir = @infodir@ +mandir = @mandir@ +includedir = @includedir@ +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ + +top_builddir = . + +ACLOCAL = @ACLOCAL@ +AUTOCONF = @AUTOCONF@ +AUTOMAKE = @AUTOMAKE@ +AUTOHEADER = @AUTOHEADER@ + +INSTALL = @INSTALL@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +transform = @program_transform_name@ + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +CC = @CC@ +MAKEINFO = @MAKEINFO@ +MPICC = @MPICC@ +MPICC0 = @MPICC0@ +MPICC1 = @MPICC1@ +MPICC2 = @MPICC2@ +MPICC3 = @MPICC3@ +MPICC4 = @MPICC4@ +MPICC5 = @MPICC5@ +MPICFLAGS = @MPICFLAGS@ +MPIDEFS = @MPIDEFS@ +MPILIBS = @MPILIBS@ +PACKAGE = @PACKAGE@ +PPUZZLE = @PPUZZLE@ +VERSION = @VERSION@ + +EXTRA_DIST = +SUBDIRS = src doc data +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_CLEAN_FILES = +DIST_COMMON = README AUTHORS COPYING ChangeLog INSTALL Makefile.am \ +Makefile.in NEWS aclocal.m4 configure configure.in install-sh missing \ +mkinstalldirs + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = gtar +GZIP_ENV = --best +all: all-redirect +.SUFFIXES: +$(srcdir)/Makefile.in: Makefile.am 
$(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu --include-deps Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(ACLOCAL_M4): configure.in + cd $(srcdir) && $(ACLOCAL) + +config.status: $(srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + $(SHELL) ./config.status --recheck +$(srcdir)/configure: $(srcdir)/configure.in $(ACLOCAL_M4) $(CONFIGURE_DEPENDENCIES) + cd $(srcdir) && $(AUTOCONF) + +# This directory's subdirectories are mostly independent; you can cd +# into them and run `make' without going through this Makefile. +# To change the values of `make' variables: instead of editing Makefiles, +# (1) if the variable is set in `config.status', edit `config.status' +# (which will cause the Makefiles to be regenerated when you run `make'); +# (2) otherwise, pass the desired values on the `make' command line. + +@SET_MAKE@ + +all-recursive install-data-recursive install-exec-recursive \ +installdirs-recursive install-recursive uninstall-recursive \ +check-recursive installcheck-recursive info-recursive dvi-recursive: + @set fnord $(MAKEFLAGS); amf=$$2; \ + dot_seen=no; \ + target=`echo $@ | sed s/-recursive//`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + echo "Making $$target in $$subdir"; \ + if test "$$subdir" = "."; then \ + dot_seen=yes; \ + local_target="$$target-am"; \ + else \ + local_target="$$target"; \ + fi; \ + (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ + || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \ + done; \ + if test "$$dot_seen" = "no"; then \ + $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ + fi; test -z "$$fail" + +mostlyclean-recursive clean-recursive distclean-recursive \ +maintainer-clean-recursive: + @set fnord $(MAKEFLAGS); amf=$$2; \ + dot_seen=no; \ + rev=''; list='$(SUBDIRS)'; for subdir in $$list; do \ + rev="$$subdir $$rev"; \ + test "$$subdir" = "." 
&& dot_seen=yes; \ + done; \ + test "$$dot_seen" = "no" && rev=". $$rev"; \ + target=`echo $@ | sed s/-recursive//`; \ + for subdir in $$rev; do \ + echo "Making $$target in $$subdir"; \ + if test "$$subdir" = "."; then \ + local_target="$$target-am"; \ + else \ + local_target="$$target"; \ + fi; \ + (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ + || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \ + done && test -z "$$fail" +tags-recursive: + list='$(SUBDIRS)'; for subdir in $$list; do \ + test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ + done + +tags: TAGS + +ID: $(HEADERS) $(SOURCES) $(LISP) + list='$(SOURCES) $(HEADERS)'; \ + unique=`for i in $$list; do echo $$i; done | \ + awk ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + here=`pwd` && cd $(srcdir) \ + && mkid -f$$here/ID $$unique $(LISP) + +TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + if test "$$subdir" = .; then :; else \ + test -f $$subdir/TAGS && tags="$$tags -i $$here/$$subdir/TAGS"; \ + fi; \ + done; \ + list='$(SOURCES) $(HEADERS)'; \ + unique=`for i in $$list; do echo $$i; done | \ + awk ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(ETAGS_ARGS)$$unique$(LISP)$$tags" \ + || (cd $(srcdir) && etags $(ETAGS_ARGS) $$tags $$unique $(LISP) -o $$here/TAGS) + +mostlyclean-tags: + +clean-tags: + +distclean-tags: + -rm -f TAGS ID + +maintainer-clean-tags: + +distdir = $(PACKAGE)-$(VERSION) +top_distdir = $(distdir) + +# This target untars the dist file and tries a VPATH configuration. Then +# it guarantees that the distribution is self-contained by making another +# tarfile. 
+distcheck: dist + -rm -rf $(distdir) + GZIP=$(GZIP_ENV) $(TAR) zxf $(distdir).tar.gz + mkdir $(distdir)/=build + mkdir $(distdir)/=inst + dc_install_base=`cd $(distdir)/=inst && pwd`; \ + cd $(distdir)/=build \ + && ../configure --srcdir=.. --prefix=$$dc_install_base \ + && $(MAKE) $(AM_MAKEFLAGS) \ + && $(MAKE) $(AM_MAKEFLAGS) dvi \ + && $(MAKE) $(AM_MAKEFLAGS) check \ + && $(MAKE) $(AM_MAKEFLAGS) install \ + && $(MAKE) $(AM_MAKEFLAGS) installcheck \ + && $(MAKE) $(AM_MAKEFLAGS) dist + -rm -rf $(distdir) + @banner="$(distdir).tar.gz is ready for distribution"; \ + dashes=`echo "$$banner" | sed s/./=/g`; \ + echo "$$dashes"; \ + echo "$$banner"; \ + echo "$$dashes" +dist: distdir + -chmod -R a+r $(distdir) + GZIP=$(GZIP_ENV) $(TAR) chozf $(distdir).tar.gz $(distdir) + -rm -rf $(distdir) +dist-all: distdir + -chmod -R a+r $(distdir) + GZIP=$(GZIP_ENV) $(TAR) chozf $(distdir).tar.gz $(distdir) + -rm -rf $(distdir) +distdir: $(DISTFILES) + -rm -rf $(distdir) + mkdir $(distdir) + -chmod 777 $(distdir) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done + for subdir in $(SUBDIRS); do \ + if test "$$subdir" = .; then :; else \ + test -d $(distdir)/$$subdir \ + || mkdir $(distdir)/$$subdir \ + || exit 1; \ + chmod 777 $(distdir)/$$subdir; \ + (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir=../$(distdir) distdir=../$(distdir)/$$subdir distdir) \ + || exit 1; \ + fi; \ + done +info-am: +info: info-recursive +dvi-am: +dvi: dvi-recursive +check-am: all-am +check: check-recursive +installcheck-am: +installcheck: installcheck-recursive +install-exec-am: +install-exec: install-exec-recursive + +install-data-am: +install-data: install-data-recursive + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: 
install-recursive +uninstall-am: +uninstall: uninstall-recursive +all-am: Makefile +all-redirect: all-recursive +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: installdirs-recursive +installdirs-am: + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-tags mostlyclean-generic + +mostlyclean: mostlyclean-recursive + +clean-am: clean-tags clean-generic mostlyclean-am + +clean: clean-recursive + +distclean-am: distclean-tags distclean-generic clean-am + +distclean: distclean-recursive + -rm -f config.status + +maintainer-clean-am: maintainer-clean-tags maintainer-clean-generic \ + distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." + +maintainer-clean: maintainer-clean-recursive + -rm -f config.status + +.PHONY: install-data-recursive uninstall-data-recursive \ +install-exec-recursive uninstall-exec-recursive installdirs-recursive \ +uninstalldirs-recursive all-recursive check-recursive \ +installcheck-recursive info-recursive dvi-recursive \ +mostlyclean-recursive distclean-recursive clean-recursive \ +maintainer-clean-recursive tags tags-recursive mostlyclean-tags \ +distclean-tags clean-tags maintainer-clean-tags distdir info-am info \ +dvi-am dvi check check-am installcheck-am installcheck install-exec-am \ +install-exec install-data-am install-data install-am install \ +uninstall-am uninstall all-redirect all-am all installdirs-am \ +installdirs mostlyclean-generic distclean-generic clean-generic \ +maintainer-clean-generic clean mostlyclean distclean maintainer-clean + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. 
+.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_mod/aclocal.m4 b/forester/archive/RIO/others/puzzle_mod/aclocal.m4 new file mode 100644 index 0000000..9f8add8 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/aclocal.m4 @@ -0,0 +1,104 @@ +dnl aclocal.m4 generated automatically by aclocal 1.4 + +dnl Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. +dnl This file is free software; the Free Software Foundation +dnl gives unlimited permission to copy and/or distribute it, +dnl with or without modifications, as long as this notice is preserved. + +dnl This program is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY, to the extent permitted by law; without +dnl even the implied warranty of MERCHANTABILITY or FITNESS FOR A +dnl PARTICULAR PURPOSE. + +# Do all the work for Automake. This macro actually does too much -- +# some checks are only needed if your package does certain things. +# But this isn't really a big deal. + +# serial 1 + +dnl Usage: +dnl AM_INIT_AUTOMAKE(package,version, [no-define]) + +AC_DEFUN(AM_INIT_AUTOMAKE, +[AC_REQUIRE([AC_PROG_INSTALL]) +PACKAGE=[$1] +AC_SUBST(PACKAGE) +VERSION=[$2] +AC_SUBST(VERSION) +dnl test to see if srcdir already configured +if test "`cd $srcdir && pwd`" != "`pwd`" && test -f $srcdir/config.status; then + AC_MSG_ERROR([source directory already configured; run "make distclean" there first]) +fi +ifelse([$3],, +AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package]) +AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package])) +AC_REQUIRE([AM_SANITY_CHECK]) +AC_REQUIRE([AC_ARG_PROGRAM]) +dnl FIXME This is truly gross. 
+missing_dir=`cd $ac_aux_dir && pwd` +AM_MISSING_PROG(ACLOCAL, aclocal, $missing_dir) +AM_MISSING_PROG(AUTOCONF, autoconf, $missing_dir) +AM_MISSING_PROG(AUTOMAKE, automake, $missing_dir) +AM_MISSING_PROG(AUTOHEADER, autoheader, $missing_dir) +AM_MISSING_PROG(MAKEINFO, makeinfo, $missing_dir) +AC_REQUIRE([AC_PROG_MAKE_SET])]) + +# +# Check to make sure that the build environment is sane. +# + +AC_DEFUN(AM_SANITY_CHECK, +[AC_MSG_CHECKING([whether build environment is sane]) +# Just in case +sleep 1 +echo timestamp > conftestfile +# Do `set' in a subshell so we don't clobber the current shell's +# arguments. Must try -L first in case configure is actually a +# symlink; some systems play weird games with the mod time of symlinks +# (eg FreeBSD returns the mod time of the symlink's containing +# directory). +if ( + set X `ls -Lt $srcdir/configure conftestfile 2> /dev/null` + if test "[$]*" = "X"; then + # -L didn't work. + set X `ls -t $srcdir/configure conftestfile` + fi + if test "[$]*" != "X $srcdir/configure conftestfile" \ + && test "[$]*" != "X conftestfile $srcdir/configure"; then + + # If neither matched, then we have a broken ls. This can happen + # if, for instance, CONFIG_SHELL is bash and it inherits a + # broken ls alias from the environment. This has actually + # happened. Such a system could not be considered "sane". + AC_MSG_ERROR([ls -t appears to fail. Make sure there is not a broken +alias in your environment]) + fi + + test "[$]2" = conftestfile + ) +then + # Ok. + : +else + AC_MSG_ERROR([newly created file is older than distributed files! +Check your system clock]) +fi +rm -f conftest* +AC_MSG_RESULT(yes)]) + +dnl AM_MISSING_PROG(NAME, PROGRAM, DIRECTORY) +dnl The program must properly implement --version. +AC_DEFUN(AM_MISSING_PROG, +[AC_MSG_CHECKING(for working $2) +# Run test in a subshell; some versions of sh will print an error if +# an executable is not found, even if stderr is redirected. 
+# Redirect stdin to placate older versions of autoconf. Sigh. +if ($2 --version) < /dev/null > /dev/null 2>&1; then + $1=$2 + AC_MSG_RESULT(found) +else + $1="$3/missing $2" + AC_MSG_RESULT(missing) +fi +AC_SUBST($1)]) + diff --git a/forester/archive/RIO/others/puzzle_mod/config.status b/forester/archive/RIO/others/puzzle_mod/config.status new file mode 100755 index 0000000..da58b56 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/config.status @@ -0,0 +1,179 @@ +#! /bin/sh +# Generated automatically by configure. +# Run this file to recreate the current configuration. +# This directory was configured as follows, +# on host forester.wustl.edu: +# +# ./configure +# +# Compiler output produced by configure, useful for debugging +# configure, is in ./config.log if it exists. + +ac_cs_usage="Usage: ./config.status [--recheck] [--version] [--help]" +for ac_option +do + case "$ac_option" in + -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + echo "running ${CONFIG_SHELL-/bin/sh} ./configure --no-create --no-recursion" + exec ${CONFIG_SHELL-/bin/sh} ./configure --no-create --no-recursion ;; + -version | --version | --versio | --versi | --vers | --ver | --ve | --v) + echo "./config.status generated by autoconf version 2.13" + exit 0 ;; + -help | --help | --hel | --he | --h) + echo "$ac_cs_usage"; exit 0 ;; + *) echo "$ac_cs_usage"; exit 1 ;; + esac +done + +ac_given_srcdir=. +ac_given_INSTALL="/usr/bin/install -c" + +trap 'rm -fr Makefile src/Makefile src/test doc/Makefile data/Makefile conftest*; exit 1' 1 2 15 + +# Protect against being on the right side of a sed subst in config.status. 
+sed 's/%@/@@/; s/@%/@@/; s/%g$/@g/; /@g$/s/[\\&%]/\\&/g; + s/@@/%@/; s/@@/@%/; s/@g$/%g/' > conftest.subs <<\CEOF +/^[ ]*VPATH[ ]*=[^:]*$/d + +s%@SHELL@%/bin/sh%g +s%@CFLAGS@%-g -O2%g +s%@CPPFLAGS@%%g +s%@CXXFLAGS@%%g +s%@FFLAGS@%%g +s%@DEFS@% -DPACKAGE=\"tree-puzzle\" -DVERSION=\"5.0\" -DHAVE_LIBM=1 -DSTDC_HEADERS=1 -DHAVE_LIMITS_H=1 %g +s%@LDFLAGS@%%g +s%@LIBS@%-lm %g +s%@exec_prefix@%${prefix}%g +s%@prefix@%/usr/local%g +s%@program_transform_name@%s,x,x,%g +s%@bindir@%${exec_prefix}/bin%g +s%@sbindir@%${exec_prefix}/sbin%g +s%@libexecdir@%${exec_prefix}/libexec%g +s%@datadir@%${prefix}/share%g +s%@sysconfdir@%${prefix}/etc%g +s%@sharedstatedir@%${prefix}/com%g +s%@localstatedir@%${prefix}/var%g +s%@libdir@%${exec_prefix}/lib%g +s%@includedir@%${prefix}/include%g +s%@oldincludedir@%/usr/include%g +s%@infodir@%${prefix}/info%g +s%@mandir@%${prefix}/man%g +s%@INSTALL_PROGRAM@%${INSTALL}%g +s%@INSTALL_SCRIPT@%${INSTALL_PROGRAM}%g +s%@INSTALL_DATA@%${INSTALL} -m 644%g +s%@PACKAGE@%tree-puzzle%g +s%@VERSION@%5.0%g +s%@ACLOCAL@%aclocal%g +s%@AUTOCONF@%autoconf%g +s%@AUTOMAKE@%automake%g +s%@AUTOHEADER@%autoheader%g +s%@MAKEINFO@%makeinfo%g +s%@SET_MAKE@%%g +s%@CC@%gcc%g +s%@MPICC0@%%g +s%@MPICC1@%%g +s%@MPICC2@%%g +s%@MPICC3@%%g +s%@MPICC4@%%g +s%@MPICC5@%%g +s%@MPICC@%%g +s%@MPILIBS@%%g +s%@MPIDEFS@%%g +s%@MPICFLAGS@%%g +s%@PPUZZLE@%%g +s%@CPP@%gcc -E%g + +CEOF + +# Split the substitutions into bite-sized pieces for seds with +# small command number limits, like on Digital OSF/1 and HP-UX. +ac_max_sed_cmds=90 # Maximum number of lines to put in a sed script. +ac_file=1 # Number of current file. +ac_beg=1 # First line for current file. +ac_end=$ac_max_sed_cmds # Line after last line for current file. +ac_more_lines=: +ac_sed_cmds="" +while $ac_more_lines; do + if test $ac_beg -gt 1; then + sed "1,${ac_beg}d; ${ac_end}q" conftest.subs > conftest.s$ac_file + else + sed "${ac_end}q" conftest.subs > conftest.s$ac_file + fi + if test ! 
-s conftest.s$ac_file; then + ac_more_lines=false + rm -f conftest.s$ac_file + else + if test -z "$ac_sed_cmds"; then + ac_sed_cmds="sed -f conftest.s$ac_file" + else + ac_sed_cmds="$ac_sed_cmds | sed -f conftest.s$ac_file" + fi + ac_file=`expr $ac_file + 1` + ac_beg=$ac_end + ac_end=`expr $ac_end + $ac_max_sed_cmds` + fi +done +if test -z "$ac_sed_cmds"; then + ac_sed_cmds=cat +fi + +CONFIG_FILES=${CONFIG_FILES-"Makefile src/Makefile src/test doc/Makefile data/Makefile"} +for ac_file in .. $CONFIG_FILES; do if test "x$ac_file" != x..; then + # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in". + case "$ac_file" in + *:*) ac_file_in=`echo "$ac_file"|sed 's%[^:]*:%%'` + ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;; + *) ac_file_in="${ac_file}.in" ;; + esac + + # Adjust a relative srcdir, top_srcdir, and INSTALL for subdirectories. + + # Remove last slash and all that follows it. Not all systems have dirname. + ac_dir=`echo $ac_file|sed 's%/[^/][^/]*$%%'` + if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then + # The file is in a subdirectory. + test ! -d "$ac_dir" && mkdir "$ac_dir" + ac_dir_suffix="/`echo $ac_dir|sed 's%^\./%%'`" + # A "../" for each directory in $ac_dir_suffix. + ac_dots=`echo $ac_dir_suffix|sed 's%/[^/]*%../%g'` + else + ac_dir_suffix= ac_dots= + fi + + case "$ac_given_srcdir" in + .) srcdir=. + if test -z "$ac_dots"; then top_srcdir=. + else top_srcdir=`echo $ac_dots|sed 's%/$%%'`; fi ;; + /*) srcdir="$ac_given_srcdir$ac_dir_suffix"; top_srcdir="$ac_given_srcdir" ;; + *) # Relative path. + srcdir="$ac_dots$ac_given_srcdir$ac_dir_suffix" + top_srcdir="$ac_dots$ac_given_srcdir" ;; + esac + + case "$ac_given_INSTALL" in + [/$]*) INSTALL="$ac_given_INSTALL" ;; + *) INSTALL="$ac_dots$ac_given_INSTALL" ;; + esac + + echo creating "$ac_file" + rm -f "$ac_file" + configure_input="Generated automatically from `echo $ac_file_in|sed 's%.*/%%'` by configure." 
+ case "$ac_file" in + *Makefile*) ac_comsub="1i\\ +# $configure_input" ;; + *) ac_comsub= ;; + esac + + ac_file_inputs=`echo $ac_file_in|sed -e "s%^%$ac_given_srcdir/%" -e "s%:% $ac_given_srcdir/%g"` + sed -e "$ac_comsub +s%@configure_input@%$configure_input%g +s%@srcdir@%$srcdir%g +s%@top_srcdir@%$top_srcdir%g +s%@INSTALL@%$INSTALL%g +" $ac_file_inputs | (eval "$ac_sed_cmds") > $ac_file +fi; done +rm -f conftest.s* + + + +exit 0 diff --git a/forester/archive/RIO/others/puzzle_mod/configure b/forester/archive/RIO/others/puzzle_mod/configure new file mode 100755 index 0000000..5d4db41 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/configure @@ -0,0 +1,2265 @@ +#! /bin/sh + +# Guess values for system-dependent variables and create Makefiles. +# Generated automatically using autoconf version 2.13 +# Copyright (C) 1992, 93, 94, 95, 96 Free Software Foundation, Inc. +# +# This configure script is free software; the Free Software Foundation +# gives unlimited permission to copy, distribute and modify it. + +# Defaults: +ac_help= +ac_default_prefix=/usr/local +# Any additions from configure.in: + +# Initialize some variables set by options. +# The variables have the same names as the options, with +# dashes changed to underlines. +build=NONE +cache_file=./config.cache +exec_prefix=NONE +host=NONE +no_create= +nonopt=NONE +no_recursion= +prefix=NONE +program_prefix=NONE +program_suffix=NONE +program_transform_name=s,x,x, +silent= +site= +srcdir= +target=NONE +verbose= +x_includes=NONE +x_libraries=NONE +bindir='${exec_prefix}/bin' +sbindir='${exec_prefix}/sbin' +libexecdir='${exec_prefix}/libexec' +datadir='${prefix}/share' +sysconfdir='${prefix}/etc' +sharedstatedir='${prefix}/com' +localstatedir='${prefix}/var' +libdir='${exec_prefix}/lib' +includedir='${prefix}/include' +oldincludedir='/usr/include' +infodir='${prefix}/info' +mandir='${prefix}/man' + +# Initialize some other variables. 
+subdirs= +MFLAGS= MAKEFLAGS= +SHELL=${CONFIG_SHELL-/bin/sh} +# Maximum number of lines to put in a shell here document. +ac_max_here_lines=12 + +ac_prev= +for ac_option +do + + # If the previous option needs an argument, assign it. + if test -n "$ac_prev"; then + eval "$ac_prev=\$ac_option" + ac_prev= + continue + fi + + case "$ac_option" in + -*=*) ac_optarg=`echo "$ac_option" | sed 's/[-_a-zA-Z0-9]*=//'` ;; + *) ac_optarg= ;; + esac + + # Accept the important Cygnus configure options, so we can diagnose typos. + + case "$ac_option" in + + -bindir | --bindir | --bindi | --bind | --bin | --bi) + ac_prev=bindir ;; + -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) + bindir="$ac_optarg" ;; + + -build | --build | --buil | --bui | --bu) + ac_prev=build ;; + -build=* | --build=* | --buil=* | --bui=* | --bu=*) + build="$ac_optarg" ;; + + -cache-file | --cache-file | --cache-fil | --cache-fi \ + | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) + ac_prev=cache_file ;; + -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ + | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) + cache_file="$ac_optarg" ;; + + -datadir | --datadir | --datadi | --datad | --data | --dat | --da) + ac_prev=datadir ;; + -datadir=* | --datadir=* | --datadi=* | --datad=* | --data=* | --dat=* \ + | --da=*) + datadir="$ac_optarg" ;; + + -disable-* | --disable-*) + ac_feature=`echo $ac_option|sed -e 's/-*disable-//'` + # Reject names that are not valid shell variable names. + if test -n "`echo $ac_feature| sed 's/[-a-zA-Z0-9_]//g'`"; then + { echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; } + fi + ac_feature=`echo $ac_feature| sed 's/-/_/g'` + eval "enable_${ac_feature}=no" ;; + + -enable-* | --enable-*) + ac_feature=`echo $ac_option|sed -e 's/-*enable-//' -e 's/=.*//'` + # Reject names that are not valid shell variable names. 
+ if test -n "`echo $ac_feature| sed 's/[-_a-zA-Z0-9]//g'`"; then + { echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; } + fi + ac_feature=`echo $ac_feature| sed 's/-/_/g'` + case "$ac_option" in + *=*) ;; + *) ac_optarg=yes ;; + esac + eval "enable_${ac_feature}='$ac_optarg'" ;; + + -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ + | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ + | --exec | --exe | --ex) + ac_prev=exec_prefix ;; + -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ + | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ + | --exec=* | --exe=* | --ex=*) + exec_prefix="$ac_optarg" ;; + + -gas | --gas | --ga | --g) + # Obsolete; use --with-gas. + with_gas=yes ;; + + -help | --help | --hel | --he) + # Omit some internal or obsolete options to make the list less imposing. + # This message is too long to be a string in the A/UX 3.1 sh. + cat << EOF +Usage: configure [options] [host] +Options: [defaults in brackets after descriptions] +Configuration: + --cache-file=FILE cache test results in FILE + --help print this message + --no-create do not create output files + --quiet, --silent do not print \`checking...' 
messages + --version print the version of autoconf that created configure +Directory and file names: + --prefix=PREFIX install architecture-independent files in PREFIX + [$ac_default_prefix] + --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX + [same as prefix] + --bindir=DIR user executables in DIR [EPREFIX/bin] + --sbindir=DIR system admin executables in DIR [EPREFIX/sbin] + --libexecdir=DIR program executables in DIR [EPREFIX/libexec] + --datadir=DIR read-only architecture-independent data in DIR + [PREFIX/share] + --sysconfdir=DIR read-only single-machine data in DIR [PREFIX/etc] + --sharedstatedir=DIR modifiable architecture-independent data in DIR + [PREFIX/com] + --localstatedir=DIR modifiable single-machine data in DIR [PREFIX/var] + --libdir=DIR object code libraries in DIR [EPREFIX/lib] + --includedir=DIR C header files in DIR [PREFIX/include] + --oldincludedir=DIR C header files for non-gcc in DIR [/usr/include] + --infodir=DIR info documentation in DIR [PREFIX/info] + --mandir=DIR man documentation in DIR [PREFIX/man] + --srcdir=DIR find the sources in DIR [configure dir or ..] 
+ --program-prefix=PREFIX prepend PREFIX to installed program names + --program-suffix=SUFFIX append SUFFIX to installed program names + --program-transform-name=PROGRAM + run sed PROGRAM on installed program names +EOF + cat << EOF +Host type: + --build=BUILD configure for building on BUILD [BUILD=HOST] + --host=HOST configure for HOST [guessed] + --target=TARGET configure for TARGET [TARGET=HOST] +Features and packages: + --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no) + --enable-FEATURE[=ARG] include FEATURE [ARG=yes] + --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] + --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) + --x-includes=DIR X include files are in DIR + --x-libraries=DIR X library files are in DIR +EOF + if test -n "$ac_help"; then + echo "--enable and --with options recognized:$ac_help" + fi + exit 0 ;; + + -host | --host | --hos | --ho) + ac_prev=host ;; + -host=* | --host=* | --hos=* | --ho=*) + host="$ac_optarg" ;; + + -includedir | --includedir | --includedi | --included | --include \ + | --includ | --inclu | --incl | --inc) + ac_prev=includedir ;; + -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ + | --includ=* | --inclu=* | --incl=* | --inc=*) + includedir="$ac_optarg" ;; + + -infodir | --infodir | --infodi | --infod | --info | --inf) + ac_prev=infodir ;; + -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) + infodir="$ac_optarg" ;; + + -libdir | --libdir | --libdi | --libd) + ac_prev=libdir ;; + -libdir=* | --libdir=* | --libdi=* | --libd=*) + libdir="$ac_optarg" ;; + + -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ + | --libexe | --libex | --libe) + ac_prev=libexecdir ;; + -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ + | --libexe=* | --libex=* | --libe=*) + libexecdir="$ac_optarg" ;; + + -localstatedir | --localstatedir | --localstatedi | --localstated \ + | --localstate | --localstat | --localsta | 
--localst \ + | --locals | --local | --loca | --loc | --lo) + ac_prev=localstatedir ;; + -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ + | --localstate=* | --localstat=* | --localsta=* | --localst=* \ + | --locals=* | --local=* | --loca=* | --loc=* | --lo=*) + localstatedir="$ac_optarg" ;; + + -mandir | --mandir | --mandi | --mand | --man | --ma | --m) + ac_prev=mandir ;; + -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) + mandir="$ac_optarg" ;; + + -nfp | --nfp | --nf) + # Obsolete; use --without-fp. + with_fp=no ;; + + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c) + no_create=yes ;; + + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) + no_recursion=yes ;; + + -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ + | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ + | --oldin | --oldi | --old | --ol | --o) + ac_prev=oldincludedir ;; + -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ + | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ + | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) + oldincludedir="$ac_optarg" ;; + + -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) + ac_prev=prefix ;; + -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) + prefix="$ac_optarg" ;; + + -program-prefix | --program-prefix | --program-prefi | --program-pref \ + | --program-pre | --program-pr | --program-p) + ac_prev=program_prefix ;; + -program-prefix=* | --program-prefix=* | --program-prefi=* \ + | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) + program_prefix="$ac_optarg" ;; + + -program-suffix | --program-suffix | --program-suffi | --program-suff \ + | --program-suf | --program-su | --program-s) + ac_prev=program_suffix ;; + -program-suffix=* | 
--program-suffix=* | --program-suffi=* \ + | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) + program_suffix="$ac_optarg" ;; + + -program-transform-name | --program-transform-name \ + | --program-transform-nam | --program-transform-na \ + | --program-transform-n | --program-transform- \ + | --program-transform | --program-transfor \ + | --program-transfo | --program-transf \ + | --program-trans | --program-tran \ + | --progr-tra | --program-tr | --program-t) + ac_prev=program_transform_name ;; + -program-transform-name=* | --program-transform-name=* \ + | --program-transform-nam=* | --program-transform-na=* \ + | --program-transform-n=* | --program-transform-=* \ + | --program-transform=* | --program-transfor=* \ + | --program-transfo=* | --program-transf=* \ + | --program-trans=* | --program-tran=* \ + | --progr-tra=* | --program-tr=* | --program-t=*) + program_transform_name="$ac_optarg" ;; + + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + silent=yes ;; + + -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) + ac_prev=sbindir ;; + -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ + | --sbi=* | --sb=*) + sbindir="$ac_optarg" ;; + + -sharedstatedir | --sharedstatedir | --sharedstatedi \ + | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ + | --sharedst | --shareds | --shared | --share | --shar \ + | --sha | --sh) + ac_prev=sharedstatedir ;; + -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ + | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ + | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ + | --sha=* | --sh=*) + sharedstatedir="$ac_optarg" ;; + + -site | --site | --sit) + ac_prev=site ;; + -site=* | --site=* | --sit=*) + site="$ac_optarg" ;; + + -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) + ac_prev=srcdir ;; + -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) + 
srcdir="$ac_optarg" ;; + + -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ + | --syscon | --sysco | --sysc | --sys | --sy) + ac_prev=sysconfdir ;; + -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ + | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*) + sysconfdir="$ac_optarg" ;; + + -target | --target | --targe | --targ | --tar | --ta | --t) + ac_prev=target ;; + -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) + target="$ac_optarg" ;; + + -v | -verbose | --verbose | --verbos | --verbo | --verb) + verbose=yes ;; + + -version | --version | --versio | --versi | --vers) + echo "configure generated by autoconf version 2.13" + exit 0 ;; + + -with-* | --with-*) + ac_package=`echo $ac_option|sed -e 's/-*with-//' -e 's/=.*//'` + # Reject names that are not valid shell variable names. + if test -n "`echo $ac_package| sed 's/[-_a-zA-Z0-9]//g'`"; then + { echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; } + fi + ac_package=`echo $ac_package| sed 's/-/_/g'` + case "$ac_option" in + *=*) ;; + *) ac_optarg=yes ;; + esac + eval "with_${ac_package}='$ac_optarg'" ;; + + -without-* | --without-*) + ac_package=`echo $ac_option|sed -e 's/-*without-//'` + # Reject names that are not valid shell variable names. + if test -n "`echo $ac_package| sed 's/[-a-zA-Z0-9_]//g'`"; then + { echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; } + fi + ac_package=`echo $ac_package| sed 's/-/_/g'` + eval "with_${ac_package}=no" ;; + + --x) + # Obsolete; use --with-x. 
+ with_x=yes ;; + + -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ + | --x-incl | --x-inc | --x-in | --x-i) + ac_prev=x_includes ;; + -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ + | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) + x_includes="$ac_optarg" ;; + + -x-libraries | --x-libraries | --x-librarie | --x-librari \ + | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) + ac_prev=x_libraries ;; + -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ + | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) + x_libraries="$ac_optarg" ;; + + -*) { echo "configure: error: $ac_option: invalid option; use --help to show usage" 1>&2; exit 1; } + ;; + + *) + if test -n "`echo $ac_option| sed 's/[-a-z0-9.]//g'`"; then + echo "configure: warning: $ac_option: invalid host type" 1>&2 + fi + if test "x$nonopt" != xNONE; then + { echo "configure: error: can only configure for one host and one target at a time" 1>&2; exit 1; } + fi + nonopt="$ac_option" + ;; + + esac +done + +if test -n "$ac_prev"; then + { echo "configure: error: missing argument to --`echo $ac_prev | sed 's/_/-/g'`" 1>&2; exit 1; } +fi + +trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15 + +# File descriptor usage: +# 0 standard input +# 1 file creation +# 2 errors and warnings +# 3 some systems may open it to /dev/tty +# 4 used on the Kubota Titan +# 6 checking for... messages and results +# 5 compiler messages saved in config.log +if test "$silent" = yes; then + exec 6>/dev/null +else + exec 6>&1 +fi +exec 5>./config.log + +echo "\ +This file contains any messages produced by compilers while +running configure, to aid debugging if configure makes a mistake. +" 1>&5 + +# Strip out --no-create and --no-recursion so they do not pile up. +# Also quote any args containing shell metacharacters. 
+ac_configure_args= +for ac_arg +do + case "$ac_arg" in + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c) ;; + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) ;; + *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*) + ac_configure_args="$ac_configure_args '$ac_arg'" ;; + *) ac_configure_args="$ac_configure_args $ac_arg" ;; + esac +done + +# NLS nuisances. +# Only set these to C if already set. These must not be set unconditionally +# because not all systems understand e.g. LANG=C (notably SCO). +# Fixing LC_MESSAGES prevents Solaris sh from translating var values in `set'! +# Non-C LC_CTYPE values break the ctype check. +if test "${LANG+set}" = set; then LANG=C; export LANG; fi +if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi +if test "${LC_MESSAGES+set}" = set; then LC_MESSAGES=C; export LC_MESSAGES; fi +if test "${LC_CTYPE+set}" = set; then LC_CTYPE=C; export LC_CTYPE; fi + +# confdefs.h avoids OS command line length limits that DEFS can exceed. +rm -rf conftest* confdefs.h +# AIX cpp loses on an empty file, so make sure it contains at least a newline. +echo > confdefs.h + +# A filename unique to this package, relative to the directory that +# configure is in, which we can look for to find out if srcdir is correct. +ac_unique_file=src/ml.h + +# Find the source files, if location was not specified. +if test -z "$srcdir"; then + ac_srcdir_defaulted=yes + # Try the directory containing this script, then its parent. + ac_prog=$0 + ac_confdir=`echo $ac_prog|sed 's%/[^/][^/]*$%%'` + test "x$ac_confdir" = "x$ac_prog" && ac_confdir=. + srcdir=$ac_confdir + if test ! -r $srcdir/$ac_unique_file; then + srcdir=.. + fi +else + ac_srcdir_defaulted=no +fi +if test ! -r $srcdir/$ac_unique_file; then + if test "$ac_srcdir_defaulted" = yes; then + { echo "configure: error: can not find sources in $ac_confdir or .." 
1>&2; exit 1; } + else + { echo "configure: error: can not find sources in $srcdir" 1>&2; exit 1; } + fi +fi +srcdir=`echo "${srcdir}" | sed 's%\([^/]\)/*$%\1%'` + +# Prefer explicitly selected file to automatically selected ones. +if test -z "$CONFIG_SITE"; then + if test "x$prefix" != xNONE; then + CONFIG_SITE="$prefix/share/config.site $prefix/etc/config.site" + else + CONFIG_SITE="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site" + fi +fi +for ac_site_file in $CONFIG_SITE; do + if test -r "$ac_site_file"; then + echo "loading site script $ac_site_file" + . "$ac_site_file" + fi +done + + +ac_ext=c +# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. +ac_cpp='$CPP $CPPFLAGS' +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +cross_compiling=$ac_cv_prog_cc_cross + +ac_exeext= +ac_objext=o +if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then + # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu. + if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then + ac_n= ac_c=' +' ac_t=' ' + else + ac_n=-n ac_c= ac_t= + fi +else + ac_n= ac_c='\c' ac_t= +fi + + + +ac_aux_dir= +for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do + if test -f $ac_dir/install-sh; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install-sh -c" + break + elif test -f $ac_dir/install.sh; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install.sh -c" + break + fi +done +if test -z "$ac_aux_dir"; then + { echo "configure: error: can not find install-sh or install.sh in $srcdir $srcdir/.. $srcdir/../.." 1>&2; exit 1; } +fi +ac_config_guess=$ac_aux_dir/config.guess +ac_config_sub=$ac_aux_dir/config.sub +ac_configure=$ac_aux_dir/configure # This should be Cygnus configure. + +# Find a good install program. We prefer a C program (faster), +# so one script is as good as another. 
But avoid the broken or +# incompatible versions: +# SysV /etc/install, /usr/sbin/install +# SunOS /usr/etc/install +# IRIX /sbin/install +# AIX /bin/install +# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag +# AFS /usr/afsws/bin/install, which mishandles nonexistent args +# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff" +# ./install, which can be erroneously created by make from ./install.sh. +echo $ac_n "checking for a BSD compatible install""... $ac_c" 1>&6 +echo "configure:550: checking for a BSD compatible install" >&5 +if test -z "$INSTALL"; then +if eval "test \"`echo '$''{'ac_cv_path_install'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + IFS="${IFS= }"; ac_save_IFS="$IFS"; IFS=":" + for ac_dir in $PATH; do + # Account for people who put trailing slashes in PATH elements. + case "$ac_dir/" in + /|./|.//|/etc/*|/usr/sbin/*|/usr/etc/*|/sbin/*|/usr/afsws/bin/*|/usr/ucb/*) ;; + *) + # OSF1 and SCO ODT 3.0 have their own names for install. + # Don't use installbsd from OSF since it installs stuff as root + # by default. + for ac_prog in ginstall scoinst install; do + if test -f $ac_dir/$ac_prog; then + if test $ac_prog = install && + grep dspmsg $ac_dir/$ac_prog >/dev/null 2>&1; then + # AIX install. It has an incompatible calling convention. + : + else + ac_cv_path_install="$ac_dir/$ac_prog -c" + break 2 + fi + fi + done + ;; + esac + done + IFS="$ac_save_IFS" + +fi + if test "${ac_cv_path_install+set}" = set; then + INSTALL="$ac_cv_path_install" + else + # As a last resort, use the slow shell script. We don't cache a + # path for INSTALL within a source directory, because that will + # break other packages using the cache if that directory is + # removed, or if the path is relative. + INSTALL="$ac_install_sh" + fi +fi +echo "$ac_t""$INSTALL" 1>&6 + +# Use test -z because SunOS4 sh mishandles braces in ${var-val}. +# It thinks the first close brace ends the variable substitution. 
+test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}' + +test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL_PROGRAM}' + +test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644' + +echo $ac_n "checking whether build environment is sane""... $ac_c" 1>&6 +echo "configure:603: checking whether build environment is sane" >&5 +# Just in case +sleep 1 +echo timestamp > conftestfile +# Do `set' in a subshell so we don't clobber the current shell's +# arguments. Must try -L first in case configure is actually a +# symlink; some systems play weird games with the mod time of symlinks +# (eg FreeBSD returns the mod time of the symlink's containing +# directory). +if ( + set X `ls -Lt $srcdir/configure conftestfile 2> /dev/null` + if test "$*" = "X"; then + # -L didn't work. + set X `ls -t $srcdir/configure conftestfile` + fi + if test "$*" != "X $srcdir/configure conftestfile" \ + && test "$*" != "X conftestfile $srcdir/configure"; then + + # If neither matched, then we have a broken ls. This can happen + # if, for instance, CONFIG_SHELL is bash and it inherits a + # broken ls alias from the environment. This has actually + # happened. Such a system could not be considered "sane". + { echo "configure: error: ls -t appears to fail. Make sure there is not a broken +alias in your environment" 1>&2; exit 1; } + fi + + test "$2" = conftestfile + ) +then + # Ok. + : +else + { echo "configure: error: newly created file is older than distributed files! +Check your system clock" 1>&2; exit 1; } +fi +rm -f conftest* +echo "$ac_t""yes" 1>&6 +if test "$program_transform_name" = s,x,x,; then + program_transform_name= +else + # Double any \ or $. echo might interpret backslashes. 
+ cat <<\EOF_SED > conftestsed +s,\\,\\\\,g; s,\$,$$,g +EOF_SED + program_transform_name="`echo $program_transform_name|sed -f conftestsed`" + rm -f conftestsed +fi +test "$program_prefix" != NONE && + program_transform_name="s,^,${program_prefix},; $program_transform_name" +# Use a double $ so make ignores it. +test "$program_suffix" != NONE && + program_transform_name="s,\$\$,${program_suffix},; $program_transform_name" + +# sed with no file args requires a program. +test "$program_transform_name" = "" && program_transform_name="s,x,x," + +echo $ac_n "checking whether ${MAKE-make} sets \${MAKE}""... $ac_c" 1>&6 +echo "configure:660: checking whether ${MAKE-make} sets \${MAKE}" >&5 +set dummy ${MAKE-make}; ac_make=`echo "$2" | sed 'y%./+-%__p_%'` +if eval "test \"`echo '$''{'ac_cv_prog_make_${ac_make}_set'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftestmake <<\EOF +all: + @echo 'ac_maketemp="${MAKE}"' +EOF +# GNU make sometimes prints "make[1]: Entering...", which would confuse us. +eval `${MAKE-make} -f conftestmake 2>/dev/null | grep temp=` +if test -n "$ac_maketemp"; then + eval ac_cv_prog_make_${ac_make}_set=yes +else + eval ac_cv_prog_make_${ac_make}_set=no +fi +rm -f conftestmake +fi +if eval "test \"`echo '$ac_cv_prog_make_'${ac_make}_set`\" = yes"; then + echo "$ac_t""yes" 1>&6 + SET_MAKE= +else + echo "$ac_t""no" 1>&6 + SET_MAKE="MAKE=${MAKE-make}" +fi + + +PACKAGE=tree-puzzle + +VERSION=5.0 + +if test "`cd $srcdir && pwd`" != "`pwd`" && test -f $srcdir/config.status; then + { echo "configure: error: source directory already configured; run "make distclean" there first" 1>&2; exit 1; } +fi +cat >> confdefs.h <> confdefs.h <&6 +echo "configure:706: checking for working aclocal" >&5 +# Run test in a subshell; some versions of sh will print an error if +# an executable is not found, even if stderr is redirected. +# Redirect stdin to placate older versions of autoconf. Sigh. 
+if (aclocal --version) < /dev/null > /dev/null 2>&1; then + ACLOCAL=aclocal + echo "$ac_t""found" 1>&6 +else + ACLOCAL="$missing_dir/missing aclocal" + echo "$ac_t""missing" 1>&6 +fi + +echo $ac_n "checking for working autoconf""... $ac_c" 1>&6 +echo "configure:719: checking for working autoconf" >&5 +# Run test in a subshell; some versions of sh will print an error if +# an executable is not found, even if stderr is redirected. +# Redirect stdin to placate older versions of autoconf. Sigh. +if (autoconf --version) < /dev/null > /dev/null 2>&1; then + AUTOCONF=autoconf + echo "$ac_t""found" 1>&6 +else + AUTOCONF="$missing_dir/missing autoconf" + echo "$ac_t""missing" 1>&6 +fi + +echo $ac_n "checking for working automake""... $ac_c" 1>&6 +echo "configure:732: checking for working automake" >&5 +# Run test in a subshell; some versions of sh will print an error if +# an executable is not found, even if stderr is redirected. +# Redirect stdin to placate older versions of autoconf. Sigh. +if (automake --version) < /dev/null > /dev/null 2>&1; then + AUTOMAKE=automake + echo "$ac_t""found" 1>&6 +else + AUTOMAKE="$missing_dir/missing automake" + echo "$ac_t""missing" 1>&6 +fi + +echo $ac_n "checking for working autoheader""... $ac_c" 1>&6 +echo "configure:745: checking for working autoheader" >&5 +# Run test in a subshell; some versions of sh will print an error if +# an executable is not found, even if stderr is redirected. +# Redirect stdin to placate older versions of autoconf. Sigh. +if (autoheader --version) < /dev/null > /dev/null 2>&1; then + AUTOHEADER=autoheader + echo "$ac_t""found" 1>&6 +else + AUTOHEADER="$missing_dir/missing autoheader" + echo "$ac_t""missing" 1>&6 +fi + +echo $ac_n "checking for working makeinfo""... $ac_c" 1>&6 +echo "configure:758: checking for working makeinfo" >&5 +# Run test in a subshell; some versions of sh will print an error if +# an executable is not found, even if stderr is redirected. 
+# Redirect stdin to placate older versions of autoconf. Sigh. +if (makeinfo --version) < /dev/null > /dev/null 2>&1; then + MAKEINFO=makeinfo + echo "$ac_t""found" 1>&6 +else + MAKEINFO="$missing_dir/missing makeinfo" + echo "$ac_t""missing" 1>&6 +fi + + + +# Extract the first word of "gcc", so it can be a program name with args. +set dummy gcc; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:775: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_prog_CC="gcc" + break + fi + done + IFS="$ac_save_ifs" +fi +fi +CC="$ac_cv_prog_CC" +if test -n "$CC"; then + echo "$ac_t""$CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +if test -z "$CC"; then + # Extract the first word of "cc", so it can be a program name with args. +set dummy cc; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:805: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_prog_rejected=no + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + if test "$ac_dir/$ac_word" = "/usr/ucb/cc"; then + ac_prog_rejected=yes + continue + fi + ac_cv_prog_CC="cc" + break + fi + done + IFS="$ac_save_ifs" +if test $ac_prog_rejected = yes; then + # We found a bogon in the path, so make sure we never use it. + set dummy $ac_cv_prog_CC + shift + if test $# -gt 0; then + # We chose a different compiler from the bogus one. 
+ # However, it has the same basename, so the bogon will be chosen + # first if we set CC to just the basename; use the full file name. + shift + set dummy "$ac_dir/$ac_word" "$@" + shift + ac_cv_prog_CC="$@" + fi +fi +fi +fi +CC="$ac_cv_prog_CC" +if test -n "$CC"; then + echo "$ac_t""$CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + + if test -z "$CC"; then + case "`uname -s`" in + *win32* | *WIN32*) + # Extract the first word of "cl", so it can be a program name with args. +set dummy cl; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:856: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_prog_CC="cl" + break + fi + done + IFS="$ac_save_ifs" +fi +fi +CC="$ac_cv_prog_CC" +if test -n "$CC"; then + echo "$ac_t""$CC" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + ;; + esac + fi + test -z "$CC" && { echo "configure: error: no acceptable cc found in \$PATH" 1>&2; exit 1; } +fi + +echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works""... $ac_c" 1>&6 +echo "configure:888: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5 + +ac_ext=c +# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. 
+ac_cpp='$CPP $CPPFLAGS' +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +cross_compiling=$ac_cv_prog_cc_cross + +cat > conftest.$ac_ext << EOF + +#line 899 "configure" +#include "confdefs.h" + +main(){return(0);} +EOF +if { (eval echo configure:904: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + ac_cv_prog_cc_works=yes + # If we can't run a trivial program, we are probably using a cross compiler. + if (./conftest; exit) 2>/dev/null; then + ac_cv_prog_cc_cross=no + else + ac_cv_prog_cc_cross=yes + fi +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + ac_cv_prog_cc_works=no +fi +rm -fr conftest* +ac_ext=c +# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options. +ac_cpp='$CPP $CPPFLAGS' +ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5' +ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5' +cross_compiling=$ac_cv_prog_cc_cross + +echo "$ac_t""$ac_cv_prog_cc_works" 1>&6 +if test $ac_cv_prog_cc_works = no; then + { echo "configure: error: installation or configuration problem: C compiler cannot create executables." 1>&2; exit 1; } +fi +echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler""... $ac_c" 1>&6 +echo "configure:930: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5 +echo "$ac_t""$ac_cv_prog_cc_cross" 1>&6 +cross_compiling=$ac_cv_prog_cc_cross + +echo $ac_n "checking whether we are using GNU C""... 
$ac_c" 1>&6 +echo "configure:935: checking whether we are using GNU C" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_gcc'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.c <&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then + ac_cv_prog_gcc=yes +else + ac_cv_prog_gcc=no +fi +fi + +echo "$ac_t""$ac_cv_prog_gcc" 1>&6 + +if test $ac_cv_prog_gcc = yes; then + GCC=yes +else + GCC= +fi + +ac_test_CFLAGS="${CFLAGS+set}" +ac_save_CFLAGS="$CFLAGS" +CFLAGS= +echo $ac_n "checking whether ${CC-cc} accepts -g""... $ac_c" 1>&6 +echo "configure:963: checking whether ${CC-cc} accepts -g" >&5 +if eval "test \"`echo '$''{'ac_cv_prog_cc_g'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + echo 'void f(){}' > conftest.c +if test -z "`${CC-cc} -g -c conftest.c 2>&1`"; then + ac_cv_prog_cc_g=yes +else + ac_cv_prog_cc_g=no +fi +rm -f conftest* + +fi + +echo "$ac_t""$ac_cv_prog_cc_g" 1>&6 +if test "$ac_test_CFLAGS" = set; then + CFLAGS="$ac_save_CFLAGS" +elif test $ac_cv_prog_cc_g = yes; then + if test "$GCC" = yes; then + CFLAGS="-g -O2" + else + CFLAGS="-g" + fi +else + if test "$GCC" = yes; then + CFLAGS="-O2" + else + CFLAGS= + fi +fi + +if test "x$CC" != xcc; then + echo $ac_n "checking whether $CC and cc understand -c and -o together""... $ac_c" 1>&6 +echo "configure:996: checking whether $CC and cc understand -c and -o together" >&5 +else + echo $ac_n "checking whether cc understands -c and -o together""... $ac_c" 1>&6 +echo "configure:999: checking whether cc understands -c and -o together" >&5 +fi +set dummy $CC; ac_cc="`echo $2 | + sed -e 's/[^a-zA-Z0-9_]/_/g' -e 's/^[0-9]/_/'`" +if eval "test \"`echo '$''{'ac_cv_prog_cc_${ac_cc}_c_o'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + echo 'foo(){}' > conftest.c +# Make sure it works both with $CC and with simple cc. +# We do the test twice because some compilers refuse to overwrite an +# existing .o file with -o, though they will create one. 
+ac_try='${CC-cc} -c conftest.c -o conftest.o 1>&5' +if { (eval echo configure:1011: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } && + test -f conftest.o && { (eval echo configure:1012: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; +then + eval ac_cv_prog_cc_${ac_cc}_c_o=yes + if test "x$CC" != xcc; then + # Test first that cc exists at all. + if { ac_try='cc -c conftest.c 1>&5'; { (eval echo configure:1017: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; }; then + ac_try='cc -c conftest.c -o conftest.o 1>&5' + if { (eval echo configure:1019: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } && + test -f conftest.o && { (eval echo configure:1020: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; + then + # cc works too. + : + else + # cc exists but doesn't like -o. + eval ac_cv_prog_cc_${ac_cc}_c_o=no + fi + fi + fi +else + eval ac_cv_prog_cc_${ac_cc}_c_o=no +fi +rm -f conftest* + +fi +if eval "test \"`echo '$ac_cv_prog_cc_'${ac_cc}_c_o`\" = yes"; then + echo "$ac_t""yes" 1>&6 +else + echo "$ac_t""no" 1>&6 + cat >> confdefs.h <<\EOF +#define NO_MINUS_C_MINUS_O 1 +EOF + +fi + +# Find a good install program. We prefer a C program (faster), +# so one script is as good as another. But avoid the broken or +# incompatible versions: +# SysV /etc/install, /usr/sbin/install +# SunOS /usr/etc/install +# IRIX /sbin/install +# AIX /bin/install +# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag +# AFS /usr/afsws/bin/install, which mishandles nonexistent args +# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff" +# ./install, which can be erroneously created by make from ./install.sh. +echo $ac_n "checking for a BSD compatible install""... 
$ac_c" 1>&6 +echo "configure:1058: checking for a BSD compatible install" >&5 +if test -z "$INSTALL"; then +if eval "test \"`echo '$''{'ac_cv_path_install'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + IFS="${IFS= }"; ac_save_IFS="$IFS"; IFS=":" + for ac_dir in $PATH; do + # Account for people who put trailing slashes in PATH elements. + case "$ac_dir/" in + /|./|.//|/etc/*|/usr/sbin/*|/usr/etc/*|/sbin/*|/usr/afsws/bin/*|/usr/ucb/*) ;; + *) + # OSF1 and SCO ODT 3.0 have their own names for install. + # Don't use installbsd from OSF since it installs stuff as root + # by default. + for ac_prog in ginstall scoinst install; do + if test -f $ac_dir/$ac_prog; then + if test $ac_prog = install && + grep dspmsg $ac_dir/$ac_prog >/dev/null 2>&1; then + # AIX install. It has an incompatible calling convention. + : + else + ac_cv_path_install="$ac_dir/$ac_prog -c" + break 2 + fi + fi + done + ;; + esac + done + IFS="$ac_save_IFS" + +fi + if test "${ac_cv_path_install+set}" = set; then + INSTALL="$ac_cv_path_install" + else + # As a last resort, use the slow shell script. We don't cache a + # path for INSTALL within a source directory, because that will + # break other packages using the cache if that directory is + # removed, or if the path is relative. + INSTALL="$ac_install_sh" + fi +fi +echo "$ac_t""$INSTALL" 1>&6 + +# Use test -z because SunOS4 sh mishandles braces in ${var-val}. +# It thinks the first close brace ends the variable substitution. +test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}' + +test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL_PROGRAM}' + +test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644' + +echo $ac_n "checking whether ${MAKE-make} sets \${MAKE}""... 
$ac_c" 1>&6 +echo "configure:1111: checking whether ${MAKE-make} sets \${MAKE}" >&5 +set dummy ${MAKE-make}; ac_make=`echo "$2" | sed 'y%./+-%__p_%'` +if eval "test \"`echo '$''{'ac_cv_prog_make_${ac_make}_set'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftestmake <<\EOF +all: + @echo 'ac_maketemp="${MAKE}"' +EOF +# GNU make sometimes prints "make[1]: Entering...", which would confuse us. +eval `${MAKE-make} -f conftestmake 2>/dev/null | grep temp=` +if test -n "$ac_maketemp"; then + eval ac_cv_prog_make_${ac_make}_set=yes +else + eval ac_cv_prog_make_${ac_make}_set=no +fi +rm -f conftestmake +fi +if eval "test \"`echo '$ac_cv_prog_make_'${ac_make}_set`\" = yes"; then + echo "$ac_t""yes" 1>&6 + SET_MAKE= +else + echo "$ac_t""no" 1>&6 + SET_MAKE="MAKE=${MAKE-make}" +fi + + + + + +if test "$MPICC" != "" ; then + # Extract the first word of "$MPICC", so it can be a program name with args. +set dummy $MPICC; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:1145: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_path_MPICC0'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + case "$MPICC0" in + /*) + ac_cv_path_MPICC0="$MPICC0" # Let the user override the test with a path. + ;; + ?:/*) + ac_cv_path_MPICC0="$MPICC0" # Let the user override the test with a dos path. + ;; + *) + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_path_MPICC0="$ac_dir/$ac_word" + break + fi + done + IFS="$ac_save_ifs" + ;; +esac +fi +MPICC0="$ac_cv_path_MPICC0" +if test -n "$MPICC0"; then + echo "$ac_t""$MPICC0" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +fi +# Extract the first word of "mpcc", so it can be a program name with args. +set dummy mpcc; ac_word=$2 +echo $ac_n "checking for $ac_word""... 
$ac_c" 1>&6 +echo "configure:1181: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_path_MPICC1'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + case "$MPICC1" in + /*) + ac_cv_path_MPICC1="$MPICC1" # Let the user override the test with a path. + ;; + ?:/*) + ac_cv_path_MPICC1="$MPICC1" # Let the user override the test with a dos path. + ;; + *) + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_path_MPICC1="$ac_dir/$ac_word" + break + fi + done + IFS="$ac_save_ifs" + ;; +esac +fi +MPICC1="$ac_cv_path_MPICC1" +if test -n "$MPICC1"; then + echo "$ac_t""$MPICC1" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +# Extract the first word of "hcc", so it can be a program name with args. +set dummy hcc; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:1216: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_path_MPICC2'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + case "$MPICC2" in + /*) + ac_cv_path_MPICC2="$MPICC2" # Let the user override the test with a path. + ;; + ?:/*) + ac_cv_path_MPICC2="$MPICC2" # Let the user override the test with a dos path. + ;; + *) + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_path_MPICC2="$ac_dir/$ac_word" + break + fi + done + IFS="$ac_save_ifs" + ;; +esac +fi +MPICC2="$ac_cv_path_MPICC2" +if test -n "$MPICC2"; then + echo "$ac_t""$MPICC2" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +# Extract the first word of "mpicc", so it can be a program name with args. +set dummy mpicc; ac_word=$2 +echo $ac_n "checking for $ac_word""... 
$ac_c" 1>&6 +echo "configure:1251: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_path_MPICC3'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + case "$MPICC3" in + /*) + ac_cv_path_MPICC3="$MPICC3" # Let the user override the test with a path. + ;; + ?:/*) + ac_cv_path_MPICC3="$MPICC3" # Let the user override the test with a dos path. + ;; + *) + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_path_MPICC3="$ac_dir/$ac_word" + break + fi + done + IFS="$ac_save_ifs" + ;; +esac +fi +MPICC3="$ac_cv_path_MPICC3" +if test -n "$MPICC3"; then + echo "$ac_t""$MPICC3" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +# Extract the first word of "mpicc_lam", so it can be a program name with args. +set dummy mpicc_lam; ac_word=$2 +echo $ac_n "checking for $ac_word""... $ac_c" 1>&6 +echo "configure:1286: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_path_MPICC4'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + case "$MPICC4" in + /*) + ac_cv_path_MPICC4="$MPICC4" # Let the user override the test with a path. + ;; + ?:/*) + ac_cv_path_MPICC4="$MPICC4" # Let the user override the test with a dos path. + ;; + *) + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_path_MPICC4="$ac_dir/$ac_word" + break + fi + done + IFS="$ac_save_ifs" + ;; +esac +fi +MPICC4="$ac_cv_path_MPICC4" +if test -n "$MPICC4"; then + echo "$ac_t""$MPICC4" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + +# Extract the first word of "mpicc_mpich", so it can be a program name with args. +set dummy mpicc_mpich; ac_word=$2 +echo $ac_n "checking for $ac_word""... 
$ac_c" 1>&6 +echo "configure:1321: checking for $ac_word" >&5 +if eval "test \"`echo '$''{'ac_cv_path_MPICC5'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + case "$MPICC5" in + /*) + ac_cv_path_MPICC5="$MPICC5" # Let the user override the test with a path. + ;; + ?:/*) + ac_cv_path_MPICC5="$MPICC5" # Let the user override the test with a dos path. + ;; + *) + IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":" + ac_dummy="$PATH" + for ac_dir in $ac_dummy; do + test -z "$ac_dir" && ac_dir=. + if test -f $ac_dir/$ac_word; then + ac_cv_path_MPICC5="$ac_dir/$ac_word" + break + fi + done + IFS="$ac_save_ifs" + ;; +esac +fi +MPICC5="$ac_cv_path_MPICC5" +if test -n "$MPICC5"; then + echo "$ac_t""$MPICC5" 1>&6 +else + echo "$ac_t""no" 1>&6 +fi + + + if test "$MPICC0" != "" ; then + if test "$MPICCSET" = "" ; then +cat > conftest.c < +int main (int argc, char **argv) +{ +MPI_Init(&argc,&argv); +MPI_Finalize(); +exit(0); +} +EOF + + +MPICC=$MPICC0 + + if test "$MPICC" != "" ; then + echo $ac_n "checking whether $MPICC works as MPI compiler""... $ac_c" 1>&6 +echo "configure:1371: checking whether $MPICC works as MPI compiler" >&5 + $MPICC conftest.c -o conftest > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$MPICC + MPILIBS= + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + echo $ac_n "checking whether $MPICC needs -lmpi""... $ac_c" 1>&6 +echo "configure:1382: checking whether $MPICC needs -lmpi" >&5 + $MPICC conftest.c -o conftest -lmpi > /dev/null 2>&1 + if test $? 
= 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$PCC + MPILIBS=-lmpi + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + MPICC= + MPILIBS= + MPICCSET= + PPUZZLE= + fi + fi + fi + rm -f conftest* + fi + fi + if test "$MPICC1" != "" ; then + if test "$MPICCSET" = "" ; then +cat > conftest.c < +int main (int argc, char **argv) +{ +MPI_Init(&argc,&argv); +MPI_Finalize(); +exit(0); +} +EOF + + +MPICC=$MPICC1 + + if test "$MPICC" != "" ; then + echo $ac_n "checking whether $MPICC works as MPI compiler""... $ac_c" 1>&6 +echo "configure:1419: checking whether $MPICC works as MPI compiler" >&5 + $MPICC conftest.c -o conftest > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$MPICC + MPILIBS= + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + echo $ac_n "checking whether $MPICC needs -lmpi""... $ac_c" 1>&6 +echo "configure:1430: checking whether $MPICC needs -lmpi" >&5 + $MPICC conftest.c -o conftest -lmpi > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$PCC + MPILIBS=-lmpi + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + MPICC= + MPILIBS= + MPICCSET= + PPUZZLE= + fi + fi + fi + rm -f conftest* + fi + fi + if test "$MPICC2" != "" ; then + if test "$MPICCSET" = "" ; then +cat > conftest.c < +int main (int argc, char **argv) +{ +MPI_Init(&argc,&argv); +MPI_Finalize(); +exit(0); +} +EOF + + +MPICC=$MPICC2 + + if test "$MPICC" != "" ; then + echo $ac_n "checking whether $MPICC works as MPI compiler""... $ac_c" 1>&6 +echo "configure:1467: checking whether $MPICC works as MPI compiler" >&5 + $MPICC conftest.c -o conftest > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$MPICC + MPILIBS= + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + echo $ac_n "checking whether $MPICC needs -lmpi""... $ac_c" 1>&6 +echo "configure:1478: checking whether $MPICC needs -lmpi" >&5 + $MPICC conftest.c -o conftest -lmpi > /dev/null 2>&1 + if test $? 
= 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$PCC + MPILIBS=-lmpi + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + MPICC= + MPILIBS= + MPICCSET= + PPUZZLE= + fi + fi + fi + rm -f conftest* + fi + fi + if test "$MPICC3" != "" ; then + if test "$MPICCSET" = "" ; then +cat > conftest.c < +int main (int argc, char **argv) +{ +MPI_Init(&argc,&argv); +MPI_Finalize(); +exit(0); +} +EOF + + +MPICC=$MPICC3 + + if test "$MPICC" != "" ; then + echo $ac_n "checking whether $MPICC works as MPI compiler""... $ac_c" 1>&6 +echo "configure:1515: checking whether $MPICC works as MPI compiler" >&5 + $MPICC conftest.c -o conftest > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$MPICC + MPILIBS= + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + echo $ac_n "checking whether $MPICC needs -lmpi""... $ac_c" 1>&6 +echo "configure:1526: checking whether $MPICC needs -lmpi" >&5 + $MPICC conftest.c -o conftest -lmpi > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$PCC + MPILIBS=-lmpi + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + MPICC= + MPILIBS= + MPICCSET= + PPUZZLE= + fi + fi + fi + rm -f conftest* + fi + fi + if test "$MPICC4" != "" ; then + if test "$MPICCSET" = "" ; then +cat > conftest.c < +int main (int argc, char **argv) +{ +MPI_Init(&argc,&argv); +MPI_Finalize(); +exit(0); +} +EOF + + +MPICC=$MPICC4 + + if test "$MPICC" != "" ; then + echo $ac_n "checking whether $MPICC works as MPI compiler""... $ac_c" 1>&6 +echo "configure:1563: checking whether $MPICC works as MPI compiler" >&5 + $MPICC conftest.c -o conftest > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$MPICC + MPILIBS= + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + echo $ac_n "checking whether $MPICC needs -lmpi""... $ac_c" 1>&6 +echo "configure:1574: checking whether $MPICC needs -lmpi" >&5 + $MPICC conftest.c -o conftest -lmpi > /dev/null 2>&1 + if test $? 
= 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$PCC + MPILIBS=-lmpi + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + MPICC= + MPILIBS= + MPICCSET= + PPUZZLE= + fi + fi + fi + rm -f conftest* + fi + fi + if test "$MPICC5" != "" ; then + if test "$MPICCSET" = "" ; then +cat > conftest.c < +int main (int argc, char **argv) +{ +MPI_Init(&argc,&argv); +MPI_Finalize(); +exit(0); +} +EOF + + +MPICC=$MPICC5 + + if test "$MPICC" != "" ; then + echo $ac_n "checking whether $MPICC works as MPI compiler""... $ac_c" 1>&6 +echo "configure:1611: checking whether $MPICC works as MPI compiler" >&5 + $MPICC conftest.c -o conftest > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$MPICC + MPILIBS= + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + echo $ac_n "checking whether $MPICC needs -lmpi""... $ac_c" 1>&6 +echo "configure:1622: checking whether $MPICC needs -lmpi" >&5 + $MPICC conftest.c -o conftest -lmpi > /dev/null 2>&1 + if test $? = 0 ; then + echo "$ac_t""yes" 1>&6 + #MPICC=$PCC + MPILIBS=-lmpi + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + echo "$ac_t""no" 1>&6 + MPICC= + MPILIBS= + MPICCSET= + PPUZZLE= + fi + fi + fi + rm -f conftest* + fi + fi + +ac_cv_prog_MPICC=$MPICC + + + + + + + +echo $ac_n "checking for main in -lm""... 
$ac_c" 1>&6 +echo "configure:1652: checking for main in -lm" >&5 +ac_lib_var=`echo m'_'main | sed 'y%./+-%__p_%'` +if eval "test \"`echo '$''{'ac_cv_lib_$ac_lib_var'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + ac_save_LIBS="$LIBS" +LIBS="-lm $LIBS" +cat > conftest.$ac_ext <&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=yes" +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_lib_$ac_lib_var=no" +fi +rm -f conftest* +LIBS="$ac_save_LIBS" + +fi +if eval "test \"`echo '$ac_cv_lib_'$ac_lib_var`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_lib=HAVE_LIB`echo m | sed -e 's/[^a-zA-Z0-9_]/_/g' \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/'` + cat >> confdefs.h <&6 +fi + + +echo $ac_n "checking how to run the C preprocessor""... $ac_c" 1>&6 +echo "configure:1696: checking how to run the C preprocessor" >&5 +# On Suns, sometimes $CPP names a directory. +if test -n "$CPP" && test -d "$CPP"; then + CPP= +fi +if test -z "$CPP"; then +if eval "test \"`echo '$''{'ac_cv_prog_CPP'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + # This must be in double quotes, not single quotes, because CPP may get + # substituted into the Makefile and "${CC-cc}" will confuse make. + CPP="${CC-cc} -E" + # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. 
+ cat > conftest.$ac_ext < +Syntax Error +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1717: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + : +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + CPP="${CC-cc} -E -traditional-cpp" + cat > conftest.$ac_ext < +Syntax Error +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1734: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + : +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + CPP="${CC-cc} -nologo -E" + cat > conftest.$ac_ext < +Syntax Error +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1751: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + : +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + CPP=/lib/cpp +fi +rm -f conftest* +fi +rm -f conftest* +fi +rm -f conftest* + ac_cv_prog_CPP="$CPP" +fi + CPP="$ac_cv_prog_CPP" +else + ac_cv_prog_CPP="$CPP" +fi +echo "$ac_t""$CPP" 1>&6 + +echo $ac_n "checking for ANSI C header files""... 
$ac_c" 1>&6 +echo "configure:1776: checking for ANSI C header files" >&5 +if eval "test \"`echo '$''{'ac_cv_header_stdc'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +#include +#include +#include +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1789: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + rm -rf conftest* + ac_cv_header_stdc=yes +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ac_cv_header_stdc=no +fi +rm -f conftest* + +if test $ac_cv_header_stdc = yes; then + # SunOS 4.x string.h does not declare mem*, contrary to ANSI. +cat > conftest.$ac_ext < +EOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + egrep "memchr" >/dev/null 2>&1; then + : +else + rm -rf conftest* + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. +cat > conftest.$ac_ext < +EOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + egrep "free" >/dev/null 2>&1; then + : +else + rm -rf conftest* + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. +if test "$cross_compiling" = yes; then + : +else + cat > conftest.$ac_ext < +#define ISLOWER(c) ('a' <= (c) && (c) <= 'z') +#define TOUPPER(c) (ISLOWER(c) ? 
'A' + ((c) - 'a') : (c)) +#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) +int main () { int i; for (i = 0; i < 256; i++) +if (XOR (islower (i), ISLOWER (i)) || toupper (i) != TOUPPER (i)) exit(2); +exit (0); } + +EOF +if { (eval echo configure:1856: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null +then + : +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -fr conftest* + ac_cv_header_stdc=no +fi +rm -fr conftest* +fi + +fi +fi + +echo "$ac_t""$ac_cv_header_stdc" 1>&6 +if test $ac_cv_header_stdc = yes; then + cat >> confdefs.h <<\EOF +#define STDC_HEADERS 1 +EOF + +fi + +for ac_hdr in limits.h +do +ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'` +echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6 +echo "configure:1883: checking for $ac_hdr" >&5 +if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +EOF +ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out" +{ (eval echo configure:1893: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; } +ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"` +if test -z "$ac_err"; then + rm -rf conftest* + eval "ac_cv_header_$ac_safe=yes" +else + echo "$ac_err" >&5 + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + eval "ac_cv_header_$ac_safe=no" +fi +rm -f conftest* +fi +if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then + echo "$ac_t""yes" 1>&6 + ac_tr_hdr=HAVE_`echo $ac_hdr | sed 'y%abcdefghijklmnopqrstuvwxyz./-%ABCDEFGHIJKLMNOPQRSTUVWXYZ___%'` + cat >> confdefs.h <&6 +fi +done + + + + +echo $ac_n "checking for working const""... 
$ac_c" 1>&6 +echo "configure:1923: checking for working const" >&5 +if eval "test \"`echo '$''{'ac_cv_c_const'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext <j = 5; +} +{ /* ULTRIX-32 V3.1 (Rev 9) vcc rejects this */ + const int foo = 10; +} + +; return 0; } +EOF +if { (eval echo configure:1977: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then + rm -rf conftest* + ac_cv_c_const=yes +else + echo "configure: failed program was:" >&5 + cat conftest.$ac_ext >&5 + rm -rf conftest* + ac_cv_c_const=no +fi +rm -f conftest* +fi + +echo "$ac_t""$ac_cv_c_const" 1>&6 +if test $ac_cv_c_const = no; then + cat >> confdefs.h <<\EOF +#define const +EOF + +fi + +echo $ac_n "checking for size_t""... $ac_c" 1>&6 +echo "configure:1998: checking for size_t" >&5 +if eval "test \"`echo '$''{'ac_cv_type_size_t'+set}'`\" = set"; then + echo $ac_n "(cached) $ac_c" 1>&6 +else + cat > conftest.$ac_ext < +#if STDC_HEADERS +#include +#include +#endif +EOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + egrep "(^|[^a-zA-Z_0-9])size_t[^a-zA-Z_0-9]" >/dev/null 2>&1; then + rm -rf conftest* + ac_cv_type_size_t=yes +else + rm -rf conftest* + ac_cv_type_size_t=no +fi +rm -f conftest* + +fi +echo "$ac_t""$ac_cv_type_size_t" 1>&6 +if test $ac_cv_type_size_t = no; then + cat >> confdefs.h <<\EOF +#define size_t unsigned +EOF + +fi + + + +trap '' 1 2 15 + +trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15 + +test "x$prefix" = xNONE && prefix=$ac_default_prefix +# Let make expand exec_prefix. +test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' + +# Any assignment to VPATH causes Sun make to only execute +# the first set of double-colon rules, so remove it if not needed. +# If there is a colon in the path, we need to keep it. +if test "x$srcdir" = x.; then + ac_vpsub='/^[ ]*VPATH[ ]*=[^:]*$/d' +fi + +trap 'rm -f $CONFIG_STATUS conftest*; exit 1' 1 2 15 + +# Transform confdefs.h into DEFS. 
+# Protect against shell expansion while executing Makefile rules. +# Protect against Makefile macro expansion. +cat > conftest.defs <<\EOF +s%#define \([A-Za-z_][A-Za-z0-9_]*\) *\(.*\)%-D\1=\2%g +s%[ `~#$^&*(){}\\|;'"<>?]%\\&%g +s%\[%\\&%g +s%\]%\\&%g +s%\$%$$%g +EOF +DEFS=`sed -f conftest.defs confdefs.h | tr '\012' ' '` +rm -f conftest.defs + + +# Without the "./", some shells look in PATH for config.status. +: ${CONFIG_STATUS=./config.status} + +echo creating $CONFIG_STATUS +rm -f $CONFIG_STATUS +cat > $CONFIG_STATUS </dev/null | sed 1q`: +# +# $0 $ac_configure_args +# +# Compiler output produced by configure, useful for debugging +# configure, is in ./config.log if it exists. + +ac_cs_usage="Usage: $CONFIG_STATUS [--recheck] [--version] [--help]" +for ac_option +do + case "\$ac_option" in + -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + echo "running \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion" + exec \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion ;; + -version | --version | --versio | --versi | --vers | --ver | --ve | --v) + echo "$CONFIG_STATUS generated by autoconf version 2.13" + exit 0 ;; + -help | --help | --hel | --he | --h) + echo "\$ac_cs_usage"; exit 0 ;; + *) echo "\$ac_cs_usage"; exit 1 ;; + esac +done + +ac_given_srcdir=$srcdir +ac_given_INSTALL="$INSTALL" + +trap 'rm -fr `echo "Makefile src/Makefile src/test doc/Makefile data/Makefile" | sed "s/:[^ ]*//g"` conftest*; exit 1' 1 2 15 +EOF +cat >> $CONFIG_STATUS < conftest.subs <<\\CEOF +$ac_vpsub +$extrasub +s%@SHELL@%$SHELL%g +s%@CFLAGS@%$CFLAGS%g +s%@CPPFLAGS@%$CPPFLAGS%g +s%@CXXFLAGS@%$CXXFLAGS%g +s%@FFLAGS@%$FFLAGS%g +s%@DEFS@%$DEFS%g +s%@LDFLAGS@%$LDFLAGS%g +s%@LIBS@%$LIBS%g +s%@exec_prefix@%$exec_prefix%g +s%@prefix@%$prefix%g +s%@program_transform_name@%$program_transform_name%g +s%@bindir@%$bindir%g +s%@sbindir@%$sbindir%g +s%@libexecdir@%$libexecdir%g +s%@datadir@%$datadir%g +s%@sysconfdir@%$sysconfdir%g 
+s%@sharedstatedir@%$sharedstatedir%g +s%@localstatedir@%$localstatedir%g +s%@libdir@%$libdir%g +s%@includedir@%$includedir%g +s%@oldincludedir@%$oldincludedir%g +s%@infodir@%$infodir%g +s%@mandir@%$mandir%g +s%@INSTALL_PROGRAM@%$INSTALL_PROGRAM%g +s%@INSTALL_SCRIPT@%$INSTALL_SCRIPT%g +s%@INSTALL_DATA@%$INSTALL_DATA%g +s%@PACKAGE@%$PACKAGE%g +s%@VERSION@%$VERSION%g +s%@ACLOCAL@%$ACLOCAL%g +s%@AUTOCONF@%$AUTOCONF%g +s%@AUTOMAKE@%$AUTOMAKE%g +s%@AUTOHEADER@%$AUTOHEADER%g +s%@MAKEINFO@%$MAKEINFO%g +s%@SET_MAKE@%$SET_MAKE%g +s%@CC@%$CC%g +s%@MPICC0@%$MPICC0%g +s%@MPICC1@%$MPICC1%g +s%@MPICC2@%$MPICC2%g +s%@MPICC3@%$MPICC3%g +s%@MPICC4@%$MPICC4%g +s%@MPICC5@%$MPICC5%g +s%@MPICC@%$MPICC%g +s%@MPILIBS@%$MPILIBS%g +s%@MPIDEFS@%$MPIDEFS%g +s%@MPICFLAGS@%$MPICFLAGS%g +s%@PPUZZLE@%$PPUZZLE%g +s%@CPP@%$CPP%g + +CEOF +EOF + +cat >> $CONFIG_STATUS <<\EOF + +# Split the substitutions into bite-sized pieces for seds with +# small command number limits, like on Digital OSF/1 and HP-UX. +ac_max_sed_cmds=90 # Maximum number of lines to put in a sed script. +ac_file=1 # Number of current file. +ac_beg=1 # First line for current file. +ac_end=$ac_max_sed_cmds # Line after last line for current file. +ac_more_lines=: +ac_sed_cmds="" +while $ac_more_lines; do + if test $ac_beg -gt 1; then + sed "1,${ac_beg}d; ${ac_end}q" conftest.subs > conftest.s$ac_file + else + sed "${ac_end}q" conftest.subs > conftest.s$ac_file + fi + if test ! -s conftest.s$ac_file; then + ac_more_lines=false + rm -f conftest.s$ac_file + else + if test -z "$ac_sed_cmds"; then + ac_sed_cmds="sed -f conftest.s$ac_file" + else + ac_sed_cmds="$ac_sed_cmds | sed -f conftest.s$ac_file" + fi + ac_file=`expr $ac_file + 1` + ac_beg=$ac_end + ac_end=`expr $ac_end + $ac_max_sed_cmds` + fi +done +if test -z "$ac_sed_cmds"; then + ac_sed_cmds=cat +fi +EOF + +cat >> $CONFIG_STATUS <> $CONFIG_STATUS <<\EOF +for ac_file in .. 
$CONFIG_FILES; do if test "x$ac_file" != x..; then + # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in". + case "$ac_file" in + *:*) ac_file_in=`echo "$ac_file"|sed 's%[^:]*:%%'` + ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;; + *) ac_file_in="${ac_file}.in" ;; + esac + + # Adjust a relative srcdir, top_srcdir, and INSTALL for subdirectories. + + # Remove last slash and all that follows it. Not all systems have dirname. + ac_dir=`echo $ac_file|sed 's%/[^/][^/]*$%%'` + if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then + # The file is in a subdirectory. + test ! -d "$ac_dir" && mkdir "$ac_dir" + ac_dir_suffix="/`echo $ac_dir|sed 's%^\./%%'`" + # A "../" for each directory in $ac_dir_suffix. + ac_dots=`echo $ac_dir_suffix|sed 's%/[^/]*%../%g'` + else + ac_dir_suffix= ac_dots= + fi + + case "$ac_given_srcdir" in + .) srcdir=. + if test -z "$ac_dots"; then top_srcdir=. + else top_srcdir=`echo $ac_dots|sed 's%/$%%'`; fi ;; + /*) srcdir="$ac_given_srcdir$ac_dir_suffix"; top_srcdir="$ac_given_srcdir" ;; + *) # Relative path. + srcdir="$ac_dots$ac_given_srcdir$ac_dir_suffix" + top_srcdir="$ac_dots$ac_given_srcdir" ;; + esac + + case "$ac_given_INSTALL" in + [/$]*) INSTALL="$ac_given_INSTALL" ;; + *) INSTALL="$ac_dots$ac_given_INSTALL" ;; + esac + + echo creating "$ac_file" + rm -f "$ac_file" + configure_input="Generated automatically from `echo $ac_file_in|sed 's%.*/%%'` by configure." 
+ case "$ac_file" in + *Makefile*) ac_comsub="1i\\ +# $configure_input" ;; + *) ac_comsub= ;; + esac + + ac_file_inputs=`echo $ac_file_in|sed -e "s%^%$ac_given_srcdir/%" -e "s%:% $ac_given_srcdir/%g"` + sed -e "$ac_comsub +s%@configure_input@%$configure_input%g +s%@srcdir@%$srcdir%g +s%@top_srcdir@%$top_srcdir%g +s%@INSTALL@%$INSTALL%g +" $ac_file_inputs | (eval "$ac_sed_cmds") > $ac_file +fi; done +rm -f conftest.s* + +EOF +cat >> $CONFIG_STATUS <> $CONFIG_STATUS <<\EOF + +exit 0 +EOF +chmod +x $CONFIG_STATUS +rm -fr confdefs* $ac_clean_files +test "$no_create" = yes || ${CONFIG_SHELL-/bin/sh} $CONFIG_STATUS || exit 1 + diff --git a/forester/archive/RIO/others/puzzle_mod/configure.in b/forester/archive/RIO/others/puzzle_mod/configure.in new file mode 100644 index 0000000..57f0e27 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/configure.in @@ -0,0 +1,117 @@ + +dnl Disable caching. +define([AC_CACHE_LOAD], )dnl +define([AC_CACHE_SAVE], )dnl + +dnl Process this file with autoconf to produce a configure script. +AC_INIT(src/ml.h) + +AM_INIT_AUTOMAKE(tree-puzzle, 5.0) + +dnl Checks for programs. +AC_PROG_CC +AC_PROG_CC_C_O +AC_PROG_INSTALL +AC_PROG_MAKE_SET + + +AC_DEFUN(AC_TEST_MPICC,[dnl + if test "$1" != "" ; then + if test "$MPICCSET" = "" ; then +cat > conftest.c < +int main (int argc, char **argv) +{ +MPI_Init(&argc,&argv); +MPI_Finalize(); +exit(0); +} +EOF + + +MPICC=$1 +dnl if test "$MPICC" != "$CC" ; then +dnl +dnl fi + + if test "$MPICC" != "" ; then + AC_MSG_CHECKING(whether $MPICC works as MPI compiler) + $MPICC conftest.c -o conftest > /dev/null 2>&1 + if test $? = 0 ; then + AC_MSG_RESULT(yes) + #MPICC=$MPICC + MPILIBS= + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + AC_MSG_RESULT(no) + AC_MSG_CHECKING(whether $MPICC needs -lmpi) + $MPICC conftest.c -o conftest -lmpi > /dev/null 2>&1 + if test $? 
= 0 ; then + AC_MSG_RESULT(yes) + #MPICC=$PCC + MPILIBS=-lmpi + MPICCSET=$MPICC + PPUZZLE=ppuzzle + else + AC_MSG_RESULT(no) + MPICC= + MPILIBS= + MPICCSET= + PPUZZLE= + fi + fi + fi + rm -f conftest* + fi + fi ]) + +if test "$MPICC" != "" ; then + AC_PATH_PROG(MPICC0, $MPICC) +fi +AC_PATH_PROG(MPICC1, mpcc) +AC_PATH_PROG(MPICC2, hcc) +AC_PATH_PROG(MPICC3, mpicc) +AC_PATH_PROG(MPICC4, mpicc_lam) +AC_PATH_PROG(MPICC5, mpicc_mpich) + +AC_TEST_MPICC($MPICC0) +AC_TEST_MPICC($MPICC1) +AC_TEST_MPICC($MPICC2) +AC_TEST_MPICC($MPICC3) +AC_TEST_MPICC($MPICC4) +AC_TEST_MPICC($MPICC5) + +ac_cv_prog_MPICC=$MPICC + +AC_SUBST(MPICC) +AC_SUBST(MPILIBS) +AC_SUBST(MPIDEFS) +AC_SUBST(MPICFLAGS) +AC_SUBST(PPUZZLE) + +dnl Checks for libraries. +dnl Replace `main' with a function in -lm: +AC_CHECK_LIB(m, main) +dnl AC_CHECK_LIB(mpi, main) + +dnl Checks for header files. +AC_HEADER_STDC +AC_CHECK_HEADERS(limits.h) +dnl AC_HAVE_HEADERS(mpi.h) + +dnl AC_HAVE_HEADERS(rpc/xdr.h) + + +dnl Checks for typedefs, structures, and compiler characteristics. +AC_C_CONST +AC_TYPE_SIZE_T + +dnl Checks for library functions. +dnl AC_CHECK_FUNCS(xdr_u_char) +dnl AC_CHECK_FUNCS(xdr_double) +dnl AC_CHECK_FUNCS(xdrstdio_create) +dnl AC_CHECK_FUNCS(xdr_destroy) +dnl AC_CHECK_FUNCS(xdr_inline) + +AC_OUTPUT(Makefile src/Makefile src/test doc/Makefile data/Makefile) diff --git a/forester/archive/RIO/others/puzzle_mod/data/Makefile b/forester/archive/RIO/others/puzzle_mod/data/Makefile new file mode 100644 index 0000000..13d6fc1 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/data/Makefile @@ -0,0 +1,177 @@ +# Generated automatically from Makefile.in by configure. +# Makefile.in generated automatically by automake 1.4 from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + +SHELL = /bin/sh + +srcdir = . +top_srcdir = .. +prefix = /usr/local +exec_prefix = ${prefix} + +bindir = ${exec_prefix}/bin +sbindir = ${exec_prefix}/sbin +libexecdir = ${exec_prefix}/libexec +datadir = ${prefix}/share +sysconfdir = ${prefix}/etc +sharedstatedir = ${prefix}/com +localstatedir = ${prefix}/var +libdir = ${exec_prefix}/lib +infodir = ${prefix}/info +mandir = ${prefix}/man +includedir = ${prefix}/include +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/tree-puzzle +pkglibdir = $(libdir)/tree-puzzle +pkgincludedir = $(includedir)/tree-puzzle + +top_builddir = .. + +ACLOCAL = aclocal +AUTOCONF = autoconf +AUTOMAKE = automake +AUTOHEADER = autoheader + +INSTALL = /usr/bin/install -c +INSTALL_PROGRAM = ${INSTALL} $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = ${INSTALL} -m 644 +INSTALL_SCRIPT = ${INSTALL_PROGRAM} +transform = s,x,x, + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +CC = gcc +MAKEINFO = makeinfo +MPICC = +MPICC0 = +MPICC1 = +MPICC2 = +MPICC3 = +MPICC4 = +MPICC5 = +MPICFLAGS = +MPIDEFS = +MPILIBS = +PACKAGE = tree-puzzle +PPUZZLE = +VERSION = 5.0 + +EXTRA_DIST = atp6.a globin.a marswolf.n primates.b +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_CLEAN_FILES = +DIST_COMMON = Makefile.am Makefile.in + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = gtar +GZIP_ENV = --best +all: all-redirect +.SUFFIXES: +$(srcdir)/Makefile.in: Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu --include-deps data/Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= 
$(SHELL) ./config.status + +tags: TAGS +TAGS: + + +distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir) + +subdir = data + +distdir: $(DISTFILES) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done +info-am: +info: info-am +dvi-am: +dvi: dvi-am +check-am: all-am +check: check-am +installcheck-am: +installcheck: installcheck-am +install-exec-am: +install-exec: install-exec-am + +install-data-am: +install-data: install-data-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-am +uninstall-am: +uninstall: uninstall-am +all-am: Makefile +all-redirect: all-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-generic + +mostlyclean: mostlyclean-am + +clean-am: clean-generic mostlyclean-am + +clean: clean-am + +distclean-am: distclean-generic clean-am + +distclean: distclean-am + +maintainer-clean-am: maintainer-clean-generic distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." + +maintainer-clean: maintainer-clean-am + +.PHONY: tags distdir info-am info dvi-am dvi check check-am \ +installcheck-am installcheck install-exec-am install-exec \ +install-data-am install-data install-am install uninstall-am uninstall \ +all-redirect all-am all installdirs mostlyclean-generic \ +distclean-generic clean-generic maintainer-clean-generic clean \ +mostlyclean distclean maintainer-clean + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. 
+# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_mod/data/Makefile.am b/forester/archive/RIO/others/puzzle_mod/data/Makefile.am new file mode 100644 index 0000000..9589f1e --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/data/Makefile.am @@ -0,0 +1 @@ +EXTRA_DIST = atp6.a globin.a marswolf.n primates.b diff --git a/forester/archive/RIO/others/puzzle_mod/data/Makefile.in b/forester/archive/RIO/others/puzzle_mod/data/Makefile.in new file mode 100644 index 0000000..47fa224 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/data/Makefile.in @@ -0,0 +1,177 @@ +# Makefile.in generated automatically by automake 1.4 from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + +SHELL = @SHELL@ + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ + +bindir = @bindir@ +sbindir = @sbindir@ +libexecdir = @libexecdir@ +datadir = @datadir@ +sysconfdir = @sysconfdir@ +sharedstatedir = @sharedstatedir@ +localstatedir = @localstatedir@ +libdir = @libdir@ +infodir = @infodir@ +mandir = @mandir@ +includedir = @includedir@ +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ + +top_builddir = .. 
+ +ACLOCAL = @ACLOCAL@ +AUTOCONF = @AUTOCONF@ +AUTOMAKE = @AUTOMAKE@ +AUTOHEADER = @AUTOHEADER@ + +INSTALL = @INSTALL@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +transform = @program_transform_name@ + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +CC = @CC@ +MAKEINFO = @MAKEINFO@ +MPICC = @MPICC@ +MPICC0 = @MPICC0@ +MPICC1 = @MPICC1@ +MPICC2 = @MPICC2@ +MPICC3 = @MPICC3@ +MPICC4 = @MPICC4@ +MPICC5 = @MPICC5@ +MPICFLAGS = @MPICFLAGS@ +MPIDEFS = @MPIDEFS@ +MPILIBS = @MPILIBS@ +PACKAGE = @PACKAGE@ +PPUZZLE = @PPUZZLE@ +VERSION = @VERSION@ + +EXTRA_DIST = atp6.a globin.a marswolf.n primates.b +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_CLEAN_FILES = +DIST_COMMON = Makefile.am Makefile.in + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = gtar +GZIP_ENV = --best +all: all-redirect +.SUFFIXES: +$(srcdir)/Makefile.in: Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu --include-deps data/Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +tags: TAGS +TAGS: + + +distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir) + +subdir = data + +distdir: $(DISTFILES) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done +info-am: +info: info-am +dvi-am: +dvi: dvi-am +check-am: all-am +check: check-am +installcheck-am: +installcheck: installcheck-am +install-exec-am: +install-exec: install-exec-am + +install-data-am: +install-data: install-data-am + +install-am: all-am + @$(MAKE) 
$(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-am +uninstall-am: +uninstall: uninstall-am +all-am: Makefile +all-redirect: all-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-generic + +mostlyclean: mostlyclean-am + +clean-am: clean-generic mostlyclean-am + +clean: clean-am + +distclean-am: distclean-generic clean-am + +distclean: distclean-am + +maintainer-clean-am: maintainer-clean-generic distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." + +maintainer-clean: maintainer-clean-am + +.PHONY: tags distdir info-am info dvi-am dvi check check-am \ +installcheck-am installcheck install-exec-am install-exec \ +install-data-am install-data install-am install uninstall-am uninstall \ +all-redirect all-am all installdirs mostlyclean-generic \ +distclean-generic clean-generic maintainer-clean-generic clean \ +mostlyclean distclean maintainer-clean + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_mod/doc/Makefile b/forester/archive/RIO/others/puzzle_mod/doc/Makefile new file mode 100644 index 0000000..008b529 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/doc/Makefile @@ -0,0 +1,177 @@ +# Generated automatically from Makefile.in by configure. +# Makefile.in generated automatically by automake 1.4-p5 from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc. 
+# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + +SHELL = /bin/sh + +srcdir = . +top_srcdir = .. +prefix = /usr/local +exec_prefix = ${prefix} + +bindir = ${exec_prefix}/bin +sbindir = ${exec_prefix}/sbin +libexecdir = ${exec_prefix}/libexec +datadir = ${prefix}/share +sysconfdir = ${prefix}/etc +sharedstatedir = ${prefix}/com +localstatedir = ${prefix}/var +libdir = ${exec_prefix}/lib +infodir = ${prefix}/info +mandir = ${prefix}/man +includedir = ${prefix}/include +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/tree-puzzle +pkglibdir = $(libdir)/tree-puzzle +pkgincludedir = $(includedir)/tree-puzzle + +top_builddir = .. 
+ +ACLOCAL = aclocal +AUTOCONF = autoconf +AUTOMAKE = automake +AUTOHEADER = autoheader + +INSTALL = /usr/bin/install -c +INSTALL_PROGRAM = ${INSTALL} $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = ${INSTALL} -m 644 +INSTALL_SCRIPT = ${INSTALL_PROGRAM} +transform = s,x,x, + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +CC = gcc +MAKEINFO = makeinfo +MPICC = +MPICC0 = +MPICC1 = +MPICC2 = +MPICC3 = +MPICC4 = +MPICC5 = +MPICFLAGS = +MPIDEFS = +MPILIBS = +PACKAGE = tree-puzzle +PPUZZLE = +VERSION = 5.0 + +EXTRA_DIST = manual.html ppuzzle.gif puzzle.gif +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_CLEAN_FILES = +DIST_COMMON = Makefile.am Makefile.in + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = gtar +GZIP_ENV = --best +all: all-redirect +.SUFFIXES: +$(srcdir)/Makefile.in: Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu --include-deps doc/Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +tags: TAGS +TAGS: + + +distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir) + +subdir = doc + +distdir: $(DISTFILES) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done +info-am: +info: info-am +dvi-am: +dvi: dvi-am +check-am: all-am +check: check-am +installcheck-am: +installcheck: installcheck-am +install-exec-am: +install-exec: install-exec-am + +install-data-am: +install-data: install-data-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-am +uninstall-am: +uninstall: uninstall-am +all-am: Makefile +all-redirect: 
all-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-generic + +mostlyclean: mostlyclean-am + +clean-am: clean-generic mostlyclean-am + +clean: clean-am + +distclean-am: distclean-generic clean-am + +distclean: distclean-am + +maintainer-clean-am: maintainer-clean-generic distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." + +maintainer-clean: maintainer-clean-am + +.PHONY: tags distdir info-am info dvi-am dvi check check-am \ +installcheck-am installcheck install-exec-am install-exec \ +install-data-am install-data install-am install uninstall-am uninstall \ +all-redirect all-am all installdirs mostlyclean-generic \ +distclean-generic clean-generic maintainer-clean-generic clean \ +mostlyclean distclean maintainer-clean + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_mod/doc/Makefile.am b/forester/archive/RIO/others/puzzle_mod/doc/Makefile.am new file mode 100644 index 0000000..3cb95e6 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/doc/Makefile.am @@ -0,0 +1 @@ +EXTRA_DIST = manual.html ppuzzle.gif puzzle.gif diff --git a/forester/archive/RIO/others/puzzle_mod/doc/Makefile.in b/forester/archive/RIO/others/puzzle_mod/doc/Makefile.in new file mode 100644 index 0000000..b5588c3 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/doc/Makefile.in @@ -0,0 +1,177 @@ +# Makefile.in generated automatically by automake 1.4-p5 from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc. 
+# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + +SHELL = @SHELL@ + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ + +bindir = @bindir@ +sbindir = @sbindir@ +libexecdir = @libexecdir@ +datadir = @datadir@ +sysconfdir = @sysconfdir@ +sharedstatedir = @sharedstatedir@ +localstatedir = @localstatedir@ +libdir = @libdir@ +infodir = @infodir@ +mandir = @mandir@ +includedir = @includedir@ +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ + +top_builddir = .. 
+ +ACLOCAL = @ACLOCAL@ +AUTOCONF = @AUTOCONF@ +AUTOMAKE = @AUTOMAKE@ +AUTOHEADER = @AUTOHEADER@ + +INSTALL = @INSTALL@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +transform = @program_transform_name@ + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +CC = @CC@ +MAKEINFO = @MAKEINFO@ +MPICC = @MPICC@ +MPICC0 = @MPICC0@ +MPICC1 = @MPICC1@ +MPICC2 = @MPICC2@ +MPICC3 = @MPICC3@ +MPICC4 = @MPICC4@ +MPICC5 = @MPICC5@ +MPICFLAGS = @MPICFLAGS@ +MPIDEFS = @MPIDEFS@ +MPILIBS = @MPILIBS@ +PACKAGE = @PACKAGE@ +PPUZZLE = @PPUZZLE@ +VERSION = @VERSION@ + +EXTRA_DIST = manual.html ppuzzle.gif puzzle.gif +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_CLEAN_FILES = +DIST_COMMON = Makefile.am Makefile.in + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = gtar +GZIP_ENV = --best +all: all-redirect +.SUFFIXES: +$(srcdir)/Makefile.in: Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu --include-deps doc/Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +tags: TAGS +TAGS: + + +distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir) + +subdir = doc + +distdir: $(DISTFILES) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done +info-am: +info: info-am +dvi-am: +dvi: dvi-am +check-am: all-am +check: check-am +installcheck-am: +installcheck: installcheck-am +install-exec-am: +install-exec: install-exec-am + +install-data-am: +install-data: install-data-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) 
install-exec-am install-data-am +install: install-am +uninstall-am: +uninstall: uninstall-am +all-am: Makefile +all-redirect: all-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-generic + +mostlyclean: mostlyclean-am + +clean-am: clean-generic mostlyclean-am + +clean: clean-am + +distclean-am: distclean-generic clean-am + +distclean: distclean-am + +maintainer-clean-am: maintainer-clean-generic distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." + +maintainer-clean: maintainer-clean-am + +.PHONY: tags distdir info-am info dvi-am dvi check check-am \ +installcheck-am installcheck install-exec-am install-exec \ +install-data-am install-data install-am install uninstall-am uninstall \ +all-redirect all-am all installdirs mostlyclean-generic \ +distclean-generic clean-generic maintainer-clean-generic clean \ +mostlyclean distclean maintainer-clean + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_mod/install-sh b/forester/archive/RIO/others/puzzle_mod/install-sh new file mode 100755 index 0000000..e9de238 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/install-sh @@ -0,0 +1,251 @@ +#!/bin/sh +# +# install - install a program, script, or datafile +# This comes from X11R5 (mit/util/scripts/install.sh). 
+# +# Copyright 1991 by the Massachusetts Institute of Technology +# +# Permission to use, copy, modify, distribute, and sell this software and its +# documentation for any purpose is hereby granted without fee, provided that +# the above copyright notice appear in all copies and that both that +# copyright notice and this permission notice appear in supporting +# documentation, and that the name of M.I.T. not be used in advertising or +# publicity pertaining to distribution of the software without specific, +# written prior permission. M.I.T. makes no representations about the +# suitability of this software for any purpose. It is provided "as is" +# without express or implied warranty. +# +# Calling this script install-sh is preferred over install.sh, to prevent +# `make' implicit rules from creating a file called install from it +# when there is no Makefile. +# +# This script is compatible with the BSD install script, but was written +# from scratch. It can only install one file at a time, a restriction +# shared with many OS's install programs. + + +# set DOITPROG to echo to test this script + +# Don't use :- since 4.3BSD and earlier shells don't like it. +doit="${DOITPROG-}" + + +# put in absolute paths if you don't have them in your path; or use env. vars. 
+ +mvprog="${MVPROG-mv}" +cpprog="${CPPROG-cp}" +chmodprog="${CHMODPROG-chmod}" +chownprog="${CHOWNPROG-chown}" +chgrpprog="${CHGRPPROG-chgrp}" +stripprog="${STRIPPROG-strip}" +rmprog="${RMPROG-rm}" +mkdirprog="${MKDIRPROG-mkdir}" + +transformbasename="" +transform_arg="" +instcmd="$mvprog" +chmodcmd="$chmodprog 0755" +chowncmd="" +chgrpcmd="" +stripcmd="" +rmcmd="$rmprog -f" +mvcmd="$mvprog" +src="" +dst="" +dir_arg="" + +while [ x"$1" != x ]; do + case $1 in + -c) instcmd="$cpprog" + shift + continue;; + + -d) dir_arg=true + shift + continue;; + + -m) chmodcmd="$chmodprog $2" + shift + shift + continue;; + + -o) chowncmd="$chownprog $2" + shift + shift + continue;; + + -g) chgrpcmd="$chgrpprog $2" + shift + shift + continue;; + + -s) stripcmd="$stripprog" + shift + continue;; + + -t=*) transformarg=`echo $1 | sed 's/-t=//'` + shift + continue;; + + -b=*) transformbasename=`echo $1 | sed 's/-b=//'` + shift + continue;; + + *) if [ x"$src" = x ] + then + src=$1 + else + # this colon is to work around a 386BSD /bin/sh bug + : + dst=$1 + fi + shift + continue;; + esac +done + +if [ x"$src" = x ] +then + echo "install: no input file specified" + exit 1 +else + true +fi + +if [ x"$dir_arg" != x ]; then + dst=$src + src="" + + if [ -d $dst ]; then + instcmd=: + chmodcmd="" + else + instcmd=mkdir + fi +else + +# Waiting for this to be detected by the "$instcmd $src $dsttmp" command +# might cause directories to be created, which would be especially bad +# if $src (and thus $dsttmp) contains '*'. 
+ + if [ -f $src -o -d $src ] + then + true + else + echo "install: $src does not exist" + exit 1 + fi + + if [ x"$dst" = x ] + then + echo "install: no destination specified" + exit 1 + else + true + fi + +# If destination is a directory, append the input filename; if your system +# does not like double slashes in filenames, you may need to add some logic + + if [ -d $dst ] + then + dst="$dst"/`basename $src` + else + true + fi +fi + +## this sed command emulates the dirname command +dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'` + +# Make sure that the destination directory exists. +# this part is taken from Noah Friedman's mkinstalldirs script + +# Skip lots of stat calls in the usual case. +if [ ! -d "$dstdir" ]; then +defaultIFS=' +' +IFS="${IFS-${defaultIFS}}" + +oIFS="${IFS}" +# Some sh's can't handle IFS=/ for some reason. +IFS='%' +set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'` +IFS="${oIFS}" + +pathcomp='' + +while [ $# -ne 0 ] ; do + pathcomp="${pathcomp}${1}" + shift + + if [ ! -d "${pathcomp}" ] ; + then + $mkdirprog "${pathcomp}" + else + true + fi + + pathcomp="${pathcomp}/" +done +fi + +if [ x"$dir_arg" != x ] +then + $doit $instcmd $dst && + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi +else + +# If we're going to rename the final executable, determine the name now. + + if [ x"$transformarg" = x ] + then + dstfile=`basename $dst` + else + dstfile=`basename $dst $transformbasename | + sed $transformarg`$transformbasename + fi + +# don't allow the sed command to completely eliminate the filename + + if [ x"$dstfile" = x ] + then + dstfile=`basename $dst` + else + true + fi + +# Make a temp file name in the proper directory. 
+ + dsttmp=$dstdir/#inst.$$# + +# Move or copy the file name to the temp name + + $doit $instcmd $src $dsttmp && + + trap "rm -f ${dsttmp}" 0 && + +# and set any options; do chmod last to preserve setuid bits + +# If any of these fail, we abort the whole thing. If we want to +# ignore errors from any of these, just make sure not to ignore +# errors from the above "$doit $instcmd $src $dsttmp" command. + + if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi && + if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi && + if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi && + if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi && + +# Now rename the file to the real destination. + + $doit $rmcmd -f $dstdir/$dstfile && + $doit $mvcmd $dsttmp $dstdir/$dstfile + +fi && + + +exit 0 diff --git a/forester/archive/RIO/others/puzzle_mod/missing b/forester/archive/RIO/others/puzzle_mod/missing new file mode 100755 index 0000000..7789652 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/missing @@ -0,0 +1,190 @@ +#! /bin/sh +# Common stub for a few missing GNU programs while installing. +# Copyright (C) 1996, 1997 Free Software Foundation, Inc. +# Franc,ois Pinard , 1996. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +# 02111-1307, USA. 
+ +if test $# -eq 0; then + echo 1>&2 "Try \`$0 --help' for more information" + exit 1 +fi + +case "$1" in + + -h|--h|--he|--hel|--help) + echo "\ +$0 [OPTION]... PROGRAM [ARGUMENT]... + +Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an +error status if there is no known handling for PROGRAM. + +Options: + -h, --help display this help and exit + -v, --version output version information and exit + +Supported PROGRAM values: + aclocal touch file \`aclocal.m4' + autoconf touch file \`configure' + autoheader touch file \`config.h.in' + automake touch all \`Makefile.in' files + bison create \`y.tab.[ch]', if possible, from existing .[ch] + flex create \`lex.yy.c', if possible, from existing .c + lex create \`lex.yy.c', if possible, from existing .c + makeinfo touch the output file + yacc create \`y.tab.[ch]', if possible, from existing .[ch]" + ;; + + -v|--v|--ve|--ver|--vers|--versi|--versio|--version) + echo "missing - GNU libit 0.0" + ;; + + -*) + echo 1>&2 "$0: Unknown \`$1' option" + echo 1>&2 "Try \`$0 --help' for more information" + exit 1 + ;; + + aclocal) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`acinclude.m4' or \`configure.in'. You might want + to install the \`Automake' and \`Perl' packages. Grab them from + any GNU archive site." + touch aclocal.m4 + ;; + + autoconf) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`configure.in'. You might want to install the + \`Autoconf' and \`GNU m4' packages. Grab them from any GNU + archive site." + touch configure + ;; + + autoheader) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`acconfig.h' or \`configure.in'. You might want + to install the \`Autoconf' and \`GNU m4' packages. Grab them + from any GNU archive site." 
+ files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' configure.in` + test -z "$files" && files="config.h" + touch_files= + for f in $files; do + case "$f" in + *:*) touch_files="$touch_files "`echo "$f" | + sed -e 's/^[^:]*://' -e 's/:.*//'`;; + *) touch_files="$touch_files $f.in";; + esac + done + touch $touch_files + ;; + + automake) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified \`Makefile.am', \`acinclude.m4' or \`configure.in'. + You might want to install the \`Automake' and \`Perl' packages. + Grab them from any GNU archive site." + find . -type f -name Makefile.am -print | + sed 's/\.am$/.in/' | + while read f; do touch "$f"; done + ;; + + bison|yacc) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified a \`.y' file. You may need the \`Bison' package + in order for those modifications to take effect. You can get + \`Bison' from any GNU archive site." + rm -f y.tab.c y.tab.h + if [ $# -ne 1 ]; then + eval LASTARG="\${$#}" + case "$LASTARG" in + *.y) + SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'` + if [ -f "$SRCFILE" ]; then + cp "$SRCFILE" y.tab.c + fi + SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'` + if [ -f "$SRCFILE" ]; then + cp "$SRCFILE" y.tab.h + fi + ;; + esac + fi + if [ ! -f y.tab.h ]; then + echo >y.tab.h + fi + if [ ! -f y.tab.c ]; then + echo 'main() { return 0; }' >y.tab.c + fi + ;; + + lex|flex) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified a \`.l' file. You may need the \`Flex' package + in order for those modifications to take effect. You can get + \`Flex' from any GNU archive site." + rm -f lex.yy.c + if [ $# -ne 1 ]; then + eval LASTARG="\${$#}" + case "$LASTARG" in + *.l) + SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'` + if [ -f "$SRCFILE" ]; then + cp "$SRCFILE" lex.yy.c + fi + ;; + esac + fi + if [ ! 
-f lex.yy.c ]; then + echo 'main() { return 0; }' >lex.yy.c + fi + ;; + + makeinfo) + echo 1>&2 "\ +WARNING: \`$1' is missing on your system. You should only need it if + you modified a \`.texi' or \`.texinfo' file, or any other file + indirectly affecting the aspect of the manual. The spurious + call might also be the consequence of using a buggy \`make' (AIX, + DU, IRIX). You might want to install the \`Texinfo' package or + the \`GNU make' package. Grab either from any GNU archive site." + file=`echo "$*" | sed -n 's/.*-o \([^ ]*\).*/\1/p'` + if test -z "$file"; then + file=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'` + file=`sed -n '/^@setfilename/ { s/.* \([^ ]*\) *$/\1/; p; q; }' $file` + fi + touch $file + ;; + + *) + echo 1>&2 "\ +WARNING: \`$1' is needed, and you do not seem to have it handy on your + system. You might have modified some files without having the + proper tools for further handling them. Check the \`README' file, + it often tells you about the needed prerequirements for installing + this package. You may also peek at any GNU archive site, in case + some other package would contain this missing \`$1' program." + exit 1 + ;; +esac + +exit 0 diff --git a/forester/archive/RIO/others/puzzle_mod/mkinstalldirs b/forester/archive/RIO/others/puzzle_mod/mkinstalldirs new file mode 100755 index 0000000..1d8b882 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/mkinstalldirs @@ -0,0 +1,40 @@ +#! /bin/sh +# mkinstalldirs --- make directory hierarchy +# Author: Noah Friedman +# Created: 1993-05-16 +# Public domain + +# $Id: mkinstalldirs,v 1.1.1.1 2005/03/22 08:35:12 cmzmasek Exp $ + +errstatus=0 + +for file +do + set fnord `echo ":$file" | sed -ne 's/^:\//#/;s/^://;s/\// /g;s/^#/\//;p'` + shift + + pathcomp= + for d + do + pathcomp="$pathcomp$d" + case "$pathcomp" in + -* ) pathcomp=./$pathcomp ;; + esac + + if test ! -d "$pathcomp"; then + echo "mkdir $pathcomp" + + mkdir "$pathcomp" || lasterr=$? + + if test ! 
-d "$pathcomp"; then + errstatus=$lasterr + fi + fi + + pathcomp="$pathcomp/" + done +done + +exit $errstatus + +# mkinstalldirs ends here diff --git a/forester/archive/RIO/others/puzzle_mod/src/00README b/forester/archive/RIO/others/puzzle_mod/src/00README new file mode 100644 index 0000000..a50e005 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/00README @@ -0,0 +1,97 @@ +Modifications by Christian Zmasek +--------------------------------- + + +!WARNING: Use this modified version of TREE-PUZZLE 5.0 ONLY + together with FORESTER/RIO! + +!For all other purposes download the excellent original! + + +Changes: +-------- + + +puzzle1.c: void putdistance(FILE *fp): + +remove: "/* seven in one row */ + if ((j + 1) % 7 == 0 && j+1 != Maxspc) + fprintf(fp, "\n ");" + + + + + +puzzle1.c: int main(int argc, char *argv[]): + +remove: +"FPRINTF(STDOUTFILE "Writing parameters to file %s\n", OUTFILE); + openfiletowrite(&ofp, OUTFILE, "general output"); + writeoutputfile(ofp,WRITEPARAMS); + fclose(ofp);" + +"openfiletoappend(&ofp, OUTFILE, "general output"); + writeoutputfile(ofp,WRITEREST);" + +"openfiletoappend(&ofp, OUTFILE, "general output"); + writeoutputfile(ofp,WRITEREST);" + +"openfiletoappend(&ofp, OUTFILE, "general output"); + writeoutputfile(ofp,WRITEREST);" + +"timestamp(ofp); + closefile(ofp);" + + + + +puzzle2.c: void getsizesites(FILE *ifp): + +257 -> 8000 + + + +puzzle2.c: void readid(FILE *infp, int t): + +for (i = 0; i < 10; i++) { -> for (i = 0; i < 26; i++) { + +for (i = 9; i > -1; i--) { -> for (i = 25; i > -1; i--) { + +for (j = 0; (j < 10) && (flag == TRUE); j++) -> for (j = 0; (j < 26) && (flag == TRUE); j++) + + + +puzzle2.c: void initid(int t): + +Identif = new_cmatrix(t, 10); -> Identif = new_cmatrix(t, 26); + +for (j = 0; j < 10; j++) -> for (j = 0; j < 26; j++) + + + +puzzle2.c: fputid10(FILE *ofp, int t): + +for (i = 0; i < 10; i++) -> for (i = 0; i < 26; i++) + + + +puzzle2.c: int fputid(FILE *ofp, int t): + +while (Identif[t][i] != '
' && i < 10) { -> while (Identif[t][i] != ' ' && i < 26) { + + + + +ml2.c: Node *internalnode(Tree *tr, char **chpp, int *ninode): + +char ident[100], idcomp[11]; -> char ident[100], idcomp[27]; + +idcomp[10] = '\0'; -> idcomp[26] = '\0'; + +} while (!stop && (ff != 10)); -> } while (!stop && (ff != 26)); + + + + + + diff --git a/forester/archive/RIO/others/puzzle_mod/src/Makefile b/forester/archive/RIO/others/puzzle_mod/src/Makefile new file mode 100644 index 0000000..9c6d4c0 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/Makefile @@ -0,0 +1,356 @@ +# Generated automatically from Makefile.in by configure. +# Makefile.in generated automatically by automake 1.4 from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + +SHELL = /bin/sh + +srcdir = . +top_srcdir = .. +prefix = /usr/local +exec_prefix = ${prefix} + +bindir = ${exec_prefix}/bin +sbindir = ${exec_prefix}/sbin +libexecdir = ${exec_prefix}/libexec +datadir = ${prefix}/share +sysconfdir = ${prefix}/etc +sharedstatedir = ${prefix}/com +localstatedir = ${prefix}/var +libdir = ${exec_prefix}/lib +infodir = ${prefix}/info +mandir = ${prefix}/man +includedir = ${prefix}/include +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/tree-puzzle +pkglibdir = $(libdir)/tree-puzzle +pkgincludedir = $(includedir)/tree-puzzle + +top_builddir = .. 
+ +ACLOCAL = aclocal +AUTOCONF = autoconf +AUTOMAKE = automake +AUTOHEADER = autoheader + +INSTALL = /usr/bin/install -c +INSTALL_PROGRAM = ${INSTALL} $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = ${INSTALL} -m 644 +INSTALL_SCRIPT = ${INSTALL_PROGRAM} +transform = s,x,x, + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +CC = gcc +MAKEINFO = makeinfo +MPICC = +MPICC0 = +MPICC1 = +MPICC2 = +MPICC3 = +MPICC4 = +MPICC5 = +MPICFLAGS = +MPIDEFS = +MPILIBS = +PACKAGE = tree-puzzle +PPUZZLE = +VERSION = 5.0 + +bin_PROGRAMS = puzzle +EXTRA_PROGRAMS = ppuzzle + +puzzle_SOURCES = gamma.c ml1.c ml2.c ml3.c model1.c model2.c puzzle1.c puzzle2.c util.c ml.h util.h puzzle.h gamma.h +puzzle_LDADD = sgamma.o sml1.o sml2.o sml3.o smodel1.o smodel2.o spuzzle1.o spuzzle2.o sutil.o + +SDEFS = +SCFLAGS = +SLDFLAGS = -lm + +SCOMPILE = $(CC) $(SDEFS) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(SCFLAGS) $(CFLAGS) +SCCLD = $(CC) +SLINK = $(SCCLD) $(AM_CFLAGS) $(CFLAGS) $(SLDFLAGS) $(LDFLAGS) + +ppuzzle_SOURCES = gamma.c ml1.c ml2.c ml3.c model1.c model2.c puzzle1.c puzzle2.c sched.c util.c ppuzzle.c ml.h util.h puzzle.h gamma.h ppuzzle.h sched.h +ppuzzle_LDADD = pgamma.o pml1.o pml2.o pml3.o pmodel1.o pmodel2.o ppuzzle1.o ppuzzle2.o psched.o putil.o ppuzzle.o + +PCC = +PDEFS = -DPARALLEL +PCFLAGS = +PLDFLAGS = -lm + +PCOMPILE = $(PCC) $(PDEFS) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(PCFLAGS) $(CFLAGS) +PCCLD = $(PCC) +PLINK = $(PCCLD) $(AM_CFLAGS) $(PCFLAGS) $(CFLAGS) $(PLDFLAGS) $(LDFLAGS) +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_CLEAN_FILES = test +PROGRAMS = $(bin_PROGRAMS) + + +DEFS = -DPACKAGE=\"tree-puzzle\" -DVERSION=\"5.0\" -DHAVE_LIBM=1 -DSTDC_HEADERS=1 -DHAVE_LIMITS_H=1 -I. 
-I$(srcdir) +CPPFLAGS = +LDFLAGS = +LIBS = -lm +ppuzzle_OBJECTS = gamma.o ml1.o ml2.o ml3.o model1.o model2.o puzzle1.o \ +puzzle2.o sched.o util.o ppuzzle.o +ppuzzle_DEPENDENCIES = pgamma.o pml1.o pml2.o pml3.o pmodel1.o \ +pmodel2.o ppuzzle1.o ppuzzle2.o psched.o putil.o ppuzzle.o +ppuzzle_LDFLAGS = +puzzle_OBJECTS = gamma.o ml1.o ml2.o ml3.o model1.o model2.o puzzle1.o \ +puzzle2.o util.o +puzzle_DEPENDENCIES = sgamma.o sml1.o sml2.o sml3.o smodel1.o smodel2.o \ +spuzzle1.o spuzzle2.o sutil.o +puzzle_LDFLAGS = +CFLAGS = -g -O2 +COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +CCLD = $(CC) +LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(LDFLAGS) -o $@ +DIST_COMMON = README Makefile.am Makefile.in test.in + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = gtar +GZIP_ENV = --best +SOURCES = $(ppuzzle_SOURCES) $(puzzle_SOURCES) +OBJECTS = $(ppuzzle_OBJECTS) $(puzzle_OBJECTS) + +all: all-redirect +.SUFFIXES: +.SUFFIXES: .S .c .o .s +$(srcdir)/Makefile.in: Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu --include-deps src/Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +test: $(top_builddir)/config.status test.in + cd $(top_builddir) && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +mostlyclean-binPROGRAMS: + +clean-binPROGRAMS: + -test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS) + +distclean-binPROGRAMS: + +maintainer-clean-binPROGRAMS: + +install-binPROGRAMS: $(bin_PROGRAMS) + @$(NORMAL_INSTALL) + $(mkinstalldirs) $(DESTDIR)$(bindir) + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + if test -f $$p; then \ + echo " $(INSTALL_PROGRAM) $$p $(DESTDIR)$(bindir)/`echo $$p|sed 's/$(EXEEXT)$$//'|sed '$(transform)'|sed 's/$$/$(EXEEXT)/'`"; \ + $(INSTALL_PROGRAM) $$p $(DESTDIR)$(bindir)/`echo $$p|sed 
's/$(EXEEXT)$$//'|sed '$(transform)'|sed 's/$$/$(EXEEXT)/'`; \ + else :; fi; \ + done + +uninstall-binPROGRAMS: + @$(NORMAL_UNINSTALL) + list='$(bin_PROGRAMS)'; for p in $$list; do \ + rm -f $(DESTDIR)$(bindir)/`echo $$p|sed 's/$(EXEEXT)$$//'|sed '$(transform)'|sed 's/$$/$(EXEEXT)/'`; \ + done + +.c.o: + $(COMPILE) -c $< + +.s.o: + $(COMPILE) -c $< + +.S.o: + $(COMPILE) -c $< + +mostlyclean-compile: + -rm -f *.o core *.core + +clean-compile: + +distclean-compile: + -rm -f *.tab.c + +maintainer-clean-compile: + +tags: TAGS + +ID: $(HEADERS) $(SOURCES) $(LISP) + list='$(SOURCES) $(HEADERS)'; \ + unique=`for i in $$list; do echo $$i; done | \ + awk ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + here=`pwd` && cd $(srcdir) \ + && mkid -f$$here/ID $$unique $(LISP) + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS)'; \ + unique=`for i in $$list; do echo $$i; done | \ + awk ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(ETAGS_ARGS)$$unique$(LISP)$$tags" \ + || (cd $(srcdir) && etags $(ETAGS_ARGS) $$tags $$unique $(LISP) -o $$here/TAGS) + +mostlyclean-tags: + +clean-tags: + +distclean-tags: + -rm -f TAGS ID + +maintainer-clean-tags: + +distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir) + +subdir = src + +distdir: $(DISTFILES) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done +info-am: +info: info-am +dvi-am: +dvi: dvi-am +check-am: all-am +check: check-am +installcheck-am: +installcheck: installcheck-am +install-exec-am: install-binPROGRAMS +install-exec: install-exec-am + +install-data-am: +install-data: install-data-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-am 
+uninstall-am: uninstall-binPROGRAMS +uninstall: uninstall-am +all-am: Makefile $(PROGRAMS) +all-redirect: all-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: + $(mkinstalldirs) $(DESTDIR)$(bindir) + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-binPROGRAMS mostlyclean-compile \ + mostlyclean-tags mostlyclean-generic + +mostlyclean: mostlyclean-am + +clean-am: clean-binPROGRAMS clean-compile clean-tags clean-generic \ + mostlyclean-am + +clean: clean-am + +distclean-am: distclean-binPROGRAMS distclean-compile distclean-tags \ + distclean-generic clean-am + +distclean: distclean-am + +maintainer-clean-am: maintainer-clean-binPROGRAMS \ + maintainer-clean-compile maintainer-clean-tags \ + maintainer-clean-generic distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." 
+ +maintainer-clean: maintainer-clean-am + +.PHONY: mostlyclean-binPROGRAMS distclean-binPROGRAMS clean-binPROGRAMS \ +maintainer-clean-binPROGRAMS uninstall-binPROGRAMS install-binPROGRAMS \ +mostlyclean-compile distclean-compile clean-compile \ +maintainer-clean-compile tags mostlyclean-tags distclean-tags \ +clean-tags maintainer-clean-tags distdir info-am info dvi-am dvi check \ +check-am installcheck-am installcheck install-exec-am install-exec \ +install-data-am install-data install-am install uninstall-am uninstall \ +all-redirect all-am all installdirs mostlyclean-generic \ +distclean-generic clean-generic maintainer-clean-generic clean \ +mostlyclean distclean maintainer-clean + + +puzzle: $(puzzle_LDADD) $(puzzle_SOURCES) + $(SLINK) $(puzzle_LDADD) -o $@ + +sml1.o: ml1.c ml.h util.h + $(SCOMPILE) -c ml1.c && mv ml1.o $@ +sml2.o: ml2.c ml.h util.h + $(SCOMPILE) -c ml2.c && mv ml2.o $@ +sml3.o: ml3.c ml.h util.h gamma.h + $(SCOMPILE) -c ml3.c && mv ml3.o $@ +smodel1.o: model1.c ml.h util.h + $(SCOMPILE) -c model1.c && mv model1.o $@ +smodel2.o: model2.c ml.h util.h + $(SCOMPILE) -c model2.c && mv model2.o $@ +spuzzle1.o: puzzle1.c ml.h util.h puzzle.h gamma.h ppuzzle.h + $(SCOMPILE) -c puzzle1.c && mv puzzle1.o $@ +spuzzle2.o: puzzle2.c ml.h util.h puzzle.h ppuzzle.h + $(SCOMPILE) -c puzzle2.c && mv puzzle2.o $@ +sutil.o: util.c util.h + $(SCOMPILE) -c util.c && mv util.o $@ +sgamma.o: gamma.c gamma.h util.h + $(SCOMPILE) -c gamma.c && mv gamma.o $@ + +ppuzzle: $(ppuzzle_LDADD) $(ppuzzle_SOURCES) + $(PLINK) $(ppuzzle_LDADD) -o $@ + +pml1.o: ml1.c ml.h util.h + $(PCOMPILE) -c ml1.c && mv ml1.o $@ +pml2.o: ml2.c ml.h util.h + $(PCOMPILE) -c ml2.c && mv ml2.o $@ +pml3.o: ml3.c ml.h util.h gamma.h + $(PCOMPILE) -c ml3.c && mv ml3.o $@ +pmodel1.o: model1.c ml.h util.h + $(PCOMPILE) -c model1.c && mv model1.o $@ +pmodel2.o: model2.c ml.h util.h + $(PCOMPILE) -c model2.c && mv model2.o $@ +ppuzzle1.o: puzzle1.c ml.h util.h puzzle.h gamma.h ppuzzle.h + $(PCOMPILE) 
-c puzzle1.c && mv puzzle1.o $@ +ppuzzle2.o: puzzle2.c ml.h util.h puzzle.h ppuzzle.h + $(PCOMPILE) -c puzzle2.c && mv puzzle2.o $@ +putil.o: util.c util.h + $(PCOMPILE) -c util.c && mv util.o $@ +pgamma.o: gamma.c gamma.h util.h + $(PCOMPILE) -c gamma.c && mv gamma.o $@ +psched.o: sched.c sched.h ppuzzle.h + $(PCOMPILE) -c sched.c && mv sched.o $@ +ppuzzle.o: ppuzzle.c ppuzzle.h ml.h util.h puzzle.h gamma.h sched.h + $(PCOMPILE) -c ppuzzle.c + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_mod/src/Makefile.am b/forester/archive/RIO/others/puzzle_mod/src/Makefile.am new file mode 100644 index 0000000..3b88a39 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/Makefile.am @@ -0,0 +1,77 @@ +bin_PROGRAMS = puzzle @PPUZZLE@ +EXTRA_PROGRAMS = ppuzzle + +puzzle_SOURCES = gamma.c ml1.c ml2.c ml3.c model1.c model2.c puzzle1.c puzzle2.c util.c ml.h util.h puzzle.h gamma.h +puzzle_LDADD = sgamma.o sml1.o sml2.o sml3.o smodel1.o smodel2.o spuzzle1.o spuzzle2.o sutil.o + +SDEFS = +SCFLAGS = +SLDFLAGS = @LIBS@ + +SCOMPILE = $(CC) $(SDEFS) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(SCFLAGS) $(CFLAGS) +SCCLD = $(CC) +SLINK = $(SCCLD) $(AM_CFLAGS) $(CFLAGS) $(SLDFLAGS) $(LDFLAGS) + +ppuzzle_SOURCES = gamma.c ml1.c ml2.c ml3.c model1.c model2.c puzzle1.c puzzle2.c sched.c util.c ppuzzle.c ml.h util.h puzzle.h gamma.h ppuzzle.h sched.h +ppuzzle_LDADD = pgamma.o pml1.o pml2.o pml3.o pmodel1.o pmodel2.o ppuzzle1.o ppuzzle2.o psched.o putil.o ppuzzle.o + +PCC = @MPICC@ +PDEFS = -DPARALLEL +PCFLAGS = +PLDFLAGS = @LIBS@ @MPILIBS@ + +PCOMPILE = $(PCC) $(PDEFS) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(PCFLAGS) $(CFLAGS) +PCCLD = $(PCC) +PLINK = $(PCCLD) $(AM_CFLAGS) $(PCFLAGS) $(CFLAGS) $(PLDFLAGS) $(LDFLAGS) + + +puzzle: $(puzzle_LDADD) $(puzzle_SOURCES) + $(SLINK) $(puzzle_LDADD) -o $@ + 
+sml1.o: ml1.c ml.h util.h + $(SCOMPILE) -c ml1.c && mv ml1.o $@ +sml2.o: ml2.c ml.h util.h + $(SCOMPILE) -c ml2.c && mv ml2.o $@ +sml3.o: ml3.c ml.h util.h gamma.h + $(SCOMPILE) -c ml3.c && mv ml3.o $@ +smodel1.o: model1.c ml.h util.h + $(SCOMPILE) -c model1.c && mv model1.o $@ +smodel2.o: model2.c ml.h util.h + $(SCOMPILE) -c model2.c && mv model2.o $@ +spuzzle1.o: puzzle1.c ml.h util.h puzzle.h gamma.h ppuzzle.h + $(SCOMPILE) -c puzzle1.c && mv puzzle1.o $@ +spuzzle2.o: puzzle2.c ml.h util.h puzzle.h ppuzzle.h + $(SCOMPILE) -c puzzle2.c && mv puzzle2.o $@ +sutil.o: util.c util.h + $(SCOMPILE) -c util.c && mv util.o $@ +sgamma.o: gamma.c gamma.h util.h + $(SCOMPILE) -c gamma.c && mv gamma.o $@ + + + +ppuzzle: $(ppuzzle_LDADD) $(ppuzzle_SOURCES) + $(PLINK) $(ppuzzle_LDADD) -o $@ + +pml1.o: ml1.c ml.h util.h + $(PCOMPILE) -c ml1.c && mv ml1.o $@ +pml2.o: ml2.c ml.h util.h + $(PCOMPILE) -c ml2.c && mv ml2.o $@ +pml3.o: ml3.c ml.h util.h gamma.h + $(PCOMPILE) -c ml3.c && mv ml3.o $@ +pmodel1.o: model1.c ml.h util.h + $(PCOMPILE) -c model1.c && mv model1.o $@ +pmodel2.o: model2.c ml.h util.h + $(PCOMPILE) -c model2.c && mv model2.o $@ +ppuzzle1.o: puzzle1.c ml.h util.h puzzle.h gamma.h ppuzzle.h + $(PCOMPILE) -c puzzle1.c && mv puzzle1.o $@ +ppuzzle2.o: puzzle2.c ml.h util.h puzzle.h ppuzzle.h + $(PCOMPILE) -c puzzle2.c && mv puzzle2.o $@ +putil.o: util.c util.h + $(PCOMPILE) -c util.c && mv util.o $@ +pgamma.o: gamma.c gamma.h util.h + $(PCOMPILE) -c gamma.c && mv gamma.o $@ +psched.o: sched.c sched.h ppuzzle.h + $(PCOMPILE) -c sched.c && mv sched.o $@ +ppuzzle.o: ppuzzle.c ppuzzle.h ml.h util.h puzzle.h gamma.h sched.h + $(PCOMPILE) -c ppuzzle.c + diff --git a/forester/archive/RIO/others/puzzle_mod/src/Makefile.in b/forester/archive/RIO/others/puzzle_mod/src/Makefile.in new file mode 100644 index 0000000..ab15dd4 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/Makefile.in @@ -0,0 +1,356 @@ +# Makefile.in generated automatically by automake 1.4 from 
Makefile.am + +# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + +SHELL = @SHELL@ + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ + +bindir = @bindir@ +sbindir = @sbindir@ +libexecdir = @libexecdir@ +datadir = @datadir@ +sysconfdir = @sysconfdir@ +sharedstatedir = @sharedstatedir@ +localstatedir = @localstatedir@ +libdir = @libdir@ +infodir = @infodir@ +mandir = @mandir@ +includedir = @includedir@ +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ + +top_builddir = .. 
+ +ACLOCAL = @ACLOCAL@ +AUTOCONF = @AUTOCONF@ +AUTOMAKE = @AUTOMAKE@ +AUTOHEADER = @AUTOHEADER@ + +INSTALL = @INSTALL@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +transform = @program_transform_name@ + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +CC = @CC@ +MAKEINFO = @MAKEINFO@ +MPICC = @MPICC@ +MPICC0 = @MPICC0@ +MPICC1 = @MPICC1@ +MPICC2 = @MPICC2@ +MPICC3 = @MPICC3@ +MPICC4 = @MPICC4@ +MPICC5 = @MPICC5@ +MPICFLAGS = @MPICFLAGS@ +MPIDEFS = @MPIDEFS@ +MPILIBS = @MPILIBS@ +PACKAGE = @PACKAGE@ +PPUZZLE = @PPUZZLE@ +VERSION = @VERSION@ + +bin_PROGRAMS = puzzle @PPUZZLE@ +EXTRA_PROGRAMS = ppuzzle + +puzzle_SOURCES = gamma.c ml1.c ml2.c ml3.c model1.c model2.c puzzle1.c puzzle2.c util.c ml.h util.h puzzle.h gamma.h +puzzle_LDADD = sgamma.o sml1.o sml2.o sml3.o smodel1.o smodel2.o spuzzle1.o spuzzle2.o sutil.o + +SDEFS = +SCFLAGS = +SLDFLAGS = @LIBS@ + +SCOMPILE = $(CC) $(SDEFS) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(SCFLAGS) $(CFLAGS) +SCCLD = $(CC) +SLINK = $(SCCLD) $(AM_CFLAGS) $(CFLAGS) $(SLDFLAGS) $(LDFLAGS) + +ppuzzle_SOURCES = gamma.c ml1.c ml2.c ml3.c model1.c model2.c puzzle1.c puzzle2.c sched.c util.c ppuzzle.c ml.h util.h puzzle.h gamma.h ppuzzle.h sched.h +ppuzzle_LDADD = pgamma.o pml1.o pml2.o pml3.o pmodel1.o pmodel2.o ppuzzle1.o ppuzzle2.o psched.o putil.o ppuzzle.o + +PCC = @MPICC@ +PDEFS = -DPARALLEL +PCFLAGS = +PLDFLAGS = @LIBS@ @MPILIBS@ + +PCOMPILE = $(PCC) $(PDEFS) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(PCFLAGS) $(CFLAGS) +PCCLD = $(PCC) +PLINK = $(PCCLD) $(AM_CFLAGS) $(PCFLAGS) $(CFLAGS) $(PLDFLAGS) $(LDFLAGS) +mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs +CONFIG_CLEAN_FILES = test +PROGRAMS = $(bin_PROGRAMS) + + +DEFS = @DEFS@ -I. 
-I$(srcdir) +CPPFLAGS = @CPPFLAGS@ +LDFLAGS = @LDFLAGS@ +LIBS = @LIBS@ +ppuzzle_OBJECTS = gamma.o ml1.o ml2.o ml3.o model1.o model2.o puzzle1.o \ +puzzle2.o sched.o util.o ppuzzle.o +ppuzzle_DEPENDENCIES = pgamma.o pml1.o pml2.o pml3.o pmodel1.o \ +pmodel2.o ppuzzle1.o ppuzzle2.o psched.o putil.o ppuzzle.o +ppuzzle_LDFLAGS = +puzzle_OBJECTS = gamma.o ml1.o ml2.o ml3.o model1.o model2.o puzzle1.o \ +puzzle2.o util.o +puzzle_DEPENDENCIES = sgamma.o sml1.o sml2.o sml3.o smodel1.o smodel2.o \ +spuzzle1.o spuzzle2.o sutil.o +puzzle_LDFLAGS = +CFLAGS = @CFLAGS@ +COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +CCLD = $(CC) +LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(LDFLAGS) -o $@ +DIST_COMMON = README Makefile.am Makefile.in test.in + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = gtar +GZIP_ENV = --best +SOURCES = $(ppuzzle_SOURCES) $(puzzle_SOURCES) +OBJECTS = $(ppuzzle_OBJECTS) $(puzzle_OBJECTS) + +all: all-redirect +.SUFFIXES: +.SUFFIXES: .S .c .o .s +$(srcdir)/Makefile.in: Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu --include-deps src/Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +test: $(top_builddir)/config.status test.in + cd $(top_builddir) && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +mostlyclean-binPROGRAMS: + +clean-binPROGRAMS: + -test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS) + +distclean-binPROGRAMS: + +maintainer-clean-binPROGRAMS: + +install-binPROGRAMS: $(bin_PROGRAMS) + @$(NORMAL_INSTALL) + $(mkinstalldirs) $(DESTDIR)$(bindir) + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + if test -f $$p; then \ + echo " $(INSTALL_PROGRAM) $$p $(DESTDIR)$(bindir)/`echo $$p|sed 's/$(EXEEXT)$$//'|sed '$(transform)'|sed 's/$$/$(EXEEXT)/'`"; \ + $(INSTALL_PROGRAM) $$p $(DESTDIR)$(bindir)/`echo 
$$p|sed 's/$(EXEEXT)$$//'|sed '$(transform)'|sed 's/$$/$(EXEEXT)/'`; \ + else :; fi; \ + done + +uninstall-binPROGRAMS: + @$(NORMAL_UNINSTALL) + list='$(bin_PROGRAMS)'; for p in $$list; do \ + rm -f $(DESTDIR)$(bindir)/`echo $$p|sed 's/$(EXEEXT)$$//'|sed '$(transform)'|sed 's/$$/$(EXEEXT)/'`; \ + done + +.c.o: + $(COMPILE) -c $< + +.s.o: + $(COMPILE) -c $< + +.S.o: + $(COMPILE) -c $< + +mostlyclean-compile: + -rm -f *.o core *.core + +clean-compile: + +distclean-compile: + -rm -f *.tab.c + +maintainer-clean-compile: + +tags: TAGS + +ID: $(HEADERS) $(SOURCES) $(LISP) + list='$(SOURCES) $(HEADERS)'; \ + unique=`for i in $$list; do echo $$i; done | \ + awk ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + here=`pwd` && cd $(srcdir) \ + && mkid -f$$here/ID $$unique $(LISP) + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS)'; \ + unique=`for i in $$list; do echo $$i; done | \ + awk ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(ETAGS_ARGS)$$unique$(LISP)$$tags" \ + || (cd $(srcdir) && etags $(ETAGS_ARGS) $$tags $$unique $(LISP) -o $$here/TAGS) + +mostlyclean-tags: + +clean-tags: + +distclean-tags: + -rm -f TAGS ID + +maintainer-clean-tags: + +distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir) + +subdir = src + +distdir: $(DISTFILES) + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done +info-am: +info: info-am +dvi-am: +dvi: dvi-am +check-am: all-am +check: check-am +installcheck-am: +installcheck: installcheck-am +install-exec-am: install-binPROGRAMS +install-exec: install-exec-am + +install-data-am: +install-data: install-data-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-am 
+uninstall-am: uninstall-binPROGRAMS +uninstall: uninstall-am +all-am: Makefile $(PROGRAMS) +all-redirect: all-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: + $(mkinstalldirs) $(DESTDIR)$(bindir) + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-binPROGRAMS mostlyclean-compile \ + mostlyclean-tags mostlyclean-generic + +mostlyclean: mostlyclean-am + +clean-am: clean-binPROGRAMS clean-compile clean-tags clean-generic \ + mostlyclean-am + +clean: clean-am + +distclean-am: distclean-binPROGRAMS distclean-compile distclean-tags \ + distclean-generic clean-am + +distclean: distclean-am + +maintainer-clean-am: maintainer-clean-binPROGRAMS \ + maintainer-clean-compile maintainer-clean-tags \ + maintainer-clean-generic distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." 
+ +maintainer-clean: maintainer-clean-am + +.PHONY: mostlyclean-binPROGRAMS distclean-binPROGRAMS clean-binPROGRAMS \ +maintainer-clean-binPROGRAMS uninstall-binPROGRAMS install-binPROGRAMS \ +mostlyclean-compile distclean-compile clean-compile \ +maintainer-clean-compile tags mostlyclean-tags distclean-tags \ +clean-tags maintainer-clean-tags distdir info-am info dvi-am dvi check \ +check-am installcheck-am installcheck install-exec-am install-exec \ +install-data-am install-data install-am install uninstall-am uninstall \ +all-redirect all-am all installdirs mostlyclean-generic \ +distclean-generic clean-generic maintainer-clean-generic clean \ +mostlyclean distclean maintainer-clean + + +puzzle: $(puzzle_LDADD) $(puzzle_SOURCES) + $(SLINK) $(puzzle_LDADD) -o $@ + +sml1.o: ml1.c ml.h util.h + $(SCOMPILE) -c ml1.c && mv ml1.o $@ +sml2.o: ml2.c ml.h util.h + $(SCOMPILE) -c ml2.c && mv ml2.o $@ +sml3.o: ml3.c ml.h util.h gamma.h + $(SCOMPILE) -c ml3.c && mv ml3.o $@ +smodel1.o: model1.c ml.h util.h + $(SCOMPILE) -c model1.c && mv model1.o $@ +smodel2.o: model2.c ml.h util.h + $(SCOMPILE) -c model2.c && mv model2.o $@ +spuzzle1.o: puzzle1.c ml.h util.h puzzle.h gamma.h ppuzzle.h + $(SCOMPILE) -c puzzle1.c && mv puzzle1.o $@ +spuzzle2.o: puzzle2.c ml.h util.h puzzle.h ppuzzle.h + $(SCOMPILE) -c puzzle2.c && mv puzzle2.o $@ +sutil.o: util.c util.h + $(SCOMPILE) -c util.c && mv util.o $@ +sgamma.o: gamma.c gamma.h util.h + $(SCOMPILE) -c gamma.c && mv gamma.o $@ + +ppuzzle: $(ppuzzle_LDADD) $(ppuzzle_SOURCES) + $(PLINK) $(ppuzzle_LDADD) -o $@ + +pml1.o: ml1.c ml.h util.h + $(PCOMPILE) -c ml1.c && mv ml1.o $@ +pml2.o: ml2.c ml.h util.h + $(PCOMPILE) -c ml2.c && mv ml2.o $@ +pml3.o: ml3.c ml.h util.h gamma.h + $(PCOMPILE) -c ml3.c && mv ml3.o $@ +pmodel1.o: model1.c ml.h util.h + $(PCOMPILE) -c model1.c && mv model1.o $@ +pmodel2.o: model2.c ml.h util.h + $(PCOMPILE) -c model2.c && mv model2.o $@ +ppuzzle1.o: puzzle1.c ml.h util.h puzzle.h gamma.h ppuzzle.h + $(PCOMPILE) 
-c puzzle1.c && mv puzzle1.o $@ +ppuzzle2.o: puzzle2.c ml.h util.h puzzle.h ppuzzle.h + $(PCOMPILE) -c puzzle2.c && mv puzzle2.o $@ +putil.o: util.c util.h + $(PCOMPILE) -c util.c && mv util.o $@ +pgamma.o: gamma.c gamma.h util.h + $(PCOMPILE) -c gamma.c && mv gamma.o $@ +psched.o: sched.c sched.h ppuzzle.h + $(PCOMPILE) -c sched.c && mv sched.o $@ +ppuzzle.o: ppuzzle.c ppuzzle.h ml.h util.h puzzle.h gamma.h sched.h + $(PCOMPILE) -c ppuzzle.c + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/forester/archive/RIO/others/puzzle_mod/src/README b/forester/archive/RIO/others/puzzle_mod/src/README new file mode 100644 index 0000000..9c89883 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/README @@ -0,0 +1 @@ +Sources of the TREE-PUZZLE package diff --git a/forester/archive/RIO/others/puzzle_mod/src/gamma.c b/forester/archive/RIO/others/puzzle_mod/src/gamma.c new file mode 100644 index 0000000..ee1f6df --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/gamma.c @@ -0,0 +1,346 @@ +/* + * gamma.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. 
/* Gamma n-th moment
 *
 *   n      order of the moment
 *   shape  shape parameter of the distribution
 *
 * Returns the product of (shape + k)/shape for k = 1 .. n-1.
 * The product is empty for n < 2, in which case the result is 1.0.
 */
double momentGamma (int n, double shape)
{
	double product = 1.0;
	int k = 1;

	/* accumulate one factor per k in 1 .. n-1 */
	while (k < n)
	{
		product = product * ((shape + k)/shape);
		k++;
	}

	return product;
}
+ returns (-1) if in error + (1) series expansion if (alpha>x || x<=1) + (2) continued fraction otherwise + RATNEST FORTRAN by + Bhattacharjee GP (1970) The incomplete gamma integral. Applied Statistics, + 19: 285-287 (AS32) +*/ + int i; + double p=alpha, g=ln_gamma_alpha; + double accurate=1e-8, overflow=1e30; + double factor, gin=0, rn=0, a=0,b=0,an=0,dif=0, term=0, pn[6]; + + if (x==0) return (0); + if (x<0 || p<=0) return (-1); + + factor=exp(p*log(x)-x-g); + if (x>1 && x>=p) goto l30; + /* (1) series expansion */ + gin=1; term=1; rn=p; + l20: + rn++; + term*=x/rn; gin+=term; + + if (term > accurate) goto l20; + gin*=factor/p; + goto l50; + l30: + /* (2) continued fraction */ + a=1-p; b=a+x+1; term=0; + pn[0]=1; pn[1]=x; pn[2]=x+1; pn[3]=x*b; + gin=pn[2]/pn[3]; + l32: + a++; b+=2; term++; an=a*term; + for (i=0; i<2; i++) pn[i+4]=b*pn[i+2]-an*pn[i]; + if (pn[5] == 0) goto l35; + rn=pn[4]/pn[5]; dif=fabs(gin-rn); + if (dif>accurate) goto l34; + if (dif<=accurate*rn) goto l42; + l34: + gin=rn; + l35: + for (i=0; i<4; i++) pn[i]=pn[i+2]; + if (fabs(pn[4]) < overflow) goto l32; + for (i=0; i<4; i++) pn[i]/=overflow; + goto l32; + l42: + gin=1-factor*gin; + + l50: + return (gin); +} + + +/* functions concerning the CDF and percentage points of the gamma and + Chi2 distribution +*/ +static double PointNormal (double prob) +{ +/* returns z so that Prob{x.999998 || v<=0) return (-1); + + g = LnGamma (v/2); + xx=v/2; c=xx-1; + if (v >= -1.24*log(p)) goto l1; + + ch=pow((p*xx*exp(g+xx*aa)), 1/xx); + if (ch-e<0) return (ch); + goto l4; +l1: + if (v>.32) goto l3; + ch=0.4; a=log(1-p); +l2: + q=ch; p1=1+ch*(4.67+ch); p2=ch*(6.73+ch*(6.66+ch)); + t=-0.5+(4.67+2*ch)/p1 - (6.73+ch*(13.32+3*ch))/p2; + ch-=(1-exp(a+g+.5*ch+c*aa)*p2/p1)/t; + if (fabs(q/ch-1)-.01 <= 0) goto l4; + else goto l2; + +l3: + x=PointNormal (p); + p1=0.222222/v; ch=v*pow((x*sqrt(p1)+1-p1), 3.0); + if (ch>2.2*v+6) ch=-2*(log(1-p)-c*log(.5*ch)+g); +l4: + + do + { + q=ch; p1=.5*ch; + if ((t=IncompleteGamma 
/* Incomplete Gamma function Q(a,x)
   - this is a cleanroom implementation of NRs gammq(a,x)

   a   shape parameter
   x   upper limit of the integration

   Returns 1 - IncompleteGamma (x, a, LnGamma(a)).

   IncompleteGamma reports invalid arguments (x < 0 or a <= 0) by
   returning -1.  That sentinel is passed through unchanged, so the
   caller gets -1 on error instead of the meaningless value 2.0 that
   "1.0 - (-1)" would produce.
*/
double IncompleteGammaQ (double a, double x)
{
	double lower;

	lower = IncompleteGamma (x, a, LnGamma(a));

	/* propagate the error sentinel instead of turning it into 2.0 */
	if (lower < 0.0)
		return -1.0;

	return 1.0-lower;
}
expected frequency category (sum up to # samples) below 1.0 */ + if (below1 > 0) *chi2fail = TRUE; + /* no more than 1/5 of the frequency categories below 5.0 */ + if (below5 > (int) floor(samples/5.0)) *chi2fail = TRUE; + + return criticals; +} + + +/* chi square test + ef expected frequencies (sum up to 1 !!) + of observed frequencies (sum up to the number of samples) + numcat number of categories + returns critical significance level */ +double altchi2test(double *ef, int *of, int numcat, int *chi2fail) +{ + double chi2, criticals, efn; + int i, below1, below5; + int samples; + + *chi2fail = FALSE; + below1 = 0; + below5 = 0; + + /* compute number of samples */ + samples = 0; + for (i = 0; i < numcat; i++) + samples = samples + of[i]; + + /* compute chi square */ + chi2 = 0; + for (i = 0; i < numcat; i++) { + efn = ef[i]*((double) samples); + if (efn < 1.0) below1++; + if (efn < 5.0) below5++; + chi2 = chi2 + ((double) of[i]-efn)*((double) of[i]-efn)/efn; + } + + /* compute significance */ + criticals = chi2prob (numcat-1, chi2); + + /* no expected frequency category (sum up to # samples) below 1.0 */ + if (below1 > 0) *chi2fail = TRUE; + /* no more than 1/5 of the frequency categories below 5.0 */ + if (below5 > (int) floor(samples/5.0)) *chi2fail = TRUE; + + return criticals; +} diff --git a/forester/archive/RIO/others/puzzle_mod/src/gamma.h b/forester/archive/RIO/others/puzzle_mod/src/gamma.h new file mode 100644 index 0000000..975f4ee --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/gamma.h @@ -0,0 +1,30 @@ +/* + * gamma.h + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. 
#ifndef _GAMMA_
#define _GAMMA_

/* Gamma distribution routines.  Their definitions set the parameter
   meaning; only the types are declared here. */
double densityGamma (double, double);
double cdfGamma (double, double);
double icdfGamma (double, double);
double momentGamma (int, double);

double LnGamma (double);
/* Incomplete Gamma function Q(a,x) */
double IncompleteGammaQ (double a, double x);

/* probability that the observed chi-square exceeds chi2 even if the
   model is correct; deg = degrees of freedom */
double chi2prob (int deg, double chi2);

/* chi square tests: ef = expected frequencies (sum up to 1),
   of = observed frequencies, numcat = number of categories,
   chi2fail = output flag; both return the critical significance level.
   altchi2test() has external linkage in the implementation file, so it
   is declared here as well to give callers a proper prototype. */
double chi2test (double *ef, int *of, int numcat, int *chi2fail);
double altchi2test (double *ef, int *of, int numcat, int *chi2fail);


#endif /* _GAMMA_ */
/* One node record of a tree.
   NOTE(review): this header does not show how most fields are used; the
   hedged remarks below are assumptions to be confirmed against the code
   that reads and writes them. */
typedef struct node
{
	struct node *isop;      /* link to another node record - TODO confirm role */
	struct node *kinp;      /* link to another node record - TODO confirm role */
	int descen;
	int number;
	double length;          /* presumably branch length - confirm */
	double lengthc;         /* presumably the clock counterpart of length
	                           (cf. lklhd/lklhdc in Tree) - confirm */
	double varlen;
	double height;
	double varheight;
	ivector paths;
	cvector eprob;
	dcube partials;   /* partial likelihoods */
	char *label;      /* internal labels */
} Node;

/* A tree: its root record, the lists of its external and internal
   branches, and the likelihood values stored with it. */
typedef struct tree
{
	Node *rootp;
	Node **ebrnchp;   /* list of pointers to external branches */
	Node **ibrnchp;   /* list of pointers to internal branches */
	double lklhd;     /* total log-likelihood */
	double lklhdc;    /* total log-likelihood clock */
	dmatrix condlkl;  /* likelihoods for each pattern and non-zero rate */
	double rssleast;
} Tree;
nonclocklike computation */ +EXTERN cmatrix Identif; /* sequence names */ +EXTERN cmatrix Seqchar; /* ML sequence data */ +EXTERN cmatrix Seqpat; /* ordered site patterns */ +EXTERN ivector constpat; /* indicates constant site patterns */ +EXTERN cvector seqchi; +EXTERN cvector seqchj; +EXTERN dcube partiali; +EXTERN dcube partialj; +EXTERN dcube ltprobr; /* transition probabilites (for all non-zero rates */ +EXTERN dmatrix Distanmat; /* matrix with maximum likelihood distances */ +EXTERN dmatrix Evec; /* Eigenvectors */ +EXTERN dmatrix Ievc; /* Inverse eigenvectors */ +EXTERN double TSparam; /* Ts/Tv parameter */ +EXTERN double tsmean, yrmean; +EXTERN double YRparam; /* Y/R Ts parameter */ +EXTERN double geerr; /* estimated error of rate heterogeneity */ +EXTERN double Geta; /* rate heterogeneity parameter */ +EXTERN double fracconst; /* fraction of constant sites */ +EXTERN double fracconstpat;/* fraction of constant patterns */ +EXTERN double Proportion; /* for tree drawing */ +EXTERN double tserr; /* estimated error of TSparam */ +EXTERN double yrerr; /* estimated error of YRparam */ +EXTERN double fracinv; /* fraction of invariable sites */ +EXTERN double fierr; /* estimated error of fracinv */ +EXTERN dvector Brnlength; +EXTERN dvector Distanvec; +EXTERN dvector Eval; /* Eigenvalues of 1 PAM rate matrix */ +EXTERN dvector Freqtpm; /* base frequencies */ +EXTERN dvector Rates; /* rate of each of the categories */ +EXTERN dmatrix iexp; +EXTERN imatrix Basecomp; /* base composition of each taxon */ +EXTERN ivector usedtaxa; /* list needed in the input treefile procedure */ +EXTERN int numtc; /* auxiliary variable for printing rooted tree */ +EXTERN int qcalg_optn; /* use quartet subsampling algorithm */ +EXTERN int approxp_optn; /* approximate parameter estimation */ +EXTERN int chi2fail; /* flag for chi2 test */ +EXTERN int Converg; /* flag for ML convergence (no clock) */ +EXTERN int Convergc; /* flag for ML convergence (clock) */ +EXTERN int data_optn; /* 
type of sequence input data */ +EXTERN int Dayhf_optn; /* Dayhoff model */ +EXTERN int HKY_optn; /* use HKY model */ +EXTERN int Jtt_optn; /* JTT model */ +EXTERN int blosum62_optn; /* BLOSUM 62 model */ +EXTERN int mtrev_optn; /* mtREV model */ +EXTERN int cprev_optn; /* cpREV model */ +EXTERN int vtmv_optn; /* VT model */ +EXTERN int wag_optn; /* WAG model */ +EXTERN int Maxsite; /* number of ML characters per taxum */ +EXTERN int Maxspc; /* number of sequences */ +EXTERN int mlmode; /* quartet ML or user defined tree ML */ +EXTERN int nuc_optn; /* nucleotide (4x4) models */ +EXTERN int Numbrnch; /* number of branches of current tree */ +EXTERN int numcats; /* number of rate categories */ +EXTERN int Numconst; /* number of constant sites */ +EXTERN int Numconstpat; /* number of constant patterns */ +EXTERN int Numibrnch; /* number of internal branches of current tree */ +EXTERN int Numitc; /* number of ML iterations assumning clock */ +EXTERN int Numit; /* number of ML iterations if there is convergence */ +EXTERN int Numptrn; /* number of site patterns */ +EXTERN int Numspc; /* number of sequences of current tree */ +EXTERN int optim_optn; /* optimize model parameters */ +EXTERN int grate_optim; /* optimize Gamma rate heterogeneity parameter */ +EXTERN int SH_optn; /* SH nucleotide (16x16) model */ +EXTERN int TN_optn; /* use TN model */ +EXTERN int tpmradix; /* number of different states */ +EXTERN int fracinv_optim; /* optimize fraction of invariable sites */ +EXTERN int typ_optn; /* type of PUZZLE analysis */ +EXTERN ivector Weight; /* weight of each site pattern */ +EXTERN Tree *Ctree; /* pointer to current tree */ +EXTERN ulivector badtaxon; /* involment of each taxon in a bad quartet */ +EXTERN int qca, qcb, qcc, qcd; /* quartet currently optimized */ +EXTERN ivector Alias; /* link site -> corresponding site pattern */ +EXTERN ivector bestrate; /* optimal assignment of rates to sequence sites */ + +EXTERN int bestratefound; + +/* function prototypes of all 
ml function */ + +void convfreq(dvector); +void radixsort(cmatrix, ivector, int, int, int *); +void condenceseq(cmatrix, ivector, cmatrix, ivector, int, int, int); +void countconstantsites(cmatrix, ivector, int, int, int *, int*); +void evaluateseqs(void); +void elmhes(dmatrix, ivector, int); +void eltran(dmatrix, dmatrix, ivector, int); +void mcdiv(double, double, double, double, double *, double *); +void hqr2(int, int, int, dmatrix, dmatrix, dvector, dvector); +void onepamratematrix(dmatrix); +void eigensystem(dvector, dmatrix); +void luinverse(dmatrix, dmatrix, int); +void checkevector(dmatrix, dmatrix, int); +void tranprobmat(void); +void tprobmtrx(double, dmatrix); +double comptotloglkl(dmatrix); +void allsitelkl(dmatrix, dvector); +double pairlkl(double); +double mldistance(int, int); +void initdistan(void); +void computedistan(void); +void productpartials(Node *); +void partialsinternal(Node *); +void partialsexternal(Node *); +void initpartials(Tree *); +double intlkl(double); +void optinternalbranch(Node *); +double extlkl(double); +void optexternalbranch(Node *); +void finishlkl(Node *); +double optlkl(Tree *); +double treelkl(Tree *); +void luequation(dmatrix, dvector, int); +void lslength(Tree *, dvector, int, int, dvector); + +void getusertree(FILE *, cvector, int); +Node *internalnode(Tree *, char **, int *); +void constructtree(Tree *, cvector); +void removebasalbif(cvector); +void makeusertree(FILE *); +Tree *new_tree(int, int, cmatrix); +Tree *new_quartet(int, cmatrix); +void free_tree(Tree *, int); +void make_quartet(int, int, int, int); +void changedistan(dmatrix, dvector, int); +double quartet_lklhd(int, int, int, int); +double quartet_alklhd(int, int, int, int); +void readusertree(FILE *); +double usertree_lklhd(void); +double usertree_alklhd(void); +void mlstart(void); +void distupdate(int, int, int, int); +void mlfinish(void); +void prbranch(Node *, int, int, int, ivector, ivector, FILE *); +void getproportion(double *, dvector, int); +void 
prtopology(FILE *); +void fputphylogeny(FILE *); +void resulttree(FILE *); +void njtree(FILE *); +void njdistantree(Tree *); +void findbestratecombination(void); +void printbestratecombination(FILE *); +int checkedge(int); +void fputsubstree(FILE *, Node *); +void fputrooted(FILE *, int); +void findheights(Node *); +void initclock(int); +double clock_alklhd(int); +double heightlkl(double); +void optheight(void); +double rheightlkl(double); +void optrheight(void); +double clock_lklhd(int); +int findrootedge(void); +void resultheights(FILE *); + +double homogentest(int); +void YangDiscreteGamma(double, int, double *); +void updaterates(void); +void computestat(double *, int, double *, double *); +double quartetml(int, int, int, int); +double opttsq(double); +double optyrq(double); +void optimseqevolparamsq(void); +double opttst(double); +double optyrt(double); +void optimseqevolparamst(void); +double optfi(double); +double optge(double); +void optimrateparams(void); + +int gettpmradix(void); +void rtfdata(dmatrix, double *); +int code2int(cvector); +char *int2code(int); + +void jttdata(dmatrix, double *); +void dyhfdata(dmatrix, double *); +void mtrevdata(dmatrix, double *); +void cprev45data(dmatrix, double *); +void blosum62data(dmatrix, double *); +void vtmvdata(dmatrix, double *); +void wagdata(dmatrix, double *); + +#endif diff --git a/forester/archive/RIO/others/puzzle_mod/src/ml1.c b/forester/archive/RIO/others/puzzle_mod/src/ml1.c new file mode 100644 index 0000000..0e905ef --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/ml1.c @@ -0,0 +1,1734 @@ +/* + * ml1.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. 
+ */ + + +/******************************************************************************/ +/* definitions and prototypes */ +/******************************************************************************/ + +#define EXTERN extern + +/* prototypes */ +#include +#include +#include +#include +#include "util.h" +#include "ml.h" + +#define STDOUT stdout +#ifndef PARALLEL /* because printf() runs significantly faster */ + /* than fprintf(stdout) on an Apple McIntosh */ + /* (HS) */ +# define FPRINTF printf +# define STDOUTFILE +#else +# define FPRINTF fprintf +# define STDOUTFILE STDOUT, +#endif + + +/******************************************************************************/ +/* compacting sequence data information */ +/******************************************************************************/ + + +/***************************** internal functions *****************************/ + + +/* make all frequencies a little different */ +void convfreq(dvector freqemp) +{ + int i, j, maxi=0; + double freq, maxfreq, sum; + + + sum = 0.0; + maxfreq = 0.0; + for (i = 0; i < tpmradix; i++) { + freq = freqemp[i]; + if (freq < MINFREQ) freqemp[i] = MINFREQ; + if (freq > maxfreq) { + maxfreq = freq; + maxi = i; + } + sum += freqemp[i]; + } + freqemp[maxi] += 1.0 - sum; + + for (i = 0; i < tpmradix - 1; i++) { + for (j = i + 1; j < tpmradix; j++) { + if (freqemp[i] == freqemp[j]) { + freqemp[i] += MINFDIFF/2.0; + freqemp[j] -= MINFDIFF/2.0; + } + } + } +} + +/* sort site patters of original input data */ +void radixsort(cmatrix seqchar, ivector ali, int maxspc, int maxsite, + int *numptrn) +{ + int i, j, k, l, n, pass; + int *awork; + int *count; + + + awork = new_ivector(maxsite); + count = new_ivector(tpmradix+1); + for (i = 0; i < maxsite; i++) + ali[i] = i; + for (pass = maxspc - 1; pass >= 0; pass--) { + for (j = 0; j < tpmradix+1; j++) + count[j] = 0; + for (i = 0; i < maxsite; i++) + count[(int) seqchar[pass][ali[i]]]++; + for (j = 1; j < tpmradix+1; j++) + count[j] += 
count[j-1]; + for (i = maxsite-1; i >= 0; i--) + awork[ --count[(int) seqchar[pass][ali[i]]] ] = ali[i]; + for (i = 0; i < maxsite; i++) + ali[i] = awork[i]; + } + free_ivector(awork); + free_ivector(count); + n = 1; + for (j = 1; j < maxsite; j++) { + k = ali[j]; + l = ali[j-1]; + for (i = 0; i < maxspc; i++) { + if (seqchar[i][l] != seqchar[i][k]) { + n++; + break; + } + } + } + *numptrn = n; +} + + +void condenceseq(cmatrix seqchar, ivector ali, cmatrix seqconint, + ivector weight, int maxspc, int maxsite, int numptrn) +{ + int i, j, k, n; + int agree_flag; /* boolean */ + + + n = 0; + k = ali[n]; + for (i = 0; i < maxspc; i++) { + seqconint[i][n] = seqchar[i][k]; + } + weight[n] = 1; + Alias[k] = 0; + for (j = 1; j < maxsite; j++) { + k = ali[j]; + agree_flag = TRUE; + for (i = 0; i < maxspc; i++) { + if (seqconint[i][n] != seqchar[i][k]) { + agree_flag = FALSE; + break; + } + } + if (agree_flag == FALSE) { + n++; + for (i = 0; i < maxspc; i++) { + seqconint[i][n] = seqchar[i][k]; + } + weight[n] = 1; + Alias[k] = n; + } else { + weight[n]++; + Alias[k] = n; + } + } + n++; + if (numptrn != n) { + /* Problem in condenceseq */ + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR A TO DEVELOPERS\n\n\n"); + exit(1); + } +} + +void countconstantsites(cmatrix seqpat, ivector weight, int maxspc, int numptrn, + int *numconst, int *numconstpat) +{ + int character, s, i, constflag; + + *numconst = 0; + *numconstpat = 0; + for (s = 0; s < numptrn; s++) { /* check all patterns */ + constpat[s] = FALSE; + constflag = TRUE; + character = seqpat[0][s]; + for (i = 1; i < maxspc; i++) { + if (seqpat[i][s] != character) { + constflag = FALSE; + break; + } + } + if (character != tpmradix && constflag) { + (*numconst) = (*numconst) + weight[s]; + (*numconstpat)++; + constpat[s] = TRUE; + } + } +} + +/***************************** exported functions *****************************/ + + +void evaluateseqs() +{ + ivector ali; + + convfreq(Freqtpm); /* make all frequencies slightly 
/* Reduce the n x n matrix a in place by row/column interchanges followed
   by elimination below the first sub-diagonal.

   a     matrix, overwritten; the elimination multipliers are stored in the
         positions they clear (a[i][m-2])
   ordr  out: ordr[m-1] receives the 1-based row chosen as pivot in step m
         (all entries are zeroed first)
   n     order of the matrix

   m, and the values kept in i/j during the pivot search, are 1-based; every
   array access subtracts the offset explicitly.
   NOTE(review): this looks like a C translation of the EISPACK routine
   ELMHES (reduction to upper Hessenberg form) - confirm. */
void elmhes(dmatrix a, ivector ordr, int n)
{
	int m, j, i;
	double y, x;


	for (i = 0; i < n; i++)
		ordr[i] = 0;
	for (m = 2; m < n; m++) {
		/* pivot search: largest magnitude in column m-1 (1-based),
		   rows m..n (1-based) */
		x = 0.0;
		i = m;
		for (j = m; j <= n; j++) {
			if (fabs(a[j - 1][m - 2]) > fabs(x)) {
				x = a[j - 1][m - 2];
				i = j;
			}
		}
		ordr[m - 1] = i;      /* vector */
		if (i != m) {
			/* interchange rows i and m ... */
			for (j = m - 2; j < n; j++) {
				y = a[i - 1][j];
				a[i - 1][j] = a[m - 1][j];
				a[m - 1][j] = y;
			}
			/* ... and the corresponding columns */
			for (j = 0; j < n; j++) {
				y = a[j][i - 1];
				a[j][i - 1] = a[j][m - 1];
				a[j][m - 1] = y;
			}
		}
		if (x != 0.0) {
			/* from here on i is a 0-based row index below the pivot row */
			for (i = m; i < n; i++) {
				y = a[i][m - 2];
				if (y != 0.0) {
					y /= x;
					a[i][m - 2] = y;   /* keep the multiplier */
					for (j = m - 1; j < n; j++)
						a[i][j] -= y * a[m - 1][j];
					for (j = 0; j < n; j++)
						a[j][m - 1] += y * a[j][i];
				}
			}
		}
	}
}
/* complex division  (cr + i*ci) = (ar + i*ai) / (br + i*bi)

   ar, ai  real and imaginary part of the numerator
   br, bi  real and imaginary part of the denominator
   cr, ci  out: real and imaginary part of the quotient

   Numerator and denominator are first divided by |br| + |bi|, so the
   squared denominator terms are formed from scaled values.
   A zero denominator (br == 0 and bi == 0) is not handled. */
void mcdiv(double ar, double ai, double br, double bi,
	double *cr, double *ci)
{
	const double scale = fabs(br) + fabs(bi);
	const double num_re = ar / scale;
	const double num_im = ai / scale;
	const double den_re = br / scale;
	const double den_im = bi / scale;
	const double den_norm = den_re * den_re + den_im * den_im;
	double quot_re, quot_im;

	quot_re = (num_re * den_re + num_im * den_im) / den_norm;
	quot_im = (num_im * den_re - num_re * den_im) / den_norm;

	*cr = quot_re;
	*ci = quot_im;
}
h[m - 1][m]; + q = h[m][m] - z - r - s; + r = h[m + 1][m]; + s = fabs(p) + fabs(q) + fabs(r); + p /= s; + q /= s; + r /= s; + if (m == l) + break; + tst1 = fabs(p) * + (fabs(h[m - 2][m - 2]) + fabs(z) + fabs(h[m][m])); + tst2 = tst1 + fabs(h[m - 1][m - 2]) * (fabs(q) + fabs(r)); + if (tst2 == tst1) + break; + } + for (i = m + 2; i <= en; i++) { + h[i - 1][i - 3] = 0.0; + if (i != m + 2) + h[i - 1][i - 4] = 0.0; + } + for (k = m; k <= na; k++) { + notlas = (k != na); + if (k != m) { + p = h[k - 1][k - 2]; + q = h[k][k - 2]; + r = 0.0; + if (notlas) + r = h[k + 1][k - 2]; + x = fabs(p) + fabs(q) + fabs(r); + if (x != 0.0) { + p /= x; + q /= x; + r /= x; + } + } + if (x != 0.0) { + if (p < 0.0) /* sign */ + s = - sqrt(p * p + q * q + r * r); + else + s = sqrt(p * p + q * q + r * r); + if (k != m) + h[k - 1][k - 2] = -s * x; + else { + if (l != m) + h[k - 1][k - 2] = -h[k - 1][k - 2]; + } + p += s; + x = p / s; + y = q / s; + z = r / s; + q /= p; + r /= p; + if (!notlas) { + for (j = k - 1; j < n; j++) { /* row modification */ + p = h[k - 1][j] + q * h[k][j]; + h[k - 1][j] -= p * x; + h[k][j] -= p * y; + } + j = (en < (k + 3)) ? en : (k + 3); /* min */ + for (i = 0; i < j; i++) { /* column modification */ + p = x * h[i][k - 1] + y * h[i][k]; + h[i][k - 1] -= p; + h[i][k] -= p * q; + } + /* accumulate transformations */ + for (i = low - 1; i < hgh; i++) { + p = x * zz[i][k - 1] + y * zz[i][k]; + zz[i][k - 1] -= p; + zz[i][k] -= p * q; + } + } else { + for (j = k - 1; j < n; j++) { /* row modification */ + p = h[k - 1][j] + q * h[k][j] + r * h[k + 1][j]; + h[k - 1][j] -= p * x; + h[k][j] -= p * y; + h[k + 1][j] -= p * z; + } + j = (en < (k + 3)) ? 
en : (k + 3); /* min */ + for (i = 0; i < j; i++) { /* column modification */ + p = x * h[i][k - 1] + y * h[i][k] + z * h[i][k + 1]; + h[i][k - 1] -= p; + h[i][k] -= p * q; + h[i][k + 1] -= p * r; + } + /* accumulate transformations */ + for (i = low - 1; i < hgh; i++) { + p = x * zz[i][k - 1] + y * zz[i][k] + + z * zz[i][k + 1]; + zz[i][k - 1] -= p; + zz[i][k] -= p * q; + zz[i][k + 1] -= p * r; + } + } + } + } /* for k */ + } /* while infinite loop */ + if (l == en) { /* one root found */ + h[en - 1][en - 1] = x + t; + wr[en - 1] = h[en - 1][en - 1]; + wi[en - 1] = 0.0; + en = na; + continue; + } + y = h[na - 1][na - 1]; + w = h[en - 1][na - 1] * h[na - 1][en - 1]; + p = (y - x) / 2.0; + q = p * p + w; + z = sqrt(fabs(q)); + h[en - 1][en - 1] = x + t; + x = h[en - 1][en - 1]; + h[na - 1][na - 1] = y + t; + if (q >= 0.0) { /* real pair */ + if (p < 0.0) /* sign */ + z = p - fabs(z); + else + z = p + fabs(z); + wr[na - 1] = x + z; + wr[en - 1] = wr[na - 1]; + if (z != 0.0) + wr[en - 1] = x - w / z; + wi[na - 1] = 0.0; + wi[en - 1] = 0.0; + x = h[en - 1][na - 1]; + s = fabs(x) + fabs(z); + p = x / s; + q = z / s; + r = sqrt(p * p + q * q); + p /= r; + q /= r; + for (j = na - 1; j < n; j++) { /* row modification */ + z = h[na - 1][j]; + h[na - 1][j] = q * z + p * h[en - 1][j]; + h[en - 1][j] = q * h[en - 1][j] - p * z; + } + for (i = 0; i < en; i++) { /* column modification */ + z = h[i][na - 1]; + h[i][na - 1] = q * z + p * h[i][en - 1]; + h[i][en - 1] = q * h[i][en - 1] - p * z; + } + /* accumulate transformations */ + for (i = low - 1; i < hgh; i++) { + z = zz[i][na - 1]; + zz[i][na - 1] = q * z + p * zz[i][en - 1]; + zz[i][en - 1] = q * zz[i][en - 1] - p * z; + } + } else { /* complex pair */ + wr[na - 1] = x + p; + wr[en - 1] = x + p; + wi[na - 1] = z; + wi[en - 1] = -z; + } + en -= 2; + } /* while en >= low */ + /* backsubstitute to find vectors of upper triangular form */ + if (norm != 0.0) { + for (en = n; en >= 1; en--) { + p = wr[en - 1]; + q = wi[en - 1]; + 
na = en - 1; + if (q == 0.0) {/* real vector */ + m = en; + h[en - 1][en - 1] = 1.0; + if (na != 0) { + for (i = en - 2; i >= 0; i--) { + w = h[i][i] - p; + r = 0.0; + for (j = m - 1; j < en; j++) + r += h[i][j] * h[j][en - 1]; + if (wi[i] < 0.0) { + z = w; + s = r; + } else { + m = i + 1; + if (wi[i] == 0.0) { + t = w; + if (t == 0.0) { + tst1 = norm; + t = tst1; + do { + t = 0.01 * t; + tst2 = norm + t; + } while (tst2 > tst1); + } + h[i][en - 1] = -(r / t); + } else { /* solve real equations */ + x = h[i][i + 1]; + y = h[i + 1][i]; + q = (wr[i] - p) * (wr[i] - p) + wi[i] * wi[i]; + t = (x * s - z * r) / q; + h[i][en - 1] = t; + if (fabs(x) > fabs(z)) + h[i + 1][en - 1] = (-r - w * t) / x; + else + h[i + 1][en - 1] = (-s - y * t) / z; + } + /* overflow control */ + t = fabs(h[i][en - 1]); + if (t != 0.0) { + tst1 = t; + tst2 = tst1 + 1.0 / tst1; + if (tst2 <= tst1) { + for (j = i; j < en; j++) + h[j][en - 1] /= t; + } + } + } + } + } + } else if (q > 0.0) { + m = na; + if (fabs(h[en - 1][na - 1]) > fabs(h[na - 1][en - 1])) { + h[na - 1][na - 1] = q / h[en - 1][na - 1]; + h[na - 1][en - 1] = (p - h[en - 1][en - 1]) / + h[en - 1][na - 1]; + } else + mcdiv(0.0, -h[na - 1][en - 1], h[na - 1][na - 1] - p, q, + &h[na - 1][na - 1], &h[na - 1][en - 1]); + h[en - 1][na - 1] = 0.0; + h[en - 1][en - 1] = 1.0; + if (en != 2) { + for (i = en - 3; i >= 0; i--) { + w = h[i][i] - p; + ra = 0.0; + sa = 0.0; + for (j = m - 1; j < en; j++) { + ra += h[i][j] * h[j][na - 1]; + sa += h[i][j] * h[j][en - 1]; + } + if (wi[i] < 0.0) { + z = w; + r = ra; + s = sa; + } else { + m = i + 1; + if (wi[i] == 0.0) + mcdiv(-ra, -sa, w, q, &h[i][na - 1], + &h[i][en - 1]); + else { /* solve complex equations */ + x = h[i][i + 1]; + y = h[i + 1][i]; + vr = (wr[i] - p) * (wr[i] - p); + vr = vr + wi[i] * wi[i] - q * q; + vi = (wr[i] - p) * 2.0 * q; + if (vr == 0.0 && vi == 0.0) { + tst1 = norm * (fabs(w) + fabs(q) + fabs(x) + + fabs(y) + fabs(z)); + vr = tst1; + do { + vr = 0.01 * vr; + tst2 = tst1 + 
vr; + } while (tst2 > tst1); + } + mcdiv(x * r - z * ra + q * sa, + x * s - z * sa - q * ra, vr, vi, + &h[i][na - 1], &h[i][en - 1]); + if (fabs(x) > fabs(z) + fabs(q)) { + h[i + 1] + [na - 1] = (q * h[i][en - 1] - + w * h[i][na - 1] - ra) / x; + h[i + 1][en - 1] = (-sa - w * h[i][en - 1] - + q * h[i][na - 1]) / x; + } else + mcdiv(-r - y * h[i][na - 1], + -s - y * h[i][en - 1], z, q, + &h[i + 1][na - 1], &h[i + 1][en - 1]); + } + /* overflow control */ + t = (fabs(h[i][na - 1]) > fabs(h[i][en - 1])) ? + fabs(h[i][na - 1]) : fabs(h[i][en - 1]); + if (t != 0.0) { + tst1 = t; + tst2 = tst1 + 1.0 / tst1; + if (tst2 <= tst1) { + for (j = i; j < en; j++) { + h[j][na - 1] /= t; + h[j][en - 1] /= t; + } + } + } + } + } + } + } + } + /* end back substitution. vectors of isolated roots */ + for (i = 0; i < n; i++) { + if (i + 1 < low || i + 1 > hgh) { + for (j = i; j < n; j++) + zz[i][j] = h[i][j]; + } + } + /* multiply by transformation matrix to give vectors of + * original full matrix. */ + for (j = n - 1; j >= low - 1; j--) { + m = ((j + 1) < hgh) ? (j + 1) : hgh; /* min */ + for (i = low - 1; i < hgh; i++) { + z = 0.0; + for (k = low - 1; k < m; k++) + z += zz[i][k] * h[k][j]; + zz[i][j] = z; + } + } + } + return; +} + + +/* make rate matrix with 0.01 expected substitutions per unit time */ +void onepamratematrix(dmatrix a) +{ + int i, j; + double delta, temp, sum; + dvector m; + + for (i = 0; i < tpmradix; i++) + { + for (j = 0; j < tpmradix; j++) + { + a[i][j] = Freqtpm[j]*a[i][j]; + } + } + + m = new_dvector(tpmradix); + for (i = 0, sum = 0.0; i < tpmradix; i++) + { + for (j = 0, temp = 0.0; j < tpmradix; j++) + temp += a[i][j]; + m[i] = temp; /* row sum */ + sum += temp*Freqtpm[i]; /* exp. rate */ + } + delta = 0.01 / sum; /* 0.01 subst. 
per unit time */ + for (i = 0; i < tpmradix; i++) { + for (j = 0; j < tpmradix; j++) { + if (i != j) + a[i][j] = delta * a[i][j]; + else + a[i][j] = delta * (-m[i]); + } + } + free_dvector(m); +} + + +void eigensystem(dvector eval, dmatrix evec) +{ + dvector evali, forg; + dmatrix a, b; + ivector ordr; + int i, j, k, error; + double zero; + + + ordr = new_ivector(tpmradix); + evali = new_dvector(tpmradix); + forg = new_dvector(tpmradix); + a = new_dmatrix(tpmradix,tpmradix); + b = new_dmatrix(tpmradix,tpmradix); + + rtfdata(a, forg); /* get relative transition matrix and frequencies */ + + onepamratematrix(a); /* make 1 PAM rate matrix */ + + /* copy a to b */ + for (i = 0; i < tpmradix; i++) + for (j = 0; j < tpmradix; j++) + b[i][j] = a[i][j]; + + elmhes(a, ordr, tpmradix); /* compute eigenvalues and eigenvectors */ + eltran(a, evec, ordr, tpmradix); + hqr2(tpmradix, 1, tpmradix, a, evec, eval, evali); + + /* check eigenvalue equation */ + error = FALSE; + for (j = 0; j < tpmradix; j++) { + for (i = 0, zero = 0.0; i < tpmradix; i++) { + for (k = 0; k < tpmradix; k++) zero += b[i][k] * evec[k][j]; + zero -= eval[j] * evec[i][j]; + if (fabs(zero) > 1.0e-5) + error = TRUE; + } + } + if (error) + FPRINTF(STDOUTFILE "\nWARNING: Eigensystem doesn't satisfy eigenvalue equation!\n"); + + free_ivector(ordr); + free_dvector(evali); + free_dvector(forg); + free_dmatrix(a); + free_dmatrix(b); +} + + +void luinverse(dmatrix inmat, dmatrix imtrx, int size) +{ + double eps = 1.0e-20; /* ! 
*/ + int i, j, k, l, maxi=0, idx, ix, jx; + double sum, tmp, maxb, aw; + ivector index; + double *wk; + dmatrix omtrx; + + + index = new_ivector(tpmradix); + omtrx = new_dmatrix(tpmradix,tpmradix); + + /* copy inmat to omtrx */ + for (i = 0; i < tpmradix; i++) + for (j = 0; j < tpmradix; j++) + omtrx[i][j] = inmat[i][j]; + + wk = (double *) malloc((unsigned)size * sizeof(double)); + aw = 1.0; + for (i = 0; i < size; i++) { + maxb = 0.0; + for (j = 0; j < size; j++) { + if (fabs(omtrx[i][j]) > maxb) + maxb = fabs(omtrx[i][j]); + } + if (maxb == 0.0) { + /* Singular matrix */ + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR C TO DEVELOPERS\n\n\n"); + exit(1); + } + wk[i] = 1.0 / maxb; + } + for (j = 0; j < size; j++) { + for (i = 0; i < j; i++) { + sum = omtrx[i][j]; + for (k = 0; k < i; k++) + sum -= omtrx[i][k] * omtrx[k][j]; + omtrx[i][j] = sum; + } + maxb = 0.0; + for (i = j; i < size; i++) { + sum = omtrx[i][j]; + for (k = 0; k < j; k++) + sum -= omtrx[i][k] * omtrx[k][j]; + omtrx[i][j] = sum; + tmp = wk[i] * fabs(sum); + if (tmp >= maxb) { + maxb = tmp; + maxi = i; + } + } + if (j != maxi) { + for (k = 0; k < size; k++) { + tmp = omtrx[maxi][k]; + omtrx[maxi][k] = omtrx[j][k]; + omtrx[j][k] = tmp; + } + aw = -aw; + wk[maxi] = wk[j]; + } + index[j] = maxi; + if (omtrx[j][j] == 0.0) + omtrx[j][j] = eps; + if (j != size - 1) { + tmp = 1.0 / omtrx[j][j]; + for (i = j + 1; i < size; i++) + omtrx[i][j] *= tmp; + } + } + for (jx = 0; jx < size; jx++) { + for (ix = 0; ix < size; ix++) + wk[ix] = 0.0; + wk[jx] = 1.0; + l = -1; + for (i = 0; i < size; i++) { + idx = index[i]; + sum = wk[idx]; + wk[idx] = wk[i]; + if (l != -1) { + for (j = l; j < i; j++) + sum -= omtrx[i][j] * wk[j]; + } else if (sum != 0.0) + l = i; + wk[i] = sum; + } + for (i = size - 1; i >= 0; i--) { + sum = wk[i]; + for (j = i + 1; j < size; j++) + sum -= omtrx[i][j] * wk[j]; + wk[i] = sum / omtrx[i][i]; + } + for (ix = 0; ix < size; ix++) + imtrx[ix][jx] = wk[ix]; + } + free((char *)wk); + wk = 
NULL; + free_ivector(index); + free_dmatrix(omtrx); +} + + +void checkevector(dmatrix evec, dmatrix ivec, int nn) +{ + int i, j, ia, ib, ic, error; + dmatrix matx; + double sum; + + + matx = new_dmatrix(nn, nn); + /* multiply matrix of eigenvectors and its inverse */ + for (ia = 0; ia < nn; ia++) { + for (ic = 0; ic < nn; ic++) { + sum = 0.0; + for (ib = 0; ib < nn; ib++) sum += evec[ia][ib] * ivec[ib][ic]; + matx[ia][ic] = sum; + } + } + /* check whether the unitary matrix is obtained */ + error = FALSE; + for (i = 0; i < nn; i++) { + for (j = 0; j < nn; j++) { + if (i == j) { + if (fabs(matx[i][j] - 1.0) > 1.0e-5) + error = TRUE; + } else { + if (fabs(matx[i][j]) > 1.0e-5) + error = TRUE; + } + } + } + if (error) { + FPRINTF(STDOUTFILE "\nWARNING: Inversion of eigenvector matrix not perfect!\n"); + } + free_dmatrix(matx); +} + + +/***************************** exported functions *****************************/ + + +/* compute 1 PAM rate matrix, its eigensystem, and the inverse matrix thereof */ +void tranprobmat() +{ + eigensystem(Eval, Evec); /* eigensystem of 1 PAM rate matrix */ + luinverse(Evec, Ievc, tpmradix); /* inverse eigenvectors are in Ievc */ + checkevector(Evec, Ievc, tpmradix); /* check whether inversion was OK */ +} + + +/* compute P(t) */ +void tprobmtrx(double arc, dmatrix tpr) +{ + register int i, j, k; + register double temp; + + + for (k = 0; k < tpmradix; k++) { + temp = exp(arc * Eval[k]); + for (j = 0; j < tpmradix; j++) + iexp[k][j] = Ievc[k][j] * temp; + } + for (i = 0; i < tpmradix; i++) { + for (j = 0; j < tpmradix; j++) { + temp = 0.0; + for (k = 0; k < tpmradix; k++) + temp += Evec[i][k] * iexp[k][j]; + tpr[i][j] = fabs(temp); + } + } +} + + +/******************************************************************************/ +/* estimation of maximum likelihood distances */ +/******************************************************************************/ + +/* compute total log-likelihood + input: likelihoods for each site and non-zero 
rate + output: total log-likelihood (incl. zero rate category) */ +double comptotloglkl(dmatrix cdl) +{ + int k, r; + double loglkl, fv, fv2, sitelkl; + + loglkl = 0.0; + fv = 1.0-fracinv; + fv2 = (1.0-fracinv)/(double) numcats; + + if (numcats == 1) { + + for (k = 0; k < Numptrn; k++) { + + /* compute likelihood for pattern k */ + sitelkl = cdl[0][k]*fv; + if (constpat[k] == TRUE) + sitelkl += fracinv*Freqtpm[(int) Seqpat[0][k]]; + + /* total log-likelihood */ + loglkl += log(sitelkl)*Weight[k]; + + } + + } else { + + for (k = 0; k < Numptrn; k++) { + + /* this general routine works always but it's better + to run it only when it's really necessary */ + + /* compute likelihood for pattern k */ + sitelkl = 0.0; + for (r = 0; r < numcats; r++) + sitelkl += cdl[r][k]; + sitelkl = fv2*sitelkl; + if (constpat[k] == TRUE) + sitelkl += fracinv*Freqtpm[(int) Seqpat[0][k]]; + + /* total log-likelihood */ + loglkl += log(sitelkl)*Weight[k]; + + } + + } + + return loglkl; +} + + +/* computes the site log-likelihoods + input: likelihoods for each site and non-zero rate + output: log-likelihood for each site */ +void allsitelkl(dmatrix cdl, dvector aslkl) +{ + int k, r; + double fv, fv2, sitelkl; + + fv = 1.0-fracinv; + fv2 = (1.0-fracinv)/(double) numcats; + + if (numcats == 1) { + + for (k = 0; k < Numptrn; k++) { + + /* compute likelihood for pattern k */ + sitelkl = cdl[0][k]*fv; + if (constpat[k] == TRUE) + sitelkl += fracinv*Freqtpm[(int) Seqpat[0][k]]; + + /* site log-likelihood */ + aslkl[k] = log(sitelkl); + } + + } else { + + for (k = 0; k < Numptrn; k++) { + + /* this general routine works always but it's better + to run it only when it's really necessary */ + + /* compute likelihood for pattern k */ + sitelkl = 0.0; + for (r = 0; r < numcats; r++) + sitelkl += cdl[r][k]; + sitelkl = fv2*sitelkl; + if (constpat[k] == TRUE) + sitelkl += fracinv*Freqtpm[(int) Seqpat[0][k]]; + + /* total log-likelihood */ + aslkl[k] = log(sitelkl); + + } + } +} + + 
+/***************************** internal functions *****************************/ + +/* compute negative log-likelihood of distance arc between sequences seqchi/j */ +double pairlkl(double arc) +{ + int k, r, ci, cj; + double loglkl, fv, sitelkl; + + + /* compute tpms */ + for (r = 0; r < numcats; r++) + /* compute tpm for rate category r */ + tprobmtrx(arc*Rates[r], ltprobr[r]); + + loglkl = 0.0; + fv = 1.0-fracinv; + + if (numcats == 1) { + + for (k = 0; k < Numptrn; k++) { + + /* compute likelihood for site k */ + ci = seqchi[k]; + cj = seqchj[k]; + if (ci != tpmradix && cj != tpmradix) + sitelkl = ltprobr[0][ci][cj]*fv; + else + sitelkl = fv; + if (ci == cj && ci != tpmradix) + sitelkl += fracinv*Freqtpm[ci]; + + /* total log-likelihood */ + loglkl += log(sitelkl)*Weight[k]; + + } + + } else { + + for (k = 0; k < Numptrn; k++) { + + /* this general routine works always but it's better + to run it only when it's really necessary */ + + /* compute likelihood for site k */ + ci = seqchi[k]; + cj = seqchj[k]; + if (ci != tpmradix && cj != tpmradix) { + sitelkl = 0.0; + for (r = 0; r < numcats; r++) + sitelkl += ltprobr[r][ci][cj]; + sitelkl = fv*sitelkl/(double) numcats; + } else + sitelkl = fv; + if (ci == cj && ci != tpmradix) + sitelkl += fracinv*Freqtpm[ci]; + + /* total log-likelihood */ + loglkl += log(sitelkl)*Weight[k]; + + } + + } + + /* return negative log-likelihood as we use a minimizing procedure */ + return -loglkl; +} + + +/***************************** exported functions *****************************/ + + +/* maximum likelihood distance between sequence i and j */ +double mldistance(int i, int j) +{ + double dist, fx, f2x; + + if (i == j) return 0.0; + + /* use old distance as start value */ + dist = Distanmat[i][j]; + + if (dist == 0.0) return 0.0; + + seqchi = Seqpat[i]; + seqchj = Seqpat[j]; + + if (dist <= MINARC) dist = MINARC+1.0; + if (dist >= MAXARC) dist = MAXARC-1.0; + + dist = onedimenmin(MINARC, dist, MAXARC, pairlkl, EPSILON, &fx, 
&f2x); + + return dist; +} + + +/* initialize distance matrix */ +void initdistan() +{ + int i, j, k, diff, x, y; + double obs, temp; + + for (i = 0; i < Maxspc; i++) { + Distanmat[i][i] = 0.0; + for (j = i + 1; j < Maxspc; j++) { + seqchi = Seqpat[i]; + seqchj = Seqpat[j]; + + /* count observed differences */ + diff = 0; + for (k = 0; k < Numptrn; k++) { + x = seqchi[k]; + y = seqchj[k]; + if (x != y && + x != tpmradix && + y != tpmradix) + diff += Weight[k]; + } + if (diff == 0) + Distanmat[i][j] = 0.0; + else { + /* use generalized JC correction to get first estimate + (for the SH model the observed distance is used) */ + /* observed distance */ + obs = (double) diff / (double) Maxsite; + temp = 1.0 - (double) obs*tpmradix/(tpmradix-1.0); + if (temp > 0.0 && !(data_optn == 0 && SH_optn)) + /* use JC corrected distance */ + Distanmat[i][j] = -100.0*(tpmradix-1.0)/tpmradix * log(temp); + else + /* use observed distance */ + Distanmat[i][j] = obs * 100.0; + if (Distanmat[i][j] < MINARC) Distanmat[i][j] = MINARC; + if (Distanmat[i][j] > MAXARC) Distanmat[i][j] = MAXARC; + } + Distanmat[j][i] = Distanmat[i][j]; + } + } +} + +/* compute distance matrix */ +void computedistan() +{ + int i, j; + + for (i = 0; i < Maxspc; i++) + for (j = i; j < Maxspc; j++) { + Distanmat[i][j] = mldistance(i, j); + Distanmat[j][i] = Distanmat[i][j]; + } +} + + +/******************************************************************************/ +/* computation of maximum likelihood edge lengths for a given tree */ +/******************************************************************************/ + + +/***************************** internal functions *****************************/ + + +/* multiply partial likelihoods */ +void productpartials(Node *op) +{ + Node *cp; + int i, j, r; + dcube opc, cpc; + + cp = op; + opc = op->partials; + while (cp->isop->isop != op) { + cp = cp->isop; + cpc = cp->partials; + for (r = 0; r < numcats; r++) + for (i = 0; i < Numptrn; i++) + for (j = 0; j < tpmradix; 
j++) + opc[r][i][j] *= cpc[r][i][j]; + } +} + + +/* compute internal partial likelihoods */ +void partialsinternal(Node *op) +{ + int i, j, k, r; + double sum; + dcube oprob, cprob; + + if (clockmode == 1) { /* clocklike branch lengths */ + for (r = 0; r < numcats; r++) { + tprobmtrx((op->lengthc)*Rates[r], ltprobr[r]); + } + } else { /* non-clocklike branch lengths */ + for (r = 0; r < numcats; r++) { + tprobmtrx((op->length)*Rates[r], ltprobr[r]); + } + } + + oprob = op->partials; + cprob = op->kinp->isop->partials; + for (r = 0; r < numcats; r++) { + for (k = 0; k < Numptrn; k++) { + for (i = 0; i < tpmradix; i++) { + sum = 0.0; + for (j = 0; j < tpmradix; j++) + sum += ltprobr[r][i][j] * cprob[r][k][j]; + oprob[r][k][i] = sum; + } + } + } +} + + +/* compute external partial likelihoods */ +void partialsexternal(Node *op) +{ + int i, j, k, r; + dcube oprob; + cvector dseqi; + + if (clockmode == 1) { /* clocklike branch lengths */ + for (r = 0; r < numcats; r++) { + tprobmtrx((op->lengthc)*Rates[r], ltprobr[r]); + } + } else { /* nonclocklike branch lengths */ + for (r = 0; r < numcats; r++) { + tprobmtrx((op->length)*Rates[r], ltprobr[r]); + } + } + + oprob = op->partials; + dseqi = op->kinp->eprob; + for (r = 0; r < numcats; r++) { + for (k = 0; k < Numptrn; k++) { + if ((j = dseqi[k]) == tpmradix) { + for (i = 0; i < tpmradix; i++) + oprob[r][k][i] = 1.0; + } else { + for (i = 0; i < tpmradix; i++) + oprob[r][k][i] = ltprobr[r][i][j]; + } + } + } +} + + +/* compute all partial likelihoods */ +void initpartials(Tree *tr) +{ + Node *cp, *rp; + + cp = rp = tr->rootp; + do { + cp = cp->isop->kinp; + if (cp->isop == NULL) { /* external node */ + cp = cp->kinp; /* not descen */ + partialsexternal(cp); + } else { /* internal node */ + if (!cp->descen) { + productpartials(cp->kinp->isop); + partialsinternal(cp); + } + } + } while (cp != rp); +} + + +/* compute log-likelihood given internal branch with length arc + between partials partiali and partials partialj */ 
+double intlkl(double arc) +{ + double sumlk, slk; + int r, s, i, j; + dmatrix cdl; + + cdl = Ctree->condlkl; + for (r = 0; r < numcats; r++) { + tprobmtrx(arc*Rates[r], ltprobr[r]); + } + for (r = 0; r < numcats; r++) { + for (s = 0; s < Numptrn; s++) { + sumlk = 0.0; + for (i = 0; i < tpmradix; i++) { + slk = 0.0; + for (j = 0; j < tpmradix; j++) + slk += partialj[r][s][j] * ltprobr[r][i][j]; + sumlk += Freqtpm[i] * partiali[r][s][i] * slk; + } + cdl[r][s] = sumlk; + } + } + + /* compute total log-likelihood for current tree */ + Ctree->lklhd = comptotloglkl(cdl); + + return -(Ctree->lklhd); /* we use a minimizing procedure */ +} + + +/* optimize internal branch */ +void optinternalbranch(Node *op) +{ + double arc, fx, f2x; + + partiali = op->isop->partials; + partialj = op->kinp->isop->partials; + arc = op->length; /* nonclocklike branch lengths */ + if (arc <= MINARC) arc = MINARC+1.0; + if (arc >= MAXARC) arc = MAXARC-1.0; + arc = onedimenmin(MINARC, arc, MAXARC, intlkl, EPSILON, &fx, &f2x); + op->kinp->length = arc; + op->length = arc; + + /* variance of branch length */ + f2x = fabs(f2x); + if (1.0/(MAXARC*MAXARC) < f2x) + op->varlen = 1.0/f2x; + else + op->varlen = MAXARC*MAXARC; +} + + +/* compute log-likelihood given external branch with length arc + between partials partiali and sequence seqchi */ +double extlkl(double arc) +{ + double sumlk; + int r, s, i, j; + dvector opb; + dmatrix cdl; + + cdl = Ctree->condlkl; + for (r = 0; r < numcats; r++) { + tprobmtrx(arc*Rates[r], ltprobr[r]); + } + for (r = 0; r < numcats; r++) { + for (s = 0; s < Numptrn; s++) { + opb = partiali[r][s]; + sumlk = 0.0; + if ((j = seqchi[s]) != tpmradix) { + for (i = 0; i < tpmradix; i++) + sumlk += (Freqtpm[i] * (opb[i] * ltprobr[r][i][j])); + } else { + for (i = 0; i < tpmradix; i++) + sumlk += Freqtpm[i] * opb[i]; + } + cdl[r][s] = sumlk; + } + } + + /* compute total log-likelihood for current tree */ + Ctree->lklhd = comptotloglkl(cdl); + + return -(Ctree->lklhd); /* we use 
a minimizing procedure */ +} + +/* optimize external branch */ +void optexternalbranch(Node *op) +{ + double arc, fx, f2x; + + partiali = op->isop->partials; + seqchi = op->kinp->eprob; + arc = op->length; /* nonclocklike branch lengths */ + if (arc <= MINARC) arc = MINARC+1.0; + if (arc >= MAXARC) arc = MAXARC-1.0; + arc = onedimenmin(MINARC, arc, MAXARC, extlkl, EPSILON, &fx, &f2x); + op->kinp->length = arc; + op->length = arc; + + /* variance of branch length */ + f2x = fabs(f2x); + if (1.0/(MAXARC*MAXARC) < f2x) + op->varlen = 1.0/f2x; + else + op->varlen = MAXARC*MAXARC; +} + + +/* finish likelihoods for each rate and site */ +void finishlkl(Node *op) +{ + int r, k, i, j; + double arc, sumlk, slk; + dmatrix cdl; + + partiali = op->isop->partials; + partialj = op->kinp->isop->partials; + cdl = Ctree->condlkl; + arc = op->length; /* nonclocklike branch lengths */ + for (r = 0; r < numcats; r++) { + tprobmtrx(arc*Rates[r], ltprobr[r]); + } + for (r = 0; r < numcats; r++) { + for (k = 0; k < Numptrn; k++) { + sumlk = 0.0; + for (i = 0; i < tpmradix; i++) { + slk = 0.0; + for (j = 0; j < tpmradix; j++) + slk += partialj[r][k][j] * ltprobr[r][i][j]; + sumlk += Freqtpm[i] * partiali[r][k][i] * slk; + } + cdl[r][k] = sumlk; + } + } +} + + +/***************************** exported functions *****************************/ + + +/* optimize branch lengths to get maximum likelihood (nonclocklike branchs) */ +double optlkl(Tree *tr) +{ + Node *cp, *rp; + int nconv; + double lendiff; + + clockmode = 0; /* nonclocklike branch lengths */ + nconv = 0; + Converg = FALSE; + initpartials(tr); + for (Numit = 1; (Numit <= MAXIT) && (!Converg); Numit++) { + + cp = rp = tr->rootp; + do { + cp = cp->isop->kinp; + productpartials(cp->kinp->isop); + if (cp->isop == NULL) { /* external node */ + cp = cp->kinp; /* not descen */ + + lendiff = cp->length; + optexternalbranch(cp); + lendiff = fabs(lendiff - cp->length); + if (lendiff < EPSILON) nconv++; + else nconv = 0; + + 
partialsexternal(cp); + } else { /* internal node */ + if (cp->descen) { + partialsinternal(cp); + } else { + + lendiff = cp->length; + optinternalbranch(cp); + lendiff = fabs(lendiff - cp->length); + if (lendiff < EPSILON) nconv++; + else nconv = 0; + + /* eventually compute likelihoods for each site */ + if ((cp->number == Numibrnch-1 && lendiff < EPSILON) || + Numit == MAXIT-1) finishlkl(cp); + + partialsinternal(cp); + } + } + if (nconv >= Numbrnch) { /* convergence */ + Converg = TRUE; + cp = rp; /* get out of here */ + } + } while (cp != rp); + } + + /* compute total log-likelihood for current tree */ + return comptotloglkl(tr->condlkl); +} + + +/* compute likelihood of tree for given branch lengths */ +double treelkl(Tree *tr) +{ + int i, k, r; + Node *cp; + dmatrix cdl; + dcube prob1, prob2; + double sumlk; + + /* compute for each site and rate log-likelihoods */ + initpartials(tr); + cp = tr->rootp; + productpartials(cp->isop); + prob1 = cp->partials; + prob2 = cp->isop->partials; + cdl = tr->condlkl; + for (r = 0; r < numcats; r++) { + for (k = 0; k < Numptrn; k++) { + sumlk = 0.0; + for (i = 0; i < tpmradix; i++) + sumlk += Freqtpm[i] * (prob1[r][k][i] * prob2[r][k][i]); + cdl[r][k] = sumlk; + } + } + + /* return total log-likelihood for current tree */ + return comptotloglkl(cdl); +} + + +/******************************************************************************/ +/* least-squares estimate of branch lengths */ +/******************************************************************************/ + + +/***************************** internal functions *****************************/ + + +void luequation(dmatrix amat, dvector yvec, int size) +{ + double eps = 1.0e-20; /* ! 
*/ + int i, j, k, l, maxi=0, idx; + double sum, tmp, maxb, aw; + dvector wk; + ivector index; + + + wk = new_dvector(size); + index = new_ivector(size); + aw = 1.0; + for (i = 0; i < size; i++) { + maxb = 0.0; + for (j = 0; j < size; j++) { + if (fabs(amat[i][j]) > maxb) + maxb = fabs(amat[i][j]); + } + if (maxb == 0.0) { + /* Singular matrix */ + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR D TO DEVELOPERS\n\n\n"); + exit(1); + } + wk[i] = 1.0 / maxb; + } + for (j = 0; j < size; j++) { + for (i = 0; i < j; i++) { + sum = amat[i][j]; + for (k = 0; k < i; k++) + sum -= amat[i][k] * amat[k][j]; + amat[i][j] = sum; + } + maxb = 0.0; + for (i = j; i < size; i++) { + sum = amat[i][j]; + for (k = 0; k < j; k++) + sum -= amat[i][k] * amat[k][j]; + amat[i][j] = sum; + tmp = wk[i] * fabs(sum); + if (tmp >= maxb) { + maxb = tmp; + maxi = i; + } + } + if (j != maxi) { + for (k = 0; k < size; k++) { + tmp = amat[maxi][k]; + amat[maxi][k] = amat[j][k]; + amat[j][k] = tmp; + } + aw = -aw; + wk[maxi] = wk[j]; + } + index[j] = maxi; + if (amat[j][j] == 0.0) + amat[j][j] = eps; + if (j != size - 1) { + tmp = 1.0 / amat[j][j]; + for (i = j + 1; i < size; i++) + amat[i][j] *= tmp; + } + } + l = -1; + for (i = 0; i < size; i++) { + idx = index[i]; + sum = yvec[idx]; + yvec[idx] = yvec[i]; + if (l != -1) { + for (j = l; j < i; j++) + sum -= amat[i][j] * yvec[j]; + } else if (sum != 0.0) + l = i; + yvec[i] = sum; + } + for (i = size - 1; i >= 0; i--) { + sum = yvec[i]; + for (j = i + 1; j < size; j++) + sum -= amat[i][j] * yvec[j]; + yvec[i] = sum / amat[i][i]; + } + free_ivector(index); + free_dvector(wk); +} + + +/* least square estimation of branch lengths + used for the approximate ML and as starting point + in the calculation of the exact value of the ML */ +void lslength(Tree *tr, dvector distanvec, int numspc, int numibrnch, dvector Brnlength) +{ + int i, i1, j, j1, j2, k, numbrnch, numpair; + double sum, leng, alllen, rss; + ivector pths; + dmatrix atmt, atamt; + Node 
**ebp, **ibp; + + numbrnch = numspc + numibrnch; + numpair = (numspc * (numspc - 1)) / 2; + atmt = new_dmatrix(numbrnch, numpair); + atamt = new_dmatrix(numbrnch, numbrnch); + ebp = tr->ebrnchp; + ibp = tr->ibrnchp; + for (i = 0; i < numspc; i++) { + for (j1 = 1, j = 0; j1 < numspc; j1++) { + if (j1 == i) { + for (j2 = 0; j2 < j1; j2++, j++) { + atmt[i][j] = 1.0; + } + } else { + for (j2 = 0; j2 < j1; j2++, j++) { + if (j2 == i) + atmt[i][j] = 1.0; + else + atmt[i][j] = 0.0; + } + } + } + } + for (i1 = 0, i = numspc; i1 < numibrnch; i1++, i++) { + pths = ibp[i1]->paths; + for (j1 = 1, j = 0; j1 < numspc; j1++) { + for (j2 = 0; j2 < j1; j2++, j++) { + if (pths[j1] != pths[j2]) + atmt[i][j] = 1.0; + else + atmt[i][j] = 0.0; + } + } + } + for (i = 0; i < numbrnch; i++) { + for (j = 0; j <= i; j++) { + for (k = 0, sum = 0.0; k < numpair; k++) + sum += atmt[i][k] * atmt[j][k]; + atamt[i][j] = sum; + atamt[j][i] = sum; + } + } + for (i = 0; i < numbrnch; i++) { + for (k = 0, sum = 0.0; k < numpair; k++) + sum += atmt[i][k] * distanvec[k]; + Brnlength[i] = sum; + } + luequation(atamt, Brnlength, numbrnch); + for (i = 0, rss = 0.0; i < numpair; i++) { + sum = distanvec[i]; + for (j = 0; j < numbrnch; j++) { + if (atmt[j][i] == 1.0 && Brnlength[j] > 0.0) + sum -= Brnlength[j]; + } + rss += sum * sum; + } + tr->rssleast = sqrt(rss); + alllen = 0.0; + for (i = 0; i < numspc; i++) { + leng = Brnlength[i]; + alllen += leng; + if (leng < MINARC) leng = MINARC; + if (leng > MAXARC) leng = MAXARC; + if (clockmode) { /* clock */ + ebp[i]->lengthc = leng; + ebp[i]->kinp->lengthc = leng; + } else { /* no clock */ + ebp[i]->length = leng; + ebp[i]->kinp->length = leng; + } + Brnlength[i] = leng; + } + for (i = 0, j = numspc; i < numibrnch; i++, j++) { + leng = Brnlength[j]; + alllen += leng; + if (leng < MINARC) leng = MINARC; + if (leng > MAXARC) leng = MAXARC; + if (clockmode) { /* clock */ + ibp[i]->lengthc = leng; + ibp[i]->kinp->lengthc = leng; + } else { /* no clock */ + 
ibp[i]->length = leng; + ibp[i]->kinp->length = leng; + } + Brnlength[j] = leng; + } + free_dmatrix(atmt); + free_dmatrix(atamt); +} diff --git a/forester/archive/RIO/others/puzzle_mod/src/ml2.c b/forester/archive/RIO/others/puzzle_mod/src/ml2.c new file mode 100644 index 0000000..7ad780d --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/ml2.c @@ -0,0 +1,1871 @@ +/* + * ml2.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +/* Modified by Christian Zmasek to: + - Names of 26 chars. + + !WARNING: Use ONLY together with FORESTER/RIO! + !For all other puposes download the excellent original! + + last modification: 05/22/01 + + + Node *internalnode(Tree *tr, char **chpp, int *ninode): + + char ident[100], idcomp[11]; -> char ident[100], idcomp[27]; + + idcomp[10] = '\0'; -> idcomp[26] = '\0'; + + } while (!stop && (ff != 10)); -> } while (!stop && (ff != 26)); + + + +*/ + + + +#define EXTERN extern + +/* prototypes */ +#include +#include +#include +#include +#include +#include "util.h" +#include "ml.h" + +#define STDOUT stdout +#ifndef PARALLEL /* because printf() runs significantly faster */ + /* than fprintf(stdout) on an Apple McIntosh */ + /* (HS) */ +# define FPRINTF printf +# define STDOUTFILE +#else +# define FPRINTF fprintf +# define STDOUTFILE STDOUT, +#endif + +/* prototypes for two functions of puzzle2.c */ +void fputid10(FILE *, int); +int fputid(FILE *, int); + + +/******************************************************************************/ +/* user tree input */ +/******************************************************************************/ + +/* read user tree, drop all blanks, tabs, and newlines. 
+ Drop edgelengths (after :) but keep internal + labels. Check whether all pairs of brackets match. */ +void getusertree(FILE *itfp, cvector tr, int maxlen) +{ + int n, brac, ci; + int comment = 0; + + /* look for opening bracket */ + n = 0; + brac = 0; + do { + ci = fgetc(itfp); + if (ci == EOF) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (missing start bracket in tree)\n\n\n"); + exit(1); + } + if (ci == '[') comment = 1; + if ((ci == ']') && comment) { + comment = 0; + ci = fgetc(itfp); + } + } while (comment || ((char) ci != '(')); + tr[n] = (char) ci; + brac++; + + do { + /* get next character (skip blanks, newlines, and tabs) */ + do { + ci = fgetc(itfp); + if (ci == EOF) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (no more characters in tree)\n\n\n"); + exit(1); + } + if (ci == '[') comment = 1; + if ((ci == ']') && comment) { + comment = 0; + ci = fgetc(itfp); + } + } while (comment || (char) ci == ' ' || (char) ci == '\n' || (char) ci == '\t'); + + if ((char) ci == ':') { /* skip characters until a ,) appears */ + do { + ci = fgetc(itfp); + if (ci == EOF) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (missing ';' or ',' in tree)\n\n\n"); + exit(1); + } + if (ci == '[') comment = 1; + if ((ci == ']') && comment) { + comment = 0; + ci = fgetc(itfp); + } + } while (comment || ((char) ci != ',' && (char) ci != ')') ); + } + + if ((char) ci == '(') { + brac++; + } + if ((char) ci == ')') { + brac--; + } + + n++; + tr[n] = (char) ci; + + } while (((char) ci != ';') && (n != maxlen-2)); + + if (n == maxlen-2) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (tree description too long)\n\n\n"); + exit(1); + } + + if (brac != 0) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (brackets don't match in tree)\n\n\n"); + exit(1); + } + + n++; + tr[n] = '\0'; +} + + +Node *internalnode(Tree *tr, char **chpp, int *ninode) +{ + Node *xp, *np, *rp; + int i, j, dvg, ff, stop, numc; + char ident[100], idcomp[27]; /* CZ 05/22/01 */ + char *idp; + + (*chpp)++; + if 
(**chpp == '(') { /* process subgroup */ + + xp = internalnode(tr, chpp, ninode); + xp->isop = xp; + dvg = 1; + while (**chpp != ')') { + if (**chpp == '\0') { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (unexpected end of tree)\n\n\n"); + exit(1); + } + dvg++; + /* insert edges around node */ + np = internalnode(tr, chpp, ninode); + np->isop = xp->isop; + xp->isop = np; + xp = np; + } + /* closing bracket reached */ + + (*chpp)++; + if (dvg < 2) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (only one OTU inside pair of brackets)\n\n\n"); + exit(1); + } + + if ((*ninode) >= Maxspc-3) { /* all internal nodes already used */ + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (no unrooted tree)\n\n\n"); + exit(1); + } + + rp = tr->ibrnchp[*ninode]; + rp->isop = xp->isop; + xp->isop = rp; + + for (j = 0; j < Numspc; j++) + rp->paths[j] = 0; + xp = rp->isop; + while (xp != rp) { + for (j = 0; j < Numspc; j++) { + if (xp->paths[j] == 1) + rp->paths[j] = 1; + } + xp = xp->isop; + } + (*ninode)++; + + if ((**chpp) == ',' || (**chpp) == ')') return rp->kinp; + if ((**chpp) == '\0') { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (unexpected end of tree)\n\n\n"); + exit(1); + } + + /* read internal label into rp->label (max. 
20 characters) */ + rp->label = new_cvector(21); + (rp->label)[0] = **chpp; + (rp->label)[1] = '\0'; + for (numc = 1; numc < 20; numc++) { + (*chpp)++; + if ((**chpp) == ',' || (**chpp) == ')') return rp->kinp; + if ((**chpp) == '\0') { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (unexpected end of tree)\n\n\n"); + exit(1); + } + (rp->label)[numc] = **chpp; + (rp->label)[numc+1] = '\0'; + } + do { /* skip the rest of the internal label */ + (*chpp)++; + if ((**chpp) == '\0') { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (unexpected end of tree)\n\n\n"); + exit(1); + } + } while (((**chpp) != ',' && (**chpp) != ')')); + + return rp->kinp; + + } else { /* process species names */ + /* read species name */ + for (idp = ident; **chpp != ',' && + **chpp != ')' && **chpp != '\0'; (*chpp)++) { + *idp++ = **chpp; + } + *idp = '\0'; + /* look for internal number */ + idcomp[26] = '\0'; /* CZ 05/22/01 */ + + for (i = 0; i < Maxspc; i++) { + ff = 0; + stop = FALSE; + do { + idcomp[ff] = Identif[i][ff]; + ff++; + if (idcomp[ff-1] == ' ') stop = TRUE; + } while (!stop && (ff != 26)); /* CZ 05/22/01 */ + if (stop) idcomp[ff-1] = '\0'; + + if (!strcmp(ident, idcomp)) { + if (usedtaxa[i]) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (multiple occurence of sequence '"); + FPRINTF(STDOUTFILE "%s' in tree)\n\n\n", ident); + exit(1); + } + usedtaxa[i] = TRUE; + return tr->ebrnchp[i]->kinp; + } + } + + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (unknown sequence '%s' in tree)\n\n\n", ident); + exit(1); + } + return NULL; /* never returned but without some compilers complain */ +} + +/* make tree structure, the tree description may contain internal + labels but no edge lengths */ +void constructtree(Tree *tr, cvector strtree) +{ + char *chp; + int ninode, i; + int dvg, numc; + Node *xp, *np; + + ninode = 0; + chp = strtree; + usedtaxa = new_ivector(Maxspc); + for (i = 0; i < Maxspc; i++) usedtaxa[i] = FALSE; + + xp = internalnode(tr, &chp, &ninode); + xp->isop = xp; + dvg = 1; + 
while (*chp != ')') { /* look for closing bracket */ + if (*chp == '\0') { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (unexpected end of tree)\n\n\n"); + exit(1); + } + dvg++; + /* insert edges around node */ + np = internalnode(tr, &chp, &ninode); + np->isop = xp->isop; + xp->isop = np; + xp = np; + } + + for (i = 0; i < Maxspc; i++) + if (usedtaxa[i] == FALSE) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (sequences missing in tree)\n\n\n"); + exit(1); + } + + /* closing bracket reached */ + if (dvg < 3) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (no unrooted tree)\n\n\n"); + exit(1); + } + tr->rootp = xp; + Numibrnch = ninode; + Numbrnch = Numspc + ninode; + + chp++; + if (*chp == ';' || *chp == '\0') { + free_ivector(usedtaxa); + return; + } + + /* copy last internal label (max. 20 characters) */ + xp->label = new_cvector(21); + (xp->label)[0] = *chp; + (xp->label)[1] = '\0'; + for (numc = 1; numc < 20; numc++) { + chp++; + if (*chp == ';' || *chp == '\0') { + free_ivector(usedtaxa); + return; + } else { + (xp->label)[numc] = *chp; + (xp->label)[numc+1] = '\0'; + } + } + free_ivector(usedtaxa); + return; +} + + +/* remove possible basal bifurcation */ +void removebasalbif(cvector strtree) +{ + int n, c, brak, cutflag, h; + + /* check how many OTUs on basal level */ + n = 0; + c = 0; + brak = 0; + do { + if (strtree[n] == '(') brak++; + if (strtree[n] == ')') brak--; + + if (strtree[n] == ',' && brak == 1) c++; /* number of commas in outer bracket */ + + n++; + } while (strtree[n] != '\0'); + + /* if only 1 OTU inside outer bracket stop now */ + if (c == 0) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (Only 1 OTU inside outer bracket in tree)\n\n\n"); + exit(1); + } + + /* if only 2 OTUs inside outer bracket delete second pair of + brackets from the right to remove basal bifurcation */ + + if (c == 1) { + + n = 0; + brak = 0; + cutflag = 0; /* not yet cutted */ + h = 0; + do { + if (strtree[n] == '(') brak++; + if (strtree[n] == ')') brak--; + + if 
(brak == 2 && cutflag == 0) cutflag = 1; /* cutting */ + if (brak == 1 && cutflag == 1) { + cutflag = 2; /* cutted */ + /* leave out internal label */ + do { + h++; + } while (strtree[n+h] != ')' && strtree[n+h] != ','); + + } + + if (cutflag == 1) strtree[n] = strtree[n+1]; + if (cutflag == 2) strtree[n-1] = strtree[n+h]; + + n++; + } while (strtree[n] != '\0'); + } +} + + +void makeusertree(FILE *itfp) +{ + cvector strtree; + + strtree = new_cvector(23*Maxspc); /* for treefile */ + getusertree(itfp, strtree, 23*Maxspc); + removebasalbif(strtree); + constructtree(Ctree, strtree); + free_cvector(strtree); +} + + +/******************************************************************************/ +/* memory organisation for maximum likelihood tree */ +/******************************************************************************/ + +/* initialise new tree */ +Tree *new_tree(int maxspc, int numptrn, cmatrix seqconint) +{ + int n, i, maxibrnch; + Tree *tr; + Node *dp, *up; + + maxibrnch = maxspc - 3; + heights = (Node **) malloc((unsigned)(maxspc-2) * sizeof(Node *)); + if (heights == NULL) maerror("heights in new_tree"); + tr = (Tree *) malloc(sizeof(Tree)); + if (tr == NULL) maerror("tr in new_tree"); + tr->ebrnchp = (Node **) malloc((unsigned)maxspc * sizeof(Node *)); + if (tr->ebrnchp == NULL) maerror("ebrnchp in new_tree"); + tr->ibrnchp = (Node **) malloc((unsigned)maxibrnch * sizeof(Node *)); + if (tr->ibrnchp == NULL) maerror("ibrnchp in new_tree"); + tr->condlkl = new_dmatrix(numcats, numptrn); + for (n = 0; n < maxspc; n++) { + dp = (Node *) malloc(sizeof(Node)); + if (dp == NULL) maerror("dp in new_tree"); + up = (Node *) malloc(sizeof(Node)); + if (up == NULL) maerror("up in new_tree"); + dp->isop = NULL; + up->isop = NULL; + dp->kinp = up; + up->kinp = dp; + dp->descen = TRUE; + up->descen = FALSE; + dp->number = n; + up->number = n; + dp->length = 0.0; + up->length = 0.0; + dp->lengthc = 0.0; + up->lengthc = 0.0; + dp->varlen = 0.0; + up->varlen = 0.0; + 
dp->paths = new_ivector(maxspc); + up->paths = dp->paths; + for (i = 0; i < maxspc; i++) dp->paths[i] = 0; + dp->paths[n] = 1; + dp->eprob = seqconint[n]; + up->eprob = NULL; + dp->partials = NULL; + up->partials = new_dcube(numcats, numptrn, tpmradix); + tr->ebrnchp[n] = dp; + up->label = NULL; + dp->label = NULL; + } + for (n = 0; n < maxibrnch; n++) { + dp = (Node *) malloc(sizeof(Node)); + if (dp == NULL) maerror("dp in new_tree"); + up = (Node *) malloc(sizeof(Node)); + if (up == NULL) maerror("up in new_tree"); + dp->isop = NULL; + up->isop = NULL; + dp->kinp = up; + up->kinp = dp; + dp->descen = TRUE; + up->descen = FALSE; + dp->number = n; + up->number = n; + dp->length = 0.0; + up->length = 0.0; + dp->lengthc = 0.0; + up->lengthc = 0.0; + dp->varlen = 0.0; + up->varlen = 0.0; + dp->paths = new_ivector(maxspc); + up->paths = dp->paths; + for (i = 0; i < maxspc; i++) dp->paths[i] = 0; + dp->eprob = NULL; + up->eprob = NULL; + dp->partials = new_dcube(numcats, numptrn, tpmradix); + up->partials = new_dcube(numcats, numptrn, tpmradix); + tr->ibrnchp[n] = dp; + up->label = NULL; + dp->label = NULL; + } + tr->rootp = NULL; + + /* + * reserve memory for lengths of the tree branches + * and for the distance matrix as a vector + * (needed for LS estimation of tree branch lengths) + */ + + Brnlength = new_dvector(2 * maxspc - 3); + Distanvec = new_dvector((maxspc * (maxspc - 1)) / 2); + + return tr; +} + + +/* initialise quartet tree */ +Tree *new_quartet(int numptrn, cmatrix seqconint) +{ + int n, i; + Tree *tr; + Node *dp, *up; + + heights = (Node **) malloc((unsigned)2 * sizeof(Node *)); + if (heights == NULL) maerror("heights in new_quartet"); + /* reserve memory for tree */ + tr = (Tree *) malloc(sizeof(Tree)); + if (tr == NULL) maerror("tr in new_quartet"); + tr->ebrnchp = (Node **) malloc((unsigned) 4 * sizeof(Node *)); + if (tr->ebrnchp == NULL) maerror("ebrnchp in new_quartet"); + tr->ibrnchp = (Node **) malloc((unsigned) sizeof(Node *)); + if (tr->ibrnchp 
== NULL) maerror("ibrnchp in new_quartet"); + tr->condlkl = new_dmatrix(numcats, numptrn); + /* reserve memory for nodes */ + for (n = 0; n < 4; n++) { + dp = (Node *) malloc(sizeof(Node)); + if (dp == NULL) maerror("dp in new_quartet"); + up = (Node *) malloc(sizeof(Node)); + if (up == NULL) maerror("dp in new_quartet"); + dp->isop = NULL; + dp->kinp = up; + up->kinp = dp; + dp->descen = TRUE; + up->descen = FALSE; + dp->number = n; + up->number = n; + dp->length = 0.0; + up->length = 0.0; + dp->lengthc = 0.0; + up->lengthc = 0.0; + dp->varlen = 0.0; + up->varlen = 0.0; + dp->paths = new_ivector(4); + up->paths = dp->paths; + for (i = 0; i < 4; i++) dp->paths[i] = 0; + dp->paths[n] = 1; + dp->eprob = seqconint[n]; /* make quartet (0,1)-(2,3) as default */ + up->eprob = NULL; + dp->partials = NULL; + up->partials = new_dcube(numcats, numptrn, tpmradix); + tr->ebrnchp[n] = dp; + } + + /* reserve memory for internal branch */ + dp = (Node *) malloc(sizeof(Node)); + if (dp == NULL) maerror("dp in new_quartet"); + up = (Node *) malloc(sizeof(Node)); + if (up == NULL) maerror("dp in new_quartet"); + dp->isop = tr->ebrnchp[3]->kinp; /* connect internal branch */ + up->isop = tr->ebrnchp[0]->kinp; + dp->kinp = up; + up->kinp = dp; + dp->descen = TRUE; + up->descen = FALSE; + dp->number = 0; + up->number = 0; + dp->length = 0.0; + up->length = 0.0; + dp->lengthc = 0.0; + up->lengthc = 0.0; + dp->varlen = 0.0; + up->varlen = 0.0; + dp->paths = new_ivector(4); + up->paths = dp->paths; + up->paths[0] = 0; + up->paths[1] = 0; + up->paths[2] = 1; + up->paths[3] = 1; + dp->eprob = NULL; + up->eprob = NULL; + dp->partials = new_dcube(numcats, numptrn, tpmradix); + up->partials = new_dcube(numcats, numptrn, tpmradix); + tr->ibrnchp[0] = dp; + + /* place root */ + tr->rootp = up; + + /* connect external branches */ + tr->ebrnchp[0]->kinp->isop = tr->ebrnchp[1]->kinp; + tr->ebrnchp[1]->kinp->isop = tr->rootp; + tr->ebrnchp[3]->kinp->isop = tr->ebrnchp[2]->kinp; + 
tr->ebrnchp[2]->kinp->isop = tr->rootp->kinp; + + /* + * reserve memory for lengths of the five branches + * of a quartet and for the six possible distances + * (needed for LS estimation of branch lengths) + */ + Brnlength = new_dvector(NUMQBRNCH); + Distanvec = new_dvector(NUMQSPC*(NUMQSPC-1)/2); + + return tr; +} + + +/* free tree memory */ +void free_tree(Tree *tr, int taxa) +{ + int n; + Node *dp, *up; + + free(heights); + free_dmatrix(tr->condlkl); + for (n = 0; n < taxa; n++) { + dp = tr->ebrnchp[n]; + up = dp->kinp; + free_ivector(dp->paths); + free_dcube(up->partials); + free(dp); + free(up); + } + free(tr->ebrnchp); + for (n = 0; n < (taxa-3); n++) { + dp = tr->ibrnchp[n]; + up = dp->kinp; + free_dcube(dp->partials); + free_dcube(up->partials); + free_ivector(dp->paths); + free(dp); + free(up); + } + free(tr->ibrnchp); + free(tr); + free_dvector(Brnlength); /* branch lengths (for LS estimation) */ + free_dvector(Distanvec); /* distances (for LS estimation) */ +} + + +/* make (a,b)-(c,d) quartet + + a ---+ +--- c + +-----+ + b ---+ +--- d + + species numbers range from 0 to Maxspc - 1 */ + +void make_quartet(int a, int b, int c, int d) +{ + /* place sequences */ + Ctree->ebrnchp[0]->eprob = Seqpat[a]; + Ctree->ebrnchp[1]->eprob = Seqpat[b]; + Ctree->ebrnchp[2]->eprob = Seqpat[c]; + Ctree->ebrnchp[3]->eprob = Seqpat[d]; + + /* make distance vector */ + Distanvec[0] = Distanmat[b][a]; + Distanvec[1] = Distanmat[c][a]; + Distanvec[2] = Distanmat[c][b]; + Distanvec[3] = Distanmat[d][a]; + Distanvec[4] = Distanmat[d][b]; + Distanvec[5] = Distanmat[d][c]; +} + +/* write distance matrix as vector */ +void changedistan(dmatrix distanmat, dvector distanvec, int numspc) +{ + int i, j, k; + + for (k = 0, i = 1; i < numspc; i++) { + for (j = 0; j < i; j++, k++) + distanvec[k] = distanmat[i][j]; + } +} + + +/******************************************************************************/ +/* computation of maximum likelihood tree */ 
+/******************************************************************************/ + + +/* compute the likelihood for (a,b)-(c,d) quartet */ +double quartet_lklhd(int a, int b, int c, int d) +{ + /* reserve memory for quartet if necessary */ + if (mlmode != 1) { /* no quartet tree */ + if (Ctree != NULL) + free_tree(Ctree, Numspc); + Ctree = new_quartet(Numptrn, Seqpat); + Numbrnch = NUMQBRNCH; + Numibrnch = NUMQIBRNCH; + Numspc = NUMQSPC; + mlmode = 1; + } + + /* make (a,b)-(c,d) quartet */ + make_quartet(a,b,c,d); + + clockmode = 0; /* nonclocklike branch lengths */ + + /* least square estimate for branch length */ + lslength(Ctree, Distanvec, Numspc, Numibrnch, Brnlength); + + /* compute likelihood */ + Ctree->lklhd = optlkl(Ctree); + + return Ctree->lklhd; +} + + +/* compute the approximate likelihood for (a,b)-(c,d) quartet */ +double quartet_alklhd(int a, int b, int c, int d) +{ + /* reserve memory for quartet if necessary */ + if (mlmode != 1) { /* no quartet tree */ + if (Ctree != NULL) + free_tree(Ctree, Numspc); + Ctree = new_quartet(Numptrn, Seqpat); + Numbrnch = NUMQBRNCH; + Numibrnch = NUMQIBRNCH; + Numspc = NUMQSPC; + mlmode = 1; + } + + /* make (a,b)-(c,d) quartet */ + make_quartet(a,b,c,d); + + clockmode = 0; /* nonclocklike branch lengths */ + + /* least square estimate for branch length */ + lslength(Ctree, Distanvec, Numspc, Numibrnch, Brnlength); + + /* compute likelihood */ + Ctree->lklhd = treelkl(Ctree); + + return Ctree->lklhd; +} + + +/* read usertree from file to memory */ +void readusertree(FILE *ifp) +{ + /* reserve memory for tree if necessary */ + if (mlmode != 2) { /* no tree */ + if (Ctree != NULL) + free_tree(Ctree, Numspc); + Ctree = new_tree(Maxspc, Numptrn, Seqpat); + Numbrnch = 2*Maxspc-3; + Numibrnch = Maxspc-3; + Numspc = Maxspc; + mlmode = 2; + } + + /* read tree */ + makeusertree(ifp); +} + + +/* compute the likelihood of a usertree */ +double usertree_lklhd() +{ + /* be sure to have a usertree in memory and + to have 
/*
 * Preparation for ML analysis.
 *
 * Sets up the global state of the likelihood engine in a fixed order:
 * alphabet size, eigen-system work arrays, sequence pattern data, rate
 * categories, the pairwise distance matrix and finally a quartet tree
 * as the initial Ctree.  The statements depend on globals filled in by
 * earlier ones, so the order must be kept.  The vectors, matrices and
 * the tree created here are released again by mlfinish().
 */
void mlstart()
{
	/* number of states and code length */
	tpmradix = gettpmradix();

	/* declare variables */
	/* work arrays sized by the number of states; going by the names
	   these hold the eigen decomposition of the rate matrix -- TODO
	   confirm against tranprobmat() */
	Eval = new_dvector(tpmradix);
	Evec = new_dmatrix(tpmradix,tpmradix);
	Ievc = new_dmatrix(tpmradix,tpmradix);
	iexp = new_dmatrix(tpmradix,tpmradix);
	/* one entry per alignment site; printbestratecombination() uses it
	   to map a site to its pattern index */
	Alias = new_ivector(Maxsite);

	/* process sequence information */
	/* NOTE(review): evaluateseqs() presumably fills Alias and sets
	   Numptrn/Seqpat, which are used right below -- confirm */
	evaluateseqs();
	bestrate = new_ivector(Numptrn);

	/* compute transition probability matrix */
	tranprobmat();

	/* non-zero rate categories */
	Rates = new_dvector(numcats);
	updaterates();
	ltprobr = new_dcube(numcats, tpmradix,tpmradix);

	/* compute distance matrix */
	Distanmat = new_dmatrix(Maxspc, Maxspc);
	initdistan();

	/* initialize tree pointer for quartet tree */
	/* mlmode 1 is the value the quartet routines test for before
	   re-allocating Ctree */
	mlmode = 1;
	Ctree = new_quartet(Numptrn, Seqpat);
	Numbrnch = NUMQBRNCH;
	Numibrnch = NUMQIBRNCH;
	Numspc = NUMQSPC;

	/* computing ML distances */
	computedistan();
}
Distanmat[a][b]; + Distanmat[a][c] = mldistance(a, c); + Distanmat[c][a] = Distanmat[a][c]; + Distanmat[a][d] = mldistance(a, d); + Distanmat[d][a] = Distanmat[a][d]; + Distanmat[b][c] = mldistance(b, c); + Distanmat[c][b] = Distanmat[b][c]; + Distanmat[b][d] = mldistance(b, d); + Distanmat[d][b] = Distanmat[b][d]; + Distanmat[c][d] = mldistance(c, d); + Distanmat[d][c] = Distanmat[c][d]; +} + + +/* cleanup after ML analysis */ +void mlfinish() +{ + if (Ctree != NULL) + free_tree(Ctree, Numspc); + free_ivector(bestrate); + free_ivector(Alias); + free_cmatrix(Seqpat); + free_ivector(constpat); + free_ivector(Weight); + free_dmatrix(Distanmat); + free_dvector(Eval); + free_dmatrix(Evec); + free_dmatrix(Ievc); + free_dvector(Rates); + free_dcube(ltprobr); + free_dmatrix(iexp); +} + + +/******************************************************************************/ +/* tree output */ +/******************************************************************************/ + + +#define MAXOVER 50 +#define MAXLENG 30 +#define MAXCOLUMN 80 + + +void prbranch(Node *up, int depth, int m, int maxm, + ivector umbrella, ivector column, FILE *outfp) +{ + int i, num, n, maxn, lim; + Node *cp; + char bch; + + if ((int)((clockmode ? up->lengthc : up->length) * Proportion) >= MAXOVER) { + column[depth] = MAXLENG; + bch = '+'; + } else { + column[depth] = (int)((clockmode ? 
up->lengthc : up->length) * Proportion) + 3; + bch = '-'; + } + + if (up->isop == NULL) { /* external branch */ + num = up->number + 1; /* offset */ + if (m == 1) umbrella[depth - 1] = TRUE; + for (i = 0; i < depth; i++) { + if (umbrella[i]) + fprintf(outfp, "%*c", column[i], ':'); + else + fprintf(outfp, "%*c", column[i], ' '); + } + if (m == maxm) + umbrella[depth - 1] = FALSE; + for (i = 0, lim = column[depth] - 3; i < lim; i++) + fputc(bch, outfp); + fprintf(outfp, "-%d ", num); + + fputid(outfp, up->number); + + + fputc('\n', outfp); + fputc(' ', outfp); + return; + } + + num = up->number + 1 + Numspc; /* offset, internal branch */ + for (cp = up->isop, maxn = 0; cp != up; cp = cp->isop, maxn++) + ; + for (cp = up->isop, n = 1; cp != up; cp = cp->isop, n++) { + prbranch(cp->kinp, depth + 1, n, maxn, umbrella, column, outfp); + if (m == 1 && n == maxn / 2) umbrella[depth - 1] = TRUE; + if (n != maxn) { + for (i = 0; i < depth; i++) { + if (umbrella[i]) + fprintf(outfp, "%*c", column[i], ':'); + else + fprintf(outfp, "%*c", column[i], ' '); + } + if (n == maxn / 2) { /* internal branch */ + for (i = 0, lim = column[depth] - 3; i < lim; i++) + fputc(bch, outfp); + if (num < 10) + fprintf(outfp, "--%d", num); + else if (num < 100) + fprintf(outfp, "-%2d", num); + else + fprintf(outfp, "%3d", num); + } else { + if (umbrella[depth]) + fprintf(outfp, "%*c", column[depth], ':'); + else + fprintf(outfp, "%*c", column[depth], ' '); + } + fputc('\n', outfp); + fputc(' ', outfp); + } + if (m == maxm) umbrella[depth - 1] = FALSE; + } + return; +} + + +void getproportion(double *proportion, dvector distanvec, int numspc) +{ + int i, maxpair; + double maxdis; + + maxpair = (numspc*(numspc-1))/2; + + maxdis = 0.0; + for (i = 0; i < maxpair; i++) { + if (distanvec[i] > maxdis) { + maxdis = distanvec[i]; + } + } + *proportion = (double) MAXCOLUMN / (maxdis * 3.0); + if (*proportion > 1.0) *proportion = 1.0; +} + + +void prtopology(FILE *outfp) +{ + int n, maxn, depth; + ivector 
/*
 * Print the current tree (Ctree) as an unrooted, parenthesised tree
 * with branch lengths, terminated by ";" and a newline.
 *
 *   fp  output stream
 *
 * The tree is walked once around, starting and ending at Ctree->rootp.
 * Branch lengths are taken from lengthc when clockmode is set and from
 * length otherwise, and are multiplied by 0.01 before printing.
 * n is a running count of characters on the current output line; a
 * newline is inserted at certain points once it exceeds 60.
 */
void fputphylogeny(FILE *fp)
{
	Node *cp, *rp;
	int n;

	cp = rp = Ctree->rootp;
	putc('(', fp);
	n = 1;
	do {
		/* step to the next node of the ring and cross its branch */
		cp = cp->isop->kinp;
		if (cp->isop == NULL) { /* external node */
			if (n > 60) {
				fprintf(fp, "\n");
				n = 2;
			}
			/* presumably fputid returns the number of characters it
			   wrote -- TODO confirm */
			n += fputid(fp, cp->number);
			fprintf(fp, ":%.5f", ((clockmode ? cp->lengthc : cp->length))*0.01);
			/* assumes ":%.5f" prints 7 characters */
			n += 7;
			/* turn around: go back to the inner end of the branch */
			cp = cp->kinp;
		} else { /* internal node */
			if (cp->descen) {
				/* internal branch entered at its descen side:
				   a subtree opens */
				if (n > 60) {
					fprintf(fp, "\n");
					n = 1;
				}
				putc('(', fp);
				n++;
			} else {
				/* internal branch crossed from the other side:
				   the subtree closes */
				putc(')', fp);
				n++;
				if (n > 60) {
					fprintf(fp, "\n");
					n = 1;
				}
				/* internal label */
				if (cp->kinp->label != NULL) {
					fprintf(fp, "%s", cp->kinp->label);
					n += strlen(cp->kinp->label);
				}
				fprintf(fp, ":%.5f", ((clockmode ? cp->lengthc : cp->length))*0.01);
				n += 7;
			}
		}
		/* a comma follows a finished subtree or leaf unless the next
		   ring member has descen set or the walk is back at the root */
		if (!cp->descen && !cp->isop->descen && cp != rp) {
			putc(',', fp); /* not last subtree */
			n++;
		}
	} while (cp != rp);
	fprintf(fp, ")");
	/* internal label */
	/* here cp == rp, so this is the label stored at the root node */
	if (cp->label != NULL)
		fprintf(fp, "%s", cp->label);
	fprintf(fp, ";\n");
}
ip->lengthc : ip->length); + fprintf(outfp, "%9.5f", blen*0.01); + if (blen < 5.0*MINARC || blen > 0.95*MAXARC) closeflag = TRUE; + if (clockmode) + fprintf(outfp, "%9.3f", (ip->length)/(ip->lengthc)); + else + fprintf(outfp, "%9.5f", 0.01*sqrt(ip->kinp->varlen)); + fputc('\n', outfp); + } else { + if (n == Numspc - 3) { + fputc('\n', outfp); + } else if (n == Numspc - 2) { + if (clockmode) { + if (!Convergc) + fprintf(outfp, " No convergence after %d iterations!\n", Numitc); + else + fprintf(outfp, " %d iterations until convergence\n", Numitc); + } else { + if (!Converg) + fprintf(outfp, " No convergence after %d iterations!\n", Numit); + else + fprintf(outfp, " %d iterations until convergence\n", Numit); + } + } else if (n == Numspc - 1) { + fprintf(outfp, " log L: %.2f\n", (clockmode ? Ctree->lklhdc : Ctree->lklhd)); + } else { + fputc('\n', outfp); + } + } + } + if(closeflag) + fprintf(outfp, "\nWARNING --- at least one branch length is close to an internal boundary!\n"); +} + + +/******************************************************************************/ +/* Neighbor-joining tree */ +/******************************************************************************/ + + +/* compute NJ tree and write to file */ +void njtree(FILE *fp) +{ + /* reserve memory for tree if necessary */ + if (mlmode != 3) { /* no tree */ + if (Ctree != NULL) + free_tree(Ctree, Numspc); + Ctree = new_tree(Maxspc, Numptrn, Seqpat); + Numbrnch = 2*Maxspc-3; + Numibrnch = Maxspc-3; + Numspc = Maxspc; + mlmode = 3; + } + + /* construct NJ tree from distance matrix */ + njdistantree(Ctree); + + fputphylogeny(fp); +} + + +/* construct NJ tree from distance matrix */ +void njdistantree(Tree *tr) +{ + int i, j, otui=0, otuj=0, otuk, nsp2, cinode, step, restsp, k; + double dij, bix, bjx, bkx, sij, smax, dnsp2; + dvector r; + dmatrix distan; + Node **psotu, *cp, *ip, *jp, *kp; + + distan = new_dmatrix(Maxspc,Maxspc); + for (i = 0; i < Maxspc; i++) + for (j = 0; j < Maxspc; j++) + distan[i][j] 
= Distanmat[i][j]; + + nsp2 = Maxspc - 2; + dnsp2 = 1.0 / nsp2; + + r = new_dvector(Maxspc); + + psotu = (Node **) malloc((unsigned)Maxspc * sizeof(Node *)); + if (psotu == NULL) maerror("psotu in njdistantree"); + + /* external branches are start OTUs */ + for (i = 0; i < Maxspc; i++) + psotu[i] = tr->ebrnchp[i]->kinp; + + restsp = Maxspc; + cinode = 0; /* counter for internal nodes */ + + for (step = 0; restsp > 3; step++) { /* NJ clustering steps */ + + for (i = 0; i < Maxspc; i++) { + if (psotu[i] != NULL) { + for (j = 0, r[i] = 0.0; j < Maxspc; j++) + if (psotu[j] != NULL) + r[i] += distan[i][j]; + } + } + + smax = -1.0; + for (i = 0; i < Maxspc-1; i++) { + if (psotu[i] != NULL) { + + for (j = i+1; j < Maxspc; j++) { + if (psotu[j] != NULL) + { + sij = ( r[i] + r[j] ) * dnsp2 - distan[i][j]; + + if (sij > smax) { + smax = sij; + otui = i; + otuj = j; + } + } + } + } + } + + /* new pair: otui and otuj */ + + dij = distan[otui][otuj]; + bix = (dij + r[otui]/nsp2 - r[otuj]/nsp2) * 0.5; + bjx = dij - bix; + + cp = tr->ibrnchp[cinode]; + + ip = psotu[otui]; + jp = psotu[otuj]; + cp->isop = ip; + ip->isop = jp; + jp->isop = cp; + ip->length = bix; + jp->length = bjx; + ip->kinp->length = ip->length; + jp->kinp->length = jp->length; + + cp = cp->kinp; + + for (k = 0; k < Maxspc; k++) + { + if (psotu[k] != NULL && k != otui && k != otuj) + { + dij = (distan[otui][k] + distan[otuj][k] - distan[otui][otuj]) * 0.5; + distan[otui][k] = dij; + distan[k][otui] = dij; + } + } + distan[otui][otui] = 0.0; + + psotu[otui] = cp; + psotu[otuj] = NULL; + + cinode++; + + restsp--; + nsp2--; + dnsp2 = 1.0 / nsp2; + } + + otui = otuj = otuk = -1; + for (i = 0; i < Maxspc; i++) + { + if (psotu[i] != NULL) { + if (otui == -1) otui = i; + else if (otuj == -1) otuj = i; + else otuk = i; + } + } + bix = (distan[otui][otuj] + distan[otui][otuk] - distan[otuj][otuk]) * 0.5; + bjx = distan[otui][otuj] - bix; + bkx = distan[otui][otuk] - bix; + ip = psotu[otui]; + jp = psotu[otuj]; + kp = 
psotu[otuk]; + ip->isop = jp; + jp->isop = kp; + kp->isop = ip; + ip->length = bix; + jp->length = bjx; + kp->length = bkx; + ip->kinp->length = ip->length; + jp->kinp->length = jp->length; + kp->kinp->length = kp->length; + + tr->rootp = kp; + + free_dvector(r); + free_dmatrix(distan); + free((Node *) psotu); +} + +/******************************************************************************/ +/* find best assignment of rate categories */ +/******************************************************************************/ + +/* find best assignment of rate categories */ +void findbestratecombination() +{ + int k, u; + double bestvalue, fv2; + dvector catprob; + dmatrix cdl; + + cdl = Ctree->condlkl; + catprob = new_dvector(numcats+1); + fv2 = (1.0-fracinv)/(double) numcats; + + for (k = 0; k < Numptrn; k++) { + /* zero rate */ + if (constpat[k] == TRUE) + catprob[0] = fracinv*Freqtpm[(int) Seqpat[0][k]]; + else + catprob[0] = 0.0; + /* non-zero-rates */ + for (u = 1; u < numcats+1; u++) + catprob[u] = fv2*cdl[u-1][k]; + /* find best */ + bestvalue = catprob[0]; + bestrate[k] = 0; + for (u = 1; u < numcats+1; u++) + if (catprob[u] >= bestvalue) { + bestvalue = catprob[u]; + bestrate[k] = u; + } + } + free_dvector(catprob); + bestratefound = 1; +} + +/* print best assignment of rate categories */ +void printbestratecombination(FILE *fp) +{ + int s, k; + + for (s = 0; s < Maxsite; s++) { + k = Alias[s]; + fprintf(fp, "%2d", bestrate[k]); + if ((s+1) % 30 == 0) + fprintf(fp, "\n"); + else if ((s+1) % 10 == 0) + fprintf(fp, " "); + } + if (s % 70 != 0) + fprintf(fp, "\n"); +} + + +/******************************************************************************/ +/* computation of clocklike branch lengths */ +/******************************************************************************/ + +/* checks wether e is a valid edge specification */ +int checkedge(int e) +{ + /* there are Numspc external branches: + 0 - Numspc-1 + there are Numibrnch internal branches: + Numspc 
- Numspc+Numibrnch-1 + */ + + if (e < 0) return FALSE; + if (e < Numspc+Numibrnch) return TRUE; + else return FALSE; +} + +/* print topology of subtree */ +void fputsubstree(FILE *fp, Node *ip) +{ + Node *cp; + + if (ip->isop == NULL) { /* terminal nodes */ + numtc += fputid(fp, ip->number); + } else { + cp = ip; + fprintf(fp, "("); + numtc += 1; + do { + cp = cp->isop->kinp; + if (cp->isop == NULL) { /* external node */ + numtc += fputid(fp, cp->number); + fprintf(fp, ":%.5f", (cp->lengthc)*0.01); + numtc += 7; + cp = cp->kinp; + } else { /* internal node */ + if (cp->height > 0.0) { + fprintf(fp, "("); + numtc += 1; + } else if (cp->height < 0.0) { + fprintf(fp, ")"); + numtc += 1; + if (numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + /* internal label */ + if (cp->kinp->label != NULL) { + fprintf(fp, "%s", cp->kinp->label); + numtc += strlen(cp->kinp->label); + } + if (numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + fprintf(fp, ":%.5f", (cp->lengthc)*0.01); + numtc += 6; + if (numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + } + } + if (cp->height <= 0.0 && cp->isop->height <= 0.0 && + cp->isop != ip) { + putc(',', fp); /* not last subtree */ + numtc += 1; + if (numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + } + } while (cp->isop != ip); + fprintf(fp, ")"); + numtc += 1; + } + if (numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + +} + +/* print rooted tree file */ +void fputrooted(FILE *fp, int e) +{ + Node *rootbr; + + /* to be called only after clocklike branch + lengths have been computed */ + + /* pointer to root branch */ + if (e < Numspc) rootbr = Ctree->ebrnchp[e]; + else rootbr = Ctree->ibrnchp[e - Numspc]; + + fprintf(fp, "("); + numtc = 2; + fputsubstree(fp, rootbr); + /* internal label */ + if (rootbr->label != NULL) { + fprintf(fp, "%s", rootbr->label); + numtc += strlen(rootbr->label); + } + if (numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + fprintf(fp, ":%.5f,", (hroot - rootbr->height)*0.01); + numtc += 7; + if 
(numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + fputsubstree(fp, rootbr->kinp); + /* internal label */ + if (rootbr->kinp->label != NULL) { + fprintf(fp, "%s", rootbr->kinp->label); + numtc += strlen(rootbr->kinp->label); + } + if (numtc > 60) { + fprintf(fp, "\n"); + numtc = 1; + } + fprintf(fp, ":%.5f);\n", (hroot - rootbr->kinp->height)*0.01); +} + +/* finds heights in subtree */ +void findheights(Node *ip) +{ + Node *cp, *rp; + + if (ip->isop != NULL) { /* forget terminal nodes */ + + cp = ip; + + /* initialise node */ + cp->height = 1.0; /* up */ + rp = cp; + while (rp->isop != cp) { + rp = rp->isop; + rp->height = -1.0; /* down */ + } + + do { + cp = cp->isop->kinp; + if (cp->isop == NULL) { /* external node */ + cp = cp->kinp; + } else { /* internal node */ + if (cp->height == 0.0) { /* node not yet visited */ + cp->height = 1.0; /* up */ + rp = cp; + while (rp->isop != cp) { + rp = rp->isop; + rp->height = -1.0; /* down */ + } + } else if (cp->kinp->height == 1.0) { + /* cp->kinp is next height pointer */ + heights[Numhts] = cp->kinp; + Numhts++; + } + } + } while (cp->isop != ip); + /* ip is last height pointer */ + heights[Numhts] = ip; + Numhts++; + } +} + + +/* initialise clocklike branch lengths (with root on edge e) */ +void initclock(int e) +{ + int n, h, count; + Node *cp, *rp; + double sum, minh, aveh, len; + + /* be sure to have a Ctree in memory and + to have pairwise distances computed */ + + clockmode = 1; /* clocklike branch lengths */ + + /* least square estimate for branch length */ + changedistan(Distanmat, Distanvec, Numspc); + lslength(Ctree, Distanvec, Numspc, Numibrnch, Brnlength); + + /* pointer to root branch */ + if (e < Numspc) rootbr = Ctree->ebrnchp[e]; + else rootbr = Ctree->ibrnchp[e - Numspc]; + + /* clear all heights */ + for (n = 0; n < Numspc; n++) { + Ctree->ebrnchp[n]->height = 0.0; + Ctree->ebrnchp[n]->kinp->height = 0.0; + Ctree->ebrnchp[n]->varheight = 0.0; + Ctree->ebrnchp[n]->kinp->varheight = 0.0; + if (n < 
Numibrnch) { + Ctree->ibrnchp[n]->height = 0.0; + Ctree->ibrnchp[n]->kinp->height = 0.0; + Ctree->ibrnchp[n]->varheight = 0.0; + Ctree->ibrnchp[n]->kinp->varheight = 0.0; + } + } + + /* collect pointers to height nodes */ + Numhts = 0; + findheights(rootbr); /* one side */ + findheights(rootbr->kinp); /* other side */ + + /* assign preliminary approximate heights and + corresponding branch lengths */ + for (h = 0; h < Numhts; h++) { + + cp = rp = heights[h]; + sum = 0; + count = 0; + minh = 0.0; + while (rp->isop != cp) { + count++; + rp = rp->isop; + sum += rp->lengthc + rp->kinp->height; + if (rp->kinp->height > minh) minh = rp->kinp->height; + } + aveh = sum / (double) count; + if (aveh < minh + MINARC) aveh = minh + MINARC; + cp->height = aveh; + rp = cp; + while (rp->isop != cp) { + rp = rp->isop; + len = aveh - rp->kinp->height; + rp->kinp->lengthc = len; + rp->lengthc = len; + } + + } + if (rootbr->height > rootbr->kinp->height) minh = rootbr->height; + else minh = rootbr->kinp->height; + aveh = 0.5*(rootbr->lengthc + rootbr->height + rootbr->kinp->height); + if (aveh < minh + MINARC) aveh = minh + MINARC; + hroot = aveh; + maxhroot = RMHROOT*hroot; /* maximal possible hroot */ + len = (hroot - rootbr->height) + (hroot - rootbr->kinp->height); + rootbr->lengthc = len; + rootbr->kinp->lengthc = len; +} + +/* approximate likelihood under the constaining assumption of + clocklike branch lengths (with root on edge e) */ +double clock_alklhd(int e) +{ + initclock(e); + Ctree->lklhdc = treelkl(Ctree); + + return Ctree->lklhdc; +} + +/* log-likelihood given height ht at node pointed to by chep */ +double heightlkl(double ht) +{ + Node *rp; + double len; + + /* adjust branch lengths */ + chep->height = ht; + /* descendent branches */ + rp = chep; + while (rp->isop != chep) { + rp = rp->isop; + len = chep->height - rp->kinp->height; + rp->kinp->lengthc = len; + rp->lengthc = len; + } + /* upward branch */ + if (chep == rootbr || chep->kinp == rootbr) { + len = (hroot 
- chep->height) + (hroot - chep->kinp->height); + chep->lengthc = len; + chep->kinp->lengthc = len; + } else { + rp = chep->kinp; + while (rp->isop->height <= 0.0) + rp = rp->isop; + chep->lengthc = rp->isop->height - chep->height; + chep->kinp->lengthc = rp->isop->height - chep->height; + } + + /* compute likelihood */ + Ctree->lklhdc = treelkl(Ctree); + + return -(Ctree->lklhdc); /* we use a minimizing procedure */ +} + +/* optimize current height */ +void optheight(void) +{ + double he, fx, f2x, minh, maxh, len; + Node *rp; + + /* current height */ + he = chep->height; + + /* minimum */ + minh = 0.0; + rp = chep; + while (rp->isop != chep) { + rp = rp->isop; + if (rp->kinp->height > minh) + minh = rp->kinp->height; + } + minh += MINARC; + + /* maximum */ + if (chep == rootbr || chep->kinp == rootbr) { + maxh = hroot; + } else { + rp = chep->kinp; + while (rp->isop->height <= 0.0) + rp = rp->isop; + maxh = rp->isop->height; + } + maxh -= MINARC; + + /* check borders for height */ + if (he < minh) he = minh; + if (he > maxh) he = maxh; + + /* optimization */ + if (!(he == minh && he == maxh)) + he = onedimenmin(minh, he, maxh, heightlkl, HEPSILON, &fx, &f2x); + + /* variance of height */ + f2x = fabs(f2x); + if (1.0/(maxhroot*maxhroot) < f2x) + chep->varheight = 1.0/f2x; + else + chep->varheight = maxhroot*maxhroot; + + /* adjust branch lengths */ + chep->height = he; + /* descendent branches */ + rp = chep; + while (rp->isop != chep) { + rp = rp->isop; + len = chep->height - rp->kinp->height; + rp->kinp->lengthc = len; + rp->lengthc = len; + } + /* upward branch */ + if (chep == rootbr || chep->kinp == rootbr) { + len = (hroot - chep->height) + (hroot - chep->kinp->height); + chep->lengthc = len; + chep->kinp->lengthc = len; + } else { + rp = chep->kinp; + while (rp->isop->height <= 0.0) + rp = rp->isop; + chep->lengthc = rp->isop->height - chep->height; + chep->kinp->lengthc = rp->isop->height - chep->height; + } +} + +/* log-likelihood given height ht at root 
*/ +double rheightlkl(double ht) +{ + double len; + + /* adjust branch lengths */ + hroot = ht; + len = (hroot - rootbr->height) + (hroot - rootbr->kinp->height); + rootbr->lengthc = len; + rootbr->kinp->lengthc = len; + + /* compute likelihood */ + Ctree->lklhdc = treelkl(Ctree); + + return -(Ctree->lklhdc); /* we use a minimizing procedure */ +} + +/* optimize height of root */ +void optrheight(void) +{ + double he, fx, f2x, minh, len; + + /* current height */ + he = hroot; + + /* minimum */ + if (rootbr->height > rootbr->kinp->height) + minh = rootbr->height; + else + minh = rootbr->kinp->height; + minh += MINARC; + + /* check borders for height */ + if (he < minh) he = minh; + if (he > maxhroot) he = maxhroot; + + /* optimization */ + he = onedimenmin(minh, he, maxhroot, rheightlkl, HEPSILON, &fx, &f2x); + + /* variance of height of root */ + f2x = fabs(f2x); + if (1.0/(maxhroot*maxhroot) < f2x) + varhroot = 1.0/f2x; + else + varhroot = maxhroot*maxhroot; + + /* adjust branch lengths */ + hroot = he; + len = (hroot - rootbr->height) + (hroot - rootbr->kinp->height); + rootbr->lengthc = len; + rootbr->kinp->lengthc = len; +} + +/* exact likelihood under the constaining assumption of + clocklike branch lengths (with root on edge e) */ +double clock_lklhd(int e) +{ + int h, nconv; + double old; + + Numitc = 0; + Convergc = FALSE; + + initclock(e); + + do { + + Numitc++; + nconv = 0; + + /* optimize height of root */ + old = hroot; + optrheight(); + if (fabs(old - hroot) < HEPSILON) nconv++; + + /* optimize height of nodes */ + for (h = Numhts-1; h >= 0; h--) { + + /* pointer chep to current height node */ + chep = heights[h]; + + /* store old value */ + old = chep->height; + + /* find better height */ + optheight(); + + /* converged ? 
*/ + if (fabs(old - chep->height) < HEPSILON) nconv++; + } + + if (nconv == Numhts+1) Convergc = TRUE; + + } while (Numitc < MAXIT && !Convergc); + + /* compute final likelihood */ + Ctree->lklhdc = treelkl(Ctree); + + return Ctree->lklhdc; +} + +/* find out the edge containing the root */ +int findrootedge() +{ + int e, ebest; + double logbest, logtest; + + /* compute the likelihood for all edges and take the edge with + best likelihood (using approximate ML) */ + + ebest = 0; + logbest = clock_alklhd(0); + numbestroot = 1; + for (e = 1; e < Numspc+Numibrnch; e++) { + logtest = clock_alklhd(e); + if (logtest > logbest) { + ebest = e; + logbest = logtest; + numbestroot = 1; + } else if (logtest == logbest) { + numbestroot++; + } + } + + return ebest; +} + +/* show heights and corresponding standard errors */ +void resultheights(FILE *fp) +{ + int h, num; + Node *cp; + + fprintf(fp, " height S.E. of node common to branches\n"); + for (h = 0; h < Numhts; h++) { + fprintf(fp, "%.5f %.5f ", (heights[h]->height)*0.01, + sqrt(heights[h]->varheight)*0.01); + cp = heights[h]; + do { + num = (cp->number) + 1; + if (cp->kinp->isop != NULL) num += Numspc; /* internal branch */ + fprintf(fp, "%d ", num); + cp = cp->isop; + } while (cp != heights[h]); + fprintf(fp, "\n"); + + } + fprintf(fp, "%.5f %.5f of root at branch %d\n", + hroot*0.01, sqrt(varhroot)*0.01, locroot+1); +} + diff --git a/forester/archive/RIO/others/puzzle_mod/src/ml3.c b/forester/archive/RIO/others/puzzle_mod/src/ml3.c new file mode 100644 index 0000000..a68a054 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/ml3.c @@ -0,0 +1,350 @@ +/* + * ml3.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. 
See http://www.opensource.org for details. + */ + + +#define EXTERN extern + + +/* prototypes */ +#include +#include +#include +#include "util.h" +#include "ml.h" +#include "gamma.h" + + + +/******************************************************************************/ +/* discrete Gamma-distribution and related stuff */ +/******************************************************************************/ + +/* compare general base frequencies with frequencies of taxon i with chi square */ +double homogentest(int taxon) +{ + return chi2test(Freqtpm, Basecomp[taxon], gettpmradix(), &chi2fail); +} + + +/* discrete Gamma according to Yang 1994 (JME 39:306-314) */ +void YangDiscreteGamma (double shape, int c, dvector x) +{ + double twoc, mu; + int i; + + twoc = 2.0*c; + mu = 0.0; + for (i = 0; i < c; i++) + { + /* corresponding rates */ + x[i] = icdfGamma ( (2.0*i+1.0)/twoc, shape); + mu += x[i]; + } + mu = mu/c; + + /* rescale for avarage rate of 1.0 */ + for (i = 0; i < c; i++) + { + x[i] /= mu; + } +} + +/* compute rates of each category when rates are Gamma-distributed */ +void updaterates() +{ + int i; + double alpha; + + if (numcats == 1) + { + Rates[0] = 1.0; + return; + } + if (Geta == 0.0) + { + for (i = 0; i < numcats; i++) + Rates[i] = 1.0; + return; + } + alpha = (1.0 - Geta)/Geta; + + YangDiscreteGamma (alpha, numcats, Rates); + + /* if invariable sites are present */ + for (i = 0; i < numcats; i++) + Rates[i] = Rates[i]/(1.0-fracinv); + + /* check for very small rates */ + for (i = 0; i < numcats; i++) + if (Rates[i] < 0.000001) Rates[i] = 0.000001; +} + + + +/******************************************************************************/ +/* parameter estimation */ +/******************************************************************************/ + +/* compute sample mean and standard deviation of sample mean */ +void computestat(double *data, int n, double *mean, double *err) +{ + int i; + double sum; + + sum = 0; + for (i = 0; i < n; i++) sum += data[i]; + 
(*mean) = sum/(double) n; + + sum = 0; + for (i = 0; i < n; i++) sum += (data[i] - (*mean))*(data[i] - (*mean)); + if (n != 1) + (*err) = sqrt(sum)/sqrt((double)(n-1)*n); /* unbiased estimator */ + else + (*err) = 0.0; /* if n == 1 */ +} + +/* compute ML value of quartet (a,b,c,d) */ +double quartetml(int a, int b, int c, int d) +{ + double d1, d2, d3; + + /* compute ML for all topologies */ + if (approxp_optn) { /* approximate parameter mode */ + d1 = quartet_alklhd(a,b,c,d); /* (a,b)-(c,d) */ + d2 = quartet_alklhd(a,c,b,d); /* (a,c)-(b,d) */ + d3 = quartet_alklhd(a,d,b,c); /* (a,d)-(b,c) */ + } else { + d1 = quartet_lklhd(a,b,c,d); /* (a,b)-(c,d) */ + d2 = quartet_lklhd(a,c,b,d); /* (a,c)-(b,d) */ + d3 = quartet_lklhd(a,d,b,c); /* (a,d)-(b,c) */ + } + + /* looking for max(d1, d2, d3) */ + if (d1 < d2) { /* d2 > d1 */ + if (d2 < d3) { /* d3 > d2 > d1 */ + /* d3 maximum */ + return d3; + } else { /* d2 >= d3 > d1 */ + /* d2 maximum */ + return d2; + } + } else { /* d1 >= d2 */ + if (d1 < d3) { /* d3 > d1 >= d2 */ + /* d3 maximum */ + return d3; + } else { /* d1 >= d2 && d1 >= d3 */ + /* d1 maximum */ + return d1; + } + } +} + +/* optimization function TSparam - quartets */ +double opttsq(double x) +{ + if (x < MINTS) TSparam = MINTS; + else if (x > MAXTS) TSparam = MAXTS; + else TSparam = x; + tranprobmat(); + distupdate(qca, qcb, qcc, qcd); + return (-quartetml(qca, qcb, qcc, qcd)); +} + +/* optimization function YRparam - quartets */ +double optyrq(double x) +{ + if (x < MINYR) YRparam = MINYR; + else if (x > MAXYR) YRparam = MAXYR; + else YRparam = x; + tranprobmat(); + distupdate(qca, qcb, qcc, qcd); + return (-quartetml(qca, qcb, qcc, qcd)); +} + +/* estimate substitution process parameters - random quartets */ +void optimseqevolparamsq() +{ + double tsmeanold, yrmeanold; + dvector tslist, yrlist; + int fin; + ivector taxon; + uli minqts, maxqts, n; + + + taxon = new_ivector(4); + + /* number of quartets to be investigated */ + minqts = (uli) floor(0.25 * 
MINPERTAXUM * Maxspc) + 1; + maxqts = (uli) floor(0.25 * MAXPERTAXUM * Maxspc) + 1; + if (Maxspc == 4) { + minqts = (uli) 1; + maxqts = (uli) 1; + } + + tslist = new_dvector(maxqts); + yrlist = new_dvector(maxqts); + + /* initialize averages */ + tsmean = TSparam; + yrmean = YRparam; + + fin = FALSE; + + /* investigate maxqts random quartets */ + for (n = 0; n < maxqts; n++) { + + /* choose random quartet */ + chooser(Maxspc, 4, taxon); + + /* + * optimize parameters on this quartet + */ + + qca = taxon[0]; + qcb = taxon[1]; + qcc = taxon[2]; + qcd = taxon[3]; + + /* initialize start values with average value */ + if ((SH_optn || nuc_optn) && optim_optn && (data_optn == 0)) TSparam = tsmean; + if ((nuc_optn && TN_optn) && optim_optn && (data_optn == 0)) YRparam = yrmean; + + /* estimation */ + twodimenmin(PEPS1, + (SH_optn || nuc_optn) && optim_optn && (data_optn == 0), + MINTS, &TSparam, MAXTS, opttsq, &tserr, + (nuc_optn && TN_optn) && optim_optn && (data_optn == 0), + MINYR, &YRparam, MAXYR, optyrq, &yrerr); + + + tsmeanold = tsmean; + yrmeanold = yrmean; + tslist[n] = TSparam; + yrlist[n] = YRparam; + computestat(tslist, n+1 , &tsmean, &tserr); + computestat(yrlist, n+1 , &yrmean, &yrerr); + + /* check whether the means are converging */ + if (n > minqts-2) { + if ((fabs(tsmean-tsmeanold) < TSDIFF) && + (fabs(yrmean-yrmeanold) < YRDIFF)) + fin = TRUE; + } + + /* investigate at least minqts quartets */ + if (n > minqts-2 && (fin || n > maxqts-2)) break; + } + + /* round estimated numbers to 2 digits after the decimal point */ + if (tserr != 0.0) tsmean = floor(100.0*tsmean+0.5)/100.0; + if (yrerr != 0.0) yrmean = floor(100.0*yrmean+0.5)/100.0; + + /* update ML engine */ + TSparam = tsmean; + YRparam = yrmean; + tranprobmat(); + + free_ivector(taxon); +} + +/* optimization function TSparam - tree */ +double opttst(double x) +{ + double result; + + if (x < MINTS) TSparam = MINTS; + else if (x > MAXTS) TSparam = MAXTS; + else TSparam = x; + tranprobmat(); + 
computedistan(); + if (approxp_optn) result = usertree_alklhd(); + else result = usertree_lklhd(); + + return (-result); +} + +/* optimization function YRparam - tree */ +double optyrt(double x) +{ + double result; + + if (x < MINYR) YRparam = MINYR; + else if (x > MAXYR) YRparam = MAXYR; + else YRparam = x; + tranprobmat(); + computedistan(); + if (approxp_optn) result = usertree_alklhd(); + else result = usertree_lklhd(); + + return (-result); +} + + +/* optimize substitution process parameters - tree */ +void optimseqevolparamst() +{ + twodimenmin(PEPS1, + (SH_optn || nuc_optn) && optim_optn && (data_optn == 0), + MINTS, &TSparam, MAXTS, opttst, &tserr, + (nuc_optn && TN_optn) && optim_optn && (data_optn == 0), + MINYR, &YRparam, MAXYR, optyrt, &yrerr); +} + + +/* optimization function fracinv */ +double optfi(double x) +{ + double result; + + if (x < MINFI) fracinv = MINFI; + else if (x > MAXFI) fracinv = MAXFI; + else fracinv = x; + + computedistan(); + if (approxp_optn) result = usertree_alklhd(); + else result = usertree_lklhd(); + + return (-result); +} + + +/* optimization function Geta */ +double optge(double x) +{ + double result; + + if (x < MINGE) Geta = MINGE; + else if (x > MAXGE) Geta = MAXGE; + else Geta = x; + + updaterates(); + + computedistan(); + if (approxp_optn) result = usertree_alklhd(); + else result = usertree_lklhd(); + + return (-result); +} + + +/* optimize rate heterogeneity parameters */ +void optimrateparams() +{ + twodimenmin(PEPS2, + fracinv_optim, + MINFI, &fracinv, fracconst, optfi, &fierr, + grate_optim, + MINGE, &Geta, MAXGE, optge, &geerr); + +} diff --git a/forester/archive/RIO/others/puzzle_mod/src/model1.c b/forester/archive/RIO/others/puzzle_mod/src/model1.c new file mode 100644 index 0000000..54fb889 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/model1.c @@ -0,0 +1,326 @@ +/* + * model1.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. 
Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +/* definitions */ +#define EXTERN extern + +/* prototypes */ +#include +#include "util.h" +#include "ml.h" + +/* number of states of the selected model */ +int gettpmradix() +{ + if (data_optn == 0) { /* nucleotides */ + if (nuc_optn) return 4; + if (SH_optn) return 16; + } else if (data_optn == 1) { /* amino acids */ + return 20; + } else { /* two-state model */ + return 2; + } + return 1; +} + +/* relative transition frequencies */ +void rtfdata(dmatrix q, double *f) +{ + double alp, alpy, alpr; + int i, j; + + if (data_optn == 0) + { /* nucleotides */ + + if (nuc_optn) + { /* 4x4 nucleotides */ + alp = 2.0*TSparam; + alpr = (alp * 2.0) / (YRparam + 1.0); + alpy = YRparam * alpr; + + q[0][1] = 1; q[0][2] = alpr; q[0][3] = 1; + q[1][2] = 1; q[1][3] = alpy; + q[2][3] = 1; + + f[0] = 0.25; f[1] = 0.25; f[2] = 0.25; f[3] = 0.25; + } + + if (SH_optn) + { /* 16x16 nucleotides */ + + alp = 2.0*TSparam; + + q[0][1] = 1; q[0][2] = alp; q[0][3] = 1; q[0][4] = 1; + q[0][5] = 0; q[0][6] = 0; q[0][7] = 0; q[0][8] = alp; + q[0][9] = 0; q[0][10] = 0; q[0][11] = 0; q[0][12] = 1; + q[0][13] = 0; q[0][14] = 0; q[0][15] = 0; + + q[1][2] = 1; q[1][3] = alp; q[1][4] = 0; q[1][5] = 1; + q[1][6] = 0; q[1][7] = 0; q[1][8] = 0; q[1][9] = alp; + q[1][10] = 0; q[1][11] = 0; q[1][12] = 0; q[1][13] = 1; + q[1][14] = 0; q[1][15] = 0; + + q[2][3] = 1; q[2][4] = 0; q[2][5] = 0; q[2][6] = 1; + q[2][7] = 0; q[2][8] = 0; q[2][9] = 0; q[2][10] = alp; + q[2][11] = 0; q[2][12] = 0; q[2][13] = 0; q[2][14] = 1; + q[2][15] = 0; + + q[3][4] = 0; q[3][5] = 0; q[3][6] = 0; q[3][7] = 1; + q[3][8] = 0; q[3][9] = 0; q[3][10] = 0; q[3][11] = alp; + q[3][12] = 0; q[3][13] = 0; q[3][14] = 0; q[3][15] = 1; + + q[4][5] = 1; q[4][6] = alp; q[4][7] = 1; q[4][8] = 1; + 
q[4][9] = 0; q[4][10] = 0; q[4][11] = 0; q[4][12] = alp; + q[4][13] = 0; q[4][14] = 0; q[4][15] = 0; + + q[5][6] = 1; q[5][7] = alp; q[5][8] = 0; q[5][9] = 1; + q[5][10] = 0; q[5][11] = 0; q[5][12] = 0; q[5][13] = alp; + q[5][14] = 0; q[5][15] = 0; + + q[6][7] = 1; q[6][8] = 0; q[6][9] = 0; q[6][10] = 1; + q[6][11] = 0; q[6][12] = 0; q[6][13] = 0; q[6][14] = alp; + q[6][15] = 0; + + q[7][8] = 0; q[7][9] = 0; q[7][10] = 0; q[7][11] = 1; + q[7][12] = 0; q[7][13] = 0; q[7][14] = 0; q[7][15] = alp; + + q[8][9] = 1; q[8][10] = alp; q[8][11] = 1; q[8][12] = 1; + q[8][13] = 0; q[8][14] = 0; q[8][15] = 0; + + q[9][10] = 1; q[9][11] = alp; q[9][12] = 0; q[9][13] = 1; + q[9][14] = 0; q[9][15] = 0; + + q[10][11] = 1; q[10][12] = 0; q[10][13] = 0; q[10][14] = 1; + q[10][15] = 0; + + q[11][12] = 0; q[11][13] = 0; q[11][14] = 0; q[11][15] = 1; + + q[12][13] = 1; q[12][14] = alp; q[12][15] = 1; + + q[13][14] = 1; q[13][15] = alp; + + q[14][15] = 1; + + + for (i = 0; i < 16; i++) f[i] = 0.0625; + } + } + else if (data_optn == 1) + { /* amino acids */ + if (Dayhf_optn) /* Dayhoff model */ + { + dyhfdata(q, f); + } + else if (Jtt_optn) /* JTT model */ + { + jttdata(q, f); + } + else if (blosum62_optn) /* BLOSUM 62 model */ + { + blosum62data(q, f); + } + else if (mtrev_optn) /* mtREV model */ + { + mtrevdata(q, f); + } + else if (cprev_optn) /* cpREV model */ + { + cprev45data(q, f); + } + else if (vtmv_optn) /* VT model */ + { + vtmvdata(q, f); + } + else /* if (wag_optn) */ /* WAG model */ + { + wagdata(q, f); + } + + } + else /* two-state model */ + { + q[0][1] = 1.0; + + f[0] = 0.5; f[1] = 0.5; + } + + /* fill matrix from upper triangle */ + for (i = 0; i < tpmradix; i++) + { + q[i][i] = 0.0; + for (j = i+1; j < tpmradix; j++) + { + q[j][i] = q[i][j]; + } + } +} + +/* transform letter codes to state numbers */ +int code2int(cvector c) +{ if (data_optn == 0) { /* nucleotides */ + if (nuc_optn) { /* 4x4 */ + switch (c[0]) { + case 'A': return 0; + case 'C': return 1; + case 'G': 
return 2; + case 'T': return 3; + case 'U': return 3; + default : return 4; + } + } + if (SH_optn) { /* 16x16 */ + if (c[0] == 'A') { + switch (c[1]) { + case 'A': return 0; /* AA */ + case 'C': return 1; /* AC */ + case 'G': return 2; /* AG */ + case 'T': return 3; /* AT */ + case 'U': return 3; /* AT */ + default: return 16; + } + } + if (c[0] == 'C') { + switch (c[1]) { + case 'A': return 4; /* CA */ + case 'C': return 5; /* CC */ + case 'G': return 6; /* CG */ + case 'T': return 7; /* CT */ + case 'U': return 7; /* CT */ + default: return 16; + } + } + if (c[0] == 'G') { + switch (c[1]) { + case 'A': return 8; /* GA */ + case 'C': return 9; /* GC */ + case 'G': return 10; /* GG */ + case 'T': return 11; /* GT */ + case 'U': return 11; /* GT */ + default: return 16; + } + } + if (c[0] == 'T' || c[0] == 'U') { + switch (c[1]) { + case 'A': return 12; /* TA */ + case 'C': return 13; /* TC */ + case 'G': return 14; /* TG */ + case 'T': return 15; /* TT */ + case 'U': return 15; /* TT */ + default: return 16; + } + } + return 16; + } + } else if (data_optn == 1) { /* amino acids */ + switch (c[0]) { + case 'A': return 0; + case 'C': return 4; + case 'D': return 3; + case 'E': return 6; + case 'F': return 13; + case 'G': return 7; + case 'H': return 8; + case 'I': return 9; + case 'K': return 11; + case 'L': return 10; + case 'M': return 12; + case 'N': return 2; + case 'P': return 14; + case 'Q': return 5; + case 'R': return 1; + case 'S': return 15; + case 'T': return 16; + case 'V': return 19; + case 'W': return 17; + case 'Y': return 18; + default : return 20; + } + } else { /* two-state model */ + switch (c[0]) { + case '0': return 0; + case '1': return 1; + default : return 2; + } + } + return 0; +} + +/* return letter code belonging to state number */ +char *int2code(int s) +{ + if (data_optn == 0) { /* nucleotides */ + if (nuc_optn) { /* 4x4 */ + switch (s) { + case 0: return "A"; + case 1: return "C"; + case 2: return "G"; + case 3: return "T"; + default : 
return "?"; + } + } + if (SH_optn) { /* 16x16 */ + switch (s) { + case 0: return "AA"; + case 1: return "AC"; + case 2: return "AG"; + case 3: return "AT"; + case 4: return "CA"; + case 5: return "CC"; + case 6: return "CG"; + case 7: return "CT"; + case 8: return "GA"; + case 9: return "GC"; + case 10: return "GG"; + case 11: return "GT"; + case 12: return "TA"; + case 13: return "TC"; + case 14: return "TG"; + case 15: return "TT"; + default : return "??"; + } + } + } else if (data_optn == 1) { /* amino acids */ + switch (s) { + case 0: return "A"; + case 1: return "R"; + case 2: return "N"; + case 3: return "D"; + case 4: return "C"; + case 5: return "Q"; + case 6: return "E"; + case 7: return "G"; + case 8: return "H"; + case 9: return "I"; + case 10: return "L"; + case 11: return "K"; + case 12: return "M"; + case 13: return "F"; + case 14: return "P"; + case 15: return "S"; + case 16: return "T"; + case 17: return "W"; + case 18: return "Y"; + case 19: return "V"; + default : return "?"; + } + } else { /* two-state model */ + switch (s) { + case 0: return "0"; + case 1: return "1"; + default : return "?"; + } + } + return "?"; +} diff --git a/forester/archive/RIO/others/puzzle_mod/src/model2.c b/forester/archive/RIO/others/puzzle_mod/src/model2.c new file mode 100644 index 0000000..9e2197f --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/model2.c @@ -0,0 +1,1125 @@ +/* + * model2.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +/* definitions */ +#define EXTERN extern + +/* prototypes */ +#include +#include "util.h" +#include "ml.h" + + +void jttdata(dmatrix q, double *f) +{ + /* + * JTT model for amino acid evolution + * D.T. 
Jones, W.R. Taylor, and J.M. Thornton + * "The rapid generation of mutation data matrices from protein sequences" + * CABIOS vol. 8 no. 3 1992 pp. 275-282 + */ + + q[0][1]=3.1628651460584e+00; q[0][2]=3.2804935927860e+00; + q[0][3]=4.8477237048666e+00; q[0][4]=3.4612244897959e+00; + q[0][5]=3.3130910900946e+00; q[0][6]=6.3199473337722e+00; + q[0][7]=1.0440154440154e+01; q[0][8]=1.3061224489796e+00; + q[0][9]=2.1726844583987e+00; q[0][10]=1.8443597219107e+00; + q[0][11]=2.2137668626773e+00; q[0][12]=2.7210884353741e+00; + q[0][13]=8.3265306122449e-01; q[0][14]=1.1537414965986e+01; + q[0][15]=2.2838213546288e+01; q[0][16]=2.7007955724663e+01; + q[0][17]=5.1311953352770e-01; q[0][18]=8.3673469387755e-01; + q[0][19]=1.7474335188621e+01; + + q[1][2]=2.6598918637222e+00; q[1][3]=9.1014867485456e-01; + q[1][4]=6.1624649859944e+00; q[1][5]=1.8036482885837e+01; + q[1][6]=1.8924731182796e+00; q[1][7]=8.1810886516769e+00; + q[1][8]=1.9119717452198e+01; q[1][9]=1.4410687351864e+00; + q[1][10]=2.2211961707760e+00; q[1][11]=3.9239234676922e+01; + q[1][12]=2.5060690943044e+00; q[1][13]=3.9439775910364e-01; + q[1][14]=4.1953094963476e+00; q[1][15]=5.9016766126741e+00; + q[1][16]=3.8437069743152e+00; q[1][17]=7.6766706682673e+00; + q[1][18]=1.4173669467787e+00; q[1][19]=1.0308123249300e+00; + + q[2][3]=3.2226935854843e+01; q[2][4]=1.8710963455150e+00; + q[2][5]=4.5351268130622e+00; q[2][6]=3.3951344979102e+00; + q[2][7]=4.5987249708180e+00; q[2][8]=2.3693774375271e+01; + q[2][9]=2.9235880398671e+00; q[2][10]=8.0960899565551e-01; + q[2][11]=1.5024269384537e+01; q[2][12]=1.9003322259136e+00; + q[2][13]=4.3853820598007e-01; q[2][14]=7.1083317047749e-01; + q[2][15]=2.9456208772690e+01; q[2][16]=1.3735908553410e+01; + q[2][17]=1.6706217370669e-01; q[2][18]=4.1661129568106e+00; + q[2][19]=9.7452934662237e-01; + + q[3][4]=6.2857142857143e-01; q[3][5]=3.0662020905923e+00; + q[3][6]=4.5450549450549e+01; q[3][7]=7.5402435402435e+00; + q[3][8]=6.0544672718586e+00; q[3][9]=6.8808114961961e-01; 
+ q[3][10]=3.6130902064968e-01; q[3][11]=1.6718197057180e+00; + q[3][12]=1.0879120879121e+00; q[3][13]=1.9340659340659e-01; + q[3][14]=7.3949579831933e-01; q[3][15]=3.4196528109572e+00; + q[3][16]=2.4749487800335e+00; q[3][17]=3.4536891679749e-01; + q[3][18]=2.6895604395604e+00; q[3][19]=1.8608058608059e+00; + + q[4][5]=5.5191637630662e-01; q[4][6]=3.2442396313364e-01; + q[4][7]=3.3297297297297e+00; q[4][8]=4.3726708074534e+00; + q[4][9]=9.1868131868132e-01; q[4][10]=9.9466248037677e-01; + q[4][11]=2.9830508474576e-01; q[4][12]=2.4095238095238e+00; + q[4][13]=4.1485714285714e+00; q[4][14]=7.3949579831933e-01; + q[4][15]=1.2862939958592e+01; q[4][16]=2.8125907990315e+00; + q[4][17]=6.8244897959184e+00; q[4][18]=1.2885714285714e+01; + q[4][19]=3.7714285714286e+00; + + q[5][6]=2.0316061593796e+01; q[5][7]=1.3922214897825e+00; + q[5][8]=3.3861536130889e+01; q[5][9]=4.7172339855267e-01; + q[5][10]=4.2320327755868e+00; q[5][11]=1.7835941652395e+01; + q[5][12]=2.6573751451800e+00; q[5][13]=2.7595818815331e-01; + q[5][14]=9.4992143198743e+00; q[5][15]=3.2350653941322e+00; + q[5][16]=3.0973838067678e+00; q[5][17]=1.0512692882031e+00; + q[5][18]=1.5331010452962e+00; q[5][19]=1.0778164924506e+00; + + q[6][7]=6.6857641051189e+00; q[6][8]=1.4458024443999e+00; + q[6][9]=6.7068415455512e-01; q[6][10]=5.7932850559579e-01; + q[6][11]=1.0365070686558e+01; q[6][12]=1.0138248847926e+00; + q[6][13]=2.6359447004608e-01; q[6][14]=1.1291226167887e+00; + q[6][15]=1.8337006611901e+00; q[6][16]=1.9520424900414e+00; + q[6][17]=6.9519420671494e-01; q[6][18]=3.8018433179723e-01; + q[6][19]=2.7772657450077e+00; + + q[7][8]=1.2113479939567e+00; q[7][9]=3.2670032670033e-01; + q[7][10]=4.1817641817642e-01; q[7][11]=1.6354950592239e+00; + q[7][12]=7.6447876447876e-01; q[7][13]=3.0579150579151e-01; + q[7][14]=1.2391551215081e+00; q[7][15]=1.1138492529797e+01; + q[7][16]=1.8888816176952e+00; q[7][17]=3.3491450634308e+00; + q[7][18]=3.1853281853282e-01; q[7][19]=2.8416988416988e+00; + + 
q[8][9]=1.0931677018634e+00; q[8][10]=3.2194389461470e+00; + q[8][11]=3.1498052426571e+00; q[8][12]=1.9130434782609e+00; + q[8][13]=2.7329192546584e+00; q[8][14]=6.7304834977469e+00; + q[8][15]=4.3726708074534e+00; q[8][16]=2.8162964522581e+00; + q[8][17]=7.8083407275954e-01; q[8][18]=3.5118012422360e+01; + q[8][19]=7.2877846790890e-01; + + q[9][10]=1.4069798333535e+01; q[9][11]=1.2292791953809e+00; + q[9][12]=2.8366300366300e+01; q[9][13]=4.7384615384615e+00; + q[9][14]=5.8780435251023e-01; q[9][15]=2.4105749323141e+00; + q[9][16]=1.5243062022723e+01; q[9][17]=8.2888540031397e-01; + q[9][18]=1.8434065934066e+00; q[9][19]=5.7699633699634e+01; + + q[10][11]=8.8039805231089e-01; q[10][12]=2.2425954997384e+01; + q[10][13]=1.5099529042386e+01; q[10][14]=6.2626896912611e+00; + q[10][15]=3.4917298022888e+00; q[10][16]=1.6109411169944e+00; + q[10][17]=3.2366001345593e+00; q[10][18]=1.4505494505495e+00; + q[10][19]=1.0557823129252e+01; + + q[11][12]=3.6577885391445e+00; q[11][13]=1.4915254237288e-01; + q[11][14]=1.2868062479229e+00; q[11][15]=2.8162964522581e+00; + q[11][16]=5.7494151926786e+00; q[11][17]=5.4790729851263e-01; + q[11][18]=5.3268765133172e-01; q[11][19]=7.4899112187248e-01; + + q[12][13]=2.5666666666667e+00; q[12][14]=9.4491129785247e-01; + q[12][15]=1.6397515527950e+00; q[12][16]=1.2180790960452e+01; + q[12][17]=1.1972789115646e+00; q[12][18]=1.1130952380952e+00; + q[12][19]=1.7746031746032e+01; + + q[13][14]=8.8739495798319e-01; q[13][15]=5.6298136645963e+00; + q[13][16]=8.3099273607748e-01; q[13][17]=3.3224489795918e+00; + q[13][18]=3.3392857142857e+01; q[13][19]=3.6000000000000e+00; + + q[14][15]=1.6261762676085e+01; q[14][16]=6.8852490148602e+00; + q[14][17]=4.2256902761104e-01; q[14][18]=6.7787114845938e-01; + q[14][19]=1.2549019607843e+00; + + q[15][16]=2.7891216619293e+01; q[15][17]=1.8740017746229e+00; + q[15][18]=3.7349896480331e+00; q[15][19]=2.4182194616977e+00; + + q[16][17]=4.8702870978900e-01; q[16][18]=1.1985472154964e+00; + 
q[16][19]=6.7925746569814e+00; + + q[17][18]=4.6020408163265e+00; q[17][19]=1.4693877551020e+00; + + q[18][19]=1.0000000000000e+00; + + + f[0] = 0.077; f[1] = 0.051; f[2] = 0.043; f[3] = 0.052; + f[4] = 0.02; f[5] = 0.041; f[6] = 0.062; f[7] = 0.074; + f[8] = 0.023; f[9] = 0.052; f[10] = 0.091; f[11] = 0.059; + f[12] = 0.024; f[13] = 0.04; f[14] = 0.051; f[15] = 0.069; + f[16] = 0.059; f[17] = 0.014; f[18] = 0.032; f[19] = 0.066; +} + +void dyhfdata(dmatrix q, double *f) +{ + /* + * Dayhoff model for amino acid evolution + * Dayhoff, M.O., Schwartz, R.M., Orcutt, B.C. (1978) + * "A model of evolutionary change in proteins." + * Dayhoff, M.O. (ed.) Atlas of Protein Sequence Structur., Vol5, Suppl. 3, + * National Biomedical Research Foundation, Washington DC, pp. 345-352. + */ + + q[0][1]=9.6472567159749e-01; q[0][2]=3.5927991886410e+00; + q[0][3]=4.3200552414656e+00; q[0][4]=1.3184584178499e+00; + q[0][5]=3.2267534963169e+00; q[0][6]=7.0141987829615e+00; + q[0][7]=8.5773867857875e+00; q[0][8]=8.1434196396611e-01; + q[0][9]=2.3518447453539e+00; q[0][10]=1.4735711728911e+00; + q[0][11]=9.3940162271805e-01; q[0][12]=2.5490196078431e+00; + q[0][13]=6.5922920892495e-01; q[0][14]=8.9189834148670e+00; + q[0][15]=1.4540712836859e+01; q[0][16]=1.3411904595370e+01; + q[0][17]=3.8517964118027e-02; q[0][18]=8.7897227856660e-01; + q[0][19]=7.4036511156187e+00; + + q[1][2]=1.1890243902439e+00; q[1][3]=5.9525626545377e-02; + q[1][4]=8.4778922655537e-01; q[1][5]=8.8348561504191e+00; + q[1][6]=5.5954088952654e-02; q[1][7]=3.1434881434075e-01; + q[1][8]=8.4753987678285e+00; q[1][9]=2.2684090115941e+00; + q[1][10]=5.5954088952654e-01; q[1][11]=1.6681312769010e+01; + q[1][12]=3.1707317073171e+00; q[1][13]=4.8959827833572e-01; + q[1][14]=3.6754156468900e+00; q[1][15]=5.4755072760812e+00; + q[1][16]=9.6472567159749e-01; q[1][17]=7.5538020086083e+00; + q[1][18]=2.7977044476327e-01; q[1][19]=8.6083213773314e-01; + + q[2][3]=3.2459324155194e+01; q[2][4]=7.3852625416383e-02; + 
q[2][5]=3.7732198142415e+00; q[2][6]=5.3911764705882e+00; + q[2][7]=5.0264375413087e+00; q[2][8]=1.9061418685121e+01; + q[2][9]=2.7901430842607e+00; q[2][10]=1.2482698961938e+00; + q[2][11]=1.1542279411765e+01; q[2][12]=1.9117647058824e-01; + q[2][13]=5.0183823529412e-01; q[2][14]=1.5181660899654e+00; + q[2][15]=1.7697478991597e+01; q[2][16]=8.3557302231237e+00; + q[2][17]=8.6029411764706e-01; q[2][18]=3.4411764705882e+00; + q[2][19]=5.7352941176471e-01; + + q[3][4]=2.5534152404601e-02; q[3][5]=4.8811013767209e+00; + q[3][6]=4.0561952440551e+01; q[3][7]=4.4423506911730e+00; + q[3][8]=3.0865788117500e+00; q[3][9]=8.5749078239692e-01; + q[3][10]=2.5926985518518e-02; q[3][11]=2.5930851063830e+00; + q[3][12]=1.1667143483333e-01; q[3][13]=1.2963492759259e-02; + q[3][14]=4.7853935065891e-01; q[3][15]=3.4167709637046e+00; + q[3][16]=2.3984722282163e+00; q[3][17]=3.2408731898147e-02; + q[3][18]=8.1351689612015e-02; q[3][19]=6.3829787234043e-01; + + q[4][5]=2.1864264103535e-02; q[4][6]=1.4770525083277e-02; + q[4][7]=3.9055458751427e-01; q[4][8]=1.0223340673168e+00; + q[4][9]=1.5970515970516e+00; q[4][10]=3.9098448749850e-02; + q[4][11]=8.0776309049169e-03; q[4][12]=1.4155086538140e-01; + q[4][13]=8.6898395721925e-02; q[4][14]=6.8155604487784e-01; + q[4][15]=5.8097784568373e+00; q[4][16]=5.9929928084086e-01; + q[4][17]=3.4759358288770e-01; q[4][18]=3.4759358288770e+00; + q[4][19]=1.7647058823529e+00; + + q[5][6]=2.5476780185759e+01; q[5][7]=1.0174974779977e+00; + q[5][8]=2.1573939173192e+01; q[5][9]=6.5266504894988e-01; + q[5][10]=2.6634492806410e+00; q[5][11]=5.5466331269350e+00; + q[5][12]=4.0247678018576e+00; q[5][13]=1.8038017885416e-02; + q[5][14]=5.5044618466582e+00; q[5][15]=2.0267580716497e+00; + q[5][16]=1.9256432155439e+00; q[5][17]=9.6202762055552e-02; + q[5][18]=1.0061919504644e-01; q[5][19]=1.2538699690402e+00; + + q[6][7]=2.8869795109055e+00; q[6][8]=1.5519031141869e+00; + q[6][9]=2.1701112877583e+00; q[6][10]=4.0484429065744e-01; + 
q[6][11]=2.9823529411765e+00; q[6][12]=1.0705882352941e+00; + q[6][13]=1.9801735189768e-02; q[6][14]=1.7993079584775e+00; + q[6][15]=2.8184873949580e+00; q[6][16]=1.2261663286004e+00; + q[6][17]=7.3114099162219e-02; q[6][18]=7.6470588235294e-01; + q[6][19]=1.3058823529412e+00; + + q[7][8]=3.7906768788150e-01; q[7][9]=2.3128004846840e-02; + q[7][10]=2.5776602775942e-01; q[7][11]=9.6662260409782e-01; + q[7][12]=6.0145406477198e-01; q[7][13]=5.4775280898876e-01; + q[7][14]=1.2382877804129e+00; q[7][15]=8.2853366065527e+00; + q[7][16]=1.1110604644803e+00; q[7][17]=1.2888301387971e-01; + q[7][18]=1.7114723586662e-02; q[7][19]=1.9233311302049e+00; + + q[8][9]=2.7354343963341e-01; q[8][10]=1.5876246692449e+00; + q[8][11]=9.6993944636678e-01; q[8][12]=1.2544085640577e-01; + q[8][13]=1.6868512110727e+00; q[8][14]=3.3075513942601e+00; + q[8][15]=1.2530894710826e+00; q[8][16]=8.1434196396611e-01; + q[8][17]=1.0121107266436e+00; q[8][18]=4.4982698961938e+00; + q[8][19]=1.5570934256055e+00; + + q[9][10]=9.2275320303002e+00; q[9][11]=1.6663354531002e+00; + q[9][12]=1.1780604133545e+01; q[9][13]=6.9753577106518e+00; + q[9][14]=4.2551201720752e-01; q[9][15]=8.8575970928912e-01; + q[9][16]=6.8951811852420e+00; q[9][17]=9.8802836705702e-02; + q[9][18]=1.3434022257552e+00; q[9][19]=3.1526232114467e+01; + + q[10][11]=6.5787197231834e-01; q[10][12]=1.8622837370242e+01; + q[10][13]=5.6340830449827e+00; q[10][14]=1.1377976796255e+00; + q[10][15]=6.1690558576372e-01; q[10][16]=1.2098794893211e+00; + q[10][17]=1.7543252595156e+00; q[10][18]=1.0346020761246e+00; + q[10][19]=6.2906574394464e+00; + + q[11][12]=8.6029411764706e+00; q[11][13]=6.6640454965565e-03; + q[11][14]=1.2089100346021e+00; q[11][15]=3.4411764705882e+00; + q[11][16]=4.9442190669371e+00; q[11][17]=3.4272233982290e-02; + q[11][18]=4.7794117647059e-01; q[11][19]=3.7500000000000e-01; + + q[12][13]=3.2500000000000e+00; q[12][14]=5.9976931949250e-01; + q[12][15]=2.1848739495798e+00; q[12][16]=3.6916835699797e+00; + 
q[12][17]=1.6247577591604e-01; q[12][18]=1.1508700794053e-01; + q[12][19]=9.0588235294118e+00; + + q[13][14]=3.9359861591695e-01; q[13][15]=1.6386554621849e+00; + q[13][16]=4.9442190669371e-01; q[13][17]=2.8676470588235e+00; + q[13][18]=2.4852941176471e+01; q[13][19]=4.4117647058824e-01; + + q[14][15]=8.6431043005437e+00; q[14][16]=2.8308077795013e+00; + q[14][17]=3.5840244687362e-02; q[14][18]=4.3804743506776e-02; + q[14][19]=1.7301038062284e+00; + + q[15][16]=1.9663865546218e+01; q[15][17]=2.7857142857143e+00; + q[15][18]=1.2016806722689e+00; q[15][19]=1.0840336134454e+00; + + q[16][17]=4.2019597219666e-02; q[16][18]=1.5162271805274e+00; + q[16][19]=5.6592292089249e+00; + + q[17][18]=2.2941176470588e+00; q[17][19]=1.2654363316538e-01; + + q[18][19]=1.0000000000000e+00; + + + f[0] = 0.087; f[1] = 0.041; f[2] = 0.040; f[3] = 0.047; + f[4] = 0.033; f[5] = 0.038; f[6] = 0.05; f[7] = 0.089; + f[8] = 0.034; f[9] = 0.037; f[10] = 0.085; f[11] = 0.08; + f[12] = 0.015; f[13] = 0.04; f[14] = 0.051; f[15] = 0.07; + f[16] = 0.058; f[17] = 0.01; f[18] = 0.03; f[19] = 0.065; +} + +void mtrevdata(dmatrix q, double *f) +{ + /* + * mtREV24 model of amino acid evolution + * (complete sequence data of mtDNA from 24 vertebrate species) + * Adachi, J. and Hasegawa, M. 
(1996) + */ + + q[0][1]=1.2199217606346e+01; q[0][2]=1.4182139942122e+01; + q[0][3]=9.2985091873208e+00; q[0][4]=3.1542792981957e+01; + q[0][5]=1.0025852846688e+00; q[0][6]=5.1418866803338e+00; + q[0][7]=6.3531246495131e+01; q[0][8]=7.3137132861715e+00; + q[0][9]=5.0782382656186e+01; q[0][10]=1.3399741808481e+01; + q[0][11]=4.4021672780560e+00; q[0][12]=7.4673480520104e+01; + q[0][13]=3.3513021631978e+00; q[0][14]=2.8582502221773e+01; + q[0][15]=2.0413623195312e+02; q[0][16]=2.5301305153906e+02; + q[0][17]=1.0000000000000e+00; q[0][18]=3.4084158197615e+00; + q[0][19]=1.0266468401249e+02; + + q[1][2]=6.9661274444534e+00; q[1][3]=1.0000000000000e+00; + q[1][4]=5.4384584796568e+01; q[1][5]=1.1631134513343e+02; + q[1][6]=1.0000000000000e+00; q[1][7]=1.2122831341194e+01; + q[1][8]=8.6961067087353e+01; q[1][9]=1.0000000000000e+00; + q[1][10]=8.1976829394538e+00; q[1][11]=7.4423215395318e+01; + q[1][12]=1.0000000000000e+00; q[1][13]=2.4659158338099e+00; + q[1][14]=1.2439947713615e+01; q[1][15]=3.1791814866372e+00; + q[1][16]=1.0935327216119e+00; q[1][17]=1.1550775790126e+01; + q[1][18]=1.0000000000000e+00; q[1][19]=4.0211417480338e+00; + + q[2][3]=4.1809325468160e+02; q[2][4]=3.1020979842967e+01; + q[2][5]=9.1349622725361e+01; q[2][6]=3.3185663516310e+01; + q[2][7]=2.8052324651124e+01; q[2][8]=2.6112087577885e+02; + q[2][9]=1.4261453863336e+01; q[2][10]=7.9775653461977e+00; + q[2][11]=3.2036829276162e+02; q[2][12]=3.4424354918739e+01; + q[2][13]=7.9996445145608e+00; q[2][14]=3.8586541461044e+01; + q[2][15]=2.6020426225852e+02; q[2][16]=1.2550758780474e+02; + q[2][17]=5.6207759736659e+00; q[2][18]=1.0071406219571e+02; + q[2][19]=1.0000000000000e+00; + + q[3][4]=1.0000000000000e+00; q[3][5]=2.9097352675564e+01; + q[3][6]=3.0713149855302e+02; q[3][7]=2.9877072751897e+01; + q[3][8]=5.9995408885817e+01; q[3][9]=2.2827096245105e+00; + q[3][10]=1.0000000000000e+00; q[3][11]=1.2183938185384e+00; + q[3][12]=1.0000000000000e+00; q[3][13]=2.6221929413096e+00; + 
q[3][14]=7.0708004204733e+00; q[3][15]=3.6327934317139e+01; + q[3][16]=1.4743408713748e+01; q[3][17]=1.0453246057102e+01; + q[3][18]=1.1165627147496e+01; q[3][19]=1.0000000000000e+00; + + q[4][5]=3.9599394038972e+01; q[4][6]=1.0000000000000e+00; + q[4][7]=1.6163581056674e+01; q[4][8]=7.4467985406234e+01; + q[4][9]=3.3018175376623e+01; q[4][10]=1.3500725995091e+01; + q[4][11]=1.0000000000000e+00; q[4][12]=3.2504095376923e+00; + q[4][13]=3.7264767083096e+01; q[4][14]=1.6454136037822e+01; + q[4][15]=1.4581783243113e+02; q[4][16]=9.4720031458442e+01; + q[4][17]=1.7684087896962e+01; q[4][18]=1.3409157685926e+02; + q[4][19]=1.0000000000000e+00; + + q[5][6]=1.6503249008836e+02; q[5][7]=3.5530760735494e+00; + q[5][8]=3.0652523140859e+02; q[5][9]=4.3905393139325e+00; + q[5][10]=2.0895470525345e+01; q[5][11]=2.4504076430724e+02; + q[5][12]=2.4931300477797e+01; q[5][13]=1.0059428264289e+01; + q[5][14]=7.2256314165467e+01; q[5][15]=2.8480937892158e+01; + q[5][16]=4.9962974409828e+01; q[5][17]=1.0000000000000e+00; + q[5][18]=2.0430790980529e+01; q[5][19]=9.9986289000676e+00; + + q[6][7]=1.4884496769963e+01; q[6][8]=2.5853576435567e+01; + q[6][9]=1.7418201388328e+00; q[6][10]=1.0000000000000e+00; + q[6][11]=1.6519126809071e+02; q[6][12]=1.0000000000000e+00; + q[6][13]=1.4067850525292e+00; q[6][14]=6.7547121641947e+00; + q[6][15]=2.8794794140840e+01; q[6][16]=7.8001372062558e+00; + q[6][17]=1.0000000000000e+00; q[6][18]=6.9067239183061e+00; + q[6][19]=1.1127702362585e+01; + + q[7][8]=1.0000000000000e+00; q[7][9]=3.1466649021550e+00; + q[7][10]=1.2699794194865e+00; q[7][11]=1.1962111069278e+01; + q[7][12]=1.0000000000000e+00; q[7][13]=1.0000000000000e+00; + q[7][14]=1.0000000000000e+00; q[7][15]=6.6277950574411e+01; + q[7][16]=5.8800079133028e+00; q[7][17]=5.7494182626674e+00; + q[7][18]=1.6887657206208e+00; q[7][19]=1.3320553471351e+00; + + q[8][9]=6.4536986087271e+00; q[8][10]=6.0472584534958e+00; + q[8][11]=6.7197196398961e+01; q[8][12]=6.2977633277779e+00; + 
q[8][13]=2.5347805183364e+01; q[8][14]=3.2089868698728e+01; + q[8][15]=4.0766987134407e+01; q[8][16]=2.3570850628539e+01; + q[8][17]=3.7286635325194e+00; q[8][18]=3.5270764890474e+02; + q[8][19]=1.0000000000000e+00; + + q[9][10]=1.7320653206333e+02; q[9][11]=1.0298655619743e+01; + q[9][12]=2.7262244199514e+02; q[9][13]=4.4561065036310e+01; + q[9][14]=1.0856482766156e+01; q[9][15]=2.5107659603898e+01; + q[9][16]=1.9391167162525e+02; q[9][17]=1.0000000000000e+00; + q[9][18]=1.3161329199391e+01; q[9][19]=6.4365086389428e+02; + + q[10][11]=7.8314019154706e+00; q[10][12]=2.8290920517725e+02; + q[10][13]=1.1371735519833e+02; q[10][14]=2.1105885757279e+01; + q[10][15]=3.8741359395934e+01; q[10][16]=6.6524559321657e+01; + q[10][17]=1.7071378554833e+01; q[10][18]=2.3234516108847e+01; + q[10][19]=4.8247261078055e+01; + + q[11][12]=4.8092094826036e+01; q[11][13]=3.3887559483420e+00; + q[11][14]=2.6368577564199e+01; q[11][15]=5.5679895711418e+01; + q[11][16]=7.1750284708933e+01; q[11][17]=1.2631893872825e+01; + q[11][18]=2.6932728996777e+01; q[11][19]=1.0000000000000e+00; + + q[12][13]=4.7798798034572e+01; q[12][14]=9.9165053447429e+00; + q[12][15]=5.8505442466161e+01; q[12][16]=2.7798190504760e+02; + q[12][17]=1.1427000119701e+01; q[12][18]=2.1029990530586e+01; + q[12][19]=2.0397078683768e+02; + + q[13][14]=9.1089574817139e+00; q[13][15]=3.3835737720574e+01; + q[13][16]=1.7815549567056e+01; q[13][17]=4.1272404968214e+00; + q[13][18]=2.4504156395152e+02; q[13][19]=3.3435675442163e+00; + + q[14][15]=8.9421193040709e+01; q[14][16]=6.7485067008375e+01; + q[14][17]=2.2161693733113e+00; q[14][18]=8.5338209390745e+00; + q[14][19]=4.3342126659660e+00; + + q[15][16]=3.1432036618746e+02; q[15][17]=2.0305343047059e+01; + q[15][18]=3.4167877957799e+01; q[15][19]=1.0000000000000e+00; + + q[16][17]=5.2559565123081e+00; q[16][18]=2.0382362288681e+01; + q[16][19]=1.0765527137500e+02; + + q[17][18]=1.3814733274637e+01; q[17][19]=2.8259139240676e+00; + + q[18][19]=1.0000000000000e+00; + + + /* 
amino acid frequencies */ + f[0]=0.072; f[1]=0.019; f[2]=0.039; f[3]=0.019; f[4]=0.006; + f[5]=0.025; f[6]=0.024; f[7]=0.056; f[8]=0.028; f[9]=0.088; + f[10]=0.168; f[11]=0.023; f[12]=0.054; f[13]=0.061; f[14]=0.054; + f[15]=0.072; f[16]=0.086; f[17]=0.029; f[18]=0.033; f[19]=0.043; +} + +void blosum62data(dmatrix q, double *f) +{ + /* + * BLOSUM62 model of amino acid evolution + * + * S. Henikoff and J. G. Henikoff. 1992. PNAS USA 89:10915-10919. + * + */ + + q[0][1]=7.3579038969751e-01; q[0][2]=4.8539105546575e-01; + q[0][3]=5.4316182089867e-01; q[0][4]=1.4599953104700e+00; + q[0][5]=1.1997057046020e+00; q[0][6]=1.1709490427999e+00; + q[0][7]=1.9558835749595e+00; q[0][8]=7.1624144499779e-01; + q[0][9]=6.0589900368677e-01; q[0][10]=8.0001653051838e-01; + q[0][11]=1.2952012667833e+00; q[0][12]=1.2537582666635e+00; + q[0][13]=4.9296467974759e-01; q[0][14]=1.1732759009239e+00; + q[0][15]=4.3250926870566e+00; q[0][16]=1.7291780194850e+00; + q[0][17]=4.6583936772479e-01; q[0][18]=7.1820669758623e-01; + q[0][19]=2.1877745220045e+00; + + q[1][2]=1.2974467051337e+00; q[1][3]=5.0096440855513e-01; + q[1][4]=2.2782657420895e-01; q[1][5]=3.0208336100636e+00; + q[1][6]=1.3605741904203e+00; q[1][7]=4.1876330851753e-01; + q[1][8]=1.4561411663360e+00; q[1][9]=2.3203644514174e-01; + q[1][10]=6.2271166969249e-01; q[1][11]=5.4111151414889e+00; + q[1][12]=9.8369298745695e-01; q[1][13]=3.7164469320875e-01; + q[1][14]=4.4813366171831e-01; q[1][15]=1.1227831042096e+00; + q[1][16]=9.1466595456337e-01; q[1][17]=4.2638231012175e-01; + q[1][18]=7.2051744121611e-01; q[1][19]=4.3838834377202e-01; + + q[2][3]=3.1801000482161e+00; q[2][4]=3.9735894989702e-01; + q[2][5]=1.8392161469920e+00; q[2][6]=1.2404885086396e+00; + q[2][7]=1.3558723444845e+00; q[2][8]=2.4145014342081e+00; + q[2][9]=2.8301732627800e-01; q[2][10]=2.1188815961519e-01; + q[2][11]=1.5931370434574e+00; q[2][12]=6.4844127878707e-01; + q[2][13]=3.5486124922252e-01; q[2][14]=4.9488704370192e-01; + q[2][15]=2.9041016564560e+00; 
q[2][16]=1.8981736345332e+00; + q[2][17]=1.9148204624678e-01; q[2][18]=5.3822251903674e-01; + q[2][19]=3.1285879799342e-01; + + q[3][4]=2.4083661480204e-01; q[3][5]=1.1909457033960e+00; + q[3][6]=3.7616252083685e+00; q[3][7]=7.9847324896839e-01; + q[3][8]=7.7814266402188e-01; q[3][9]=4.1855573246161e-01; + q[3][10]=2.1813157759360e-01; q[3][11]=1.0324479249521e+00; + q[3][12]=2.2262189795786e-01; q[3][13]=2.8173069420651e-01; + q[3][14]=7.3062827299842e-01; q[3][15]=1.5827541420653e+00; + q[3][16]=9.3418750943056e-01; q[3][17]=1.4534504627853e-01; + q[3][18]=2.6142220896504e-01; q[3][19]=2.5812928941763e-01; + + q[4][5]=3.2980150463028e-01; q[4][6]=1.4074889181440e-01; + q[4][7]=4.1820319228376e-01; q[4][8]=3.5405810983129e-01; + q[4][9]=7.7489402279418e-01; q[4][10]=8.3184264014158e-01; + q[4][11]=2.8507880090648e-01; q[4][12]=7.6768882347954e-01; + q[4][13]=4.4133747118660e-01; q[4][14]=3.5600849876863e-01; + q[4][15]=1.1971884150942e+00; q[4][16]=1.1198313585160e+00; + q[4][17]=5.2766441887169e-01; q[4][18]=4.7023773369610e-01; + q[4][19]=1.1163524786062e+00; + + q[5][6]=5.5289191779282e+00; q[5][7]=6.0984630538281e-01; + q[5][8]=2.4353411311401e+00; q[5][9]=2.3620245120365e-01; + q[5][10]=5.8073709318144e-01; q[5][11]=3.9452776745146e+00; + q[5][12]=2.4948960771127e+00; q[5][13]=1.4435695975031e-01; + q[5][14]=8.5857057567418e-01; q[5][15]=1.9348709245965e+00; + q[5][16]=1.2774802945956e+00; q[5][17]=7.5865380864172e-01; + q[5][18]=9.5898974285014e-01; q[5][19]=5.3078579012486e-01; + + q[6][7]=4.2357999217628e-01; q[6][8]=1.6268910569817e+00; + q[6][9]=1.8684804693170e-01; q[6][10]=3.7262517508685e-01; + q[6][11]=2.8024271516787e+00; q[6][12]=5.5541539747043e-01; + q[6][13]=2.9140908416530e-01; q[6][14]=9.2656393484598e-01; + q[6][15]=1.7698932389373e+00; q[6][16]=1.0710972360073e+00; + q[6][17]=4.0763564893830e-01; q[6][18]=5.9671930034577e-01; + q[6][19]=5.2425384633796e-01; + + q[7][8]=5.3985912495418e-01; q[7][9]=1.8929629237636e-01; + 
q[7][10]=2.1772115923623e-01; q[7][11]=7.5204244030271e-01; + q[7][12]=4.5943617357855e-01; q[7][13]=3.6816646445253e-01; + q[7][14]=5.0408659952683e-01; q[7][15]=1.5093262532236e+00; + q[7][16]=6.4143601140497e-01; q[7][17]=5.0835892463812e-01; + q[7][18]=3.0805573703500e-01; q[7][19]=2.5334079019018e-01; + + q[8][9]=2.5271844788492e-01; q[8][10]=3.4807220979697e-01; + q[8][11]=1.0225070358890e+00; q[8][12]=9.8431152535870e-01; + q[8][13]=7.1453370392764e-01; q[8][14]=5.2700733915060e-01; + q[8][15]=1.1170297629105e+00; q[8][16]=5.8540709022472e-01; + q[8][17]=3.0124860078016e-01; q[8][18]=4.2189539693890e+00; + q[8][19]=2.0155597175031e-01; + + q[9][10]=3.8909637733035e+00; q[9][11]=4.0619358664202e-01; + q[9][12]=3.3647977631042e+00; q[9][13]=1.5173593259539e+00; + q[9][14]=3.8835540920564e-01; q[9][15]=3.5754441245967e-01; + q[9][16]=1.1790911972601e+00; q[9][17]=3.4198578754023e-01; + q[9][18]=6.7461709322842e-01; q[9][19]=8.3118394054582e+00; + + q[10][11]=4.4557027426059e-01; q[10][12]=6.0305593795716e+00; + q[10][13]=2.0648397032375e+00; q[10][14]=3.7455568747097e-01; + q[10][15]=3.5296918452729e-01; q[10][16]=9.1525985769421e-01; + q[10][17]=6.9147463459998e-01; q[10][18]=8.1124585632307e-01; + q[10][19]=2.2314056889131e+00; + + q[11][12]=1.0730611843319e+00; q[11][13]=2.6692475051102e-01; + q[11][14]=1.0473834507215e+00; q[11][15]=1.7521659178195e+00; + q[11][16]=1.3038752007987e+00; q[11][17]=3.3224304063396e-01; + q[11][18]=7.1799348690032e-01; q[11][19]=4.9813847530407e-01; + + q[12][13]=1.7738551688305e+00; q[12][14]=4.5412362510273e-01; + q[12][15]=9.1872341574605e-01; q[12][16]=1.4885480537218e+00; + q[12][17]=8.8810109815193e-01; q[12][18]=9.5168216224591e-01; + q[12][19]=2.5758507553153e+00; + + q[13][14]=2.3359790962888e-01; q[13][15]=5.4002764482413e-01; + q[13][16]=4.8820611879305e-01; q[13][17]=2.0743248934965e+00; + q[13][18]=6.7472604308008e+00; q[13][19]=8.3811961017754e-01; + + q[14][15]=1.1691295777157e+00; q[14][16]=1.0054516831488e+00; 
+ q[14][17]=2.5221483002727e-01; q[14][18]=3.6940531935451e-01; + q[14][19]=4.9690841067567e-01; + + q[15][16]=5.1515562922704e+00; q[15][17]=3.8792562209837e-01; + q[15][18]=7.9675152076106e-01; q[15][19]=5.6192545744165e-01; + + q[16][17]=5.1312812689059e-01; q[16][18]=8.0101024319939e-01; + q[16][19]=2.2530740511763e+00; + + q[17][18]=4.0544190065580e+00; q[17][19]=2.6650873142646e-01; + + q[18][19]=1.0000000000000e+00; + + + f[0]=0.074; f[1]=0.052; f[2]=0.045; f[3]=0.054; + f[4]=0.025; f[5]=0.034; f[6]=0.054; f[7]=0.074; + f[8]=0.026; f[9]=0.068; f[10]=0.099; f[11]=0.058; + f[12]=0.025; f[13]=0.047; f[14]=0.039; f[15]=0.057; + f[16]=0.051; f[17]=0.013; f[18]=0.032; f[19]=0.073; +} + + + +void vtmvdata(dmatrix q, double *f) +{ + /* + * variable time (VT) model for amino acid evolution + * Mueller, T. and Vingron, M. (1999) + * "Modeling Amino Acid Replacement" + * Journal of Comp. Biology + */ + +/* amino acid frequencies */ + +f[0]=0.078837 ; +f[1]=0.051238 ; +f[2]=0.042313 ; +f[3]=0.053066 ; +f[4]=0.015175 ; +f[5]=0.036713 ; +f[6]=0.061924 ; +f[7]=0.070852 ; +f[8]=0.023082 ; +f[9]=0.062056 ; +f[10]=0.096371 ; +f[11]=0.057324 ; +f[12]=0.023771 ; +f[13]=0.043296 ; +f[14]=0.043911 ; +f[15]=0.063403 ; +f[16]=0.055897 ; +f[17]=0.013272 ; +f[18]=0.034399 ; +f[19]=0.073101 ; + + +q[0][1] = 0.233108 ; +q[0][2] = 0.199097 ; +q[0][3] = 0.265145 ; +q[0][4] = 0.227333 ; +q[0][5] = 0.310084 ; +q[0][6] = 0.567957 ; +q[0][7] = 0.876213 ; +q[0][8] = 0.078692 ; +q[0][9] = 0.222972 ; +q[0][10] = 0.424630 ; +q[0][11] = 0.393245 ; +q[0][12] = 0.211550 ; +q[0][13] = 0.116646 ; +q[0][14] = 0.399143 ; +q[0][15] = 1.817198 ; +q[0][16] = 0.877877 ; +q[0][17] = 0.030309 ; +q[0][18] = 0.087061 ; +q[0][19] = 1.230985 ; + +q[1][2] = 0.210797 ; +q[1][3] = 0.105191 ; +q[1][4] = 0.031726 ; +q[1][5] = 0.493763 ; +q[1][6] = 0.255240 ; +q[1][7] = 0.156945 ; +q[1][8] = 0.213164 ; +q[1][9] = 0.081510 ; +q[1][10] = 0.192364 ; +q[1][11] = 1.755838 ; +q[1][12] = 0.087930 ; +q[1][13] = 0.042569 ; 
+q[1][14] = 0.128480 ; +q[1][15] = 0.292327 ; +q[1][16] = 0.204109 ; +q[1][17] = 0.046417 ; +q[1][18] = 0.097010 ; +q[1][19] = 0.113146 ; + +q[2][3] = 0.883422 ; +q[2][4] = 0.027495 ; +q[2][5] = 0.275700 ; +q[2][6] = 0.270417 ; +q[2][7] = 0.362028 ; +q[2][8] = 0.290006 ; +q[2][9] = 0.087225 ; +q[2][10] = 0.069245 ; +q[2][11] = 0.503060 ; +q[2][12] = 0.057420 ; +q[2][13] = 0.039769 ; +q[2][14] = 0.083956 ; +q[2][15] = 0.847049 ; +q[2][16] = 0.471268 ; +q[2][17] = 0.010459 ; +q[2][18] = 0.093268 ; +q[2][19] = 0.049824 ; + +q[3][4] = 0.010313 ; +q[3][5] = 0.205842 ; +q[3][6] = 1.599461 ; +q[3][7] = 0.311718 ; +q[3][8] = 0.134252 ; +q[3][9] = 0.011720 ; +q[3][10] = 0.060863 ; +q[3][11] = 0.261101 ; +q[3][12] = 0.012182 ; +q[3][13] = 0.016577 ; +q[3][14] = 0.160063 ; +q[3][15] = 0.461519 ; +q[3][16] = 0.178197 ; +q[3][17] = 0.011393 ; +q[3][18] = 0.051664 ; +q[3][19] = 0.048769 ; + +q[4][5] = 0.004315 ; +q[4][6] = 0.005321 ; +q[4][7] = 0.050876 ; +q[4][8] = 0.016695 ; +q[4][9] = 0.046398 ; +q[4][10] = 0.091709 ; +q[4][11] = 0.004067 ; +q[4][12] = 0.023690 ; +q[4][13] = 0.051127 ; +q[4][14] = 0.011137 ; +q[4][15] = 0.175270 ; +q[4][16] = 0.079511 ; +q[4][17] = 0.007732 ; +q[4][18] = 0.042823 ; +q[4][19] = 0.163831 ; + +q[5][6] = 0.960976 ; +q[5][7] = 0.128660 ; +q[5][8] = 0.315521 ; +q[5][9] = 0.054602 ; +q[5][10] = 0.243530 ; +q[5][11] = 0.738208 ; +q[5][12] = 0.120801 ; +q[5][13] = 0.026235 ; +q[5][14] = 0.156570 ; +q[5][15] = 0.358017 ; +q[5][16] = 0.248992 ; +q[5][17] = 0.021248 ; +q[5][18] = 0.062544 ; +q[5][19] = 0.112027 ; + +q[6][7] = 0.250447 ; +q[6][8] = 0.104458 ; +q[6][9] = 0.046589 ; +q[6][10] = 0.151924 ; +q[6][11] = 0.888630 ; +q[6][12] = 0.058643 ; +q[6][13] = 0.028168 ; +q[6][14] = 0.205134 ; +q[6][15] = 0.406035 ; +q[6][16] = 0.321028 ; +q[6][17] = 0.018844 ; +q[6][18] = 0.055200 ; +q[6][19] = 0.205868 ; + +q[7][8] = 0.058131 ; +q[7][9] = 0.051089 ; +q[7][10] = 0.087056 ; +q[7][11] = 0.193243 ; +q[7][12] = 0.046560 ; +q[7][13] = 0.050143 ; +q[7][14] = 
0.124492 ; +q[7][15] = 0.612843 ; +q[7][16] = 0.136266 ; +q[7][17] = 0.023990 ; +q[7][18] = 0.037568 ; +q[7][19] = 0.082579 ; + +q[8][9] = 0.020039 ; +q[8][10] = 0.103552 ; +q[8][11] = 0.153323 ; +q[8][12] = 0.021157 ; +q[8][13] = 0.079807 ; +q[8][14] = 0.078892 ; +q[8][15] = 0.167406 ; +q[8][16] = 0.101117 ; +q[8][17] = 0.020009 ; +q[8][18] = 0.286027 ; +q[8][19] = 0.068575 ; + +q[9][10] = 2.089890 ; +q[9][11] = 0.093181 ; +q[9][12] = 0.493845 ; +q[9][13] = 0.321020 ; +q[9][14] = 0.054797 ; +q[9][15] = 0.081567 ; +q[9][16] = 0.376588 ; +q[9][17] = 0.034954 ; +q[9][18] = 0.086237 ; +q[9][19] = 3.654430 ; + +q[10][11] = 0.201204 ; +q[10][12] = 1.105667 ; +q[10][13] = 0.946499 ; +q[10][14] = 0.169784 ; +q[10][15] = 0.214977 ; +q[10][16] = 0.243227 ; +q[10][17] = 0.083439 ; +q[10][18] = 0.189842 ; +q[10][19] = 1.337571 ; + +q[11][12] = 0.096474 ; +q[11][13] = 0.038261 ; +q[11][14] = 0.212302 ; +q[11][15] = 0.400072 ; +q[11][16] = 0.446646 ; +q[11][17] = 0.023321 ; +q[11][18] = 0.068689 ; +q[11][19] = 0.144587 ; + +q[12][13] = 0.173052 ; +q[12][14] = 0.010363 ; +q[12][15] = 0.090515 ; +q[12][16] = 0.184609 ; +q[12][17] = 0.022019 ; +q[12][18] = 0.073223 ; +q[12][19] = 0.307309 ; + +q[13][14] = 0.042564 ; +q[13][15] = 0.138119 ; +q[13][16] = 0.085870 ; +q[13][17] = 0.128050 ; +q[13][18] = 0.898663 ; +q[13][19] = 0.247329 ; + +q[14][15] = 0.430431 ; +q[14][16] = 0.207143 ; +q[14][17] = 0.014584 ; +q[14][18] = 0.032043 ; +q[14][19] = 0.129315 ; + +q[15][16] = 1.767766 ; +q[15][17] = 0.035933 ; +q[15][18] = 0.121979 ; +q[15][19] = 0.127700 ; + +q[16][17] = 0.020437 ; +q[16][18] = 0.094617 ; +q[16][19] = 0.740372 ; + +q[17][18] = 0.124746 ; +q[17][19] = 0.022134 ; + +q[18][19] = 0.125733 ; + +} + + +/* + * WAG matrix: Simon Whelan and Nick Goldman + * + */ + +void wagdata(dmatrix q, double *f) +{ + /* + * WAG model of amino acid evolution + * + * S. Whelan and N. Goldman. 2000. In prep. + * + * presented at the MASAMB-X workshop in Cambridge + * + * Whelan, S., and N. 
Goldman. 2000. + * The WAG amino acid rate matrix. + * Manuscript in prep. + */ + + /* Q matrix */ + q[0][1] = 0.610810; q[0][2] = 0.569079; + q[0][3] = 0.821500; q[0][4] = 1.141050; + q[0][5] = 1.011980; q[0][6] = 1.756410; + q[0][7] = 1.572160; q[0][8] = 0.354813; + q[0][9] = 0.219023; q[0][10] = 0.443935; + q[0][11] = 1.005440; q[0][12] = 0.989475; + q[0][13] = 0.233492; q[0][14] = 1.594890; + q[0][15] = 3.733380; q[0][16] = 2.349220; + q[0][17] = 0.125227; q[0][18] = 0.268987; + q[0][19] = 2.221870; + + q[1][2] = 0.711690; q[1][3] = 0.165074; + q[1][4] = 0.585809; q[1][5] = 3.360330; + q[1][6] = 0.488649; q[1][7] = 0.650469; + q[1][8] = 2.362040; q[1][9] = 0.206722; + q[1][10] = 0.551450; q[1][11] = 5.925170; + q[1][12] = 0.758446; q[1][13] = 0.116821; + q[1][14] = 0.753467; q[1][15] = 1.357640; + q[1][16] = 0.613776; q[1][17] = 1.294610; + q[1][18] = 0.423612; q[1][19] = 0.280336; + + q[2][3] = 6.013660; q[2][4] = 0.296524; + q[2][5] = 1.716740; q[2][6] = 1.056790; + q[2][7] = 1.253910; q[2][8] = 4.378930; + q[2][9] = 0.615636; q[2][10] = 0.147156; + q[2][11] = 3.334390; q[2][12] = 0.224747; + q[2][13] = 0.110793; q[2][14] = 0.217538; + q[2][15] = 4.394450; q[2][16] = 2.257930; + q[2][17] = 0.078463; q[2][18] = 1.208560; + q[2][19] = 0.221176; + + q[3][4] = 0.033379; q[3][5] = 0.691268; + q[3][6] = 6.833400; q[3][7] = 0.961142; + q[3][8] = 1.032910; q[3][9] = 0.043523; + q[3][10] = 0.093930; q[3][11] = 0.533362; + q[3][12] = 0.116813; q[3][13] = 0.052004; + q[3][14] = 0.472601; q[3][15] = 1.192810; + q[3][16] = 0.417372; q[3][17] = 0.146348; + q[3][18] = 0.363243; q[3][19] = 0.169417; + + q[4][5] = 0.109261; q[4][6] = 0.023920; + q[4][7] = 0.341086; q[4][8] = 0.275403; + q[4][9] = 0.189890; q[4][10] = 0.428414; + q[4][11] = 0.083649; q[4][12] = 0.437393; + q[4][13] = 0.441300; q[4][14] = 0.122303; + q[4][15] = 1.560590; q[4][16] = 0.570186; + q[4][17] = 0.795736; q[4][18] = 0.604634; + q[4][19] = 1.114570; + + q[5][6] = 6.048790; q[5][7] = 0.366510; + q[5][8] 
= 4.749460; q[5][9] = 0.131046; + q[5][10] = 0.964886; q[5][11] = 4.308310; + q[5][12] = 1.705070; q[5][13] = 0.110744; + q[5][14] = 1.036370; q[5][15] = 1.141210; + q[5][16] = 0.954144; q[5][17] = 0.243615; + q[5][18] = 0.252457; q[5][19] = 0.333890; + + q[6][7] = 0.630832; q[6][8] = 0.635025; + q[6][9] = 0.141320; q[6][10] = 0.172579; + q[6][11] = 2.867580; q[6][12] = 0.353912; + q[6][13] = 0.092310; q[6][14] = 0.755791; + q[6][15] = 0.782467; q[6][16] = 0.914814; + q[6][17] = 0.172682; q[6][18] = 0.217549; + q[6][19] = 0.655045; + + q[7][8] = 0.276379; q[7][9] = 0.034151; + q[7][10] = 0.068651; q[7][11] = 0.415992; + q[7][12] = 0.194220; q[7][13] = 0.055288; + q[7][14] = 0.273149; q[7][15] = 1.486700; + q[7][16] = 0.251477; q[7][17] = 0.374321; + q[7][18] = 0.114187; q[7][19] = 0.209108; + + q[8][9] = 0.152215; q[8][10] = 0.555096; + q[8][11] = 0.992083; q[8][12] = 0.450867; + q[8][13] = 0.756080; q[8][14] = 0.771387; + q[8][15] = 0.822459; q[8][16] = 0.525511; + q[8][17] = 0.289998; q[8][18] = 4.290350; + q[8][19] = 0.131869; + + q[9][10] = 3.517820; q[9][11] = 0.360574; + q[9][12] = 4.714220; q[9][13] = 1.177640; + q[9][14] = 0.111502; q[9][15] = 0.353443; + q[9][16] = 1.615050; q[9][17] = 0.234326; + q[9][18] = 0.468951; q[9][19] = 8.659740; + + q[10][11] = 0.287583; q[10][12] = 5.375250; + q[10][13] = 2.348200; q[10][14] = 0.462018; + q[10][15] = 0.382421; q[10][16] = 0.364222; + q[10][17] = 0.740259; q[10][18] = 0.443205; + q[10][19] = 1.997370; + + q[11][12] = 1.032220; q[11][13] = 0.098843; + q[11][14] = 0.619503; q[11][15] = 1.073780; + q[11][16] = 1.537920; q[11][17] = 0.152232; + q[11][18] = 0.147411; q[11][19] = 0.342012; + + q[12][13] = 1.320870; q[12][14] = 0.194864; + q[12][15] = 0.556353; q[12][16] = 1.681970; + q[12][17] = 0.570369; q[12][18] = 0.473810; + q[12][19] = 2.282020; + + q[13][14] = 0.179896; q[13][15] = 0.606814; + q[13][16] = 0.191467; q[13][17] = 1.699780; + q[13][18] = 7.154480; q[13][19] = 0.725096; + + q[14][15] = 1.786490; 
q[14][16] = 0.885349; + q[14][17] = 0.156619; q[14][18] = 0.239607; + q[14][19] = 0.351250; + + q[15][16] = 4.847130; q[15][17] = 0.578784; + q[15][18] = 0.872519; q[15][19] = 0.258861; + + q[16][17] = 0.126678; q[16][18] = 0.325490; + q[16][19] = 1.547670; + + q[17][18] = 2.763540; q[17][19] = 0.409817; + + q[18][19] = 0.347826; + + /* original frequencies */ + f[ 0] = 0.0866; + f[ 1] = 0.0440; + f[ 2] = 0.0391; + f[ 3] = 0.0570; + f[ 4] = 0.0193; + f[ 5] = 0.0367; + f[ 6] = 0.0581; + f[ 7] = 0.0833; + f[ 8] = 0.0244; + f[ 9] = 0.0485; + f[10] = 0.0862; + f[11] = 0.0620; + f[12] = 0.0195; + f[13] = 0.0384; + f[14] = 0.0458; + f[15] = 0.0695; + f[16] = 0.0610; + f[17] = 0.0144; + f[18] = 0.0353; + f[19] = 0.0709; +} + +void cprev45data(dmatrix q, double *f) +{ + /* cpREV45 model of amino acid evolution + * Adachi, J., P.J. Waddell, W. Martin, and M. Hasegawa. 2000. + * J. Mol. Evol. 50:348-358 + * (reconstructed from 45 chloroplast genomes) + */ + q[0][1] = 105; q[0][2] = 227; + q[0][3] = 175; q[0][4] = 669; + q[0][5] = 157; q[0][6] = 499; + q[0][7] = 665; q[0][8] = 66; + q[0][9] = 145; q[0][10] = 197; + q[0][11] = 236; q[0][12] = 185; + q[0][13] = 68; q[0][14] = 490; + q[0][15] = 2440; q[0][16] = 1340; + q[0][17] = 14; q[0][18] = 56; + q[0][19] = 968; + + q[1][2] = 357; q[1][3] = 43; + q[1][4] = 823; q[1][5] = 1745; + q[1][6] = 152; q[1][7] = 243; + q[1][8] = 715; q[1][9] = 136; + q[1][10] = 203; q[1][11] = 4482; + q[1][12] = 125; q[1][13] = 53; + q[1][14] = 87; q[1][15] = 385; + q[1][16] = 314; q[1][17] = 230; + q[1][18] = 323; q[1][19] = 92; + + q[2][3] = 4435; q[2][4] = 538; + q[2][5] = 768; q[2][6] = 1055; + q[2][7] = 653; q[2][8] = 1405; + q[2][9] = 168; q[2][10] = 113; + q[2][11] = 2430; q[2][12] = 61; + q[2][13] = 97; q[2][14] = 173; + q[2][15] = 2085; q[2][16] = 1393; + q[2][17] = 40; q[2][18] = 754; + q[2][19] = 83; + + q[3][4] = 10; q[3][5] = 400; + q[3][6] = 3691; q[3][7] = 431; + q[3][8] = 331; q[3][9] = 10; + q[3][10] = 10; q[3][11] = 412; + q[3][12] 
= 47; q[3][13] = 22; + q[3][14] = 170; q[3][15] = 590; + q[3][16] = 266; q[3][17] = 18; + q[3][18] = 281; q[3][19] = 75; + + q[4][5] = 10; q[4][6] = 10; + q[4][7] = 303; q[4][8] = 441; + q[4][9] = 280; q[4][10] = 396; + q[4][11] = 48; q[4][12] = 159; + q[4][13] = 726; q[4][14] = 285; + q[4][15] = 2331; q[4][16] = 576; + q[4][17] = 435; q[4][18] = 1466; + q[4][19] = 592; + + q[5][6] = 3122; q[5][7] = 133; + q[5][8] = 1269; q[5][9] = 92; + q[5][10] = 286; q[5][11] = 3313; + q[5][12] = 202; q[5][13] = 10; + q[5][14] = 323; q[5][15] = 396; + q[5][16] = 241; q[5][17] = 53; + q[5][18] = 391; q[5][19] = 54; + + q[6][7] = 379; q[6][8] = 162; + q[6][9] = 148; q[6][10] = 82; + q[6][11] = 2629; q[6][12] = 113; + q[6][13] = 145; q[6][14] = 185; + q[6][15] = 568; q[6][16] = 369; + q[6][17] = 63; q[6][18] = 142; + q[6][19] = 200; + + q[7][8] = 19; q[7][9] = 40; + q[7][10] = 20; q[7][11] = 263; + q[7][12] = 21; q[7][13] = 25; + q[7][14] = 28; q[7][15] = 691; + q[7][16] = 92; q[7][17] = 82; + q[7][18] = 10; q[7][19] = 91; + + q[8][9] = 29; q[8][10] = 66; + q[8][11] = 305; q[8][12] = 10; + q[8][13] = 127; q[8][14] = 152; + q[8][15] = 303; q[8][16] = 32; + q[8][17] = 69; q[8][18] = 1971; + q[8][19] = 25; + + q[9][10] = 1745; q[9][11] = 345; + q[9][12] = 1772; q[9][13] = 454; + q[9][14] = 117; q[9][15] = 216; + q[9][16] = 1040; q[9][17] = 42; + q[9][18] = 89; q[9][19] = 4797; + + q[10][11] = 218; q[10][12] = 1351; + q[10][13] = 1268; q[10][14] = 219; + q[10][15] = 516; q[10][16] = 156; + q[10][17] = 159; q[10][18] = 189; + q[10][19] = 865; + + q[11][12] = 193; q[11][13] = 72; + q[11][14] = 302; q[11][15] = 868; + q[11][16] = 918; q[11][17] = 10; + q[11][18] = 247; q[11][19] = 249; + + q[12][13] = 327; q[12][14] = 100; + q[12][15] = 93; q[12][16] = 645; + q[12][17] = 86; q[12][18] = 215; + q[12][19] = 475; + + q[13][14] = 43; q[13][15] = 487; + q[13][16] = 148; q[13][17] = 468; + q[13][18] = 2370; q[13][19] = 317; + + q[14][15] = 1202; q[14][16] = 260; + q[14][17] = 49; q[14][18] = 
97; + q[14][19] = 122; + + q[15][16] = 2151; q[15][17] = 73; + q[15][18] = 522; q[15][19] = 167; + + q[16][17] = 29; q[16][18] = 71; + q[16][19] = 760; + + q[17][18] = 346; q[17][19] = 10; + + q[18][19] = 119; + + f[0] = 0.076; + f[1] = 0.062; + f[2] = 0.041; + f[3] = 0.037; + f[4] = 0.009; + f[5] = 0.038; + f[6] = 0.049; + f[7] = 0.084; + f[8] = 0.025; + f[9] = 0.081; + f[10] = 0.101; + f[11] = 0.050; + f[12] = 0.022; + f[13] = 0.051; + f[14] = 0.043; + f[15] = 0.062; + f[16] = 0.054; + f[17] = 0.018; + f[18] = 0.031; + f[19] = 0.066; +} + diff --git a/forester/archive/RIO/others/puzzle_mod/src/ppuzzle.c b/forester/archive/RIO/others/puzzle_mod/src/ppuzzle.c new file mode 100644 index 0000000..04a1cc2 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/ppuzzle.c @@ -0,0 +1,2418 @@ +/* + * ppuzzle.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. 
+ */ + +#define EXTERN extern + +#include +#include +#include "ppuzzle.h" + + +int PP_IamMaster; +int PP_IamSlave; +int PP_Myid; +int PP_MyMaster; +int PP_NumProcs; +MPI_Comm PP_Comm; + +int *freeslaves; /* Queue of free slaves */ +int firstslave, /* headpointer of queue */ + lastslave; /* tailpointer of queue */ + +int *permutsent, + *permutrecved, + *quartsent, + *quartrecved, + *doquartsent, + *doquartrecved, + *splitsent, + *splitrecved, + *permutsentn, + *permutrecvedn, + *quartsentn, + *quartrecvedn, + *doquartsentn, + *doquartrecvedn, + *splitsentn, + *splitrecvedn; + +double *walltimes, + *cputimes; +double *fullwalltimes, + *fullcputimes; +double *altwalltimes, + *altcputimes; + +int PP_permutsent = 0; /* # of */ +int PP_permutrecved = 0; /* # of */ +int PP_quartsent = 0; /* # of */ +int PP_quartrecved = 0; /* # of */ +int PP_doquartsent = 0; /* # of */ +int PP_doquartrecved = 0; /* # of */ +int PP_splitsent = 0; /* # of */ +int PP_splitrecved = 0; /* # of */ +int PP_permutsentn = 0; /* # of */ +int PP_permutrecvedn = 0; /* # of */ +int PP_quartsentn = 0; /* # of */ +int PP_quartrecvedn = 0; /* # of */ +int PP_doquartsentn = 0; /* # of */ +int PP_doquartrecvedn = 0; /* # of */ +int PP_splitsentn = 0; /* # of */ +int PP_splitrecvedn = 0; /* # of */ + +double PP_starttime = 0, + PP_stoptime = 0, + PP_inittime = 0, + PP_paramcomptime = 0, + PP_paramsendtime = 0, + PP_quartcomptime = 0, + PP_quartsendtime = 0, + PP_puzzletime = 0, + PP_treetime = 0, + PP_lasttime = 0; + +int PP_MaxSlave = 0; + + +/********************************************************************* +* miscellaneous utilities * +*********************************************************************/ + +int dcmp(const void *a, const void *b) +{ + if (*(double *)a > *(double *)b) return (-1); + else if (*(double *)a < *(double *)b) return 1; + else return 0; +} + +/******************/ + +void PP_cmpd(int rank, double a, double b) +{ + if (a != b) + FPRINTF(STDOUTFILE "(%2d) *** %.3f != %.3f\n", 
rank, a, b); +} + +/******************/ + +void PP_cmpi(int rank, int a, int b) +{ + if (a != b) + FPRINTF(STDOUTFILE "(%2d) *** %d != %d\n", rank, a, b); +} + +/******************/ + +double PP_timer() +{ + double tmptime; + if (PP_lasttime == 0) { + PP_lasttime = MPI_Wtime(); + return(0); + } + else { + tmptime = PP_lasttime; + PP_lasttime = MPI_Wtime(); + return(PP_lasttime - tmptime); + } +} + +/******************/ + +void PP_Printerror(FILE *of, int id, int err) +{ + char errstr[MPI_MAX_ERROR_STRING]; + int errstrlen; + + if ((err > MPI_SUCCESS) && (err <= MPI_ERR_LASTCODE)) { + MPI_Error_string(err, errstr, &errstrlen); + fprintf(of, "(%2d) MPI ERROR %d : %s\n", id, err, errstr); + } + else { + if (err == MPI_SUCCESS) + fprintf(of, "(%2d) MPI ERROR %d : No error\n", id, err); + else + fprintf(of, "(%2d) MPI ERROR %d : unknown error number\n", id, err); + } +} /* PP_Printerror */ + +/******************/ + +void PP_Printbiparts(cmatrix biparts) +{ int n1, n2; + for (n1=0; n1<(Maxspc-3); n1++) { + if (n1==0) FPRINTF(STDOUTFILE "(%2d) bipartition : ", PP_Myid); + else FPRINTF(STDOUTFILE "(%2d) : ", PP_Myid); + for (n2=0; n2= qnum) + while ((lowval > qnum)) { + dd -= 1; lowval = (uli) dd*(dd-1)*(dd-2)*(dd-3)/24; + } + else { + while (highval <= qnum) { + dd += 1; highval = (uli) (dd+1)*dd*(dd-1)*(dd-2)/24; + } + lowval = (uli) dd*(dd-1)*(dd-2)*(dd-3)/24; + } + qnum -= lowval; + if (qnum > 0) { + temp = (double)(6 * qnum); + temp = pow(temp, (double)(1/3)); + cc = (uli) floor(temp); + if (cc < 2) cc= 2; + lowval = (uli) cc*(cc-1)*(cc-2)/6; + highval = (uli) (cc+1)*cc*(cc-1)/6; + if (lowval >= qnum) + while ((lowval > qnum)) { + cc -= 1; lowval = (uli) cc*(cc-1)*(cc-2)/6; + } + else { + while (highval <= qnum) { + cc += 1; highval = (uli) (cc+1)*cc*(cc-1)/6; + } + lowval = (uli) cc*(cc-1)*(cc-2)/6; + } + qnum -= lowval; + if (qnum > 0) { + temp = (double)(2 * qnum); + temp = sqrt(temp); + bb = (uli) floor(temp); + if (bb < 1) bb= 1; + lowval = (uli) bb*(bb-1)/2; + 
highval = (uli) (bb+1)*bb/2; + if (lowval >= qnum) + while ((lowval > qnum)) { + bb -= 1; lowval = (uli) bb*(bb-1)/2; + } + else { + while (highval <= qnum) { + bb += 1; highval = (uli) (bb+1)*bb/2; + } + lowval = (uli) bb*(bb-1)/2; + } + qnum -= lowval; + if (qnum > 0) { + aa = (uli) qnum; + if (aa < 0) aa= 0; + } + } + } + *d = (int)dd; + *c = (int)cc; + *b = (int)bb; + *a = (int)aa; +} /* num2quart */ + +/******************/ + +uli numquarts(int maxspc) +{ + uli tmp; + int a, b, c, d; + + if (maxspc < 4) + return (uli)0; + else { + maxspc--; + a = maxspc-3; + b = maxspc-2; + c = maxspc-1; + d = maxspc; + + tmp = (uli) 1 + a + + (uli) b * (b-1) / 2 + + (uli) c * (c-1) * (c-2) / 6 + + (uli) d * (d-1) * (d-2) * (d-3) / 24; + return (tmp); + } +} /* numquarts */ + +/******************/ + +uli quart2num (int a, int b, int c, int d) +{ + uli tmp; + if ((a>b) || (b>c) || (c>d)) { + fprintf(stderr, "Error PP5 not (%d <= %d <= %d <= %d) !!!\n", a, b, c, d); + exit (1); + } + tmp = (uli) a + + (uli) b * (b-1) / 2 + + (uli) c * (c-1) * (c-2) / 6 + + (uli) d * (d-1) * (d-2) * (d-3) / 24; + return (tmp); +} /* quart2num */ +#endif +/******************/ + + +/********************************************************************* +* queue for storing the ranks of slaves waiting for work * +*********************************************************************/ + +void PP_initslavequeue() +{ + int n; + freeslaves = new_ivector(PP_NumProcs); + firstslave = 0; + PP_MaxSlave = PP_NumProcs-1; + lastslave = PP_MaxSlave-1; + freeslaves[PP_MaxSlave] = PP_MaxSlave; + for (n=0; n 900) { + /* every 900 seconds */ + /* percentage of completed trees */ + if (mflag == 0) { + FPRINTF(STDOUTFILE "\n"); + mflag = 1; + } + tc2 = 100.0*Currtrial/Numtrial + + 100.0*nq/Numquartets/Numtrial; + mintogo = (100.0-tc2) * + (double) (time2-time0)/60.0/tc2; + hours = floor(mintogo/60.0); + minutes = mintogo - 60.0*hours; + FPRINTF(STDOUTFILE "%2.2f%%", tc2); + FPRINTF(STDOUTFILE " completed (remaining"); + 
FPRINTF(STDOUTFILE " time: %.0f", hours); + FPRINTF(STDOUTFILE " hours %.0f", minutes); + FPRINTF(STDOUTFILE " minutes)\n"); + time1 = time2; + } +# endif /* SEQUENTIAL */ + } + + /* find out which edge has the lowest edgeinfo */ + minimumedgeinfo(); + + /* add the next leaf on minedge */ + addnextleaf(minedge); + } + + /* compute bipartitions of current tree */ + computebiparts(); + +#if PARALLEL + if (PP_IamMaster) makenewsplitentries(); +# else + makenewsplitentries(); +# endif + + { + int *ctree, startnode; + char *trstr; + ctree = initctree(); + copytree(ctree); + startnode = sortctree(ctree); + trstr=sprintfctree(ctree, psteptreestrlen); + (void) addtree2list(&trstr, 1, &psteptreelist, &psteptreenum, &psteptreesum); +# ifdef PVERBOSE2 + /* fprintf(STDOUT, "%s\n", trstr); */ + printfpstrees(psteptreelist); +# endif + freectree(&ctree); + } + + + /* free tree before building the next tree */ + freetree(); + +} /* PP_slave_do_puzzling */ + +/******************/ + +void PP_do_puzzling(ivector trueID) +{ +int dest; + +# if PARALLEL + dest = PP_getslave(); + PP_SendPermut(dest, Maxspc, trueID); +# endif + + /* initialize tree */ + inittree(); + + PP_RecvSplits(Maxspc, biparts); + +# ifdef PVERBOSE3 + PP_Printbiparts(biparts); +# endif /* PVERBOSE3 */ + + makenewsplitentries(); + + /* free tree before building the next tree */ + freetree(); + +} /* PP_do_puzzling */ + +/******************/ + + +void PP_do_write_quart(int e, + int f, + int g, + int h, + double d1, + double d2, + double d3, + uli *numbq, + uli *bqarr) +{ + double lhs[3], + temp, + wlist[6], + plist[6]; + unsigned char qpbranching; + int badquartet; + + lhs[0] = d1; + lhs[1] = d2; + lhs[2] = d3; + + badquartet = FALSE; + + /* compute Bayesian weights */ + temp = (lhs[0] + lhs[1] + lhs[2])/3.0; + lhs[0] = exp(lhs[0] - temp); + lhs[1] = exp(lhs[1] - temp); + lhs[2] = exp(lhs[2] - temp); + temp = lhs[0] + lhs[1] + lhs[2]; + wlist[0] = lhs[0] / temp; + wlist[1] = 1.0; + wlist[2] = lhs[1] / temp; + wlist[3] 
= 2.0; + wlist[4] = lhs[2] / temp; + wlist[5] = 4.0; + + /* sort in descending order */ + qsort(wlist, 3, 2*sizeof(double), dcmp); + + /* check out the three possibilities */ + + /* 100 distribution */ + plist[0] = (1.0 - wlist[0])*(1.0 - wlist[0]) + + (0.0 - wlist[2])*(0.0 - wlist[2]) + + (0.0 - wlist[4])*(0.0 - wlist[4]); + plist[1] = wlist[1]; + + /* 110 distribution */ + plist[2] = (0.5 - wlist[0])*(0.5 - wlist[0]) + + (0.5 - wlist[2])*(0.5 - wlist[2]) + + (0.0 - wlist[4])*(0.0 - wlist[4]); + plist[3] = wlist[1] + wlist[3]; + + /* 111 distribution */ + temp = 1.0/3.0; + plist[4] = (temp - wlist[0])*(temp - wlist[0]) + + (temp - wlist[2])*(temp - wlist[2]) + + (temp - wlist[4])*(temp - wlist[4]); + plist[5] = wlist[1] + wlist[3] + wlist[5]; + + /* sort in descending order */ + qsort(plist, 3, 2*sizeof(double), dcmp); + + qpbranching = (unsigned char) plist[5]; + writequartet(e, f, g, h, qpbranching); + + /* a bad quartet is a quartet that shows + equal weights for all three possible topologies */ + if (qpbranching == 7) badquartet = TRUE; + + if (badquartet) { + bqarr[(*numbq)++] = quart2num(e, f, g, h); +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) bad quartet: %d %d %d %d -> %ld\n", + PP_Myid, e, f, g, h, quart2num(e, f, g, h)); +# endif /* PVERBOSE3 */ + badqs++; + badtaxon[e]++; + badtaxon[f]++; + badtaxon[g]++; + badtaxon[h]++; + } /* if badquartet */ +} /* PP_do_write_quart */ + +/********************************************************************* +* sending/receiving the important sizes and parameter (M->S) * +*********************************************************************/ + +void PP_SendSizes(int mspc, + int msite, + int ncats, + int nptrn, + int rad, + int outgr, + double frconst, + int rseed) +{ +# define NUMINT 7 +# define NUMDBL 1 + int ints[NUMINT]; + double doubles[NUMDBL]; + MPI_Datatype Dtypes[2] = {MPI_INT, MPI_DOUBLE}; + int Dtypelens[2] = {NUMINT , NUMDBL}; + MPI_Aint Dtypeaddr[2]; + MPI_Datatype PP_Sizes; + int dest; + int error; + 
+# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) Sending: Maxspc=%d Maxsite=%d numcats=%d\n", PP_Myid, mspc, msite, ncats); + FPRINTF(STDOUTFILE "(%2d) Numprtn=%d tpmradix=%d fracconst=%.3f\n", PP_Myid, nptrn, rad, frconst); +# endif /* PVERBOSE2 */ + + ints[0] = mspc; + ints[1] = msite; + ints[2] = ncats; + ints[3] = nptrn; + ints[4] = rad; + ints[5] = outgr; + ints[6] = rseed; + doubles[0] = frconst; + + MPI_Address(ints, Dtypeaddr); + MPI_Address(doubles, (Dtypeaddr+1)); + + MPI_Type_struct(2, Dtypelens, Dtypeaddr, Dtypes, &PP_Sizes); + MPI_Type_commit(&PP_Sizes); + + for (dest=1; dest (%2d) Sent Sizes\n", PP_Myid, dest); +# endif /* PVERBOSE3 */ + + } /* for each slave */ + + MPI_Type_free(&PP_Sizes); + +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) ... Sent Sizes\n", PP_Myid); +# endif /* PVERBOSE3 */ + +# undef NUMINT +# undef NUMDBL +} /* PP_SendSizes */ + + +/******************/ + +void PP_RecvSizes(int *mspc, + int *msite, + int *ncats, + int *nptrn, + int *rad, + int *outgr, + double *frconst, + int *rseed) +{ +# define NUMINT 7 +# define NUMDBL 1 + int ints[NUMINT]; + double doubles[NUMDBL]; + MPI_Datatype Dtypes[2] = {MPI_INT, MPI_DOUBLE}; + int Dtypelens[2] = {NUMINT , NUMDBL}; + MPI_Aint Dtypeaddr[2]; + MPI_Datatype PP_Sizes; + MPI_Status stat; + int error; + +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) Receiving Sizes ...\n", PP_Myid); +# endif /* PVERBOSE3 */ + + MPI_Address(ints, Dtypeaddr); + MPI_Address(doubles, (Dtypeaddr+1)); + + MPI_Type_struct(2, Dtypelens, Dtypeaddr, Dtypes, &PP_Sizes); + MPI_Type_commit(&PP_Sizes); + + error = MPI_Probe(PP_MyMaster, MPI_ANY_TAG, PP_Comm, &stat); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, 700+PP_Myid, error); + if (stat.MPI_TAG != PP_SIZES) { + if (stat.MPI_TAG == PP_DONE) { + PP_RecvDone(); +# ifdef PVERBOSE1 + FPRINTF(STDOUTFILE "(%2d) Finishing...\n", PP_Myid); +# endif /* PVERBOSE1 */ + MPI_Finalize(); + exit(1); + } else { + FPRINTF(STDOUTFILE "(%2d) Error: unexpected TAG received...\n", 
PP_Myid); + MPI_Finalize(); + exit(1); + } + } + + error = MPI_Recv(MPI_BOTTOM, 1, PP_Sizes, PP_MyMaster, MPI_ANY_TAG, PP_Comm, &stat); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, 700+PP_Myid, error); + if (stat.MPI_TAG != PP_SIZES) { + FPRINTF(STDOUTFILE "(%2d) Error: unexpected TAG received...\n", PP_Myid); + MPI_Finalize(); + exit(1); + } + + *mspc = ints[0]; + *msite = ints[1]; + *ncats = ints[2]; + *nptrn = ints[3]; + *rad = ints[4]; + *outgr = ints[5]; + *rseed = ints[6]; + *frconst = doubles[0]; + + MPI_Type_free(&PP_Sizes); + +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) <- (%2d) Received: Maxspec=%d Maxsite=%d numcats=%d\n", PP_Myid, PP_MyMaster, *mspc, *msite, *ncats); + FPRINTF(STDOUTFILE "(%2d) Numprtn=%d tpmradix=%d fracconst=%.3f\n", PP_Myid, *nptrn, *rad, *frconst); +# endif /* PVERBOSE2 */ + +# undef NUMINT +# undef NUMDBL +} /* PP_RecvSizes */ + + + +/********************************************************************* +* sending/receiving the data matrizes (M->S) * +*********************************************************************/ + +void PP_RecvData( + cmatrix Seqpat, /* cmatrix (Maxspc x Numptrn) */ + ivector Alias, /* ivector (Maxsite) */ + ivector Weight, /* ivector (Numptrn) */ + ivector constpat, + dvector Rates, /* dvector (numcats) */ + dvector Eval, /* dvector (tpmradix) */ + dvector Freqtpm, + dmatrix Evec, /* dmatrix (tpmradix x tpmradix) */ + dmatrix Ievc, + dmatrix iexp, + dmatrix Distanmat, /* dmatrix (Maxspc x Maxspc) */ + dcube ltprobr) /* dcube (numcats x tpmradix x tpmradix) */ +{ + MPI_Datatype Dtypes[12]; + int Dtypelens[12]; + MPI_Aint Dtypeaddr[12]; + MPI_Datatype PP_Data; + MPI_Status stat; + int error; + +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) Receiving Sizes ...\n", PP_Myid); +# endif /* PVERBOSE2 */ + + Dtypes [0] = MPI_CHAR; Dtypelens [0] = Maxspc * Numptrn; + MPI_Address(&(Seqpat[0][0]), &(Dtypeaddr[0])); + Dtypes [1] = MPI_INT; Dtypelens [1] = Maxsite ; + MPI_Address(&(Alias[0]), &(Dtypeaddr[1])); 
+ Dtypes [2] = MPI_INT; Dtypelens [2] = Numptrn ; + MPI_Address(&(Weight[0]), &(Dtypeaddr[2])); + Dtypes [3] = MPI_INT; Dtypelens [3] = Numptrn ; + MPI_Address(&(constpat[0]), &(Dtypeaddr[3])); + Dtypes [4] = MPI_DOUBLE; Dtypelens [4] = numcats ; + MPI_Address(&(Rates[0]), &(Dtypeaddr[4])); + Dtypes [5] = MPI_DOUBLE; Dtypelens [5] = tpmradix ; + MPI_Address(&(Eval[0]), &(Dtypeaddr[5])); + Dtypes [6] = MPI_DOUBLE; Dtypelens [6] = tpmradix ; + MPI_Address(&(Freqtpm[0]), &(Dtypeaddr[6])); + Dtypes [7] = MPI_DOUBLE; Dtypelens [7] = tpmradix * tpmradix ; + MPI_Address(&(Evec[0][0]), &(Dtypeaddr[7])); + Dtypes [8] = MPI_DOUBLE; Dtypelens [8] = tpmradix * tpmradix ; + MPI_Address(&(Ievc[0][0]), &(Dtypeaddr[8])); + Dtypes [9] = MPI_DOUBLE; Dtypelens [9] = tpmradix * tpmradix ; + MPI_Address(&(iexp[0][0]), &(Dtypeaddr[9])); + Dtypes [10] = MPI_DOUBLE; Dtypelens [10] = Maxspc * Maxspc ; + MPI_Address(&(Distanmat[0][0]), &(Dtypeaddr[10])); + Dtypes [11] = MPI_DOUBLE; Dtypelens [11] = numcats * tpmradix * tpmradix ; + MPI_Address(&(ltprobr[0][0][0]), &(Dtypeaddr[11])); + + MPI_Type_struct(12, Dtypelens, Dtypeaddr, Dtypes, &PP_Data); + MPI_Type_commit(&PP_Data); + + + error = MPI_Probe(PP_MyMaster, MPI_ANY_TAG, PP_Comm, &stat); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, 700+PP_Myid, error); + if (stat.MPI_TAG != PP_DATA) { + if (stat.MPI_TAG == PP_DONE) { + PP_RecvDone(); +# ifdef PVERBOSE1 + FPRINTF(STDOUTFILE "(%2d) Finishing...\n", PP_Myid); +# endif /* PVERBOSE1 */ + MPI_Finalize(); + exit(1); + } else { + FPRINTF(STDOUTFILE "(%2d) Error: unexpected TAG received...\n", PP_Myid); + MPI_Finalize(); + exit(1); + } + } + + + error = MPI_Recv(MPI_BOTTOM, 1, PP_Data, PP_MyMaster, PP_DATA, PP_Comm, &stat); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, 900+PP_Myid, error); + +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) <- (%2d) Received : Alias(0)=%d - Weight(0)=%d - constpat(0)=%d\n", PP_Myid, PP_MyMaster, Alias[0], Weight[0], constpat[0]); + FPRINTF(STDOUTFILE 
"(%2d) Rates(0)=%.3f - Eval(0)=%.3f - Freqtpm(0)=%.3f\n", PP_Myid, Rates[0], Eval[0], Freqtpm[0]); + FPRINTF(STDOUTFILE "(%2d) Evec(0,0)=%.3f - Ievc(0,0)=%.3f - iexp(0,0)=%.3f - Distanmat(0,1)=%.3f\n", PP_Myid, Evec[0][0], Ievc[0][0], iexp[0][0], Distanmat[0][1]); + FPRINTF(STDOUTFILE "(%2d) Distanmat(0,1)=%.3f\n", PP_Myid, Distanmat[0][1]); + FPRINTF(STDOUTFILE "(%2d) ltprobr(0,0,0)=%.3f\n", PP_Myid, ltprobr[0][0][0]); +# endif /* PVERBOSE2 */ + + MPI_Type_free(&PP_Data); + +} /* PP_RecvData */ + + +/******************/ + +void PP_SendData( + cmatrix Seqpat, /* cmatrix (Maxspc x Numptrn) */ + ivector Alias, /* ivector (Maxsite) */ + ivector Weight, /* ivector (Numptrn) */ + ivector constpat, + dvector Rates, /* dvector (numcats) */ + dvector Eval, /* dvector (tpmradix) */ + dvector Freqtpm, + dmatrix Evec, /* dmatrix (tpmradix x tpmradix) */ + dmatrix Ievc, + dmatrix iexp, + dmatrix Distanmat, /* dmatrix (Maxspc x Maxspc) */ + dcube ltprobr) /* dcube (numcats x tpmradix x tpmradix) */ +{ + MPI_Datatype Dtypes[12]; + int Dtypelens[12]; + MPI_Aint Dtypeaddr[12]; + MPI_Datatype PP_Data; + int dest; + int error; + +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) Sending: Alias(0)=%d - Weight(0)=%d - constpat(0)=%d\n", PP_Myid, Alias[0], Weight[0], constpat[0]); + FPRINTF(STDOUTFILE "(%2d) Rates(0)=%.3f - Eval(0)=%.3f - Freqtpm(0)=%.3f\n", PP_Myid, Rates[0], Eval[0], Freqtpm[0]); + FPRINTF(STDOUTFILE "(%2d) Evec(0,0)=%.3f - Ievc(0,0)=%.3f - iexp(0,0)=%.3f - Distanmat(0,1)=%.3f\n", PP_Myid, Evec[0][0], Ievc[0][0], iexp[0][0], Distanmat[0][1]); + FPRINTF(STDOUTFILE "(%2d) ltprobr(0,0,0)=%.3f\n", PP_Myid, ltprobr[0][0][0]); +# endif /* PVERBOSE2 */ + + Dtypes [0] = MPI_CHAR; Dtypelens [0] = Maxspc * Numptrn; + MPI_Address(&(Seqpat[0][0]), &(Dtypeaddr[0])); + Dtypes [1] = MPI_INT; Dtypelens [1] = Maxsite ; + MPI_Address(&(Alias[0]), &(Dtypeaddr[1])); + Dtypes [2] = MPI_INT; Dtypelens [2] = Numptrn ; + MPI_Address(&(Weight[0]), &(Dtypeaddr[2])); + Dtypes [3] = MPI_INT; 
Dtypelens [3] = Numptrn ; + MPI_Address(&(constpat[0]), &(Dtypeaddr[3])); + Dtypes [4] = MPI_DOUBLE; Dtypelens [4] = numcats ; + MPI_Address(&(Rates[0]), &(Dtypeaddr[4])); + Dtypes [5] = MPI_DOUBLE; Dtypelens [5] = tpmradix ; + MPI_Address(&(Eval[0]), &(Dtypeaddr[5])); + Dtypes [6] = MPI_DOUBLE; Dtypelens [6] = tpmradix ; + MPI_Address(&(Freqtpm[0]), &(Dtypeaddr[6])); + Dtypes [7] = MPI_DOUBLE; Dtypelens [7] = tpmradix * tpmradix ; + MPI_Address(&(Evec[0][0]), &(Dtypeaddr[7])); + Dtypes [8] = MPI_DOUBLE; Dtypelens [8] = tpmradix * tpmradix ; + MPI_Address(&(Ievc[0][0]), &(Dtypeaddr[8])); + Dtypes [9] = MPI_DOUBLE; Dtypelens [9] = tpmradix * tpmradix ; + MPI_Address(&(iexp[0][0]), &(Dtypeaddr [9])); + Dtypes [10] = MPI_DOUBLE; Dtypelens [10] = Maxspc * Maxspc ; + MPI_Address(&(Distanmat[0][0]), &(Dtypeaddr[10])); + Dtypes [11] = MPI_DOUBLE; Dtypelens [11] = numcats * tpmradix * tpmradix ; + MPI_Address(&(ltprobr[0][0][0]), &(Dtypeaddr[11])); + + MPI_Type_struct(12, Dtypelens, Dtypeaddr, Dtypes, &PP_Data); + MPI_Type_commit(&PP_Data); + + for (dest=1; dest (%2d) Sent Data\n", PP_Myid, dest); +# endif /* PVERBOSE2 */ + + } /* for each slave */ + + MPI_Type_free(&PP_Data); + +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) ... 
Sent Data\n", PP_Myid); +# endif /* PVERBOSE2 */ + +} /* PP_SendData */ + + +/************************************************************************** +* procedures to send the request to compute a single quartet (M->S) * +**************************************************************************/ + +void PP_SendDoQuart(int dest, + int a, + int b, + int c, + int d, + int approx) +{ +# define NUMINT 5 + int ints[NUMINT]; + int error; + + ints[0] = a; + ints[1] = b; + ints[2] = c; + ints[3] = d; + ints[4] = approx; + + PP_doquartsent++; + PP_doquartsentn++; + +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) Sending -> (%2d): Quart(%d,%d,%d,%d)\n", PP_Myid, dest, a, b, c, d); +# endif /* PVERBOSE2 */ + + error = MPI_Ssend(ints, NUMINT, MPI_INT, dest, PP_DOQUART, PP_Comm); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, PP_Myid, error); + +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) ... Sent \n", PP_Myid); +# endif /* PVERBOSE3 */ +# undef NUMINT + +} /* PP_SendDoQuart */ + + + +/******************/ + +void PP_RecvDoQuart(int *a, + int *b, + int *c, + int *d, + int *approx) +{ +# define NUMINT 5 + int ints[NUMINT]; + int error; + MPI_Status stat; + PP_doquartrecved++; + PP_doquartrecvedn++; + +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) Receiving: Quart\n", PP_Myid); +# endif /* PVERBOSE3 */ + + error = MPI_Recv(ints, NUMINT, MPI_INT, PP_MyMaster, PP_DOQUART, PP_Comm, &stat); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, 200+PP_Myid, error); + + *a = ints[0]; + *b = ints[1]; + *c = ints[2]; + *d = ints[3]; + *approx = ints[4]; + +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) Received: Quart(%d,%d,%d,%d,%c)\n", PP_Myid, *a, *b, *c, *d, (approx ? 
'A' : 'E')); +# endif /* PVERBOSE2 */ +# undef NUMINT + +} /* PP_RecvDoQuart */ + + +/************************************************************************** +* procedures to send the result of a single quartet (S->M) * +**************************************************************************/ + +void PP_SendQuart(int a, + int b, + int c, + int d, + double d1, + double d2, + double d3, + int approx) +{ +# define NUMINT 5 +# define NUMDBL 3 + int ints[NUMINT]; + double doubles[NUMDBL]; + MPI_Datatype Dtypes[2] = {MPI_INT, MPI_DOUBLE}; + int Dtypelens[2] = {NUMINT , NUMDBL}; + MPI_Aint Dtypeaddr[2]; + MPI_Datatype PP_Quart; + int error; + + PP_quartsent++; + PP_quartsentn++; + ints[0] = a; + ints[1] = b; + ints[2] = c; + ints[3] = d; + ints[4] = approx; + doubles[0] = d1; + doubles[1] = d2; + doubles[2] = d3; + + MPI_Address(ints, Dtypeaddr); + MPI_Address(doubles, (Dtypeaddr+1)); + + MPI_Type_struct(2, Dtypelens, Dtypeaddr, Dtypes, &PP_Quart); + MPI_Type_commit(&PP_Quart); + +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) Sending: Quart(%d,%d,%d,%d) = (%.3f, %.3f, %.3f)\n", PP_Myid, a, b, c, d, d1, d2, d3); +# endif /* PVERBOSE2 */ + + error = MPI_Ssend(MPI_BOTTOM, 1, PP_Quart, PP_MyMaster, PP_QUART, PP_Comm); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, 400+PP_Myid, error); + + MPI_Type_free(&PP_Quart); + +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) ... 
Sent \n", PP_Myid); +# endif /* PVERBOSE3 */ +# undef NUMINT +# undef NUMDBL + +} /* PP_SendQuart */ + + + +/******************/ + +void PP_RecvQuart(int *a, + int *b, + int *c, + int *d, + double *d1, + double *d2, + double *d3, + int *approx) +{ +# define NUMINT 5 +# define NUMDBL 3 + int ints[NUMINT]; + double doubles[NUMDBL]; + MPI_Datatype Dtypes[2] = {MPI_INT, MPI_DOUBLE}; + int Dtypelens[2] = {NUMINT , NUMDBL}; + MPI_Aint Dtypeaddr[2]; + MPI_Datatype PP_Quart; + int error; + MPI_Status stat; + + PP_quartrecved++; + PP_quartrecvedn++; + MPI_Address(ints, Dtypeaddr); + MPI_Address(doubles, (Dtypeaddr+1)); + + MPI_Type_struct(2, Dtypelens, Dtypeaddr, Dtypes, &PP_Quart); + MPI_Type_commit(&PP_Quart); + + error = MPI_Recv(MPI_BOTTOM, 1, PP_Quart, MPI_ANY_SOURCE, PP_QUART, PP_Comm, &stat); + + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, 500+PP_Myid, error); + + PP_putslave(stat.MPI_SOURCE); + + *a = ints[0]; + *b = ints[1]; + *c = ints[2]; + *d = ints[3]; + *d1 = doubles[0]; + *d2 = doubles[1]; + *d3 = doubles[2]; + *approx = ints[4]; + + MPI_Type_free(&PP_Quart); + +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) Received <- (%2d): Quart(%d,%d,%d,%d)=(%.3f, %.3f, %.3f)\n", PP_Myid, stat.MPI_SOURCE, *a, *b, *c, *d, *d1, *d2, *d3); +# endif /* PVERBOSE2 */ +# undef NUMINT +# undef NUMDBL + +} /* PP_RecvQuart */ + + + +/************************************************************************** +* procedures to send the request to compute a block of quartets (M->S) * +**************************************************************************/ + +void PP_SendDoQuartBlock(int dest, uli firstq, uli amount, int approx) +{ +# define NUMULI 3 + uli ulongs[NUMULI]; + int error; + + PP_doquartsent += amount; + PP_doquartsentn++; +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) Sending: DOQuartBlock Signal\n", PP_Myid); +# endif /* PVERBOSE2 */ + + ulongs[0] = firstq; + ulongs[1] = amount; + ulongs[2] = (uli)approx; + + error = MPI_Ssend(ulongs, NUMULI, MPI_UNSIGNED_LONG, 
dest, PP_DOQUARTBLOCK, PP_Comm); + if (error != MPI_SUCCESS) PP_Printerror(STDOUT, 2100+PP_Myid, error); + +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) ... Sent DOQuartBlock Signal (addr:%ld, num:%ld)\n", PP_Myid, firstq, amount); +# endif /* PVERBOSE3 */ +# undef NUMULI + +} /* PP_SendDoQuartBlock */ + +/******************/ + +void PP_RecvDoQuartBlock(uli *firstq, uli *amount, uli **bq, int *approx) +{ +# define NUMULI 3 + uli ulongs[NUMULI]; + MPI_Status stat; + int error; + +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) Receiving: DOQuartBlock Signal\n", PP_Myid); +# endif /* PVERBOSE2 */ + + error = MPI_Recv(&ulongs, NUMULI, MPI_UNSIGNED_LONG, PP_MyMaster, PP_DOQUARTBLOCK, PP_Comm, &stat); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, 2100+PP_Myid, error); + + *firstq=ulongs[0]; + *amount=ulongs[1]; + *approx= (int)ulongs[2]; + + *bq = malloc((unsigned)*amount * sizeof(uli)); + + PP_doquartrecved += *amount; + PP_doquartrecvedn++; + +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) ... 
DOQuartBlock (addr:%ld, num:%ld)\n", + PP_Myid, *firstq, *amount); +# endif /* PVERBOSE3 */ + +# undef NUMULI +} /* PP_RecvDoQuartBlock */ + +/********************************************************************* +* procedures to send the results of a block of quartets (S->M) * +*********************************************************************/ + +void PP_SendQuartBlock(uli startq, + uli numofq, + unsigned char *quartetinfo, + uli numofbq, + uli *bq, + int approx) +{ +# define NUMULI 3 +# define NUMINT 1 + unsigned char *trueaddr; + uli truenum; + int error; + int ints[NUMINT]; + uli ulis[NUMULI]; + MPI_Datatype Dtypes[2] = {MPI_UNSIGNED_LONG, MPI_INT}; + int Dtypelens[2] = {NUMULI, NUMINT}; + MPI_Aint Dtypeaddr[2]; + MPI_Datatype PP_QBlockSpecs; + MPI_Datatype DtypesRes[2] = {MPI_UNSIGNED_CHAR, MPI_UNSIGNED_LONG}; + int DtypelensRes[2]; + MPI_Aint DtypeaddrRes[2]; + MPI_Datatype PP_QBlockRes; + +/* + uli *bq; + uli numofbq; +*/ + + PP_quartsent += numofq; + PP_quartsentn++; + + truenum = (uli)((numofq+1)/2); + trueaddr = (unsigned char *)(quartetinfo + (uli)(startq/2)); + +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) Sending: startq=%lud numofq=%lud\n", PP_Myid, startq, numofq); + FPRINTF(STDOUTFILE "(%2d) approx=%c\n", PP_Myid, (approx ? 'A' : 'E')); +# endif /* PVERBOSE2 */ + + ints[0] = approx; + ulis[0] = startq; + ulis[1] = numofq; + ulis[2] = numofbq; + + MPI_Address(ulis, Dtypeaddr); + MPI_Address(ints, (Dtypeaddr+1)); + + MPI_Type_struct(2, Dtypelens, Dtypeaddr, Dtypes, &PP_QBlockSpecs); + MPI_Type_commit(&PP_QBlockSpecs); + +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) Sending: xxPP_QuartBlockSpecs(0,%lu)=%d,%d\n", PP_Myid, truenum-1, trueaddr[0], trueaddr[truenum-1]); +# endif /* PVERBOSE2 */ + + + error = MPI_Ssend(MPI_BOTTOM, 1, PP_QBlockSpecs, PP_MyMaster, PP_QUARTBLOCKSPECS, PP_Comm); +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) ... 
Sent QuartBlockSpecs (%ld, %ld, %ld, %d)\n", PP_Myid, ulis[0], ulis[1], ulis[2], ints[0]); +# endif /* PVERBOSE3 */ + + MPI_Address(trueaddr, DtypeaddrRes); + DtypelensRes[0] = truenum; + + MPI_Address(bq, (DtypeaddrRes + 1)); + DtypelensRes[1] = numofbq; + MPI_Type_struct(2, DtypelensRes, DtypeaddrRes, DtypesRes, &PP_QBlockRes); + MPI_Type_commit(&PP_QBlockRes); + + error = MPI_Ssend(MPI_BOTTOM, 1, PP_QBlockRes, PP_MyMaster, PP_QUARTBLOCK, PP_Comm); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, PP_Myid, error); + + MPI_Type_free(&PP_QBlockSpecs); + MPI_Type_free(&PP_QBlockRes); +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) ... Sent xxPP_QuartBlock(0,%lu)=%d,%d\n", PP_Myid, truenum-1, trueaddr[0], trueaddr[truenum-1]); +# endif /* PVERBOSE3 */ + +# undef NUMULI +# undef NUMINT +} /* PP_SendQuartBlock */ + + + +/******************/ + +void PP_RecvQuartBlock(int slave, + uli *startq, + uli *numofq, + unsigned char *quartetinfo, + int *approx) +{ +# define NUMULI 3 +# define NUMINT 1 + unsigned char *trueaddr; + uli truenum; + int error; + int dest; + int ints[NUMINT]; + uli ulis[NUMULI]; + MPI_Datatype Dtypes[2] = {MPI_UNSIGNED_LONG, MPI_INT}; + int Dtypelens[2] = {NUMULI, NUMINT}; + MPI_Aint Dtypeaddr[2]; + MPI_Datatype PP_QBlockSpecs; + MPI_Datatype DtypesRes[2] = {MPI_UNSIGNED_CHAR, MPI_UNSIGNED_LONG}; + int DtypelensRes[2]; + MPI_Aint DtypeaddrRes[2]; + MPI_Datatype PP_QBlockRes; + MPI_Status stat; + uli count; +uli num; +uli *numofbq; +uli *bq; +numofbq=# +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) Receiving QuartBlock ...\n", PP_Myid); +# endif /* PVERBOSE3 */ + MPI_Address(ulis, Dtypeaddr); + MPI_Address(ints, (Dtypeaddr+1)); + + MPI_Type_struct(2, Dtypelens, Dtypeaddr, Dtypes, &PP_QBlockSpecs); + MPI_Type_commit(&PP_QBlockSpecs); + + MPI_Probe(MPI_ANY_SOURCE, PP_QUARTBLOCKSPECS, PP_Comm, &stat); + dest = stat.MPI_SOURCE; + error = MPI_Recv(MPI_BOTTOM, 1, PP_QBlockSpecs, dest, PP_QUARTBLOCKSPECS, PP_Comm, &stat); + if (error != MPI_SUCCESS) + 
PP_Printerror(STDOUT, PP_Myid, error); + + *approx = ints[0]; + *startq = ulis[0]; + *numofq = ulis[1]; + *numofbq = ulis[2]; + + PP_quartrecved += *numofq; + PP_quartrecvedn++; + truenum = (uli)((*numofq+1)/2); + trueaddr = (unsigned char *)(quartetinfo + (uli)(*startq/2)); +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) ... Recv QuartBlockSpecs (%ld, %ld, %ld, %d)\n", PP_Myid, ulis[0], ulis[1], ulis[2], ints[0]); +# endif /* PVERBOSE3 */ + + DtypelensRes[0] = truenum; + MPI_Address(trueaddr, DtypeaddrRes); + + bq = malloc((unsigned) *numofbq * sizeof(uli)); + + DtypelensRes[1] = *numofbq; + MPI_Address(bq, (DtypeaddrRes+1)); + MPI_Type_struct(2, DtypelensRes, DtypeaddrRes, DtypesRes, &PP_QBlockRes); + MPI_Type_commit(&PP_QBlockRes); + + error = MPI_Recv(MPI_BOTTOM, 1, PP_QBlockRes, dest, PP_QUARTBLOCK, PP_Comm, &stat); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, PP_Myid, error); +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) ... Recv QuartBlock \n", PP_Myid); +# endif /* PVERBOSE3 */ + + PP_putslave(dest); + + for(count = 0; count < *numofbq; count++){ + int a, b, c, d; + num2quart(bq[count], &a, &b, &c, &d); +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) %ld. bad quarted (%d, %d, %d, %d) = %ld\n", PP_Myid, count, a, b, c, d, bq[count]); +# endif /* PVERBOSE2 */ + + badqs++; + badtaxon[a]++; + badtaxon[b]++; + badtaxon[c]++; + badtaxon[d]++; + if (show_optn) { + fputid10(unresfp, a); + fprintf(unresfp, " "); + fputid10(unresfp, b); + fprintf(unresfp, " "); + fputid10(unresfp, c); + fprintf(unresfp, " "); + fputid(unresfp, d); + fprintf(unresfp, "\n"); + } + } + free(bq); + MPI_Type_free(&PP_QBlockSpecs); + MPI_Type_free(&PP_QBlockRes); +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) <- (%2d) ... 
Recv xxPP_QuartBlock(0,%lu)=%d,%d\n", PP_Myid, dest, truenum-1, trueaddr[0], trueaddr[truenum-1]); +# endif /* PVERBOSE2 */ + +# undef NUMULI +# undef NUMINT +} /* PP_RecvQuartBlock */ + + +/********************************************************************* +* send/receive array with all quartets (M->S) * +*********************************************************************/ + +void PP_SendAllQuarts(unsigned long Numquartets, + unsigned char *quartetinfo) +{ + MPI_Datatype Dtypes[1] = {MPI_UNSIGNED_CHAR}; + int Dtypelens[1]; + MPI_Aint Dtypeaddr[1]; + MPI_Datatype PP_AllQuarts; + int dest; + int error; + +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) Sending: PP_AllQuart(0)=%d\n", PP_Myid, quartetinfo[0]); +# endif /* PVERBOSE2 */ + + /* compute number of quartets */ + if (Numquartets % 2 == 0) { /* even number */ + Dtypelens[0] = (Numquartets)/2; + } else { /* odd number */ + Dtypelens[0] = (Numquartets + 1)/2; + } + + MPI_Address(&(quartetinfo[0]), Dtypeaddr); + MPI_Type_struct(1, Dtypelens, Dtypeaddr, Dtypes, &PP_AllQuarts); + MPI_Type_commit(&PP_AllQuarts); + + for (dest=1; dest (%2d) ... 
Sent xxAllQuart(0,%d)=%d,%d (%luq -> %db)\n", + PP_Myid, dest, Dtypelens[0]-1, quartetinfo[0], quartetinfo[Dtypelens[0]-1], + Numquartets, Dtypelens[0]-1); +# endif /* PVERBOSE3 */ + } /* for each slave */ + + MPI_Type_free(&PP_AllQuarts); + + +} /* PP_SendAllQuarts */ + + + +/******************/ + +void PP_RecvAllQuarts(int taxa, + unsigned long *Numquartets, + unsigned char *quartetinfo) +{ + MPI_Datatype Dtypes[1] = {MPI_UNSIGNED_CHAR}; + int Dtypelens[1]; + MPI_Aint Dtypeaddr[1]; + MPI_Datatype PP_AllQuarts; + MPI_Status stat; + int error; + +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) Receiving AllQuarts ...\n", PP_Myid); +# endif /* PVERBOSE3 */ + + /* compute number of quartets */ + *Numquartets = (uli) taxa*(taxa-1)*(taxa-2)*(taxa-3)/24; + if (*Numquartets % 2 == 0) { /* even number */ + Dtypelens[0] = (*Numquartets)/2; + } else { /* odd number */ + Dtypelens[0] = (*Numquartets + 1)/2; + } + + MPI_Address(&(quartetinfo[0]), Dtypeaddr); + MPI_Type_struct(1, Dtypelens, Dtypeaddr, Dtypes, &PP_AllQuarts); + MPI_Type_commit(&PP_AllQuarts); + + error = MPI_Recv(MPI_BOTTOM, 1, PP_AllQuarts, PP_MyMaster, PP_ALLQUARTS, PP_Comm, &stat); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, 1300+PP_Myid, error); + + MPI_Type_free(&PP_AllQuarts); + +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) <- (%2d) ... 
Recv xxAllQuart(0,%d)=%d,%d (%luq -> %db)\n", + PP_Myid, PP_MyMaster, Dtypelens[0]-1, quartetinfo[0], quartetinfo[Dtypelens[0]-1], + *Numquartets, Dtypelens[0]-1); +# endif /* PVERBOSE2 */ + +} /* PP_RecvAllQuarts */ + + + +/********************************************************************* +* procedures to send request for a single puzzle tree * +*********************************************************************/ + +void PP_SendPermut(int dest, + int taxa, + ivector permut) +{ + MPI_Datatype Dtypes[1] = {MPI_INT}; + int Dtypelens[1]; + MPI_Aint Dtypeaddr[1]; + MPI_Datatype PP_Permut; + int error; + + PP_permutsent++; + PP_permutsentn++; + Dtypelens[0] = taxa; + + MPI_Address(&(permut[0]), Dtypeaddr); + MPI_Type_struct(1, Dtypelens, Dtypeaddr, Dtypes, &PP_Permut); + MPI_Type_commit(&PP_Permut); + +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) Sending -> (%2d): PP_Permut(0)=%d\n", PP_Myid, dest, permut[0]); +# endif /* PVERBOSE2 */ + + error = MPI_Ssend(MPI_BOTTOM, 1, PP_Permut, dest, PP_DOPUZZLE, PP_Comm); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, 1500+PP_Myid, error); + + MPI_Type_free(&PP_Permut); + +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) ... 
Sent PP_Permut\n", PP_Myid); +# endif /* PVERBOSE3 */ + +} /* PP_SendPermut */ + +/******************/ + +void PP_RecvPermut(int taxa, + ivector permut) +{ + MPI_Datatype Dtypes[1] = {MPI_INT}; + int Dtypelens[1]; + MPI_Aint Dtypeaddr[1]; + MPI_Datatype PP_Permut; + MPI_Status stat; + int error; + + PP_permutrecved++; + PP_permutrecvedn++; +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) Receiving: PP_Permut\n", PP_Myid); +# endif /* PVERBOSE3 */ + + Dtypelens[0] = taxa; + + MPI_Address(&(permut[0]), Dtypeaddr); + MPI_Type_struct(1, Dtypelens, Dtypeaddr, Dtypes, &PP_Permut); + MPI_Type_commit(&PP_Permut); + + error = MPI_Recv(MPI_BOTTOM, 1, PP_Permut, PP_MyMaster, PP_DOPUZZLE, PP_Comm, &stat); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, 1700+PP_Myid, error); + + MPI_Type_free(&PP_Permut); + +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) Received: PP_Permut(0)=%d\n", PP_Myid, permut[0]); +# endif /* PVERBOSE2 */ + +} /* PP_RecvPermut */ + +/********************************************************************* +* procedures to send the splits of a puzzle tree to the master * +*********************************************************************/ + +void PP_SendSplitsBlock(int taxa, + uli blocksize, + cmatrix *biparts, + int pstnum, + treelistitemtype *pstlist) +{ + MPI_Datatype *Dtypes; + int *Dtypelens; + MPI_Aint *Dtypeaddr; + MPI_Datatype PP_Biparts; + int error; + int n; + int ints[3]; + int *pstnumarr; + treelistitemtype *pstptr; + + PP_splitsent+=blocksize; + PP_splitsentn++; + + ints[0] = taxa; + ints[1] = (int) blocksize; + ints[2] = pstnum; + error = MPI_Ssend(ints, 3, MPI_INT, PP_MyMaster, PP_PUZZLEBLOCKSPECS, PP_Comm); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, 1800+PP_Myid, error); + + Dtypes = malloc((blocksize + pstnum + 1) * sizeof(MPI_Datatype)); + Dtypelens = malloc((blocksize + pstnum + 1) * sizeof(int)); + Dtypeaddr = malloc((blocksize + pstnum + 1) * sizeof(MPI_Aint)); + pstnumarr = malloc(pstnum * sizeof(int)); + +# ifdef PVERBOSE2 
+ FPRINTF(STDOUTFILE "(%2d) Sending: PP_bipartsblock(0..%lu,0,0)8=\"%c\"\n", PP_Myid, blocksize, biparts[0][0][0]); +# endif /* PVERBOSE2 */ + + for (n=0; n<(int)blocksize; n++) { + Dtypes[n] = MPI_CHAR; + Dtypelens[n] = (taxa - 3) * taxa; + MPI_Address(&(biparts[n][0][0]), &(Dtypeaddr[n])); + } + pstptr = pstlist; + for (n=0; n%d: [%d/%d] #=%d \"%s\"\n", + PP_Myid, PP_MyMaster, n, pstnum, pstnumarr[n], (*pstptr).tree); +# endif /* PVERBOSE3 */ + pstptr = (*pstptr).succ; + } + Dtypes[((int)blocksize + pstnum)] = MPI_INT; + Dtypelens[((int)blocksize + pstnum)] = pstnum; + MPI_Address(&(pstnumarr[0]), &(Dtypeaddr[((int)blocksize + pstnum)])); + + MPI_Type_struct(((int)blocksize + pstnum + 1), Dtypelens, Dtypeaddr, Dtypes, &PP_Biparts); + MPI_Type_commit(&PP_Biparts); + + error = MPI_Ssend(MPI_BOTTOM, 1, PP_Biparts, PP_MyMaster, PP_PUZZLEBLOCK, PP_Comm); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, 1800+PP_Myid, error); + + MPI_Type_free(&PP_Biparts); + free(Dtypes); + free(Dtypelens); + free(Dtypeaddr); + free(pstnumarr); + +# ifdef PVERBOSE3 + FPRINTF(STDOUTFILE "(%2d) ... 
Sent PP_bipartsblock\n", PP_Myid); +# endif /* PVERBOSE3 */ + +} /* PP_SendSplitsBlock */ + +/******************/ + +void PP_RecvSplitsBlock(int *taxa, + uli *blocksize, + cmatrix **bip, + treelistitemtype **pstlist, + int *pstnum, + int *pstsum) +/* bp -> (*bip) */ +{ + MPI_Datatype *Dtypes; + int *Dtypelens; + MPI_Aint *Dtypeaddr; + MPI_Datatype PP_Biparts; + MPI_Status stat; + int error; + int n; + int dest; + int ints[3]; + int pstlistnum; + int tmpnum; + int tmpsum; + int *pstnumarr; + char **pstarr; + treelistitemtype *treeitem; + + error = MPI_Recv(ints, 3, MPI_INT, MPI_ANY_SOURCE, PP_PUZZLEBLOCKSPECS, PP_Comm, &stat); + if (error != MPI_SUCCESS) + PP_Printerror(STDOUT, 1900+PP_Myid, error); + + dest = stat.MPI_SOURCE; + *taxa = ints[0]; + *blocksize = (uli) ints[1]; + pstlistnum = ints[2]; + +# ifdef PVERBOSE2 + FPRINTF(STDOUTFILE "(%2d) Received<-%d: PP_bipartsblockspec(t=%d,b=%ld,p=%d)\n", PP_Myid, dest, *taxa, *blocksize, pstlistnum); +# endif /* PVERBOSE2 */ + + PP_splitrecved += *blocksize; + PP_splitrecvedn++; + + Dtypes = malloc((*blocksize + pstlistnum + 1) * sizeof(MPI_Datatype)); + Dtypelens = malloc((*blocksize + pstlistnum + 1) * sizeof(int)); + Dtypeaddr = malloc((*blocksize + pstlistnum + 1) * sizeof(MPI_Aint)); + (*bip) = (cmatrix *) malloc(*blocksize * sizeof(void *)); + pstnumarr = (int *) malloc(pstlistnum * sizeof(int)); + pstarr = (char **) malloc(pstlistnum * sizeof(char *)); + +/* pstarr[0] = (char *) malloc(psteptreestrlen * pstlistnum * sizeof(char)); + for(n=1; n 0) { + if (PP_emptyslave()) { + PP_RecvSplitsBlock(&tx, &bs, &bp, &psteptreelist, &psteptreenum, &psteptreesum); + for (bipnum=0; bipnum 0) { + PP_RecvSplitsBlock(&tx, &bs, &bp, &psteptreelist, &psteptreenum, &psteptreesum); + for (bipnum=0; bipnum%4ld (%dx%ld)\n", PP_Myid, qstart, qend, PP_NumProcs-1, qtodo); +# endif + + addtimes(GENERAL, &tarr); + for (i = 3; i < Maxspc; i++) + for (c = 2; c < i; c++) + for (b = 1; b < c; b++) + for (a = 0; a < b; a++) { + + idx = (uli) 
a + + (uli) b*(b-1)/2 + + (uli) c*(c-1)*(c-2)/6 + + (uli) i*(i-1)*(i-2)*(i-3)/24; + if ((idx >= qstart) && (idx <= qend)) { +# ifdef PVERBOSE4 + FPRINTF(STDOUTFILE "(%2d) %4ld <---> (%d,%d,%d,%d)\n",PP_Myid, idx, a,b,c,i); +# endif + compute_quartlklhds(a,b,c,i,&d1,&d2,&d3,approx); + PP_do_write_quart(a,b,c,i,d1,d2,d3,&nofbq,bqarr); + addtimes(QUARTETS, &tarr); + } /* if idx */ + } /* for for for for */ + PP_SendQuartBlock(qstart, qtodo, quartetinfo, nofbq, bqarr, approx); + + free(bqarr); bqarr=NULL; + + break; + } + + case PP_DOPUZZLEBLOCK: { + if (PP_AllQuartsReceived){ + uli Numtrial, ptodo; + cmatrix *bp; + int n; + + PP_RecvDoPermutBlock(&Numtrial); + ptodo = Numtrial; + + bp = (cmatrix *) malloc(Numtrial * sizeof(void *)); + for(n=0; nS */ +# define PP_SIZES 1 /* Array sizes needed M->S */ +# define PP_DATA 2 /* Data Arrays M->S */ + +# define PP_ALLQUARTS 3 /* All Quartets M->S */ + +# define PP_DOQUART 4 /* do 4Specs M->S */ +# define PP_DOQUARTX2 5 /* do 4Specs + X^2 M->S */ +# define PP_QUART 6 /* quartet back S->M */ +# define PP_QUARTX2 7 /* quartet + X^2 back S->M */ + +# define PP_DOQUARTBLOCKSPECS 8 /* do block Specs M->S */ +# define PP_DOQUARTBLOCK 9 /* do block of Quarts M->S */ +# define PP_QUARTBLOCKSPECS 10 /* block Specs S->M */ +# define PP_QUARTBLOCK 11 /* block of Quarts S->M */ + +# define PP_DOPUZZLE 12 /* do Puzzling step M->S */ +# define PP_PUZZLE 13 /* Puzzling tree back S->M */ +# define PP_DOPUZZLEBLOCK 14 /* do Puzzling block M->S */ +# define PP_DOPUZZLEBLOCKSPECS 15 /* do Puzzling block M->S */ +# define PP_PUZZLEBLOCK 16 /* Puzzling block S->M */ +# define PP_PUZZLEBLOCKSPECS 17 /* Puzzling block S->M */ + +# define PP_STATS 18 /* Slave Statistics S->M */ + +# define PP_WAIT 18 /* waiting for work S->M */ +# define PP_TEST 100 /* testing */ + +# define PERMUTQUEUESIZE 100 +# define QUARTQUEUESIZE 100 + + extern int PP_IamMaster; + extern int PP_IamSlave; + extern int PP_Myid; + extern int PP_MyMaster; + extern int PP_NumProcs; 
+ extern MPI_Comm PP_Comm; +#endif /* PARALLEL */ + +extern int *permutsent, + *permutrecved, + *quartsent, + *quartrecved, + *doquartsent, + *doquartrecved, + *splitsent, + *splitrecved, + *permutsentn, + *permutrecvedn, + *quartsentn, + *quartrecvedn, + *doquartsentn, + *doquartrecvedn, + *splitsentn, + *splitrecvedn; +extern double *walltimes, + *cputimes; +extern double *fullwalltimes, + *fullcputimes; +extern double *altwalltimes, + *altcputimes; + +extern int PP_permutsent, + PP_permutrecved, + PP_quartsent, + PP_quartrecved, + PP_doquartsent, + PP_doquartrecved, + PP_splitsent, + PP_splitrecved, + PP_permutsentn, + PP_permutrecvedn, + PP_quartsentn, + PP_quartrecvedn, + PP_doquartsentn, + PP_doquartrecvedn, + PP_splitsentn, + PP_splitrecvedn; + +extern double PP_starttime, + PP_stoptime, + PP_inittime, + PP_paramcomptime, + PP_paramsendtime, + PP_quartcomptime, + PP_quartsendtime, + PP_puzzletime, + PP_treetime; + +void num2quart(uli qnum, int *a, int *b, int *c, int *d); +uli numquarts(int maxspc); +uli quart2num (int a, int b, int c, int d); + +int slave_main(int argc, char *argv[]); +void PP_Init(int *argc, char **argv[]); +void PP_Finalize(); +void PP_Printerror(FILE *of, int id, int err); +void PP_do_puzzling(ivector trueID); + +void PP_RecvDoQuart(int *a, + int *b, + int *c, + int *d, + int *approx); +void PP_SendDoQuart(int dest, + int a, + int b, + int c, + int d, + int approx); +void PP_RecvQuart(int *a, + int *b, + int *c, + int *d, + double *d1, + double *d2, + double *d3, + int *approx); +void PP_SendQuart(int a, + int b, + int c, + int d, + double d1, + double d2, + double d3, + int approx); +void PP_SendSizes(int mspc, + int msite, + int ncats, + int nptrn, + int rad, + int outgr, + double frconst, + int rseed); +void PP_RecvSizes(int *mspc, + int *msite, + int *ncats, + int *nptrn, + int *rad, + int *outgr, + double *frconst, + int *rseed); +void PP_RecvData( + cmatrix Seqpat, /* cmatrix (Maxspc x Numptrn) */ + ivector Alias, /* ivector 
(Maxsite) */ + ivector Weight, /* ivector (Numptrn) */ + ivector constpat, + dvector Rates, /* dvector (numcats) */ + dvector Eval, /* dvector (tpmradix) */ + dvector Freqtpm, + dmatrix Evec, /* dmatrix (tpmradix x tpmradix) */ + dmatrix Ievc, + dmatrix iexp, + dmatrix Distanmat, /* dmatrix (Maxspc x Maxspc) */ + dcube ltprobr); /* dcube (numcats x tpmradix x tpmradix) */ +void PP_SendData( + cmatrix Seqpat, /* cmatrix (Maxspc x Numptrn) */ + ivector Alias, /* ivector (Maxsite) */ + ivector Weight, /* ivector (Numptrn) */ + ivector constpat, + dvector Rates, /* dvector (numcats) */ + dvector Eval, /* dvector (tpmradix) */ + dvector Freqtpm, + dmatrix Evec, /* dmatrix (tpmradix x tpmradix) */ + dmatrix Ievc, + dmatrix iexp, + dmatrix Distanmat, /* dmatrix (Maxspc x Maxspc) */ + dcube ltprobr); /* dcube (numcats x tpmradix x tpmradix) */ +void PP_SendAllQuarts(unsigned long Numquartets, + unsigned char *quartetinfo); +void PP_RecvAllQuarts(int taxa, + unsigned long *Numquartets, + unsigned char *quartetinfo); + +void PP_SendDoQuartBlock(int dest, uli firstq, uli amount, int approx); +void PP_RecvDoQuartBlock(uli *firstq, uli *amount, uli **bq, int *approx); +void PP_SendQuartBlock(uli startq, + uli numofq, + unsigned char *quartetinfo, + uli numofbq, + uli *bq, + int approx); +void PP_RecvQuartBlock(int slave, + uli *startq, + uli *numofq, + unsigned char *quartetinfo, + int *approx); + +void PP_SendPermut(int dest, + int taxa, + ivector permut); +void PP_RecvPermut(int taxa, + ivector permut); +void PP_SendDoPermutBlock(uli puzzlings); +void PP_RecvDoPermutBlock(uli *taxa); + +void PP_SendSplits(int taxa, + cmatrix biparts); +void PP_RecvSplits(int taxa, + cmatrix biparts); +void PP_SendDone(); +void PP_RecvDone(); + +int PP_emptyslave(); +void PP_putslave(int sl); +int PP_getslave(); + +void PP_cmpd(int rank, double a, double b); +void PP_cmpi(int rank, int a, int b); + +#endif /* _PPUZZLE_ */ diff --git a/forester/archive/RIO/others/puzzle_mod/src/puzzle.h 
b/forester/archive/RIO/others/puzzle_mod/src/puzzle.h new file mode 100644 index 0000000..8165b1a --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/puzzle.h @@ -0,0 +1,493 @@ +/* + * puzzle.h + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +#ifndef _PUZZLE_ +#define _PUZZLE_ + +#ifndef PACKAGE +# define PACKAGE "tree-puzzle" +#endif +#ifndef VERSION +# define VERSION "5.0" +#endif +#define DATE "October 2000" + +/* prototypes */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" +#include "ml.h" +#ifdef PARALLEL +# include "ppuzzle.h" +#endif + +#define STDOUT stdout +#ifndef PARALLEL /* because printf() runs significantly faster */ + /* than fprintf(stdout) on an Apple McIntosh */ + /* (HS) */ +# define FPRINTF printf +# define STDOUTFILE +#else +# define FPRINTF fprintf +# define STDOUTFILE STDOUT, +#endif + +/* filenames */ +# define FILENAMELENTH 2048 + + +# define INFILEDEFAULT "infile" +# define OUTFILEDEFAULT "outfile" +# define TREEFILEDEFAULT "outtree" +# define INTREEDEFAULT "intree" +# define DISTANCESDEFAULT "outdist" +# define TRIANGLEDEFAULT "outlm.eps" +# define UNRESOLVEDDEFAULT "outqlist" +# define ALLQUARTDEFAULT "outallquart" +# define ALLQUARTLHDEFAULT "outallquartlh" +# define OUTPTLISTDEFAULT "outpstep" +# define OUTPTORDERDEFAULT "outptorder" + +# define INFILE infilename +# define OUTFILE outfilename +# define TREEFILE outtreename +# define INTREE intreename +# define DISTANCES outdistname +# define TRIANGLE outlmname +# define UNRESOLVED outqlistname +# define ALLQUART outallquartname +# define ALLQUARTLH outallquartlhname +# define OUTPTLIST outpstepname +# define 
OUTPTORDER outptordername + +EXTERN char infilename [FILENAMELENTH]; +EXTERN char outfilename [FILENAMELENTH]; +EXTERN char outtreename [FILENAMELENTH]; +EXTERN char intreename [FILENAMELENTH]; +EXTERN char outdistname [FILENAMELENTH]; +EXTERN char outlmname [FILENAMELENTH]; +EXTERN char outqlistname [FILENAMELENTH]; +EXTERN char outallquartname [FILENAMELENTH]; +EXTERN char outallquartlhname [FILENAMELENTH]; +EXTERN char outpstepname [FILENAMELENTH]; +EXTERN char outptordername [FILENAMELENTH]; + +#define OUTFILEEXT "puzzle" +#define TREEFILEEXT "tree" +#define DISTANCESEXT "dist" +#define TRIANGLEEXT "eps" +#define UNRESOLVEDEXT "qlist" +#define ALLQUARTEXT "allquart" +#define ALLQUARTLHEXT "allquartlh" +#define OUTPTLISTEXT "pstep" +#define OUTPTORDEREXT "ptorder" + +#ifndef PARALLEL /* because printf() runs significantly faster */ + /* than fprintf(stdout) on an Apple McIntosh */ + /* (HS) */ +# define FPRINTF printf +# define STDOUTFILE +#else +# define FPRINTF fprintf +# define STDOUT stdout +# define STDOUTFILE STDOUT, +#endif + + +/* auto_aamodel/auto_datatype values (xxx) */ +#define AUTO_OFF 0 +#define AUTO_GUESS 1 +#define AUTO_DEFAULT 2 + + +/* qptlist values (xxx) */ +#define PSTOUT_NONE 0 +#define PSTOUT_ORDER 1 +#define PSTOUT_LISTORDER 2 +#define PSTOUT_LIST 3 + +/* dtat_optn values (xxx) */ +#define NUCLEOTIDE 0 +#define AMINOACID 1 +#define BINARY 2 + +/* typ_optn values (xxx) */ +#define LIKMAPING_OPTN 1 +#define TREERECON_OPTN 0 + +/* puzzlemodes (xxx) */ +#define QUARTPUZ 0 +#define USERTREE 1 +#define PAIRDIST 2 + +/* rhetmodes (xxx) Modes of rate heterogeneity */ +#define UNIFORMRATE 0 +#define GAMMARATE 1 +#define TWORATE 2 +#define MIXEDRATE 3 + +/* defines for types of quartet likelihood computation (xxx) */ +#define EXACT 0 +#define APPROX 1 + +/* tree structure */ +typedef struct oneedge { + /* pointer to other three edges */ + struct oneedge *up; + struct oneedge *downleft; + struct oneedge *downright; + int numedge; /* number of edge 
*/ + uli edgeinfo; /* value of this edge */ + int *edgemap; /* pointer to the local edgemap */ +} ONEEDGE; + + +/* variables */ +EXTERN cmatrix biparts; /* bipartitions of tree of current puzzling step */ +EXTERN cmatrix consbiparts; /* bipartitions of majority rule consensus tree */ +EXTERN cmatrix seqchars; /* characters contained in data set */ +EXTERN cmatrix treepict; /* picture of consensus tree */ +EXTERN double minscore; /* value of edgescore on minedge */ +EXTERN double tstvf84; /* F84 transition/transversion ratio */ +EXTERN double tstvratio; /* expected transition/transversion ratio */ +EXTERN double yrtsratio; /* expected pyrimidine/purine transition ratio */ +EXTERN dvector ulkl; /* log L of user trees */ +EXTERN dmatrix allsites; /* log L per sites of user trees */ +EXTERN dvector ulklc; /* log L of user trees (clock) */ +EXTERN dmatrix allsitesc; /* log L per sites of user trees (clock) */ +EXTERN FILE *utfp; /* pointer to user tree file */ +EXTERN FILE *ofp; /* pointer to output file */ +EXTERN FILE *seqfp; /* pointer to sequence input file */ +EXTERN FILE *tfp; /* pointer to tree file */ +EXTERN FILE *dfp; /* pointer to distance file */ +EXTERN FILE *trifp; /* pointer to triangle file */ +EXTERN FILE *unresfp; /* pointer to file with unresolved quartets */ +EXTERN FILE *tmpfp; /* pointer to temporary file */ +EXTERN FILE *qptlist; /* pointer to file with puzzling step trees */ +EXTERN FILE *qptorder; /* pointer to file with unique puzzling step trees */ +EXTERN int SHcodon; /* whether SH should be applied to 1st, 2nd codon positions */ +EXTERN int utree_optn; /* use first user tree for estimation */ +EXTERN int listqptrees; /* list puzzling step trees */ +EXTERN int approxqp; /* approximate QP quartets */ +EXTERN int *edgeofleaf; /* vector with edge number of all leaves */ +EXTERN int codon_optn; /* declares what positions in a codon should be used */ +EXTERN int compclock; /* computation of clocklike branch lengths */ +EXTERN int chooseA; /* leaf 
variable */ +EXTERN int chooseB; /* leaf variable */ +EXTERN int clustA, clustB, clustC, clustD; /* number of members of LM clusters */ +EXTERN int column; /* used for breaking lines (writing tree to treefile) */ +EXTERN int Frequ_optn; /* use empirical base frequencies */ +EXTERN int Maxbrnch; /* 2*Maxspc - 3 */ +EXTERN int Maxseqc; /* number of sequence characters per taxum */ +EXTERN int mflag; /* flag used for correct printing of runtime messages */ +EXTERN int minedge; /* edge with minimum edgeinfo */ +EXTERN int nextedge; /* number of edges in the current tree */ +EXTERN int nextleaf; /* next leaf to add to tree */ +EXTERN int numclust; /* number of clusters in LM analysis */ +EXTERN int outgroup; /* outgroup */ +EXTERN int puzzlemode; /* computation of QP tree and/or ML distances */ +EXTERN int rootsearch; /* how location of root is found */ +EXTERN int rhetmode; /* model of rate heterogeneity */ +EXTERN int splitlength; /* length of one entry in splitpatterns */ +EXTERN int *splitsizes; /* size of all different splits of all trees */ +EXTERN int usebestq_optn; /* use only best quartet topology, no bayesian weights */ +EXTERN int show_optn; /* show unresolved quartets */ +EXTERN int savequart_optn; /* save memory block which quartets to file */ +EXTERN int savequartlh_optn; /* save quartet likelihoods to file */ +EXTERN int saveqlhbin_optn; /* save quartet likelihoods binary */ +EXTERN int readquart_optn; /* read memory block which quartets from file */ +EXTERN int sym_optn; /* symmetrize doublet frequencies */ +EXTERN int xsize; /* depth of consensus tree picture */ +EXTERN int ytaxcounter; /* counter for establishing y-coordinates of all taxa */ +EXTERN int numutrees; /* number of users trees in input tree file */ +EXTERN ivector clusterA, clusterB, clusterC, clusterD; /* clusters for LM analysis */ +EXTERN ivector consconfid; /* confidence values of majority rule consensus tree */ +EXTERN ivector conssizes; /* partition sizes of majority rule consensus 
tree */ +EXTERN ivector trueID; /* leaf -> taxon on this leaf */ +EXTERN ivector xcor; /* x-coordinates of consensus tree nodes */ +EXTERN ivector ycor; /* y-coordinates of consensus tree nodes */ +EXTERN ivector ycormax; /* maximal y-coordinates of consensus tree nodes */ +EXTERN ivector ycormin; /* minimal y-coordinates of consensus tree nodes */ +EXTERN ivector ycortax; /* y-coordinates of all taxa */ +EXTERN ONEEDGE *edge; /* vector with all the edges of the tree */ +EXTERN uli *splitcomp; /* bipartition storage */ +EXTERN uli *splitfreqs; /* frequencies of all different splits of all trees */ +EXTERN uli *splitpatterns; /* all different splits of all trees */ +EXTERN uli badqs; /* number of bad quartets */ +EXTERN uli consincluded; /* number of included biparts in the consensus tree */ +EXTERN uli Currtrial; /* counter for puzzling steps */ +EXTERN uli maxbiparts; /* space is reserved for that many bipartitions */ +EXTERN uli mininfo; /* value of edgeinfo on minedge */ +EXTERN uli numbiparts; /* number of different bipartitions */ +EXTERN uli Numquartets; /* number of quartets */ +EXTERN uli Numtrial; /* number of puzzling steps */ +EXTERN uli lmqts; /* quartets investigated in LM analysis (0 = ALL) */ + +EXTERN int auto_datatype; /* guess datatype ? */ +EXTERN int guessdata_optn; /* guessed datatype */ + +EXTERN int auto_aamodel; /* guess amino acid modell ? */ +EXTERN int guessauto_aamodel; /* guessed amino acid modell ? 
*/ +EXTERN int guessDayhf_optn; /* guessed Dayhoff model option */ +EXTERN int guessJtt_optn; /* guessed JTT model option */ +EXTERN int guessblosum62_optn; /* guessed BLOSUM 62 model option */ +EXTERN int guessmtrev_optn; /* guessed mtREV model option */ +EXTERN int guesscprev_optn; /* guessed cpREV model option */ +EXTERN int guessvtmv_optn; /* guessed VT model option */ +EXTERN int guesswag_optn; /* guessed WAG model option */ + +/* counter variables needed in likelihood mapping analysis */ +EXTERN uli ar1, ar2, ar3; +EXTERN uli reg1, reg2, reg3, reg4, reg5, reg6, reg7; +EXTERN uli reg1l, reg1r, reg2u, reg2d, reg3u, reg3d, + reg4u, reg4d, reg5l, reg5r, reg6u, reg6d; +EXTERN unsigned char *quartetinfo; /* place where quartets are stored */ +EXTERN dvector qweight; /* for use in QP and LM analysis */ +EXTERN dvector sqdiff; +EXTERN ivector qworder; +EXTERN ivector sqorder; + +EXTERN int randseed; +EXTERN int psteptreestrlen; + +typedef struct treelistitemtypedummy { + struct treelistitemtypedummy *pred; + struct treelistitemtypedummy *succ; + struct treelistitemtypedummy *sortnext; + struct treelistitemtypedummy *sortlast; + char *tree; + int count; + int id; + int idx; +} treelistitemtype; + +EXTERN treelistitemtype *psteptreelist; +EXTERN treelistitemtype *psteptreesortlist; +EXTERN int psteptreenum; +EXTERN int psteptreesum; + + +/* prototypes */ +void makeF84model(void); +void compnumqts(void); +void setoptions(void); +void openfiletoread(FILE **, char[], char[]); +void openfiletowrite(FILE **, char[], char[]); +void openfiletoappend(FILE **, char[], char[]); +void closefile(FILE *); +void symdoublets(void); +void computeexpectations(void); +void putdistance(FILE *); +void findidenticals(FILE *); +double averagedist(void); +void initps(FILE *); +void plotlmpoint(FILE *, double, double); +void finishps(FILE *); +void makelmpoint(FILE *, double, double, double); +void printtreestats(FILE *); +void timestamp(FILE *); +void writeoutputfile(FILE *, int); + +/* 
definitions for writing output */ +#define WRITEALL 0 +#define WRITEPARAMS 1 +#define WRITEREST 2 + +void writetimesstat(FILE *ofp); +void writecutree(FILE *, int); +void starttimer(void); +void checktimer(uli); +void estimateparametersnotree(void); +void estimateparameterstree(void); +int main(int, char *[]); +int ulicmp(const void *, const void *); +int intcmp(const void *, const void *); + +void readid(FILE *, int); +char readnextcharacter(FILE *, int, int); +void skiprestofline(FILE *, int, int); +void skipcntrl(FILE *, int, int); +void getseqs(FILE *); +void initid(int); +void fputid10(FILE *, int); +int fputid(FILE *, int); +void getsizesites(FILE *); +void getdataset(FILE *); +int guessdatatype(void); +void translatedataset(void); +void estimatebasefreqs(void); +void guessmodel(void); +void inittree(void); +void addnextleaf(int); +void freetree(void); +void writeOTU(FILE *, int); +void writetree(FILE *); +int *initctree(); +void copytree(int *ctree); +void freectree(int **snodes); +void printctree(int *ctree); +char *sprintfctree(int *ctree, int strlen); +void fprintffullpstree(FILE *outf, char *treestr); +int printfsortctree(int *ctree); +int sortctree(int *ctree); +int ct_1stedge(int node); +int ct_2ndedge(int node); +int ct_3rdedge(int node); + +void printfpstrees(treelistitemtype *list); +void printfsortedpstrees(treelistitemtype *list); +void fprintfsortedpstrees(FILE *output, treelistitemtype *list, int itemnum, int itemsum, int comment, float cutoff); + +void sortbynum(treelistitemtype *list, treelistitemtype **sortlist); +treelistitemtype *addtree2list(char **tree, + int numtrees, + treelistitemtype **list, + int *numitems, + int *numsum); +void freetreelist(treelistitemtype **list, + int *numitems, + int *numsum); +void resetedgeinfo(void); +void incrementedgeinfo(int, int); +void minimumedgeinfo(void); +void initconsensus(void); +void makepart(int, int); +void computebiparts(void); +void printsplit(FILE *, uli); +void makenewsplitentries(void); 
+void copysplit(uli, int); +void makeconsensus(void); +void writenode(FILE *, int); +void writeconsensustree(FILE *); +void nodecoordinates(int); +void drawnode(int, int); +void plotconsensustree(FILE *); +unsigned char *mallocquartets(int); +void freequartets(void); +unsigned char readquartet(int, int, int, int); +void writequartet(int, int, int, int, unsigned char); +void sort3doubles(dvector, ivector); +void computeallquartets(void); +void checkquartet(int, int, int, int); +void num2quart(uli qnum, int *a, int *b, int *c, int *d); +uli numquarts(int maxspc); +uli quart2num (int a, int b, int c, int d); + +void writetpqfheader(int nspec, FILE *ofp, int flag); + + +/* extracted from main (xxx) */ +void compute_quartlklhds(int a, int b, int c, int d, double *d1, double *d2, double *d3, int approx); + + +/* definitions for timing */ + +#define OVERALL 0 +#define GENERAL 1 +#define OPTIONS 2 +#define PARAMEST 3 +#define QUARTETS 4 +#define PUZZLING 5 +#define TREEEVAL 6 + +typedef struct { + int currentjob; + clock_t tempcpu; + clock_t tempfullcpu; + clock_t tempcpustart; + time_t temptime; + time_t tempfulltime; + time_t temptimestart; + + clock_t maxcpu; + clock_t mincpu; + time_t maxtime; + time_t mintime; + + double maxcpublock; + double mincpublock; + double mincputick; + double mincputicktime; + double maxtimeblock; + double mintimeblock; + + double generalcpu; + double optionscpu; + double paramestcpu; + double quartcpu; + double quartblockcpu; + double quartmaxcpu; + double quartmincpu; + double puzzcpu; + double puzzblockcpu; + double puzzmaxcpu; + double puzzmincpu; + double treecpu; + double treeblockcpu; + double treemaxcpu; + double treemincpu; + double cpu; + double fullcpu; + + double generaltime; + double optionstime; + double paramesttime; + double quarttime; + double quartblocktime; + double quartmaxtime; + double quartmintime; + double puzztime; + double puzzblocktime; + double puzzmaxtime; + double puzzmintime; + double treetime; + double 
treeblocktime; + double treemaxtime; + double treemintime; + double time; + double fulltime; +} timearray_t; + +EXTERN double cputime, walltime; +EXTERN double fullcpu, fulltime; +EXTERN double fullcputime, fullwalltime; +EXTERN double altcputime, altwalltime; +EXTERN clock_t cputimestart, cputimestop, cputimedummy; +EXTERN time_t walltimestart, walltimestop, walltimedummy; +EXTERN clock_t Startcpu; /* start cpu time */ +EXTERN clock_t Stopcpu; /* stop cpu time */ +EXTERN time_t Starttime; /* start time */ +EXTERN time_t Stoptime; /* stop time */ +EXTERN time_t time0; /* timer variable */ +EXTERN time_t time1; /* yet another timer */ +EXTERN time_t time2; /* yet another timer */ +EXTERN timearray_t tarr; + +void resetqblocktime(timearray_t *ta); +void resetpblocktime(timearray_t *ta); +void inittimearr(timearray_t *ta); +void addtimes(int jobtype, timearray_t *ta); +#ifdef TIMEDEBUG + void printtimearr(timearray_t *ta); +#endif /* TIMEDEBUG */ + +#endif /* _PUZZLE_ */ + diff --git a/forester/archive/RIO/others/puzzle_mod/src/puzzle1.c b/forester/archive/RIO/others/puzzle_mod/src/puzzle1.c new file mode 100644 index 0000000..9a4d790 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/puzzle1.c @@ -0,0 +1,4527 @@ +/* + * puzzle1.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +/* Modified by Christian Zmasek to: + - name and pairwise dist. output as one line per seq. + - removed some unnecessary -- for my puposes -- output. + + + !WARNING: Use ONLY together with FORESTER/RIO! + !For all other puposes download the excellent original! 
+ + last modification: 05/19/01 + + + + void putdistance(FILE *fp): + + removed: "if ((j + 1) % 7 == 0 && j+1 != Maxspc) + fprintf(fp, "\n ");" + + + + + int main(int argc, char *argv[]): + + removed: + "FPRINTF(STDOUTFILE "Writing parameters to file %s\n", OUTFILE); + openfiletowrite(&ofp, OUTFILE, "general output"); + writeoutputfile(ofp,WRITEPARAMS); + fclose(ofp);" + + "openfiletoappend(&ofp, OUTFILE, "general output"); + writeoutputfile(ofp,WRITEREST);" + + "openfiletoappend(&ofp, OUTFILE, "general output"); + writeoutputfile(ofp,WRITEREST);" + + "openfiletoappend(&ofp, OUTFILE, "general output"); + writeoutputfile(ofp,WRITEREST);" + + "timestamp(ofp); + closefile(ofp);" + + +*/ + + + +#define EXTERN + +#include "puzzle.h" +#include "gamma.h" + +void num2quart(uli qnum, int *a, int *b, int *c, int *d) +{ + double temp; + uli aa, bb, cc, dd; + uli lowval=0, highval=0; + + aa=0; bb=1; cc=2; dd=3; + + temp = (double)(24 * qnum); + temp = sqrt(temp); + temp = sqrt(temp); + /* temp = pow(temp, (double)(1/4)); */ + dd = (uli) floor(temp) + 1; + if (dd < 3) dd = 3; + lowval = (uli) dd*(dd-1)*(dd-2)*(dd-3)/24; + highval = (uli) (dd+1)*dd*(dd-1)*(dd-2)/24; + if (lowval >= qnum) + while ((lowval > qnum)) { + dd -= 1; lowval = (uli) dd*(dd-1)*(dd-2)*(dd-3)/24; + } + else { + while (highval <= qnum) { + dd += 1; highval = (uli) (dd+1)*dd*(dd-1)*(dd-2)/24; + } + lowval = (uli) dd*(dd-1)*(dd-2)*(dd-3)/24; + } + qnum -= lowval; + if (qnum > 0) { + temp = (double)(6 * qnum); + temp = pow(temp, (double)(1/3)); + cc = (uli) floor(temp); + if (cc < 2) cc= 2; + lowval = (uli) cc*(cc-1)*(cc-2)/6; + highval = (uli) (cc+1)*cc*(cc-1)/6; + if (lowval >= qnum) + while ((lowval > qnum)) { + cc -= 1; lowval = (uli) cc*(cc-1)*(cc-2)/6; + } + else { + while (highval <= qnum) { + cc += 1; highval = (uli) (cc+1)*cc*(cc-1)/6; + } + lowval = (uli) cc*(cc-1)*(cc-2)/6; + } + qnum -= lowval; + if (qnum > 0) { + temp = (double)(2 * qnum); + temp = sqrt(temp); + bb = (uli) floor(temp); + if (bb < 
1) bb= 1; + lowval = (uli) bb*(bb-1)/2; + highval = (uli) (bb+1)*bb/2; + if (lowval >= qnum) + while ((lowval > qnum)) { + bb -= 1; lowval = (uli) bb*(bb-1)/2; + } + else { + while (highval <= qnum) { + bb += 1; highval = (uli) (bb+1)*bb/2; + } + lowval = (uli) bb*(bb-1)/2; + } + qnum -= lowval; + if (qnum > 0) { + aa = (uli) qnum; + if (aa < 0) aa= 0; + } + } + } + *d = (int)dd; + *c = (int)cc; + *b = (int)bb; + *a = (int)aa; +} /* num2quart */ + +/******************/ + +uli numquarts(int maxspc) +{ + uli tmp; + int a, b, c, d; + + if (maxspc < 4) + return (uli)0; + else { + maxspc--; + a = maxspc-3; + b = maxspc-2; + c = maxspc-1; + d = maxspc; + + tmp = (uli) 1 + a + + (uli) b * (b-1) / 2 + + (uli) c * (c-1) * (c-2) / 6 + + (uli) d * (d-1) * (d-2) * (d-3) / 24; + return (tmp); + } +} /* numquarts */ + +/******************/ + +uli quart2num (int a, int b, int c, int d) +{ + uli tmp; + if ((a>b) || (b>c) || (c>d)) { + fprintf(stderr, "Error PP5 not (%d <= %d <= %d <= %d) !!!\n", a, b, c, +d); + exit (1); + } + tmp = (uli) a + + (uli) b * (b-1) / 2 + + (uli) c * (c-1) * (c-2) / 6 + + (uli) d * (d-1) * (d-2) * (d-3) / 24; + return (tmp); +} /* quart2num */ + +/******************/ + + + +/* flag=0 old allquart binary */ +/* flag=1 allquart binary */ +/* flag=2 allquart ACSII */ +/* flag=3 quartlh binary */ +/* flag=4 quartlh ASCII */ + +void writetpqfheader(int nspec, + FILE *ofp, + int flag) +{ int currspec; + + if (flag == 0) { + unsigned long nquart; + unsigned long blocklen; + + nquart = numquarts(nspec); + /* compute number of bytes */ + if (nquart % 2 == 0) { /* even number */ + blocklen = (nquart)/2; + } else { /* odd number */ + blocklen = (nquart + 1)/2; + } + /* FPRINTF(STDOUTFILE "Writing quartet file: %s\n", filename); */ + fprintf(ofp, "TREE-PUZZLE\n%s\n\n", VERSION); + fprintf(ofp, "species: %d\n", nspec); + fprintf(ofp, "quartets: %lu\n", nquart); + fprintf(ofp, "bytes: %lu\n\n", blocklen); + + + /* fwrite(&(quartetinfo[0]), sizeof(char), blocklen, 
ofp); */ + } + + if (flag == 1) fprintf(ofp, "##TPQF-BB (TREE-PUZZLE %s)\n%d\n", VERSION, nspec); + if (flag == 2) fprintf(ofp, "##TPQF-BA (TREE-PUZZLE %s)\n%d\n", VERSION, nspec); + if (flag == 3) fprintf(ofp, "##TPQF-LB (TREE-PUZZLE %s)\n%d\n", VERSION, nspec); + if (flag == 4) fprintf(ofp, "##TPQF-LA (TREE-PUZZLE %s)\n%d\n", VERSION, nspec); + + for (currspec=0; currspec MAXTS) { + FPRINTF(STDOUTFILE "\n\n\nF84 model not possible "); + FPRINTF(STDOUTFILE "(bad Ts/Tv parameter)\n"); + tstvf84 = 0.0; + return; + } + if (yr < MINYR || yr > MAXYR) { + FPRINTF(STDOUTFILE "\n\n\nF84 model not possible "); + FPRINTF(STDOUTFILE "(bad Y/R transition parameter)\n"); + tstvf84 = 0.0; + return; + } + TSparam = ts; + YRparam = yr; + optim_optn = FALSE; +} + +/* compute number of quartets used in LM analysis */ +void compnumqts() +{ + if (lmqts == 0) { + if (numclust == 4) + Numquartets = (uli) clustA*clustB*clustC*clustD; + if (numclust == 3) + Numquartets = (uli) clustA*clustB*clustC*(clustC-1)/2; + if (numclust == 2) + Numquartets = (uli) clustA*(clustA-1)/2 * clustB*(clustB-1)/2; + if (numclust == 1) + Numquartets = (uli) Maxspc*(Maxspc-1)*(Maxspc-2)*(Maxspc-3)/24; + } else { + Numquartets = lmqts; + } +} + +/* set options interactively */ +void setoptions() +{ + int i, valid; + double sumfreq; + char ch; + + /* defaults */ + rhetmode = UNIFORMRATE; /* assume rate homogeneity */ + numcats = 1; + Geta = 0.05; + grate_optim = FALSE; + fracinv = 0.0; + fracinv_optim = FALSE; + + compclock = FALSE; /* compute clocklike branch lengths */ + locroot = -1; /* search for optimal place of root */ + qcalg_optn = FALSE; /* don't use sampling of quartets */ + approxp_optn = TRUE; /* approximate parameter estimates */ + listqptrees = PSTOUT_NONE; /* list puzzling step trees */ + + /* approximate QP quartets? 
*/ + if (Maxspc <= 6) approxqp = FALSE; + else approxqp = TRUE; + + codon_optn = 0; /* use all positions in a codon */ + + /* number of puzzling steps */ + if (Maxspc <= 25) Numtrial = 1000; + else if (Maxspc <= 50) Numtrial = 10000; + else if (Maxspc <= 75) Numtrial = 25000; + else Numtrial = 50000; + + utree_optn = TRUE; /* use first user tree for estimation */ + outgroup = 0; /* use first taxon as outgroup */ + sym_optn = FALSE; /* symmetrize doublet frequencies */ + tstvf84 = 0.0; /* disable F84 model */ + show_optn = FALSE; /* show unresolved quartets */ + typ_optn = TREERECON_OPTN; /* tree reconstruction */ + numclust = 1; /* one clusters in LM analysis */ + lmqts = 0; /* all quartets in LM analysis */ + compnumqts(); + if (Numquartets > 10000) { + lmqts = 10000; /* 10000 quartets in LM analysis */ + compnumqts(); + } + + do { + FPRINTF(STDOUTFILE "\n\n\nGENERAL OPTIONS\n"); + FPRINTF(STDOUTFILE " b Type of analysis? "); + if (typ_optn == TREERECON_OPTN) FPRINTF(STDOUTFILE "Tree reconstruction\n"); + if (typ_optn == LIKMAPING_OPTN) FPRINTF(STDOUTFILE "Likelihood mapping\n"); + if (typ_optn == TREERECON_OPTN) { + FPRINTF(STDOUTFILE " k Tree search procedure? "); + if (puzzlemode == QUARTPUZ) FPRINTF(STDOUTFILE "Quartet puzzling\n"); + if (puzzlemode == USERTREE) FPRINTF(STDOUTFILE "User defined trees\n"); + if (puzzlemode == PAIRDIST) FPRINTF(STDOUTFILE "Pairwise distances only (no tree)\n"); + if (puzzlemode == QUARTPUZ) { + FPRINTF(STDOUTFILE " v Approximate quartet likelihood? %s\n", + (approxqp ? "Yes" : "No")); + FPRINTF(STDOUTFILE " u List unresolved quartets? %s\n", + (show_optn ? "Yes" : "No")); + FPRINTF(STDOUTFILE " n Number of puzzling steps? %lu\n", + Numtrial); + FPRINTF(STDOUTFILE " j List puzzling step trees? 
"); + switch (listqptrees) { + case PSTOUT_NONE: FPRINTF(STDOUTFILE "No\n"); break; + case PSTOUT_ORDER: FPRINTF(STDOUTFILE "Unique topologies\n"); break; + case PSTOUT_LISTORDER: FPRINTF(STDOUTFILE "Unique topologies & Chronological list\n"); break; + case PSTOUT_LIST: FPRINTF(STDOUTFILE "Chronological list only\n"); break; + } + + FPRINTF(STDOUTFILE " o Display as outgroup? "); + fputid(STDOUT, outgroup); + FPRINTF(STDOUTFILE "\n"); + } + if (puzzlemode == QUARTPUZ || puzzlemode == USERTREE) { + FPRINTF(STDOUTFILE " z Compute clocklike branch lengths? "); + if (compclock) FPRINTF(STDOUTFILE "Yes\n"); + else FPRINTF(STDOUTFILE "No\n"); + } + if (compclock) + if (puzzlemode == QUARTPUZ || puzzlemode == USERTREE) { + FPRINTF(STDOUTFILE " l Location of root? "); + if (locroot < 0) FPRINTF(STDOUTFILE "Best place (automatic search)\n"); + else if (locroot < Maxspc) { + FPRINTF(STDOUTFILE "Branch %d (", locroot + 1); + fputid(STDOUT, locroot); + FPRINTF(STDOUTFILE ")\n"); + } else FPRINTF(STDOUTFILE "Branch %d (internal branch)\n", locroot + 1); + } + } + if (typ_optn == LIKMAPING_OPTN) { + FPRINTF(STDOUTFILE " g Group sequences in clusters? "); + if (numclust == 1) FPRINTF(STDOUTFILE "No\n"); + else FPRINTF(STDOUTFILE "Yes (%d clusters as specified)\n", numclust); + FPRINTF(STDOUTFILE " n Number of quartets? "); + if (lmqts == 0) FPRINTF(STDOUTFILE "%lu (all possible)\n", Numquartets); + else FPRINTF(STDOUTFILE "%lu (random choice)\n", lmqts); + } + FPRINTF(STDOUTFILE " e Parameter estimates? "); + if (approxp_optn) FPRINTF(STDOUTFILE "Approximate (faster)\n"); + else FPRINTF(STDOUTFILE "Exact (slow)\n"); + if (!(puzzlemode == USERTREE && typ_optn == TREERECON_OPTN)) { + FPRINTF(STDOUTFILE " x Parameter estimation uses? "); + if (qcalg_optn) FPRINTF(STDOUTFILE "Quartet sampling + NJ tree\n"); + else FPRINTF(STDOUTFILE "Neighbor-joining tree\n"); + + } else { + FPRINTF(STDOUTFILE " x Parameter estimation uses? 
"); + if (utree_optn) + FPRINTF(STDOUTFILE "1st input tree\n"); + else if (qcalg_optn) FPRINTF(STDOUTFILE "Quartet sampling + NJ tree\n"); + else FPRINTF(STDOUTFILE "Neighbor-joining tree\n"); + } + FPRINTF(STDOUTFILE "SUBSTITUTION PROCESS\n"); + FPRINTF(STDOUTFILE " d Type of sequence input data? "); + if (auto_datatype == AUTO_GUESS) FPRINTF(STDOUTFILE "Auto: "); + if (data_optn == NUCLEOTIDE) FPRINTF(STDOUTFILE "Nucleotides\n"); + if (data_optn == AMINOACID) FPRINTF(STDOUTFILE "Amino acids\n"); + if (data_optn == BINARY) FPRINTF(STDOUTFILE "Binary states\n"); + if (data_optn == NUCLEOTIDE && (Maxseqc % 3) == 0 && !SH_optn) { + FPRINTF(STDOUTFILE " h Codon positions selected? "); + if (codon_optn == 0) FPRINTF(STDOUTFILE "Use all positions\n"); + if (codon_optn == 1) FPRINTF(STDOUTFILE "Use only 1st positions\n"); + if (codon_optn == 2) FPRINTF(STDOUTFILE "Use only 2nd positions\n"); + if (codon_optn == 3) FPRINTF(STDOUTFILE "Use only 3rd positions\n"); + if (codon_optn == 4) FPRINTF(STDOUTFILE "Use 1st and 2nd positions\n"); + } + FPRINTF(STDOUTFILE " m Model of substitution? "); + if (data_optn == NUCLEOTIDE) { /* nucleotides */ + if (nuc_optn) { + if(HKY_optn) + FPRINTF(STDOUTFILE "HKY (Hasegawa et al. 1985)\n"); + else { + FPRINTF(STDOUTFILE "TN (Tamura-Nei 1993)\n"); + FPRINTF(STDOUTFILE " p Constrain TN model to F84 model? "); + if (tstvf84 == 0.0) + FPRINTF(STDOUTFILE "No\n"); + else FPRINTF(STDOUTFILE "Yes (Ts/Tv ratio = %.2f)\n", tstvf84); + } + FPRINTF(STDOUTFILE " t Transition/transversion parameter? "); + if (optim_optn) + FPRINTF(STDOUTFILE "Estimate from data set\n"); + else + FPRINTF(STDOUTFILE "%.2f\n", TSparam); + if (TN_optn) { + FPRINTF(STDOUTFILE " r Y/R transition parameter? "); + if (optim_optn) + FPRINTF(STDOUTFILE "Estimate from data set\n"); + else + FPRINTF(STDOUTFILE "%.2f\n", YRparam); + } + } + if (SH_optn) { + FPRINTF(STDOUTFILE "SH (Schoeniger-von Haeseler 1994)\n"); + FPRINTF(STDOUTFILE " t Transition/transversion parameter? 
"); + if (optim_optn) + FPRINTF(STDOUTFILE "Estimate from data set\n"); + else + FPRINTF(STDOUTFILE "%.2f\n", TSparam); + } + } + if (data_optn == NUCLEOTIDE && SH_optn) { + FPRINTF(STDOUTFILE " h Doublets defined by? "); + if (SHcodon) + FPRINTF(STDOUTFILE "1st and 2nd codon positions\n"); + else + FPRINTF(STDOUTFILE "1st+2nd, 3rd+4th, etc. site\n"); + } + if (data_optn == AMINOACID) { /* amino acids */ + switch (auto_aamodel) { + case AUTO_GUESS: + FPRINTF(STDOUTFILE "Auto: "); + break; + case AUTO_DEFAULT: + FPRINTF(STDOUTFILE "Def.: "); + break; + } + if (Dayhf_optn) FPRINTF(STDOUTFILE "Dayhoff (Dayhoff et al. 1978)\n"); + if (Jtt_optn) FPRINTF(STDOUTFILE "JTT (Jones et al. 1992)\n"); + if (mtrev_optn) FPRINTF(STDOUTFILE "mtREV24 (Adachi-Hasegawa 1996)\n"); + if (cprev_optn) FPRINTF(STDOUTFILE "cpREV45 (Adachi et al. 2000)\n"); + if (blosum62_optn) FPRINTF(STDOUTFILE "BLOSUM62 (Henikoff-Henikoff 92)\n"); + if (vtmv_optn) FPRINTF(STDOUTFILE "VT (Mueller-Vingron 2000)\n"); + if (wag_optn) FPRINTF(STDOUTFILE "WAG (Whelan-Goldman 2000)\n"); + } + if (data_optn == BINARY) { /* binary states */ + FPRINTF(STDOUTFILE "Two-state model (Felsenstein 1981)\n"); + } + if (data_optn == AMINOACID) + FPRINTF(STDOUTFILE " f Amino acid frequencies? "); + else if (data_optn == NUCLEOTIDE && SH_optn) + FPRINTF(STDOUTFILE " f Doublet frequencies? "); + else if (data_optn == NUCLEOTIDE && nuc_optn) + FPRINTF(STDOUTFILE " f Nucleotide frequencies? "); + else if (data_optn == BINARY) + FPRINTF(STDOUTFILE " f Binary state frequencies? "); + FPRINTF(STDOUTFILE "%s\n", (Frequ_optn ? "Estimate from data set" : + "Use specified values")); + if (data_optn == NUCLEOTIDE && SH_optn) + FPRINTF(STDOUTFILE " s Symmetrize doublet frequencies? %s\n", + (sym_optn ? "Yes" : "No")); + + FPRINTF(STDOUTFILE "RATE HETEROGENEITY\n"); + FPRINTF(STDOUTFILE " w Model of rate heterogeneity? 
"); + if (rhetmode == UNIFORMRATE) FPRINTF(STDOUTFILE "Uniform rate\n"); + if (rhetmode == GAMMARATE ) FPRINTF(STDOUTFILE "Gamma distributed rates\n"); + if (rhetmode == TWORATE ) FPRINTF(STDOUTFILE "Two rates (1 invariable + 1 variable)\n"); + if (rhetmode == MIXEDRATE ) FPRINTF(STDOUTFILE "Mixed (1 invariable + %d Gamma rates)\n", numcats); + + if (rhetmode == TWORATE || rhetmode == MIXEDRATE) { + FPRINTF(STDOUTFILE " i Fraction of invariable sites? "); + if (fracinv_optim) FPRINTF(STDOUTFILE "Estimate from data set"); + else FPRINTF(STDOUTFILE "%.2f", fracinv); + if (fracinv == 0.0 && !fracinv_optim) FPRINTF(STDOUTFILE " (all sites variable)"); + FPRINTF(STDOUTFILE "\n"); + } + if (rhetmode == GAMMARATE || rhetmode == MIXEDRATE) { + FPRINTF(STDOUTFILE " a Gamma distribution parameter alpha? "); + if (grate_optim) + FPRINTF(STDOUTFILE "Estimate from data set\n"); + else if (Geta > 0.5) + FPRINTF(STDOUTFILE "%.2f (strong rate heterogeneity)\n", (1.0-Geta)/Geta); + else FPRINTF(STDOUTFILE "%.2f (weak rate heterogeneity)\n", (1.0-Geta)/Geta); + FPRINTF(STDOUTFILE " c Number of Gamma rate categories? 
%d\n", numcats); + } + + FPRINTF(STDOUTFILE "\nQuit [q], confirm [y], or change [menu] settings: "); + + /* read one char */ + ch = getchar(); + if (ch != '\n') { + do ; + while (getchar() != '\n'); + } + ch = (char) tolower((int) ch); + + /* letters in use: a b c d e f g h i j k l m n o p q r s t u v w y x z */ + /* letters not in use: */ + + switch (ch) { + + case '\n': break; + + case 'z': if (typ_optn == TREERECON_OPTN && (puzzlemode == QUARTPUZ || puzzlemode == USERTREE)) { + compclock = compclock + 1; + if (compclock == 2) compclock = 0; + } else { + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + } + break; + + case 'l': if (compclock && typ_optn == TREERECON_OPTN && (puzzlemode == QUARTPUZ || puzzlemode == USERTREE)) { + FPRINTF(STDOUTFILE "\n\n\nEnter an invalid branch number to search "); + FPRINTF(STDOUTFILE "for the best location!\n"); + FPRINTF(STDOUTFILE "\nPlace root at branch (1-%d): ", + 2*Maxspc-3); + scanf("%d", &locroot); + do ; + while (getchar() != '\n'); + if (locroot < 1 || locroot > 2*Maxspc-3) locroot = 0; + locroot = locroot - 1; + } else { + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + } + break; + + case 'e': if ((rhetmode == TWORATE || rhetmode == MIXEDRATE) && fracinv_optim) { + FPRINTF(STDOUTFILE "\n\n\nInvariable sites estimation needs to be exact!\n"); + } else { + approxp_optn = approxp_optn + 1; + if (approxp_optn == 2) approxp_optn = 0; + } + break; + + case 'w': rhetmode = rhetmode + 1; + if (rhetmode == 4) rhetmode = UNIFORMRATE; + if (rhetmode == UNIFORMRATE) { /* uniform rate */ + numcats = 1; + Geta = 0.05; + grate_optim = FALSE; + fracinv = 0.0; + fracinv_optim = FALSE; + } + if (rhetmode == GAMMARATE ) { /* Gamma distributed rates */ + numcats = 8; + Geta = 0.05; + grate_optim = TRUE; + fracinv = 0.0; + fracinv_optim = FALSE; + } + if (rhetmode == TWORATE ) { /* two rates (1 invariable + 1 variable) */ + approxp_optn = FALSE; + numcats = 1; + Geta = 0.05; + grate_optim = FALSE; + fracinv 
= 0.0; + fracinv_optim = TRUE; + } + if (rhetmode == MIXEDRATE ) { /* mixed (1 invariable + Gamma rates) */ + approxp_optn = FALSE; + numcats = 8; + Geta = 0.05; + grate_optim = TRUE; + fracinv = 0.0; + fracinv_optim = TRUE; + } + break; + + case 'i': if (rhetmode == TWORATE || rhetmode == MIXEDRATE) { + FPRINTF(STDOUTFILE "\n\n\nEnter an invalid value for "); + FPRINTF(STDOUTFILE "estimation from data set!\n"); + FPRINTF(STDOUTFILE "\nFraction of invariable sites among all sites (%.2f-%.2f): ", + MINFI, MAXFI); + scanf("%lf", &fracinv); + do ; + while (getchar() != '\n'); + if (fracinv < MINFI || fracinv > MAXFI) { + fracinv_optim = TRUE; + fracinv = 0.0; + } else { + fracinv_optim = FALSE; + } + } else { + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + } + break; + + case 'a': if (rhetmode == GAMMARATE || rhetmode == MIXEDRATE) { + FPRINTF(STDOUTFILE "\n\n\nEnter an invalid value for estimation from data set!\n"); + FPRINTF(STDOUTFILE "\nGamma distribution parameter alpha (%.2f-%.2f): ", + (1.0-MAXGE)/MAXGE, (1.0-MINGE)/MINGE); + scanf("%lf", &Geta); + do ; + while (getchar() != '\n'); + if (Geta < (1.0-MAXGE)/MAXGE || Geta > (1.0-MINGE)/MINGE) { + grate_optim = TRUE; + Geta = 0.05; + } else { + grate_optim = FALSE; + Geta = 1.0/(1.0 + Geta); + } + } else + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + break; + + case 'c': if (rhetmode == GAMMARATE || rhetmode == MIXEDRATE) { + FPRINTF(STDOUTFILE "\n\n\nNumber of Gamma rate categories (%d-%d): ", + MINCAT, MAXCAT); + scanf("%d", &numcats); + do ; + while (getchar() != '\n'); + if (numcats < MINCAT || numcats > MAXCAT) { + FPRINTF(STDOUTFILE "\n\n\nThis number of categories is not available!\n"); + numcats = 4; + } + } else { + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + } + break; + + case 'h': if (data_optn == NUCLEOTIDE && (Maxseqc % 3) == 0 && !SH_optn) { + codon_optn = codon_optn + 1; + if (codon_optn == 5) codon_optn = 0; + translatedataset(); + /* 
reestimate nucleotide frequencies only + if user did not specify other values */ + if (Frequ_optn) estimatebasefreqs(); + + } else if (data_optn == NUCLEOTIDE && SH_optn) { + if (Maxseqc % 2 != 0 && Maxseqc % 3 == 0) { + SHcodon = TRUE; + FPRINTF(STDOUTFILE "\n\n\nThis is the only possible option for the data set!\n"); + } + if (Maxseqc % 3 != 0 && Maxseqc % 2 == 0) { + SHcodon = FALSE; + FPRINTF(STDOUTFILE "\n\n\nThis is the only possible option for the data set!\n"); + } + if (Maxseqc % 2 == 0 && Maxseqc % 3 == 0) { + if (SHcodon) + SHcodon = FALSE; + else + SHcodon = TRUE; + translatedataset(); + /* reestimate nucleotide frequencies only + if user did not specify other values */ + if (Frequ_optn) estimatebasefreqs(); + } + } else { + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + } + break; + + case 'x': if (typ_optn == TREERECON_OPTN && puzzlemode == USERTREE) { + if (utree_optn) { + utree_optn = FALSE; + qcalg_optn = FALSE; + } else { + qcalg_optn = qcalg_optn + 1; + if (qcalg_optn == 2) { + qcalg_optn = 0; + utree_optn = TRUE; + } + } + } else { + qcalg_optn = qcalg_optn + 1; + if (qcalg_optn == 2) qcalg_optn = 0; + } + break; + + case 'k': if (typ_optn == TREERECON_OPTN) { + puzzlemode = (puzzlemode + 1) % 3; + /* puzzlemode = puzzlemode + 1; + if (puzzlemode == 3) puzzlemode = 0; + xxx */ + } else { + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + } + break; + + case 'b': typ_optn = (typ_optn + 1) % 2; + /* typ_optn = typ_optn + 1; + if (typ_optn == 2) typ_optn = TREERECON_OPTN; + xxx */ + break; + + case 'g': if (typ_optn == LIKMAPING_OPTN) { + clustA = clustB = clustC = clustD = 0; + if (numclust != 1) { + numclust = 1; + } else { + FPRINTF(STDOUTFILE "\n\n\nNumber of clusters (2-4): "); + scanf("%d", &numclust); + do ; + while (getchar() != '\n'); + if (numclust < 2 || numclust > 4) { + numclust = 1; + FPRINTF(STDOUTFILE "\n\n\nOnly 2, 3, or 4 "); + FPRINTF(STDOUTFILE "clusters possible\n"); + } else { + 
FPRINTF(STDOUTFILE "\nDistribute all sequences over the "); + if (numclust == 2) { + FPRINTF(STDOUTFILE "two clusters a and b (At least two\n"); + FPRINTF(STDOUTFILE "sequences per cluster are necessary), "); + } + if (numclust == 3) { + FPRINTF(STDOUTFILE "three clusters a, b, and c\n"); + FPRINTF(STDOUTFILE "(At least one sequence in cluster a and b, and at least two\n"); + FPRINTF(STDOUTFILE "sequences in c are necessary), "); + } + if (numclust == 4) { + FPRINTF(STDOUTFILE "four clusters a, b, c, and d\n"); + FPRINTF(STDOUTFILE "(At least one sequence per cluster is necessary),\n"); + } + FPRINTF(STDOUTFILE "type x to exclude a sequence:\n\n"); + + for (i = 0; i < Maxspc; i++) { + valid = FALSE; + do { + fputid10(STDOUT, i); + FPRINTF(STDOUTFILE ": "); + /* read one char */ + ch = getchar(); + if (ch != '\n') { + do ; + while (getchar() != '\n'); + } + ch = (char) tolower((int) ch); + if (ch == 'a' || ch == 'b' || ch == 'x') + valid = TRUE; + if (numclust == 3 || numclust == 4) + if (ch == 'c') valid = TRUE; + if (numclust == 4) + if (ch == 'd') valid = TRUE; + } while (!valid); + if (ch == 'a') { + clusterA[clustA] = i; + clustA++; + } + if (ch == 'b') { + clusterB[clustB] = i; + clustB++; + } + if (ch == 'c') { + clusterC[clustC] = i; + clustC++; + } + if (ch == 'd') { + clusterD[clustD] = i; + clustD++; + } + } + /* check clusters */ + valid = TRUE; + if (numclust == 4) { + if (clustA == 0) { + valid = FALSE; + numclust = 1; + FPRINTF(STDOUTFILE "\n\n\nNo sequence in cluster a\n"); + } + if (clustB == 0) { + valid = FALSE; + numclust = 1; + FPRINTF(STDOUTFILE "\n\n\nNo sequence in cluster b\n"); + } + if (clustC == 0) { + valid = FALSE; + numclust = 1; + FPRINTF(STDOUTFILE "\n\n\nNo sequence in cluster c\n"); + } + if (clustD == 0) { + valid = FALSE; + numclust = 1; + FPRINTF(STDOUTFILE "\n\n\nNo sequence in cluster d\n"); + } + } + if (numclust == 3) { + if (clustA == 0) { + valid = FALSE; + numclust = 1; + FPRINTF(STDOUTFILE "\n\n\nNo sequence in cluster 
a\n"); + } + if (clustB == 0) { + valid = FALSE; + numclust = 1; + FPRINTF(STDOUTFILE "\n\n\nNo sequence in cluster b\n"); + } + if (clustC < 2) { + valid = FALSE; + numclust = 1; + if (clustC == 0) + FPRINTF(STDOUTFILE "\n\n\nNo sequence in cluster c\n"); + else + FPRINTF(STDOUTFILE "\n\n\nOnly one sequence in cluster c\n"); + } + } + if (numclust == 2) { + if (clustA < 2) { + valid = FALSE; + numclust = 1; + if (clustA == 0) + FPRINTF(STDOUTFILE "\n\n\nNo sequence in cluster a\n"); + else + FPRINTF(STDOUTFILE "\n\n\nOnly one sequence in cluster a\n"); + } + if (clustB < 2) { + valid = FALSE; + numclust = 1; + if (clustB == 0) + FPRINTF(STDOUTFILE "\n\n\nNo sequence in cluster b\n"); + else + FPRINTF(STDOUTFILE "\n\n\nOnly one sequence in cluster b\n"); + } + } + if (valid) { + FPRINTF(STDOUTFILE "\nNumber of sequences in each cluster:\n\n"); + FPRINTF(STDOUTFILE "Cluster a: %d\n", clustA); + FPRINTF(STDOUTFILE "Cluster b: %d\n", clustB); + if (numclust > 2) + FPRINTF(STDOUTFILE "Cluster c: %d\n", clustC); + if (numclust == 4) + FPRINTF(STDOUTFILE "Cluster d: %d\n", clustD); + FPRINTF(STDOUTFILE "\nExcluded sequences: "); + if (numclust == 2) FPRINTF(STDOUTFILE "%d\n", + Maxspc-clustA-clustB); + if (numclust == 3) FPRINTF(STDOUTFILE "%d\n", + Maxspc-clustA-clustB-clustC); + if (numclust == 4) FPRINTF(STDOUTFILE "%d\n", + Maxspc-clustA-clustB-clustC-clustD); + + } + } + } + /* number of resulting quartets */ + compnumqts(); + + } else { + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + } + break; + + case 'd': if (auto_datatype == AUTO_GUESS) { + auto_datatype = AUTO_OFF; + guessdata_optn = data_optn; + data_optn = 0; + } else { + data_optn = data_optn + 1; + if (data_optn == 3) { + auto_datatype = AUTO_GUESS; + data_optn = guessdata_optn; + } + } + /* translate characters into format used by ML engine */ + translatedataset(); + estimatebasefreqs(); + break; + + case 'u': if (puzzlemode == QUARTPUZ && typ_optn == TREERECON_OPTN) + show_optn = 1 - 
show_optn; + else + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + break; + + case 'j': if (puzzlemode == QUARTPUZ && typ_optn == TREERECON_OPTN) + listqptrees = (listqptrees + 1) % 4; + else + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + break; + + case 'v': if (puzzlemode == QUARTPUZ && typ_optn == TREERECON_OPTN) + approxqp = 1 - approxqp; + else + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + break; + + case 'f': if (Frequ_optn) { + tstvf84 = 0.0; + Frequ_optn = FALSE; + sumfreq = 0.0; + if (data_optn == AMINOACID) + FPRINTF(STDOUTFILE "\n\n\nAmino acid"); + else if (data_optn == NUCLEOTIDE && SH_optn) + FPRINTF(STDOUTFILE "\n\n\nDoublet"); + else if (data_optn == NUCLEOTIDE && nuc_optn) + FPRINTF(STDOUTFILE "\n\n\nNucleotide"); + else if (data_optn == BINARY) + FPRINTF(STDOUTFILE "\n\n\nBinary state"); + FPRINTF(STDOUTFILE " frequencies (in %%):\n\n"); + for (i = 0; i < gettpmradix() - 1; i++) { + FPRINTF(STDOUTFILE "pi(%s) = ", int2code(i)); + scanf("%lf", &(Freqtpm[i])); + do ; + while (getchar() != '\n'); + Freqtpm[i] = Freqtpm[i]/100.0; + if (Freqtpm[i] < 0.0) { + FPRINTF(STDOUTFILE "\n\n\nNegative frequency not possible\n"); + estimatebasefreqs(); + break; + } + sumfreq = sumfreq + Freqtpm[i]; + if (sumfreq > 1.0) { + FPRINTF(STDOUTFILE "\n\n\nThe sum of "); + FPRINTF(STDOUTFILE "all frequencies exceeds"); + FPRINTF(STDOUTFILE " 100%%\n"); + estimatebasefreqs(); + break; + } + if (i == gettpmradix() - 2) + Freqtpm[i+1] = 1.0 - sumfreq; + } + } else estimatebasefreqs(); + break; + + case 's': if (data_optn == NUCLEOTIDE && SH_optn) { + sym_optn = 1 - sym_optn; + } else { + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + } + break; + + case 'n': if (puzzlemode == QUARTPUZ && typ_optn == TREERECON_OPTN) + { + FPRINTF(STDOUTFILE "\n\n\nNumber of puzzling steps: "); + scanf("%lu", &Numtrial); + do ; + while (getchar() != '\n'); + if (Numtrial < 1) { + FPRINTF(STDOUTFILE "\n\n\nThe number of 
puzzling"); + FPRINTF(STDOUTFILE " steps can't be smaller than one\n"); + Numtrial = 1000; + } + } + else if (typ_optn == LIKMAPING_OPTN) + { + FPRINTF(STDOUTFILE "\n\nEnter zero to use all possible"); + FPRINTF(STDOUTFILE " quartets in the analysis!\n"); + FPRINTF(STDOUTFILE "\nNumber of random quartets: "); + scanf("%lu", &lmqts); + do ; + while (getchar() != '\n'); + + /* compute number of quartets used */ + compnumqts(); + } + else + { + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + } + break; + + case 'o': if (puzzlemode == QUARTPUZ && typ_optn == TREERECON_OPTN) { + FPRINTF(STDOUTFILE "\n\n\nSequence to be displayed as outgroup (1-%d): ", + Maxspc); + scanf("%d", &outgroup); + do ; + while (getchar() != '\n'); + if (outgroup < 1 || outgroup > Maxspc) { + FPRINTF(STDOUTFILE "\n\n\nSequences are numbered "); + FPRINTF(STDOUTFILE "from 1 to %d\n", + Maxspc); + outgroup = 1; + } + outgroup = outgroup - 1; + } else { + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + } + break; + + case 'm': if (data_optn == NUCLEOTIDE) { /* nucleotide data */ + if(HKY_optn && nuc_optn) { + /* HKY -> TN */ + tstvf84 = 0.0; + TSparam = 2.0; + YRparam = 0.9; + HKY_optn = FALSE; + TN_optn = TRUE; + optim_optn = TRUE; + nuc_optn = TRUE; + SH_optn = FALSE; + break; + } + if(TN_optn && nuc_optn) { + if (Maxseqc % 2 == 0 || Maxseqc % 3 == 0) { + /* number of chars needs to be a multiple 2 or 3 */ + /* TN -> SH */ + if (Maxseqc % 2 != 0 && Maxseqc % 3 == 0) + SHcodon = TRUE; + else + SHcodon = FALSE; + tstvf84 = 0.0; + TSparam = 2.0; + YRparam = 1.0; + HKY_optn = TRUE; + TN_optn = FALSE; + optim_optn = TRUE; + nuc_optn = FALSE; + SH_optn = TRUE; + /* translate characters into format */ + /* used by ML engine */ + translatedataset(); + estimatebasefreqs(); + } else { + FPRINTF(STDOUTFILE "\n\n\nSH model not "); + FPRINTF(STDOUTFILE "available for the data set!\n"); + /* TN -> HKY */ + tstvf84 = 0.0; + TSparam = 2.0; + YRparam = 1.0; + HKY_optn = TRUE; + 
TN_optn = FALSE; + optim_optn = TRUE; + nuc_optn = TRUE; + SH_optn = FALSE; + } + break; + } + if(SH_optn) { + /* SH -> HKY */ + tstvf84 = 0.0; + TSparam = 2.0; + YRparam = 1.0; + HKY_optn = TRUE; + TN_optn = FALSE; + optim_optn = TRUE; + nuc_optn = TRUE; + SH_optn = FALSE; + /* translate characters into format */ + /* used by ML engine */ + translatedataset(); + estimatebasefreqs(); + break; + } + break; + } + if (data_optn == AMINOACID) { /* amino acid data */ + if (auto_aamodel) { + /* AUTO -> Dayhoff */ + Dayhf_optn = TRUE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + blosum62_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + auto_aamodel = AUTO_OFF; + break; + } + if (Dayhf_optn) { + /* Dayhoff -> JTT */ + Dayhf_optn = FALSE; + Jtt_optn = TRUE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + blosum62_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + auto_aamodel = AUTO_OFF; + break; + } + if (Jtt_optn) { + /* JTT -> mtREV */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = TRUE; + cprev_optn = FALSE; + blosum62_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + auto_aamodel = AUTO_OFF; + break; + } +#ifdef CPREV + if (mtrev_optn) { + /* mtREV -> cpREV */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = TRUE; + blosum62_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + auto_aamodel = AUTO_OFF; + break; + } +#else /* ! CPREV */ + if (mtrev_optn) { + /* mtREV -> BLOSUM 62 */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + blosum62_optn = TRUE; + vtmv_optn = FALSE; + wag_optn = FALSE; + auto_aamodel = AUTO_OFF; + break; + } +#endif /* ! 
CPREV */ + +#ifdef CPREV + if (cprev_optn) { + /* cpREV -> BLOSUM 62 */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + blosum62_optn = TRUE; + vtmv_optn = FALSE; + wag_optn = FALSE; + auto_aamodel = AUTO_OFF; + break; + } +#endif + if (blosum62_optn) { + /* BLOSUM 62 -> VT model */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + blosum62_optn = FALSE; + vtmv_optn = TRUE; + wag_optn = FALSE; + auto_aamodel = AUTO_OFF; + break; + } + if (vtmv_optn) { + /* VT model -> WAG model */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + blosum62_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = TRUE; + auto_aamodel = AUTO_OFF; + break; + } + if (wag_optn) { + /* WAG model -> AUTO */ + Dayhf_optn = guessDayhf_optn; + Jtt_optn = guessJtt_optn; + mtrev_optn = guessmtrev_optn; + cprev_optn = guesscprev_optn; + blosum62_optn = guessblosum62_optn; + vtmv_optn = guessvtmv_optn; + wag_optn = guesswag_optn; + auto_aamodel = guessauto_aamodel; + break; + } + break; + } + if (data_optn == BINARY) { + FPRINTF(STDOUTFILE "\n\n\nNo other model available!\n"); + } + break; + + case 't': if (data_optn != NUCLEOTIDE) { + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + } else { + tstvf84 = 0.0; + FPRINTF(STDOUTFILE "\n\n\nEnter an invalid value for "); + FPRINTF(STDOUTFILE "estimation from data set!\n"); + FPRINTF(STDOUTFILE "\nTransition/transversion parameter (%.2f-%.2f): ", + MINTS, MAXTS); + scanf("%lf", &TSparam); + do ; + while (getchar() != '\n'); + if (TSparam < MINTS || TSparam > MAXTS) { + optim_optn = TRUE; + TSparam = 2.0; + } else { + optim_optn = FALSE; + } + } + break; + + case 'q': FPRINTF(STDOUTFILE "\n\n\n"); +# if PARALLEL + PP_SendDone(); + MPI_Finalize(); +# endif /* PARALLEL */ + exit(0); + + break; + + case 'r': if (!(TN_optn && nuc_optn)){ + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + } else { + tstvf84 = 0.0; + 
FPRINTF(STDOUTFILE "\n\n\nEnter an invalid value "); + FPRINTF(STDOUTFILE "for estimation from data set!\n"); + FPRINTF(STDOUTFILE "\nY/R transition parameter (%.2f-%.2f): ", MINYR, MAXYR); + scanf("%lf", &YRparam); + do ; + while (getchar() != '\n'); + if (YRparam < MINYR || YRparam > MAXYR) { + optim_optn = TRUE; + YRparam = 0.9; + } else if (YRparam == 1.0) { + TN_optn = FALSE; + HKY_optn = TRUE; + if (optim_optn) TSparam = 2.0; + } else { + optim_optn = FALSE; + } + } + break; + + case 'p': if (!(TN_optn && nuc_optn)){ + FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + } else { + FPRINTF(STDOUTFILE "\n\n\nThe F84 model (Felsenstein 1984) is a restricted"); + FPRINTF(STDOUTFILE " TN model, and the one\nF84 parameter uniquely"); + FPRINTF(STDOUTFILE " determines the two corresponding TN parameters!\n\n"); + FPRINTF(STDOUTFILE "F84 expected transition/transversion ratio: "); + scanf("%lf", &tstvf84); + do ; + while (getchar() != '\n'); + if (tstvf84 <= 0.0) tstvf84 = 0.0; + else makeF84model(); + } + break; + + case 'y': break; + + default: FPRINTF(STDOUTFILE "\n\n\nThis is not a possible option!\n"); + break; + } + } while (ch != 'y'); + + FPRINTF(STDOUTFILE "\n\n\n"); +} + +/* open file for reading */ +void openfiletoread(FILE **fp, char name[], char descr[]) +{ + int count = 0; + cvector str; + + if ((*fp = fopen(name, "r")) == NULL) { + FPRINTF(STDOUTFILE "\n\n\nPlease enter a file name for the %s: ", descr); + str = mygets(); + while ((*fp = fopen(str, "r")) == NULL) + { + count++; + if (count > 10) + { + FPRINTF(STDOUTFILE "\n\n\nToo many trials - quitting ...\n"); + exit(1); + } + FPRINTF(STDOUTFILE "File '%s' not found, ", str); + FPRINTF(STDOUTFILE "please enter alternative name: "); + free_cvector(str); + str = mygets(); + } + free_cvector(str); + FPRINTF(STDOUTFILE "\n"); + } +} /* openfiletoread */ + + +/* open file for writing */ +void openfiletowrite(FILE **fp, char name[], char descr[]) +{ + int count = 0; + cvector str; + + if ((*fp 
= fopen(name, "w")) == NULL) { + FPRINTF(STDOUTFILE "\n\n\nPlease enter a file name for the %s: ", descr); + str = mygets(); + while ((*fp = fopen(str, "w")) == NULL) + { + count++; + if (count > 10) + { + FPRINTF(STDOUTFILE "\n\n\nToo many trials - quitting ...\n"); + exit(1); + } + FPRINTF(STDOUTFILE "File '%s' not created, ", str); + FPRINTF(STDOUTFILE "please enter other name: "); + free_cvector(str); + str = mygets(); + } + free_cvector(str); + FPRINTF(STDOUTFILE "\n"); + } +} /* openfiletowrite */ + + +/* open file for appending */ +void openfiletoappend(FILE **fp, char name[], char descr[]) +{ + int count = 0; + cvector str; + + if ((*fp = fopen(name, "a")) == NULL) { + FPRINTF(STDOUTFILE "\n\n\nPlease enter a file name for the %s: ", descr); + str = mygets(); + while ((*fp = fopen(str, "a")) == NULL) + { + count++; + if (count > 10) + { + FPRINTF(STDOUTFILE "\n\n\nToo many trials - quitting ...\n"); + exit(1); + } + FPRINTF(STDOUTFILE "File '%s' not created, ", str); + FPRINTF(STDOUTFILE "please enter other name: "); + free_cvector(str); + str = mygets(); + } + free_cvector(str); + FPRINTF(STDOUTFILE "\n"); + } +} /* openfiletowrite */ + + +/* close file */ +void closefile(FILE *fp) +{ + fclose(fp); +} /* closefile */ + +/* symmetrize doublet frequencies */ +void symdoublets() +{ + int i, imean; + double mean; + + if (data_optn == NUCLEOTIDE && SH_optn && sym_optn) { + /* ML frequencies */ + mean = (Freqtpm[1] + Freqtpm[4])/2.0; /* AC CA */ + Freqtpm[1] = mean; + Freqtpm[4] = mean; + mean = (Freqtpm[2] + Freqtpm[8])/2.0; /* AG GA */ + Freqtpm[2] = mean; + Freqtpm[8] = mean; + mean = (Freqtpm[3] + Freqtpm[12])/2.0; /* AT TA */ + Freqtpm[3] = mean; + Freqtpm[12] = mean; + mean = (Freqtpm[6] + Freqtpm[9])/2.0; /* CG GC */ + Freqtpm[6] = mean; + Freqtpm[9] = mean; + mean = (Freqtpm[7] + Freqtpm[13])/2.0; /* CT TC */ + Freqtpm[7] = mean; + Freqtpm[13] = mean; + mean = (Freqtpm[11] + Freqtpm[14])/2.0; /* GT TG */ + Freqtpm[11] = mean; + Freqtpm[14] = mean; + + /* 
base composition of each taxon */ + for (i = 0; i < Maxspc; i++) { + imean = (Basecomp[i][1] + Basecomp[i][4])/2; /* AC CA */ + Basecomp[i][1] = imean; + Basecomp[i][4] = imean; + imean = (Basecomp[i][2] + Basecomp[i][8])/2; /* AG GA */ + Basecomp[i][2] = imean; + Basecomp[i][8] = imean; + imean = (Basecomp[i][3] + Basecomp[i][12])/2; /* AT TA */ + Basecomp[i][3] = imean; + Basecomp[i][12] = imean; + imean = (Basecomp[i][6] + Basecomp[i][9])/2; /* CG GC */ + Basecomp[i][6] = imean; + Basecomp[i][9] = imean; + imean = (Basecomp[i][7] + Basecomp[i][13])/2; /* CT TC */ + Basecomp[i][7] = imean; + Basecomp[i][13] = imean; + imean = (Basecomp[i][11] + Basecomp[i][14])/2; /* GT TG */ + Basecomp[i][11] = imean; + Basecomp[i][14] = imean; + } + } +} + +/* show Ts/Tv ratio and Ts Y/R ratio */ +void computeexpectations() +{ + double AlphaYBeta, AlphaRBeta, piR, piY, num, denom, pyr, pur; + + if (nuc_optn == TRUE) { /* 4x4 nucs */ + piR = Freqtpm[0] + Freqtpm[2]; + piY = Freqtpm[1] + Freqtpm[3]; + AlphaRBeta = 4.0*TSparam / (1 + YRparam); + AlphaYBeta = AlphaRBeta * YRparam; + tstvratio = (AlphaRBeta*Freqtpm[0]*Freqtpm[2] + + AlphaYBeta*Freqtpm[1]*Freqtpm[3])/(piR * piY); + yrtsratio = (AlphaYBeta*Freqtpm[1]*Freqtpm[3]) / + (AlphaRBeta*Freqtpm[0]*Freqtpm[2]); + } else { /* 16x16 nucs */ + pyr = Freqtpm[1]*Freqtpm[3] + Freqtpm[5]*Freqtpm[7] + + Freqtpm[9]*Freqtpm[11] + Freqtpm[4]*Freqtpm[12] + + Freqtpm[5]*Freqtpm[13] + Freqtpm[6]*Freqtpm[14] + + Freqtpm[7]*Freqtpm[15] + Freqtpm[13]*Freqtpm[15]; + pur = Freqtpm[0]*Freqtpm[2] + Freqtpm[4]*Freqtpm[6] + + Freqtpm[0]*Freqtpm[8] + Freqtpm[1]*Freqtpm[9] + + Freqtpm[2]*Freqtpm[10] + Freqtpm[8]*Freqtpm[10] + + Freqtpm[3]*Freqtpm[11] + Freqtpm[12]*Freqtpm[14]; + num = pyr + pur; + denom = Freqtpm[0]*Freqtpm[1] + Freqtpm[1]*Freqtpm[2] + + Freqtpm[0]*Freqtpm[3] + Freqtpm[2]*Freqtpm[3] + + Freqtpm[0]*Freqtpm[4] + Freqtpm[1]*Freqtpm[5] + + Freqtpm[4]*Freqtpm[5] + Freqtpm[2]*Freqtpm[6] + + Freqtpm[5]*Freqtpm[6] + Freqtpm[3]*Freqtpm[7] + + 
Freqtpm[4]*Freqtpm[7] + Freqtpm[6]*Freqtpm[7] + + Freqtpm[4]*Freqtpm[8] + Freqtpm[5]*Freqtpm[9] + + Freqtpm[8]*Freqtpm[9] + Freqtpm[6]*Freqtpm[10] + + Freqtpm[9]*Freqtpm[10] + Freqtpm[7]*Freqtpm[11] + + Freqtpm[8]*Freqtpm[11] + Freqtpm[10]*Freqtpm[11] + + Freqtpm[0]*Freqtpm[12] + Freqtpm[8]*Freqtpm[12] + + Freqtpm[1]*Freqtpm[13] + Freqtpm[9]*Freqtpm[13] + + Freqtpm[12]*Freqtpm[13] + Freqtpm[2]*Freqtpm[14] + + Freqtpm[10]*Freqtpm[14] + Freqtpm[13]*Freqtpm[14] + + Freqtpm[3]*Freqtpm[15] + Freqtpm[11]*Freqtpm[15] + + Freqtpm[12]*Freqtpm[15] + Freqtpm[14]*Freqtpm[15]; + tstvratio = 2.0*TSparam * num/denom; + yrtsratio = pyr/pur; + } +} + +/* write ML distance matrix to file */ +void putdistance(FILE *fp) /* mod CZ 05/19/01 */ +{ + int i, j; + + fprintf(fp, " %d\n", Maxspc); + for (i = 0; i < Maxspc; i++) { + fputid10(fp, i); + for (j = 0; j < Maxspc; j++) { + fprintf(fp, " %.5f", Distanmat[i][j]/100.0); + } + fprintf(fp, "\n"); + } +} + + +/* find identical sequences */ +void findidenticals(FILE *fp) +{ + int i, j, noids; + cvector useqs; + + useqs = new_cvector(Maxspc); + + for (i = 0; i < Maxspc; i++) + useqs[i] = 0; + + noids = TRUE; + for (i = 0; i < Maxspc && noids; i++) + for (j = i + 1; j < Maxspc && noids; j++) + if (Distanmat[i][j] == 0.0) noids = FALSE; + + if (noids) + fprintf(fp, " All sequences are unique.\n"); + else { + for (i = 0; i < Maxspc; i++) { + noids = TRUE; + for (j = i + 1; j < Maxspc && noids; j++) + if (Distanmat[i][j] == 0.0) noids = FALSE; + + if (!noids && useqs[i] == 0) { + fputid(fp, i); + useqs[i] = 1; + for (j = i + 1; j < Maxspc; j++) + if (Distanmat[i][j] == 0.0) { + fprintf(fp, ", "); + fputid(fp, j); + useqs[j] = 1; + } + fprintf(fp, ".\n"); + } + } + } + free_cvector(useqs); +} + +/* compute average distance */ +double averagedist() +{ + int i, j; + double sum; + + sum = 0.0; + for (i = 0; i < Maxspc; i++) + for (j = i + 1; j < Maxspc; j++) + sum = sum + Distanmat[i][j]; + + sum = sum / (double) Maxspc / ((double) Maxspc - 1.0) * 
2.0; + + return sum; +} + +/* first lines of EPSF likelihood mapping file */ +void initps(FILE *ofp) +{ + fprintf(ofp, "%%!PS-Adobe-3.0 EPSF-3.0\n"); + fprintf(ofp, "%%%%BoundingBox: 60 210 550 650\n"); + fprintf(ofp, "%%%%Pages: 1\n"); +# ifndef ALPHA + fprintf(ofp, "%%%%Creator: %s (version %s)\n", PACKAGE, VERSION); +# else + fprintf(ofp, "%%%%Creator: %s (version %s%s)\n", PACKAGE, VERSION, ALPHA); +# endif + fprintf(ofp, "%%%%Title: Likelihood Mapping Analysis\n"); + fprintf(ofp, "%%%%CreationDate: %s", asctime(localtime(&Starttime)) ); + fprintf(ofp, "%%%%DocumentFonts: Helvetica\n"); + fprintf(ofp, "%%%%DocumentNeededFonts: Helvetica\n"); + fprintf(ofp, "%%%%EndComments\n"); + fprintf(ofp, "%% use inch as unit\n"); + fprintf(ofp, "/inch {72 mul} def\n"); + fprintf(ofp, "%% triangle side length (3 inch)\n"); + fprintf(ofp, "/tl {3 inch mul} def\n"); + fprintf(ofp, "%% plot one dot (x-y coordinates on stack)\n"); + fprintf(ofp, "/dot {\n"); + fprintf(ofp, "newpath\n"); + fprintf(ofp, "0.002 tl 0 360 arc %% radius is 0.002 of the triangle length\n"); + fprintf(ofp, "closepath\n"); + fprintf(ofp, "fill\n"); + fprintf(ofp, "} def\n"); + fprintf(ofp, "%% preamble\n"); + fprintf(ofp, "/Helvetica findfont\n"); + fprintf(ofp, "12 scalefont\n"); + fprintf(ofp, "setfont\n"); + fprintf(ofp, "%% 0/0 for triangle of triangles\n"); + fprintf(ofp, "0.9 inch 3 inch translate\n"); + fprintf(ofp, "%% first triangle (the one with dots)\n"); + fprintf(ofp, "0.6 tl 1.2 tl 0.8660254038 mul translate\n"); + fprintf(ofp, "newpath\n"); + fprintf(ofp, " 0.0 tl 0.0 tl moveto\n"); + fprintf(ofp, " 1.0 tl 0.0 tl lineto\n"); + fprintf(ofp, " 0.5 tl 0.8660254038 tl lineto\n"); + fprintf(ofp, "closepath\n"); + fprintf(ofp, "stroke\n"); +} + +/* plot one point of likelihood mapping analysis */ +void plotlmpoint(FILE *ofp, double w1, double w2) +{ + fprintf(ofp,"%.10f tl %.10f tl dot\n", + 0.5*w1 + w2, w1*0.8660254038); +} + +/* last lines of EPSF likelihood mapping file */ +void 
finishps(FILE *ofp)
{
    /*
     * Writes the closing part of the EPSF file to ofp, in this order:
     *   1. a "stroke" closing the first (dot) triangle,
     *   2. the definition of /secondtriangle (3 basins) with the
     *      percentages ar1, ar2, ar3 relative to Numquartets,
     *   3. the definition of /thirdtriangle (7 basins) with the
     *      percentages reg1..reg7 relative to Numquartets,
     *   4. the calls that draw both triangles,
     *   5. corner labels that depend on numclust (2, 3 or 4),
     *   6. showpage and the %%EOF trailer.
     * The order of the fprintf calls is the order of the PostScript
     * program, so statements must not be rearranged.
     */
    fprintf(ofp, "stroke\n");

    /* ---- /secondtriangle: outline plus three lines from the centre ---- */
    fprintf(ofp, "%% second triangle (the one with 3 basins)\n");
    fprintf(ofp, "/secondtriangle {\n");
    fprintf(ofp, "newpath\n");
    fprintf(ofp, " 0.0 tl 0.0 tl moveto\n");
    fprintf(ofp, " 1.0 tl 0.0 tl lineto\n");
    fprintf(ofp, " 0.5 tl 0.8660254038 tl lineto\n");
    fprintf(ofp, "closepath\n");
    fprintf(ofp, "stroke\n");
    fprintf(ofp, "newpath\n");
    fprintf(ofp, " 0.50 tl 0.2886751346 tl moveto\n");
    fprintf(ofp, " 0.50 tl 0.0000000000 tl lineto\n");
    fprintf(ofp, "stroke\n");
    fprintf(ofp, "newpath\n");
    fprintf(ofp, " 0.50 tl 0.2886751346 tl moveto\n");
    fprintf(ofp, " 0.25 tl 0.4330127019 tl lineto\n");
    fprintf(ofp, "stroke\n");
    fprintf(ofp, "newpath\n");
    fprintf(ofp, " 0.50 tl 0.2886751346 tl moveto\n");
    fprintf(ofp, " 0.75 tl 0.4330127019 tl lineto\n");
    fprintf(ofp, "stroke\n");
    /* percentages: ar1 is printed at "up", ar3 at "down left", ar2 at "down right" */
    fprintf(ofp, "0.44 tl 0.5 tl moveto %% up\n");
    fprintf(ofp, "(%.1f%%) show\n", (double) ar1*100.0/Numquartets);
    fprintf(ofp, "0.25 tl 0.15 tl moveto %% down left\n");
    fprintf(ofp, "(%.1f%%) show\n", (double) ar3*100.0/Numquartets);
    fprintf(ofp, "0.63 tl 0.15 tl moveto %% down right\n");
    fprintf(ofp, "(%.1f%%) show\n", (double) ar2*100.0/Numquartets);
    fprintf(ofp, "} def\n");

    /* ---- /thirdtriangle: outline, inner triangle and six separators ---- */
    fprintf(ofp, "%% third triangle (the one with 7 basins)\n");
    fprintf(ofp, "/thirdtriangle {\n");
    fprintf(ofp, "newpath\n");
    fprintf(ofp, " 0.0 tl 0.0 tl moveto\n");
    fprintf(ofp, " 1.0 tl 0.0 tl lineto\n");
    fprintf(ofp, " 0.5 tl 0.8660254038 tl lineto\n");
    fprintf(ofp, "closepath\n");
    fprintf(ofp, "stroke\n");
    fprintf(ofp, "newpath\n");
    fprintf(ofp, " 0.25 tl 0.1443375673 tl moveto\n");
    fprintf(ofp, " 0.75 tl 0.1443375673 tl lineto\n");
    fprintf(ofp, " 0.50 tl 0.5773502692 tl lineto\n");
    fprintf(ofp, "closepath\n");
    fprintf(ofp, "stroke\n");
    fprintf(ofp, "newpath\n");
    fprintf(ofp, " 0.125 tl 0.2165063509 tl moveto\n");
    fprintf(ofp, " 0.250 tl 0.1443375673 tl lineto\n");
    fprintf(ofp, "stroke\n");
    fprintf(ofp, "newpath\n");
    fprintf(ofp, " 0.375 tl 0.6495190528 tl moveto\n");
    fprintf(ofp, " 0.500 tl 0.5773502692 tl lineto\n");
    fprintf(ofp, "stroke\n");
    fprintf(ofp, "newpath\n");
    fprintf(ofp, " 0.625 tl 0.6495190528 tl moveto\n");
    fprintf(ofp, " 0.500 tl 0.5773502692 tl lineto\n");
    fprintf(ofp, "stroke\n");
    fprintf(ofp, "newpath\n");
    fprintf(ofp, " 0.875 tl 0.2165063509 tl moveto\n");
    fprintf(ofp, " 0.750 tl 0.1443375673 tl lineto\n");
    fprintf(ofp, "stroke\n");
    fprintf(ofp, "newpath\n");
    fprintf(ofp, " 0.750 tl 0.00 tl moveto\n");
    fprintf(ofp, " 0.750 tl 0.1443375673 tl lineto\n");
    fprintf(ofp, "stroke\n");
    fprintf(ofp, "newpath\n");
    fprintf(ofp, " 0.250 tl 0.00 tl moveto\n");
    fprintf(ofp, " 0.250 tl 0.1443375673 tl lineto\n");
    fprintf(ofp, "stroke\n");
    /* percentages for the seven regions (reg1..reg7) */
    fprintf(ofp, "0.42 tl 0.66 tl moveto %% up\n");
    fprintf(ofp, "(%.1f%%) show\n", (double) reg1*100.0/Numquartets);
    fprintf(ofp, "0.07 tl 0.05 tl moveto %% down left\n");
    fprintf(ofp, "(%.1f%%) show\n", (double) reg3*100.0/Numquartets);
    fprintf(ofp, "0.77 tl 0.05 tl moveto %% down right\n");
    fprintf(ofp, "(%.1f%%) show\n", (double) reg2*100.0/Numquartets);
    fprintf(ofp, "0.43 tl 0.05 tl moveto %% down side\n");
    fprintf(ofp, "(%.1f%%) show\n", (double) reg5*100.0/Numquartets);
    fprintf(ofp, "0.43 tl 0.28 tl moveto %% center\n");
    fprintf(ofp, "(%.1f%%) show\n", (double) reg7*100.0/Numquartets);
    /* the two side labels are drawn inside gsave/grestore with a rotation */
    fprintf(ofp, "gsave\n");
    fprintf(ofp, "-60 rotate\n");
    fprintf(ofp, "-0.07 tl 0.77 tl moveto %% right side\n");
    fprintf(ofp, "(%.1f%%) show\n", (double) reg4*100.0/Numquartets);
    fprintf(ofp, "grestore\n");
    fprintf(ofp, "gsave\n");
    fprintf(ofp, "60 rotate\n");
    fprintf(ofp, "0.4 tl -0.09 tl moveto %% left side\n");
    fprintf(ofp, "(%.1f%%) show\n", (double) reg6*100.0/Numquartets);
    fprintf(ofp, "grestore\n");
    fprintf(ofp, "} def\n");

    /* ---- draw both triangles ---- */
    fprintf(ofp, "%% print the other two triangles\n");
    fprintf(ofp, "-0.6 tl -1.2 tl 0.8660254038 mul translate\n");
    fprintf(ofp, "secondtriangle\n");
    fprintf(ofp, "1.2 tl 0 translate\n");
    fprintf(ofp, "thirdtriangle\n");

    /* ---- corner labels; nothing is written for other numclust values ---- */
    if (numclust == 4) { /* four cluster analysis */
        fprintf(ofp, "%% label corners\n");
        fprintf(ofp, "0.375 tl 0.9 tl moveto\n");
        fprintf(ofp, "((a,b)-(c,d)) show %% CHANGE HERE IF NECESSARY\n");
        fprintf(ofp, "-0.16 tl -0.08 tl moveto\n");
        fprintf(ofp, "((a,d)-(b,c)) show %% CHANGE HERE IF NECESSARY\n");
        fprintf(ofp, "0.92 tl -0.08 tl moveto\n");
        fprintf(ofp, "((a,c)-(b,d)) show %% CHANGE HERE IF NECESSARY\n");
    }
    if (numclust == 3) { /* three cluster analysis */
        fprintf(ofp, "%% label corners\n");
        fprintf(ofp, "0.375 tl 0.9 tl moveto\n");
        fprintf(ofp, "((a,b)-(c,c)) show %% CHANGE HERE IF NECESSARY\n");
        fprintf(ofp, "-0.16 tl -0.08 tl moveto\n");
        fprintf(ofp, "((a,c)-(b,c)) show %% CHANGE HERE IF NECESSARY\n");
        fprintf(ofp, "0.92 tl -0.08 tl moveto\n");
        fprintf(ofp, "((a,c)-(b,c)) show %% CHANGE HERE IF NECESSARY\n");
    }
    if (numclust == 2) { /* two cluster analysis */
        fprintf(ofp, "%% label corners\n");
        fprintf(ofp, "0.375 tl 0.9 tl moveto\n");
        fprintf(ofp, "((a,a)-(b,b)) show %% CHANGE HERE IF NECESSARY\n");
        fprintf(ofp, "-0.16 tl -0.08 tl moveto\n");
        fprintf(ofp, "((a,b)-(a,b)) show %% CHANGE HERE IF NECESSARY\n");
        fprintf(ofp, "0.92 tl -0.08 tl moveto\n");
        fprintf(ofp, "((a,b)-(a,b)) show %% CHANGE HERE IF NECESSARY\n");
    }
    fprintf(ofp, "showpage\n");
    fprintf(ofp, "%%%%EOF\n");
}

/* computes LM point from the three log-likelihood values,
   plots the point, and does some statistics */
void makelmpoint(FILE *fp, double b1, double b2, double b3)
{
    double w1, w2, w3, temp;
    unsigned char qpbranching;
    double temp1, temp2, temp3, onethird;
    unsigned char discreteweight[3], treebits[3];

    onethird = 1.0/3.0;
    /* one bit per input value, so sums of these identify sets of values */
    treebits[0] = (unsigned char) 1;
    treebits[1] = (unsigned char) 2;
    treebits[2] = (unsigned char) 4;

    /* sort in descending order */
    qweight[0] = b1;
    qweight[1] = b2;
qweight[2] = b3; + sort3doubles(qweight, qworder); + + /* compute Bayesian weights */ + qweight[qworder[1]] = exp(qweight[qworder[1]]-qweight[qworder[0]]); + qweight[qworder[2]] = exp(qweight[qworder[2]]-qweight[qworder[0]]); + qweight[qworder[0]] = 1.0; + temp = qweight[0] + qweight[1] + qweight[2]; + qweight[0] = qweight[0]/temp; + qweight[1] = qweight[1]/temp; + qweight[2] = qweight[2]/temp; + + /* plot one point in likelihood mapping triangle */ + w1 = qweight[0]; + w2 = qweight[1]; + w3 = qweight[2]; + plotlmpoint(fp, w1, w2); + + /* check areas 1,2,3 */ + if (treebits[qworder[0]] == 1) ar1++; + else if (treebits[qworder[0]] == 2) ar2++; + else ar3++; + + /* check out regions 1,2,3,4,5,6,7 */ + + /* 100 distribution */ + temp1 = 1.0 - qweight[qworder[0]]; + sqdiff[0] = temp1*temp1 + + qweight[qworder[1]]*qweight[qworder[1]] + + qweight[qworder[2]]*qweight[qworder[2]]; + discreteweight[0] = treebits[qworder[0]]; + + /* 110 distribution */ + temp1 = 0.5 - qweight[qworder[0]]; + temp2 = 0.5 - qweight[qworder[1]]; + sqdiff[1] = temp1*temp1 + temp2*temp2 + + qweight[qworder[2]]*qweight[qworder[2]]; + discreteweight[1] = treebits[qworder[0]] + treebits[qworder[1]]; + + /* 111 distribution */ + temp1 = onethird - qweight[qworder[0]]; + temp2 = onethird - qweight[qworder[1]]; + temp3 = onethird - qweight[qworder[2]]; + sqdiff[2] = temp1 * temp1 + temp2 * temp2 + temp3 * temp3; + discreteweight[2] = (unsigned char) 7; + + /* sort in descending order */ + sort3doubles(sqdiff, sqorder); + + qpbranching = (unsigned char) discreteweight[sqorder[2]]; + + if (qpbranching == 1) { + reg1++; + if (w2 < w3) reg1l++; + else reg1r++; + } + if (qpbranching == 2) { + reg2++; + if (w1 < w3) reg2d++; + else reg2u++; + } + if (qpbranching == 4) { + reg3++; + if (w1 < w2) reg3d++; + else reg3u++; + } + if (qpbranching == 3) { + reg4++; + if (w1 < w2) reg4d++; + else reg4u++; + } + if (qpbranching == 6) { + reg5++; + if (w2 < w3) reg5l++; + else reg5r++; + } + if (qpbranching == 5) { + 
reg6++; + if (w1 < w3) reg6d++; + else reg6u++; + } + if (qpbranching == 7) reg7++; +} + +/* print tree statistics */ +void printtreestats(FILE *ofp) +{ + int i, j, besttree; + double bestlkl, difflkl, difflklps, temp, sum; + + /* find best tree */ + besttree = 0; + bestlkl = ulkl[0]; + for (i = 1; i < numutrees; i++) + if (ulkl[i] > bestlkl) { + besttree = i; + bestlkl = ulkl[i]; + } + + fprintf(ofp, "\n\nCOMPARISON OF USER TREES (NO CLOCK)\n\n"); + fprintf(ofp, "Tree log L difference S.E. Significantly worse\n"); + fprintf(ofp, "--------------------------------------------------------\n"); + for (i = 0; i < numutrees; i++) { + difflkl = ulkl[besttree]-ulkl[i]; + fprintf(ofp, "%2d %10.2f %8.2f ", i+1, ulkl[i], difflkl); + if (i == besttree) { + fprintf(ofp, " <----------------- best tree"); + } else { + /* compute variance of Log L differences over sites */ + difflklps = difflkl/(double)Maxsite; + sum = 0.0; + for (j = 0; j < Numptrn; j++) { + temp = allsites[besttree][j] - allsites[i][j] - difflklps; + sum += temp*temp*Weight[j]; + } + sum = sqrt(fabs(sum/(Maxsite-1.0)*Maxsite)); + fprintf(ofp, "%11.2f ", sum); + if (difflkl > 1.96*sum) + fprintf(ofp, "yes"); + else + fprintf(ofp, "no"); + } + fprintf(ofp, "\n"); + } + fprintf(ofp, "\nThis test (5%% significance) follows Kishino and Hasegawa (1989).\n"); + + if (compclock) { + + /* find best tree */ + besttree = 0; + bestlkl = ulklc[0]; + for (i = 1; i < numutrees; i++) + if (ulklc[i] > bestlkl) { + besttree = i; + bestlkl = ulklc[i]; + } + + fprintf(ofp, "\n\nCOMPARISON OF USER TREES (WITH CLOCK)\n\n"); + fprintf(ofp, "Tree log L difference S.E. 
Significantly worse\n"); + fprintf(ofp, "--------------------------------------------------------\n"); + for (i = 0; i < numutrees; i++) { + difflkl = ulklc[besttree]-ulklc[i]; + fprintf(ofp, "%2d %10.2f %8.2f ", i+1, ulklc[i], difflkl); + if (i == besttree) { + fprintf(ofp, " <----------------- best tree"); + } else { + /* compute variance of Log L differences over sites */ + difflklps = difflkl/(double)Maxsite; + sum = 0.0; + for (j = 0; j < Numptrn; j++) { + temp = allsitesc[besttree][j] - allsitesc[i][j] - difflklps; + sum += temp*temp*Weight[j]; + } + sum = sqrt(fabs(sum/(Maxsite-1.0)*Maxsite)); + fprintf(ofp, "%11.2f ", sum); + if (difflkl > 1.96*sum) + fprintf(ofp, "yes"); + else + fprintf(ofp, "no"); + } + fprintf(ofp, "\n"); + } + fprintf(ofp, "\nThis test (5%% significance) follows Kishino and Hasegawa (1989).\n"); + } +} + +/* time stamp */ +void timestamp(FILE* ofp) +{ + double timespan; + double cpuspan; + timespan = difftime(Stoptime, Starttime); + cpuspan = ((double) (Stopcpu - Startcpu) / CLOCKS_PER_SEC); + fprintf(ofp, "\n\nTIME STAMP\n\n"); + fprintf(ofp, "Date and time: %s", asctime(localtime(&Starttime)) ); + fprintf(ofp, "Runtime (excl. input) : %.0f seconds (= %.1f minutes = %.1f hours)\n", + timespan, timespan/60., timespan/3600.); + fprintf(ofp, "Runtime (incl. input) : %.0f seconds (= %.1f minutes = %.1f hours)\n", + fulltime, fulltime/60., fulltime/3600.); +#ifdef TIMEDEBUG + fprintf(ofp, "CPU time (incl. 
input): %.0f seconds (= %.1f minutes = %.1f hours)\n\n", + fullcpu, fullcpu/60., fullcpu/3600.); +#endif /* TIMEDEBUG */ + +} + +/* extern int bestrfound; */ + +/* write output file */ +void writeoutputfile(FILE *ofp, int part) +{ + int i, fail, df; + uli li; + double pval, delta; + + if ((part == WRITEPARAMS) || (part == WRITEALL)) { +# ifndef ALPHA + fprintf(ofp, "TREE-PUZZLE %s\n\n", VERSION); +# else + fprintf(ofp, "TREE-PUZZLE %s%s\n\n", VERSION, ALPHA); +# endif + + fprintf(ofp, "Input file name: %s\n",INFILE); + if (puzzlemode == USERTREE) fprintf(ofp, "User tree file name: %s\n",INTREE); + + + fprintf(ofp, "Type of analysis: "); + if (typ_optn == TREERECON_OPTN) fprintf(ofp, "tree reconstruction\n"); + if (typ_optn == LIKMAPING_OPTN) fprintf(ofp, "likelihood mapping\n"); + fprintf(ofp, "Parameter estimation: "); + if (approxp_optn) fprintf(ofp, "approximate (faster)\n"); + else fprintf(ofp, "accurate (slow)\n"); + if (!(puzzlemode == USERTREE && typ_optn == TREERECON_OPTN)) { + fprintf(ofp, "Parameter estimation uses: "); + if (qcalg_optn) + fprintf(ofp, "quartet sampling (for substitution process) + NJ tree (for rate variation)\n"); + else + fprintf(ofp, "neighbor-joining tree (for substitution process and rate variation)\n"); + } else { + fprintf(ofp, "Parameter estimation uses: "); + if (utree_optn) + fprintf(ofp, "1st user tree (for substitution process and rate variation)\n"); + else if (qcalg_optn) + fprintf(ofp, "quartet sampling (for substitution process) + NJ tree (for rate variation)\n"); + else + fprintf(ofp, "neighbor-joining tree (for substitution process and rate variation)\n"); + } + fprintf(ofp, "\nStandard errors (S.E.) are obtained by the curvature method.\n"); + fprintf(ofp, "The upper and lower bounds of an approximate 95%% confidence interval\n"); + fprintf(ofp, "for parameter or branch length x are x-1.96*S.E. 
and x+1.96*S.E.\n"); + fprintf(ofp, "\n\n"); + fprintf(ofp, "SEQUENCE ALIGNMENT\n\n"); + fprintf(ofp, "Input data: %d sequences with %d ", Maxspc, Maxsite); + if (data_optn == AMINOACID) + fprintf(ofp, "amino acid"); + else if (data_optn == NUCLEOTIDE && SH_optn) + fprintf(ofp, "doublet (%d nucleotide)", Maxsite*2); + else if (data_optn == NUCLEOTIDE && nuc_optn) + fprintf(ofp, "nucleotide"); + else if (data_optn == BINARY) + fprintf(ofp, "binary state"); + fprintf(ofp, " sites"); + if (data_optn == NUCLEOTIDE && (Maxseqc % 3) == 0 && !SH_optn) { + if (codon_optn == 1) fprintf(ofp, " (1st codon positions)"); + if (codon_optn == 2) fprintf(ofp, " (2nd codon positions)"); + if (codon_optn == 3) fprintf(ofp, " (3rd codon positions)"); + if (codon_optn == 4) fprintf(ofp, " (1st and 2nd codon positions)"); + } + if (data_optn == NUCLEOTIDE && SH_optn) { + if (SHcodon) + fprintf(ofp, " (1st and 2nd codon positions)"); + else + fprintf(ofp, " (1st+2nd, 3rd+4th, etc. site)"); + } + fprintf(ofp, "\n"); + fprintf(ofp, "Number of constant sites: %d (= %.1f%% of all sites)\n", + Numconst, 100.0*fracconst); + fprintf(ofp, "Number of site patterns: %d\n", + Numptrn); + fprintf(ofp, "Number of constant site patterns: %d (= %.1f%% of all site patterns)\n\n\n", + Numconstpat, 100.0*fracconstpat); + fprintf(ofp, "SUBSTITUTION PROCESS\n\n"); + fprintf(ofp, "Model of substitution: "); + if (data_optn == NUCLEOTIDE) { /* nucleotides */ + if (nuc_optn) { + if(HKY_optn) fprintf(ofp, "HKY (Hasegawa et al. 1985)\n"); + else fprintf(ofp, "TN (Tamura-Nei 1993)\n"); + fprintf(ofp, "Transition/transversion parameter"); + if (optim_optn) + fprintf(ofp, " (estimated from data set)"); + fprintf(ofp, ": %.2f", TSparam); + if (optim_optn) + fprintf(ofp, " (S.E. 
%.2f)", tserr); + fprintf(ofp, "\n"); + + if (optim_optn && TSparam > MAXTS - 1.0) + fprintf(ofp, "WARNING --- parameter estimate close to internal upper bound!\n"); + if (optim_optn && TSparam < MINTS + 0.1) + fprintf(ofp, "WARNING --- parameter estimate close to internal lower bound!\n"); + + if (TN_optn) { + fprintf(ofp, "Y/R transition parameter"); + if (optim_optn) + fprintf(ofp, " (estimated from data set)"); + fprintf(ofp, ": %.2f", YRparam); + if (optim_optn) + fprintf(ofp, " (S.E. %.2f)", yrerr); + fprintf(ofp, "\n"); + + if (optim_optn && YRparam > MAXYR - 0.5) + fprintf(ofp, "WARNING --- parameter estimate close to internal upper bound!\n"); + if (optim_optn && YRparam < MINYR + 0.1) + fprintf(ofp, "WARNING --- parameter estimate close to internal lower bound!\n"); + + } + } + if (SH_optn) { + fprintf(ofp, "SH (Schoeniger-von Haeseler 1994)\n"); + fprintf(ofp, "Transition/transversion parameter"); + if (optim_optn) fprintf(ofp, " (estimated from data set)"); + fprintf(ofp, ": %.2f\n", TSparam); + if (optim_optn) + fprintf(ofp, " (S.E. %.2f)", tserr); + fprintf(ofp, "\n"); + + if (optim_optn && TSparam > MAXTS - 1.0) + fprintf(ofp, "WARNING --- parameter estimate close to internal upper bound!\n"); + if (optim_optn && TSparam < MINTS + 0.1) + fprintf(ofp, "WARNING --- parameter estimate close to internal lower bound!\n"); + + } + } + if (data_optn == AMINOACID) { /* amino acids */ + if (Dayhf_optn) fprintf(ofp, "Dayhoff (Dayhoff et al. 1978)\n"); + if (Jtt_optn) fprintf(ofp, "JTT (Jones et al. 1992)\n"); + if (mtrev_optn) fprintf(ofp, "mtREV24 (Adachi-Hasegawa 1996)\n"); + if (cprev_optn) fprintf(ofp, "cpREV45 (Adachi et al. 
2000)\n"); + if (blosum62_optn) fprintf(ofp, "BLOSUM 62 (Henikoff-Henikoff 1992)\n"); + if (vtmv_optn) fprintf(ofp, "VT (Mueller-Vingron 2000)\n"); + if (wag_optn) fprintf(ofp, "WAG (Whelan-Goldman 2000)\n"); + } + if (data_optn == BINARY) { /* binary states */ + fprintf(ofp, "Two-state model (Felsenstein 1981)\n"); + } + if (data_optn == AMINOACID) + fprintf(ofp, "Amino acid "); + else if (data_optn == NUCLEOTIDE && SH_optn) + fprintf(ofp, "Doublet "); + else if (data_optn == NUCLEOTIDE && nuc_optn) + fprintf(ofp, "Nucleotide "); + else if (data_optn == BINARY) + fprintf(ofp, "Binary state "); + fprintf(ofp, "frequencies ("); + if (Frequ_optn) fprintf(ofp, "estimated from data set"); + else fprintf(ofp, "user specified"); + if (data_optn == NUCLEOTIDE && SH_optn && sym_optn) + fprintf(ofp, " and symmetrized"); + fprintf(ofp, "):\n\n"); + for (i = 0; i < gettpmradix(); i++) + fprintf(ofp, " pi(%s) = %5.1f%%\n", + int2code(i), Freqtpm[i]*100); + if (data_optn == NUCLEOTIDE) { + fprintf(ofp, "\nExpected transition/transversion ratio: %.2f", + tstvratio); + if (tstvf84 == 0.0) fprintf(ofp, "\n"); + else fprintf(ofp, " (= F84 parameter)\n"); + fprintf(ofp, "Expected pyrimidine transition/purine transition"); + fprintf(ofp, " ratio: %.2f\n", yrtsratio); + if (tstvf84 != 0.0 && TN_optn) + fprintf(ofp, + "This TN model is equivalent to a F84 model (Felsenstein 1984).\n"); + } + fprintf(ofp, "\n\nRATE HETEROGENEITY\n\n"); + fprintf(ofp, "Model of rate heterogeneity: "); + if (rhetmode == UNIFORMRATE) fprintf(ofp, "uniform rate\n"); + if (rhetmode == GAMMARATE ) fprintf(ofp, "Gamma distributed rates\n"); + if (rhetmode == TWORATE ) fprintf(ofp, "two rates (1 invariable + 1 variable)\n"); + if (rhetmode == MIXEDRATE ) fprintf(ofp, "mixed (1 invariable + %d Gamma rates)\n", numcats); + if (rhetmode == TWORATE || rhetmode == MIXEDRATE) { + fprintf(ofp, "Fraction of invariable sites"); + if (fracinv_optim) fprintf(ofp, " (estimated from data set)"); + fprintf(ofp, ": %.2f", 
fracinv); + if (fracinv_optim) fprintf(ofp, " (S.E. %.2f)", fierr); + fprintf(ofp, "\n"); + + if (fracinv_optim && fracinv > MAXFI - 0.05) + fprintf(ofp, "WARNING --- parameter estimate close to internal upper bound!\n"); + + fprintf(ofp, "Number of invariable sites: %.0f\n", floor(fracinv*Maxsite)); + } + if (rhetmode == GAMMARATE || rhetmode == MIXEDRATE) { + fprintf(ofp, "Gamma distribution parameter alpha"); + if (grate_optim) fprintf(ofp, " (estimated from data set)"); + fprintf(ofp, ": %.2f", (1.0-Geta)/Geta); + if (grate_optim) fprintf(ofp, " (S.E. %.2f)", + geerr/(Geta*Geta)); /* first order approximation */ + fprintf(ofp, "\n"); + + if (grate_optim && Geta > MAXGE - 0.02) + fprintf(ofp, "WARNING --- parameter estimate close to internal upper bound!\n"); + if (grate_optim && Geta < MINGE + 0.01) + fprintf(ofp, "WARNING --- parameter estimate close to internal lower bound!\n"); + + fprintf(ofp, "Number of Gamma rate categories: %d\n", numcats); + } + if (rhetmode == MIXEDRATE) { + fprintf(ofp, "Total rate heterogeneity (invariable sites + Gamma model): "); + fprintf(ofp, "%.2f", fracinv + Geta - fracinv*Geta); + if (grate_optim && fracinv_optim) + fprintf(ofp, " (S.E. %.2f)", geerr + fierr); /* first order approximation */ + else if (grate_optim && !fracinv_optim) + fprintf(ofp, " (S.E. %.2f)", geerr); + else if (!grate_optim && fracinv_optim) + fprintf(ofp, " (S.E. 
%.2f)", fierr); + fprintf(ofp, "\n"); + } + if (rhetmode != UNIFORMRATE) { + fprintf(ofp, "\nRates and their respective probabilities used in the likelihood function:\n"); + fprintf(ofp, "\n Category Relative rate Probability\n"); + if (rhetmode == TWORATE || rhetmode == MIXEDRATE) + fprintf(ofp, " 0 0.0000 %.4f\n", fracinv); + for (i = 0; i < numcats; i++) + fprintf(ofp, " %d %.4f %.4f\n", + i+1, Rates[i], (1.0-fracinv)/(double) numcats); + } + if (rhetmode == GAMMARATE || rhetmode == MIXEDRATE) { + fprintf(ofp, "\nCategories 1-%d approximate a continous ", numcats); + fprintf(ofp, "Gamma-distribution with expectation 1\n"); + fprintf(ofp, "and variance "); + if (Geta == 1.0) fprintf(ofp, "infinity"); + else fprintf(ofp, "%.2f", Geta/(1.0-Geta)); + fprintf(ofp, ".\n"); + } + + if (typ_optn == TREERECON_OPTN && (puzzlemode == QUARTPUZ || puzzlemode == USERTREE)) + if (rhetmode != UNIFORMRATE) { + fprintf(ofp, "\nCombination of categories that contributes"); + fprintf(ofp, " the most to the likelihood\n"); + fprintf(ofp, "(computation done without clock assumption assuming "); + if (puzzlemode == QUARTPUZ) fprintf(ofp, "quartet-puzzling tree"); + if (puzzlemode == USERTREE) { + if (utree_optn) fprintf(ofp, "1st user tree"); + else fprintf(ofp, "NJ tree"); + } + fprintf(ofp, "):\n\n"); + if (bestratefound==0) findbestratecombination(); + printbestratecombination(ofp); + } + + fprintf(ofp, "\n\nSEQUENCE COMPOSITION (SEQUENCES IN INPUT ORDER)\n\n"); + fail = FALSE; + fprintf(ofp, " 5%% chi-square test p-value\n"); + for (i = 0; i < Maxspc; i++) { + fprintf(ofp, " "); + fputid10(ofp, i); + pval = homogentest(i); + if ( pval < 0.05 ) fprintf(ofp, " failed "); + else fprintf(ofp, " passed "); + if (chi2fail) fail = TRUE; + fprintf(ofp, " %6.2f%% ", pval*100.0); + fprintf(ofp, "\n"); + } + fprintf(ofp, "\n"); + fprintf(ofp, "The chi-square tests compares the "); + if (data_optn == AMINOACID) + fprintf(ofp, "amino acid"); + else if (data_optn == NUCLEOTIDE && SH_optn) + 
fprintf(ofp, "doublet"); + else if (data_optn == NUCLEOTIDE && nuc_optn) + fprintf(ofp, "nucleotide"); + else if (data_optn == BINARY) + fprintf(ofp, "binary state"); + fprintf(ofp," composition of each sequence\n"); + fprintf(ofp, "to the frequency distribution assumed in the maximum likelihood model.\n"); + if (fail) { + fprintf(ofp, "\nWARNING: Result of chi-square test may not be valid"); + fprintf(ofp, " because of small\nmaximum likelihood frequencies and"); + fprintf(ofp, " short sequence length!\n"); + } + fprintf(ofp, "\n\nIDENTICAL SEQUENCES\n\n"); + fprintf(ofp, "The sequences in each of the following groups are all identical. To speed\n"); + fprintf(ofp, "up computation please remove all but one of each group from the data set.\n\n"); + findidenticals(ofp); + fprintf(ofp, "\n\nMAXIMUM LIKELIHOOD DISTANCES\n\n"); + fprintf(ofp, "Maximum likelihood distances are computed using the "); + fprintf(ofp, "selected model of\nsubstitution and rate heterogeneity.\n\n"); + putdistance(ofp); + fprintf(ofp, "\nAverage distance (over all possible pairs of sequences): %.5f\n", + averagedist() / 100.0); + + + } /* if WRITEPARAMS) || WRITEALL */ + + if ((part == WRITEREST) || (part == WRITEALL)) { + + if (puzzlemode == QUARTPUZ &&typ_optn == TREERECON_OPTN) { + fprintf(ofp, "\n\nBAD QUARTET STATISTICS (SEQUENCES IN INPUT ORDER)\n\n"); + for (i = 0; i < Maxspc; i++) { + fprintf(ofp, " "); + fputid10(ofp, i); + if (badqs > 0) + fprintf(ofp, " [%lu] %6.2f%%\n", badtaxon[i], (double) (100 * badtaxon[i]) / (double) badqs); + else + fprintf(ofp, " [%lu]\n", badtaxon[i]); + } + fprintf(ofp, "\nThe number in square brackets indicates how often each sequence is\n"); + fprintf(ofp, "involved in one of the %lu completely unresolved quartets of the\n", badqs); + fprintf(ofp, "quartet puzzling tree search.\n"); + if (badqs > 0) + fprintf(ofp, "Additionally the according percentages are given.\n"); + } + + if (typ_optn == TREERECON_OPTN) { + + fprintf(ofp, "\n\nTREE SEARCH\n\n"); + 
if (puzzlemode == QUARTPUZ) { + fprintf(ofp, "Quartet puzzling is used to choose from the possible tree topologies\n"); + fprintf(ofp, "and to simultaneously infer support values for internal branches.\n\n"); + fprintf(ofp, "Number of puzzling steps: %lu\n", Numtrial); + fprintf(ofp, "Analysed quartets: %lu\n", Numquartets); + fprintf(ofp, "Unresolved quartets: %lu (= %.1f%%)\n", + badqs, (double) badqs / (double) Numquartets * 100); + fprintf(ofp, "\nQuartet trees are based on %s maximum likelihood values\n", + (approxqp ? "approximate" : "exact")); + fprintf(ofp, "using the selected model of substitution and rate heterogeneity.\n\n\n"); + } + if (puzzlemode == USERTREE) { + fprintf(ofp, "%d tree topologies were specified by the user.\n", numutrees); + } + if (puzzlemode == PAIRDIST) { + fprintf(ofp, "No tree search performed (maximum likelihood distances only).\n"); + } + + if (puzzlemode == QUARTPUZ) { + fprintf(ofp, "QUARTET PUZZLING TREE\n\n"); + fprintf(ofp, "Support for the internal branches of the unrooted quartet puzzling\n"); + fprintf(ofp, "tree topology is shown in percent.\n"); + if (consincluded == Maxspc - 3) + fprintf(ofp,"\nThis quartet puzzling tree is completely resolved.\n"); + else + fprintf(ofp,"\nThis quartet puzzling tree is not completely resolved!\n"); + fprintf(ofp, "\n\n"); + plotconsensustree(ofp); + fprintf(ofp, "\n\nQuartet puzzling tree (in CLUSTAL W notation):\n\n"); + writeconsensustree(ofp); + fprintf(ofp, "\n\nBIPARTITIONS\n\n"); + fprintf(ofp, "The following bipartitions occured at least once"); + fprintf(ofp, " in all intermediate\ntrees that have been generated "); + fprintf(ofp, "in the %lu puzzling steps:\n\n", Numtrial); + fprintf(ofp, "Bipartitions included in the quartet puzzling tree:\n"); + fprintf(ofp, + "(bipartition with sequences in input order : number of times seen)\n\n"); + for (li = 0; li < consincluded; li++) { + fprintf(ofp, " "); + printsplit(ofp, splitfreqs[2*li+1]); + fprintf(ofp, " : %lu\n", 
splitfreqs[2*li]); + } + if (consincluded == 0) fprintf(ofp, " None (no bipartition included)\n"); + fprintf(ofp, "\nBipartitions not included in the quartet puzzling tree:\n"); + fprintf(ofp, + "(bipartition with sequences in input order : number of times seen)\n\n"); + + if (consincluded == numbiparts) { + fprintf(ofp, " None (all bipartitions are included)\n"); + } else { + /* print first 20 bipartions not included */ + for (li = consincluded; (li < numbiparts) && (li < consincluded + 20UL); li++) { + fprintf(ofp, " "); + printsplit(ofp, splitfreqs[2*li+1]); + fprintf(ofp, " : %lu\n", splitfreqs[2*li]); + } + if ((li == consincluded + 20UL) && (li != numbiparts)) + fprintf(ofp, "\n(%lu other less frequent bipartitions not shown)\n", + numbiparts - consincluded - 20UL); + } + fprintfsortedpstrees(ofp, psteptreelist, psteptreenum, psteptreesum, 0, 5.0); + } + + if (puzzlemode == QUARTPUZ) { + fprintf(ofp, "\n\nMAXIMUM LIKELIHOOD BRANCH LENGTHS ON QUARTET"); + fprintf(ofp, " PUZZLING TREE (NO CLOCK)\n\nBranch lengths are computed using"); + fprintf(ofp, " the selected model of\nsubstitution and rate heterogeneity.\n\n\n"); + clockmode = 0; /* nonclocklike branch lengths */ + prtopology(ofp); + fprintf(ofp, "\n"); + resulttree(ofp); + fprintf(ofp, "\n\nQuartet puzzling tree with maximum likelihood branch lengths"); + fprintf(ofp, "\n(in CLUSTAL W notation):\n\n"); + fputphylogeny(ofp); + if (compclock) { + fprintf(ofp, "\n\nMAXIMUM LIKELIHOOD BRANCH LENGTHS OF QUARTET"); + fprintf(ofp, " PUZZLING TREE (WITH CLOCK)\n\nBranch lengths are computed using"); + fprintf(ofp, " the selected model of\nsubstitution and rate heterogeneity.\n"); + fprintf(ofp, "\nRoot located at branch: %d ", locroot+1); + if (rootsearch == 0) fprintf(ofp, "(user specified)\n\n\n"); + if (rootsearch == 1) { + fprintf(ofp, "(automatic search)"); + if (numbestroot > 1) fprintf(ofp, "- WARNING: %d best locations found! 
-", numbestroot); + fprintf(ofp, "\n\n"); + fprintf(ofp, "If the automatic search misplaces the root please rerun the analysis\n"); + fprintf(ofp, "(rename \"outtree\" to \"intree\") and select location of root manually!"); + fprintf(ofp, "\n\n\n"); + } + if (rootsearch == 2) fprintf(ofp, "(displayed outgroup)\n\n\n"); + clockmode = 1; /* clocklike branch lengths */ + prtopology(ofp); + fprintf(ofp, "\n"); + fprintf(ofp, "\nTree drawn as unrooted tree for better "); + fprintf(ofp, "comparison with non-clock tree!\n"); + resulttree(ofp); + fprintf(ofp, "\n"); + resultheights(ofp); + fprintf(ofp, "\n\nRooted quartet puzzling tree with clocklike"); + fprintf(ofp, " maximum likelihood branch lengths\n"); + fprintf(ofp, "(in CLUSTAL W notation):\n\n"); + fputrooted(ofp, locroot); + } + + if (compclock) { + fprintf(ofp, "\n\nMOLECULAR CLOCK LIKELIHOOD RATIO TEST\n\n"); + fprintf(ofp, "log L without clock: %.2f (independent branch parameters: %d)\n", + Ctree->lklhd, Numspc + Numibrnch); + fprintf(ofp, "log L with clock: %.2f (independent branch parameters: %d)\n\n", + Ctree->lklhdc, Numhts + 1); + delta = 2.0*((Ctree->lklhd) - (Ctree->lklhdc)); + fprintf(ofp, "Likelihood ratio test statistic delta: %.2f\n", delta); + df = Numspc + Numibrnch - Numhts - 1; + fprintf(ofp, "Degress of freedom of chi-square distribution: %d\n", df); + + pval = IncompleteGammaQ(df*0.5, delta*0.5); + + fprintf(ofp, "Critical significance level: %.2f%%\n\n", pval*100.0); + if (pval >= 0.05) { + fprintf(ofp, "The simpler (clocklike) tree can not be rejected on a significance\n"); + fprintf(ofp, "level of 5%%. The log-likelihood of the more complex (no clock) tree\n"); + fprintf(ofp, "is not significantly increased.\n"); + } else { + fprintf(ofp, "The simpler (clocklike) tree is rejected on a significance level\n"); + fprintf(ofp, "of 5%%. 
The log-likelihood of the more complex (no clock) tree is\n"); + fprintf(ofp, "significantly increased.\n"); + } + fprintf(ofp, "\nPlease take care that the correct root is used!\n"); + } + + } + } + + if (typ_optn == LIKMAPING_OPTN) { + + fprintf(ofp, "\n\nLIKELIHOOD MAPPING ANALYSIS\n\n"); + fprintf(ofp, "Number of quartets: %lu", Numquartets); + if (lmqts == 0) fprintf(ofp, " (all possible)\n"); + else fprintf(ofp, " (random choice)\n"); + fprintf(ofp, "\nQuartet trees are based on approximate maximum likelihood values\n"); + fprintf(ofp, "using the selected model of substitution and rate heterogeneity.\n\n\n"); + if (numclust == 1) { + fprintf(ofp, "Sequences are not grouped in clusters.\n"); + } else { + fprintf(ofp, "Sequences are grouped in %d clusters.\n", numclust); + fprintf(ofp, "\nCluster a: %d sequences\n\n", clustA); + for (i = 0; i < clustA; i++) { + fprintf(ofp, " "); + fputid(ofp, clusterA[i]); + fprintf(ofp, "\n"); + } + fprintf(ofp, "\nCluster b: %d sequences\n\n", clustB); + for (i = 0; i < clustB; i++) { + fprintf(ofp, " "); + fputid(ofp, clusterB[i]); + fprintf(ofp, "\n"); + } + if (numclust > 2) { + fprintf(ofp, "\nCluster c: %d sequences\n\n", clustC); + for (i = 0; i < clustC; i++) { + fprintf(ofp, " "); + fputid(ofp, clusterC[i]); + fprintf(ofp, "\n"); + } + } + if (numclust == 4) { + fprintf(ofp, "\nCluster d: %d sequences\n\n", clustD); + for (i = 0; i < clustD; i++) { + fprintf(ofp, " "); + fputid(ofp, clusterD[i]); + fprintf(ofp, "\n"); + } + } + fprintf(ofp, "\nQuartets of sequences used in the likelihood"); + fprintf(ofp, " mapping analysis are generated\n"); + if (numclust == 2) + fprintf(ofp, "by drawing two sequences from cluster a and two from cluster b."); + if (numclust == 3) + fprintf(ofp, "by drawing one sequence from clusters a and b and two from cluster c."); + if (numclust == 4) + fprintf(ofp, "by drawing one sequence from each of the clusters a, b, c, and d."); + } + + fprintf(ofp, "\n\nLIKELIHOOD MAPPING 
STATISTICS\n\n"); + fprintf(ofp, "Occupancies of the three areas 1, 2, 3:\n\n"); + if (numclust == 4) + fprintf(ofp, " (a,b)-(c,d)\n"); + if (numclust == 3) + fprintf(ofp, " (a,b)-(c,c)\n"); + if (numclust == 2) + fprintf(ofp, " (a,a)-(b,b)\n"); + fprintf(ofp, " /\\\n"); + fprintf(ofp, " / \\\n"); + fprintf(ofp, " / \\\n"); + fprintf(ofp, " / 1 \\\n"); + fprintf(ofp, " / \\ / \\\n"); + fprintf(ofp, " / \\ / \\\n"); + fprintf(ofp, " / \\/ \\\n"); + fprintf(ofp, " / 3 : 2 \\\n"); + fprintf(ofp, " / : \\\n"); + fprintf(ofp, " /__________________\\\n"); + if (numclust == 4) + fprintf(ofp, " (a,d)-(b,c) (a,c)-(b,d)\n"); + if (numclust == 3) + fprintf(ofp, " (a,c)-(b,c) (a,c)-(b,c)\n"); + if (numclust == 2) + fprintf(ofp, " (a,b)-(a,b) (a,b)-(a,b)\n"); + fprintf(ofp, "\n"); + fprintf(ofp, "Number of quartets in region 1: %lu (= %.1f%%)\n", + ar1, (double) ar1*100.0/Numquartets); + fprintf(ofp, "Number of quartets in region 2: %lu (= %.1f%%)\n", + ar2, (double) ar2*100.0/Numquartets); + fprintf(ofp, "Number of quartets in region 3: %lu (= %.1f%%)\n\n", + ar3, (double) ar3*100.0/Numquartets); + fprintf(ofp, "Occupancies of the seven areas 1, 2, 3, 4, 5, 6, 7:\n\n"); + if (numclust == 4) + fprintf(ofp, " (a,b)-(c,d)\n"); + if (numclust == 3) + fprintf(ofp, " (a,b)-(c,c)\n"); + if (numclust == 2) + fprintf(ofp, " (a,a)-(b,b)\n"); + fprintf(ofp, " /\\\n"); + fprintf(ofp, " / \\\n"); + fprintf(ofp, " / 1 \\\n"); + fprintf(ofp, " / \\ / \\\n"); + fprintf(ofp, " / /\\ \\\n"); + fprintf(ofp, " / 6 / \\ 4 \\\n"); + fprintf(ofp, " / / 7 \\ \\\n"); + fprintf(ofp, " / \\ /______\\ / \\\n"); + fprintf(ofp, " / 3 : 5 : 2 \\\n"); + fprintf(ofp, " /__________________\\\n"); + if (numclust == 4) + fprintf(ofp, " (a,d)-(b,c) (a,c)-(b,d)\n"); + if (numclust == 3) + fprintf(ofp, " (a,c)-(b,c) (a,c)-(b,c)\n"); + if (numclust == 2) + fprintf(ofp, " (a,b)-(a,b) (a,b)-(a,b)\n"); + fprintf(ofp, "\n"); + fprintf(ofp, "Number of quartets in region 1: %lu (= %.1f%%) left: %lu right: %lu\n", + reg1, 
(double) reg1*100.0/Numquartets, reg1l, reg1r); + fprintf(ofp, "Number of quartets in region 2: %lu (= %.1f%%) bottom: %lu top: %lu\n", + reg2, (double) reg2*100.0/Numquartets, reg2d, reg2u); + fprintf(ofp, "Number of quartets in region 3: %lu (= %.1f%%) bottom: %lu top: %lu\n", + reg3, (double) reg3*100.0/Numquartets, reg3d, reg3u); + fprintf(ofp, "Number of quartets in region 4: %lu (= %.1f%%) bottom: %lu top: %lu\n", + reg4, (double) reg4*100.0/Numquartets, reg4d, reg4u); + fprintf(ofp, "Number of quartets in region 5: %lu (= %.1f%%) left: %lu right: %lu\n", + reg5, (double) reg5*100.0/Numquartets, reg5l, reg5r); + fprintf(ofp, "Number of quartets in region 6: %lu (= %.1f%%) bottom: %lu top: %lu\n", + reg6, (double) reg6*100.0/Numquartets, reg6d, reg6u); + fprintf(ofp, "Number of quartets in region 7: %lu (= %.1f%%)\n", + reg7, (double) reg7*100.0/Numquartets); + } + + } /* if WRITEREST) || WRITEALL */ +} + + +#if PARALLEL +void writetimesstat(FILE *ofp) +{ + int n; + double cpusum = 0.0; + double wallmax = 0.0; + cputimes[0] = ((double)(cputimestop - cputimestart) / CLOCKS_PER_SEC); + walltimes[0] = difftime(walltimestop, walltimestart); + fullcpu = tarr.fullcpu; + fulltime = tarr.fulltime; + fullcputimes[0] = tarr.fullcpu; + fullwalltimes[0] = tarr.fulltime; + altcputimes[0] = tarr.cpu; + altwalltimes[0] = tarr.time; + fprintf(ofp, "\n\n\nPARALLEL LOAD STATISTICS\n\n"); + + fprintf(ofp, "The analysis was performed with %d parallel processes (1 master and \n", PP_NumProcs); + fprintf(ofp, "%d worker processes).\n\n", PP_NumProcs-1); + fprintf(ofp, "The following table the distribution of computation to the processes.\n"); + fprintf(ofp, "The first column gives the process number, where 0 is the master process.\n"); + fprintf(ofp, "The second and third column show the number of quartets computed (3 topologies \n"); + fprintf(ofp, "each) and the the number of scheduling blocks the came in. 
The last two columns \n"); + fprintf(ofp, "state the number of puzzling steps done by a process and number of scheduling \n"); + fprintf(ofp, "blocks.\n\n"); + fprintf(ofp, "process #quartets #chunks #puzzlings #chunks \n"); + fprintf(ofp, "-----------------------------------------------\n"); + for (n=0; n wallmax) wallmax=fullwalltimes[n]; + cpusum += fullcputimes[n]; + } /* for */ + fprintf(ofp, "----------------------------------------------------------------------------\n"); + fprintf(ofp, "Sum/Max: %11.1f %9.1f %9.1f | %11.1f %9.1f %9.1f \n", + cpusum, cpusum/60, cpusum/3600, wallmax, wallmax/60, wallmax/3600); +#else /* TIMEDEBUG */ + fprintf(ofp, "\n\nBelow the distribution of computing times (wallclock) per host is shown.\n"); + fprintf(ofp, "The times are shown in seconds, minutes, and hours. At the bottom of the table the\n"); + fprintf(ofp, "the maximum wallclock times is shown.\n\n"); + fprintf(ofp, "process wallclock[s] [min] [hours] \n"); + fprintf(ofp, "----------------------------------------------------------------------------\n"); + for (n=0; n wallmax) wallmax=fullwalltimes[n]; + cpusum += fullcputimes[n]; + } /* for */ + fprintf(ofp, "----------------------------------------------------------------------------\n"); + fprintf(ofp, "Sum/Max: %11.1f %9.1f %9.1f \n", + wallmax, wallmax/60, wallmax/3600); +#endif /* TIMEDEBUG */ + + fullcpu = cpusum; + fulltime = wallmax; + +} /* writetimesstat */ +#endif + + +/* write current user tree to file */ +void writecutree(FILE *ofp, int num) +{ + int df; + double pval, delta; + + + if (typ_optn == TREERECON_OPTN) { + + if (puzzlemode == USERTREE) { + fprintf(ofp, "\n\nMAXIMUM LIKELIHOOD BRANCH LENGTHS OF USER"); + fprintf(ofp, " DEFINED TREE # %d (NO CLOCK)\n\nBranch lengths are computed using", num); + fprintf(ofp, " the selected model of\nsubstitution and rate heterogeneity.\n\n\n"); + clockmode = 0; /* nonclocklike branch lengths */ + prtopology(ofp); + fprintf(ofp, "\n"); + resulttree(ofp); + 
fprintf(ofp, "\n\nUnrooted user defined tree with maximum likelihood branch lengths"); + fprintf(ofp, "\n(in CLUSTAL W notation):\n\n"); + fputphylogeny(ofp); + if (compclock) { + fprintf(ofp, "\n\nMAXIMUM LIKELIHOOD BRANCH LENGTHS OF USER"); + fprintf(ofp, " DEFINED TREE # %d (WITH CLOCK)\n\nBranch lengths are computed using", num); + fprintf(ofp, " the selected model of\nsubstitution and rate heterogeneity.\n"); + fprintf(ofp, "\nRoot located at branch: %d ", locroot+1); + if (rootsearch == 0) fprintf(ofp, "(user specified)\n\n\n"); + if (rootsearch == 1) { + fprintf(ofp, "(automatic search)"); + if (numbestroot > 1) fprintf(ofp, "- WARNING: %d best locations found! -", numbestroot); + fprintf(ofp, "\n\n"); + fprintf(ofp, "If the automatic search misplaces the root please rerun the analysis\n"); + fprintf(ofp, "and select location of root manually!"); + fprintf(ofp, "\n\n\n"); + + } + if (rootsearch == 2) fprintf(ofp, "(displayed outgroup)\n\n\n"); + clockmode = 1; /* clocklike branch lengths */ + prtopology(ofp); + fprintf(ofp, "\n"); + resulttree(ofp); + fprintf(ofp, "\n"); + resultheights(ofp); + fprintf(ofp, "\n\nRooted user defined tree with clocklike "); + fprintf(ofp, "maximum likelihood branch lengths\n"); + fprintf(ofp, "(in CLUSTAL W notation):\n\n"); + fputrooted(ofp, locroot); + } + + if (compclock) { + fprintf(ofp, "\n\nMOLECULAR CLOCK LIKELIHOOD RATIO TEST FOR USER TREE # %d\n\n", num); + fprintf(ofp, "log L without clock: %.2f (independent branch parameters: %d)\n", + Ctree->lklhd, Numspc + Numibrnch); + fprintf(ofp, "log L with clock: %.2f (independent branch parameters: %d)\n\n", + Ctree->lklhdc, Numhts + 1); + delta = 2.0*((Ctree->lklhd) - (Ctree->lklhdc)); + fprintf(ofp, "Likelihood ratio test statistic delta: %.2f\n", delta); + df = Numspc + Numibrnch - Numhts - 1; + fprintf(ofp, "Degrees of freedom of chi-square distribution: %d\n", df); + + pval = IncompleteGammaQ (df*0.5, delta*0.5); + + fprintf(ofp, "Critical significance level: 
%.2f%%\n\n", pval*100.0); + if (pval >= 0.05) { + fprintf(ofp, "The simpler (clocklike) tree can not be rejected on a significance\n"); + fprintf(ofp, "level of 5%%. The log-likelihood of the more complex (no clock) tree\n"); + fprintf(ofp, "is not significantly increased.\n"); + } else { + fprintf(ofp, "The simpler (clocklike) tree is rejected on a significance level\n"); + fprintf(ofp, "of 5%%. The log-likelihood of the more complex (no clock) tree is\n"); + fprintf(ofp, "significantly increased.\n"); + } + fprintf(ofp, "\nPlease take care that the correct root is used!\n"); + } + } + } +} + + +/******************************************************************************/ +/* timer routines */ +/******************************************************************************/ + +/* start timer */ +void starttimer() +{ + time(&time0); + time1 = time0; +} + +/* check remaining time and print message if necessary */ +void checktimer(uli numqts) +{ + double tc2, mintogo, minutes, hours; + + time(&time2); + if ( (time2 - time1) > 900) { /* generate message every 15 minutes */ + /* every 900 seconds */ + /* percentage of completed quartets */ + if (mflag == 0) { + mflag = 1; + FPRINTF(STDOUTFILE "\n"); + } + tc2 = 100.*numqts/Numquartets; + mintogo = (100.0-tc2) * + (double) (time2-time0)/60.0/tc2; + hours = floor(mintogo/60.0); + minutes = mintogo - 60.0*hours; + FPRINTF(STDOUTFILE "%.2f%%", tc2); + FPRINTF(STDOUTFILE " completed (remaining"); + FPRINTF(STDOUTFILE " time: %.0f", hours); + FPRINTF(STDOUTFILE " hours %.0f", minutes); + FPRINTF(STDOUTFILE " minutes)\n"); + fflush(STDOUT); + time1 = time2; + } + +} + +/* check remaining time and print message if necessary */ +void checktimer2(uli numqts, uli all, int flag) +{ + double tc2, mintogo, minutes, hours; + + static time_t tt1; + static time_t tt2; + + if (flag == 1) { + time(&tt1); + time(&tt2); + } else { + time(&tt2); + if ( (tt2 - tt1) > 900) { /* generate message every 15 minutes */ + /* every 900 seconds */ 
+ /* percentage of completed quartets */ + if (mflag == 0) { + mflag = 1; + FPRINTF(STDOUTFILE "\n"); + } + tc2 = 100.*numqts/Numquartets; + mintogo = (100.0-tc2) * + (double) (tt2-time0)/60.0/tc2; + hours = floor(mintogo/60.0); + minutes = mintogo - 60.0*hours; + FPRINTF(STDOUTFILE "%.2f%%", tc2); + FPRINTF(STDOUTFILE " completed (remaining"); + FPRINTF(STDOUTFILE " time: %.0f", hours); + FPRINTF(STDOUTFILE " hours %.0f", minutes); + FPRINTF(STDOUTFILE " minutes)\n"); + fflush(STDOUT); + tt1 = tt2; + } + } +} + +void resetqblocktime(timearray_t *ta) +{ + ta->quartcpu += ta->quartblockcpu; + ta->quartblockcpu = 0.0; + ta->quarttime += ta->quartblocktime; + ta->quartblocktime = 0.0; +} /* resetqblocktime */ + + +void resetpblocktime(timearray_t *ta) +{ + ta->puzzcpu += ta->puzzblockcpu; + ta->puzzblockcpu = 0.0; + ta->puzztime += ta->puzzblocktime; + ta->puzzblocktime = 0.0; +} /* resetpblocktime */ + + +#ifdef TIMEDEBUG +void printtimearr(timearray_t *ta) +{ +# if ! PARALLEL + int PP_Myid; + PP_Myid = -1; +# endif + printf("(%2d) MMCPU: %11ld / %11ld \n", PP_Myid, ta->maxcpu, ta->mincpu); + printf("(%2d) CTick: %11.6f [tks] / %11.6f [s] \n", PP_Myid, ta->mincputick, ta->mincputicktime); + + printf("(%2d) MMTIM: %11ld / %11ld \n", PP_Myid, ta->maxtime, ta->mintime); + + printf("(%2d) Mxblk: %11.6e / %11.6e \n", PP_Myid, ta->maxcpublock, ta->maxtimeblock); + printf("(%2d) Mnblk: %11.6e / %11.6e \n", PP_Myid, ta->mincpublock, ta->mintimeblock); + + printf("(%2d) Gnrl: %11.6e / %11.6e \n", PP_Myid, ta->generalcpu, ta->generaltime); + printf("(%2d) Optn: %11.6e / %11.6e \n", PP_Myid, ta->optionscpu, ta->optionstime); + printf("(%2d) Estm: %11.6e / %11.6e \n", PP_Myid, ta->paramestcpu, ta->paramesttime); + printf("(%2d) Qurt: %11.6e / %11.6e \n", PP_Myid, ta->quartcpu, ta->quarttime); + printf("(%2d) QBlk: %11.6e / %11.6e \n", PP_Myid, ta->quartblockcpu, ta->quartblocktime); + printf("(%2d) QMax: %11.6e / %11.6e \n", PP_Myid, ta->quartmaxcpu, ta->quartmaxtime); + 
printf("(%2d) QMin: %11.6e / %11.6e \n", PP_Myid, ta->quartmincpu, ta->quartmintime); + + printf("(%2d) Puzz: %11.6e / %11.6e \n", PP_Myid, ta->puzzcpu, ta->puzztime); + printf("(%2d) PBlk: %11.6e / %11.6e \n", PP_Myid, ta->puzzblockcpu, ta->puzzblocktime); + printf("(%2d) PMax: %11.6e / %11.6e \n", PP_Myid, ta->puzzmaxcpu, ta->puzzmaxtime); + printf("(%2d) PMin: %11.6e / %11.6e \n", PP_Myid, ta->puzzmincpu, ta->puzzmintime); + + printf("(%2d) Tree: %11.6e / %11.6e \n", PP_Myid, ta->treecpu, ta->treetime); + printf("(%2d) TBlk: %11.6e / %11.6e \n", PP_Myid, ta->treeblockcpu, ta->treeblocktime); + printf("(%2d) TMax: %11.6e / %11.6e \n", PP_Myid, ta->treemaxcpu, ta->treemaxtime); + printf("(%2d) TMin: %11.6e / %11.6e \n", PP_Myid, ta->treemincpu, ta->treemintime); + + printf("(%2d) C/T : %11.6e / %11.6e \n", PP_Myid, + (ta->generalcpu + ta->optionscpu + ta->paramestcpu + ta->quartblockcpu + ta->puzzblockcpu + ta->treeblockcpu), + (ta->generaltime + ta->optionstime + ta->paramesttime + ta->quartblocktime + ta->puzzblocktime + ta->treeblocktime)); + printf("(%2d) CPU: %11.6e / Time: %11.6e \n", PP_Myid, ta->cpu, ta->time); + printf("(%2d) aCPU: %11.6e / aTime: %11.6e \n", PP_Myid, ta->fullcpu, ta->fulltime); + +} /* printtimearr */ +#endif /* TIMEDEBUG */ + +char *jtype [7]; + +void inittimearr(timearray_t *ta) +{ + clock_t c0, c1, c2; + + jtype[OVERALL] = "OVERALL"; + jtype[GENERAL] = "GENERAL"; + jtype[OPTIONS] = "OPTIONS"; + jtype[PARAMEST] = "PARAMeter ESTimation"; + jtype[QUARTETS] = "QUARTETS"; + jtype[PUZZLING] = "PUZZLING steps"; + jtype[TREEEVAL] = "TREE EVALuation"; + ta->currentjob = GENERAL; + + c1 = clock(); + c2 = clock(); + while (c1 == c2) + c2 = clock(); + ta->mincputick = (double)(c2 - c1); + ta->mincputicktime = ((double)(c2 - c1))/CLOCKS_PER_SEC; + + ta->tempcpu = clock(); + ta->tempcpustart = ta->tempcpu; + ta->tempfullcpu = ta->tempcpu; + time(&(ta->temptime)); + ta->temptimestart = ta->temptime; + ta->tempfulltime = ta->temptime; + + c0=0; c1=0; 
c2=(clock_t)((2 * c1) + 1);; + while (c1 < c2) { + c0 = c1; + c1 = c2; + c2 = (clock_t)((2 * c1) + 1); + } + if (c1 == c2) ta->maxcpu=c0; + if (c1 > c2) ta->maxcpu=c1; + + c0=0; c1=0; c2=(clock_t)((2 * c1) - 1); + while (c1 > c2) { + c0 = c1; + c1 = c2; + c2 = (clock_t)((2 * c1) - 1); + } + if (c1 == c2) ta->mincpu=c0; + if (c1 < c2) ta->mincpu=c1; + + + + ta->maxtime = 0; + ta->mintime = 0; + + ta->maxcpublock = 0; + ta->mincpublock = DBL_MAX; + ta->maxtimeblock = 0; + ta->mintimeblock = DBL_MAX; + + ta->cpu = 0.0; + ta->time = 0.0; + + ta->fullcpu = 0.0; + ta->fulltime = 0.0; + + ta->generalcpu = 0.0; + ta->optionscpu = 0.0; + ta->paramestcpu = 0.0; + ta->quartcpu = 0.0; + ta->quartblockcpu = 0.0; + ta->quartmaxcpu = 0.0; + ta->quartmincpu = ((double) ta->maxcpu)/CLOCKS_PER_SEC; + ta->puzzcpu = 0.0; + ta->puzzblockcpu = 0.0; + ta->puzzmaxcpu = 0.0; + ta->puzzmincpu = ((double) ta->maxcpu)/CLOCKS_PER_SEC; + ta->treecpu = 0.0; + ta->treeblockcpu = 0.0; + ta->treemaxcpu = 0.0; + ta->treemincpu = ((double) ta->maxcpu)/CLOCKS_PER_SEC; + + ta->generaltime = 0.0; + ta->optionstime = 0.0; + ta->paramesttime = 0.0; + ta->quarttime = 0.0; + ta->quartblocktime = 0.0; + ta->quartmaxtime = 0.0; + ta->quartmintime = DBL_MAX; + ta->puzztime = 0.0; + ta->puzzblocktime = 0.0; + ta->puzzmaxtime = 0.0; + ta->puzzmintime = DBL_MAX; + ta->treetime = 0.0; + ta->treeblocktime = 0.0; + ta->treemaxtime = 0.0; + ta->treemintime = DBL_MAX; +} /* inittimearr */ + + +/***************/ + +void addup(int jobtype, clock_t c1, clock_t c2, time_t t1, time_t t2, timearray_t *ta) +{ + double c, + t; + + if (t2 != t1) t = difftime(t2, t1); + else t = 0.0; + + if (c2 < c1) + c = ((double)(c2 - ta->mincpu))/CLOCKS_PER_SEC + + ((double)(ta->maxcpu - c1))/CLOCKS_PER_SEC; + else + c = ((double)(c2 - c1))/CLOCKS_PER_SEC; + + if (jobtype != OVERALL) { + + if (ta->mincpublock > c) ta->mincpublock = c; + if (ta->maxcpublock < c) ta->maxcpublock = c; + if (ta->mintimeblock > t) ta->mintimeblock = t; + if 
(ta->maxtimeblock < t) ta->maxtimeblock = t; + + switch (jobtype) { + case GENERAL: ta->generalcpu += c; + ta->generaltime += t; + break; + case OPTIONS: ta->optionscpu += c; + ta->optionstime += t; + break; + case PARAMEST: ta->paramestcpu += c; + ta->paramesttime += t; + break; + case QUARTETS: ta->quartblockcpu += c; + ta->quartblocktime += t; + if (ta->quartmincpu > c) ta->quartmincpu = c; + if (ta->quartmaxcpu < c) ta->quartmaxcpu = c; + if (ta->quartmintime > t) ta->quartmintime = t; + if (ta->quartmaxtime < t) ta->quartmaxtime = t; + break; + case PUZZLING: ta->puzzblockcpu += c; + ta->puzzblocktime += t; + if (ta->puzzmincpu > c) ta->puzzmincpu = c; + if (ta->puzzmaxcpu < c) ta->puzzmaxcpu = c; + if (ta->puzzmintime > t) ta->puzzmintime = t; + if (ta->puzzmaxtime < t) ta->puzzmaxtime = t; + break; + case TREEEVAL: ta->treeblockcpu += c; + ta->treeblocktime += t; + if (ta->treemincpu > c) ta->treemincpu = c; + if (ta->treemaxcpu < c) ta->treemaxcpu = c; + if (ta->treemintime > t) ta->treemintime = t; + if (ta->treemaxtime < t) ta->treemaxtime = t; + break; + } + ta->cpu += c; + ta->time += t; + + } else { + ta->fullcpu += c; + ta->fulltime += t; + } + +# ifdef TIMEDEBUG + { +# if ! 
PARALLEL + int PP_Myid = -1; +# endif /* !PARALLEL */ + printf("(%2d) CPU: +%10.6f / Time: +%10.6f (%s)\n", PP_Myid, c, t, jtype[jobtype]); + printf("(%2d) CPU: %11.6f / Time: %11.6f (%s)\n", PP_Myid, ta->cpu, ta->time, jtype[jobtype]); + printf("(%2d) CPU: %11.6f / Time: %11.6f (%s)\n", PP_Myid, ta->fullcpu, ta->fulltime, jtype[jobtype]); + } +# endif /* TIMEDEBUG */ +} /* addup */ + + +/***************/ + + +void addtimes(int jobtype, timearray_t *ta) +{ + clock_t tempc; + time_t tempt; + + time(&tempt); + tempc = clock(); + + if ((tempc < ta->tempfullcpu) || (jobtype == OVERALL)) { /* CPU counter overflow for overall time */ + addup(OVERALL, ta->tempfullcpu, tempc, ta->tempfulltime, tempt, ta); + ta->tempfullcpu = tempc; + ta->tempfulltime = tempt; + if (jobtype == OVERALL) { + addup(ta->currentjob, ta->tempcpustart, tempc, ta->temptimestart, tempt, ta); + ta->tempcpustart = ta->tempcpu; + ta->tempcpu = tempc; + ta->temptimestart = ta->temptime; + ta->temptime = tempt; + } + } + + if((jobtype != ta->currentjob) && (jobtype != OVERALL)) { /* change of job type */ + addup(ta->currentjob, ta->tempcpustart, ta->tempcpu, ta->temptimestart, ta->temptime, ta); + ta->tempcpustart = ta->tempcpu; + ta->tempcpu = tempc; + ta->temptimestart = ta->temptime; + ta->temptime = tempt; + ta->currentjob = jobtype; + } + + if (tempc < ta->tempcpustart) { /* CPU counter overflow */ + addup(jobtype, ta->tempcpustart, tempc, ta->temptimestart, tempt, ta); + ta->tempcpustart = ta->tempcpu; + ta->tempcpu = tempc; + ta->temptimestart = ta->temptime; + ta->temptime = tempt; + } + +} /* addtimes */ + + + +/******************************************************************************/ + +/* estimate parameters of substitution process and rate heterogeneity - no tree + n-taxon tree is not needed because of quartet method or NJ tree topology */ +void estimateparametersnotree() +{ + int it, nump, change; + double TSold, YRold, FIold, GEold; + + it = 0; + nump = 0; + + /* count number of 
parameters */ + if (data_optn == NUCLEOTIDE && optim_optn) nump++; + if (fracinv_optim || grate_optim) nump++; + + do { /* repeat until nothing changes any more */ + it++; + change = FALSE; + + /* optimize substitution parameters */ + if (data_optn == NUCLEOTIDE && optim_optn) { + + TSold = TSparam; + YRold = YRparam; + + + /* + * optimize + */ + + FPRINTF(STDOUTFILE "Optimizing missing substitution process parameters\n"); + fflush(STDOUT); + + if (qcalg_optn) { /* quartet sampling */ + optimseqevolparamsq(); + } else { /* NJ tree */ + tmpfp = tmpfile(); + njtree(tmpfp); + rewind(tmpfp); + readusertree(tmpfp); + closefile(tmpfp); + optimseqevolparamst(); + } + + computedistan(); /* update ML distances */ + + /* same tolerance as 1D minimization */ + if ((fabs(TSparam - TSold) > 3.3*PEPS1) || + (fabs(YRparam - YRold) > 3.3*PEPS1) + ) change = TRUE; + + } + + /* optimize rate heterogeneity variables */ + if (fracinv_optim || grate_optim) { + + FIold = fracinv; + GEold = Geta; + + + /* + * optimize + */ + + FPRINTF(STDOUTFILE "Optimizing missing rate heterogeneity parameters\n"); + fflush(STDOUT); + /* compute NJ tree */ + tmpfp = tmpfile(); + njtree(tmpfp); + /* use NJ tree topology to estimate parameters */ + rewind(tmpfp); + readusertree(tmpfp); + closefile(tmpfp); + + optimrateparams(); + computedistan(); /* update ML distances */ + + + /* same tolerance as 1D minimization */ + if ((fabs(fracinv - FIold) > 3.3*PEPS2) || + (fabs(Geta - GEold) > 3.3*PEPS2) + ) change = TRUE; + + } + + if (nump == 1) return; + + } while (it != MAXITS && change); + + return; +} + + +/* estimate parameters of substitution process and rate heterogeneity - tree + same as above but here the n-taxon tree is already in memory */ +void estimateparameterstree() +{ + int it, nump, change; + double TSold, YRold, FIold, GEold; + + it = 0; + nump = 0; + + /* count number of parameters */ + if (data_optn == NUCLEOTIDE && optim_optn) nump++; + if (fracinv_optim || grate_optim) nump++; + + do { /* 
repeat until nothing changes any more */ + it++; + change = FALSE; + + /* optimize substitution process parameters */ + if (data_optn == NUCLEOTIDE && optim_optn) { + + TSold = TSparam; + YRold = YRparam; + + + /* + * optimize + */ + + FPRINTF(STDOUTFILE "Optimizing missing substitution process parameters\n"); + fflush(STDOUT); + optimseqevolparamst(); + computedistan(); /* update ML distances */ + + + /* same tolerance as 1D minimization */ + if ((fabs(TSparam - TSold) > 3.3*PEPS1) || + (fabs(YRparam - YRold) > 3.3*PEPS1) + ) change = TRUE; + + } + + /* optimize rate heterogeneity variables */ + if (fracinv_optim || grate_optim) { + + FIold = fracinv; + GEold = Geta; + + + /* + * optimize + */ + + FPRINTF(STDOUTFILE "Optimizing missing rate heterogeneity parameters\n"); + fflush(STDOUT); + optimrateparams(); + computedistan(); /* update ML distances */ + + + /* same tolerance as 1D minimization */ + if ((fabs(fracinv - FIold) > 3.3*PEPS2) || + (fabs(Geta - GEold) > 3.3*PEPS2) + ) change = TRUE; + + } + + if (nump == 1) return; + + } while (it != MAXITS && change); + + return; +} + + +/******************************************************************************/ +/* exported from main */ +/******************************************************************************/ + +void compute_quartlklhds(int a, int b, int c, int d, double *d1, double *d2, double *d3, int approx) +{ + if (approx == APPROX) { + + *d1 = quartet_alklhd(a,b, c,d); /* (a,b)-(c,d) */ + *d2 = quartet_alklhd(a,c, b,d); /* (a,c)-(b,d) */ + *d3 = quartet_alklhd(a,d, b,c); /* (a,d)-(b,c) */ + + } else /* approx == EXACT */ { + + *d1 = quartet_lklhd(a,b, c,d); /* (a,b)-(c,d) */ + *d2 = quartet_lklhd(a,c, b,d); /* (a,c)-(b,d) */ + *d3 = quartet_lklhd(a,d, b,c); /* (a,d)-(b,c) */ + + } +} + +/***************************************************************/ + +void recon_tree() +{ + int i; +# if ! 
PARALLEL + int a, b, c; + uli nq; + double tc2, mintogo, minutes, hours; +# endif + + /* allocate memory for taxon list of bad quartets */ + badtaxon = new_ulivector(Maxspc); + for (i = 0; i < Maxspc; i++) badtaxon[i] = 0; + + /* allocate variable used for randomizing input order */ + trueID = new_ivector(Maxspc); + + /* allocate memory for quartets */ + quartetinfo = mallocquartets(Maxspc); + + /* prepare for consensus tree analysis */ + initconsensus(); + + if (!(readquart_optn) || (readquart_optn && savequart_optn)) { + /* compute quartets */ + FPRINTF(STDOUTFILE "Computing quartet maximum likelihood trees\n"); + fflush(STDOUT); + computeallquartets(); + } + + if (savequart_optn) + writeallquarts(Maxspc, ALLQUART, quartetinfo); + if (readquart_optn) { + int xx1, xx2, xx3, xx4, count; + readallquarts (Maxspc, ALLQUART, quartetinfo); + if (show_optn) { /* list all unresolved quartets */ + openfiletowrite(&unresfp, UNRESOLVED, "unresolved quartet trees"); + fprintf(unresfp, "List of all completely unresolved quartets:\n\n"); + } + + /* initialize bad quartet memory */ + for (count = 0; count < Maxspc; count++) badtaxon[count] = 0; + badqs = 0; + + for (xx4 = 3; xx4 < Maxspc; xx4++) + for (xx3 = 2; xx3 < xx4; xx3++) + for (xx2 = 1; xx2 < xx3; xx2++) + for (xx1 = 0; xx1 < xx2; xx1++) { + if (readquartet(xx1, xx2, xx3, xx4) == 7) { + badqs++; + badtaxon[xx1]++; + badtaxon[xx2]++; + badtaxon[xx3]++; + badtaxon[xx4]++; + if (show_optn) { + fputid10(unresfp, xx1); + fprintf(unresfp, " "); + fputid10(unresfp, xx2); + fprintf(unresfp, " "); + fputid10(unresfp, xx3); + fprintf(unresfp, " "); + fputid (unresfp, xx4); + fprintf(unresfp, "\n"); + } + } + } /* end for xx4; for xx3; for xx2; for xx1 */ + if (show_optn) /* list all unresolved quartets */ + fclose(unresfp); + } /* readquart_optn */ + +# if PARALLEL + PP_SendAllQuarts(numquarts(Maxspc), quartetinfo); +# endif /* PARALLEL */ + + FPRINTF(STDOUTFILE "Computing quartet puzzling tree\n"); + fflush(STDOUT); + + /* start 
timer - percentage of completed trees */ + time(&time0); + time1 = time0; + mflag = 0; + + /* open file for chronological list of puzzling step trees */ + if((listqptrees == PSTOUT_LIST) || (listqptrees == PSTOUT_LISTORDER)) + openfiletowrite(&qptlist, OUTPTLIST, "puzzling step trees (chonological)"); + +# if PARALLEL + { + PP_SendDoPermutBlock(Numtrial); + } +# else + addtimes(GENERAL, &tarr); + for (Currtrial = 0; Currtrial < Numtrial; Currtrial++) { + + /* randomize input order */ + chooser(Maxspc, Maxspc, trueID); + + /* initialize tree */ + inittree(); + + /* adding all other leafs */ + for (i = 3; i < Maxspc; i++) { + + /* clear all edgeinfos */ + resetedgeinfo(); + + /* clear counter of quartets */ + nq = 0; + + /* + * core of quartet puzzling algorithm + */ + + for (a = 0; a < nextleaf - 2; a++) + for (b = a + 1; b < nextleaf - 1; b++) + for (c = b + 1; c < nextleaf; c++) { + + /* check which two _leaves_ out of a, b, c + are closer related to each other than + to leaf i according to a least squares + fit of the continous Baysian weights to the + seven trivial "attractive regions". 
We assign + a score of 1 to all edges between these two leaves + chooseA and chooseB */ + + checkquartet(a, b, c, i); + incrementedgeinfo(chooseA, chooseB); + + nq++; + + /* generate message every 15 minutes */ + + /* check timer */ + time(&time2); + if ( (time2 - time1) > 900) { + /* every 900 seconds */ + /* percentage of completed trees */ + if (mflag == 0) { + FPRINTF(STDOUTFILE "\n"); + mflag = 1; + } + tc2 = 100.0*Currtrial/Numtrial + + 100.0*nq/Numquartets/Numtrial; + mintogo = (100.0-tc2) * + (double) (time2-time0)/60.0/tc2; + hours = floor(mintogo/60.0); + minutes = mintogo - 60.0*hours; + FPRINTF(STDOUTFILE "%2.2f%%", tc2); + FPRINTF(STDOUTFILE " completed (remaining"); + FPRINTF(STDOUTFILE " time: %.0f", hours); + FPRINTF(STDOUTFILE " hours %.0f", minutes); + FPRINTF(STDOUTFILE " minutes)\n"); + fflush(STDOUT); + time1 = time2; + } + } + + /* find out which edge has the lowest edgeinfo */ + minimumedgeinfo(); + + /* add the next leaf on minedge */ + addnextleaf(minedge); + } + + /* compute bipartitions of current tree */ + computebiparts(); + makenewsplitentries(); + + { + int *ctree, startnode; + char *trstr; + treelistitemtype *treeitem; + ctree = initctree(); + copytree(ctree); + startnode = sortctree(ctree); + trstr=sprintfctree(ctree, psteptreestrlen); + + + treeitem = addtree2list(&trstr, 1, &psteptreelist, &psteptreenum, &psteptreesum); + + if((listqptrees == PSTOUT_LIST) + || (listqptrees == PSTOUT_LISTORDER)) { + /* print: order no/# topol per this id/tree id/sum of unique topologies/sum of trees so far */ + fprintf(qptlist, "%ld.\t1\t%d\t%d\t%d\t%d\n", + Currtrial + 1, (*treeitem).count, (*treeitem).id, psteptreenum, psteptreesum); + } + +# ifdef VERBOSE1 + printf("%s\n", trstr); + printfsortedpstrees(psteptreelist); +# endif + freectree(&ctree); + } + + + + /* free tree before building the next tree */ + freetree(); + + addtimes(PUZZLING, &tarr); + } +# endif /* PARALLEL */ + + /* close file for list of puzzling step trees */ + if((listqptrees 
== PSTOUT_LIST) || (listqptrees == PSTOUT_LISTORDER)) + closefile(qptlist); + + if (mflag == 1) FPRINTF(STDOUTFILE "\n"); + + /* garbage collection */ + free(splitcomp); + free_ivector(trueID); + +# if ! PARALLEL + free_cmatrix(biparts); +# endif /* PARALLEL */ + + freequartets(); + + /* compute majority rule consensus tree */ + makeconsensus(); + + /* write consensus tree to tmp file */ + tmpfp = tmpfile(); + writeconsensustree(tmpfp); +} /* recon_tree */ + +/***************************************************************/ + +void map_lklhd() +{ + int i, a, a1, a2, b, b1, b2, c, c1, c2, d; + uli nq; + double logs[3], d1, d2, d3, temp; + ivector qts, mlorder, gettwo; + /* reset variables */ + ar1 = ar2 = ar3 = 0; + reg1 = reg2 = reg3 = reg4 = reg5 = reg6 = reg7 = 0; + reg1l = reg1r = reg2u = reg2d = reg3u = reg3d = reg4u = + reg4d = reg5l = reg5r = reg6u = reg6d = 0; + + /* place for random quartet */ + qts = new_ivector(4); + + /* initialize output file */ + openfiletowrite(&trifp, TRIANGLE, "Postscript output"); + initps(trifp); + FPRINTF(STDOUTFILE "Performing likelihood mapping analysis\n"); + fflush(STDOUT); + + /* start timer */ + starttimer(); + nq = 0; + mflag = 0; + + addtimes(GENERAL, &tarr); + if (lmqts == 0) { /* all possible quartets */ + + if (numclust == 4) { /* four-cluster analysis */ + + for (a = 0; a < clustA; a++) + for (b = 0; b < clustB; b++) + for (c = 0; c < clustC; c++) + for (d = 0; d < clustD; d++) { + + nq++; + + /* check timer */ + checktimer(nq); + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(clusterA[a],clusterB[b],clusterC[c],clusterD[d],&d1,&d2,&d3, APPROX); + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, d2, d3); + addtimes(QUARTETS, &tarr); + + } + } + + if (numclust == 3) { /* three-cluster analysis */ + + gettwo = new_ivector(2); + + for (a = 0; a < clustA; a++) + for (b = 0; b < clustB; b++) + for (c1 = 0; c1 < clustC-1; c1++) + for (c2 = c1+1; c2 < clustC; c2++) { + 
+ nq++; + + /* check timer */ + checktimer(nq); + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(clusterA[a],clusterB[b],clusterC[c1],clusterC[c2],&d1,&d2,&d3, APPROX); + + /* randomize order of d2 and d3 */ + if (randominteger(2) == 1) { + temp = d3; + d3 = d2; + d2 = temp; + } + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, d2, d3); + addtimes(QUARTETS, &tarr); + + } + free_ivector(gettwo); + } + + if (numclust == 2) { /* two-cluster analysis */ + + gettwo = new_ivector(2); + + for (a1 = 0; a1 < clustA-1; a1++) + for (a2 = a1+1; a2 < clustA; a2++) + for (b1 = 0; b1 < clustB-1; b1++) + for (b2 = b1+1; b2 < clustB; b2++) { + + nq++; + + /* check timer */ + checktimer(nq); + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(clusterA[a1],clusterA[a2],clusterB[b1],clusterB[b2],&d1,&d2,&d3, APPROX); + + /* randomize order of d2 and d3 */ + if (randominteger(2) == 1) { + temp = d3; + d3 = d2; + d2 = temp; + } + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, d2, d3); + addtimes(QUARTETS, &tarr); + + } + + free_ivector(gettwo); + } + + if (numclust == 1) { /* normal likelihood mapping (one cluster) */ + + mlorder = new_ivector(3); + +#if 0 + for (i = 3; i < Maxspc; i++) + for (a = 0; a < i - 2; a++) + for (b = a + 1; b < i - 1; b++) + for (c = b + 1; c < i; c++) + for (d = 3; d < Maxspc; d++) + for (c = 2; c < d; c++) + for (b = 1; b < c; b++) + for (a = 0; a < b; a++) +#endif + + for (i = 3; i < Maxspc; i++) + for (c = 2; c < i; c++) + for (b = 1; b < c; b++) + for (a = 0; a < b; a++) { + + nq++; + + /* check timer */ + checktimer(nq); + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(a,b,c,i,&logs[0],&logs[1],&logs[2], APPROX); + + /* randomize order */ + chooser(3,3,mlorder); + d1 = logs[mlorder[0]]; + d2 = logs[mlorder[1]]; + d3 = logs[mlorder[2]]; + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, 
d2, d3); + addtimes(QUARTETS, &tarr); + + } + free_ivector(mlorder); + } + + } else { /* randomly selected quartets */ + + if (numclust == 4) { /* four-cluster analysis */ + + for (lmqts = 0; lmqts < Numquartets; lmqts++) { + + nq++; + + /* check timer */ + checktimer(nq); + + /* choose random quartet */ + qts[0] = clusterA[ randominteger(clustA) ]; + qts[1] = clusterB[ randominteger(clustB) ]; + qts[2] = clusterC[ randominteger(clustC) ]; + qts[3] = clusterD[ randominteger(clustD) ]; + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(qts[0],qts[1],qts[2],qts[3],&d1,&d2,&d3, APPROX); + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, d2, d3); + addtimes(QUARTETS, &tarr); + + } + } + + if (numclust == 3) { /* three-cluster analysis */ + + gettwo = new_ivector(2); + + for (lmqts = 0; lmqts < Numquartets; lmqts++) { + + nq++; + + /* check timer */ + checktimer(nq); + + /* choose random quartet */ + qts[0] = clusterA[ randominteger(clustA) ]; + qts[1] = clusterB[ randominteger(clustB) ]; + chooser(clustC, 2, gettwo); + qts[2] = clusterC[gettwo[0]]; + qts[3] = clusterC[gettwo[1]]; + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(qts[0],qts[1],qts[2],qts[3],&d1,&d2,&d3, APPROX); + + /* order of d2 and d3 is already randomized! 
*/ + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, d2, d3); + addtimes(QUARTETS, &tarr); + + } + + free_ivector(gettwo); + } + + if (numclust == 2) { /* two-cluster analysis */ + + gettwo = new_ivector(2); + + for (lmqts = 0; lmqts < Numquartets; lmqts++) { + + nq++; + + /* check timer */ + checktimer(nq); + + /* choose random quartet */ + chooser(clustA, 2, gettwo); + qts[0] = clusterA[gettwo[0]]; + qts[1] = clusterA[gettwo[1]]; + chooser(clustB, 2, gettwo); + qts[2] = clusterB[gettwo[0]]; + qts[3] = clusterB[gettwo[1]]; + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(qts[0],qts[1],qts[2],qts[3],&d1,&d2,&d3, APPROX); + + /* order of d2 and d3 is already randomized! */ + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, d2, d3); + addtimes(QUARTETS, &tarr); + + } + free_ivector(gettwo); + } + + if (numclust == 1) { /* normal likelihood mapping (one cluster) */ + + for (lmqts = 0; lmqts < Numquartets; lmqts++) { + + nq++; + + /* check timer */ + checktimer(nq); + + /* choose random quartet */ + chooser(Maxspc, 4, qts); + + /* maximum likelihood values */ + /* approximate ML is sufficient */ + compute_quartlklhds(qts[0],qts[1],qts[2],qts[3],&d1,&d2,&d3, APPROX); + + /* order of d1, d2, and d3 is already randomized! 
*/ + + /* draw point for LM analysis */ + makelmpoint(trifp, d1, d2, d3); + addtimes(QUARTETS, &tarr); + + } + } + } + + finishps(trifp); + closefile(trifp); + free_ivector(qts); + +} /* map_lklhd */ + +/***************************************************************/ + +void setdefaults() { + + strcpy(INFILE, INFILEDEFAULT); + strcpy(OUTFILE, OUTFILEDEFAULT); + strcpy(TREEFILE, TREEFILEDEFAULT); + strcpy(INTREE, INTREEDEFAULT); + strcpy(DISTANCES, DISTANCESDEFAULT); + strcpy(TRIANGLE, TRIANGLEDEFAULT); + strcpy(UNRESOLVED, UNRESOLVEDDEFAULT); + strcpy(ALLQUART, ALLQUARTDEFAULT); + strcpy(ALLQUARTLH, ALLQUARTLHDEFAULT); + strcpy(OUTPTLIST, OUTPTLISTDEFAULT); + strcpy(OUTPTORDER, OUTPTORDERDEFAULT); + + usebestq_optn = FALSE; + savequartlh_optn = FALSE; + savequart_optn = FALSE; + readquart_optn = FALSE; + + randseed = -1; /* to set random random seed */ + +} /* setdefaults */ + +/***************************************************************/ + +void printversion() +{ +# if ! PARALLEL + fprintf(stderr, "puzzle (%s) %s\n", PACKAGE, VERSION); +#else + fprintf(stderr, "ppuzzle (%s) %s\n", PACKAGE, VERSION); +# endif + exit (0); +} +/***************************************************************/ + +void printusage(char *fname) +{ + fprintf(stderr, "\n\nUsage: %s [-h] [ Infilename [ UserTreeFilename ] ]\n\n", fname); +# if PARALLEL + PP_SendDone(); + MPI_Finalize(); +# endif + exit (1); +} + +/***************************************************************/ + +#ifdef HHH +void printusagehhh(char *fname) +{ + fprintf(stderr, "\n\nUsage: %s [options] [ Infilename [ UserTreeFilename ] ]\n\n", fname); + fprintf(stderr, " -h - print usage\n"); + fprintf(stderr, " -wqf - write quartet file to Infilename.allquart\n"); + fprintf(stderr, " -rqf - read quartet file from Infilename.allquart\n"); + fprintf(stderr, " -wqlb - write quart lhs to Infilename.allquartlh (binary)\n"); + fprintf(stderr, " -wqla - write quart lhs to Infilename.allquartlh (ASCII)\n"); + fprintf(stderr, " 
-bestq - use best quart, no basian weights\n"); + fprintf(stderr, " -randseed<#> - use <#> as random number seed, for debug purposes only\n"); +# if PARALLEL + PP_SendDone(); + MPI_Finalize(); +# endif + exit (2); +} +#endif /* HHH */ + +/***************************************************************/ + + +void scancmdline(int *argc, char **argv[]) +{ + static short infileset = 0; + static short intreefileset = 0; + short flagused; + int n; + int count, dummyint; + + for (n = 1; n < *argc; n++) { +# ifdef VERBOSE1 + printf("argv[%d] = %s\n", n, (*argv)[n]); +# endif + + flagused = FALSE; + +# ifdef HHH + dummyint = 0; + count = sscanf((*argv)[n], "-wqlb%n", &dummyint); + if (dummyint == 5) { + savequartlh_optn = TRUE; + saveqlhbin_optn = TRUE; + flagused = TRUE; + } + + dummyint = 0; + count = sscanf((*argv)[n], "-wqla%n", &dummyint); + if (dummyint == 5) { + savequartlh_optn = TRUE; + saveqlhbin_optn = FALSE; + flagused = TRUE; + } + + dummyint = 0; + count = sscanf((*argv)[n], "-wqf%n", &dummyint); + if (dummyint == 4) { + savequart_optn = TRUE; + flagused = TRUE; + } + + dummyint = 0; + count = sscanf((*argv)[n],"-rqf%n", &dummyint); + if (dummyint == 4) { + readquart_optn = TRUE; + flagused = TRUE; + } + + dummyint = 0; + count = sscanf((*argv)[n],"-bestq%n", &dummyint); + if (dummyint == 6) { + usebestq_optn = TRUE; + flagused = TRUE; + } + + dummyint = 0; + count = sscanf((*argv)[n],"-hhh%n", &dummyint); + if (dummyint==4) { + printusagehhh((*argv)[0]); + flagused = TRUE; + } +# endif /* HHH */ + + dummyint = 0; + count = sscanf((*argv)[n],"-V%n", &dummyint); + if (dummyint==2) { + printversion((*argv)[0]); + flagused = TRUE; + } + + dummyint = 0; + count = sscanf((*argv)[n],"-version%n", &dummyint); + if (dummyint==8) { + printversion((*argv)[0]); + flagused = TRUE; + } + + dummyint = 0; + count = sscanf((*argv)[n],"--version%n", &dummyint); + if (dummyint>=4) { + printversion((*argv)[0]); + flagused = TRUE; + } + + dummyint = 0; + count = 
sscanf((*argv)[n],"-h%n", &dummyint); + if (dummyint==2) { + printusage((*argv)[0]); + flagused = TRUE; + } + + count = sscanf((*argv)[n],"-randseed%d", &dummyint); + if (count == 1) { + randseed = dummyint; + flagused = TRUE; + } + +#if 0 + count = sscanf((*argv)[n],"-h%n", &dummyint); + if ((count == 1) && (dummyint>=2)) printusage((*argv)[0]); + + count = sscanf((*argv)[n],"-writequarts%n", &dummyint); + if (count == 1) writequartstofile = 1;; + + count = sscanf((*argv)[n],"-ws%d", &dummyint); + if (count == 1) windowsize = dummyint; +#endif + + if ((*argv)[n][0] != '-') { + if (infileset == 0) { + strcpy(INFILE, (*argv)[n]); + infileset++; + sprintf(OUTFILE ,"%s.%s", INFILE, OUTFILEEXT); + sprintf(TREEFILE ,"%s.%s", INFILE, TREEFILEEXT); + sprintf(DISTANCES ,"%s.%s", INFILE, DISTANCESEXT); + sprintf(TRIANGLE ,"%s.%s", INFILE, TRIANGLEEXT); + sprintf(UNRESOLVED ,"%s.%s", INFILE, UNRESOLVEDEXT); + sprintf(ALLQUART ,"%s.%s", INFILE, ALLQUARTEXT); + sprintf(ALLQUARTLH ,"%s.%s", INFILE, ALLQUARTLHEXT); + sprintf(OUTPTLIST ,"%s.%s", INFILE, OUTPTLISTEXT); + sprintf(OUTPTORDER ,"%s.%s", INFILE, OUTPTORDEREXT); + FPRINTF(STDOUTFILE "Input file: %s\n", INFILE); + flagused = TRUE; + } else { + if (intreefileset == 0) { + strcpy(INTREE, (*argv)[n]); + intreefileset++; + sprintf(OUTFILE ,"%s.%s", INTREE, OUTFILEEXT); + sprintf(TREEFILE ,"%s.%s", INTREE, TREEFILEEXT); + sprintf(DISTANCES ,"%s.%s", INTREE, DISTANCESEXT); + FPRINTF(STDOUTFILE "Usertree file: %s\n", INTREE); + flagused = TRUE; + } + } + } + if (flagused == FALSE) { + fprintf(stderr, "WARNING: commandline parameter %d not recognized (\"%s\")\n", n, (*argv)[n]); + } + flagused = FALSE; + } + +} /* scancmdline */ + + +/***************************************************************/ + +void inputandinit(int *argc, char **argv[]) { + + int ci; + + /* vectors used in QP and LM analysis */ + qweight = new_dvector(3); + sqdiff = new_dvector(3); + qworder = new_ivector(3); + sqorder = new_ivector(3); + + /* 
Initialization and parsing of Commandline */ + setdefaults(); + scancmdline(argc, argv); + + /* initialize random numbers generator */ + if (randseed >= 0) + fprintf(stderr, "WARNING: random seed set to %d for debugging!\n", randseed); + randseed = initrandom(randseed); + + psteptreelist = NULL; + psteptreesum = 0; + bestratefound = 0; + +# ifndef ALPHA + FPRINTF(STDOUTFILE "\n\n\nWELCOME TO TREE-PUZZLE %s!\n\n\n", VERSION); +# else + FPRINTF(STDOUTFILE "\n\n\nWELCOME TO TREE-PUZZLE %s%s!\n\n\n", VERSION, ALPHA); +# endif + + + /* get sequences */ + openfiletoread(&seqfp, INFILE, "sequence data"); + getsizesites(seqfp); + FPRINTF(STDOUTFILE "\nInput data set contains %d sequences of length %d\n", Maxspc, Maxseqc); + getdataset(seqfp); + closefile(seqfp); + data_optn = guessdatatype(); + + /* translate characters into format used by ML engine */ + nuc_optn = TRUE; + SH_optn = FALSE; + Seqchar = NULL; + translatedataset(); + + /* estimate base frequencies from data set */ + Freqtpm = NULL; + Basecomp = NULL; + estimatebasefreqs(); + + /* guess model of substitution */ + guessmodel(); + + /* initialize guess variables */ + auto_datatype = AUTO_GUESS; + if (data_optn == AMINOACID) auto_aamodel = AUTO_GUESS; + else auto_aamodel = AUTO_DEFAULT; + /* save guessed amino acid options */ + guessDayhf_optn = Dayhf_optn; + guessJtt_optn = Jtt_optn; + guessmtrev_optn = mtrev_optn; + guesscprev_optn = cprev_optn; + guessblosum62_optn = blosum62_optn; + guessvtmv_optn = vtmv_optn; + guesswag_optn = wag_optn; + guessauto_aamodel = auto_aamodel; + + + /* check for user specified tree */ + if ((utfp = fopen(INTREE, "r")) != NULL) { + fclose(utfp); + puzzlemode = USERTREE; + } else { + puzzlemode = QUARTPUZ; + } + + /* reserve memory for cluster LM analysis */ + clusterA = new_ivector(Maxspc); + clusterB = new_ivector(Maxspc); + clusterC = new_ivector(Maxspc); + clusterD = new_ivector(Maxspc); + + /* set options interactively */ + setoptions(); + + /* open usertree file right after 
start */ + if (typ_optn == TREERECON_OPTN && puzzlemode == USERTREE) { + openfiletoread(&utfp, INTREE, "user trees"); + } + + /* start main timer */ + time(&Starttime); + Startcpu=clock(); + addtimes(OPTIONS, &tarr); + + /* symmetrize doublet frequencies if specified */ + symdoublets(); + + /* initialise ML */ + mlstart(); + + /* determine how many usertrees */ + if (typ_optn == TREERECON_OPTN && puzzlemode == USERTREE) { + numutrees = 0; + do { + ci = fgetc(utfp); + if ((char) ci == ';') numutrees++; + } while (ci != EOF); + rewind(utfp); + if (numutrees < 1) { + FPRINTF(STDOUTFILE "Unable to proceed (no tree in input tree file)\n\n\n"); + exit(1); + } + } + + /* check fraction of invariable sites */ + if ((rhetmode == TWORATE || rhetmode == MIXEDRATE) && !fracinv_optim) + /* fraction of invariable site was specified manually */ + if (fracinv > MAXFI) + fracinv = MAXFI; + + addtimes(GENERAL, &tarr); + /* estimate parameters */ + if (!(typ_optn == TREERECON_OPTN && puzzlemode == USERTREE)) { + /* no tree present */ + estimateparametersnotree(); + } else { + if (utree_optn) { + /* use 1st user tree */ + readusertree(utfp); + rewind(utfp); + estimateparameterstree(); + } else { + /* don't use first user tree */ + estimateparametersnotree(); + } + } + addtimes(PARAMEST, &tarr); + + /* compute expected Ts/Tv ratio */ + if (data_optn == NUCLEOTIDE) computeexpectations(); + +} /* inputandinit */ + + + +/***************************************************************/ + +void evaluatetree(FILE *intreefp, FILE *outtreefp, int pmode, int utreenum, int maxutree, int *oldlocroot) +{ + + switch (pmode) { + case QUARTPUZ: /* read QP tree */ + readusertree(intreefp); + FPRINTF(STDOUTFILE "Computing maximum likelihood branch lengths (without clock)\n"); + fflush(STDOUT); + usertree_lklhd(); + findbestratecombination(); + break; + case USERTREE: /* read user tree */ + readusertree(intreefp); + FPRINTF(STDOUTFILE "Computing maximum likelihood branch lengths (without clock) for tree 
# %d\n", utreenum+1); + fflush(STDOUT); + usertree_lklhd(); + if (maxutree > 1) { + ulkl[utreenum] = Ctree->lklhd; + allsitelkl(Ctree->condlkl, allsites[utreenum]); + } + if (utreenum==0) findbestratecombination(); + break; + } + + + if (compclock) { /* clocklike branch length */ + switch (pmode) { + case QUARTPUZ: + FPRINTF(STDOUTFILE "Computing maximum likelihood branch lengths (with clock)\n"); + fflush(STDOUT); + break; + case USERTREE: + FPRINTF(STDOUTFILE "Computing maximum likelihood branch lengths (with clock) for tree # %d\n", utreenum+1); + fflush(STDOUT); + break; + } + + /* find best place for root */ + rootsearch = 0; + + if (utreenum==0) locroot = *oldlocroot; + else *oldlocroot = locroot; + + if (locroot < 0) { + locroot = findrootedge(); + rootsearch = 1; + } + /* if user-specified edge for root does not exist use displayed outgroup */ + if (!checkedge(locroot)) { + locroot = outgroup; + rootsearch = 2; + } + /* compute likelihood */ + clock_lklhd(locroot); + if (maxutree > 1) { + ulklc[utreenum] = Ctree->lklhdc; + allsitelkl(Ctree->condlkl, allsitesc[utreenum]); + } + + } + + if (clockmode == 0) + fprintf(outtreefp, "[ lh=%.6f ]", Ctree->lklhd); + else + fprintf(outtreefp, "[ lh=%.6f ]", Ctree->lklhdc); + + /* write ML branch length tree to outree file */ + clockmode = 0; /* nonclocklike branch lengths */ + fputphylogeny(outtreefp); + + /* clocklike branch lengths */ + if (compclock) { + clockmode = 1; + fputrooted(outtreefp, locroot); + } +} /* evaluatetree */ + +/***************************************************************/ + +void memcleanup() { + if (puzzlemode == QUARTPUZ && typ_optn == TREERECON_OPTN) { + free(splitfreqs); + free(splitpatterns); + free(splitsizes); + free_ivector(consconfid); + free_ivector(conssizes); + free_cmatrix(consbiparts); + free_ulivector(badtaxon); + } + free_cmatrix(Identif); + free_dvector(Freqtpm); + free_imatrix(Basecomp); + free_ivector(clusterA); + free_ivector(clusterB); + free_ivector(clusterC); + 
free_ivector(clusterD); + free_dvector(qweight); + free_dvector(sqdiff); + free_ivector(qworder); + free_ivector(sqorder); + freetreelist(&psteptreelist, &psteptreenum, &psteptreesum); +} /* memcleanup */ + +/***************************************************************/ + + +/******************************************************************************/ +/* main part */ +/******************************************************************************/ + +int main(int argc, char *argv[]) +{ + int i, oldlocroot=0; + + /* start main timer */ + time(&walltimestart); + cputimestart = clock(); + inittimearr(&tarr); + +# if PARALLEL + PP_Init(&argc, &argv); + if (PP_IamSlave) { + slave_main(argc, argv); + } else { +# endif /* PARALLEL */ + + inputandinit(&argc, &argv); + + /* CZ 05/19/01 */ + /* FPRINTF(STDOUTFILE "Writing parameters to file %s\n", OUTFILE); */ + /* openfiletowrite(&ofp, OUTFILE, "general output"); */ + /* writeoutputfile(ofp,WRITEPARAMS); */ + /* fclose(ofp); */ + + + /* write distance matrix */ + FPRINTF(STDOUTFILE "Writing pairwise distances to file %s\n", DISTANCES); + openfiletowrite(&dfp, DISTANCES, "pairwise distances"); + putdistance(dfp); + closefile(dfp); + +# if PARALLEL + PP_SendSizes(Maxspc, Maxsite, numcats, Numptrn, tpmradix, outgroup, fracconst, randseed); + PP_SendData(Seqpat, /* cmatrix */ + Alias, Weight, constpat, /* ivector */ + Rates, Eval, Freqtpm, /* dvector */ + Evec, Ievc, iexp, Distanmat, /* dmatrix */ + ltprobr); /* dcube */ +# endif /* PARALLEL */ + psteptreestrlen = (Maxspc * (int)(1 + log10(Maxspc))) + + (Maxspc * 3); + + switch (typ_optn) { + case TREERECON_OPTN: /* tree reconstruction */ + + if (puzzlemode == QUARTPUZ) { /* quartet puzzling */ + recon_tree(); + } /* quartet puzzling */ + break; + + case LIKMAPING_OPTN: /* likelihood mapping */ + + map_lklhd(); + break; + } /* switch typ_optn */ + + + free_cmatrix(Seqchar); + free_cmatrix(seqchars); + + /* reserve memory for tree statistics */ + if (typ_optn == 
TREERECON_OPTN && puzzlemode == USERTREE && numutrees > 1) { + ulkl = new_dvector(numutrees); + allsites = new_dmatrix(numutrees,Numptrn); + if (compclock) { + ulklc = new_dvector(numutrees); + allsitesc = new_dmatrix(numutrees,Numptrn); + } + } + + /* write puzzling step tree list */ + if ((listqptrees == PSTOUT_ORDER) || (listqptrees == PSTOUT_LISTORDER)) { + openfiletowrite(&qptorder, OUTPTORDER, "puzzling step trees (unique)"); + + fprintfsortedpstrees(qptorder, psteptreelist, psteptreenum, psteptreesum, 1, 0.0); + closefile(qptorder); + } + + /* compute ML branch lengths for QP tree and for 1st user tree */ + switch(typ_optn) { + case TREERECON_OPTN: + + /* open outtree file */ + openfiletowrite(&tfp, TREEFILE, "output tree(s)"); + + addtimes(GENERAL, &tarr); + + switch (puzzlemode) { + case QUARTPUZ: /* read QP tree */ + rewind(tmpfp); + openfiletowrite(&tfp, TREEFILE, "output tree(s)"); + evaluatetree(tmpfp, tfp, puzzlemode, 0, 1, &oldlocroot); + addtimes(TREEEVAL, &tarr); + closefile(tmpfp); + closefile(tfp); + + /* CZ 05/19/01 */ + /*openfiletoappend(&ofp, OUTFILE, "general output");*/ + /*writeoutputfile(ofp,WRITEREST);*/ + break; + case USERTREE: /* read user tree */ + openfiletoappend(&ofp, OUTFILE, "general output"); + + openfiletowrite(&tfp, TREEFILE, "output tree(s)"); + for (i = 0; i < numutrees; i++) { + evaluatetree(utfp, tfp, puzzlemode, i, numutrees, &oldlocroot); + if (i==0) writeoutputfile(ofp,WRITEREST); + writecutree(ofp, i+1); + addtimes(TREEEVAL, &tarr); + } + closefile(tfp); + closefile(utfp); + break; + default: + /* CZ 05/19/01 */ + /*openfiletoappend(&ofp, OUTFILE, "general output");*/ + /*writeoutputfile(ofp,WRITEREST);*/ + break; + } /* switch puzzlemode */ + break; + default: + /* CZ 05/19/01 */ + /*openfiletoappend(&ofp, OUTFILE, "general output");*/ + /*writeoutputfile(ofp,WRITEREST);*/ + break; + } /* switch typ_optn */ + + /* print tree statistics */ + if (typ_optn == TREERECON_OPTN && puzzlemode == USERTREE && numutrees > 1) + 
printtreestats(ofp); + + /* free memory for tree statistics */ + if (typ_optn == TREERECON_OPTN && puzzlemode == USERTREE && numutrees > 1) { + free_dvector(ulkl); + free_dmatrix(allsites); + if (compclock) { + free_dvector(ulklc); + free_dmatrix(allsitesc); + } + } + +# if PARALLEL + PP_SendDone(); +# endif /* PARALLEL */ + + /* write CPU/Wallclock times and parallel statistics */ + time(&walltimestop); + cputimestop = clock(); + addtimes(OVERALL, &tarr); +# ifdef TIMEDEBUG + printtimearr(&tarr); +# endif /* TIMEDEBUG */ + fullcpu = tarr.fullcpu; + fulltime = tarr.fulltime; + +# if PARALLEL + writetimesstat(ofp); +# endif /* PARALLEL */ + + /* stop timer */ + time(&Stoptime); + Stopcpu=clock(); + /* CZ 05/19/01 */ + /*timestamp(ofp);*/ + /*closefile(ofp);*/ + + + /* printbestratecombination(stderr); */ + mlfinish(); + + FPRINTF(STDOUTFILE "\nAll results written to disk:\n"); + FPRINTF(STDOUTFILE " Puzzle report file: %s\n", OUTFILE); + FPRINTF(STDOUTFILE " Likelihood distances: %s\n", DISTANCES); + + if (typ_optn == TREERECON_OPTN && puzzlemode != PAIRDIST) + FPRINTF(STDOUTFILE " Phylip tree file: %s\n", TREEFILE); + if (typ_optn == TREERECON_OPTN && puzzlemode == QUARTPUZ) { + if ((listqptrees == PSTOUT_ORDER) ||(listqptrees == PSTOUT_LISTORDER)) + FPRINTF(STDOUTFILE " Unique puzzling step trees: %s\n", OUTPTORDER); + if ((listqptrees == PSTOUT_LIST) ||(listqptrees == PSTOUT_LISTORDER)) + FPRINTF(STDOUTFILE " Puzzling step tree list: %s\n", OUTPTLIST); + } + if (show_optn && typ_optn == TREERECON_OPTN && puzzlemode == QUARTPUZ) + FPRINTF(STDOUTFILE " Unresolved quartets: %s\n", UNRESOLVED); + if (typ_optn == LIKMAPING_OPTN) + FPRINTF(STDOUTFILE " Likelihood mapping diagram: %s\n", TRIANGLE); + FPRINTF(STDOUTFILE "\n"); + + /* runtime message */ + FPRINTF(STDOUTFILE + "The computation took %.0f seconds (= %.1f minutes = %.1f hours)\n", + difftime(Stoptime, Starttime), difftime(Stoptime, Starttime)/60., + difftime(Stoptime, Starttime)/3600.); + FPRINTF(STDOUTFILE + 
" including input %.0f seconds (= %.1f minutes = %.1f hours)\n", + fulltime, fulltime/60., fulltime/3600.); +#ifdef TIMEDEBUG + FPRINTF(STDOUTFILE + "and %.0f seconds CPU time (= %.1f minutes = %.1f hours)\n\n", + fullcpu, fullcpu/60., fullcpu/3600.); +#endif /* TIMEDEBUG */ + + /* free memory */ + memcleanup(); + +# if PARALLEL + } /* !IamSlave */ + PP_Finalize(); +# endif /* PARALLEL */ + + return 0; +} + + +/* compare function for uli - sort largest numbers first */ +int ulicmp(const void *ap, const void *bp) +{ + uli a, b; + + a = *((uli *) ap); + b = *((uli *) bp); + + if (a > b) return -1; + else if (a < b) return 1; + else return 0; +} + +/* compare function for int - sort smallest numbers first */ +int intcmp(const void *ap, const void *bp) +{ + int a, b; + + a = *((int *) ap); + b = *((int *) bp); + + if (a < b) return -1; + else if (a > b) return 1; + else return 0; +} diff --git a/forester/archive/RIO/others/puzzle_mod/src/puzzle2.c b/forester/archive/RIO/others/puzzle_mod/src/puzzle2.c new file mode 100644 index 0000000..429fe46 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/puzzle2.c @@ -0,0 +1,2701 @@ +/* + * puzzle2.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +/* Modified by Christian Zmasek to: + - Allow 8000 seqs (for pairwise dist. calc.). + - Names of 26 chars. + + + !WARNING: Use ONLY together with FORESTER/RIO! + !For all other puposes download the excellent original! 
+ + last modification: 05/19/01 + + + void getsizesites(FILE *ifp): + + 257 -> 8000 + + + + void readid(FILE *infp, int t): + + for (i = 0; i < 10; i++) { -> for (i = 0; i < 26; i++) { + + for (i = 9; i > -1; i--) { -> for (i = 25; i > -1; i--) { + + for (j = 0; (j < 10) && (flag == TRUE); j++) -> for (j = 0; (j < 26) && (flag == TRUE); j++) + + + + void initid(int t): + + Identif = new_cmatrix(t, 10); -> Identif = new_cmatrix(t, 26); + + for (j = 0; j < 10; j++) -> for (j = 0; j < 26; j++) + + + + fputid10(FILE *ofp, int t): + + for (i = 0; i < 10; i++) -> for (i = 0; i < 26; i++) + + + + int fputid(FILE *ofp, int t): + + while (Identif[t][i] != ' ' && i < 10) { -> while (Identif[t][i] != ' ' && i < 26) { + + + + +*/ + +#define EXTERN extern + +#include "puzzle.h" +#include + +#if PARALLEL +# include "sched.h" +#endif /* PARALLEL */ + + +/******************************************************************************/ +/* sequences */ +/******************************************************************************/ + +/* read ten characters of current line as identifier */ +void readid(FILE *infp, int t) +{ + int i, j, flag, ci; + + for (i = 0; i < 26; i++) { /* CZ 05/19/01 */ + ci = fgetc(infp); + if (ci == EOF || !isprint(ci)) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (no name for sequence %d)\n\n\n", t+1); + exit(1); + } + Identif[t][i] = (char) ci; + } + /* convert leading blanks in taxon name to underscores */ + flag = FALSE; + for (i = 25; i > -1; i--) { /* CZ 05/19/01 */ + if (flag == FALSE) { + if (Identif[t][i] != ' ') flag = TRUE; + } else { + if (Identif[t][i] == ' ') Identif[t][i] = '_'; + } + } + /* check whether this name is already used */ + for (i = 0; i < t; i++) { /* compare with all other taxa */ + flag = TRUE; /* assume identity */ + for (j = 0; (j < 26) && (flag == TRUE); j++) /* CZ 05/19/01 */ + if (Identif[t][j] != Identif[i][j]) + flag = FALSE; + if (flag) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (multiple occurence of sequence 
name '"); + fputid(STDOUT, t); + FPRINTF(STDOUTFILE "')\n\n\n"); + exit(1); + } + } +} + +/* read next allowed character */ +char readnextcharacter(FILE *ifp, int notu, int nsite) +{ + char c; + + /* ignore blanks and control characters except newline */ + do { + if (fscanf(ifp, "%c", &c) != 1) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (missing character at position %d in sequence '", nsite + 1); + fputid(STDOUT, notu); + FPRINTF(STDOUTFILE "')\n\n\n"); + exit(1); + } + } while (c == ' ' || (iscntrl((int) c) && c != '\n')); + return c; +} + +/* skip rest of the line */ +void skiprestofline(FILE* ifp, int notu, int nsite) +{ + int ci; + + /* read chars until the first newline */ + do{ + ci = fgetc(ifp); + if (ci == EOF) { + FPRINTF(STDOUTFILE "Unable to proceed (missing newline at position %d in sequence '", nsite + 1); + fputid(STDOUT, notu); + FPRINTF(STDOUTFILE "')\n\n\n"); + exit(1); + } + } while ((char) ci != '\n'); +} + +/* skip control characters and blanks */ +void skipcntrl(FILE *ifp, int notu, int nsite) +{ + int ci; + + /* read over all control characters and blanks */ + do { + ci = fgetc(ifp); + if (ci == EOF) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (missing character at position %d in sequence '", nsite + 1); + fputid(STDOUT, notu); + FPRINTF(STDOUTFILE "')\n\n\n"); + exit(1); + } + } while (iscntrl(ci) || (char) ci == ' '); + /* go one character back */ + if (ungetc(ci, ifp) == EOF) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (positioning error at position %d in sequence '", nsite + 1); + fputid(STDOUT, notu); + FPRINTF(STDOUTFILE "')\n\n\n"); + exit(1); + } +} + +/* read sequences of one data set */ +void getseqs(FILE *ifp) +{ + int notu, nsite, endofline, linelength, i; + char c; + + seqchars = new_cmatrix(Maxspc, Maxseqc); + /* read all characters */ + nsite = 0; /* next site to be read */ + while (nsite < Maxseqc) { + /* read first taxon */ + notu = 0; + /* go to next true line */ + skiprestofline(ifp, notu, nsite); + 
skipcntrl(ifp, notu, nsite); + if (nsite == 0) readid(ifp, notu); + endofline = FALSE; + linelength = 0; + do { + c = readnextcharacter(ifp, notu, nsite + linelength); + if (c == '\n') endofline = TRUE; + else if (c == '.') { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (invalid character '.' at position "); + FPRINTF(STDOUTFILE "%d in first sequence)\n\n\n", nsite + linelength + 1); + exit(1); + } else if (nsite + linelength < Maxseqc) { + /* change to upper case */ + seqchars[notu][nsite + linelength] = (char) toupper((int) c); + linelength++; + } else { + endofline = TRUE; + skiprestofline(ifp, notu, nsite + linelength); + } + } while (!endofline); + if (linelength == 0) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (line with length 0 at position %d in sequence '", nsite + 1); + fputid(STDOUT, notu); + FPRINTF(STDOUTFILE "')\n\n\n"); + exit(1); + } + /* read other taxa */ + for (notu = 1; notu < Maxspc; notu++) { + /* go to next true line */ + if (notu != 1) skiprestofline(ifp, notu, nsite); + skipcntrl(ifp, notu, nsite); + if (nsite == 0) readid(ifp, notu); + for (i = nsite; i < nsite + linelength; i++) { + c = readnextcharacter(ifp, notu, i); + if (c == '\n') { /* too short */ + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (line to short at position %d in sequence '", i + 1); + fputid(STDOUT, notu); + FPRINTF(STDOUTFILE "')\n\n\n"); + exit(1); + } else if (c == '.') { + seqchars[notu][i] = seqchars[0][i]; + } else { + /* change to upper case */ + seqchars[notu][i] = (char) toupper((int) c); + } + } + } + nsite = nsite + linelength; + } +} + +/* initialize identifer array */ +void initid(int t) +{ + int i, j; + + Identif = new_cmatrix(t, 26); /* CZ 05/19/01 */ + for (i = 0; i < t; i++) + for (j = 0; j < 26; j++) /* CZ 05/19/01 */ + Identif[i][j] = ' '; +} + +/* print identifier of specified taxon in full 10 char length */ +void fputid10(FILE *ofp, int t) +{ + int i; + + for (i = 0; i < 26; i++) fputc(Identif[t][i], ofp); /* CZ 05/19/01 */ +} + +/* print 
identifier of specified taxon up to first space */ +int fputid(FILE *ofp, int t) +{ + int i; + + i = 0; + while (Identif[t][i] != ' ' && i < 26) { /* CZ 05/19/01 */ + fputc(Identif[t][i], ofp); + i++; + } + return i; +} + +/* read first line of sequence data set */ +void getsizesites(FILE *ifp) +{ + if (fscanf(ifp, "%d", &Maxspc) != 1) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (missing number of sequences)\n\n\n"); + exit(1); + } + if (fscanf(ifp, "%d", &Maxseqc) != 1) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (missing number of sites)\n\n\n"); + exit(1); + } + + if (Maxspc < 4) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (less than 4 sequences)\n\n\n"); + exit(1); + } + if (Maxspc > 8000) { /* CZ 05/19/01 */ + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (more than 8000 sequences)\n\n\n"); + exit(1); + } + if (Maxseqc < 1) { + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (no sequence sites)\n\n\n"); + exit(1); + } + Maxbrnch = 2*Maxspc - 3; +} + +/* read one data set - PHYLIP interleaved */ +void getdataset(FILE *ifp) +{ + initid(Maxspc); + getseqs(ifp); +} + +/* guess data type */ +int guessdatatype() +{ + uli numnucs, numchars, numbins; + int notu, nsite; + char c; + + /* count A, C, G, T, U, N */ + numnucs = 0; + numchars = 0; + numbins = 0; + for (notu = 0; notu < Maxspc; notu++) + for (nsite = 0; nsite < Maxseqc; nsite++) { + c = seqchars[notu][nsite]; + if (c == 'A' || c == 'C' || c == 'G' || + c == 'T' || c == 'U' || c == 'N') numnucs++; + if (c != '-' && c != '?') numchars++; + if (c == '0' || c == '1') numbins++; + } + if (numchars == 0) numchars = 1; + /* more than 85 % frequency means nucleotide data */ + if ((double) numnucs / (double) numchars > 0.85) return 0; + else if ((double) numbins / (double) numchars > 0.2) return 2; + else return 1; +} + +/* translate characters into format used by ML engine */ +void translatedataset() +{ + int notu, sn, co; + char c; + cvector code; + + + /* determine Maxsite - number of ML sites per taxon */ + 
if (data_optn == 0 && SH_optn) { + if (SHcodon) + Maxsite = Maxseqc / 3; + else + Maxsite = Maxseqc / 2; /* assume doublets */ + + } else + Maxsite = Maxseqc; + if (data_optn == 0 && (Maxsite % 3) == 0 && !SH_optn) { + if (codon_optn == 1 || codon_optn == 2 || codon_optn == 3) + Maxsite = Maxsite / 3; /* only one of the three codon positions */ + if (codon_optn == 4) + Maxsite = 2*(Maxsite / 3); /* 1st + 2nd codon positions */ + } + + /* reserve memory */ + if (Seqchar != NULL) free_cmatrix(Seqchar); + Seqchar = new_cmatrix(Maxspc, Maxsite); + + /* code length */ + if (data_optn == 0 && SH_optn) + code = new_cvector(2); + else + code = new_cvector(1); + + /* decode characters */ + if (data_optn == 0 && SH_optn) { /* SH doublets */ + + for (notu = 0; notu < Maxspc; notu++) { + for (sn = 0; sn < Maxsite; sn++) { + for (co = 0; co < 2; co++) { + if (SHcodon) + c = seqchars[notu][sn*3 + co]; + else + c = seqchars[notu][sn*2 + co]; + code[co] = c; + } + Seqchar[notu][sn] = code2int(code); + } + } + + } else if (!(data_optn == 0 && (Maxseqc % 3) == 0)) { /* use all */ + + for (notu = 0; notu < Maxspc; notu++) { + for (sn = 0; sn < Maxsite; sn++) { + code[0] = seqchars[notu][sn]; + Seqchar[notu][sn] = code2int(code); + } + } + + } else { /* codons */ + + for (notu = 0; notu < Maxspc; notu++) { + for (sn = 0; sn < Maxsite; sn++) { + if (codon_optn == 1 || codon_optn == 2 || codon_optn == 3) + code[0] = seqchars[notu][sn*3+codon_optn-1]; + else if (codon_optn == 4) { + if ((sn % 2) == 0) + code[0] = seqchars[notu][(sn/2)*3]; + else + code[0] = seqchars[notu][((sn-1)/2)*3+1]; + } else + code[0] = seqchars[notu][sn]; + Seqchar[notu][sn] = code2int(code); + } + } + + } + free_cvector(code); +} + +/* estimate mean base frequencies from translated data set */ +void estimatebasefreqs() +{ + int tpmradix, i, j; + uli all, *gene; + + tpmradix = gettpmradix(); + + if (Freqtpm != NULL) free_dvector(Freqtpm); + Freqtpm = new_dvector(tpmradix); + + if (Basecomp != NULL) 
free_imatrix(Basecomp); + Basecomp = new_imatrix(Maxspc, tpmradix); + + gene = (uli *) malloc((unsigned) ((tpmradix + 1) * sizeof(uli))); + if (gene == NULL) maerror("gene in estimatebasefreqs"); + + for (i = 0; i < tpmradix + 1; i++) gene[i] = 0; + for (i = 0; i < Maxspc; i++) + for (j = 0; j < tpmradix; j++) Basecomp[i][j] = 0; + for (i = 0; i < Maxspc; i++) + for (j = 0; j < Maxsite; j++) { + gene[(int) Seqchar[i][j]]++; + if (Seqchar[i][j] != tpmradix) Basecomp[i][(int) Seqchar[i][j]]++; + } + + all = Maxspc * Maxsite - gene[tpmradix]; + if (all != 0) { /* normal case */ + for (i = 0; i < tpmradix; i++) + Freqtpm[i] = (double) gene[i] / (double) all; + } else { /* pathological case with no unique character in data set */ + for (i = 0; i < tpmradix; i++) + Freqtpm[i] = 1.0 / (double) tpmradix; + } + + free(gene); + + Frequ_optn = TRUE; +} + +/* guess model of substitution */ +void guessmodel() +{ + double c1, c2, c3, c4, c5, c6; + dvector f; + dmatrix a; + int i; + + Dayhf_optn = FALSE; + Jtt_optn = TRUE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + blosum62_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + TSparam = 2.0; + YRparam = 1.0; + optim_optn = TRUE; + HKY_optn = TRUE; + TN_optn = FALSE; + + if (data_optn == 1) { /* amino acids */ + + /* chi2 fit to amino acid frequencies */ + + f = new_dvector(20); + a = new_dmatrix(20,20); + /* chi2 distance Dayhoff */ + dyhfdata(a, f); + c1 = 0; + for (i = 0; i < 20; i++) + c1 = c1 + (Freqtpm[i]-f[i])*(Freqtpm[i]-f[i]); + /* chi2 distance JTT */ + jttdata(a, f); + c2 = 0; + for (i = 0; i < 20; i++) + c2 = c2 + (Freqtpm[i]-f[i])*(Freqtpm[i]-f[i]); + /* chi2 distance mtREV */ + mtrevdata(a, f); + c3 = 0; + for (i = 0; i < 20; i++) + c3 = c3 + (Freqtpm[i]-f[i])*(Freqtpm[i]-f[i]); + /* chi2 distance VT */ + vtmvdata(a, f); + c4 = 0; + for (i = 0; i < 20; i++) + c4 = c4 + (Freqtpm[i]-f[i])*(Freqtpm[i]-f[i]); + /* chi2 distance WAG */ + wagdata(a, f); + c5 = 0; + for (i = 0; i < 20; i++) + c5 = c5 + 
(Freqtpm[i]-f[i])*(Freqtpm[i]-f[i]); + /* chi2 distance cpREV */ + cprev45data(a, f); + c6 = 0; + for (i = 0; i < 20; i++) + c6 = c6 + (Freqtpm[i]-f[i])*(Freqtpm[i]-f[i]); + + free_dvector(f); + free_dmatrix(a); + +#ifndef CPREV + if ((c1 < c2) && (c1 < c3) && (c1 < c4) && (c1 < c5)) { + /* c1 -> Dayhoff */ + Dayhf_optn = TRUE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } else { + if ((c2 < c3) && (c2 < c4) && (c2 < c5)) { + /* c2 -> JTT */ + Dayhf_optn = FALSE; + Jtt_optn = TRUE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } else { + if ((c3 < c4) && (c3 < c5)) { + /* c3 -> mtREV */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = TRUE; + cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on mtDNA)\n"); + } else { + if ((c4 < c5)) { + /* c4 -> VT */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + vtmv_optn = TRUE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } else { + /* c5 -> WAG */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = TRUE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } /* if c4 else c5 */ + } /* if c3 else c4 */ + } /* if c2 */ + } /* if c1 */ + +#else /* CPREV */ + + if ((c1 < c2) && (c1 < c3) && (c1 < c4) && (c1 < c5) && (c1 < c6)) { + /* c1 -> Dayhoff */ + Dayhf_optn = TRUE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } 
else { + if ((c2 < c3) && (c2 < c4) && (c2 < c5) && (c2 < c6)) { + /* c2 -> JTT */ + Dayhf_optn = FALSE; + Jtt_optn = TRUE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } else { + if ((c3 < c4) && (c3 < c5) && (c3 < c6)) { + /* c3 -> mtREV */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = TRUE; + cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on mtDNA)\n"); + } else { + if ((c4 < c5) && (c4 < c6)) { + /* c4 -> VT */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + vtmv_optn = TRUE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } else { + if (c5 < c6) { + /* c5 -> WAG */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = FALSE; + vtmv_optn = FALSE; + wag_optn = TRUE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on nuclear DNA)\n"); + } else { + /* if (c6) */ + /* c6 -> cpREV */ + Dayhf_optn = FALSE; + Jtt_optn = FALSE; + mtrev_optn = FALSE; + cprev_optn = TRUE; + vtmv_optn = FALSE; + wag_optn = FALSE; + FPRINTF(STDOUTFILE "(consists very likely of amino acids encoded on cpDNA)\n"); + } /* if c5 else c6 */ + } /* if c4 else c5 */ + } /* if c3 else c4 */ + } /* if c2 */ + } /* if c1 */ +#endif /* CPREV */ + + } else if (data_optn == 0) { + FPRINTF(STDOUTFILE "(consists very likely of nucleotides)\n"); + } else { + FPRINTF(STDOUTFILE "(consists very likely of binary state data)\n"); + } +} /* guessmodel */ + + +/******************************************************************************/ +/* functions for representing and building puzzling step trees */ +/******************************************************************************/ + +/* initialize tree with the following starting configuration + 
+ 2 + 0 +------- C(=2) + A(=0) -----+ + +------- B(=1) + 1 + */ +void inittree() +{ + int i; + + /* allocate the memory for the whole tree */ + + /* allocate memory for vector with all the edges of the tree */ + edge = (ONEEDGE *) calloc(Maxbrnch, sizeof(ONEEDGE) ); + if (edge == NULL) maerror("edge in inittree"); + + /* allocate memory for vector with edge numbers of leaves */ + edgeofleaf = (int *) calloc(Maxspc, sizeof(int) ); + if (edgeofleaf == NULL) maerror("edgeofleaf in inittree"); + + /* allocate memory for all the edges the edge map */ + for (i = 0; i < Maxbrnch; i++) { + edge[i].edgemap = (int *) calloc(Maxbrnch, sizeof(int) ); + if (edge[i].edgemap == NULL) maerror("edgemap in inittree"); + } + + /* number all edges */ + for (i = 0; i < Maxbrnch; i++) edge[i].numedge = i; + + /* initialize tree */ + + nextedge = 3; + nextleaf = 3; + + /* edge maps */ + (edge[0].edgemap)[0] = 0; /* you are on the right edge */ + (edge[0].edgemap)[1] = 4; /* go down left for leaf 1 */ + (edge[0].edgemap)[2] = 5; /* go down right for leaf 2 */ + (edge[1].edgemap)[0] = 1; /* go up for leaf 0 */ + (edge[1].edgemap)[1] = 0; /* you are on the right edge */ + (edge[1].edgemap)[2] = 3; /* go up/down right for leaf 2 */ + (edge[2].edgemap)[0] = 1; /* go up for leaf 0 */ + (edge[2].edgemap)[1] = 2; /* go up/down left for leaf 1 */ + (edge[2].edgemap)[2] = 0; /* you are on the right edge */ + + /* interconnection */ + edge[0].up = NULL; + edge[0].downleft = &edge[1]; + edge[0].downright = &edge[2]; + edge[1].up = &edge[0]; + edge[1].downleft = NULL; + edge[1].downright = NULL; + edge[2].up = &edge[0]; + edge[2].downleft = NULL; + edge[2].downright = NULL; + + /* edges of leaves */ + edgeofleaf[0] = 0; + edgeofleaf[1] = 1; + edgeofleaf[2] = 2; +} /* inittree */ + +/* add next leaf on the specified edge */ +void addnextleaf(int dockedge) +{ + int i; + + if (dockedge >= nextedge) { + /* Trying to add leaf nextleaf to nonexisting edge dockedge */ + FPRINTF(STDOUTFILE "\n\n\nHALT: 
PLEASE REPORT ERROR F TO DEVELOPERS\n\n\n"); + exit(1); + } + + if (nextleaf >= Maxspc) { + /* Trying to add leaf nextleaf to a tree with Maxspc leaves */ + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR G TO DEVELOPERS\n\n\n"); + exit(1); + } + + /* necessary change in edgeofleaf if dockedge == edgeofleaf[0] */ + if (edgeofleaf[0] == dockedge) edgeofleaf[0] = nextedge; + + /* adding nextedge to the tree */ + edge[nextedge].up = edge[dockedge].up; + edge[nextedge].downleft = &edge[dockedge]; + edge[nextedge].downright = &edge[nextedge+1]; + edge[dockedge].up = &edge[nextedge]; + + if (edge[nextedge].up != NULL) { + if ( ((edge[nextedge].up)->downleft) == &edge[dockedge] ) + (edge[nextedge].up)->downleft = &edge[nextedge]; + else + (edge[nextedge].up)->downright = &edge[nextedge]; + } + + /* adding nextedge + 1 to the tree */ + edge[nextedge+1].up = &edge[nextedge]; + edge[nextedge+1].downleft = NULL; + edge[nextedge+1].downright = NULL; + edgeofleaf[nextleaf] = nextedge+1; + + /* the two new edges get info about the old edges */ + /* nextedge */ + for (i = 0; i < nextedge; i++) { + switch ( (edge[dockedge].edgemap)[i] ) { + + /* down right changes to down left */ + case 5: (edge[nextedge].edgemap)[i] = 4; + break; + + /* null changes to down left */ + case 0: (edge[nextedge].edgemap)[i] = 4; + break; + + default: (edge[nextedge].edgemap)[i] = + (edge[dockedge].edgemap)[i]; + break; + } + } + + /* nextedge + 1 */ + for (i = 0; i < nextedge; i++) { + switch ( (edge[dockedge].edgemap)[i] ) { + + /* up/down left changes to up */ + case 2: (edge[nextedge+1].edgemap)[i] = 1; + break; + + /* up/down right changes to up */ + case 3: (edge[nextedge+1].edgemap)[i] = 1; + break; + + /* down left changes to up/down left */ + case 4: (edge[nextedge+1].edgemap)[i] = 2; + break; + + /* down right changes to up/down left */ + case 5: (edge[nextedge+1].edgemap)[i] = 2; + break; + + /* null changes to up/down left */ + case 0: (edge[nextedge+1].edgemap)[i] = 2; + break; + + /* 
up stays up */ + default: (edge[nextedge+1].edgemap)[i] = + (edge[dockedge].edgemap)[i]; + break; + } + } + + /* dockedge */ + for (i = 0; i < nextedge; i++) { + switch ( (edge[dockedge].edgemap)[i] ) { + + /* up/down right changes to up */ + case 3: (edge[dockedge].edgemap)[i] = 1; + break; + + /* up/down left changes to up */ + case 2: (edge[dockedge].edgemap)[i] = 1; + break; + + default: break; + } + } + + /* all edgemaps are updated for the two new edges */ + /* nextedge */ + (edge[nextedge].edgemap)[nextedge] = 0; + (edge[nextedge].edgemap)[nextedge+1] = 5; /* down right */ + + /* nextedge + 1 */ + (edge[nextedge+1].edgemap)[nextedge] = 1; /* up */ + (edge[nextedge+1].edgemap)[nextedge+1] = 0; + + /* all other edges */ + for (i = 0; i < nextedge; i++) { + (edge[i].edgemap)[nextedge] = (edge[i].edgemap)[dockedge]; + (edge[i].edgemap)[nextedge+1] = (edge[i].edgemap)[dockedge]; + } + + /* an extra for dockedge */ + (edge[dockedge].edgemap)[nextedge] = 1; /* up */ + (edge[dockedge].edgemap)[nextedge+1] = 3; /* up/down right */ + + nextleaf++; + nextedge = nextedge + 2; +} /* addnextleaf */ + + +/* free memory (to be called after inittree) */ +void freetree() +{ + int i; + + for (i = 0; i < 2 * Maxspc - 3; i++) free(edge[i].edgemap); + free(edge); + free(edgeofleaf); +} /* freetree */ + +/* writes OTU sitting on edge ed */ +void writeOTU(FILE *outfp, int ed) +{ + int i; + + /* test whether we are on a leaf */ + if (edge[ed].downright == NULL && edge[ed].downleft == NULL) { + for (i = 1; i < nextleaf; i++) { + if (edgeofleaf[i] == ed) { /* i is the leaf of ed */ + column += fputid(outfp, trueID[i]); + return; + } + } + } + + /* we are NOT on a leaf */ + fprintf(outfp, "("); + column++; + writeOTU(outfp, edge[ed].downleft->numedge); + fprintf(outfp, ","); + column++; + column++; + if (column > 55) { + column = 2; + fprintf(outfp, "\n "); + } + writeOTU(outfp, edge[ed].downright->numedge); + fprintf(outfp, ")"); + column++; +} /* writeOTU */ + +/* write tree */ +void 
writetree(FILE *outfp) +{ /* leaf 0 is written first, then the two subtrees below its edge */ + column = 1; + fprintf(outfp, "("); + column += fputid(outfp, trueID[0]) + 3; + fprintf(outfp, ","); + writeOTU(outfp, edge[edgeofleaf[0]].downleft->numedge); + column++; + column++; + fprintf(outfp, ","); + writeOTU(outfp, edge[edgeofleaf[0]].downright->numedge); + fprintf(outfp, ");\n"); +} /* writetree */ + + +/* clear all edgeinfos: the edgeinfo counter of every edge in use + (0 to nextedge-1) is set to zero */ +void resetedgeinfo() +{ + int i; + + for (i = 0; i < nextedge; i++) + edge[i].edgeinfo = 0; +} /* resetedgeinfo */ + +/* increment all edgeinfo between leaf A and B: + every edge on the path from the edge of leaf A to the edge of leaf B + (both end edges included) is counted once; the path is followed by + looking up, in the edgemap of the current edge, the step that leads + towards the final edge; nothing is done if A == B */ +void incrementedgeinfo(int A, int B) +{ + int curredge, finaledge, nextstep; + + if (A == B) return; + + finaledge = edgeofleaf[B]; + + curredge = edgeofleaf[A]; + edge[curredge].edgeinfo = edge[curredge].edgeinfo + 1; + + while (curredge != finaledge) { + nextstep = (edge[curredge].edgemap)[finaledge]; + switch (nextstep) { + + /* up */ + case 1: curredge = (edge[curredge].up)->numedge; + break; + + /* up/down left */ + case 2: curredge = ((edge[curredge].up)->downleft)->numedge; + break; + + /* up/down right */ + case 3: curredge = ((edge[curredge].up)->downright)->numedge; + break; + + /* down left */ + case 4: curredge = (edge[curredge].downleft)->numedge; + break; + + /* down right */ + case 5: curredge = (edge[curredge].downright)->numedge; + break; + /* NOTE(review): there is no default case - any other step value leaves curredge unchanged */ + } + edge[curredge].edgeinfo = edge[curredge].edgeinfo + 1; + } +} /* incrementedgeinfo */ + +/* checks which edge has the lowest edgeinfo + if there are several edges with the same lowest edgeinfo, + one of them will be selected randomly; + the result is left in minedge (edge number) and mininfo (its edgeinfo) */ +void minimumedgeinfo() +{ + int i, k, howmany, randomnum; + + howmany = 1; + minedge = 0; + mininfo = edge[0].edgeinfo; + for (i = 1; i < nextedge; i++) + if (edge[i].edgeinfo <= mininfo) { + if (edge[i].edgeinfo == mininfo) { + howmany++; + } else { + minedge = i; + mininfo = edge[i].edgeinfo; + howmany = 1; + } + } + + if (howmany > 1) { /* draw random edge */ + randomnum = randominteger(howmany) + 1; /* 1 to howmany */ + i = -1; + for (k = 0; 
k < randomnum; k++) { + do { + i++; + } while (edge[i].edgeinfo != mininfo); + minedge = i; + } + } +} /* minimumedgeinfo */ + + + + +/*******************************************/ +/* tree sorting */ +/*******************************************/ + +/* compute address of the 4th int (the sort key) of the 4 int node that + contains addr, i.e. the first address of that node plus 3 */ +int ct_sortkeyaddr(int addr) +{ + int a, res; + a = addr % 4; + res = addr - a + 3; + return res; +} + + +/**********/ + +/* compute address of the next edge pointer in a 4 int node (0->1->2->0): + offset 2 wraps back to offset 0, any other offset moves on by one */ +int ct_nextedgeaddr(int addr) +{ + int a, res; + a = addr % 4; + if ( a == 2 ) { res = addr - 2; } + else { res = addr + 1; } + return res; +} + + +/**********/ + +/* compute address of 1st edge of a 4 int node from node number (4*node) */ +int ct_1stedge(int node) +{ + int res; + res = 4 * node; + return res; +} + + +/**********/ + +/* compute address of 2nd edge of a 4 int node from node number (4*node + 1) */ +int ct_2ndedge(int node) +{ + int res; + res = 4 * node +1; + return res; +} + + +/**********/ + +/* compute address of 3rd edge of a 4 int node from node number (4*node + 2) */ +int ct_3rdedge(int node) +{ + int res; + res = 4 * node +2; + return res; +} + + +/**********/ + +/* check whether the node with number node is a leaf: + returns nonzero when the 3rd edge pointer of the node is negative + (only the 3rd pointer is tested here) */ +int ct_isleaf(int node, int *ctree) +{ + return (ctree[ct_3rdedge(node)] < 0); +} + + +/**********/ + +/* compute node number of 4 int node from an edge addr. 
*/ +int ct_addr2node(int addr) +{ /* the offset inside the node is removed before dividing by 4 */ + int a, res; + a = addr % 4; + res = (int) ((addr - a) / 4); + return res; +} + + +/**********/ + +/* print graph pointers for checking: + for each of the 2*Maxspc nodes the three edge pointers are shown as + address/4 and address%4, followed by the value of the 4th int */ +void printctree(int *ctree) +{ + int n; + for (n=0; n < 2*Maxspc; n++) { + printf("n[%3d] = (%3d.%2d, %3d.%2d, %3d.%2d | %3d)\n", n, + (int) ctree[ct_1stedge(n)]/4, + (int) ctree[ct_1stedge(n)]%4, + (int) ctree[ct_2ndedge(n)]/4, + (int) ctree[ct_2ndedge(n)]%4, + (int) ctree[ct_3rdedge(n)]/4, + (int) ctree[ct_3rdedge(n)]%4, + ctree[ct_3rdedge(n)+1]); + } + printf("\n"); +} /* printctree */ + + +/**********/ + +/* allocate memory for ctree: 2*Maxspc nodes of 4 ints each (3 edge + pointers plus the 4th int addressed by ct_sortkeyaddr), with every + int set to -1; maerror is called if the allocation fails; + the returned array is released with freectree */ +int *initctree() +{ + int *snodes; + int n; + + snodes = (int *) malloc(4 * 2 * Maxspc * sizeof(int)); + if (snodes == NULL) maerror("snodes in copytree"); + + for (n=0; n<(4 * 2 * Maxspc); n++) { + snodes[n]=-1; + } + return snodes; +} + + +/**********/ + +/* free memory of a tree for sorting and set the pointer of the caller to NULL */ +void freectree(int **snodes) +{ + free(*snodes); + *snodes = NULL; +} + + +/**********/ + +/* copy subtree recursively: the subtree below edge ed of the puzzling + step tree is entered into ctree and linked with the slot ct_curredge; + a leaf takes the next free leaf node and gets trueID as sort key, + an inner edge takes the next free inner node and recurses left and right */ +void copyOTU(int *ctree, /* tree array struct */ + int *ct_nextnode, /* next free node */ + int ct_curredge, /* current edge to add subtree */ + int *ct_nextleaf, /* next free leaf (0-maxspc) */ + int ed) /* edge in puzzling step tree */ +{ + int i, nextcurredge; + + /* test whether we are on a leaf */ + if (edge[ed].downright == NULL && edge[ed].downleft == NULL) { + for (i = 1; i < nextleaf; i++) { + if (edgeofleaf[i] == ed) { /* i is the leaf of ed */ + nextcurredge = ct_1stedge(*ct_nextleaf); + ctree[ct_curredge] = nextcurredge; + ctree[nextcurredge] = ct_curredge; + ctree[ct_sortkeyaddr(nextcurredge)] = trueID[i]; + (*ct_nextleaf)++; + return; + } + } + } + + /* we are NOT on a leaf */ + nextcurredge = ct_1stedge(*ct_nextnode); + ctree[ct_curredge] = nextcurredge; + ctree[nextcurredge] = ct_curredge; + (*ct_nextnode)++; + nextcurredge = ct_nextedgeaddr(nextcurredge); + copyOTU(ctree, ct_nextnode, 
nextcurredge, + ct_nextleaf, edge[ed].downleft->numedge); + + nextcurredge = ct_nextedgeaddr(nextcurredge); + copyOTU(ctree, ct_nextnode, nextcurredge, + ct_nextleaf, edge[ed].downright->numedge); +} + + +/**********/ + +/* copy treestructure to sorting structure: + leaf 0 becomes node 0 with trueID[0] as sort key and is linked with the + first inner node, which gets node number Maxspc; the two subtrees + below the edge of leaf 0 are then copied with copyOTU into the + remaining two slots of that node */ +void copytree(int *ctree) +{ + int ct_curredge; + int ct_nextleaf; + int ct_nextnode; + + ct_nextnode = Maxspc; + ct_curredge = ct_1stedge(ct_nextnode); + ct_nextleaf = 1; + + ctree[ct_1stedge(0)] = ct_curredge; + ctree[ct_curredge] = ct_1stedge(0); + ctree[ct_sortkeyaddr(0)] = trueID[0]; + + ct_nextnode++; + + ct_curredge = ct_nextedgeaddr(ct_curredge); + copyOTU(ctree, &ct_nextnode, ct_curredge, + &ct_nextleaf, edge[edgeofleaf[0]].downleft->numedge); + + ct_curredge = ct_nextedgeaddr(ct_curredge); + copyOTU(ctree, &ct_nextnode, ct_curredge, + &ct_nextleaf, edge[edgeofleaf[0]].downright->numedge); +} + + +/**********/ + +/* sort subtree from edge recursively by indices: + a node whose 2nd edge pointer is negative is a leaf and returns its + sort key; otherwise both subtrees are sorted, they are exchanged if + the second one has the smaller key, and the smaller of the two keys + is stored as the sort key of this node and returned */ +int sortOTU(int edge, int *ctree) +{ + int key1, key2; + int edge1, edge2; + int tempedge; + + if (ctree[ct_2ndedge((int) (edge / 4))] < 0) + return ctree[ct_sortkeyaddr(edge)]; + + edge1 = ctree[ct_nextedgeaddr(edge)]; + edge2 = ctree[ct_nextedgeaddr(ct_nextedgeaddr(edge))]; + + /* printf ("visiting [%5d] -> [%5d], [%5d]\n", edge, edge1, edge2); */ + /* printf ("visiting [%2d.%2d] -> [%2d.%2d], [%2d.%2d]\n", + (int)(edge/4), edge%4, (int)(edge1/4), edge1%4, + (int)(edge2/4), edge2%4); */ + + key1 = sortOTU(edge1, ctree); + key2 = sortOTU(edge2, ctree); + + if (key2 < key1) { /* exchange the back pointers first, then the two slots */ + tempedge = ctree[ctree[edge1]]; + ctree[ctree[edge1]] = ctree[ctree[edge2]]; + ctree[ctree[edge2]] = tempedge; + tempedge = ctree[edge1]; + ctree[edge1] = ctree[edge2]; + ctree[edge2] = tempedge; + ctree[ct_sortkeyaddr(edge)] = key2; + + } else { + ctree[ct_sortkeyaddr(edge)] = key1; + } + return ctree[ct_sortkeyaddr(edge)]; +} + + +/**********/ + +/* sort ctree recursively by indices */ +int sortctree(int *ctree) +{ + int n, startnode=-1; + for(n=0; 
n>>>\n"); + tmpptr = list; + *sortlist = list; + while (tmpptr != NULL) { + (*tmpptr).sortnext = (*tmpptr).succ; + (*tmpptr).sortlast = (*tmpptr).pred; + tmpptr = (*tmpptr).succ; + } + + while (xchange > 0) { + curr = *sortlist; + xchange = 0; + if (curr == NULL) fprintf(stderr, "Grrrrrrrrr>>>>\n"); + while((*curr).sortnext != NULL) { + next = (*curr).sortnext; + if ((*curr).count >= (*next).count) + curr = (*curr).sortnext; + else { + if ((*curr).sortlast != NULL) + (*((*curr).sortlast)).sortnext = next; + if (*sortlist == curr) + *sortlist = next; + (*next).sortlast = (*curr).sortlast; + + if ((*next).sortnext != NULL) + (*((*next).sortnext)).sortlast = curr; + (*curr).sortnext = (*next).sortnext; + + (*curr).sortlast = next; + (*next).sortnext = curr; + + xchange++; + } + } + } +} /* sortbynum */ + + +/**********/ + +/* print puzzling step tree stuctures for checking */ +void printfpstrees(treelistitemtype *list) +{ + char ch; + treelistitemtype *tmpptr = NULL; + tmpptr = list; + ch = '-'; + while (tmpptr != NULL) { + printf ("%c[%2d] %5d %s\n", ch, (*tmpptr).idx, (*tmpptr).count, (*tmpptr).tree); + tmpptr = (*tmpptr).succ; + ch = ' '; + } +} + +/**********/ + +/* print sorted puzzling step tree stucture with names */ +void fprintffullpstree(FILE *outf, char *treestr) +{ + int count = 0; + int idnum = 0; + int n; + for(n=0; treestr[n] != '\0'; n++){ + while(isdigit((int)treestr[n])){ + idnum = (10 * idnum) + ((int)treestr[n]-48); + n++; + count++; + } + if (count > 0){ +# ifdef USEQUOTES + fprintf(outf, "'"); +# endif + (void)fputid(outf, idnum); +# ifdef USEQUOTES + fprintf(outf, "'"); +# endif + count = 0; + idnum = 0; + } + fprintf(outf, "%c", treestr[n]); + } +} + + +/**********/ + +/* print sorted puzzling step tree stuctures with names */ +void fprintfsortedpstrees(FILE *output, + treelistitemtype *list, /* tree list */ + int itemnum, /* order number */ + int itemsum, /* number of trees */ + int comment, /* with statistics, or puzzle report ? 
*/ + float cutoff) /* cutoff percentage */ +{ + treelistitemtype *tmpptr = NULL; + treelistitemtype *slist = NULL; + int num = 1; + float percent; + + if (list == NULL) fprintf(stderr, "Grrrrrrrrr>>>>\n"); /* an empty list is only reported, execution continues */ + sortbynum(list, &slist); + + tmpptr = slist; + while (tmpptr != NULL) { + percent = (float)(100.0 * (*tmpptr).count / itemsum); /* share of the itemsum puzzling steps in percent */ + if ((cutoff == 0.0) || (cutoff <= percent)) { /* a cutoff of 0 lists every tree */ + if (comment) + fprintf (output, "[ %d. %d %.2f %d %d %d ]", num++, (*tmpptr).count, percent, (*tmpptr).id, itemnum, itemsum); + else { + if (num == 1){ /* table header before the first listed tree */ + fprintf (output, "\n"); + fprintf (output, "The following tree(s) occured in more than %.2f%% of the %d puzzling steps.\n", cutoff, itemsum); + fprintf (output, "The trees are orderd descending by the number of occurences.\n"); + fprintf (output, "\n"); + fprintf (output, "\n occurences ID Phylip tree\n"); + } + fprintf (output, "%2d. %5d %6.2f%% %5d ", num++, (*tmpptr).count, percent, (*tmpptr).id); + } + fprintffullpstree(output, (*tmpptr).tree); + fprintf (output, "\n"); + } + tmpptr = (*tmpptr).sortnext; + } + + if (!comment) { /* summary line: num-1 trees have been listed */ + fprintf (output, "\n"); + switch(num) { + case 1: fprintf (output, "There were no tree topologies (out of %d) occuring with a percentage >= %.2f%% of the %d puzzling steps.\n", itemnum, cutoff, itemsum); break; + case 2: fprintf (output, "There was one tree topology (out of %d) occuring with a percentage >= %.2f%%.\n", itemnum, cutoff); break; + default: fprintf (output, "There were %d tree topologies (out of %d) occuring with a percentage >= %.2f%%.\n", num-1, itemnum, cutoff); break; + } + fprintf (output, "\n"); + fprintf (output, "\n"); + } + +} /* fprintfsortedpstrees */ + +/**********/ + +/* print sorted tree topologies for checking: + idx, count and tree string of every item, following the sortnext + links of the list returned by sortbynum */ +void printfsortedpstrees(treelistitemtype *list) +{ + treelistitemtype *tmpptr = NULL; + treelistitemtype *slist = NULL; + + sortbynum(list, &slist); + + tmpptr = slist; + while (tmpptr != NULL) { + printf ("[%2d] %5d %s\n", (*tmpptr).idx, (*tmpptr).count, 
(*tmpptr).tree); + tmpptr = (*tmpptr).sortnext; + } +} /* printfsortedpstrees */ + + +/*******************************************/ +/* end of tree sorting */ +/*******************************************/ + + + +/******************************************************************************/ +/* functions for computing the consensus tree */ +/******************************************************************************/ + +/* prepare for consensus tree analysis: + splitlength becomes the number of groups of 32 taxa needed for Maxspc + taxa (Maxspc/32 rounded up), the split tables are reset to empty and + the work buffer splitcomp is allocated (maerror on failure) */ +void initconsensus() +{ +# if ! PARALLEL + biparts = new_cmatrix(Maxspc-3, Maxspc); +# endif /* PARALLEL */ + + if (Maxspc % 32 == 0) + splitlength = Maxspc/32; + else splitlength = (Maxspc + 32 - (Maxspc % 32))/32; + numbiparts = 0; /* no pattern stored so far */ + maxbiparts = 0; /* no memory reserved so far */ + splitfreqs = NULL; + splitpatterns = NULL; + splitsizes = NULL; + splitcomp = (uli *) malloc(splitlength * sizeof(uli) ); + if (splitcomp == NULL) maerror("splitcomp in initconsensus"); +} + +/* prototype needed for recursive function */ +void makepart(int i, int curribrnch); + +/* recursive function to get bipartitions: + every leaf found below edge i is marked with a star in row curribrnch + of biparts, in the column given by trueID of that leaf */ +void makepart(int i, int curribrnch) +{ + int j; + + if ( edge[i].downright == NULL || + edge[i].downleft == NULL) { /* if i is leaf */ + + /* check out what leaf j sits on this edge i */ + for (j = 1; j < Maxspc; j++) { + if (edgeofleaf[j] == i) { + biparts[curribrnch][trueID[j]] = '*'; + return; + } + } + } else { /* still on inner branch */ + makepart(edge[i].downleft->numedge, curribrnch); + makepart(edge[i].downright->numedge, curribrnch); + } +} + +/* compute bipartitions of tree of current puzzling step: + biparts is first filled with dots, then one row is built with makepart + for every inner branch other than the edge of leaf 0; a row in which + the outgroup is not marked is inverted afterwards */ +void computebiparts() +{ + int i, j, curribrnch; + + curribrnch = -1; + + for (i = 0; i < Maxspc - 3; i++) + for (j = 0; j < Maxspc; j++) + biparts[i][j] = '.'; + + for (i = 0; i < Maxbrnch; i++) { + if (!( edgeofleaf[0] == i || + edge[i].downright == NULL || + edge[i].downleft == NULL) ) { /* check all inner branches */ + curribrnch++; + makepart(i, 
curribrnch); + + /* make sure that the root is always a '*' */ + if (biparts[curribrnch][outgroup] == '.') { + for (j = 0; j < Maxspc; j++) { + if (biparts[curribrnch][j] == '.') + biparts[curribrnch][j] = '*'; + else + biparts[curribrnch][j] = '.'; + } + } + } + } +} + +/* print out the bipartition n of all different splitpatterns: + one character per taxon, a dot for a set bit and a star for a cleared + bit, lowest bit first, with a blank after every 10 characters */ +void printsplit(FILE *fp, uli n) +{ + int i, j, col; + uli z; + + col = 0; + for (i = 0; i < splitlength; i++) { + z = splitpatterns[n*splitlength + i]; + for (j = 0; j < 32 && col < Maxspc; j++) { + if (col % 10 == 0 && col != 0) fprintf(fp, " "); + if (z & 1) fprintf(fp, "."); + else fprintf(fp, "*"); + z = (z >> 1); + col++; + } + } +} + +/* make new entries for new different bipartitions and count frequencies: + each of the Maxspc-3 rows of biparts is packed into splitcomp (one bit + per taxon, set for a dot) and compared with the stored patterns; + a match increments the frequency of that pattern, otherwise the + pattern is appended as a new entry */ +void makenewsplitentries() +{ + int i, j, bpc, identical, idflag, bpsize; + uli nextentry, obpc; + + /* where the next entry would be in splitpatterns */ + nextentry = numbiparts; + + for (bpc = 0; bpc < Maxspc - 3; bpc++) { /* for every new bipartition */ + /* convert bipartition into a more compact format */ + bpsize = 0; + for (i = 0; i < splitlength; i++) { + splitcomp[i] = 0; + for (j = 0; j < 32; j++) { + splitcomp[i] = splitcomp[i] >> 1; + if (i*32 + j < Maxspc) + if (biparts[bpc][i*32 + j] == '.') { + /* set highest bit (value 2^31); after all 32 shifts + the flag of taxon i*32+j sits in bit j */ + splitcomp[i] = (splitcomp[i] | 2147483648UL); + bpsize++; /* count the '.' 
*/ + } + } + } + /* compare to the *old* patterns: only the numbiparts entries + stored before this call are searched */ + identical = FALSE; + for (obpc = 0; (obpc < numbiparts) && (!identical); obpc++) { + /* compare first partition size */ + if (splitsizes[obpc] == bpsize) idflag = TRUE; + else idflag = FALSE; + /* if size is identical compare whole partition */ + for (i = 0; (i < splitlength) && idflag; i++) + if (splitcomp[i] != splitpatterns[obpc*splitlength + i]) + idflag = FALSE; + if (idflag) identical = TRUE; + } + if (identical) { /* if identical increase frequency; obpc-1 because the loop has already stepped past the match */ + splitfreqs[2*(obpc-1)]++; + } else { /* create new entry */ + if (nextentry == maxbiparts) { /* reserve more memory: room for 2*Maxspc further entries */ + maxbiparts = maxbiparts + 2*Maxspc; + splitfreqs = (uli *) myrealloc(splitfreqs, + 2*maxbiparts * sizeof(uli) ); + /* 2x: splitfreqs contains also an index (sorting!) */ + if (splitfreqs == NULL) maerror("splitfreqs in makenewsplitentries"); + splitpatterns = (uli *) myrealloc(splitpatterns, + splitlength*maxbiparts * sizeof(uli) ); + if (splitpatterns == NULL) maerror("splitpatterns in makenewsplitentries"); + splitsizes = (int *) myrealloc(splitsizes, + maxbiparts * sizeof(int) ); + if (splitsizes == NULL) maerror("splitsizes in makenewsplitentries"); + } + splitfreqs[2*nextentry] = 1; /* frequency */ + splitfreqs[2*nextentry+1] = nextentry; /* index for sorting */ + for (i = 0; i < splitlength; i++) + splitpatterns[nextentry*splitlength + i] = splitcomp[i]; + splitsizes[nextentry] = bpsize; + nextentry++; + } + } + numbiparts = nextentry; /* the new entries become visible to the next call */ +} + +/* general remarks: + + - every entry in consbiparts is one node of the consensus tree + - for each node one has to know which taxa and which other nodes + are *directly* descending from it + - for every taxon/node number there is a flag that shows + whether it descends from the node or not + - '0' means that neither a taxon nor another node with the + corresponding number decends from the node + '1' means that the corresponding taxon descends from the node + '2' means that the corresponding 
node descends from the node + '3' means that the corresponding taxon and node descends from the node +*/ + +/* copy bipartition n of all different splitpatterns to consbiparts[k]: + the column of a taxon gets the character 1 if its bit is set in the + pattern and the character 0 otherwise */ +void copysplit(uli n, int k) +{ + int i, j, col; + uli z; + + col = 0; + for (i = 0; i < splitlength; i++) { + z = splitpatterns[n*splitlength + i]; + for (j = 0; j < 32 && col < Maxspc; j++) { + if (z & 1) consbiparts[k][col] = '1'; + else consbiparts[k][col] = '0'; + z = (z >> 1); + col++; + } + } +} + +/* compute majority rule consensus tree: + the bipartitions found in more than half of the Numtrial puzzling + steps are copied to consbiparts, one further node is added for the + edge with the root, and the nodes are then nested into each other + in the order of their cluster size */ +void makeconsensus() +{ + int i, j, k, size, subnode; + char chari, charj; + + /* sort bipartition frequencies (pairs of frequency and index, + order defined by ulicmp) */ + qsort(splitfreqs, numbiparts, 2*sizeof(uli), ulicmp); + /* how many bipartitions are included in the consensus tree: + the leading run of entries with 2*frequency above Numtrial; + the loop ends at the first entry that fails this test */ + consincluded = 0; + for (i = 0; i < numbiparts && i == consincluded; i++) { + if (2*splitfreqs[2*i] > Numtrial) consincluded = i + 1; + } + + /* collect all info about majority rule consensus tree */ + /* the +1 is due to the edge with the root */ + consconfid = new_ivector(consincluded + 1); + conssizes = new_ivector(2*consincluded + 2); + consbiparts = new_cmatrix(consincluded + 1, Maxspc); + + for (i = 0; i < consincluded; i++) { + /* copy partition to consbiparts */ + copysplit(splitfreqs[2*i+1], i); + /* frequency in percent (rounded to integer) */ + consconfid[i] = (int) floor(100.0*splitfreqs[2*i]/Numtrial + 0.5); + /* size of partition */ + conssizes[2*i] = splitsizes[splitfreqs[2*i+1]]; + conssizes[2*i+1] = i; + } + for (i = 0; i < Maxspc; i++) consbiparts[consincluded][i] = '1'; + consbiparts[consincluded][outgroup] = '0'; + consconfid[consincluded] = 100; + conssizes[2*consincluded] = Maxspc - 1; + conssizes[2*consincluded + 1] = consincluded; + + /* sort bipartitions according to cluster size (pairs of size and + node number, order defined by intcmp) */ + qsort(conssizes, consincluded + 1, 2*sizeof(int), intcmp); + + /* reconstruct consensus tree */ + for (i = 0; i < consincluded; i++) { /* try every node */ + size = conssizes[2*i]; /* size of current 
node */ + for (j = i + 1; j < consincluded + 1; j++) { + + /* compare only with nodes with more descendants */ + if (size == conssizes[2*j]) continue; + + /* check whether node i is a subnode of j */ + subnode = FALSE; + for (k = 0; k < Maxspc && !subnode; k++) { + chari = consbiparts[ conssizes[2*i+1] ][k]; + if (chari != '0') { + charj = consbiparts[ conssizes[2*j+1] ][k]; + if (chari == charj || charj == '3') subnode = TRUE; + } + } + + /* if i is a subnode of j change j accordingly */ + if (subnode) { + /* remove subnode i from j: every flag set in i is taken + out of the corresponding flag of j */ + for (k = 0; k < Maxspc; k++) { + chari = consbiparts[ conssizes[2*i+1] ][k]; + if (chari != '0') { + charj = consbiparts[ conssizes[2*j+1] ][k]; + if (chari == charj) + consbiparts[ conssizes[2*j+1] ][k] = '0'; + else if (charj == '3') { + if (chari == '1') + consbiparts[ conssizes[2*j+1] ][k] = '2'; + else if (chari == '2') + consbiparts[ conssizes[2*j+1] ][k] = '1'; + else { + /* Consensus tree [1]: the flag of i is neither 1 nor 2 */ + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR H TO DEVELOPERS\n\n\n"); + exit(1); + } + } else { + /* Consensus tree [2]: the flag of j neither equals that of i nor is it 3 */ + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR I TO DEVELOPERS\n\n\n"); + exit(1); + } + } + } + /* add link to subnode i in node j */ + charj = consbiparts[ conssizes[2*j+1] ][ conssizes[2*i+1] ]; + if (charj == '0') + consbiparts[ conssizes[2*j+1] ][ conssizes[2*i+1] ] = '2'; + else if (charj == '1') + consbiparts[ conssizes[2*j+1] ][ conssizes[2*i+1] ] = '3'; + else { + /* Consensus tree [3]: j already carries a node flag in this column */ + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR J TO DEVELOPERS\n\n\n"); + exit(1); + } + } + } + } +} + +/* prototype for recursion */ +void writenode(FILE *treefile, int node); + +/* write node (writeconsensustree): + the node is written as a parenthesized, comma separated list - first + its descending nodes (recursively, each followed by its value from + consconfid), then its descending taxa; column is advanced with the + output and a line break is written once it exceeds 60 */ +void writenode(FILE *treefile, int node) +{ + int i, first; + + fprintf(treefile, "("); + column++; + /* write descending nodes */ + first = TRUE; + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] == '2' || + consbiparts[node][i] == '3') { + if (first) first = FALSE; + 
else { + fprintf(treefile, ","); + column++; + } + if (column > 60) { + column = 2; + fprintf(treefile, "\n"); + } + /* write node i */ + writenode(treefile, i); + + /* reliability value as internal label */ + fprintf(treefile, "%d", consconfid[i]); + + column = column + 3; + } + } + /* write descending taxa */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] == '1' || + consbiparts[node][i] == '3') { + if (first) first = FALSE; + else { + fprintf(treefile, ","); + column++; + } + if (column > 60) { + column = 2; + fprintf(treefile, "\n"); + } + column += fputid(treefile, i); + } + } + fprintf(treefile, ")"); + column++; +} + +/* write consensus tree: + the outgroup is written first, followed by the descending nodes and + the descending taxa of the root entry consbiparts[consincluded]; + the tree is closed with a parenthesis, a semicolon and a newline */ +void writeconsensustree(FILE *treefile) +{ + int i, first; + + column = 1; + fprintf(treefile, "("); + column += fputid(treefile, outgroup) + 2; + fprintf(treefile, ","); + /* write descending nodes */ + first = TRUE; + for (i = 0; i < Maxspc; i++) { + if (consbiparts[consincluded][i] == '2' || + consbiparts[consincluded][i] == '3') { + if (first) first = FALSE; + else { + fprintf(treefile, ","); + column++; + } + if (column > 60) { + column = 2; + fprintf(treefile, "\n"); + } + /* write node i */ + writenode(treefile, i); + + /* reliability value as internal label + (column is advanced by a fixed 3 for it) */ + fprintf(treefile, "%d", consconfid[i]); + + column = column + 3; + } + } + /* write descending taxa */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[consincluded][i] == '1' || + consbiparts[consincluded][i] == '3') { + if (first) first = FALSE; + else { + fprintf(treefile, ","); + column++; + } + if (column > 60) { + column = 2; + fprintf(treefile, "\n"); + } + column += fputid(treefile, i); + } + } + fprintf(treefile, ");\n"); +} + +/* prototype for recursion */ +void nodecoordinates(int node); + +/* establish node coordinates (plotconsensustree): + the descending nodes are placed first, then the taxa of this node get + the next y positions from ytaxcounter (which decreases by 2 each), + and finally the node itself is placed in the middle of its y range, + 4 columns beyond the largest x of its descending nodes + (at 13 if it has none) */ +void nodecoordinates(int node) +{ + int i, ymin, ymax, xcoordinate; + + /* first establish coordinates of descending nodes */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] 
== '2' || + consbiparts[node][i] == '3') + nodecoordinates(i); + } + + /* then establish coordinates of descending taxa */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] == '1' || + consbiparts[node][i] == '3') { + /* y-coordinate of taxon i */ + ycortax[i] = ytaxcounter; + ytaxcounter = ytaxcounter - 2; + } + } + + /* then establish coordinates of this node: y range over all descending + nodes and taxa, x from the largest x of the descending nodes */ + ymin = 2*Maxspc - 2; + ymax = 0; + xcoordinate = 0; + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] == '2' || + consbiparts[node][i] == '3') { + if (ycor[i] > ymax) ymax = ycor[i]; + if (ycor[i] < ymin) ymin = ycor[i]; + if (xcor[i] > xcoordinate) xcoordinate = xcor[i]; + } + } + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] == '1' || + consbiparts[node][i] == '3') { + if (ycortax[i] > ymax) ymax = ycortax[i]; + if (ycortax[i] < ymin) ymin = ycortax[i]; + } + } + ycormax[node] = ymax; + ycormin[node] = ymin; + ycor[node] = (int) floor(0.5*(ymax + ymin) + 0.5); + if (xcoordinate == 0) xcoordinate = 9; /* no descending node raised the x value */ + xcor[node] = xcoordinate + 4; +} + +/* prototype for recursion */ +void drawnode(int node, int xold); + +/* drawnode (plotconsensustree): draws the node and everything descending + from it into treepict; xold is the column at which the edge towards + this node starts (each descending node is drawn with the column of + this node as its xold) */ +void drawnode(int node, int xold) +{ + int i, j; + char buf[4]; /* text form of the consensus value, see sprintf below */ + + /* first draw vertical line */ + for (i = ycormin[node] + 1; i < ycormax[node]; i++) + treepict[xcor[node]][i] = ':'; + + /* then draw descending nodes */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] == '2' || + consbiparts[node][i] == '3') + drawnode(i, xcor[node]); + } + + /* then draw descending taxa: a dashed line up to the last 10 columns, + which receive the first 10 characters of Identif[i] */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[node][i] == '1' || + consbiparts[node][i] == '3') { + treepict[xcor[node]][ycortax[i]] = ':'; + for (j = xcor[node] + 1; j < xsize-10; j++) + treepict[j][ycortax[i]] = '-'; + for (j = 0; j < 10; j++) + treepict[xsize-10+j][ycortax[i]] = Identif[i][j]; + } + } + + /* then draw internal edge with consensus value */ + treepict[xold][ycor[node]] = ':'; + treepict[xcor[node]][ycor[node]] = ':'; + for (i = 
xold + 1; i < xcor[node]-3; i++) + treepict[i][ycor[node]] = '-'; + sprintf(buf, "%d", consconfid[node]); + if (consconfid[node] == 100) { /* all three digits are copied */ + treepict[xcor[node]-3][ycor[node]] = buf[0]; + treepict[xcor[node]-2][ycor[node]] = buf[1]; + treepict[xcor[node]-1][ycor[node]] = buf[2]; + } else { /* two characters are copied behind a dash; NOTE(review): this presumes a two digit value */ + treepict[xcor[node]-3][ycor[node]] = '-'; + treepict[xcor[node]-2][ycor[node]] = buf[0]; + treepict[xcor[node]-1][ycor[node]] = buf[1]; + } +} + +/* plot consensus tree as a character picture written to plotfp: + the coordinates of all nodes and taxa are established first, the + picture is then drawn into treepict and written out row by row, + and finally all work arrays are freed */ +void plotconsensustree(FILE *plotfp) +{ + int i, j, yroot, startree; + + /* star tree or no star tree */ + if (consincluded == 0) { + startree = TRUE; + consincluded = 1; /* avoids problems with malloc */ + } else + startree = FALSE; + + /* memory for x-y-coordinates of each bipartition */ + xcor = new_ivector(consincluded); + ycor = new_ivector(consincluded); + ycormax = new_ivector(consincluded); + ycormin = new_ivector(consincluded); + if (startree) consincluded = 0; /* avoids problems with malloc */ + + /* y-coordinates of each taxon */ + ycortax = new_ivector(Maxspc); + ycortax[outgroup] = 0; + + /* establish coordinates; the first taxon is placed at the top */ + ytaxcounter = 2*Maxspc - 2; + + /* first establish coordinates of descending nodes */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[consincluded][i] == '2' || + consbiparts[consincluded][i] == '3') + nodecoordinates(i); + } + + /* then establish coordinates of descending taxa */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[consincluded][i] == '1' || + consbiparts[consincluded][i] == '3') { + /* y-coordinate of taxon i */ + ycortax[i] = ytaxcounter; + ytaxcounter = ytaxcounter - 2; + } + } + + /* then establish length of root edge and size of whole tree */ + yroot = 0; + xsize = 0; + for (i = 0; i < Maxspc; i++) { + if (consbiparts[consincluded][i] == '2' || + consbiparts[consincluded][i] == '3') { + if (ycor[i] > yroot) yroot = ycor[i]; + if (xcor[i] > xsize) xsize = xcor[i]; + } + } + for (i = 0; i < Maxspc; i++) { + if (consbiparts[consincluded][i] == '1' || + 
consbiparts[consincluded][i] == '3') { + if (ycortax[i] > yroot) yroot = ycortax[i]; + } + } + if (xsize == 0) xsize = 9; /* no descending node raised the x value */ + /* size in x direction inclusive one blank on the left */ + xsize = xsize + 6; + + /* change all x-labels so that (0,0) is down-left */ + for (i = 0; i < consincluded; i++) + xcor[i] = xsize-1-xcor[i]; + + /* draw tree: blank picture of xsize columns by 2*Maxspc-1 rows */ + treepict = new_cmatrix(xsize, 2*Maxspc-1); + for (i = 0; i < xsize; i++) + for (j = 0; j < 2*Maxspc-1; j++) + treepict[i][j] = ' '; + + /* draw root: vertical line in column 1 below yroot, and the outgroup + on row 0 with its identifier in the last 10 columns */ + for (i = 1; i < yroot; i++) + treepict[1][i] = ':'; + treepict[1][0] = ':'; + for (i = 2; i < xsize - 10; i++) + treepict[i][0] = '-'; + for (i = 0; i < 10; i++) + treepict[xsize-10+i][0] = Identif[outgroup][i]; + + /* then draw descending nodes */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[consincluded][i] == '2' || + consbiparts[consincluded][i] == '3') + drawnode(i, 1); + } + + /* then draw descending taxa */ + for (i = 0; i < Maxspc; i++) { + if (consbiparts[consincluded][i] == '1' || + consbiparts[consincluded][i] == '3') { + treepict[1][ycortax[i]] = ':'; + for (j = 2; j < xsize-10; j++) + treepict[j][ycortax[i]] = '-'; + for (j = 0; j < 10; j++) + treepict[xsize-10+j][ycortax[i]] = Identif[i][j]; + } + } + + /* plot tree: rows are written from the highest y down to 0 */ + for (i = 2*Maxspc-2; i > -1; i--) { + for (j = 0; j < xsize; j++) + fputc(treepict[j][i], plotfp); + fputc('\n', plotfp); + } + + free_ivector(xcor); + free_ivector(ycor); + free_ivector(ycormax); + free_ivector(ycormin); + free_ivector(ycortax); + free_cmatrix(treepict); +} + + + +/******************************************************************************/ +/* storing and evaluating quartet branching information */ +/******************************************************************************/ + +/* general remarks: + + for a quartet with the taxa a, b, c, d there are + three possible binary trees: + + 1) (a,b)-(c,d) + 2) (a,c)-(b,d) + 3) (a,d)-(b,c) + + For every quartet information about its branching structure is + stored. 
With the functions readquartet and writequartet + this information can be accessed. For every quartet (a,b,c,d) + with a < b < c < d (taxa) the branching information is encoded + using 4 bits: + + value 8 4 2 1 + +-------------+-------------+-------------+-------------+ + | not used | tree 3 | tree 2 | tree 1 | + +-------------+-------------+-------------+-------------+ + + If the branching structure of the taxa corresponds to one of the + three trees the corresponding bit is set. If the branching structure + is unclear because two of the three trees have the same maximum + likelihood value the corresponding two bits are set. If the branching + structure is completely unknown all the bits are set (the highest + bit is always cleared because it is not used). + +*/ + +/* allocate memory for quartets: + sets the global Numquartets to taxa*(taxa-1)*(taxa-2)*(taxa-3)/24 and + returns a zeroed array with one byte for every two quartets + (rounded up); maerror is called if the allocation fails */ +unsigned char *mallocquartets(int taxa) +{ + uli nc, numch; + unsigned char *qinfo; + + /* compute number of quartets */ + Numquartets = (uli) taxa*(taxa-1)*(taxa-2)*(taxa-3)/24; + if (Numquartets % 2 == 0) { /* even number */ + numch = Numquartets/2; + } else { /* odd number */ + numch = (Numquartets + 1)/2; + } + /* allocate memory */ + qinfo = (unsigned char *) malloc(numch * sizeof(unsigned char) ); + if (qinfo == NULL) maerror("quartetinfo in mallocquartets"); + for (nc = 0; nc < numch; nc++) qinfo[nc] = 0; + return(qinfo); +} + +/* free quartet memory (the global quartetinfo array) */ +void freequartets() +{ + free(quartetinfo); +} + +/* read quartet info - a < b < c < d: + the number of the quartet is computed from the four taxa; a quartet + with an even number is kept in the low 4 bits and one with an odd + number in the high 4 bits of the byte it shares with its neighbour; + the 4 bit value is returned */ +unsigned char readquartet(int a, int b, int c, int d) +{ + uli qnum; + + qnum = (uli) a + + (uli) b*(b-1)/2 + + (uli) c*(c-1)*(c-2)/6 + + (uli) d*(d-1)*(d-2)*(d-3)/24; + if (qnum % 2 == 0) { /* even number */ + /* bits 0 to 3 */ + return (quartetinfo[qnum/2] & (unsigned char) 15); + } else { /* odd number */ + /* bits 4 to 7 */ + return ((quartetinfo[(qnum-1)/2] & (unsigned char) 240)>>4); + } +} + +/* write quartet info - a < b < c < d, 0 <= info <= 15 */ +void writequartet(int a, int b, int c, int d, unsigned char info) 
+{ + uli qnum; + + qnum = (uli) a + + (uli) b*(b-1)/2 + + (uli) c*(c-1)*(c-2)/6 + + (uli) d*(d-1)*(d-2)*(d-3)/24; + if (qnum % 2 == 0) { /* even number */ + /* bits 0 to 3: the high 4 bits of the byte are kept, the low 4 replaced */ + quartetinfo[qnum/2] = + ((quartetinfo[qnum/2] & (unsigned char) 240) | + (info & (unsigned char) 15)); + } else { /* odd number */ + /* bits 4 to 7: the low 4 bits of the byte are kept, the high 4 replaced */ + quartetinfo[(qnum-1)/2] = + ((quartetinfo[(qnum-1)/2] & (unsigned char) 15) | + ((info & (unsigned char) 15)<<4)); + } +} + +/* prototypes */ +void openfiletowrite(FILE **, char[], char[]); +void closefile(FILE *); + +/* sorts three doubles in descending order: + num itself is not changed; order[0], order[1] and order[2] receive + the indices of the largest, the middle and the smallest of the three + values num[0], num[1], num[2] */ +void sort3doubles(dvector num, ivector order) +{ + if (num[0] > num[1]) { + if(num[2] > num[0]) { + order[0] = 2; + order[1] = 0; + order[2] = 1; + } else if (num[2] < num[1]) { + order[0] = 0; + order[1] = 1; + order[2] = 2; + } else { + order[0] = 0; + order[1] = 2; + order[2] = 1; + } + } else { + if(num[2] > num[1]) { + order[0] = 2; + order[1] = 1; + order[2] = 0; + } else if (num[2] < num[0]) { + order[0] = 1; + order[1] = 0; + order[2] = 2; + } else { + order[0] = 1; + order[1] = 2; + order[2] = 0; + } + } +} + +/* checks out all possible quartets */ +void computeallquartets() +{ + double onethird; + uli nq; + unsigned char treebits[3]; + FILE *lhfp; +# if ! 
PARALLEL + int a, b, c, i; + double qc2, mintogo, minutes, hours, temp; + double temp1, temp2, temp3; + unsigned char discreteweight[3]; +# endif + + onethird = 1.0/3.0; + treebits[0] = (unsigned char) 1; + treebits[1] = (unsigned char) 2; + treebits[2] = (unsigned char) 4; + + if (show_optn) { /* list all unresolved quartets */ + openfiletowrite(&unresfp, UNRESOLVED, "unresolved quartet trees"); + fprintf(unresfp, "List of all completely unresolved quartets:\n\n"); + } + + nq = 0; + badqs = 0; + + /* start timer - percentage of completed quartets */ + time(&time0); + time1 = time0; + mflag = 0; + +# if PARALLEL + { + schedtype sched; + int flag; + MPI_Status stat; + int dest = 1; + uli qaddr =0; + uli qamount=0; + int qblocksent = 0; + int apr; + uli sq, noq; + initsched(&sched, numquarts(Maxspc), PP_NumProcs-1, 4); + qamount=sgss(&sched); + while (qamount > 0) { + if (PP_emptyslave()) { + PP_RecvQuartBlock(0, &sq, &noq, quartetinfo, &apr); + qblocksent -= noq; + } + dest = PP_getslave(); + PP_SendDoQuartBlock(dest, qaddr, qamount, (approxqp ? 
APPROX : EXACT)); + qblocksent += qamount; + qaddr += qamount; + qamount=sgss(&sched); + + MPI_Iprobe(MPI_ANY_SOURCE, PP_QUARTBLOCKSPECS, PP_Comm, &flag, &stat); + while (flag) { + PP_RecvQuartBlock(0, &sq, &noq, quartetinfo, &apr); + qblocksent -= noq; + MPI_Iprobe(MPI_ANY_SOURCE, PP_QUARTBLOCKSPECS, PP_Comm, &flag, &stat); + } + } + while (qblocksent > 0) { + PP_RecvQuartBlock(0, &sq, &noq, quartetinfo, &apr); + qblocksent -= noq; + } + } +# else /* PARALLEL */ + + addtimes(GENERAL, &tarr); + if (savequartlh_optn) { + openfiletowrite(&lhfp, ALLQUARTLH, "all quartet likelihoods"); + if (saveqlhbin_optn) writetpqfheader(Maxspc, lhfp, 3); + else writetpqfheader(Maxspc, lhfp, 4); + } + + for (i = 3; i < Maxspc; i++) + for (c = 2; c < i; c++) + for (b = 1; b < c; b++) + for (a = 0; a < b; a++) { + nq++; + + /* generate message every 15 minutes */ + /* check timer */ + time(&time2); + if ( (time2 - time1) > 900) { + /* every 900 seconds */ + /* percentage of completed quartets */ + if (mflag == 0) { + FPRINTF(STDOUTFILE "\n"); + mflag = 1; + } + qc2 = 100.*nq/Numquartets; + mintogo = (100.0-qc2) * + (double) (time2-time0)/60.0/qc2; + hours = floor(mintogo/60.0); + minutes = mintogo - 60.0*hours; + FPRINTF(STDOUTFILE "%.2f%%", qc2); + FPRINTF(STDOUTFILE " completed (remaining"); + FPRINTF(STDOUTFILE " time: %.0f", hours); + FPRINTF(STDOUTFILE " hours %.0f", minutes); + FPRINTF(STDOUTFILE " minutes)\n"); + fflush(STDOUT); + time1 = time2; + } + + /* maximum likelihood values */ + + /* exact or approximate maximum likelihood values */ + compute_quartlklhds(a,b,c,i,&qweight[0],&qweight[1],&qweight[2], (approxqp ? 
APPROX : EXACT)); + + if (savequartlh_optn) { + if (saveqlhbin_optn) + fwrite(qweight, sizeof(double), 3, lhfp); + else + fprintf(lhfp, "(%d,%d,%d,%d)\t%f\t%f\t%f\n", a, b, c, i, + qweight[0], qweight[1], qweight[2]); + } + + /* sort in descending order */ + sort3doubles(qweight, qworder); + + if (usebestq_optn) { + sqorder[2] = 2; + discreteweight[sqorder[2]] = treebits[qworder[0]]; + if (qweight[qworder[0]] == qweight[qworder[1]]) { + discreteweight[sqorder[2]] = discreteweight[sqorder[2]] || treebits[qworder[1]]; + if (qweight[qworder[1]] == qweight[qworder[2]]) { + discreteweight[sqorder[2]] = discreteweight[sqorder[2]] || treebits[qworder[2]]; + discreteweight[sqorder[2]] = 7; + } + } + } else { + + /* compute Bayesian weights */ + qweight[qworder[1]] = exp(qweight[qworder[1]]-qweight[qworder[0]]); + qweight[qworder[2]] = exp(qweight[qworder[2]]-qweight[qworder[0]]); + qweight[qworder[0]] = 1.0; + temp = qweight[0] + qweight[1] + qweight[2]; + qweight[0] = qweight[0]/temp; + qweight[1] = qweight[1]/temp; + qweight[2] = qweight[2]/temp; + + /* square deviations */ + temp1 = 1.0 - qweight[qworder[0]]; + sqdiff[0] = temp1 * temp1 + + qweight[qworder[1]] * qweight[qworder[1]] + + qweight[qworder[2]] * qweight[qworder[2]]; + discreteweight[0] = treebits[qworder[0]]; + + temp1 = 0.5 - qweight[qworder[0]]; + temp2 = 0.5 - qweight[qworder[1]]; + sqdiff[1] = temp1 * temp1 + temp2 * temp2 + + qweight[qworder[2]] * qweight[qworder[2]]; + discreteweight[1] = treebits[qworder[0]] + treebits[qworder[1]]; + + temp1 = onethird - qweight[qworder[0]]; + temp2 = onethird - qweight[qworder[1]]; + temp3 = onethird - qweight[qworder[2]]; + sqdiff[2] = temp1 * temp1 + temp2 * temp2 + temp3 * temp3; + discreteweight[2] = (unsigned char) 7; + + /* sort in descending order */ + sort3doubles(sqdiff, sqorder); + } + + /* determine best discrete weight */ + writequartet(a, b, c, i, discreteweight[sqorder[2]]); + + /* counting completely unresolved quartets */ + if 
(discreteweight[sqorder[2]] == 7) { + badqs++; + badtaxon[a]++; + badtaxon[b]++; + badtaxon[c]++; + badtaxon[i]++; + if (show_optn) { + fputid10(unresfp, a); + fprintf(unresfp, " "); + fputid10(unresfp, b); + fprintf(unresfp, " "); + fputid10(unresfp, c); + fprintf(unresfp, " "); + fputid(unresfp, i); + fprintf(unresfp, "\n"); + } + } + addtimes(QUARTETS, &tarr); + } + if (savequartlh_optn) { + closefile(lhfp); + } + if (show_optn) + closefile(unresfp); + if (mflag == 1) + FPRINTF(STDOUTFILE "\n"); +# endif /* PARALLEL */ + +} + +/* check the branching structure between the leaves (not the taxa!) + A, B, C, and I (A, B, C, I don't need to be ordered). As a result, + the two leaves that are closer related to each other than to leaf I + are found in chooseA and chooseB. If the branching structure is + not uniquely defined, ChooseA and ChooseB are chosen randomly + from the possible taxa */ +void checkquartet(int A, int B, int C, int I) +{ + int i, j, a, b, taxon[5], leaf[5], ipos; + unsigned char qresult; + int notunique = FALSE; + + /* The relationship between leaves and taxa is defined by trueID */ + taxon[1] = trueID[A]; /* taxon number */ + leaf[1] = A; /* leaf number */ + taxon[2] = trueID[B]; + leaf[2] = B; + taxon[3] = trueID[C]; + leaf[3] = C; + taxon[4] = trueID[I]; + leaf[4] = I; + + /* sort for taxa */ + /* Source: Numerical Recipes (PIKSR2.C) */ + for (j = 2; j <= 4; j++) { + a = taxon[j]; + b = leaf[j]; + i = j-1; + while (i > 0 && taxon[i] > a) { + taxon[i+1] = taxon[i]; + leaf[i+1] = leaf[i]; + i--; + } + taxon[i+1] = a; + leaf[i+1] = b; + } + + /* where is leaf I ? 
*/ + ipos = 1; + while (leaf[ipos] != I) ipos++; + + /* look at sequence quartet */ + qresult = readquartet(taxon[1], taxon[2], taxon[3], taxon[4]); + + /* chooseA and chooseB */ + do { + switch (qresult) { + + /* one single branching structure */ + + /* 001 */ + case 1: if (ipos == 1 || ipos == 2) { + chooseA = leaf[3]; + chooseB = leaf[4]; + } else { + chooseA = leaf[1]; + chooseB = leaf[2]; + } + notunique = FALSE; + break; + + /* 010 */ + case 2: if (ipos == 1 || ipos == 3) { + chooseA = leaf[2]; + chooseB = leaf[4]; + } else { + chooseA = leaf[1]; + chooseB = leaf[3]; + } + notunique = FALSE; + break; + + /* 100 */ + case 4: if (ipos == 1 || ipos == 4) { + chooseA = leaf[2]; + chooseB = leaf[3]; + } else { + chooseA = leaf[1]; + chooseB = leaf[4]; + } + notunique = FALSE; + break; + + /* two possible branching structures */ + + /* 011 */ + case 3: if (randominteger(2)) qresult = 1; + else qresult = 2; + notunique = TRUE; + break; + + /* 101 */ + case 5: if (randominteger(2)) qresult = 1; + else qresult = 4; + notunique = TRUE; + break; + + /* 110 */ + case 6: if (randominteger(2)) qresult = 2; + else qresult = 4; + notunique = TRUE; + break; + + /* three possible branching structures */ + + /* 111 */ + case 7: qresult = (1 << randominteger(3)); /* 1, 2, or 4 */ + notunique = TRUE; + break; + + default: /* Program error [checkquartet] */ +#if PARALLEL + FPRINTF(STDOUTFILE "\n\n\n(%2d)HALT: PLEASE REPORT ERROR K-PARALLEL TO DEVELOPERS (%d,%d,%d,%d) = %ld\n\n\n", + PP_Myid, taxon[1], taxon[2], taxon[3], taxon[4], + quart2num(taxon[1], taxon[2], taxon[3], taxon[4])); +#else + FPRINTF(STDOUTFILE "\n\n\nHALT: PLEASE REPORT ERROR K TO DEVELOPERS\n\n\n"); +#endif + + } + } while (notunique); + + return; +} + diff --git a/forester/archive/RIO/others/puzzle_mod/src/sched.c b/forester/archive/RIO/others/puzzle_mod/src/sched.c new file mode 100644 index 0000000..3f1c0f6 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/sched.c @@ -0,0 +1,423 @@ +/* + * 
sched.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +#include +#include +#include +#include "sched.h" +/* #include "ppuzzle.h" */ + +#define STDOUT stdout +#ifndef PARALLEL /* because printf() runs significantly faster */ + /* than fprintf(stdout) on an Apple McIntosh */ + /* (HS) */ +# define FPRINTF printf +# define STDOUTFILE +#else +# define FPRINTF fprintf +# define STDOUTFILE STDOUT, +#endif + +int scinit; +int ssinit; +int fscinit; +int gssinit; +int tssinit; + +int n, chunksize; +int p; + +#ifdef SCHEDTEST + schedtype testsched; +#endif + +void printsched(schedtype sch) +{ + FPRINTF(STDOUTFILE "Current scheduling status:\n"); + FPRINTF(STDOUTFILE " truetasks=%5ld - alltasks=%5ld - numtasks=%5ld - numprocs=%5d\n", + sch.truetasks, sch.alltasks, sch.numtasks, sch.numprocs); + FPRINTF(STDOUTFILE " delta =%5d - overhead=%5d - rest =%5d - inited =%5d\n", + sch.delta, sch.overhead, sch.rest, sch.inited); + FPRINTF(STDOUTFILE " nconst =%5d - fconst =%5f - lconst =%5f - kconst =%5f\n", + sch.nconst, sch.fconst, sch.lconst, sch.kconst); +} + +void initsched(schedtype *sch, uli tasks, int procs, uli minchunk) +{ + if (minchunk < 1) minchunk = 1; + (*sch).minchunk = minchunk; + (*sch).truetasks = tasks; + (*sch).rest = (int)((*sch).truetasks % (*sch).minchunk); + (*sch).alltasks = (tasks - (*sch).rest); + (*sch).numtasks = (*sch).alltasks; + (*sch).numprocs = procs; + (*sch).delta = 0; + (*sch).overhead = 0; + (*sch).nconst = 0; + (*sch).fconst = 0; + (*sch).lconst = 0; + (*sch).kconst = 0; + (*sch).inited = 0; + +# ifdef PVERBOSE1 + printsched(*sch); +# endif /* PVERBOSE1 */ +} + +/************************************** +* Static Chunking 
+**************************************/ +uli sc(schedtype *sch) +{ + uli tmp; + + if ((*sch).inited == 0) { + (*sch).overhead = (*sch).alltasks % (*sch).numprocs; + (*sch).delta = ((*sch).alltasks - (*sch).overhead) / (*sch).numprocs; + (*sch).inited ++; + } + + if (!(*sch).overhead) { + if ((*sch).numtasks >= (*sch).delta) + tmp = (uli)(*sch).delta; + else + tmp = 0; + } else { + if ((*sch).numtasks >= ((*sch).delta + 1)) { + tmp = (uli)(*sch).delta + 1; + (*sch).overhead--; + } else + tmp = 0; + } + + /* correction */ + if ((tmp % (*sch).minchunk) > 0) { + tmp += (*sch).minchunk - (tmp % (*sch).minchunk); + } + + (*sch).numtasks -= tmp; + + if ((*sch).numtasks == 0) { + tmp += (uli)(*sch).rest; + (*sch).rest = 0; + } + return tmp; +} /* SC */ + + +/************************************** +* Self Scheduling +**************************************/ +uli ss(schedtype *sch) +{ + uli tmp; + + if ((*sch).inited == 0) { + (*sch).inited ++; + } + + if ((*sch).numtasks >= 1) + tmp = 1; + else + tmp = (*sch).numtasks; + + /* correction */ + if ((tmp % (*sch).minchunk) > 0) { + tmp += (*sch).minchunk - (tmp % (*sch).minchunk); + } + + (*sch).numtasks -= tmp; + + if ((*sch).numtasks == 0) { + tmp += (uli)(*sch).rest; + (*sch).rest = 0; + } + + return tmp; +} /* SS */ + + +/************************************** +* fixed-size chunking +**************************************/ +int fsc() +{ + static int R ; + static int delta ; + static int overhead; + + int tmp; + + if (fscinit == 0) { + R = n; + overhead = n % p; + delta = (n - overhead) / p; + fscinit ++; + } + + if (!overhead) { + if (R >= delta) + tmp = delta; + else + tmp = 0; + } else { + if (R >= (delta + 1)) { + tmp = delta + 1; + overhead--; + } else + tmp = 0; + } + + R -= tmp; + return tmp; +} /* FSC */ + + +/************************************** +* Guided Self Scheduling +**************************************/ +uli gss(schedtype *sch) +{ + uli tmp; + + if ((*sch).inited == 0) { + (*sch).inited ++; + } + + if 
((*sch).numtasks >= 1) { + tmp = (uli)ceil((*sch).numtasks / (*sch).numprocs); + if (tmp == 0) tmp = 1; + } else + tmp = 0; + + /* correction */ + if ((tmp % (*sch).minchunk) > 0) { + tmp += (*sch).minchunk - (tmp % (*sch).minchunk); + } + + (*sch).numtasks -= tmp; + + if ((*sch).numtasks == 0) { + tmp += (uli)(*sch).rest; + (*sch).rest = 0; + } + return tmp; +} /* GSS */ + +/************************************** +* Smooth Guided Self Scheduling +**************************************/ +uli sgss(schedtype *sch) +{ + uli tmp; + + if ((*sch).inited == 0) { + (*sch).inited ++; + } + + if ((*sch).numtasks >= 1) { + tmp = (uli)ceil(((*sch).numtasks / (*sch).numprocs) / 2); + if (tmp == 0) tmp = 1; + } else + tmp = 0; + + /* correction */ + if ((tmp % (*sch).minchunk) > 0) { + tmp += (*sch).minchunk - (tmp % (*sch).minchunk); + } + + (*sch).numtasks -= tmp; + + if ((*sch).numtasks == 0) { + tmp += (uli)(*sch).rest; + (*sch).rest = 0; + } + return tmp; +} /* SGSS */ + + +/************************************** +* Trapezoid Self Scheduling +**************************************/ +uli tss(schedtype *sch) +{ + uli tmp; + + if ((*sch).inited == 0) { + (*sch).fconst = ceil((*sch).numtasks / (2*(*sch).numprocs)); + if ((*sch).fconst == 0) (*sch).fconst = 1; + (*sch).lconst = 1; + (*sch).nconst = ceil( (2*n) / ((*sch).fconst + (*sch).lconst) ); + (*sch).ddelta = (((*sch).fconst - (*sch).lconst) / ((*sch).nconst - 1)); + (*sch).kconst = (*sch).fconst; + FPRINTF(STDOUTFILE "f = n/2p = %.2f ; l = %.2f\n", (*sch).fconst, (*sch).lconst); + FPRINTF(STDOUTFILE "N = 2n/(f+l) = %d ; delta = (f-l)/(N-1) = %.2f\n", (*sch).nconst, (*sch).ddelta); + (*sch).inited ++; + } + + if ((*sch).kconst <= (double) (*sch).numtasks) { + tmp = (uli)ceil((*sch).kconst); + (*sch).kconst -= (*sch).ddelta; + } else { + tmp = (uli)(*sch).numtasks; + (*sch).kconst = 0.0; + } + + /* correction */ + if ((tmp % (*sch).minchunk) > 0) { + tmp += (*sch).minchunk - (tmp % (*sch).minchunk); + } + + (*sch).numtasks 
-= tmp; + + if ((*sch).numtasks == 0) { + tmp += (uli)(*sch).rest; + (*sch).rest = 0; + } + return tmp; + +} /* TSS */ + + +/******************/ + + +#ifdef SCHEDTEST + uli numquarts(int maxspc) + { + uli tmp; + int a, b, c, d; + + if (maxspc < 4) + return (uli)0; + else { + maxspc--; + a = maxspc-3; + b = maxspc-2; + c = maxspc-1; + d = maxspc; + + tmp = (uli) 1 + a + + (uli) b * (b-1) / 2 + + (uli) c * (c-1) * (c-2) / 6 + + (uli) d * (d-1) * (d-2) * (d-3) / 24; + return (tmp); + } + } /* numquarts */ +#endif + + + + +/************************************** +* main +**************************************/ +#ifdef SCHEDTEST +int main(int argc, char *argv[]) +{ + int tcount, + count, + lastsize, + size; + if ((argc > 4) || (argc < 3)) { + FPRINTF(STDOUTFILE "\n\n Usage: %s <# species> <# processors> []\n\n", argv[0]); + exit(1); + } + + chunksize = 1; + + switch(argc) { + case 4: + chunksize = atoi(argv[3]); + case 3: + n = numquarts(atoi(argv[1])); + p = atoi(argv[2]); + } + + FPRINTF(STDOUTFILE "proc=%6d\n", p); + FPRINTF(STDOUTFILE "task=%6d\n", n); + + initsched(&testsched, n, p, chunksize); + printsched(testsched); + + count=1; tcount = 0; + FPRINTF(STDOUTFILE "\n\n---------------------------\n"); + FPRINTF(STDOUTFILE "SC(sched) - Static Chunking\n"); + FPRINTF(STDOUTFILE "---------------------------\n\n"); + do { size = sc(&testsched); + if (size > 0) {FPRINTF(STDOUTFILE "%6d. chunk = %6d %c\n", count++, size , (size%chunksize) ? '!' : ' '); + tcount+=size;} + else FPRINTF(STDOUTFILE "%d tasks in %d chunks\n", tcount, (count-1)); + } while (size > 0); + + + initsched(&testsched, n, p, chunksize); + printsched(testsched); + + count=1; tcount = 0; + FPRINTF(STDOUTFILE "\n\n---------------------------\n"); + FPRINTF(STDOUTFILE "SS(sched) - Self Scheduling\n"); + FPRINTF(STDOUTFILE "---------------------------\n\n"); + do { size = ss(&testsched); + if (size > 0) {if (count==1) FPRINTF(STDOUTFILE "%6d. chunk = %6d %c\n", count++, size , (size%chunksize) ? '!' 
: ' '); + count++; + tcount+=size; + lastsize = size;} + else {FPRINTF(STDOUTFILE " ...\n"); + FPRINTF(STDOUTFILE "%6d. chunk = %6d %c\n", count++, lastsize , (lastsize%chunksize) ? '!' : ' '); + FPRINTF(STDOUTFILE "%d tasks in %d chunks\n", tcount, (count-1));} + } while (size > 0); + + +/**/ + count=1; tcount = 0; + FPRINTF(STDOUTFILE "\n\n---------------------------\n"); + FPRINTF(STDOUTFILE "FSC() - Fixed-Size Chunking\n"); + FPRINTF(STDOUTFILE "---------------------------\n\n"); + do { size = fsc(); + if (size > 0) {FPRINTF(STDOUTFILE "%6d. chunk = %6d %c\n", count++, size , (size%chunksize) ? '!' : ' '); + tcount+=size;} + else FPRINTF(STDOUTFILE "%d tasks in %d chunks\n", tcount, (count-1)); + } while (size > 0); +/**/ + + initsched(&testsched, n, p, chunksize); + printsched(testsched); + + count=1; tcount = 0; + FPRINTF(STDOUTFILE "\n\n-----------------------------------\n"); + FPRINTF(STDOUTFILE "GSS(sched) - Guided Self Scheduling\n"); + FPRINTF(STDOUTFILE "-----------------------------------\n\n"); + do { size = gss(&testsched); + if (size > 0) {FPRINTF(STDOUTFILE "%6d. chunk = %6d %c\n", count++, size , (size%chunksize) ? '!' : ' '); + tcount+=size;} + else FPRINTF(STDOUTFILE "%d tasks in %d chunks\n", tcount, (count-1)); + } while (size > 0); + + initsched(&testsched, n, p, chunksize); + printsched(testsched); + + count=1; tcount = 0; + FPRINTF(STDOUTFILE "\n\n--------------------------------------\n"); + FPRINTF(STDOUTFILE "TSS(sched) - Trapezoid Self Scheduling\n"); + FPRINTF(STDOUTFILE "--------------------------------------\n\n"); + do { size = tss(&testsched); + if (size > 0) {FPRINTF(STDOUTFILE "%6d. chunk = %6d %c\n", count++, size , (size%chunksize) ? '!' 
: ' '); + tcount+=size;} + else FPRINTF(STDOUTFILE "%d tasks in %d chunks\n", tcount, (count-1)); + } while (size > 0); + return (0); +} +#endif diff --git a/forester/archive/RIO/others/puzzle_mod/src/sched.h b/forester/archive/RIO/others/puzzle_mod/src/sched.h new file mode 100644 index 0000000..e75bdd2 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/sched.h @@ -0,0 +1,53 @@ +/* + * sched.h + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +#ifndef SCHED_H +#define SCHED_H +#ifndef SCHEDTEST +# include "util.h" +#else + typedef unsigned long int uli; +#endif + + +typedef struct sched_t{ + uli truetasks; + uli alltasks; + uli numtasks; + uli minchunk; + int numprocs; + int delta; + double ddelta; + int overhead; + int rest; + int nconst; + double fconst; + double lconst; + double kconst; + int inited; +} schedtype; + +void num2quart(uli qnum, int *a, int *b, int *c, int *d); +uli numquarts(int maxspc); +uli quart2num (int a, int b, int c, int d); + +void printsched(schedtype sch); +void initsched(schedtype *sch, uli tasks, int procs, uli minchunk); +uli sc(schedtype *sch); +uli gss(schedtype *sch); +uli sgss(schedtype *sch); +uli tss(schedtype *sch); + +#endif /* SCHED_H */ diff --git a/forester/archive/RIO/others/puzzle_mod/src/test b/forester/archive/RIO/others/puzzle_mod/src/test new file mode 100644 index 0000000..a680df2 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/test @@ -0,0 +1,19 @@ +CC gcc +LIBS -lm +CFLAGS -g -O2 +DEFS -DPACKAGE=\"tree-puzzle\" -DVERSION=\"5.0\" -DHAVE_LIBM=1 -DSTDC_HEADERS=1 -DHAVE_LIMITS_H=1 +SET_MAKE + +HCC @HCC@ +MPICC +MPCC @MPCC@ + +MPICC +MPILIBS +MPIDEFS +MPICFLAGS + +PCC @PCC@ 
+PLIBS @PLIBS@ +PDEFS @PDEFS@ +PCFLAGS @PCFLAGS@ diff --git a/forester/archive/RIO/others/puzzle_mod/src/test.in b/forester/archive/RIO/others/puzzle_mod/src/test.in new file mode 100644 index 0000000..0dc7ddc --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/test.in @@ -0,0 +1,19 @@ +CC @CC@ +LIBS @LIBS@ +CFLAGS @CFLAGS@ +DEFS @DEFS@ +SET_MAKE @SET_MAKE@ + +HCC @HCC@ +MPICC @MPICC@ +MPCC @MPCC@ + +MPICC @MPICC@ +MPILIBS @MPILIBS@ +MPIDEFS @MPIDEFS@ +MPICFLAGS @MPICFLAGS@ + +PCC @PCC@ +PLIBS @PLIBS@ +PDEFS @PDEFS@ +PCFLAGS @PCFLAGS@ diff --git a/forester/archive/RIO/others/puzzle_mod/src/util.c b/forester/archive/RIO/others/puzzle_mod/src/util.c new file mode 100644 index 0000000..667758b --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/util.c @@ -0,0 +1,748 @@ +/* + * util.c + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. 
+ */ + + +#include "util.h" + +#define STDOUT stdout +#ifndef PARALLEL /* because printf() runs significantly faster */ + /* than fprintf(stdout) on an Apple McIntosh */ + /* (HS) */ +# define FPRINTF printf +# define STDOUTFILE +#else +# define FPRINTF fprintf +# define STDOUTFILE STDOUT, + extern int PP_NumProcs; + extern int PP_Myid; + long int PP_randn; + long int PP_rand; +#endif + + +/* + * memory allocation error handler + */ + +void maerror(char *message) +{ + FPRINTF(STDOUTFILE "\n\n\nUnable to proceed (lack of memory: %s)\n\n", message); + FPRINTF(STDOUTFILE "Hint for Macintosh users:\n"); + FPRINTF(STDOUTFILE "Use the command of the Finder to increase the memory partition!\n\n"); + exit(1); +} + + +/* + * memory allocate double vectors, matrices, and cubes + */ + +dvector new_dvector(int n) +{ + dvector v; + + v = (dvector) malloc((unsigned) (n * sizeof(double))); + if (v == NULL) maerror("step 1 in new_dvector"); + + return v; +} + +dmatrix new_dmatrix(int nrow, int ncol) +{ + int i; + dmatrix m; + + m = (dmatrix) malloc((unsigned) (nrow * sizeof(dvector))); + if (m == NULL) maerror("step 1 in in new_dmatrix"); + + *m = (dvector) malloc((unsigned) (nrow * ncol * sizeof(double))); + if (*m == NULL) maerror("step 2 in in new_dmatrix"); + + for (i = 1; i < nrow; i++) m[i] = m[i-1] + ncol; + + return m; +} + +dcube new_dcube(int ntri, int nrow, int ncol) +{ + int i, j; + dcube c; + + c = (dcube) malloc((unsigned) (ntri * sizeof(dmatrix))); + if (c == NULL) maerror("step 1 in in new_dcube"); + + *c = (dmatrix) malloc((unsigned) (ntri * nrow * sizeof(dvector))); + if (*c == NULL) maerror("step 2 in in new_dcube"); + + **c = (dvector) malloc((unsigned) (ntri * nrow * ncol * sizeof(double))); + if (**c == NULL) maerror("step 3 in in new_dcube"); + + for (j = 1; j < nrow; j++) c[0][j] = c[0][j-1] + ncol; + + for (i = 1; i < ntri; i++) { + c[i] = c[i-1] + nrow; + c[i][0] = c[i-1][0] + nrow * ncol; + for (j = 1; j < nrow; j++) c[i][j] = c[i][j-1] + ncol; + } + + 
return c; +} + +void free_dvector(dvector v) +{ + free((double *) v); +} + +void free_dmatrix(dmatrix m) +{ + free((double *) *m); + free((double *) m); +} + +void free_dcube(dcube c) +{ + free((double *) **c); + free((double *) *c); + free((double *) c); +} + + +/* + * memory allocate char vectors, matrices, and cubes + */ + +cvector new_cvector(int n) +{ + cvector v; + + v = (cvector) malloc((unsigned)n * sizeof(char)); + if (v == NULL) maerror("step1 in new_cvector"); + + return v; +} + +cmatrix new_cmatrix(int nrow, int ncol) +{ + int i; + cmatrix m; + + m = (cmatrix) malloc((unsigned) (nrow * sizeof(cvector))); + if (m == NULL) maerror("step 1 in new_cmatrix"); + + *m = (cvector) malloc((unsigned) (nrow * ncol * sizeof(char))); + if (*m == NULL) maerror("step 2 in new_cmatrix"); + + for (i = 1; i < nrow; i++) m[i] = m[i-1] + ncol; + + return m; +} + +ccube new_ccube(int ntri, int nrow, int ncol) +{ + int i, j; + ccube c; + + c = (ccube) malloc((unsigned) (ntri * sizeof(cmatrix))); + if (c == NULL) maerror("step 1 in new_ccube"); + + *c = (cmatrix) malloc((unsigned) (ntri * nrow * sizeof(cvector))); + if (*c == NULL) maerror("step 2 in new_ccube"); + + **c = (cvector) malloc((unsigned) (ntri * nrow * ncol * sizeof(char))); + if (**c == NULL) maerror("step 3 in new_ccube"); + + for (j = 1; j < nrow; j++) c[0][j] = c[0][j-1] + ncol; + + for (i = 1; i < ntri; i++) { + c[i] = c[i-1] + nrow; + c[i][0] = c[i-1][0] + nrow * ncol; + for (j = 1; j < nrow; j++) c[i][j] = c[i][j-1] + ncol; + } + + return c; +} + +void free_cvector(cvector v) +{ + free((char *) v); +} + +void free_cmatrix(cmatrix m) +{ + free((char *) *m); + free((char *) m); +} + +void free_ccube(ccube c) +{ + free((char *) **c); + free((char *) *c); + free((char *) c); +} + + +/* + * memory allocate int vectors, matrices, and cubes + */ + +ivector new_ivector(int n) +{ + ivector v; + + v = (ivector) malloc((unsigned) (n * sizeof(int))); + if (v == NULL) maerror("step 1 in new_ivector"); + + return v; +} 
+ +imatrix new_imatrix(int nrow, int ncol) +{ + int i; + imatrix m; + + m = (imatrix) malloc((unsigned) (nrow * sizeof(ivector))); + if (m == NULL) maerror("step 1 in new_imatrix"); + + *m = (ivector) malloc((unsigned) (nrow * ncol * sizeof(int))); + if (*m == NULL) maerror("step 2 in new_imatrix"); + + for (i = 1; i < nrow; i++) m[i] = m[i-1] + ncol; + + return m; +} + +icube new_icube(int ntri, int nrow, int ncol) +{ + int i, j; + icube c; + + c = (icube) malloc((unsigned) (ntri * sizeof(imatrix))); + if (c == NULL) maerror("step 1 in new_icube"); + + *c = (imatrix) malloc((unsigned) (ntri * nrow * sizeof(ivector))); + if (*c == NULL) maerror("step 2 in new_icube"); + + **c = (ivector) malloc((unsigned) (ntri * nrow * ncol * sizeof(int))); + if (**c == NULL) maerror("step 3 in new_icube"); + + for (j = 1; j < nrow; j++) c[0][j] = c[0][j-1] + ncol; + + for (i = 1; i < ntri; i++) { + c[i] = c[i-1] + nrow; + c[i][0] = c[i-1][0] + nrow * ncol; + for (j = 1; j < nrow; j++) c[i][j] = c[i][j-1] + ncol; + } + + return c; +} + +void free_ivector(ivector v) +{ + free((int *) v); +} + +void free_imatrix(imatrix m) +{ + free((int *) *m); + free((int *) m); +} + +void free_icube(icube c) +{ + free((int *) **c); + free((int *) *c); + free((int *) c); +} + + +/* + * memory allocate uli vectors, matrices, and cubes + */ + +ulivector new_ulivector(int n) +{ + ulivector v; + + v = (ulivector) malloc((unsigned) (n * sizeof(uli))); + if (v == NULL) maerror("step 1 in new_ulivector"); + + return v; +} + +ulimatrix new_ulimatrix(int nrow, int ncol) +{ + int i; + ulimatrix m; + + m = (ulimatrix) malloc((unsigned) (nrow * sizeof(ulivector))); + if (m == NULL) maerror("step 1 in new_ulimatrix"); + + *m = (ulivector) malloc((unsigned) (nrow * ncol * sizeof(uli))); + if (*m == NULL) maerror("step 2 in new_ulimatrix"); + + for (i = 1; i < nrow; i++) m[i] = m[i-1] + ncol; + + return m; +} + +ulicube new_ulicube(int ntri, int nrow, int ncol) +{ + int i, j; + ulicube c; + + c = (ulicube) 
malloc((unsigned) (ntri * sizeof(ulimatrix))); + if (c == NULL) maerror("step 1 in new_ulicube"); + + *c = (ulimatrix) malloc((unsigned) (ntri * nrow * sizeof(ulivector))); + if (*c == NULL) maerror("step 2 in new_ulicube"); + + **c = (ulivector) malloc((unsigned) (ntri * nrow * ncol * sizeof(uli))); + if (**c == NULL) maerror("step 3 in new_ulicube"); + + for (j = 1; j < nrow; j++) c[0][j] = c[0][j-1] + ncol; + + for (i = 1; i < ntri; i++) { + c[i] = c[i-1] + nrow; + c[i][0] = c[i-1][0] + nrow * ncol; + for (j = 1; j < nrow; j++) c[i][j] = c[i][j-1] + ncol; + } + + return c; +} + +void free_ulivector(ulivector v) +{ + free((uli *) v); +} + +void free_ulimatrix(ulimatrix m) +{ + free((uli *) *m); + free((uli *) m); +} + +void free_ulicube(ulicube c) +{ + free((uli *) **c); + free((uli *) *c); + free((uli *) c); +} + + +/******************************************************************************/ +/* random numbers generator (Numerical recipes) */ +/******************************************************************************/ + +/* definitions */ +#define IM1 2147483563 +#define IM2 2147483399 +#define AM (1.0/IM1) +#define IMM1 (IM1-1) +#define IA1 40014 +#define IA2 40692 +#define IQ1 53668 +#define IQ2 52774 +#define IR1 12211 +#define IR2 3791 +#define NTAB 32 +#define NDIV (1+IMM1/NTAB) +#define EPS 1.2e-7 +#define RNMX (1.0-EPS) + +/* variable */ +long idum; + +double randomunitintervall() +/* Long period (> 2e18) random number generator. Returns a uniform random + deviate between 0.0 and 1.0 (exclusive of endpoint values). 
+ + Source: + Press et al., "Numerical recipes in C", Cambridge University Press, 1992 + (chapter 7 "Random numbers", ran2 random number generator) */ +{ + int j; + long k; + static long idum2=123456789; + static long iy=0; + static long iv[NTAB]; + double temp; + + if (idum <= 0) { + if (-(idum) < 1) + idum=1; + else + idum=-(idum); + idum2=(idum); + for (j=NTAB+7;j>=0;j--) { + k=(idum)/IQ1; + idum=IA1*(idum-k*IQ1)-k*IR1; + if (idum < 0) + idum += IM1; + if (j < NTAB) + iv[j] = idum; + } + iy=iv[0]; + } + k=(idum)/IQ1; + idum=IA1*(idum-k*IQ1)-k*IR1; + if (idum < 0) + idum += IM1; + k=idum2/IQ2; + idum2=IA2*(idum2-k*IQ2)-k*IR2; + if (idum2 < 0) + idum2 += IM2; + j=iy/NDIV; + iy=iv[j]-idum2; + iv[j] = idum; + if (iy < 1) + iy += IMM1; + if ((temp=AM*iy) > RNMX) + return RNMX; + else + return temp; +} + +#undef IM1 +#undef IM2 +#undef AM +#undef IMM1 +#undef IA1 +#undef IA2 +#undef IQ1 +#undef IQ2 +#undef IR1 +#undef IR2 +#undef NTAB +#undef NDIV +#undef EPS +#undef RNMX + +int initrandom(int seed) +{ + srand((unsigned) time(NULL)); + if (seed < 0) + seed = rand(); + idum=-(long) seed; +# ifdef PARALLEL + { + int n; + for (n=0; n= 0.0 ? fabs(a) : -fabs(a)) + +/* Brents method in one dimension */ +double brent(double ax, double bx, double cx, double (*f)(double), double tol, + double *foptx, double *f2optx, double fax, double fbx, double fcx) +{ + int iter; + double a,b,d=0,etemp,fu,fv,fw,fx,p,q,r,tol1,tol2,u,v,w,x,xm; + double xw,wv,vx; + double e=0.0; + + a=(ax < cx ? ax : cx); + b=(ax > cx ? 
ax : cx); + x=bx; + fx=fbx; + if (fax < fcx) { + w=ax; + fw=fax; + v=cx; + fv=fcx; + } else { + w=cx; + fw=fcx; + v=ax; + fv=fax; + } + for (iter=1;iter<=ITMAX;iter++) { + xm=0.5*(a+b); + tol2=2.0*(tol1=tol*fabs(x)+ZEPS); + if (fabs(x-xm) <= (tol2-0.5*(b-a))) { + *foptx = fx; + xw = x-w; + wv = w-v; + vx = v-x; + *f2optx = 2.0*(fv*xw + fx*wv + fw*vx)/ + (v*v*xw + x*x*wv + w*w*vx); + return x; + } + if (fabs(e) > tol1) { + r=(x-w)*(fx-fv); + q=(x-v)*(fx-fw); + p=(x-v)*q-(x-w)*r; + q=2.0*(q-r); + if (q > 0.0) p = -p; + q=fabs(q); + etemp=e; + e=d; + if (fabs(p) >= fabs(0.5*q*etemp) || p <= q*(a-x) || p >= q*(b-x)) + d=CGOLD*(e=(x >= xm ? a-x : b-x)); + else { + d=p/q; + u=x+d; + if (u-a < tol2 || b-u < tol2) + d=SIGN(tol1,xm-x); + } + } else { + d=CGOLD*(e=(x >= xm ? a-x : b-x)); + } + u=(fabs(d) >= tol1 ? x+d : x+SIGN(tol1,d)); + fu=(*f)(u); + if (fu <= fx) { + if (u >= x) a=x; else b=x; + SHFT(v,w,x,u) + SHFT(fv,fw,fx,fu) + } else { + if (u < x) a=u; else b=u; + if (fu <= fw || w == x) { + v=w; + w=u; + fv=fw; + fw=fu; + } else if (fu <= fv || v == x || v == w) { + v=u; + fv=fu; + } + } + } + *foptx = fx; + xw = x-w; + wv = w-v; + vx = v-x; + *f2optx = 2.0*(fv*xw + fx*wv + fw*vx)/ + (v*v*xw + x*x*wv + w*w*vx); + return x; +} +#undef ITMAX +#undef CGOLD +#undef ZEPS +#undef SHFT +#undef SIGN +#undef GOLD +#undef GLIMIT +#undef TINY + +/* one-dimensional minimization - as input a lower and an upper limit and a trial + value for the minimum is needed: xmin < xguess < xmax + the function and a fractional tolerance has to be specified + onedimenmin returns the optimal x value and the value of the function + and its second derivative at this point + */ +double onedimenmin(double xmin, double xguess, double xmax, double (*f)(double), + double tol, double *fx, double *f2x) +{ + double eps, optx, ax, bx, cx, fa, fb, fc; + + /* first attempt to bracketize minimum */ + eps = xguess*tol*50.0; + ax = xguess - eps; + if (ax < xmin) ax = xmin; + bx = xguess; + cx = xguess + eps; 
+ if (cx > xmax) cx = xmax; + + /* check if this works */ + fa = (*f)(ax); + fb = (*f)(bx); + fc = (*f)(cx); + + /* if it works use these borders else be conservative */ + if ((fa < fb) || (fc < fb)) { + if (ax != xmin) fa = (*f)(xmin); + if (cx != xmax) fc = (*f)(xmax); + optx = brent(xmin, xguess, xmax, f, tol, fx, f2x, fa, fb, fc); + } else + optx = brent(ax, bx, cx, f, tol, fx, f2x, fa, fb, fc); + + return optx; /* return optimal x */ +} + +/* two-dimensional minimization with borders and calculations of standard errors */ +/* we optimize along basis vectors - not very optimal but it seems to work well */ +void twodimenmin(double tol, + int active1, double min1, double *x1, double max1, double (*func1)(double), double *err1, + int active2, double min2, double *x2, double max2, double (*func2)(double), double *err2) +{ + int it, nump, change; + double x1old, x2old; + double fx, f2x; + + it = 0; + nump = 0; + + /* count number of parameters */ + if (active1) nump++; + if (active2) nump++; + + do { /* repeat until nothing changes any more */ + it++; + change = FALSE; + + /* optimize first variable */ + if (active1) { + + if ((*x1) <= min1) (*x1) = min1 + 0.2*(max1-min1); + if ((*x1) >= max1) (*x1) = max1 - 0.2*(max1-min1); + x1old = (*x1); + (*x1) = onedimenmin(min1, (*x1), max1, func1, tol, &fx, &f2x); + if ((*x1) < min1) (*x1) = min1; + if ((*x1) > max1) (*x1) = max1; + /* same tolerance as 1D minimization */ + if (fabs((*x1) - x1old) > 3.3*tol) change = TRUE; + + /* standard error */ + f2x = fabs(f2x); + if (1.0/(max1*max1) < f2x) (*err1) = sqrt(1.0/f2x); + else (*err1) = max1; + + } + + /* optimize second variable */ + if (active2) { + + if ((*x2) <= min2) (*x2) = min2 + 0.2*(max2-min2); + if ((*x2) >= max2) (*x2) = max2 - 0.2*(max2-min2); + x2old = (*x2); + (*x2) = onedimenmin(min2, (*x2), max2, func2, tol, &fx, &f2x); + if ((*x2) < min2) (*x2) = min2; + if ((*x2) > max2) (*x2) = max2; + /* same tolerance as 1D minimization */ + if (fabs((*x2) - x2old) > 
3.3*tol) change = TRUE; + + /* standard error */ + f2x = fabs(f2x); + if (1.0/(max2*max2) < f2x) (*err2) = sqrt(1.0/f2x); + else (*err2) = max2; + + } + + if (nump == 1) return; + + } while (it != MAXITS && change); + + return; +} + diff --git a/forester/archive/RIO/others/puzzle_mod/src/util.h b/forester/archive/RIO/others/puzzle_mod/src/util.h new file mode 100644 index 0000000..20f37e5 --- /dev/null +++ b/forester/archive/RIO/others/puzzle_mod/src/util.h @@ -0,0 +1,96 @@ +/* + * util.h + * + * + * Part of TREE-PUZZLE 5.0 (June 2000) + * + * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, + * M. Vingron, and Arndt von Haeseler + * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler + * + * All parts of the source except where indicated are distributed under + * the GNU public licence. See http://www.opensource.org for details. + */ + + +#ifndef _UTIL_ +#define _UTIL_ + +#include +#include +#include +#include + + +/* + * general definitions + */ + +#define TRUE 1 +#define FALSE 0 + +#ifdef PARALLEL + extern long int PP_randn; + extern long int PP_rand; +#endif + +/* + * type definitions + */ + +typedef unsigned long int uli; + +typedef double *dvector, **dmatrix, ***dcube; +typedef char *cvector, **cmatrix, ***ccube; +typedef int *ivector, **imatrix, ***icube; +typedef uli *ulivector, **ulimatrix, ***ulicube; + + +/* + * prototypes of functions defined in util.c + */ + +void maerror(char *message); + +dvector new_dvector(int n); +dmatrix new_dmatrix(int nrow, int ncol); +dcube new_dcube(int ntri, int nrow, int ncol); +void free_dvector(dvector v); +void free_dmatrix(dmatrix m); +void free_dcube(dcube c); + +cvector new_cvector(int n); +cmatrix new_cmatrix(int nrow, int ncol); +ccube new_ccube(int ntri, int nrow, int ncol); +void free_cvector(cvector v); +void free_cmatrix(cmatrix m); +void free_ccube(ccube c); + +ivector new_ivector(int n); +imatrix new_imatrix(int nrow, int ncol); +icube new_icube(int ntri, int nrow, int ncol); +void 
free_ivector(ivector v); +void free_imatrix(imatrix m); +void free_icube(icube c); + +ulivector new_ulivector(int n); +ulimatrix new_ulimatrix(int nrow, int ncol); +ulicube new_ulicube(int ntri, int nrow, int ncol); +void free_ulivector(ulivector v); +void free_ulimatrix(ulimatrix m); +void free_ulicube(ulicube c); + +double randomunitintervall(void); +int initrandom(int seed); +int randominteger(int n); +void chooser(int t, int s, ivector slist); +void *myrealloc(void *, size_t); +cvector mygets(void); + +#define MAXITS 10 /* maximum number of iterations in twodimenmin */ +double onedimenmin(double, double, double, double (*f )(double ), double, double *, double *); +void twodimenmin(double, int, double, double *, double, double (*func1 )(double ), double *, int, double, double *, double, double (*func2 )(double ), double *); + + + +#endif diff --git a/forester/archive/perl/00README b/forester/archive/perl/00README new file mode 100755 index 0000000..2103cf4 --- /dev/null +++ b/forester/archive/perl/00README @@ -0,0 +1,48 @@ +Overview of the Perl scripts in this directory +---------------------------------------------- + +This directory contains a collection of (mostly horrible) +Perl scripts. Some of them are still maintained, such as +phylo_pl.pl. + +Some of the scripts in this directory rely heavily +on forester.pm. 
+ + + +RIO pipeline: +- rio.pl +- makeTree.pl +- p7extract.pl +- multifetch.pl + + +Running a parallelized RIO web server: +- nph-riowebserver +- rio_slave.pl +- rio_slave_driver.pl +- queue.pm + + +To prepare data to be used by RIO: +- bootstrap_cz.pl +- pfam2slx.pl +- extractSWISS-PROT.pl +- extractTrembl.pl +- pfam2pwd.pl + + +To run multiple RIO analyses in an automated fashion: +- Xrio.pl + + +To analyze RIO results (of Xrio.pl runs): +- bootstrapCounter.pl +- bootstrapSelector.pl +- diffFinder3.pl + + +Counting of species in SWISS-PROT and TrEMBL: +- countSpeciesSPTrEMBL.pl + + diff --git a/forester/archive/perl/Xrio.pl b/forester/archive/perl/Xrio.pl new file mode 100755 index 0000000..08c3ceb --- /dev/null +++ b/forester/archive/perl/Xrio.pl @@ -0,0 +1,585 @@ +#!/usr/bin/perl -w +# +# Xrio.pl +# ------- +# Copyright (C) 1999-2001 Washington University School of Medicine +# and Howard Hughes Medical Institute +# All rights reserved +# +# Author: Christian M. Zmasek +# zmasek@genetics.wustl.edu +# http://www.genetics.wustl.edu/eddy/people/zmasek/ +# +# Created: 03/01/01 +# +# Last modified 06/22/01 + + +# Objective. Runs "rio.pl" for each Pfam assignment in "infile". +# +# Usage. Xrio.pl <infile> <species names file> <output directory> <outfile> <logfile> +# +# species names file: list of species to use for analysis. 
+# +# This version uses the CE number as identifier for output files. +# +# Format for infile: +# +# >>3R5.2 CE19648 (CAMBRIDGE) TR:Q9XWB1 protein_id:CAA21778.1 +# // +# +# >>4R79.1 CE19649 Zinc-binding metalloprotease domain (CAMBRIDGE) protein_id:CAB63429.1 +# =Astacin Astacin (Peptidase family M12A) 296.3 3.8e-85 1 +# // +# +# >>4R79.2 CE19650 Ras family (CAMBRIDGE) TR:Q9XXA4 protein_id:CAA20282.1 +# =ras Ras family 208.8 8.1e-59 1 +# =FA_desaturase Fatty acid desaturase 4.5 1.5 1 +# =UPF0117 Domain of unknown function DUF36 3.1 3.5 1 +# =arf ADP-ribosylation factor family -46.0 1.5e-05 1 +# // +# +# +# + +# Xrio.pl /nfs/wol2/people/zmasek/wormpep43_hmmpfam6.2/wormpep43_Hmmpfam_6.2 /nfs/wol2/people/zmasek/species_trees/tree_of_life_bin_1-4_species_list /nfs/wol2/people/zmasek/XrioTEST3 /nfs/wol2/people/zmasek/XrioTEST3/OUTFILE1 /nfs/wol2/people/zmasek/XrioTEST3/LOG1 + + + + + +use strict; + +use FindBin; +use lib $FindBin::Bin; +use rio_module; + + $RIO_PL = "rio.pl"; +my $VERSION = "3.000"; + +my $FASTA_DB = "/nfs/wol2/people/zmasek/DB/wormpep/wormpep43"; +my $QUERY_SPECIES = "CAEEL"; +my $SPECIES_TREE = $SPECIES_TREE_FILE_DEFAULT; + +my $RIOPL_OPTIONS = "T=B P=6 L=0 R=0 U=80 V=0 X=2 Y=2 Z=2 C E I"; + +my $TEMP_DIR = "/tmp/Xriopl"; # Where all the temp files, etc will be created. + +my %Species_names_hash = (); + +my $infile = ""; +my $outfile = ""; # Huge file of all rio outputs. +my $logfile = ""; # Lists all sequences which have been analyzed successfully. 
+my $output_directory = ""; +my $species_names_file = ""; + + +my $return_line = ""; +my $ID = ""; +my $pfam_name = ""; +my $E_value = 0; +my $score = 0; +my $GA = 0; +my $temp_dir = ""; +my $outname = ""; +my %outnames = (); +my $seqs = 0; +my $ii = 0; +my $time = 0; +my $successful = 0; +my $query_not_aligned = 0; +my $pwd_not_present = 0; +my $already_done = 0; +my $start_date = ""; +my $old_fh = ""; +my %AC_OS = (); # AC -> species name for TrEMBL seqs +my %AC_DE = (); # AC -> description for TrEMBL seqs + +my $description_line = ""; +my $message1 = ""; +my $message2 = ""; + +$start_date = `date`; + +if ( @ARGV != 5 ) { + &errorInCommandLine(); + exit ( -1 ); +} + +$infile = $ARGV[ 0 ]; +$species_names_file = $ARGV[ 1 ]; +$output_directory = $ARGV[ 2 ]; +$outfile = $ARGV[ 3 ]; +$logfile = $ARGV[ 4 ]; + + +if ( -e $outfile ) { + die "\n\n$0: <<$outfile>> already exists.\n\n"; +} +unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { + die "\n\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n"; +} +unless ( ( -s $species_names_file ) && ( -f $species_names_file ) && ( -T $species_names_file ) ) { + die "\n\n$0: <<$species_names_file>> does not exist, is empty, or is not a plain textfile.\n\n"; +} +unless ( ( -s $TREMBL_ACDEOS_FILE ) && ( -f $TREMBL_ACDEOS_FILE ) && ( -T $TREMBL_ACDEOS_FILE ) ) { + die "\n\n$0: <<$TREMBL_ACDEOS_FILE>> does not exist, is empty, or is not a plain textfile.\n\n"; +} +unless ( ( -e $output_directory ) && ( -d $output_directory ) ) { + die "\n\n$0: <<$output_directory>> does not exist, or is not a directory.\n\n"; +} + + + +# Reads in the species file: +# -------------------------- +&readSpeciesNamesFile( $species_names_file ); + + + +# Reads in the file containing AC, DE and OS for TrEMBL seqs: +# ----------------------------------------------------------- +open( HH, "$TREMBL_ACDEOS_FILE" ) || die "\n\n$0: Unexpected error: Cannot open file <<$TREMBL_ACDEOS_FILE>>: $!\n\n"; +while ( $return_line = ) 
{ + if ( $return_line =~ /(\S+);([^;]*);(\S+)/ ) { + $AC_OS{ $1 } = $3; + $AC_DE{ $1 } = $2; + } +} +close( HH ); + + + +# Reads in outnames in logfile, if present: +# ----------------------------------------- +if ( ( -s $logfile ) ) { + open( L, "$logfile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$logfile>>: $!\n\n"; + while ( $return_line = ) { + if ( $return_line =~ /\s*(\S+)/ ) { + $outnames{ $1 } = 0; + } + } + close( L ); +} + + + +# Creates the temp directory: +# --------------------------- + +$ii = 0; + +$time = time; + +$temp_dir = $TEMP_DIR.$time.$ii; + +while ( -e $temp_dir ) { + $ii++; + $temp_dir = $TEMP_DIR.$time.$ii; +} + +mkdir( $temp_dir, 0777 ) +|| die "\n\n$0:Unexpected error: Could not create <<$temp_dir>>: $!\n\n"; + +unless ( ( -e $temp_dir ) && ( -d $temp_dir ) ) { + die "\n\n$0:Unexpected error: <<$temp_dir>> does not exist, or is not a directory: $!\n\n"; +} + + + +$message1 = "# $0\n". + "# Version : $VERSION\n". + "# Date started : $start_date". + "# Infile : $infile\n". + "# Species names file : $species_names_file\n". + "# Output directory : $output_directory\n". + "# Outfile : $outfile\n". + "# RIO PWD directory : $RIO_PWD_DIRECTORY\n". + "# RIO BSP directory : $RIO_BSP_DIRECTORY\n". + "# RIO NBD directory : $RIO_NBD_DIRECTORY\n". + "# RIO ALN directory : $RIO_ALN_DIRECTORY\n". + "# RIO HMM directory : $RIO_HMM_DIRECTORY\n". + "# Fasta db : $FASTA_DB\n". + "# Species of query : $QUERY_SPECIES\n". + "# Species tree : $SPECIES_TREE\n". + "# rio.pl options : $RIOPL_OPTIONS\n\n\n"; + +open( IN, "$infile" ) || die "\n\n$0: Cannot open file <<$infile>>: $!\n\n"; +open( LOG, ">> $logfile" ) || die "\n\n$0: Cannot open file <<$logfile>>: $!\n\n"; + + +# Turns off buffering for LOG. 
+$old_fh = select( LOG ); +$| = 1; +select( $old_fh ); + + +$ID = ""; + +W: while ( $return_line = ) { + + if ( $return_line =~ /^\s*>>.*(CE\d+)/ ) { + $ID = $1; + $return_line =~ /^\s*>>(.+)/; + $description_line = $1; + } + elsif ( $return_line =~ /^\s*\/\// ) { + $ID = ""; + } + elsif ( $return_line =~ /^\s*=(\S+)\s+.+\s+(\S+)\s+(\S+)\s+\S+\s*$/ + && $ID ne "" ) { + + $pfam_name = $1; + $score = $2; + $E_value = $3; + + $outname = $ID.".".$pfam_name; + + # Checks if already done. + if ( %outnames && exists( $outnames{ $outname } ) ) { + $already_done++; + next W; + } + + &executeHmmfetch( $PFAM_HMM_DB, $pfam_name, $temp_dir."/HMMFILE" ); + + $GA = &getGA1cutoff( $temp_dir."/HMMFILE" ); + unlink( $temp_dir."/HMMFILE" ); + + if ( $GA == 2000 ) { + die "\n\n$0: Unexpected error: No GA cutoff found for \"$pfam_name\".\n\n"; + } + elsif ( $score < $GA ) { + next W; + } + + if ( -s $output_directory."/".$outname ) { + unlink( $output_directory."/".$outname ); + } + + + $message1 .= "\n\n". + "# ############################################################################\n". + "# Annotation: $description_line\n". + "# HMM : $pfam_name\n". + "# score : $score\n". + "# E-value : $E_value\n"; + + + + unless ( ( -s $RIO_PWD_DIRECTORY.$pfam_name.$SUFFIX_PWD ) ) { + $pwd_not_present++; + $message1 .= "# No PWD file for this family.\n". 
+ "# ############################################################################\n"; + next W; + } + + + unless ( ( -s $PFAM_SEED_DIRECTORY."/".$pfam_name ) && ( -f $PFAM_SEED_DIRECTORY."/".$pfam_name ) && ( -T $PFAM_SEED_DIRECTORY."/".$pfam_name ) ) { + die "\n\n$0: Error: Pfam seed alignment <<$PFAM_SEED_DIRECTORY"."/"."$pfam_name>> not present.\n\n"; + } + + + &getSequenceFromFastaFile( $FASTA_DB, + $temp_dir."/QUERY", + $ID ); + + &performRIO( $pfam_name, # A= + $temp_dir."/QUERY", # Q= + $output_directory."/".$outname, # O= + $ID."_".$QUERY_SPECIES, # N= + $SPECIES_TREE, # S= + $RIOPL_OPTIONS, # L=0 R=0 U=70 V=0 X=2 Y=2 Z=2 C E K I x + $temp_dir."/riopltempdir" ); # j= + + + + if ( -s $output_directory."/".$outname ) { + $successful++; + } + else { + $message1 .= "# Query has not been aligned (E value too low).\n". + "# ############################################################################\n"; + $query_not_aligned++; + } + + if ( unlink( $temp_dir."/QUERY" ) != 1 ) { + die "\n$0: Unexpected error: File(s) could not be deleted.\n"; + } + + + + if ( -s $output_directory."/".$outname ) { + open( OUT_MESG_ONE, ">$temp_dir/_message1_" ) || die "\n\n$0: Cannot create file \"$temp_dir/_message1_\": $!\n\n"; + print OUT_MESG_ONE ( $message1 ); + close( OUT_MESG_ONE ); + + $message1 = ""; + + open( OUT_MESG_TWO, ">$temp_dir/_message2_" ) || die "\n\n$0: Cannot create file \"$temp_dir/_message2_\": $!\n\n"; + print OUT_MESG_TWO ( "# Successful calculations : $successful\n" ); + print OUT_MESG_TWO ( "# No calculation due to absence of PWD file: $pwd_not_present\n" ); + print OUT_MESG_TWO ( "# Calculation already performed : $already_done\n" ); + print OUT_MESG_TWO ( "# ############################################################################\n" ); + close( OUT_MESG_TWO ); + + if ( -s $outfile ) { + system( "cat $outfile $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile"."___" ) + && die "\n\n$0: Could not execute \"cat $outfile 
$temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile"."___\": $!\n\n"; + system( "mv", $outfile."___", $outfile ) + && die "\n\n$0: Could not execute \"mv $outfile"."___ $outfile\": $!\n\n"; + } + else { + system( "cat $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile" ) + && die "\n\n$0: Could not execute \"cat $temp_dir/_message1_ $output_directory/$outname $temp_dir/_message2_ > $outfile\": $!\n\n"; + + } + + print LOG "$outname\n"; + + unlink( "$temp_dir/_message1_", "$temp_dir/_message2_" ); + + } + + + + } ## End of elsif ( $return_line =~ /^\s*=(\S+)\s+.+\s+(\S+)\s+(\S+)\s+\S+$/ && $ID ne "" ) + +} ## End of while ( $return_line = ) + +close( IN ); +close( LOG ); + + +open( OUT_MESG_TWO, ">$temp_dir/_message2_" ) || die "\n$0: Cannot create file \"$temp_dir/_message2_\": $!\n"; +print OUT_MESG_TWO ( "\n\n# Xrio.pl successfully terminated.\n" ); +print OUT_MESG_TWO ( "# Started : $start_date" ); +print OUT_MESG_TWO ( "# Terminated: ".`date`."\n" ); +print OUT_MESG_TWO ( "# Successful calculations : $successful\n" ); +print OUT_MESG_TWO ( "# No calculation due to absence of PWD file: $pwd_not_present\n" ); +print OUT_MESG_TWO ( "# Calculation already performed : $already_done\n\n" ); +close( OUT_MESG_TWO ); + +if ( -s $outfile ) { + if ( $message1 ne "" ) { + open( OUT_MESG_ONE, ">$temp_dir/_message1_" ) || die "\n$0: Cannot create file \"$temp_dir/_message1_\": $!\n"; + print OUT_MESG_ONE ( $message1 ); + close( OUT_MESG_ONE ); + system( "cat $outfile $temp_dir/_message1_ $temp_dir/_message2_ > $outfile"."___" ) + && die "$0: Could not execute \"cat $outfile $temp_dir/_message1_ $temp_dir/_message2_ > $outfile"."___\": $!"; + } + else { + system( "cat $outfile $temp_dir/_message2_ > $outfile"."___" ) + && die "$0: Could not execute \"cat $outfile $temp_dir/_message2_ > $outfile"."___\": $!"; + } + system( "mv", $outfile."___", $outfile ) + && die "$0: Could not execute \"mv $outfile"."___ $outfile\": 
$!"; +} +else { + open( OUT_MESG_ONE, ">$temp_dir/_message1_" ) || die "\n$0: Cannot create file \"$temp_dir/_message1_\": $!\n"; + print OUT_MESG_ONE ( $message1 ); + close( OUT_MESG_ONE ); + system( "cat $temp_dir/_message1_ $temp_dir/_message2_ > $outfile" ) + && die "$0: Could not execute \"cat $temp_dir/_message1_ $temp_dir/_message2_ > $outfile\": $!"; +} + +unlink( "$temp_dir/_message1_", "$temp_dir/_message2_" ); + +rmdir( $temp_dir ) || die "\n$0: Unexpected failure (could not remove: $temp_dir): $!\n"; + +print( "\n\nXrio.pl successfully terminated.\n" ); +print( "Successful calculations : $successful\n" ); +print( "No calculation due to absence of PWD file: $pwd_not_present\n" ); +print( "Calculation already performed : $already_done\n" ); +print( "Started : $start_date" ); +print( "Terminated: ".`date`."\n" ); +print( "\n" ); + +exit( 0 ); + + + +# Methods +# ------- + + + +# Gets the gathering cutoff per sequence from a HMM file. +# +# One argument: the HMM file name +# Returns the gathering cutoff per sequence, 2000 upon failure +# Last modified: 07/11/01 +sub getGA1cutoff { + + my $infile = $_[ 0 ]; + my $return_line = ""; + my $GA = 2000; + + &testForTextFilePresence( $infile ); + + open( H, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!"; + while ( $return_line = ) { + + if ( $return_line =~ /^GA\s+(\S+)/ ) { + $GA = $1; + close( H ); + return $GA; + } + + } + close( H ); + return $GA; + +} ## getGA1cutoff + + + + + +# 1. A= Name of Pfam family +# 2. Q= Query file +# 3. O= Output +# 4. N= query Name +# 5. S= Species tree file +# 6. more options, such I K m +# 7. 
j= Name for temporary directory +sub performRIO { + my $pfam_name = $_[ 0 ]; + my $query_file = $_[ 1 ]; + my $output_file = $_[ 2 ]; + my $name_for_query = $_[ 3 ]; + my $species_tree_file = $_[ 4 ]; + my $more_options = $_[ 5 ]; + my $tmp_file_rio = $_[ 6 ]; + + my $options_for_rio = ""; + + $options_for_rio .= ( " A=".$pfam_name ); + $options_for_rio .= ( " Q=".$query_file ); + $options_for_rio .= ( " O=".$output_file ); + $options_for_rio .= ( " N=".$name_for_query ); + $options_for_rio .= ( " S=".$species_tree_file ); + $options_for_rio .= ( " j=".$tmp_file_rio ); + $options_for_rio .= ( " ".$more_options ); + + system( "$RIO_PL 1 $options_for_rio" ) + && die "$0: performRIO: Could not execute \"$RIO_PL 1 $options_for_rio\": $!\n"; + +} ## performRIO + + + +# Reads in (SWISS-PROT) species names from a file. +# Names must be separated by newlines. +# Lines beginning with "#" are ignored. +# A possible "=" and everything after is ignored. +# One argument: species-names-file name +# Last modified: 04/24/01 +sub readSpeciesNamesFile { + my $infile = $_[ 0 ]; + my $return_line = ""; + my $species = ""; + + unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { + &Error( "\"$infile\" does not exist,\n is empty, or is not a plain textfile." ); + } + + open( IN_RSNF, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!"; + while ( $return_line = <IN_RSNF> ) { + if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) { + $species = $1; + $species =~ s/=.+//; + $Species_names_hash{ $species } = ""; + } + } + close( IN_RSNF ); + + return; +} ## readSpeciesNamesFile + + + +# Searches the > line of a multiple seq file for a +# query, saves the found entries. +# Three arguments: +# 1. multi Fasta file to search through +# 2. outputfile name +# 3. 
query +# Last modified: 03/05/01 +sub getSequenceFromFastaFile { + + my $inputfile = $_[ 0 ]; + my $outputfile = $_[ 1 ]; + my $query = $_[ 2 ]; + my $hits = 0; + + open( IN_GSFF, "$inputfile" ) + || die "\n$0: getSequenceFromFastaFile: Cannot open file <<$inputfile>>: $!\n"; + open( OUT_GSFF, ">$outputfile" ) + || die "\n$0: getSequenceFromFastaFile: Cannot create file <<$outputfile>>: $!\n"; + + + while ( $return_line = ) { + if ( $return_line =~ /^\s*>.*$query\s+/ ) { + $hits++; + print $return_line; + print OUT_GSFF $return_line; + $return_line = ; + while ( $return_line && $return_line =~ /^\s*[^>]/ ) { + print OUT_GSFF $return_line; + $return_line = ; + } + last; # In Wormpep there _are_ ambigous CE numbers. + } + + } + + close( IN_GSFF ); + close( OUT_GSFF ); + if ( $hits < 1 ) { + die "\n$0: getSequenceFromFastaFile: Unexpected error: <<$query>> not found.\n"; + } + if ( $hits > 1 ) { + die "\n$0: getSequenceFromFastaFile: Unexpected error: <<$query>> is ambigous.\n"; + } + +} ## getSequenceFromFastaFile + + + + +# Last modified: 03/08/01 +sub errorInCommandLine { + + print "\n"; + print " Xrio.pl $VERSION\n"; + print " -------\n"; + print "\n"; + print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n"; + print "\n"; + print " Purpose. Runs \"rio.pl\" for each Pfam assignment in \"infile\".\n"; + print "\n"; + print " Usage. 
rio.pl \n"; + print "\n"; + print " infile: has the following format (defined per example):\n"; + print " >>4R79.1 CE19649 Zinc-binding metalloprotease domain (CAMBRIDGE) protein_id:CAB63429.1\n"; + print " =Astacin Astacin (Peptidase family M12A) 296.3 3.8e-85 1\n"; + print " //\n"; + print "\n"; + print " >>4R79.2 CE19650 Ras family (CAMBRIDGE) TR:Q9XXA4 protein_id:CAA20282.1\n"; + print " =ras Ras family 208.8 8.1e-59 1\n"; + print " =FA_desaturase Fatty acid desaturase 4.5 1.5 1\n"; + print " =UPF0117 Domain of unknown function DUF36 3.1 3.5 1\n"; + print " =arf ADP-ribosylation factor family -46.0 1.5e-05 1\n"; + print " //\n"; + print "\n"; + print " species names file: list of species to use for analysis.\n"; + print "\n"; + print " This version uses the CE number as identifier for output files.\n"; + print "\n"; + + exit( -1 ); + +} ## errorInCommandLine + + diff --git a/forester/archive/perl/bootstrapCounter.pl b/forester/archive/perl/bootstrapCounter.pl new file mode 100755 index 0000000..2aab587 --- /dev/null +++ b/forester/archive/perl/bootstrapCounter.pl @@ -0,0 +1,184 @@ +#!/usr/bin/perl -w +# +# bootstrapCounter.pl +# ------------------- +# +# Copyright (C) 2001 Washington University School of Medicine +# and Howard Hughes Medical Institute +# All rights reserved +# +# Author: Christian M. Zmasek +# zmasek@genetics.wustl.edu +# http://www.genetics.wustl.edu/eddy/people/zmasek/ +# +# Created: 04/04/01 +# +# Last modified 08/16/01 +# +# +# Objective. Determines the distribution of top orthology bootstrap values +# of a Xrio.pl output file. +# +# Usage. "bootstrapCounter.pl " +# +# Important. The result of this is meaningful ONLY if the thresholds +# for output of the RIO analysis are set to zero (L=0 R=0). +# +# Format for infile: +# ... 
+# +# # ############################################################################ +# # Annotation: B0511.6 CE17345 helicase (ST.LOUIS) TR:O61815 protein_id:AAC17654.1 +# # HMM : ABC_tran +# # score : -59.6 +# # E-value : 1.1 +# # Query has not been aligned (score lower than gathering cutoff). +# # ############################################################################ +# +# +# # ############################################################################ +# # Annotation: B0511.7 CE17346 (ST.LOUIS) TR:O61817 protein_id:AAC17655.1 +# # HMM : FHA +# # score : 71.6 +# # E-value : 1.7e-17 +# RIO - Resampled Inference of Orthologs +# Version: 1.000 +# ------------------------------------------------------------------------------ +# Alignment file: /tmp/Xriopl9846081980/Full-FHA +# Alignment : FHA domain +# HMM : FHA +# Query file : /tmp/Xriopl9846081980/__queryfile__ +# ============================================================================== +# +# Query : CE17346.FHA_CAEEL/45-114 +# +# Number (in %) of observed orthologies (o) and super orthologies (s) to query +# in bootstrapped trees, evolutionary distance to query: +# +# Sequence Description # o[%] s[%] distance +# -------- ----------- ---- ---- -------- +# YC67_MYCTU/308-372 - 20 14 1.577840 +# FRAH_ANASP/204-277 FRAH PROTEIN. 17 16 1.532670 +# ABA2_NICPL/557-633 ZEAXANTHIN EPOXIDASE PRECURSOR (EC 1.14.-.-). 14 11 1.885700 +# ABA2_LYCES/563-639 ZEAXANTHIN EPOXIDASE PRECURSOR (EC 1.14.-.-). 14 11 2.140000 +# +# +# +# Distance values (based on ML branch length values on consensus tree) +# -------------------------------------------------------------------- +# Given the thresholds for distance calculations: +# No sequence is considered orthologous to query. +# +# ... 
+ + + +use strict; + +my $VERSION = 0.200; + +my $infile = ""; +my $outfile = ""; +my $return_line = ""; +my $results = 0; +my $o_bootstraps = 0; +my $s_bootstraps = 0; +my @o_bootstraps_array = (); +my @s_bootstraps_array = (); +my $total = 0; +my $i = 0; + + +if ( @ARGV != 2 ) { + &errorInCommandLine(); + exit ( -1 ); +} + +$infile = $ARGV[ 0 ]; +$outfile = $ARGV[ 1 ]; + +if ( -e $outfile ) { + die "\n$0: <<$outfile>> already exists.\n"; +} +unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { + die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n"; +} + + +open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n"; + +$results = 0; +for ( $i = 0; $i <= 100; ++$i ) { + $s_bootstraps_array[ $i ] = $o_bootstraps_array[ $i ] = 0; +} + +while ( $return_line = ) { + + if ( $return_line =~ /^\s*--------\s+/ ) { + $results = 1; + } + elsif ( $return_line =~ /^\s*Distance\s+values\s+/i ) { + $results = 0; + } + elsif ( $results == 1 && $return_line =~ /^\s*!NO\s+ORTHOLOGS/ ) { + $o_bootstraps_array[ 0 ]++; + $s_bootstraps_array[ 0 ]++; + $total++; + $results = 0; + } + elsif ( $results == 1 && $return_line =~ /(\S+)\s+(\S+)\s+\S+\s*$/ ) { + $o_bootstraps = $1; + $s_bootstraps = $2; + $results = 0; + if ( $o_bootstraps > 100 || $s_bootstraps > 100 + || $o_bootstraps < 0 ) { + print "o bootstraps: $o_bootstraps\n"; + print "s bootstraps: $s_bootstraps\n"; + die "\n\n$0: Error: Boostrap value(s) out of range.\n\n"; + } + + $total++; + $o_bootstraps_array[ $o_bootstraps ]++; + $s_bootstraps_array[ $s_bootstraps ]++; + + } +} + +close( IN ); + + +open( OUT, ">$outfile" ) || die "\n$0: Cannot create file \"$outfile\": $!\n"; + +print OUT "bootstrapCounter.pl version: $VERSION\n\n"; +print OUT "Distribution of top bootstrap values\n\n"; +print OUT "Input file : $infile\n"; +print OUT "Output file: $outfile\n"; +print OUT "Date : ".`date`."\n"; +print OUT "Total: $total\n\n"; +print OUT "top-orthology-bootstraps vs. 
count:\n"; +for ( $i = 0; $i < @o_bootstraps_array; ++$i ) { + print OUT "$i $o_bootstraps_array[ $i ]\n"; +} +print OUT "\n\ntop-super-orthology-bootstraps vs. count:\n"; +for ( $i = 0; $i < @s_bootstraps_array; ++$i ) { + print OUT "$i $s_bootstraps_array[ $i ]\n"; +} +close( OUT ); + +print( "\nDone.\n\n" ); + +exit( 0 ); + + + +sub errorInCommandLine { + print "\n"; + print " bootstrapCounter.pl version: $VERSION\n"; + print " Usage: \"bootstrapCounter.pl \"\n"; + print " Important: The result of this is meaningful ONLY if the thresholds\n"; + print " for output of the RIO analysis are set to zero (L=0 R=0).\n"; + print "\n"; + exit( -1 ); +} + + diff --git a/forester/archive/perl/bootstrapSelector.pl b/forester/archive/perl/bootstrapSelector.pl new file mode 100755 index 0000000..ec6ce69 --- /dev/null +++ b/forester/archive/perl/bootstrapSelector.pl @@ -0,0 +1,291 @@ +#!/usr/bin/perl -w +# +# bootstrapSelector.pl +# -------------------- +# +# Copyright (C) 2001 Washington University School of Medicine +# and Howard Hughes Medical Institute +# All rights reserved +# +# Author: Christian M. Zmasek +# zmasek@genetics.wustl.edu +# http://www.genetics.wustl.edu/eddy/people/zmasek/ +# +# Created: 04/06/01 +# +# Last modified 09/24/01 +# +# +# Objective. Selection of RIO analysis results with top orthology +# bootstrap values greater or less than a threshold. +# +# Usage: "bootstrapSelector.pl <threshold options> <infile> <outfile>" +# Options: "l" for "less or equal" ("greater or equal" is default) +# "c" for "all hits must meet threshold in case of +# multiple copies of the same domain in the query" +# (default: "at least one") +# Example: "bootstrapSelector.pl 95lc OUTFILE_At_1 At_1_out" +# +# Important. The result of this is meaningful ONLY if the thresholds +# for output of the RIO analysis are set to zero (L=0 R=0). +# +# +# Format for infile: +# +# ... 
+# +# # ############################################################################ +# # Annotation: B0511.6 CE17345 helicase (ST.LOUIS) TR:O61815 protein_id:AAC17654.1 +# # HMM : ABC_tran +# # score : -59.6 +# # E-value : 1.1 +# # Query has not been aligned (score lower than gathering cutoff). +# # ############################################################################ +# +# +# # ############################################################################ +# # Annotation: B0511.7 CE17346 (ST.LOUIS) TR:O61817 protein_id:AAC17655.1 +# # HMM : FHA +# # score : 71.6 +# # E-value : 1.7e-17 +# RIO - Resampled Inference of Orthologs +# Version: 1.000 +# ------------------------------------------------------------------------------ +# Alignment file: /tmp/Xriopl9846081980/Full-FHA +# Alignment : FHA domain +# HMM : FHA +# Query file : /tmp/Xriopl9846081980/__queryfile__ +# ============================================================================== +# +# Query : CE17346.FHA_CAEEL/45-114 +# +# Number (in %) of observed orthologies (o) and super orthologies (s) to query +# in bootstrapped trees, evolutionary distance to query: +# +# Sequence Description # o[%] s[%] distance +# -------- ----------- ---- ---- -------- +# YC67_MYCTU/308-372 - 20 14 1.577840 +# FRAH_ANASP/204-277 FRAH PROTEIN. 17 16 1.532670 +# ABA2_NICPL/557-633 ZEAXANTHIN EPOXIDASE PRECURSOR (EC 1.14.-.-). 14 11 1.885700 +# ABA2_LYCES/563-639 ZEAXANTHIN EPOXIDASE PRECURSOR (EC 1.14.-.-). 14 11 2.140000 +# +# +# +# Distance values (based on ML branch length values on consensus tree) +# -------------------------------------------------------------------- +# Given the thresholds for distance calculations: +# No sequence is considered orthologous to query. +# +# ... 
+ + + +use strict; + +my $VERSION = 1.000; +my $threshold = 0; +my $infile = ""; +my $outfile = ""; +my $summary_outfile = ""; +my $return_line = ""; +my $identifier = ""; +my $top1 = ""; +my $analysis_performed = 0; +my $reading = 0; +my $i = 0; +my @lines = (); +my $larger = 1; +my $complete = 0; +my $total = 0; + +if ( @ARGV != 3 ) { + &errorInCommandLine(); + exit ( -1 ); +} + +$threshold = $ARGV[ 0 ]; +$infile = $ARGV[ 1 ]; +$outfile = $ARGV[ 2 ]; +$summary_outfile = $outfile.".short"; + +if ( -e $outfile ) { + die "\n$0: <<$outfile>> already exists.\n"; +} +unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { + die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n"; +} + + +if ( $threshold =~ /l/ ) { + $larger = 0; + $threshold =~ s/l//; +} +if ( $threshold =~ /c/ ) { + $complete = 1; + $threshold =~ s/c//; +} + +open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n"; + +open( OUT, ">$outfile" ) || die "\n$0: Cannot create file \"$outfile\": $!\n"; +open( OUT_SUMMARY, ">$summary_outfile" ) || die "\n$0: Cannot create file \"$summary_outfile\": $!\n"; + +print OUT "bootstrapSelector.pl version: $VERSION\n\n"; +print OUT "Selection of RIO analysis results with top ortholgy\n"; +print OUT "bootstrap values greater or less than a threshold.\n\n"; +if ( $larger == 1 ) { + print OUT "Threshold : Grater than or equal to $threshold\n"; +} +else { + print OUT "Threshold : Less than or equal to $threshold\n"; +} +print OUT "In case of multiple copies of the same domain in the query:\n"; +if ( $complete == 1 ) { + print OUT "All hits must meet threshold.\n"; +} +else { + print OUT "At least one hit must meet threshold.\n"; +} +print OUT "Input file : $infile\n"; +print OUT "Output file : $outfile\n"; +print OUT "Output file short: $summary_outfile\n"; +print OUT "Date : ".`date`."\n\n\n"; + +while ( $return_line = ) { + + if ( $return_line =~ /^\s*# Annotation:\s*(.+)/ ) { + $identifier = $1; + $identifier = substr( 
$identifier, 0, 60);
+        $analysis_performed = 0;
+        $reading = 1;
+        $i = 0;
+        @lines = ();
+    }
+
+    if ( $reading == 1 && $return_line =~ /^\s*RIO/ ) {
+        $analysis_performed = 1;
+    }
+
+    if ( $reading == 1
+         && $return_line =~ /^\s*# ####################################/ ) {
+        if ( $analysis_performed == 1 ) {
+            &analyze();
+        }
+        $reading = 0;
+    }
+
+    if ( $reading == 1 ) {
+        $lines[ $i++ ] = $return_line;
+    }
+}
+
+close( IN );
+
+print OUT "\n\nTotal: $total\n";
+
+close( OUT );
+close( OUT_SUMMARY );
+
+print "\nTotal: $total\n";
+print "Done.\n\n";
+
+exit( 0 );
+
+
+sub analyze { # Scans the buffered @lines ($i entries) and calls writeout() if the record meets $threshold.
+    my $j = 0;
+    my $results = 0; # 1 while the next matching line is the first result row after a "--------" line.
+    my $o_bootstraps = 0;
+    $top1 = "";
+
+    for ( $j = 0; $j < $i; $j++ ) {
+
+        if ( $lines[ $j ] =~ /^\s*--------\s+/ ) {
+            $results = 1;
+        }
+        elsif ( $lines[ $j ] =~ /^\s*Distance\s+values\s+/i ) {
+            $results = 0;
+        }
+        elsif ( $results == 1
+                && ( $lines[ $j ] =~ /\S+\s+\S+\s+\S+\s*$/
+                     || $lines[ $j ] =~ /^\s*!NO\s+ORTHOLOGS/ ) ) {
+
+            if ( $lines[ $j ] =~ /^\s*!NO\s+ORTHOLOGS/ ) {
+                $o_bootstraps = 0;
+            }
+            else {
+                $lines[ $j ] =~ /(\S+)\s+\S+\s+\S+\s*$/; # Third token from the end of the row is taken as the value.
+                $o_bootstraps = $1;
+                if ( $top1 eq "" ) { # Remember only the first result row of the record for the summary.
+                    $top1 = $lines[ $j ];
+                    $top1 =~ s/\n//;
+                    $top1 =~ s/\s{2,}/ /g;
+                }
+            }
+
+            $results = 0; # Only the first row after each "--------" line is evaluated.
+
+            if ( $o_bootstraps > 100 || $o_bootstraps < 0 ) {
+                print "o bootstraps: $o_bootstraps\n";
+                die "\n\n$0: Error: Bootstrap value(s) out of range.\n\n";
+            }
+
+            if ( $larger == 1 ) {
+                if ( $complete != 1 && $o_bootstraps >= $threshold ) { # "At least one" mode: first hit suffices.
+                    &writeout();
+                    $total++;
+                    return;
+                }
+                elsif ( $complete == 1 && $o_bootstraps < $threshold ) { # "All hits" mode: first miss rejects.
+                    return;
+                }
+            }
+            else {
+                if ( $complete != 1 && $o_bootstraps <= $threshold ) {
+                    &writeout();
+                    $total++;
+                    return;
+                }
+                elsif ( $complete == 1 && $o_bootstraps > $threshold ) {
+                    return;
+                }
+            }
+        }
+    }
+    if ( $complete == 1 ) { # "All hits" mode and no row failed the threshold.
+        &writeout();
+        $total++;
+    }
+    return;
+}
+
+
+
+sub writeout { # Writes the buffered record to OUT and a one-line entry to OUT_SUMMARY.
+    my $j = 0;
+    print OUT "# ############################################################################\n";
+    for ( $j = 0; $j < $i; ++$j ) {
+        print OUT "$lines[ $j ]";
+    }
+    print OUT "# ############################################################################\n\n\n";
+    print OUT_SUMMARY "$identifier [top 1: $top1]\n\n";
+}
+
+
+
+sub errorInCommandLine { # Prints the usage text and exits with status -1.
+    print "\n";
+    print " bootstrapSelector.pl version: $VERSION\n";
+    print " Usage: \"bootstrapSelector.pl <threshold options> <infile> <outfile>\"\n";
+    print " Options: \"l\" for \"less or equal\" (\"greater or equal\" is default)\n";
+    print " \"c\" for \"all hits must meet threshold in case of\n";
+    print " multiple copies of the same domain in the query\"\n";
+    print " (default: \"at least one\")\n";
+    print " Example:\n";
+    print " \"bootstrapSelector.pl 95lc OUTFILE_At_1 At_1_out\"\n\n";
+    print " Important: The result of this is meaningful ONLY if the thresholds\n";
+    print " for output of the RIO analysis are set to zero (L=0 R=0).\n\n";
+    exit( -1 );
+}
+
+
diff --git a/forester/archive/perl/bootstrap_cz.pl b/forester/archive/perl/bootstrap_cz.pl
new file mode 100755
index 0000000..0c020a5
--- /dev/null
+++ b/forester/archive/perl/bootstrap_cz.pl
@@ -0,0 +1,325 @@
+#!/usr/bin/perl -w
+
+# bootstrap_cz.pl
+# ---------------
+# Copyright (C) 1999-2003 Washington University School of Medicine
+# and Howard Hughes Medical Institute
+# All rights reserved
+#
+# Author: Christian M. Zmasek
+# zmasek@genetics.wustl.edu
+# http://www.genetics.wustl.edu/eddy/people/zmasek/
+#
+# Created: 05/17/01
+#
+# Last modified 08/26/03
+#
+# Purpose:
+# Bootstrap resamples an alignment in PHYLIP sequential format
+# <bootstraps> times.
+# Amino acid sequences must only be represented by uppercase letters (A-Z)
+# and '-'.
+# In mode 0 it saves the positions which it used to create the
+# bootstrapped alignment into <positions file>.
+# Mode 1 allows to recreate exactly the same bootstrapped alignment
+# by reading in a <positions file>.
+# Sequence names are normalized to $LENGTH_OF_NAME characters.
+# The output alignment is in PHYLIP's sequential or interleaved format.
+# (These two are the same in this case, since all the seqs will be one
+# line in length (no returns in seq).)
+#
+# Usage:
+# bootstrap_cz.pl <mode (0 or 1)> <bootstraps> <alignment infile>
+# <alignment outfile> <positions file (written in mode 0, read in mode 1)>
+# [random number seed (mode 0 only)]
+#
+
+use strict;
+use FindBin;
+use lib $FindBin::Bin;
+
+use rio_module;
+
+my $VERSION = "2.001";
+
+my $modus = -1; # 0 to create pos. file, 1 to use premade pos. file
+my $bootstraps = -1;
+my $infile = "";
+my $outalign_file = "";
+my $positions_file = "";
+my $seed = -1;
+
+
+$modus = $ARGV[ 0 ];
+$bootstraps = $ARGV[ 1 ];
+$infile = $ARGV[ 2 ];
+$outalign_file = $ARGV[ 3 ];
+$positions_file = $ARGV[ 4 ];
+$seed = $ARGV[ 5 ];
+
+if ( @ARGV != 5 && @ARGV != 6 ) {
+    &printUsage();
+    exit( -1 );
+}
+
+if ( $modus != 0 && $modus != 1 ) {
+    &printUsage();
+    exit( -1 );
+}
+
+if ( $modus == 0 && @ARGV != 6 ) {
+    &printUsage();
+    exit( -1 );
+}
+
+if ( $modus == 1 && @ARGV != 5 ) {
+    &printUsage();
+    exit( -1 );
+}
+
+if ( $bootstraps < 1 ) {
+    &printUsage();
+    exit( -1 );
+}
+
+if ( $seed && $seed < 0 ) {
+    &printUsage();
+    exit( -1 );
+}
+
+
+unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
+    die "\n\nbootstrap_cz.pl: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n";
+}
+if ( -e $outalign_file ) {
+    die "\n\nbootstrap_cz.pl: \"$outalign_file\" already exists.\n\n";
+}
+
+if ( $modus == 0 ) {
+    if ( -e $positions_file ) {
+        die "\n\nbootstrap_cz.pl: \"$positions_file\" already exists.\n\n";
+    }
+}
+else {
+    unless ( ( -s $positions_file ) && ( -f $positions_file ) && ( -T $positions_file ) ) {
+        die "\n\nbootstrap_cz.pl: \"$positions_file\" does not exist, is empty, or is not a plain textfile.\n\n";
+    }
+}
+
+if ( $modus == 0 ) {
+    &bootstrap( $modus, $bootstraps, $infile, $outalign_file, $positions_file, $seed );
+}
+else {
+    &bootstrap( $modus, $bootstraps, $infile, $outalign_file, $positions_file );
+}
+
+
+exit( 0 );
+
+
+
+# Methods
+# -------
+
+
+# Five/six arguments:
+# 1. Mode: 0 to create pos. file, 1 to use premade pos. file
+# 2. bootstraps
+# 3. Alignment infile name
+# 4. Outfile name
+# 5. file name for positions file (created if mode is 0, read if mode is 1)
+# [6. If modus is 0: seed for random number generator (0 is a valid seed)]
+#
+# This method is very similar to method "pfam2phylip" "in makeTree.pl".
+#
+# Last modified: 05/17/01
+#
+sub bootstrap {
+
+    my $modus = $_[ 0 ];
+    my $bootstraps = $_[ 1 ];
+    my $infile = $_[ 2 ];
+    my $outalign_file = $_[ 3 ];
+    my $positions_file = $_[ 4 ];
+
+    my @seq_name = ();
+    my @seq_array = (); # $seq_array[ column ][ sequence number ] = single residue character
+    my @random_numbers = (); # column indices used for the current replicate
+    my $return_line = "";
+    my $seq = "";
+    my $x = 0;
+    my $y = 0;
+    my $seq_no = 0;
+    my $original_length = 0;
+    my $max_x = 0;
+    my $n = 0;
+    my $i = 0;
+    my $random = -1;
+    my $length = 0;
+    my $number_of_seqs = 0;
+    my $number_of_colm = 0;
+
+
+    # Checks the arguments
+    # --------------------
+
+    if ( $modus == 0 ) {
+        if ( !defined( $_[ 5 ] ) ) { # defined(), not truth: a seed of 0 must be accepted.
+            die "\n\n$0: bootstrap: Failed to give a seed for random number generator.\n\n";
+        }
+        srand( $_[ 5 ] );
+    }
+    elsif( $modus == 1 ) {
+        if ( defined( $_[ 5 ] ) ) {
+            die "\n\n$0: bootstrap: Must not give a seed for random number generator.\n\n";
+        }
+        unless ( ( -s $positions_file ) && ( -f $positions_file ) && ( -T $positions_file ) ) {
+            die "\n\n$0: bootstrap: <<$positions_file>> does not exist, is empty, or is not a plain textfile.\n\n";
+        }
+    }
+    else {
+        die "\n\n$0: bootstrap: modus must be either 0 or 1.\n\n";
+    }
+
+    unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
+        die "\n\n$0: bootstrap: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n";
+    }
+
+
+
+    # Reads in the alignment
+    # ----------------------
+
+    open( IN, "$infile" ) || die "\n$0: bootstrap: Cannot open file <<$infile>>: $!";
+    while ( $return_line = <IN> ) {
+
+        if ( $return_line =~ /^\s*(\d+)\s+(\d+)/ ) { # PHYLIP header: number of seqs, number of columns
+            $number_of_seqs = $1;
+            $number_of_colm = $2;
+        }
+        elsif ( $return_line =~ /^(\S+)\s+(\S+)/ ) { # "name sequence" line
+            $seq_name[ $seq_no ] = substr( $1, 0, $LENGTH_OF_NAME );
+            $seq = $2;
+            if ( $original_length == 0 ) {
+                $original_length = length( $seq );
+            }
+            elsif ( $original_length != length( $seq ) ) {
+                die "\n\n$0: Sequences do not have the same length.\n\n";
+            }
+            for ( $x = 0; $x < $original_length; $x++ ) {
+                $seq_array[ $x ][ $seq_no ] = substr( $seq, $x, 1 );
+            }
+            $seq_no++;
+        }
+    }
+    close( IN );
+
+    if ( ( $number_of_seqs != $seq_no )
+         || ( $number_of_colm != $original_length ) ) {
+        die "\n\n$0: Number of sequences or number of columns are inconsistent with the values given in the alignment.\n\n";
+    }
+
+    # Adjusts the length of the names to $LENGTH_OF_NAME
+    # -------------------------------------------------
+
+    for ( $y = 0; $y < $seq_no; $y++ ) {
+        $length = length( $seq_name[ $y ] );
+        for ( $i = 0; $i <= ( $LENGTH_OF_NAME - $length - 1 ); $i++ ) {
+            $seq_name[ $y ] .= " ";
+        }
+    }
+
+
+
+    # Bootstraps $bootstraps times and writes the outputfiles
+    # -------------------------------------------------------
+
+    open( OUT, ">$outalign_file" ) || die "\n\n$0: bootstrap: Cannot create file <<$outalign_file>>: $!";
+    if ( $modus == 0 ) {
+        open( OUT_P, ">$positions_file" ) || die "\n\n$0: bootstrap: Cannot create file <<$positions_file>>: $!";
+    }
+    else {
+        open( IN_P, "$positions_file" ) || die "\n\n$0: bootstrap: Cannot open file <<$positions_file>>: $!";
+    }
+
+    for ( $n = 0; $n < $bootstraps; $n++ ) {
+
+        if ( $modus == 0 ) { # Draw the columns at random and record them, one line per replicate.
+            for ( $x = 0; $x < $original_length; $x++ ) {
+                $random = int( rand( $original_length ) );
+                print OUT_P "$random ";
+                $random_numbers[ $x ] = $random;
+            }
+            print OUT_P "\n";
+        }
+        else { # Replay the columns recorded in the positions file, one line per replicate.
+            $return_line = <IN_P>;
+            if ( !$return_line || $return_line !~ /\d/ ) {
+                die "\n\n$0: bootstrap: <<$positions_file>> seems too short or otherwise unsuitable.\n\n";
+            }
+            $return_line =~ s/^\s+//;
+            $return_line =~ s/\s+$//;
+            @random_numbers = split( /\s+/, $return_line );
+            if ( scalar( @random_numbers ) != $original_length ) {
+                die "\n\n$0: bootstrap: <<$positions_file>> seems not to correspond to <<$infile>>.\n\n";
+            }
+        }
+
+        print OUT " $seq_no $original_length\n";
+
+        for ( $y = 0; $y < $seq_no; $y++ ) {
+            print OUT "$seq_name[ $y ]";
+
+            for ( $x = 0; $x < $original_length; $x++ ) {
+                $random = $random_numbers[ $x ];
+                if ( !$seq_array[ $random ][ $y ] || $seq_array[ $random ][ $y ] !~ /[A-Z]|-/ ) { # Also trips on an out-of-range position (undefined cell).
+                    die "\n\n$0: Sequence must be represented by uppercase letters A-Z and \"-\" only.\n\n";
+                }
+                print OUT $seq_array[ $random ][ $y ];
+            }
+            print OUT "\n";
+        }
+    }
+
+    close( OUT );
+
+    if ( $modus == 0 ) {
+        print OUT_P "\n";
+        close( OUT_P );
+    }
+    else {
+        close( IN_P );
+    }
+
+    return;
+
+} ## bootstrap
+
+
+
+sub printUsage {
+    print "\n";
+    print " bootstrap_cz.pl $VERSION\n";
+    print " ---------------\n";
+    print "\n";
+    print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n";
+    print "\n";
+    print " Purpose:\n";
+    print " Bootstrap resamples an alignment in PHYLIP sequential format\n";
+    print " <bootstraps> times.\n";
+    print " In mode 0 it saves the positions which it used to create the\n";
+    print " bootstrapped alignment into <positions file>.\n";
+    print " Mode 1 allows to recreate exactly the same bootstrapped alignment\n";
+    print " by reading in a <positions file>.\n";
+    print " Sequence names are normalized to $LENGTH_OF_NAME characters.\n";
+    print " The output alignment is in PHYLIP's sequential or interleaved format.\n";
+    print " (These two are the same in this case, since all the seqs will be one\n";
+    print " line in length (no returns in seq).)\n";
+    print "\n";
+    print " Usage:\n";
+    print " bootstrap_cz.pl <mode (0 or 1)> <bootstraps> <alignment infile>\n";
+    print " <alignment outfile> <positions file (written in mode 0, read in mode 1)>\n";
+    print " [random number seed (mode 0 only)]\n";
+    print "\n";
+} ## printUsage
+
diff --git a/forester/archive/perl/countSpeciesSPTrEMBL.pl b/forester/archive/perl/countSpeciesSPTrEMBL.pl
new file mode 100755
index 0000000..1e9a626
--- /dev/null
+++ b/forester/archive/perl/countSpeciesSPTrEMBL.pl
@@ -0,0 +1,150 @@
+#!/usr/bin/perl -W
+
+# countSpeciesSPTrEMBL.pl
+# -----------------------
+#
+# Copyright (C) 2003 Christian M.
Zmasek
+# All rights reserved
+#
+# Created: 02/27/03
+# Last modified: 02/27/03
+# Author: Christian M. Zmasek
+# zmasek@genetics.wustl.edu
+# http://www.genetics.wustl.edu/eddy/people/zmasek/
+#
+# Last modified 05/23/02
+
+# Purpose. Counts species in SWISS-PROT and TrEMBL.
+#
+# Usage. countSpeciesSPTrEMBL.pl <trembl.dat file> <sprot.dat file> <outfile>
+#
+
+
+use strict;
+
+
+my $VERSION = "1.000";
+my $infile_sp = "";
+my $infile_tr = "";
+my $outfile = "";
+
+my $return_line = "";
+my $read = 0; # 1 while inside an entry whose OS line has not been handled yet.
+my $os = "";
+my %species_count = (); # full name -> count.
+
+
+if ( @ARGV != 3 ) {
+    &errorInCommandLine();
+}
+
+$infile_tr = $ARGV[ 0 ];
+$infile_sp = $ARGV[ 1 ];
+$outfile = $ARGV[ 2 ];
+
+
+
+if ( -e $outfile ) {
+    die "\n$0: <<$outfile>> already exists.\n\n";
+}
+unless ( ( -s $infile_tr ) && ( -f $infile_tr ) && ( -T $infile_tr ) ) {
+    die "\n$0: <<$infile_tr>> does not exist, is empty, or is not a plain textfile.\n\n";
+}
+unless ( ( -s $infile_sp ) && ( -f $infile_sp ) && ( -T $infile_sp ) ) {
+    die "\n$0: <<$infile_sp>> does not exist, is empty, or is not a plain textfile.\n\n";
+}
+
+open( IN_TR, "$infile_tr" ) || die "\n$0: Cannot open file <<$infile_tr>>: $!\n";
+open( IN_SP, "$infile_sp" ) || die "\n$0: Cannot open file <<$infile_sp>>: $!\n";
+open( OUT, ">$outfile" ) || die "\n$0: Cannot create file <<$outfile>>: $!\n";
+
+
+$read = 0;
+
+while ( $return_line = <IN_TR> ) { # First pass: the TrEMBL file.
+    if ( $return_line =~ /^AC\s+(\S+);/ ) {
+        $read = 1;
+    }
+    elsif ( $return_line =~ /^OS\s+(.+)\.\s*$/ && $read == 1 ) { # Only an OS line that ends with "." is counted.
+        $os = $1;
+        $os =~ s/\(.+\)//g; # Drop parenthesized remarks.
+        $os =~ s/^\s+//;
+        $os =~ s/\s+$//;
+        $os =~ s/\.$//;
+        if ( exists( $species_count{ $os } ) ) {
+            $species_count{ $os } = $species_count{ $os } + 1;
+        }
+        else {
+            $species_count{ $os } = 1;
+        }
+        print "$os\n";
+    }
+    elsif ( $return_line =~ /^\/\// && $read == 1 ) { # "//" ends an entry.
+        $read = 0;
+        $os = "";
+    }
+}
+
+close( IN_TR );
+
+$read = 0;
+$os = "";
+$return_line = "";
+
+while ( $return_line = <IN_SP> ) { # Second pass: the SWISS-PROT file.
+    if ( $return_line =~ /^ID\s+(\S+)/ ) {
+        $read = 1;
+    }
+    elsif ( $return_line =~ /^OS\s+(.+)\s*$/ && $read == 1 ) {
+        $os = $1;
+        $os =~ s/\(.+//g; # Drop everything from the first "(" onwards.
+        $os =~ s/^\s+//;
+        $os =~ s/\s+$//;
+        $os =~ s/\.$//;
+        $read = 0; # Count only the first OS line of an entry.
+        if ( exists( $species_count{ $os } ) ) {
+            $species_count{ $os } = $species_count{ $os } + 1;
+        }
+        else {
+            $species_count{ $os } = 1;
+        }
+        print "$os\n";
+    }
+    elsif ( $return_line =~ /^\/\// && $read == 1 ) {
+        $read = 0;
+        $os = "";
+    }
+}
+
+close( IN_SP );
+
+
+foreach my $species ( sort { $species_count{ $b } <=> $species_count{ $a } } keys %species_count ) { # Most frequent first.
+    print OUT "$species: $species_count{$species}\n";
+}
+
+
+print "\n\nDone!\n\n";
+
+close( OUT );
+
+exit( 0 );
+
+
+
+
+
+
+sub errorInCommandLine { # Prints the usage text and exits with status -1.
+    print "\n";
+    print " countSpeciesSPTrEMBL.pl $VERSION\n";
+    print " -----------------------\n";
+    print "\n";
+    print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n";
+    print "\n";
+    print " Purpose. Counts species in SWISS-PROT and TrEMBL.\n";
+    print "\n";
+    print " Usage. countSpeciesSPTrEMBL.pl <trembl.dat file> <sprot.dat file> <outfile>\n";
+    print "\n";
+    exit( -1 );
+}
diff --git a/forester/archive/perl/extractSWISS-PROT.pl b/forester/archive/perl/extractSWISS-PROT.pl
new file mode 100755
index 0000000..7bcfec0
--- /dev/null
+++ b/forester/archive/perl/extractSWISS-PROT.pl
@@ -0,0 +1,176 @@
+#!/usr/bin/perl -W
+
+# extractSWISS-PROT.pl
+# --------------------
+#
+# Copyright (C) 2001 Washington University School of Medicine
+# and Howard Hughes Medical Institute
+# All rights reserved
+#
+# Created: 09/25/01
+# Author: Christian M. Zmasek
+# zmasek@genetics.wustl.edu
+# http://www.genetics.wustl.edu/eddy/people/zmasek/
+#
+# Last modified 05/23/02
+
+# Purpose. Extracts ID, DE, and species from a "sprot.dat" file.
+# The output is used by "rio.pl".
+# If a species list (format: SWISS-PROT-code=full name) is supplied:
+# only sequences from species found in this list are written to
+# outfile (recommended).
+#
+# Usage. extractSWISS-PROT.pl <infile> <outfile> [species list]
+
+# Remark.
Need to re-run this if species in species tree or species list
+# are added/changed or if a new version of Pfam is used!!
+
+
+use strict;
+
+
+my $VERSION = "1.001";
+my $infile = "";
+my $outfile = "";
+my $speciesfile = "";
+my $return_line = "";
+my $read = 0; # 1 while inside an entry that is going to be written out.
+my $ac = "";
+my $de = "";
+my $os = "";
+my %Species_names = (); # SWISS-PROT name -> "".
+my $i = 0;
+
+if ( @ARGV != 2 && @ARGV != 3 ) {
+    &errorInCommandLine();
+}
+
+$infile = $ARGV[ 0 ];
+$outfile = $ARGV[ 1 ];
+
+if ( @ARGV == 3 ) {
+    $speciesfile = $ARGV[ 2 ];
+    unless ( ( -s $speciesfile ) && ( -f $speciesfile ) && ( -T $speciesfile ) ) {
+        die "\n$0: <<$speciesfile>> does not exist, is empty, or is not a plain textfile.\n\n";
+    }
+    &readSpeciesNamesFile( $speciesfile );
+}
+
+if ( -e $outfile ) {
+    die "\n$0: <<$outfile>> already exists.\n\n";
+}
+unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
+    die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n";
+}
+
+open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
+open( OUT, ">$outfile" ) || die "\n$0: Cannot create file <<$outfile>>: $!\n";
+
+print OUT "# extractSWISS-PROT.pl version: $VERSION\n";
+print OUT "# sprot.dat file: $infile\n";
+print OUT "# output file : $outfile\n";
+print OUT "# species file : $speciesfile\n";
+print OUT "# date : ".`date`."\n\n";
+
+$read = 0;
+
+while ( $return_line = <IN> ) {
+    if ( $return_line =~ /^ID\s+(\S+)/ ) {
+        $ac = $1;
+        $read = 1;
+        if ( $ac =~ /[A-Z0-9]+_([A-Z0-9]+)/ ) { # The species code is the part of the ID after "_".
+            $os = $1;
+        }
+        else {
+            die "\n$0: Unexpected format: $ac.\n\n";
+        }
+        if ( $speciesfile ne "" ) {
+            unless ( exists( $Species_names{ $os } ) ) { # Skip entries of species not in the list.
+                $read = 0;
+                $ac = "";
+                $de = "";
+                $os = "";
+                next;
+            }
+        }
+    }
+    elsif ( $return_line =~ /^DE\s+(.+)/ && $read == 1 ) { # Multi-line DE fields are joined with a blank.
+        if ( $de ne "" ) {
+            $de .= " ".$1;
+        }
+        else {
+            $de = $1;
+        }
+    }
+    elsif ( $return_line =~ /^\/\// && $read == 1 ) { # "//" ends an entry: write "ID;DE;species".
+        $read = 0;
+        print OUT "$ac;$de;$os\n";
+        $ac = "";
+        $de = "";
+        $os = "";
+        $i++;
+    }
+}
+
+close( IN );
+
+print OUT "\n # $i entries.\n";
+
+close( OUT );
+
+exit( 0 );
+
+
+
+# Reads in species file.
+# Format: SWISS-PROT=full name (e.g. "BACSU=Bacillus subtilis")
+# Lines beginning with "#" are ignored.
+# One argument: species file-name
+# Last modified: 04/24/01
+sub readSpeciesNamesFile {
+    my $infile = $_[ 0 ];
+    my $return_line = "";
+    my $sp = "";
+    my $full = "";
+
+    unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
+        die "\n$0: readSpeciesNamesFile: <<$infile>> does not exist, is empty, or is not a plain textfile.\n";
+    }
+
+    open( IN_RSNF, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
+    while ( $return_line = <IN_RSNF> ) {
+        if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)=(.+)/ ) {
+            $sp = $1;
+            $full = $2;
+            $full =~ s/^\s+//;
+            $full =~ s/\s+$//;
+            $Species_names{ $sp } = ""; # Only the key is used (membership test); the full name is not stored.
+        }
+    }
+    close( IN_RSNF );
+
+    return;
+}
+
+
+
+sub errorInCommandLine {
+    print "\n";
+    print " extractSWISS-PROT.pl $VERSION\n";
+    print " --------------------\n";
+    print "\n";
+    print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n";
+    print "\n";
+    print " Purpose. Extracts ID, DE, and species from a \"sprot.dat\" file.\n";
+    print " The resulting output is used by \"rio.pl\".\n";
+    print " If a species list (format: SWISS-PROT-code=full name) is supplied:\n";
+    print " only sequences from species found in this list are written to\n";
+    print " outfile (recommended).\n";
+    print "\n";
+    print " Usage. extractSWISS-PROT.pl <infile> <outfile> [species list]\n";
+    print "\n";
+    print " Remark.
Need to re-run this if species in species tree or species list\n";
+    print " are added/changed or if a new version of Pfam is used!!\n";
+    print "\n\n";
+    exit( -1 );
+}
diff --git a/forester/archive/perl/extractSpecies.pl b/forester/archive/perl/extractSpecies.pl
new file mode 100755
index 0000000..32a40de
--- /dev/null
+++ b/forester/archive/perl/extractSpecies.pl
@@ -0,0 +1,113 @@
+#!/usr/bin/perl -W
+
+# extractSpecies.pl
+# ----------------
+#
+# Copyright (C) 2003 Christian M. Zmasek
+# All rights reserved
+#
+# Created: 09/03/03
+# Author: Christian M. Zmasek
+# zmasek@genetics.wustl.edu
+# http://www.genetics.wustl.edu/eddy/people/zmasek/
+#
+# Last modified 03/12/04 (Added gg)
+
+# Purpose. Adds species information to a file describing a phylogenetic
+# tree in the following format (by way of example):
+# "((ceINX_CE33055:0.02883,cbINX_CB09748:0.02934):0.36899[&&NHX:B=100],..."
+# ce stands for "CAEEL". The hash %SPECIES needs to be set accordingly.
+#
+
+
+use strict;
+
+
+my %SPECIES = (
+    "dm" => "DROME",
+    "ag" => "ANOGA",
+    "ce" => "CAEEL",
+    "cb" => "CAEBR",
+    "ci" => "CIOIN",
+    "fr" => "FUGRU",
+    "gg" => "CHICK",
+    "rn" => "RAT",
+    "mm" => "MOUSE",
+    "hs" => "HUMAN"
+    );
+
+
+my $infile = "";
+my $outfile = "";
+my $intree = "";
+my $return_line = "";
+
+if ( @ARGV != 1 && @ARGV != 2 ) {
+    &errorInCommandLine();
+}
+
+$infile = $ARGV[ 0 ];
+
+if ( @ARGV == 1 ) { # Default outfile: infile name with ".nhx" replaced by "_species.nhx".
+    $outfile = $infile;
+    $outfile =~ s/\.nhx$//;
+    $outfile .= "_species.nhx";
+}
+
+if ( @ARGV == 2 ) {
+    $outfile = $ARGV[ 1 ];
+}
+
+
+
+
+if ( -e $outfile ) {
+    die "\n$0: <<$outfile>> already exists.\n\n";
+}
+unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
+    die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n";
+}
+
+open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
+open( OUT, ">$outfile" ) || die "\n$0: Cannot create file <<$outfile>>: $!\n";
+
+while ( $return_line = <IN> ) { # Concatenate the whole tree into one string without whitespace.
+    $return_line =~ s/\s+//g;
+    $return_line =~ s/\+/_/g;
+
+    $intree .= $return_line;
+
+}
+
+close( IN );
+
+while ( ( my $short, my $long ) = each ( %SPECIES ) ) {
+
+    while ( $intree =~ /[(),]($short[^\[]+?)[(),]/ ) { # A name already followed by "[" no longer matches, so the loop ends.
+
+        my $name_and_length = $1;
+
+        print "$name_and_length -> $name_and_length\[\&\&NHX:S=$long\]\n";
+
+        substr( $intree, $-[ 1 ], $+[ 1 ] - $-[ 1 ], $name_and_length . "[&&NHX:S=$long]" ); # Replace exactly the matched span; no regex re-interpretation of the name.
+
+    }
+
+}
+
+print OUT $intree;
+
+close( OUT );
+
+print "\n\nDone!\n\n";
+
+exit( 0 );
+
+
+
+sub errorInCommandLine {
+    print "\n";
+    print "extractSpecies.pl infile [outfile]";
+    print "\n\n";
+    exit( -1 );
+}
diff --git a/forester/archive/perl/extractTrembl.pl b/forester/archive/perl/extractTrembl.pl
new file mode 100755
index 0000000..fadf216
--- /dev/null
+++ b/forester/archive/perl/extractTrembl.pl
@@ -0,0 +1,199 @@
+#!/usr/bin/perl -W
+
+# extractTrembl.pl
+# ----------------
+#
+# Copyright (C) 2001 Washington University School of Medicine
+# and Howard Hughes Medical Institute
+# All rights reserved
+#
+# Created: 04/24/01
+# Author: Christian M. Zmasek
+# zmasek@genetics.wustl.edu
+# http://www.genetics.wustl.edu/eddy/people/zmasek/
+#
+# Last modified 05/23/02
+
+# Purpose. Extracts AC, DE, and OS from a "trembl.dat" file.
+# The output is used by "rio.pl".
+# If a species list (format: SWISS-PROT-code=full name) is supplied:
+# only sequences from species found in this list are written to
+# outfile and their full species names replaced with their SWISS-PROT
+# code (recommended).
+#
+# Usage. extractTrembl.pl <infile> <outfile> [species list]
+
+# Remark. Need to re-run this if species in species tree or species list
+# are added/changed or if a new version of Pfam is used!!
+
+# Some "heuristic" is required for Synechococcus, Synechocystis, Anabaena:
+# see below.
+
+use strict;
+
+
+my $VERSION = "1.001";
+my $infile = "";
+my $outfile = "";
+my $speciesfile = "";
+my $return_line = "";
+my $read = 0;
+my $ac = "";
+my $de = "";
+my $os = "";
+my %Species_names = (); # full name -> SWISS-PROT name.
+my $i = 0;
+
+if ( @ARGV != 2 && @ARGV != 3 ) {
+    &errorInCommandLine();
+}
+
+$infile = $ARGV[ 0 ];
+$outfile = $ARGV[ 1 ];
+
+if ( @ARGV == 3 ) {
+    $speciesfile = $ARGV[ 2 ];
+    unless ( ( -s $speciesfile ) && ( -f $speciesfile ) && ( -T $speciesfile ) ) {
+        die "\n$0: <<$speciesfile>> does not exist, is empty, or is not a plain textfile.\n\n";
+    }
+    &readSpeciesNamesFile( $speciesfile );
+}
+
+if ( -e $outfile ) {
+    die "\n$0: <<$outfile>> already exists.\n\n";
+}
+unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
+    die "\n$0: <<$infile>> does not exist, is empty, or is not a plain textfile.\n\n";
+}
+
+open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
+open( OUT, ">$outfile" ) || die "\n$0: Cannot create file <<$outfile>>: $!\n";
+
+print OUT "# extractTrembl.pl version: $VERSION\n";
+print OUT "# trembl.dat file: $infile\n";
+print OUT "# output file : $outfile\n";
+print OUT "# species file : $speciesfile\n";
+print OUT "# date : ".`date`."\n\n";
+
+$read = 0;
+
+while ( $return_line = <IN> ) {
+    if ( $return_line =~ /^AC\s+(\S+);/ ) {
+        $ac = $1;
+        $read = 1;
+    }
+    elsif ( $return_line =~ /^DE\s+(.+)/ && $read == 1 ) { # Multi-line DE fields are joined with a blank.
+        if ( $de ne "" ) {
+            $de .= " ".$1;
+        }
+        else {
+            $de = $1;
+        }
+    }
+
+    elsif ( $return_line =~ /^OS\s+(.+)\.\s*$/ && $read == 1 ) {
+        $os = $1;
+        if ( $speciesfile ne ""
+             && ( $os =~ /Synechococcus/
+                  || $os =~ /Synechocystis/
+                  || $os =~ /Anabaena/ ) ) {
+            if ( $os =~ /PCC\s*(\d+)/ ) { # Heuristic: these genera are looked up by their "PCC <number>" strain.
+                $os = "PCC ".$1;
+            }
+            else {
+                $read = 0;
+                $ac = "";
+                $de = "";
+                $os = "";
+                next;
+            }
+        }
+        else {
+            $os =~ s/\(.+\)//g; # Drop parenthesized remarks.
+        }
+        $os =~ s/^\s+//;
+        $os =~ s/\s+$//;
+        if ( $speciesfile ne "" ) {
+            unless ( exists( $Species_names{ $os } ) ) { # Skip entries of species not in the list.
+                $read = 0;
+                $ac = "";
+                $de = "";
+                $os = "";
+                next;
+            }
+            $os = $Species_names{ $os }; # Replace the full name with its SWISS-PROT code.
+        }
+    }
+    elsif ( $return_line =~ /^\/\// && $read == 1 ) { # "//" ends an entry: write "AC;DE;OS".
+        $read = 0;
+        print OUT "$ac;$de;$os\n";
+        $ac = "";
+        $de = "";
+        $os = "";
+        $i++;
+    }
+}
+
+close( IN );
+
+print OUT "\n # $i entries.\n";
+
+close( OUT );
+
+exit( 0 );
+
+
+
+# Reads in species file.
+# Format: SWISS-PROT=full name (e.g. "BACSU=Bacillus subtilis")
+# Lines beginning with "#" are ignored.
+# One argument: species file-name
+# Last modified: 04/24/01
+sub readSpeciesNamesFile {
+    my $infile = $_[ 0 ];
+    my $return_line = "";
+    my $sp = "";
+    my $full = "";
+
+    unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
+        die "\n$0: readSpeciesNamesFile: <<$infile>> does not exist, is empty, or is not a plain textfile.\n";
+    }
+
+    open( IN_RSNF, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n";
+    while ( $return_line = <IN_RSNF> ) {
+        if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)=(.+)/ ) {
+            $sp = $1;
+            $full = $2;
+            $full =~ s/^\s+//;
+            $full =~ s/\s+$//;
+            $Species_names{ $full } = $sp; # Keyed by full name so OS lines can be mapped to codes.
+        }
+    }
+    close( IN_RSNF );
+
+    return;
+}
+
+
+
+sub errorInCommandLine {
+    print "\n";
+    print " extractTrembl.pl $VERSION\n";
+    print " ----------------\n";
+    print "\n";
+    print " Christian Zmasek (zmasek\@genetics.wustl.edu)\n";
+    print "\n";
+    print " Purpose. Extracts AC, DE, and OS from a \"trembl.dat\" file.\n";
+    print " The resulting output is used by \"rio.pl\".\n";
+    print " If a species list (format: SWISS-PROT-code=full name) is supplied:\n";
+    print " only sequences from species found in this list are written to\n";
+    print " outfile and their full species names replaced with their SWISS-PROT\n";
+    print " code (recommended).\n";
+    print "\n";
+    print " Usage. extractTrembl.pl <infile> <outfile> [species list]\n";
+    print "\n";
+    print " Remark.
Need to re-run this if species in species tree or species list\n";
+    print " are added/changed or if a new version of Pfam is used!!\n";
+    print "\n\n";
+    exit( -1 );
+}
diff --git a/forester/archive/perl/file_proc.pl b/forester/archive/perl/file_proc.pl
new file mode 100755
index 0000000..0122778
--- /dev/null
+++ b/forester/archive/perl/file_proc.pl
@@ -0,0 +1,52 @@
+#!/usr/bin/perl -w
+
+my $in = $ARGV[ 0 ]; # input file
+my $out = $ARGV[ 1 ]; # output file; must not exist yet
+
+if ( -e $out ) {
+    print "File $out already exists.\n";
+    exit( -1 );
+}
+
+if ( !-e $in ) {
+    print "File $in does not exist.\n";
+    exit( -1 );
+}
+
+open( IN, $in ) || die ( "Could not open file $in for reading: $!\n" );
+open ( OUT, ">>$out" ) || die ( "Could not open file $out for writing!\n" );
+
+while ( my $line = <IN> ) { # Copy every line that proc_line() keeps.
+    my $newline = &proc_line( $line );
+    if ( length( $newline ) > 0 ) {
+        print OUT $newline;
+    }
+}
+
+
+close( OUT ) or die( "can't close $out: $!" );
+close( IN ) or die( "can't close $in: $!" );
+
+sub proc_line { # Returns the line unchanged if it is to be kept, "" if it is to be dropped.
+    my $line = shift;
+
+
+    if ( $line =~ /^#/ ) { # comment line
+        return "";
+    }
+    if ( $line =~ /^Predicted coding sequence\(s\):/ ) {
+        return "";
+    }
+    elsif ( $line =~ /^>.*_aa\s*$/ ) { # header line ending in "_aa"
+        return "";
+    }
+    elsif ( $line =~ /^>/ ) { # any other header line is kept
+        return $line;
+    }
+    elsif ( $line !~ /[a-z]/ ) { # lines without any lowercase letter are dropped
+        return "";
+    }
+    else {
+        return $line;
+    }
+}
diff --git a/forester/archive/perl/forester.pm b/forester/archive/perl/forester.pm
new file mode 100755
index 0000000..3494fd5
--- /dev/null
+++ b/forester/archive/perl/forester.pm
@@ -0,0 +1,1428 @@
+# $Id: forester.pm,v 1.26 2010/12/13 19:00:22 cmzmasek Exp $
+#
+# FORESTER -- software libraries and applications
+# for evolutionary biology research and applications.
+#
+# Copyright (C) 2007-2009 Christian M.
Zmasek +# Copyright (C) 2007-2009 Burnham Institute for Medical Research +# All rights reserved +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +# +# Contact: phylosoft @ gmail . com +# WWW: www.phylosoft.org/forester +# +# +# + + +package forester; +use strict; +require Exporter; + +our $VERSION = 1.000; + +our @ISA = qw( Exporter ); + +our @EXPORT = qw( executeConsense + executePhyloPl + executePuzzleDQO + executePuzzleDQObootstrapped + pfam2phylipMatchOnly + startsWithSWISS_PROTname + isPfamSequenceLine + isPfamCommentLine + containsPfamNamedSequence + isRFline + executeProtpars + setModelForPuzzle + setRateHeterogeneityOptionForPuzzle + setParameterEstimatesOptionForPuzzle + executePuzzleBootstrapped + executePuzzle + executeFastme + executeNeighbor + executeFitch + executeBionj + executeWeighbor + executePhyml + executeHmmfetch + addDistsToQueryToPWDfile + testForTextFilePresence + exitWithWarning + dieWithUnexpectedError + addSlashAtEndIfNotPresent + $LENGTH_OF_NAME + $MIN_NUMBER_OF_AA + $TREMBL_ACDEOS_FILE + $SWISSPROT_ACDEOS_FILE + $SPECIES_NAMES_FILE + $SPECIES_TREE_FILE_DEFAULT + $MULTIPLE_TREES_FILE_SUFFIX + $LOG_FILE_SUFFIX + $ALIGN_FILE_SUFFIX + $TREE_FILE_SUFFIX + $ADDITION_FOR_RIO_ANNOT_TREE + $SUFFIX_PWD + $SUFFIX_BOOT_STRP_POS + $MULTIPLE_PWD_FILE_SUFFIX + 
$SUFFIX_PWD_NOT_BOOTS + $SUFFIX_HMM + $MATRIX_FOR_PWD + $RIO_PWD_DIRECTORY + $RIO_BSP_DIRECTORY + $RIO_NBD_DIRECTORY + $RIO_ALN_DIRECTORY + $RIO_HMM_DIRECTORY + $PFAM_FULL_DIRECTORY + $PFAM_SEED_DIRECTORY + $PRIOR_FILE_DIR + $PFAM_HMM_DB + $FORESTER_JAR + $SEQBOOT + $NEIGHBOR + $PROTPARS + $CONSENSE + $PROML + $PHYLIP_VERSION + $PUZZLE + $PUZZLE_VERSION + $FASTME + $FASTME_VERSION + $BIONJ + $BIONJ_VERSION + $WEIGHBOR + $WEIGHBOR_VERSION + $RAXML + $RAXML_VERSION + $PHYML + $PHYML_VERSION + $HMMALIGN + $HMMSEARCH + $HMMBUILD + $HMMFETCH + $SFE + $HMMCALIBRATE + $P7EXTRACT + $MULTIFETCH + $BOOTSTRAP_CZ + $BOOTSTRAP_CZ_PL + $SUPPORT_TRANSFER + $SUPPORT_STATISTICS + $NEWICK_TO_PHYLOXML + $PHYLO_PL + $RIO_PL + $DORIO + $PUZZLE_DQO + $BOOTSTRAPS + $PATH_TO_FORESTER + $JAVA + $NODE_LIST + $RIO_SLAVE_DRIVER + $RIO_SLAVE + $TEMP_DIR_DEFAULT + $EXPASY_SPROT_SEARCH_DE + $EXPASY_SPROT_SEARCH_AC + ); + + + + +# ============================================================================= +# ============================================================================= +# +# THESE VARIABLES ARE ENVIRONMENT DEPENDENT, AND NEED TO BE SET ACCORDINGLY +# BY THE USER +# ------------------------------------------------------------------------- +# + +# For using just "phylo_pl.pl", only the following variables need to be set +# $JAVA +# $FORESTER_JAR +# $TEMP_DIR_DEFAULT +# $SEQBOOT +# $CONSENSE +# $PUZZLE +# $FASTME +# $NEIGHBOR +# $FITCH +# $BIONJ +# $WEIGHBOR +# $PHYML +# $PROTPARS + +# Software directory: +# --------------------- + +our $SOFTWARE_DIR = "/home/czmasek/SOFTWARE/"; + + +# Java virtual machine: +# --------------------- +our $JAVA = $SOFTWARE_DIR."JAVA/jdk1.6.0_03/bin/java"; + + +# Where all the temporary files can be created: +# --------------------------------------------- +our $TEMP_DIR_DEFAULT = "/tmp/"; + + +# Programs from Joe Felsenstein's PHYLIP package: +# ----------------------------------------------- +our $SEQBOOT = 
$SOFTWARE_DIR."PHYLIP/phylip-3.68/src/seqboot"; +our $NEIGHBOR = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/neighbor"; +our $PROTPARS = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/protpars"; +our $PROML = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/proml"; +our $FITCH = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/fitch"; +our $CONSENSE = $SOFTWARE_DIR."PHYLIP/phylip-3.68/src/consense"; +our $PHYLIP_VERSION = "3.68"; + +# TREE-PUZZLE: +# ------------ +our $PUZZLE = $SOFTWARE_DIR."TREE_PUZZLE/tree-puzzle-5.2/src/puzzle"; +our $PUZZLE_VERSION = "5.2"; + +# FASTME: +# ----------------------------------------------------- +our $FASTME = $SOFTWARE_DIR."FASTME/fastme2.0/fastme"; +our $FASTME_VERSION = "2.0"; + +# BIONJ: +# ----------------------------------------------------- +our $BIONJ = $SOFTWARE_DIR."BIONJ/bionj"; +our $BIONJ_VERSION = "[1997]"; + +# WEIGHBOR: +# ----------------------------------------------------- +our $WEIGHBOR = $SOFTWARE_DIR."WEIGHBOR/Weighbor/weighbor"; +our $WEIGHBOR_VERSION = "1.2.1"; + +# PHYML: +# ----------------------------------------------------- +our $PHYML = $SOFTWARE_DIR."PHYML/phyml_v2.4.4/exe/phyml_linux"; +our $PHYML_VERSION = "2.4.4"; + +# RAXML: +# ----------------------------------------------------- +our $RAXML = $SOFTWARE_DIR."RAXML/RAxML-7.0.4/raxmlHPC"; +our $RAXML_VERSION = "7.0.4"; + + +# forester.jar. This jar file is currently available at: http://www.phylosoft.org +# ------------------------------------------------------------------------------- + +our $FORESTER_JAR = $SOFTWARE_DIR."FORESTER/DEV/forester-atv/java/forester.jar"; + + + +# End of variables which need to be set by the user for using "phylo_pl.pl". + + + + + + + + + + + + + + +# Tool from forester.jar to transfer support values: +# ------------------------------------------------- +our $SUPPORT_TRANSFER = $JAVA." 
-cp $FORESTER_JAR org.forester.application.support_transfer"; + + + +# Tool from forester.jar for simple statistics for support values: +# ---------------------------------------------------------------- +our $SUPPORT_STATISTICS = $JAVA." -cp $FORESTER_JAR org.forester.application.support_statistics"; + + +# Tool from forester.jar to transfer nh to phyloXML: +# ------------------------------------------------- +our $NEWICK_TO_PHYLOXML = $JAVA." -cp $FORESTER_JAR org.forester.application.phyloxml_converter"; + + + +# FORESTER itself (currently not needed for "phylo_pl.pl"): +# --------------------------------------------------------- +our $PATH_TO_FORESTER = ""; + + +# Pfam data (not needed for phylo_pl.pl): +# -------------------------------------- +our $PFAM_FULL_DIRECTORY = "/path/to/Pfam/Full/"; +our $PFAM_SEED_DIRECTORY = "/path/to/Pfam/Seed/"; +our $PFAM_HMM_DB = "/path/to/Pfam/Pfam_ls"; # Need to run "hmmindex" on this + # to produce .ssi file. + # Then, for example + # "setenv HMMERDB /home/rio/pfam-6.6/" + + +$PATH_TO_FORESTER = &addSlashAtEndIfNotPresent( $PATH_TO_FORESTER ); + + +# Description lines and species from SWISS-PROT and TrEMBL (not needed for phylo_pl.pl): +# ------------------------------------------------------------------------------------- +our $TREMBL_ACDEOS_FILE = $PATH_TO_FORESTER."data/trembl22_ACDEOS_1-6"; + +our $SWISSPROT_ACDEOS_FILE = $PATH_TO_FORESTER."data/sp40_ACDEOS_1-6"; + + + +# Names of species which can be analyzed and analyzed +# against (must also be in tree $SPECIES_TREE_FILE_DEFAULT). +# By using a list with less species, RIO analyses become faster +# but lose phylogenetic resolution. 
+# For many purposes, list "tree_of_life_bin_1-6_species_list" +# in "data/species/" might be sufficient: +# (not needed for phylo_pl.pl) +# -------------------------------------------------------------- +our $SPECIES_NAMES_FILE = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6_species_list"; + + + +# A default species tree in NHX format. +# For many purposes, tree "tree_of_life_bin_1-6.nhx" +# in "data/species/" might be fine: +# (not needed for phylo_pl.pl) +# -------------------------------------------------- +our $SPECIES_TREE_FILE_DEFAULT = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6.nhx"; + + + +# Data for using precalculated distances: +# (not needed for phylo_pl.pl) +# --------------------------------------- +our $MATRIX_FOR_PWD = 2; # The matrix which has been used for the pwd in $RIO_PWD_DIRECTORY. + # 0=JTT, 1=PAM, 2=BLOSUM 62, 3=mtREV24, 5=VT, 6=WAG. + +our $RIO_PWD_DIRECTORY = $PATH_TO_FORESTER."example_data/"; # all must end with "/" +our $RIO_BSP_DIRECTORY = $PATH_TO_FORESTER."example_data/"; +our $RIO_NBD_DIRECTORY = $PATH_TO_FORESTER."example_data/"; +our $RIO_ALN_DIRECTORY = $PATH_TO_FORESTER."example_data/"; +our $RIO_HMM_DIRECTORY = $PATH_TO_FORESTER."example_data/"; + + + +# +# End of variables which need to be set by the user. +# +# ============================================================================= +# ============================================================================= + + + + + +$TEMP_DIR_DEFAULT = &addSlashAtEndIfNotPresent( $TEMP_DIR_DEFAULT ); +$PFAM_FULL_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_FULL_DIRECTORY ); +$PFAM_SEED_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_SEED_DIRECTORY ); + + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# These variables should normally not be changed: +# + +our $PRIOR_FILE_DIR = $PATH_TO_FORESTER."data/priors_for_hmmbuild/"; + # Directory containing dirichlet prior + # files needed for certain aligments + # by hmmbuild (e.g. 
Collagen). + + + + + +# TREE-PUZZLE: +our $PUZZLE_DQO = $PATH_TO_FORESTER."puzzle_dqo/src/puzzle"; + +# HMMER: +our $HMMALIGN = $PATH_TO_FORESTER."hmmer/binaries/hmmalign"; +our $HMMSEARCH = $PATH_TO_FORESTER."hmmer/binaries/hmmsearch"; +our $HMMBUILD = $PATH_TO_FORESTER."hmmer/binaries/hmmbuild"; +our $HMMFETCH = $PATH_TO_FORESTER."hmmer/binaries/hmmfetch"; +our $SFE = $PATH_TO_FORESTER."hmmer/binaries/sfetch"; +our $HMMCALIBRATE = $PATH_TO_FORESTER."hmmer/binaries/hmmcalibrate"; + +our $P7EXTRACT = $PATH_TO_FORESTER."perl/p7extract.pl"; +our $MULTIFETCH = $PATH_TO_FORESTER."perl/multifetch.pl"; + + +# RIO/FORESTER: +our $BOOTSTRAP_CZ = $PATH_TO_FORESTER."C/bootstrap_cz"; +our $BOOTSTRAP_CZ_PL = $PATH_TO_FORESTER."perl/bootstrap_cz.pl"; +#our $SUPPORT_TRANSFER = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.transfersBranchLenghts"; +#our $SUPPORT_TRANSFER = $JAVA." -cp /home/czmasek/SOFTWARE/FORESTER/forester3/forester.jar org.forester.tools.SupportTransfer"; + +our $PHYLO_PL = $PATH_TO_FORESTER."perl/phylo_pl.pl"; +our $RIO_PL = $PATH_TO_FORESTER."perl/rio.pl"; +our $DORIO = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.DoRIO"; +# parallel RIO: +our $RIO_SLAVE_DRIVER = $PATH_TO_FORESTER."perl/rio_slave_driver.pl"; +our $RIO_SLAVE = $PATH_TO_FORESTER."perl/rio_slave.pl"; +our $NODE_LIST = $PATH_TO_FORESTER."data/node_list.dat"; + +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +our $BOOTSTRAPS = 100; +our $MIN_NUMBER_OF_AA = 20; # After removal of gaps, if less, gaps are not removed. 
our $LENGTH_OF_NAME = 10;


# File name suffixes used for the various intermediate and result files.
our $MULTIPLE_TREES_FILE_SUFFIX  = ".mlt";
our $LOG_FILE_SUFFIX             = ".log";
our $ALIGN_FILE_SUFFIX           = ".aln";
our $TREE_FILE_SUFFIX            = ".nhx";
our $ADDITION_FOR_RIO_ANNOT_TREE = ".rio";
our $SUFFIX_PWD                  = ".pwd";
our $MULTIPLE_PWD_FILE_SUFFIX    = ".mpwd";
our $SUFFIX_BOOT_STRP_POS        = ".bsp";
our $SUFFIX_PWD_NOT_BOOTS        = ".nbd";
our $SUFFIX_HMM                  = ".hmm";

# Query URL prefixes for ExPASy SWISS-PROT searches (by description / by accession).
our $EXPASY_SPROT_SEARCH_DE = "http://www.expasy.org/cgi-bin/sprot-search-de?";
our $EXPASY_SPROT_SEARCH_AC = "http://www.expasy.org/cgi-bin/sprot-search-ac?";



# Runs $CONSENSE on a multiple trees file.
# The file name and a confirming "Y" are fed to the program on stdin
# through a shell here-document; all program output is discarded.
# One argument: input multiple trees file
# Dies (via dieWithUnexpectedError) if the input is not a non-empty text
# file or if the shell reports a non-zero exit status.
sub executeConsense {
    my ( $trees_file ) = @_;

    &testForTextFilePresence( $trees_file );

    my $shell_command = "$CONSENSE >/dev/null 2>&1 << !
$trees_file
Y
!";

    if ( system( $shell_command ) ) {
        &dieWithUnexpectedError( "Could not execute \"$CONSENSE $trees_file\"" );
    }

    return;
}



# Runs $PHYLO_PL.
# Four arguments:
# 1. options (a leading "-" is prepended here, so it must not be supplied)
# 2. alignment or pwd file (must be a non-empty text file)
# 3. outfile
# 4. temp dir
sub executePhyloPl {

    my ( $flags, $input, $output, $work_dir ) = @_;

    &testForTextFilePresence( $input );

    $flags = "-".$flags;

    my $command = "$PHYLO_PL $flags $input $output $work_dir";

    # Kept as the final expression: the sub has no explicit return.
    system( $command )
    && &dieWithUnexpectedError( "Could not execute \"$command\"" );

} ## executePhyloPl




# Runs $PUZZLE_DQO on one input file; program output is discarded.
# The menu key strokes selecting the substitution model (as produced by
# setModelForPuzzle) followed by "y" are fed on stdin.
# Two arguments:
# 1. Name of inputfile
# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
#    5 = VT; 6 = WAG; 7 = auto; PAM otherwise
sub executePuzzleDQO {
    my ( $alignment, $matrix_option ) = @_;

    &testForTextFilePresence( $alignment );

    my $model_keys = setModelForPuzzle( $matrix_option );

    my $shell_command = "$PUZZLE_DQO $alignment >/dev/null 2>&1 << !$model_keys
y
!";

    if ( system( $shell_command ) ) {
        &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO\"" );
    }

    return;

} ## executePuzzleDQO




# Two arguments:
# 1. Name of inputfile
# 2. 
matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; +# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise +# Last modified: 01/28/02 +sub executePuzzleDQObootstrapped { + my $in = $_[ 0 ]; + my $matrix_option = $_[ 1 ]; + + + my $l = 0; + my $slen = 0; + my $counter = 0; + my $mat = ""; + my $a = ""; + my @a = (); + + &testForTextFilePresence( $in ); + + open( GRP, "<$in" ) || &dieWithUnexpectedError( "Cannot open file \"$in\"" ); + while( ) { + if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) { + $counter++; + } + } + close( GRP ); + + $l = `cat $in | wc -l`; + $slen = $l / $counter; + + system( "split -$slen $in $in.splt." ) + && &dieWithUnexpectedError( "Could not execute \"split -$slen $in $in.splt.\"" ); + + @a = <$in.splt.*>; + + $mat = setModelForPuzzle( $matrix_option ); + + foreach $a ( @a ) { + + system( "$PUZZLE_DQO $a >/dev/null 2>&1 << !$mat +y +!" ) + && &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO $a\"" ); + + system( "cat $a.dist >> $in.dist" ) + && &dieWithUnexpectedError( "Could not execute \"cat outdist >> $in.dist\"" ); + + unlink( $a, $a.".dist" ); + } + + return; + +} ## executePuzzleDQObootstrapped + + + +# Transfers a Pfam (SELEX) alignment to a +# PHYLIP sequential style alignment. +# It only writes "match columns" as indicated by the +# "# RF" line ('x' means match). +# +# Three arguments: +# 1. infile name +# 2. outfile name +# 3. 
1 to NOT ensure that match states contain only 'A'-'Z' or '-' +# +# Returns the number of match states (=length of output alignment), +# the length of the input alignment, +# the number of seqs in the input alignment +# +# Last modified: 07/07/01 +# +sub pfam2phylipMatchOnly { + + my $infile = $_[ 0 ]; + my $outfile = $_[ 1 ]; + my $ne = $_[ 2 ]; + my @seq_name = (); + my @seq_array = (); + my $return_line = ""; + my $seq = ""; + my $x = 0; + my $y = 0; + my $i = 0; + my $x_offset = 0; + my $max_x = 0; + my $rf_y = 0; + my $number_colum = 0; + my $not_ensure = 0; + my $saw_rf_line = 0; + + if ( $ne && $ne == 1 ) { + $not_ensure = 1; + } + + &testForTextFilePresence( $infile ); + + open( INPP, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); + + # This reads in the first block. It reads in the seq names. + while ( 1 ) { + if ( &isPfamSequenceLine( $return_line ) ) { + $return_line =~ /^(\S+)\s+(\S+)/; + $seq_name[ $y ] = substr( $1, 0, $LENGTH_OF_NAME ); + $seq = $2; + for ( $x = 0; $x < length( $seq ); $x++ ) { + $seq_array[ $x ][ $y ] = substr( $seq, $x, 1 ); + } + $y++; + } + elsif ( &isRFline( $return_line ) ) { + $saw_rf_line = 1; + $return_line =~ /\s+(\S+)\s*$/; + $seq = $1; + $x_offset = length( $seq ); + $rf_y = $y; + for ( $x = 0; $x < $x_offset; $x++ ) { + $seq_array[ $x ][ $rf_y ] = substr( $seq, $x, 1 ); + } + last; + } + + $return_line = ; + + if ( !$return_line ) { + &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" ); + } + } + + if ( $saw_rf_line != 1 ) { + &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" ); + } + + $y = 0; + $max_x = 0; + + # This reads all blocks after the 1st one. 
+ while ( $return_line = ) { + if ( &isPfamSequenceLine( $return_line ) ) { + $return_line =~ /^\S+\s+(\S+)/; + $seq = $1; + for ( $x = 0; $x < length( $seq ); $x++ ) { + $seq_array[ $x + $x_offset ][ $y % $rf_y ] = substr( $seq, $x, 1 ); + } + $y++; + } + elsif ( &isRFline( $return_line ) ) { + if ( $y != $rf_y ) { + &dieWithUnexpectedError( "Alignment not in expected format" ); + } + + $return_line =~ /\s+(\S+)\s*$/; + $seq = $1; + $max_x = length( $seq ); + + for ( $x = 0; $x < length( $seq ); $x++ ) { + $seq_array[ $x + $x_offset ][ $rf_y ] = substr( $seq, $x, 1 ); + } + + $y = 0; + $x_offset = $x_offset + $max_x; + $max_x = 0; + } + } + + close( INPP ); + + # Counts the match states, and hence the number of aa in the alignment: + for ( $x = 0; $x < $x_offset; $x++ ) { + if ( !$seq_array[ $x ][ $rf_y ] ) { + &dieWithUnexpectedError( "Alignment not in expected format" ); + } + if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) { + $number_colum++; + } + } + + # Writes the file: + + open( OUTPP, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); + print OUTPP "$rf_y $number_colum\n"; + for ( $y = 0; $y < $rf_y; $y++ ) { + print OUTPP "$seq_name[ $y ]"; + for ( $i = 0; $i < ( $LENGTH_OF_NAME - length( $seq_name[ $y ] ) ); $i++ ) { + print OUTPP " "; + } + for ( $x = 0; $x < $x_offset; $x++ ) { + if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) { + if ( !$seq_array[ $x ][ $y ] ) { + &dieWithUnexpectedError( "Alignment not in expected format" ); + } + if ( $not_ensure != 1 && $seq_array[ $x ][ $y ] !~ /[A-Z]|-/ ) { + &dieWithUnexpectedError( "Alignment not in expected format (match states must only contain 'A'-'Z' or '-')" ); + } + print OUTPP "$seq_array[ $x ][ $y ]"; + } + } + print OUTPP "\n"; + } + close( OUTPP ); + + return $number_colum, $x_offset, $rf_y; + +} ## pfam2phylipMatchOnly + + + +# Returns whether the argument (a String) +# starts with a SWISS-PROT name (SEQN_SPECI). 
# Last modified: 06/21/01
sub startsWithSWISS_PROTname {
    # 1-4 characters of [A-Z0-9], an underscore, then 1-5 characters of [A-Z0-9],
    # anchored at the start of the string.
    return ( $_[ 0 ] =~ /^[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/ );
}



# Returns whether the argument starts with XXX.. XXXXX.. and the first
# character is not a "#".
# Last modified: 06/21/01
sub isPfamSequenceLine {
    return( !&isPfamCommentLine( $_[ 0 ] )
    && &containsPfamNamedSequence( $_[ 0 ] ) );
}



# Returns whether the argument does start with a "#".
# Last modified: 06/21/01
sub isPfamCommentLine {
    return ( $_[ 0 ] =~ /^#/ );
}



# Returns whether the argument starts with XXX XXXXX
# (non-whitespace, whitespace, non-whitespace).
# Last modified: 06/21/01
sub containsPfamNamedSequence {
    return ( $_[ 0 ] =~ /^\S+\s+\S+/ );
}


# Returns whether the argument starts with a "#" and contains "RF"
# somewhere after it.
# Last modified: 06/21/01
sub isRFline {
    return ( $_[ 0 ] =~ /^#.*RF/ );
}



# Runs $FASTME on a pairwise distance file.
# Three arguments:
# 1. pairwise distance file (must be a non-empty text file)
# 2. number of bootstraps (passed as "-n" only if greater than 0)
# 3. initial tree: BME, GME or NJ (passed as "-b")
# The command line is printed to stdout before it is executed.
# Dies (via dieWithUnexpectedError) if the command returns a non-zero
# status, like the other execute* wrappers in this module.
# Last modified: 2008/12/31
sub executeFastme {
    my $inpwd    = $_[ 0 ];
    my $bs       = $_[ 1 ];
    my $init_opt = $_[ 2 ];

    &testForTextFilePresence( $inpwd );
    my $command = "";
    if ( $bs > 0 ) {
        $command = "$FASTME -b $init_opt -i $inpwd -n $bs -s b";
    }
    else {
        $command = "$FASTME -b $init_opt -i $inpwd -s b";
    }
    print $command;

    # Previously the exit status was silently ignored.
    system( $command )
    && &dieWithUnexpectedError( "Could not execute \"$command\"" );

} ## executeFastme


# Runs $NEIGHBOR; the answers to its interactive menu are fed on stdin
# through a shell here-document and all program output is discarded.
# Four arguments:
# 1. pairwise distance file (must be a non-empty text file)
# 2. number of bootstraps (the "M" multiple data sets option is only
#    sent if this is 2 or more)
# 3. seed for random number generator (only sent together with "M")
# 4. lower-triangular data matrix? 1: yes; no, otherwise
sub executeNeighbor {
    my $inpwd = $_[ 0 ];
    my $bs    = $_[ 1 ];
    my $s     = $_[ 2 ];
    my $l     = $_[ 3 ];
    my $multi = "";
    my $lower = "";

    &testForTextFilePresence( $inpwd );

    if ( $bs >= 2 ) {
        $multi = "
M
$bs
$s";
    }
    if ( $l == 1 ) {
        $lower = "
L";
    }

    # NOTE(review): "2" and "3" are presumably menu toggles for progress
    # and tree printing - confirm against the PHYLIP NEIGHBOR menu.
    system( "$NEIGHBOR >/dev/null 2>&1 << !
$inpwd$multi$lower
2
3
Y
!" )
    && &dieWithUnexpectedError( "Could not execute \"$NEIGHBOR $inpwd$multi$lower\"" );

} ## executeNeighbor


# Seven arguments:
# 1. 
pairwise distance file +# 2. number of bootstraps +# 3. seed for random number generator +# 4. number of jumbles for input order +# 5. lower-triangular data matrix? 1: yes; no, otherwise +# 6. FM for Fitch-Margoliash, ME for ME# 6. +# 7. 1 to use globale rearragements +sub executeFitch { + my $inpwd = $_[ 0 ]; + my $bs = $_[ 1 ]; + my $s = $_[ 2 ]; + my $j = $_[ 3 ]; + my $l = $_[ 4 ]; + my $m = $_[ 5 ]; + my $use_global_rearr = $_[ 6 ]; + my $jumble = ""; + my $multi = ""; + my $lower = ""; + my $method = ""; + + my $global = ""; + if ( $use_global_rearr == 1 ) { + $global = " +G"; + } + + &testForTextFilePresence( $inpwd ); + + if ( $m eq "FM" ) { + $method = ""; + } + elsif ( $m eq "ME" ) { + $method = " +D"; + } + else { + &dieWithUnexpectedError( "method for FITCH must be either FM or ME" ); + } + + if ( $j >= 1 ) { + $jumble = " +J +$s +$j"; + } + + if ( $bs >= 2 ) { + $multi = " +M +$bs +$s"; + } + if ( $l == 1 ) { + $lower = " +L"; + } + + # jumble must be set BEFORE multi! + system( "$FITCH 2>&1 << ! +$inpwd$method$global$jumble$multi$lower +3 +Y +!" ) + && &dieWithUnexpectedError( "Could not execute \"$FITCH $inpwd$method$global$jumble$multi$lower\"" ); + # 3: Do NOT print out tree + +} ## executeFitch + + + +# Two arguments: +# 1. pairwise distance file +# 2. outfile +sub executeBionj { + my $inpwd = $_[ 0 ]; + my $out = $_[ 1 ]; + + &testForTextFilePresence( $inpwd ); + my $command = "$BIONJ $inpwd $out"; + + system( $command ) + && &dieWithUnexpectedError( $command ); + +} + +# Four arguments: +# 1. (effective) sequence length +# 2. (effective) number of bases +# 3. pairwise distance file +# 4. outfile +sub executeWeighbor { + my $L = $_[ 0 ]; + my $b = $_[ 1 ]; + my $i = $_[ 2 ]; + my $o = $_[ 3 ]; + + &testForTextFilePresence( $i ); + my $command = "$WEIGHBOR -L $L -b $b -i $i -o $o"; + + system( $command ) + && &dieWithUnexpectedError( $command ); + +} + +# Six arguments: +# 1. DNA or Amino-Acids sequence filename (PHYLIP format) +# 2. 
number of data sets to analyse (ex:3) +# 3. Model: JTT | MtREV | Dayhoff | WAG | VT | DCMut | Blosum62 (Amino-Acids) +# 4. number of relative substitution rate categories (ex:4), positive integer +# 5. starting tree filename (Newick format), your tree filename | BIONJ for a distance-based tree +# 6. 1 to estimate proportion of invariable sites, otherwise, fixed proportion "0.0" is used +# PHYML produces several results files : +# _phyml_lk.txt : likelihood value(s) +# _phyml_tree.txt : inferred tree(s) +# _phyml_stat.txt : detailed execution stats +sub executePhyml { + my $sequences = $_[ 0 ]; + my $data_sets = $_[ 1 ]; + my $model = $_[ 2 ]; + my $nb_categ = $_[ 3 ]; + my $tree = $_[ 4 ]; + my $estimate_invar_sites = $_[ 5 ]; + + if ( $data_sets < 1 ) { + $data_sets = 1 + } + + my $invar = "0.0"; # proportion of invariable sites, + # a fixed value (ex:0.0) | e to get the maximum likelihood estimate + if ( $estimate_invar_sites == 1 ) { + $invar = "e"; + } + + my $data_type = "1"; # 0 = DNA | 1 = Amino-Acids + my $format = "i"; # i = interleaved sequence format | s = sequential + my $bootstrap_sets = "0"; # number of bootstrap data sets to generate (ex:2) + # only works with one data set to analyse + + my $alpha = "e"; # gamma distribution parameter, + # a fixed value (ex:1.0) | e to get the maximum likelihood estimate + + my $opt_topology = "y"; # optimise tree topology ? y | n + my $opt_lengths = "y"; # optimise branch lengths and rate parameters ? y | n + + if ( $data_sets > 1 ) { + # No need to calc branch lengths for bootstrapped analysis + $opt_lengths = "n"; + } + + &testForTextFilePresence( $sequences ); + my $command = "$PHYML $sequences $data_type $format $data_sets $bootstrap_sets $model $invar $nb_categ $alpha $tree $opt_topology $opt_lengths"; + + print( "\n$command\n"); + + system( $command ) + && &dieWithUnexpectedError( $command ); + +} + + + + +# Four arguments: +# 1. name of alignment file (in correct format!) +# 2. number of bootstraps +# 3. 
jumbles: 0: do not jumble; >=1 number of jumbles +# 4. seed for random number generator +sub executeProtpars { + my $align = $_[ 0 ]; + my $bs = $_[ 1 ]; + my $rand = $_[ 2 ]; + my $s = $_[ 3 ]; + my $jumble = ""; + my $multi = ""; + + + &testForTextFilePresence( $align ); + + if ( $bs > 1 && $rand < 1 ) { + $rand = 1; + } + + if ( $rand >= 1 ) { + $jumble = " +J +$s +$rand"; + } + + if ( $bs > 1 ) { + $multi = " +M +D +$bs"; + } + + system( "$PROTPARS 2>&1 << ! +$align$jumble$multi +Y +!" ) + && &dieWithUnexpectedError( "Could not execute \"$PROTPARS $align$jumble$multi\"" ); + # 3: Do NOT print out tree + + + return; + +} ## executeProtpars + + + +# "Model of substitution" order for DQO TREE-PUZZLE 5.0: +# Auto +# m -> Dayhoff (Dayhoff et al. 1978) +# m -> JTT (Jones et al. 1992) +# m -> mtREV24 (Adachi-Hasegawa 1996) +# m -> BLOSUM62 (Henikoff-Henikoff 92) +# m -> VT (Mueller-Vingron 2000) +# m -> WAG (Whelan-Goldman 2000) +# m -> Auto +# One argument: +# matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; +# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise +# Last modified: 07/07/01 +sub setModelForPuzzle { + my $matrix_option = $_[ 0 ]; + my $matr = ""; + + if ( $matrix_option == 0 ) { # JTT + $matr = " +m +m"; + } + elsif ( $matrix_option == 2 ) { # BLOSUM 62 + $matr = " +m +m +m +m"; + } + elsif ( $matrix_option == 3 ) { # mtREV24 + $matr = " +m +m +m"; + } + elsif ( $matrix_option == 5 ) { # VT + $matr = " +m +m +m +m +m"; + } + elsif ( $matrix_option == 6 ) { # WAG + $matr = " +m +m +m +m +m +m"; + } + elsif ( $matrix_option == 7 ) { # auto + $matr = ""; + } + else { # PAM + $matr = " +m" + } + + return $matr; + +} ## setModelForPuzzle + +# One argument: +# Model of rate heterogeneity: +# 1 for "8 Gamma distributed rates" +# 2 for "Two rates (1 invariable + 1 variable)" +# 3 for "Mixed (1 invariable + 8 Gamma rates)" +# otherwise: Uniform rate +# Last modified: 09/08/03 +sub setRateHeterogeneityOptionForPuzzle { + my $rate_heterogeneity_option = $_[ 0 ]; + my $opt 
= ""; + + if ( $rate_heterogeneity_option == 1 ) { + $opt = " +w"; + } + elsif ( $rate_heterogeneity_option == 2 ) { + $opt = " +w +w"; + } + elsif ( $rate_heterogeneity_option == 3 ) { + $opt = " +w +w +w"; + } + else { + $opt = ""; + } + + return $opt; +} ## setRateHeterogeneityOptionForPuzzle + + +# One argument: +# Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise +# Last modified: 09/08/03 +sub setParameterEstimatesOptionForPuzzle { + my $parameter_estimates_option = $_[ 0 ]; + my $opt = ""; + + if ( $parameter_estimates_option == 1 ) { + $opt = " +e"; + } + else { + $opt = ""; + } + + return $opt; +} ## setParameterEstimatesOptionForPuzzle + + + +# three/four/five arguments: +# 1. Name of inputfile +# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; +# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise +# 3. Number of sequences in alignment +# 4. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise +# 5. Model of rate heterogeneity: +# 1 for "8 Gamma distributed rates" +# 2 for "Two rates (1 invariable + 1 variable)" +# 3 for "Mixed (1 invariable + 8 Gamma rates)" +# otherwise: Uniform rate +sub executePuzzleBootstrapped { + my $in = $_[ 0 ]; + my $matrix_option = $_[ 1 ]; + my $number_of_seqs = $_[ 2 ]; + my $parameter_estimates_option = $_[ 3 ]; + my $rate_heterogeneity_option = $_[ 4 ]; + + my $l = 0; + my $slen = 0; + my $counter = 0; + my $mat = ""; + my $est = ""; + my $rate = ""; + my $a = ""; + my @a = (); + + &testForTextFilePresence( $in ); + + open( GRP, "<$in" ) || die "\n\n$0: Unexpected error: Cannot open file <<$in>>: $!"; + while( ) { + if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) { + $counter++; + } + } + close( GRP ); + + $l = `cat $in | wc -l`; + $slen = $l / $counter; + + system( "split --suffix-length=4 -$slen $in $in.splt." 
) + && die "\n\n$0: executePuzzleDQObootstrapped: Could not execute \"split --suffix-length=4 -$slen $in $in.splt.\": $!"; + + @a = <$in.splt.*>; + + $mat = setModelForPuzzle( $matrix_option ); + if ( $parameter_estimates_option ) { + $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option ); + } + if ( $rate_heterogeneity_option ) { + $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option ); + } + + my $k=""; + if ( $number_of_seqs <= 257 ) { + $k = "k"; + } + + foreach $a ( @a ) { + print "-".$a."\n"; + system( "$PUZZLE $a << ! +$k +k +k$mat$est$rate +y +!" ) + && die "$0: Could not execute \"$PUZZLE $a\""; + + system( "cat $a.dist >> $in.dist" ) + && die "$0: Could not execute \"cat outdist >> $in.dist\""; + + unlink( $a, $a.".dist", $a.".puzzle" ); + } + + return; + +} ## executePuzzleBootstrapped + + + + + +# three/four/five arguments: +# 1. Name of inputfile +# 2. Matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; +# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise +# 3. Number of sequences in alignment +# 4. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise +# 5. Model of rate heterogeneity: +# 1 for "8 Gamma distributed rates" +# 2 for "Two rates (1 invariable + 1 variable)" +# 3 for "Mixed (1 invariable + 8 Gamma rates)" +# otherwise: Uniform rate +sub executePuzzle { + my $in = $_[ 0 ]; + my $matrix_option = $_[ 1 ]; + my $number_of_seqs = $_[ 2 ]; + my $parameter_estimates_option = $_[ 3 ]; + my $rate_heterogeneity_option = $_[ 4 ]; + my $mat = ""; + my $est = ""; + my $rate = ""; + + &testForTextFilePresence( $in ); + + $mat = &setModelForPuzzle( $matrix_option ); + if ( $parameter_estimates_option ) { + $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option ); + } + if ( $rate_heterogeneity_option ) { + $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option ); + } + + my $k=""; + if ( $number_of_seqs <= 257 ) { + $k = "k"; + } + + + system( "$PUZZLE $in << ! 
+$k +k +k$mat$est$rate +y +!" ) + && die "$0: Could not execute \"$PUZZLE\""; + + return; + +} ## executePuzzle + + + + +# Preparation of the pwd file +sub addDistsToQueryToPWDfile { + my $pwd_file = $_[ 0 ]; + my $disttoquery_file = $_[ 1 ]; + my $outfile = $_[ 2 ]; + my $name_of_query = $_[ 3 ]; + my $name_of_query_ = ""; + my $return_line_pwd = ""; + my $return_line_dq = ""; + my $num_of_sqs = 0; + my $block = 0; + my $name_from_pwd = "X"; + my $name_from_dq = "Y"; + my @dists_to_query = (); + my $i = 0; + + &testForTextFilePresence( $pwd_file ); + &testForTextFilePresence( $disttoquery_file ); + + $name_of_query_ = $name_of_query; + for ( my $j = 0; $j <= ( $LENGTH_OF_NAME - length( $name_of_query ) - 1 ); ++$j ) { + $name_of_query_ .= " "; + } + + open( OUT_AD, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); + open( IN_PWD, "$pwd_file" ) || &dieWithUnexpectedError( "Cannot open file \"$pwd_file\"" ); + open( IN_DQ, "$disttoquery_file" ) || &dieWithUnexpectedError( "Cannot open file \"$disttoquery_file\"" ); + + W: while ( $return_line_pwd = ) { + + + if ( $return_line_pwd =~ /^\s*(\d+)\s*$/ ) { + $num_of_sqs = $1; + $num_of_sqs++; + if ( $block > 0 ) { + print OUT_AD "$name_of_query_ "; + for ( my $j = 0; $j < $i; ++$j ) { + print OUT_AD "$dists_to_query[ $j ] "; + } + print OUT_AD "0.0\n"; + } + print OUT_AD " $num_of_sqs\n"; + $block++; + @dists_to_query = (); + $i = 0; + } + + if ( $block == 1 + && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) { + $name_from_pwd = $1; + + if ( !defined( $return_line_dq = ) ) { + &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); + } + + if ( $return_line_dq !~ /\S/ ) { + if ( !defined( $return_line_dq = ) ) { + &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); + } + } + $return_line_dq =~ /^\s*(\S+)\s+(\S+)/; + $name_from_dq = $1; + $dists_to_query[ $i++ ] = $2; + + + if ( $name_from_pwd ne $name_from_dq ) { + &dieWithUnexpectedError( "Order of sequence names in 
\"$pwd_file\" and \"$disttoquery_file\" is not the same" ); + } + print OUT_AD $return_line_pwd; + + } + elsif ( $block > 1 + && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) { + $name_from_pwd = $1; + if ( !defined( $return_line_dq = ) ) { + &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); + } + if ( $return_line_dq !~ /\S/ ) { + if ( !defined( $return_line_dq = ) ) { + &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" ); + } + } + $return_line_dq =~ /^\s*\S+\s+(\S+)/; + $dists_to_query[ $i++ ] = $1; + print OUT_AD $return_line_pwd; + } + } + print OUT_AD "$name_of_query_ "; + for ( my $j = 0; $j < $i; ++$j ) { + print OUT_AD "$dists_to_query[ $j ] "; + } + print OUT_AD "0.0\n"; + + close( OUT_AD ); + close( IN_PWD ); + close( IN_DQ ); + return $block; + +} ## addDistsToQueryToPWDfile + + + + +# Three arguments: +# 1. HMMER model db +# 2. name of HMM +# 3. outputfile name +# Last modified: 02/27/01 +sub executeHmmfetch { + + my $db = $_[ 0 ]; + my $name = $_[ 1 ]; + my $outfile = $_[ 2 ]; + + system( "$HMMFETCH $db $name > $outfile" ) + && &dieWithUnexpectedError( "Could not execute \"$HMMFETCH $db $name > $outfile\"" ); + return; + +} ## executeHmmfetch + + + +# Checks wether a file is present, not empty and a plain textfile. +# One argument: name of file. +# Last modified: 07/07/01 +sub testForTextFilePresence { + my $file = $_[ 0 ]; + unless ( ( -s $file ) && ( -f $file ) && ( -T $file ) ) { + dieWithUnexpectedError( "File \"$file\" does not exist, is empty, or is not a plain textfile" ); + } +} ## testForTextFilePresence + + +# Last modified: 02/21/03 +sub addSlashAtEndIfNotPresent { + my $filename = $_[ 0 ]; + $filename =~ s/\s+//g; + unless ( $filename =~ /\/$/ ) { + $filename = $filename."/"; + } + return $filename; +} ## addSlashAtEndIfNotPresent + + + +# Last modified: 02/15/02 +sub exitWithWarning { + + my $text = $_[ 0 ]; + if ( defined( $_[ 1 ] ) && $_[ 1 ] == 1 ) { + print( "

user error

\n" ); + print( "

\n" ); + print( "$text\n" ); + print( "

\n" ); + print( "

 

\n" ); + } + else { + print( "\n\n$text\n\n" ); + } + + exit( 0 ); + +} ## exit_with_warning + + + +# Last modified: 02/15/02 +sub dieWithUnexpectedError { + + my $text = $_[ 0 ]; + + die( "\n\n$0:\nUnexpected error (should not have happened):\n$text\n$!\n\n" ); + +} ## dieWithUnexpectedError + + + +1; diff --git a/forester/archive/perl/gs_aa_extract.pl b/forester/archive/perl/gs_aa_extract.pl new file mode 100755 index 0000000..d485920 --- /dev/null +++ b/forester/archive/perl/gs_aa_extract.pl @@ -0,0 +1,53 @@ +#!/usr/bin/perl -W + +# $Id: gs_aa_extract.pl,v 1.2 2008/03/09 00:11:50 cmzmasek Exp $ + +# This extracts the AA sequences from GENSCAN output files +# Copyright (C) 2008-2009 Christian M. Zmasek +# All rights reserved +# Created 2007-07-28 in Winterthur, Switzerland by CMZ + +# Usage: gs_aa_extract.pl + +use strict; + +if ( scalar( @ARGV ) != 2 ) { + print "\ngs_aa_extract.pl \n\n"; + exit( -1 ); +} + +my $infile = $ARGV[ 0 ]; +my $outfile = $ARGV[ 1 ]; + +if ( -e $outfile) { + die "\n$0: \"$outfile\" already exists.\n\n"; +} +unless( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { + die "\n$0: cannot read from \"$infile\".\n\n"; +} + +open( IN, "$infile" ) || die "\n$0: Cannot open file \"$infile\": $!\n"; +open( OUT, ">$outfile" ) || die "\n$0: Cannot create file \"$outfile\": $!\n"; + +my $line = ""; +my $desc = ""; + +while ( $line = ) { + if ( $line =~ /^>/ ) { + $desc = $line; + } + elsif ( $line =~ /^[A-Z]+$/ ) { + if ( length( $desc ) > 0 ) { + print OUT $desc; + $desc = ""; + } + print OUT $line; + } +} + +close( OUT ); + +print( "\nOK\n" ); + +exit( 0 ); + diff --git a/forester/archive/perl/makeTree.pl b/forester/archive/perl/makeTree.pl new file mode 100755 index 0000000..45c4658 --- /dev/null +++ b/forester/archive/perl/makeTree.pl @@ -0,0 +1,1211 @@ +#!/usr/bin/perl -W + +# makeTree.pl +# ----------- +# Copyright (C) 1999-2003 Washington University School of Medicine +# and Howard Hughes Medical Institute +# All rights reserved +# +# 
Author: Christian M. Zmasek +# zmasek@genetics.wustl.edu +# http://www.genetics.wustl.edu/eddy/people/zmasek/ +# +# Last modified 04/06/04 +# +# +# Requirements makeTree is part of the RIO/FORESTER suite of programs. +# ------------ Many of its global variables are set via rio_module.pm. +# +# +# Note. Use xt.pl (for Pfam alignments) or mt.pl (for other alignments) +# to run makeTree.pl on whole directories of alignments files. +# +# +# +# Usage +# ----- +# +# Tree calculation based on a Pfam/Clustal W alignment +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# makeTree.pl [-options] +# [path/name for temporary directory to be created] +# +# Example: +# "% makeTree.pl -UTB1000S41NDXV /DB/PFAM/Full/IL5 IL5_tree" +# +# +# Tree calculation based on precalculated pairwise distances +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Consensus tree will have no branch length values. +# Precalculated pairwise distances are the output of "pfam2pwd.pl", +# number of bootstraps needs to match the one used for the pwds. +# +# makeTree.pl <-options, includes "F"> [path/name for temporary directory +# to be created] +# +# Example: +# "% makeTree.pl -FB100S21XV /pfam2pwd_out/IL5.pwd IL5_tree" +# +# +# Tree calculation based on precalculated pairwise distances +# and matching alignment +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Consensus tree will have branch length values. +# Precalculated pairwise distances and the matching (processed) +# alignment are the output of "pfam2pwd.pl", number of bootstraps +# needs to match the one used for the pwds, matrix needs to match +# the one used for the pwds. +# +# makeTree.pl <-options, includes "UF"> +# [path/name for temporary directory to be created] +# +# Example: +# "% makeTree.pl -UFLB100S21XV /pfam2pwd_out/IL5.pwd /pfam2pwd_out/IL5.aln IL5_tree" +# +# +# Options +# ------- +# +# N : Suggestion to remove columns in the alignment which contain gaps. 
+# Gaps are not removed, if, after removal of gaps, the resulting alignment would +# be shorter than $MIN_NUMBER_OF_AA. Default is not to remove gaps. +# Bx : Number of bootstraps. B0: do not bootstrap. Default is 100 bootstraps. +# The number of bootstraps should be divisible by 10. +# U : Use TREE-PUZZLE to calculate ML branch lengths for consensus tree, in case of +# bootstrapped analysis. +# J : Use JTT matrix (Jones et al. 1992) in TREE-PUZZLE, default: PAM. +# L : Use BLOSUM 62 matrix (Henikoff-Henikoff 92) in TREE-PUZZLE, default: PAM. +# M : Use mtREV24 matrix (Adachi-Hasegawa 1996) in TREE-PUZZLE, default: PAM. +# W : Use WAG matrix (Whelan-Goldman 2000) in TREE-PUZZLE, default: PAM. +# T : Use VT matrix (Mueller-Vingron 2000) in TREE-PUZZLE, default: PAM. +# P : Let TREE-PUZZLE choose which matrix to use, default: PAM. +# E : Exact parameter estimates in TREE-PUZZLE, default: Approximate. +# Model of rate heterogeneity in TREE-PUZZLE (default: Uniform rate) +# g : 8 Gamma distributed rates +# t : Two rates (1 invariable + 1 variable) +# m : Mixed (1 invariable + 8 Gamma rates) +# R : Randomize input order in PHYLIP NEIGHBOR. +# A : Use PHYLIP PROTPARS instead of NEIGHBOR (and no pairwise distance calculation). +# jx : Number of jumbles when using PHYLIP PROTPARS (random seed set with Sx). +# Sx : Seed for random number generator(s). Must be 4n+1. Default is 9. +# X : To keep multiple tree file (=trees from bootstrap resampled alignments). +# D : To keep (and create in case of bootstrap analysis) pairwise distance matrix file. +# This is created from the not resampled (original) alignment. +# C : Calculate pairwise distances only (no tree). Bootstrap is always 1. +# No other files are generated. +# F : Pairwise distance (pwd) file as input (instead of alignment). +# No -D, -C, and -N options available in this case. +# V : Verbose. +# # : Only for rio.pl: Do not calculate consensus tree ("I" option in rio.pl). 
+# +# +# +# History: +# ------- +# +# 09/06/03: Added "#" option (to be used only for rio.pl). +# 03/24/04: Do not replace "?" with "-" in method pfam2phylip. + + +use strict; + +use FindBin; +use lib $FindBin::Bin; +use rio_module2; + +my $VERSION = "4.210"; + +my $TEMP_DIR_DEFAULT = "/tmp/maketree"; # Where all the infiles, outfiles, etc will be created. + +my $remove_gaps = 0; # 0: do not remove gaps; 1: remove gaps +my $bootstraps = 100; # 0,1: do not bootstrap. Default: 100 +my $puzzle_consensus_tree = 0; # 0: no; 1: yes. No is default. +my $matrix = 1; # 0 = JTT + # 1 = PAM - default + # 2 = BLOSUM 62 + # 3 = mtREV24 + # 5 = VT + # 6 = WAG + # 7 = auto +my $rate_heterogeneity = 0; # 0 = Uniform rate (default) + # 1 = 8 Gamma distributed rates + # 2 = Two rates (1 invariable + 1 variable) + # 3 = Mixed (1 invariable + 8 Gamma rates) +my $randomize_input_order = 0; # 0: do not randomize input order; 1 jumble +my $seed = 9; # Seed for random number generators. Default: 9 +my $keep_multiple_trees = 0; # 0: delete multiple tree file + # 1: do not delete multiple tree file +my $keep_distance_matrix = 0; # 1: (create and) keep; 0: do not (create and) keep +my $verbose = 0; # 0: no; 1: yes +my $pairwise_dist_only = 0; # 0: no; 1: yes +my $start_with_pwd = 0; # 0: no; 1: yes +my $start_with_pwd_and_aln = 0; # 0: no; 1: yes +my $no_consenus_tree = 0; # 0: no; 1: yes +my $exact_parameter_est = 0; # 0: no; 1: yes +my $use_protpars = 0; # 0: no; 1: yes +my $protpars_jumbles = 0; + +my %seqnames = (); # number => seqname +my %numbers = (); # seqname => number +my $options = ""; +my $infile = ""; +my $pwdfile = ""; +my $outfile = ""; +my $outfilenhx = ""; +my $logfile = ""; +my $alignfile = ""; +my $multitreefile = ""; +my $distancefile = ""; +my $log = ""; +my $number_of_aa = 0; +my $orig_length = 0; +my $ii = 0; +my $temp_dir = ""; +my $current_dir = ""; +my @out = (); +my $number_of_seqs = 0; + + + +unless ( @ARGV == 2 || @ARGV == 3 || @ARGV == 4 || @ARGV == 5 ) { + 
&printUsage(); + exit ( -1 ); +} + + + +# Analyzes the options: +# --------------------- + +if ( $ARGV[ 0 ] =~ /^-.+/ ) { + + unless ( @ARGV > 2 ) { + &printUsage(); + exit ( -1 ); + } + $options = $ARGV[ 0 ]; + + if ( $options =~ /F/ && $options !~ /U/ ) { + if ( @ARGV != 3 && @ARGV != 4 ) { + &printUsage(); + exit ( -1 ); + + } + $start_with_pwd = 1; + $infile = ""; + $pwdfile = $ARGV[ 1 ]; + + $outfile = $ARGV[ 2 ]; + if ( @ARGV == 4 ) { + $temp_dir = $ARGV[ 3 ]; + } + + } + elsif ( $options =~ /F/ && $options =~ /U/ ) { + if ( @ARGV != 4 && @ARGV != 5 ) { + &printUsage(); + exit ( -1 ); + } + $start_with_pwd = 1; + $start_with_pwd_and_aln = 1; + $pwdfile = $ARGV[ 1 ]; + $infile = $ARGV[ 2 ]; + $outfile = $ARGV[ 3 ]; + if ( @ARGV == 5 ) { + $temp_dir = $ARGV[ 4 ]; + } + + } + else { + if ( @ARGV != 3 && @ARGV != 4 ) { + &printUsage(); + exit ( -1 ); + } + $infile = $ARGV[ 1 ]; + $outfile = $ARGV[ 2 ]; + if ( @ARGV == 4 ) { + $temp_dir = $ARGV[ 3 ]; + } + } + + if ( $options =~ /N/ && $start_with_pwd != 1 ) { + $remove_gaps = 1; # do remove gaps + } + if ( $options =~ /B(\d+)/ ) { + $bootstraps = $1; + if ( $bootstraps <= 1 ) { + $bootstraps = 0; + } + elsif ( $bootstraps <= 9 ) { + $bootstraps = 0; + print "\n\nMAKETREE: WARNING: Bootstrap number must be devisable by 10,\nno bootstrapping.\n\n"; + } + elsif ( $bootstraps % 10 != 0 ) { + $bootstraps = $bootstraps - $bootstraps % 10; # to ensure $bootstraps % 10 == 0 + print "\n\nMAKETREE: WARNING: Bootstrap number must be devisable by 10,\nhas been set to $bootstraps.\n\n"; + } + } + if ( $options =~ /A/ ) { + $use_protpars = 1 # PROTPARS + } + if ( $options =~ /j(\d+)/ ) { + $protpars_jumbles = $1; + if ( $protpars_jumbles < 0 ) { + $protpars_jumbles = 0; + } + } + if ( $options =~ /J/ ) { + $matrix = 0; # JTT + } + if ( $options =~ /L/ ) { + $matrix = 2; # Blossum + } + if ( $options =~ /M/ ) { + $matrix = 3; # mtREV24 + } + if ( $options =~ /T/ ) { + $matrix = 5; # VT + } + if ( $options =~ /W/ ) { + $matrix = 
6; # WAG + } + if ( $options =~ /P/ ) { + $matrix = 7; # auto + } + if ( $options =~ /R/ ) { + $randomize_input_order = 1; + } + if ( $options =~ /S(\d+)/ ) { + $seed = $1; + } + if ( $options =~ /U/ ) { + $puzzle_consensus_tree = 1; + } + if ( $options =~ /X/ ) { + $keep_multiple_trees = 1; + } + if ( $options =~ /D/ && $start_with_pwd != 1 ) { + $keep_distance_matrix = 1; + } + if ( $options =~ /V/ ) { + $verbose = 1; + } + if ( $options =~ /C/ && $start_with_pwd != 1 ) { + $pairwise_dist_only = 1; + } + if ( $options =~ /E/ ) { + $exact_parameter_est = 1; + } + if ( $options =~ /g/ ) { + $rate_heterogeneity = 1; + } + if ( $options =~ /t/ ) { + $rate_heterogeneity = 2; + } + if ( $options =~ /m/ ) { + $rate_heterogeneity = 3; + } + if ( $options =~ /#/ ) { + $no_consenus_tree = 1; + } + if ( $protpars_jumbles > 0 && $use_protpars != 1 ) { + &printUsage(); + exit ( -1 ); + } + if ( $use_protpars == 1 ) { + if ( $randomize_input_order >= 1 + || $start_with_pwd == 1 + || $keep_distance_matrix == 1 + || $pairwise_dist_only == 1 ) { + &printUsage(); + exit ( -1 ); + } + if ( $bootstraps > 1 && $protpars_jumbles < 1 ) { + $protpars_jumbles = 1; + } + } + +} + +else { + unless ( @ARGV == 2 || @ARGV == 3 ) { + &printUsage(); + exit ( -1 ); + } + $infile = $ARGV[ 0 ]; + $outfile = $ARGV[ 1 ]; + if ( @ARGV == 3 ) { + $temp_dir = $ARGV[ 2 ]; + } +} + + + + +$current_dir = `pwd`; +$current_dir =~ s/\s//; + +if ( $outfile !~ /^\// ) { + # outfile is not absolute path. + $outfile = $current_dir."/".$outfile; +} + + + +if ( $pairwise_dist_only == 1 ) { + $bootstraps = 0; + $keep_multiple_trees = 0; + $puzzle_consensus_tree = 0; + $randomize_input_order = 0; + $start_with_pwd = 0; + $keep_distance_matrix = 1; +} + +if ( $bootstraps < 2 ) { + $no_consenus_tree = 0; +} + +# TREE-PUZZLE sets the option in this way: +# If two rates or mixed, exact parameter estimates are used. 
+if ( $rate_heterogeneity == 2 +|| $rate_heterogeneity == 3 ) { + $exact_parameter_est = 1 +} + +$logfile = $outfile.$LOG_FILE_SUFFIX; +$alignfile = $outfile.$ALIGN_FILE_SUFFIX; +$multitreefile = $outfile.$MULTIPLE_TREES_FILE_SUFFIX; +$distancefile = $outfile.$SUFFIX_PWD_NOT_BOOTS; + +if ( $outfile =~ /\.nhx$/i ) { + $outfilenhx = $outfile; + $logfile =~ s/\.nhx//i; + $alignfile =~ s/\.nhx//i; + $outfile =~ s/\.nhx//i; + $multitreefile =~ s/\.nhx//i; + $distancefile =~ s/\.nhx//i; +} +else { + $outfilenhx = $outfile.".nhx"; +} + +if ( -e $outfilenhx ) { + die "\n\nmakeTree: \"$outfilenhx\" already exists.\n\n"; +} +if ( $infile ne "" ) { + unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { + die "\n\nmakeTree: Input alignment file \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n"; + } +} +if ( $start_with_pwd == 1 ) { + unless ( ( -s $pwdfile ) && ( -f $pwdfile ) && ( -T $pwdfile ) ) { + die "\n\nmakeTree: Pairwise distance file \"$pwdfile\" does not exist, is empty, or is not a plain textfile.\n\n"; + } +} + + + +# Prints out the options: +# ----------------------- + + +$log = "\n$0 logfile:\n"; +$log = $log."Version: $VERSION\n\n"; + + +if ( $start_with_pwd == 1 ) { + $log = $log."Input pairwise distance file (bootstrapped): $pwdfile\n"; +} +if ( $infile ne "" ) { + $log = $log."Input alignment : $infile\n"; +} + +if ( $no_consenus_tree != 1 ) { + $log = $log."Output tree file : $outfilenhx\n"; +} + +if ( $keep_multiple_trees == 1 && $bootstraps >= 2 ) { + $log = $log."Output multiple trees file : $multitreefile\n"; +} +if ( $keep_distance_matrix ) { + $log = $log."Output pairwise distance file : $distancefile\n"; +} + +$log = $log."Bootstraps : $bootstraps\n"; + +if ( $start_with_pwd != 1 && $use_protpars != 1 ) { + $log = $log."Prgrm to calculate pairwise dist. 
: TREE-PUZZLE\n"; +} + +if ( $use_protpars == 1 ) { + $log = $log."Program to calculate tree : PHYLIP PROTPARS\n"; + $log = $log."Number of jumbles in PROTPARS : $protpars_jumbles\n"; +} +else { + $log = $log."Program to calculate tree : PHYLIP NEIGHBOR (NJ)\n"; +} + +if ( $puzzle_consensus_tree == 1 ) { + $log = $log."Prgrm to calculate ML branch lenghts: TREE-PUZZLE\n"; +} +if ( $puzzle_consensus_tree == 1 || $start_with_pwd != 1 ) { + $log = $log."Model : "; + if ( $matrix == 0 ) { + $log = $log."JTT (Jones et al. 1992)\n"; + } + elsif ( $matrix == 2 ) { + $log = $log."BLOSUM 62 (Henikoff-Henikoff 92)\n"; + } + elsif ( $matrix == 3 ) { + $log = $log."mtREV24 (Adachi-Hasegawa 1996)\n"; + } + elsif ( $matrix == 5 ) { + $log = $log."VT (Mueller-Vingron 2000)\n"; + } + elsif ( $matrix == 6 ) { + $log = $log."WAG (Whelan-Goldman 2000)\n"; + } + elsif ( $matrix == 7 ) { + $log = $log."auto\n"; + } + else { + $log = $log."PAM (Dayhoff et al. 1978)\n"; + } +} +$log = $log."Model of rate heterogeneity : "; +if ( $rate_heterogeneity == 1 ) { + $log = $log."8 Gamma distributed rates\n"; +} +elsif ( $rate_heterogeneity == 2 ) { + $log = $log."Two rates (1 invariable + 1 variable)\n"; +} +elsif ( $rate_heterogeneity == 3 ) { + $log = $log."Mixed (1 invariable + 8 Gamma rates)\n"; +} +else { + $log = $log."Uniform rate\n"; +} +if ( $randomize_input_order >= 1 ) { + $log = $log."Randomize input order in NEIGHBOR : yes\n"; +} +$log = $log."Seed for random number generators : $seed\n"; +if ( $exact_parameter_est == 1 ) { + $log = $log."Exact parameter estimates in TREE-PUZZLE\n"; +} + +$log = $log."Start time/date : ".`date`; + + + + +# That's where the mischief starts.... 
+# ------------------------------------ + +$ii = 0; + +my $time_st = time; + +if ( $temp_dir eq "" ) { + $temp_dir = $TEMP_DIR_DEFAULT; +} + +$temp_dir = $temp_dir.$time_st.$ii; + +while ( -e $temp_dir ) { + $ii++; + $temp_dir = $temp_dir.$time_st.$ii; +} + +mkdir( $temp_dir, 0700 ) +|| die "\n\n$0: Unexpected error: Could not create <<$temp_dir>>: $!\n\n"; + +unless ( ( -e $temp_dir ) && ( -d $temp_dir ) ) { + die "\n\n$0: Unexpected error: <<$temp_dir>> does not exist, or is not a directory.\n\n"; +} + + +if ( $start_with_pwd != 1 ) { + system( "cp", $infile, $temp_dir."/INFILE" ); + unless ( chmod ( 0600, $temp_dir."/INFILE" ) ) { + warn "\n\n$0: Could not chmod. $!\n\n"; + } + $infile = "INFILE"; +} + + +chdir ( $temp_dir ) +|| die "\n\n$0: Unexpected error: Could not chdir to <<$temp_dir>>: $!\n\n"; + + +if ( $start_with_pwd != 1 ) { + + @out = &DoPfam2phylip( $infile, $alignfile, $remove_gaps ); + $number_of_aa = $out[ 0 ]; + $orig_length = $out[ 1 ]; + $number_of_seqs = $out[ 2 ]; + + system( "cp", $alignfile, "infile" ); + + if ( $use_protpars != 1 ) { + # Calculating the pairwise distances (saved in file "infile"): "puzzle" + + system( "cp", $alignfile, "align" ); + + if ( $bootstraps > 1 ) { + + &executeSeqboot( $seed, $bootstraps ); + + if ( $keep_distance_matrix ) { + system( "mv", "outfile", "outfile___" ); + system( "cp", "align", "infile" ); + &executePuzzle( "infile", + $matrix, + $exact_parameter_est, + $rate_heterogeneity ); + system( "mv", "infile.dist", $distancefile ); + system( "mv", "outfile___", "outfile" ); + } + unlink( "infile" ); # Necessary, since "infile" is puzzle's default input. 
+ system( "mv", "outfile", "IN" ); + + &executePuzzleBootstrapped( "IN", + $matrix, + $exact_parameter_est, + $rate_heterogeneity ); + + $pwdfile = "IN.dist"; + + } + else { + + &executePuzzle( "infile", + $matrix, + $exact_parameter_est, + $rate_heterogeneity ); + + if ( $keep_distance_matrix ) { + system( "cp outdist $distancefile" ); + } + $pwdfile = "infile.dist"; + } + + unlink( "infile.tree" ); + + if ( $pairwise_dist_only == 1 ) { + unlink( "infile", "align", "INFILE", "outdist", $alignfile ); + chdir( $current_dir ) + || die "\n\n$0: Unexpected error: Could not chdir to <<$current_dir>>: $!\n\n"; + + rmdir( $temp_dir ) + || die "\n\n$0: Unexpected error: Could not remove <<$temp_dir>>: $!\n\n"; + + print "\n\n$0 finished.\n\n"; + print "Output pairwise distance file written as: $distancefile\n\n"; + print "\n\nmakeTree successfully terminated.\n\n"; + exit( 0 ); + } + + } ## if ( $use_protpars != 1 ) + +} ## if ( $start_with_pwd != 1 ) + + +# Calculating the tree (saved in file "infile"): + +if ( $use_protpars != 1 ) { + unlink( "infile" ); + &executeNeighbor( $pwdfile, $bootstraps, $randomize_input_order, $seed, 1 ); +} +else { + if ( $bootstraps > 1 ) { + &executeSeqboot( $seed, $bootstraps ); + unlink( "infile" ); + system( "mv", "outfile", "infile" ); + } + &executeProtpars( "infile", $bootstraps, $protpars_jumbles, $seed ); +} + +unlink( "outfile" ); + +if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { + + system( "cp", "outtree", $multitreefile ); +} + + +system( "mv", "outtree", "intree" ); + +if ( $bootstraps > 1 ) { + if ( $no_consenus_tree != 1 ) { + + # Consense: + &executeConsense( "intree" ); + + if ( $puzzle_consensus_tree == 1 ) { + + system( "cp", "outtree", "treefile_consense" ); + system( "mv", "outtree", "intree" ); + + # Puzzle for ML branch lenghts: + # The alignment is read from infile by default. + # The tree is read from intree by default. 
+ + if ( $start_with_pwd_and_aln == 1 ) { + &pfam2phylipMatchOnly( $infile, + "infile", + 1 ); + } + elsif ( $use_protpars != 1 ) { + system( "mv", "align", "infile" ); # align = original alignment in phylip interleaved. + } + + &executePuzzleToCalculateBranchLenghts( $matrix, + $exact_parameter_est, + $rate_heterogeneity ); + + unlink( "outfile", "outdist" ); + system( "mv", "outtree", "outree_puzzle" ); + + # Transfer + &executeTransfersBranchLenghts( "outree_puzzle", "treefile_consense", $outfilenhx ); + + } + else { + unlink( "outfile", "align" ); + system( "mv", "outtree", $outfilenhx ); + } + } + else { + unlink( "outfile", "align" ); + + } +} +else { + unlink( "align", "infile.dist" ); + if ( $start_with_pwd != 1 ) { + system( "mv intree $outfilenhx" ); + } + +} + + +unlink( "treefile_consense", "outtree", "outree_puzzle", + "infile", "intree", "align", "INFILE", "IN", "IN.dist", "outdist" ); + + +$log = $log."Finish time/date : ".`date`; + +if ( $start_with_pwd != 1 ) { + $log = $log."Removed gaps : "; + if ( $remove_gaps == 1 ) { + $log = $log."yes\n"; + } + else { + $log = $log."no\n"; + } + $log = $log."Columns in alignment used : $number_of_aa\n"; + $log = $log."Columns in original alignment : $orig_length\n"; + $log = $log."Number of sequences in alignment : $number_of_seqs\n"; +} + + +open( OUT, ">$logfile" ) || die "\n$0: Cannot create file <<$logfile>>: $!\n"; +print OUT $log; +close( OUT ); + + +chdir( $current_dir ) +|| die "\n\n$0:Unexpected error: Could not chdir to <<$current_dir>>: $!\n\n"; + + +rmdir( $temp_dir ) +|| die "\n\n$0:Unexpected error: Could not remove <<$temp_dir>>: $!\n\n"; + +if ( $verbose == 1 ) { + print "\n\n$0 finished.\n"; + if ( $no_consenus_tree != 1 ) { + print "Output tree written as : $outfilenhx\n"; + } + print "Log written as : $logfile\n"; + if ( $start_with_pwd != 1 ) { + print "Alignment written as : $alignfile\n"; + } + if ( $keep_multiple_trees == 1 && $bootstraps >= 2 ) { + print "Multiple trees written as : 
$multitreefile\n"; + } + if ( $keep_distance_matrix ) { + print "Distance matrix written as: $distancefile\n"; + } +} + + +exit( 0 ); + + + + + +# Methods: +# -------- + + + + +# Executes pfam2phylip. +# If resulting alignment is too short due to the removal +# of gaps, is does not remove gaps. +# Three arguments: +# 1. infile +# 2. outfile +# 3. remove gaps: 1 to remove gaps; 0: do not remove gaps +# Last modified: 06/04/01 +sub DoPfam2phylip { + my $in = $_[ 0 ]; + my $out = $_[ 1 ]; + my $option = $_[ 2 ]; + my $aa = 0; + my @output = (); + + if ( $option == 1 ) { + @output = &pfam2phylip( $in, $out, 1 ); + $aa = $output[ 0 ]; + if ( $aa < 0 ) { + die "\n\n$0: DoPfam2phylip: Unexpected error.\n\n"; + } + if ( $aa < $MIN_NUMBER_OF_AA ) { + unlink( $out ); + $option = 0; + $remove_gaps = 0; + } + } + if ( $option == 0 ) { # Must be another "if" (no elsif of else)! + @output = &pfam2phylip( $in, $out, 2 ); + # 2 is to substitute non-letters with "-" in the sequence. + $aa = $output[ 0 ]; + if ( $aa <= 0 ) { + die "\n\n$0: DoPfam2phylip: Unexpected error.\n\n"; + } + } + return @output; +} + + + +# Two arguments: +# 1. seed for random number generator +# 2. number of bootstraps +# Reads in "infile" by default. +sub executeSeqboot { + + my $s = $_[ 0 ]; + my $bs = $_[ 1 ]; + my $verb = ""; + + &testForTextFilePresence( $infile ); + + if ( $verbose != 1 ) { + $verb = " +2"; + } + + + system( "$SEQBOOT << ! +r +$bs$verb +Y +$s +!" ) + && die "$0: Could not execute \"$SEQBOOT\""; + + return; + +} + + + + +# One/two/three argument(s): +# Reads in tree from "intree" by default. (Presence of "intree" automatically +# switches into "User defined trees" mode.) +# 1. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24; +# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise +# 2. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise +# 3. 
Model of rate heterogeneity: +# 1 for "8 Gamma distributed rates" +# 2 for "Two rates (1 invariable + 1 variable)" +# 3 for "Mixed (1 invariable + 8 Gamma rates)" +# otherwise: Uniform rate +# Last modified: 09/08/03 (added 2nd and 3rd parameter) +sub executePuzzleToCalculateBranchLenghts { + my $matrix_option = $_[ 0 ]; + my $parameter_estimates_option = $_[ 1 ]; + my $rate_heterogeneity_option = $_[ 2 ]; + my $i = 0; + my $mat = ""; + my $est = ""; + my $rate = ""; + + unless ( ( -s "infile" ) && ( -f "infile" ) && ( -T "infile" ) ) { + die "\n$0: executePuzzleToCalculateBranchLenghts: <> does not exist, is empty, or is not a plain textfile.\n"; + } + unless ( ( -s "intree" ) && ( -f "intree" ) && ( -T "intree" ) ) { + die "\n$0: executePuzzleToCalculateBranchLenghts: <> does not exist, is empty, or is not a plain textfile.\n"; + } + + $mat = setModelForPuzzle( $matrix_option ); + if ( $parameter_estimates_option ) { + $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option ); + } + if ( $rate_heterogeneity_option ) { + $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option ); + } + + system( "$PUZZLE << ! +$mat$est$rate +x +y +!" ) + && die "$0: Could not execute \"$PUZZLE\""; + + return; + +} + + + + + + + +# Three/four arguments: +# 1. Name of file containing tree with correct branch lengths +# 2. Name of file containing tree with correct bootstraps +# 3. Outputfilename +# 4. R to reroot both trees in the same manner (use for FITCH, +# since this changes to rooting. 
+sub executeTransfersBranchLenghts { + my $tree_with_bl = $_[ 0 ]; + my $tree_with_bs = $_[ 1 ]; + my $out = $_[ 2 ]; + my $reroot = $_[ 3 ]; + my $R = ""; + + if ( $reroot && $reroot eq "R" ) { + $R = "R"; + } + + &testForTextFilePresence( $tree_with_bl ); + &testForTextFilePresence( $tree_with_bs ); + + system( "$TRANSFERSBRANCHLENGHTS $tree_with_bl $tree_with_bs $out $R" ) + && die "$0: Could not execute \"$TRANSFERSBRANCHLENGHTS $tree_with_bl $tree_with_bs $out $R\""; + + + return; +} + + + +# Called by method DoPfam2phylip. +# This reads a multiple sequence alignment file in Pfam format, +# Phylip's sequential format, or ClustalW (".aln")output and saves them +# in Phylip's sequential or interleaved format. +# (Those two are the same in this case, since all the seqs will be +# one line in length (no returns)). +# It returns (1st) the number of aa (columns) in the resulting +# alignment and the (2nd) number of aa (columns) in the original +# alignment. +# +# Reads a file containing a sequence alignment in the following format +# (as used in Pfam): +# #comments <- empty lines and lines begining with # (not mandatory) +# name1 kal +# name2 kal +# <- at least one empty line between blocks +# name1 kale +# name2 k.le +# +# Saves it in the "sequential" format of phylip: +# number of OTUs length of aa seqs +# name1 kalkale +# name2 kalk-le +# +# Three arguments: +# 1. infile name +# 2. outfile name +# 3. 1 : Removes colums with a gap (non-letter character) +# 2 : Substitutes non-letter characters (except "?") in the sequence with "-". +# +# Last modified: 03/24/04 +# Changes: +# 03/24/04: Do not replace "?" 
with "-" +# +sub pfam2phylip { + + my $infile = $_[ 0 ]; + my $outfile = $_[ 1 ]; + my $options = $_[ 2 ]; # 1: remove gaps; 2: non-letters (except "?") -> "-" + my $return_line = ""; + my $x = 0; + my $y = 0; + my $x_offset = 0; + my $original_length = 0; + my @seq_name = (); + my @seq_array = (); + my $seq = ""; + my $max_x = 0; + my $max_y = 0; + my $m = 0; + my $n = 0; + my $i = 0; + my $move = 0; + my $saw_a_sequence_line = 0; + + if ( -e $outfile ) { + die "\n$0: pfam2phylip: <<$outfile>> already exists.\n"; + } + + &testForTextFilePresence( $infile ); + + open( INPP, "$infile" ) || die "\n$0: pfam2phylip: Cannot open file <<$infile>>: $!\n"; + + until ( $return_line !~ /^\s*\S+\s+\S+/ && $saw_a_sequence_line == 1 ) { + if ( $return_line =~ /^\s*\S+\s+\S+/ + && $return_line !~ /^\s*#/ + && $return_line !~ /^\s*\d+\s+\d+/ + && $return_line !~ /^\s*CLUSTAL/ ) { + $saw_a_sequence_line = 1; + $return_line =~ /^\s*(\S+)\s+(\S+)/; + $seq_name[ $y ] = $1; + $seq = $2; + $seq_name[ $y ] = substr( $seq_name[ $y ], 0, $LENGTH_OF_NAME ); + + for ( $x = 0; $x <= length( $seq ) - 1; $x++ ) { + $seq_array[ $x ][ $y ] = substr( $seq, $x, 1 ); + } + if ( $x_offset < length( $seq ) ) { + $x_offset = length( $seq ); + } + $y++; + } + $return_line = ; + if ( !$return_line ) { + last; + } + } + + $max_y = $y; + $y = 0; + $max_x = 0; + + while ( $return_line = ) { + if ( $return_line =~ /^\s*(\S+)\s+(\S+)/ + && $return_line !~ /^\s*#/ + && $return_line !~ /^\s*\d+\s+\d+/ ) { + $return_line =~ /^\s*\S+\s+(\S+)/; + $seq = $1; + for ( $x = 0; $x <= length( $seq ) - 1; $x++ ) { + $seq_array[ $x + $x_offset ][ $y % $max_y ] = substr( $seq, $x, 1 ); + } + if ( $max_x < length( $seq ) ) { + $max_x = length( $seq ); + } + $y++; + if ( ( $y % $max_y ) == 0 ) { + $y = 0; + $x_offset = $x_offset + $max_x; + $max_x = 0; + } + } + } + $original_length = $x_offset; + close( INPP ); + + + # Removes "gap-columns" (gaps = everything except a-z characters): + if ( $options == 1 ) { + $move = 0; + 
+ COLUMN: for ( $x = 0; $x <= $x_offset - 1; $x++ ) { # goes through all aa positions (columns) + + for ( $y = 0; $y <= $max_y - 1; $y++ ) { # goes through all aas in a particular position + + unless ( $seq_array[ $x ][ $y ] && $seq_array[ $x ][ $y ] =~ /[a-z]/i ) { + $move++; + next COLUMN; + } + } + + # If this point is reached, column must be OK = no gaps. + if ( $move >= 1 ) { + for ( $m = 0; $m <= $max_y; $m++ ) { + for ( $n = $x; $n <= $x_offset; $n++ ) { + $seq_array[ $n - $move ][ $m ] = $seq_array[ $n ][ $m ]; + } + } + $x_offset = $x_offset - $move; + $x = $x - $move; + $move = 0; + } + } + if ( $move >= 1 ) { + for ( $m = 0; $m <= $max_y; $m++ ) { + for ( $n = $x; $n <= $x_offset; $n++ ) { + $seq_array[ $n - $move ][ $m ] = $seq_array[ $n ][ $m ]; + } + } + $x_offset = $x_offset - $move; + $x = $x - $move; + $move = 0; + } + } + + + # Writes the file: + + open( OUTPP, ">$outfile" ) || die "\n$0: pfam2phylip: Cannot create file <<$outfile>>: $!\n"; + print OUTPP "$max_y $x_offset\n"; + for ( $y = 0; $y < $max_y; $y++ ) { + print OUTPP "$seq_name[ $y ]"; + for ( $i = 0; $i <= ( $LENGTH_OF_NAME - length( $seq_name[ $y ] ) - 1 ); $i++ ) { + print OUTPP " "; + } + for ( $x = 0; $x <= $x_offset - 1; $x++ ) { + if ( $options == 2 ) { + if ( $seq_array[ $x ][ $y ] ) { + $seq_array[ $x ][ $y ] =~s /[^a-zA-Z\?]/-/; + } + else { + $seq_array[ $x ][ $y ] = "-"; + } + } + print OUTPP "$seq_array[ $x ][ $y ]"; + } + print OUTPP "\n"; + } + close( OUTPP ); + + return $x_offset, $original_length, $max_y; + +} ## pfam2phylip + + + + +sub printUsage { + + print "\n"; + print " makeTree.pl version $VERSION\n"; + print " -----------\n"; + + print < + [path/name for temporary directory to be created] + + Example: + "% makeTree.pl -UTB1000S41NDXV /DB/PFAM/Full/IL5 IL5_tree" + + + Tree calculation based on precalculated pairwise distances + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Consensus tree will have no branch length values. 
+ Precalculated pairwise distances are the output of "pfam2pwd.pl", + number of bootstraps needs to match the one used for the pwds. + + makeTree.pl <-options, includes "F"> [path/name for temporary directory + to be created] + + Example: + "% makeTree.pl -FB100S21XV /pfam2pwd_out/IL5.pwd IL5_tree" + + + Tree calculation based on precalculated pairwise distances + and matching alignment + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Consensus tree will have branch length values. + Precalculated pairwise distances and the matching (processed) + alignment are the output of "pfam2pwd.pl", number of bootstraps + needs to match the one used for the pwds, matrix needs to match + the one used for the pwds. + + makeTree.pl <-options, includes "UF"> + [path/name for temporary directory to be created] + + Example: + "% makeTree.pl -UFLB100S21XV /pfam2pwd_out/IL5.pwd /pfam2pwd_out/IL5.aln IL5_tree" + + + Options + ------- + + N : Suggestion to remove columns in the alignment which contain gaps. + Gaps are not removed, if, after removal of gaps, the resulting alignment would + be shorter than $MIN_NUMBER_OF_AA. Default is not to remove gaps. + Bx : Number of bootstrapps. B0: do not bootstrap. Default is 100 bootstrapps. + The number of bootstrapps should be divisible by 10. + U : Use TREE-PUZZLE to calculate ML branchlengths for consesus tree, in case of + bootstrapped analysis. + J : Use JTT matrix (Jones et al. 1992) in TREE-PUZZLE, default: PAM. + L : Use BLOSUM 62 matrix (Henikoff-Henikoff 92) in TREE-PUZZLE, default: PAM. + M : Use mtREV24 matrix (Adachi-Hasegawa 1996) inTREE-PUZZLE, default: PAM. + W : Use WAG matrix (Whelan-Goldman 2000) in TREE-PUZZLE, default: PAM. + T : Use VT matrix (Mueller-Vingron 2000) in TREE-PUZZLE, default: PAM. + P : Let TREE-PUZZLE choose which matrix to use, default: PAM. + E : Exact parameter estimates in TREE-PUZZLE, default: Approximate. 
+ Model of rate heterogeneity in TREE-PUZZLE (default: Uniform rate) + g : 8 Gamma distributed rates + t : Two rates (1 invariable + 1 variable) + m : Mixed (1 invariable + 8 Gamma rates) + R : Randomize input order in PHYLIP NEIGHBOR. + A : Use PHYLIP PROTPARS instead of NEIGHBOR (and no pairwise distance calculation). + jx : Number of jumbles when using PHYLIP PROTPARS (random seed set with Sx). + Sx : Seed for random number generator(s). Must be 4n+1. Default is 9. + X : To keep multiple tree file (=trees from bootstrap resampled alignments). + D : To keep (and create in case of bootstrap analysis) pairwise distance matrix file. + This is created form the not resampled (original) alignment. + C : Calculate pairwise distances only (no tree). Bootstrap is always 1. + No other files are generated. + F : Pairwise distance (pwd) file as input (instead of alignment). + No -D, -C, and -N options available in this case. + V : Verbose. + # : Only for rio.pl: Do not calculate consensus tree ("I" option in rio.pl). + + +END + +} ## printUsage diff --git a/forester/archive/perl/mt.pl b/forester/archive/perl/mt.pl new file mode 100755 index 0000000..dd04ca1 --- /dev/null +++ b/forester/archive/perl/mt.pl @@ -0,0 +1,261 @@ +#!/usr/bin/perl -W + +# mt.pl +# ----- +# +# Copyright (C) 2003 Christian M. Zmasek +# All rights reserved +# +# Author: Christian M. Zmasek +# zmasek@genetics.wustl.edu +# http://www.genetics.wustl.edu/eddy/people/zmasek/ +# +# Version: 1.000 +# Created on: 09/05/03 +# Last modified: 09/05/03 +# +# +# +# Calculates trees based on all alignments/files in a given directory using +# makeTree.pl. 
+# +# + +use strict; +use FindBin; +use lib $FindBin::Bin; +use rio_module2; + + +my $PREPROCESSING_COMMAND = ""; +my $PERFORM_PREPROCESSING = 0; + +my $POSTPROCESSING_COMMAND = "/nfs/dm3/homedir1/czmasek/RIO1.24/perl/extractSpecies.pl"; +my $PERFORM_POSTPROCESSING = 1; + + +my $MY_TEMP_DIR = $TEMP_DIR_DEFAULT; # $TEMP_DIR_DEFAULT is inherited + # from rio_module.pm + + + + +my $options = ""; # Options for makeTree.pl, see makeTree.pl. + + +my $suffix = ""; +my $use_suffixes = 0; +my $input_dir = ""; +my $output_dir = ""; + +my $i = 0; +my $filename = ""; +my @filenames = (); + + + + + + +# Analyzes the options: +# --------------------- + +unless ( @ARGV == 3 || @ARGV == 4 ) { + &printUsage(); +} + +$options = $ARGV[ 0 ]; +$input_dir = $ARGV[ 1 ]; +$output_dir = $ARGV[ 2 ]; + +if ( @ARGV == 3 ) { + $use_suffixes = 0; +} +elsif ( @ARGV == 4 ) { + $use_suffixes = 1; + $suffix = $ARGV[ 3 ]; +} + + +$input_dir = &addSlashAtEndIfNotPresent( $input_dir ); +$output_dir = &addSlashAtEndIfNotPresent( $output_dir ); +$MY_TEMP_DIR = &addSlashAtEndIfNotPresent( $MY_TEMP_DIR ); + + + + +# This adds a "-" before the options for makeTree: +# ------------------------------------------------ +unless ( $options =~ /^-/ ) { + $options = "-".$options; +} + + + + + +# This creates the temp file: +# -------------------------- + +my $time = time; +my $ii = 0; + +my $temp_file = $MY_TEMP_DIR."mt".$time.$ii; + +while ( -e $temp_file ) { + $ii++; + $temp_file = $MY_TEMP_DIR."mt".$time.$ii; +} + + + +opendir( DIR, $input_dir ) || error( "Cannot open directory \"$input_dir\": $!" ); + +$i = 0; + +while( defined( $filename = readdir( DIR ) ) ) { + if ( $filename =~ /^\.\.?$/ ) { + next; + } + if ( $use_suffixes == 1 && $filename !~ /$suffix$/ ) { + next; + } + + $filenames[ $i ] = $filename; + $i++; +} + +close( DIR ); + +$i = 0; + +FOREACH: foreach $filename ( @filenames ) { + + # If the corresponding tree seems to already exists, do next one. 
+ if ( -e "$output_dir$filename.nhx" ) { + next FOREACH; + } + + print "\n\n\n\n"; + print "MT.PL\n"; + print "working on: $filename\n"; + + print "[tree calculation $i]\n"; + print "=====================================================================\n\n\n"; + + + unlink( "$output_dir$filename.aln", + "$output_dir$filename.log", + "$output_dir$filename.nbd" ); + + print( "MT.PL: executing:\n" ); + + my $inputfile = $input_dir.$filename; + + my $outputfilename = ""; + + if ( $use_suffixes == 1 ) { + $outputfilename = $output_dir . $filename; + $outputfilename =~ s/$suffix$//; + $outputfilename =~ s/\.$//; + $outputfilename .= ".nhx"; + } + else { + $outputfilename = $output_dir . $filename . ".nhx"; + } + + + + if ( $PERFORM_PREPROCESSING == 1 ) { + my $pre_command = "$PREPROCESSING_COMMAND"; + + print( "$pre_command\n" ); + system( $pre_command ) && &error( "Could not execute \"$pre_command\"" ); + } + + $MAKETREE = "/nfs/dm3/homedir1/czmasek/RIO1.24/perl/makeTree2.pl"; # <<<<<<<<<<<<<<<<<<<<<<<-------------------~~~~~~~~~~~~~~~~~~~~~~~ + + my $command = "$MAKETREE $options $inputfile $outputfilename"; + + print( "$command\n" ); + system( $command ) && &error( "Could not execute \"$command\"" ); + + + + if ( $PERFORM_POSTPROCESSING == 1 ) { + my $post_command = "$POSTPROCESSING_COMMAND $outputfilename"; + + print( "$post_command\n" ); + system( $post_command ) && &error( "Could not execute \"$post_command\"" ); + } + + + + $i++; + +} + + + +print( "\n\n\nMT.PL: Done!\n" ); + +exit( 0 ); + + + + + + +sub error{ + + my $text = $_[ 0 ]; + + print( "\nxt.pl: ERROR:\n" ); + print( "$text\n\n" ); + + exit( -1 ); + +} + + + + +sub printUsage { + print "\n"; + print " mt.pl\n"; + print " _____\n"; + print " \n"; + print " Copyright (C) 2003 Christian M. Zmasek\n"; + print " All rights reserved\n"; + print "\n"; + print " Author: Christian M. 
Zmasek\n"; + print " zmasek\@genetics.wustl.edu\n"; + print " http://www.genetics.wustl.edu/eddy/forester/\n"; + print "\n"; + print "\n"; + print " Purpose\n"; + print " -------\n"; + print "\n"; + print " Tree construction using makeTree.pl on all alignments/files\n"; + print " in a given directory.\n"; + print "\n"; + print "\n"; + print " Usage\n"; + print " -----\n"; + print "\n"; + print " mt.pl [suffix for alignments to be used in input directory]\n"; + print "\n"; + print " If a suffix is given, it will be removed for the output files.\n"; + print "\n"; + print "\n"; + print " Example\n"; + print " -------\n"; + print "\n"; + print " \"mt.pl NS21UTRB100DX alignments/ trees/ .aln\"\n"; + print "\n"; + print "\n"; + print "\n"; + exit( -1 ); + +} diff --git a/forester/archive/perl/multifetch.pl b/forester/archive/perl/multifetch.pl new file mode 100755 index 0000000..82205df --- /dev/null +++ b/forester/archive/perl/multifetch.pl @@ -0,0 +1,71 @@ +#!/usr/bin/perl + +# multifetch.pl [options] +# +# Fetch all the seqs on the list. The list is a file with one line +# per sequence; the first field is the key. +# +# Options: +# -d : domain fetch - list is in GDF format +# -n : include this many extra residues upstream (-d only) +# -c : include this many extra residues downstream (-d only) +# -f : fetch in FASTA instead of native format +# -g : use getseq from , not fetch from main databases. +# This always gives FASTA output. 
+# -D : specify a source database, same usage as getseq: +# -Dsw SwissProt +# -Dpir PIR +# -Dem EMBL +# -Dgb GenBank +# -Dwp WormPep +# -Dowl OWL + + +use FindBin; +use lib $FindBin::Bin; +use rio_module; +require "getopts.pl"; + + +&Getopts('c:n:dfg:D:'); +if ($opt_c) { $extra_c = $opt_c; } +if ($opt_n) { $extra_n = $opt_n; } +if ($opt_d) { $domains = 1; } +if ($opt_f) { $fmtarg = "-Ffasta";} else {$fmtarg = ""; } +if ($opt_g) { $filearg = "-d$opt_g ";} else {$filearg = ""; } +if ($opt_D) { $dbarg = "-D$opt_D "; } else {$dbarg = ""; } + + +while (<>) { + if ($domains) { + if (($name, $from, $to, $source) = /^\s*(\S+)\s+(\d+)\s+(\d+)\s+(\S+)/){ + if ($from < $to) { + $from -= $opt_n; + $to += $opt_c; + } + else { + $from += $opt_n; + $to -= $opt_c; + } + + system("$SFE $filearg $dbarg $fmtarg -r \"$name\" -f $from -t $to \"$source\"") + && die "\n\n$0: Unexpected error: Could not execute \"$SFE $filearg $dbarg $fmtarg -r \"$name\" -f $from -t $to \"$source\"\": $!"; + } + } else { + if (/^\s*(\S+)/) { + $key = $1; + + system("$SFE $filearg $dbarg $fmtarg \"$key\"") + && die "\n\n$0: Unexpected error: Could not execute \"$SFE $filearg $dbarg $fmtarg \"$key\"\": $!"; + } + } +} + +# 01/30/02 +# CZ +# Added usage of rio_module.pm, $SFE for sfetch. + +# Thu Apr 10 18:27:40 1997 +# - added -D option +# - simplified from six different getseq calls to two + diff --git a/forester/archive/perl/nph-riowebserver b/forester/archive/perl/nph-riowebserver new file mode 100755 index 0000000..dee1123 --- /dev/null +++ b/forester/archive/perl/nph-riowebserver @@ -0,0 +1,939 @@ +#! /usr/bin/perl -W + +# nph-riowebserver.pl +# ------------------- +# +# Copyright (C) 2002 Washington University School of Medicine +# and Howard Hughes Medical Institute +# All rights reserved +# +# Created: 02/18/02 +# Author: Christian M. 
Zmasek +# zmasek@genetics.wustl.edu +# http://www.genetics.wustl.edu/eddy/people/zmasek/ +# +# Last modified: 02/20/02 + + +use strict; +use CGI; +use queue; + + +my $RIOPL = "/home/rio/forester/perl/rio4P.pl"; +my $JAVA = "/home/rio/j2sdk1.4.0/bin/java"; +my $TEST_NHX = $JAVA." -cp /home/rio/forester/java forester.tools.testNHX"; +my $TEMPDIR = "/home/rio/rws_temp"; +my $SPECIESTREE = "/home/rio/forester/data/species/tree_of_life_bin_1-4.nhx"; +my $SPECIESLIST = "/home/rio/forester/data/species/tree_of_life_bin_1-4_species_list"; +my $hmm_search_url_A = "http://pfam.wustl.edu/cgi-bin/nph-hmmsearch?protseq="; +my $hmm_search_url_B = "&search_mode=merge&cutoff_strategy=ga"; + +my $RIO_ALN_DIRECTORY = "/data/rio/ALNs/"; +my $RIO_NBD_DIRECTORY = "/data/rio/NBDs/"; +my $ALIGN_FILE_SUFFIX = ".aln"; +my $ALIGN_NBD_FILE = ".nbd"; +my $DIR_FOR_TREES = "/var/www/html/trees/"; # Directory for NHX files to be read by ATV applet +my $URL_FOR_TREES = "http://forester.wustl.edu/trees/"; # URL base for NHX files to be read by ATV applet +my $CODE_BASE_FOR_ATV_APPLET = "http://forester.wustl.edu/applets/"; # URL for ATV applet (jar file) +my $TARGET_FILES_IN_DIR_FOR_TREES = 100; # If the number of nhx files in $DIR_FOR_TREES is larger than $MAX_FILES_IN_DIR_FOR_TREES +my $MAX_FILES_IN_DIR_FOR_TREES = 120; # the oldest files will be deleted until the number is down to $TARGET_FILES_IN_DIR_FOR_TREES. 
+my $O_THRESHOLD_DEFAULT = 0; +my $SN_THRESHOLD_DEFAULT = 0; +my $U_THRESHOLD_DEFAULT = 50; +my $SEED_FOR_RANDOM_DEFAULT = 41; +my $SORT_DEFAULT = 12; +my $MIN_SIZE = 5; # Minimal size (in chars) for input files +my $MAX_SIZE = 10000; # Maximal size (in chars) for input files +my $MAX_LINES = 1000; # Maximal lines for input files +my $RIO_OPTIONS = "U=60 Y=2 X=2 Z=2 I C E x +"; +my $CONTACT = "zmasek\@genetics.wustl.edu"; +my $VERSION = "0.3"; + + +my $o_threshold = 0; +my $sn_threshold = 0; +my $u_threshold = 0; +my $seed_for_random = 0; +my $sort = 0; +my $size_d = 0; +my $size_c = 0; +my $entry_time = 0; +my $njobs = 0; +my $njobs_thisuser = 0; +my $user_defined_tree = 0; + + + +my $query = ""; +my $query_seq = ""; +my $query_seq_file = ""; +my $tree_file = ""; +my $pfam_domain = ""; +my $species = ""; +my $output_tree = ""; +my $output_up = ""; +my $remote_addr = ""; +my $oneline = ""; +my $aln = ""; +my $speciestree = ""; +my $output = ""; +my $query_sequence = ""; # To be submitted to hmmsearch website, if necessary. 
+my $link_to_hmmsearch = ""; + +my @lines = (); +my %Species_names_hash = (); + + +$| = 1; + +$query = new CGI; + + +$query_seq = $query->param( 'query_seq' ); +$query_seq_file = $query->upload( 'query_seq_file' ); +$pfam_domain = $query->param( 'pfam_domain' ); +$species = $query->param( 'species' ); +$o_threshold = $query->param( 'o_threshold' ); +$sn_threshold = $query->param( 'sn_threshold' ); +$u_threshold = $query->param( 'u_threshold' ); +$seed_for_random = $query->param( 'seed_for_random' ); +$output_up = $query->param( 'output_up' ); +$sort = $query->param( 'sort_priority' ); +$tree_file = $query->upload( 'tree_file' ); + +$remote_addr = $ENV{ REMOTE_ADDR }; + + +# NPH header +# ---------- +print $query->header( -status=>"200 OK", + -server=>"$ENV{ SERVER_SOFTWARE }", + -nph=>1 ); + + + + +# Prints the first HTML +# --------------------- +print "\n"; +print "\n"; +print "\n"; +print "[ RIO SERVER | phylogenomic analysis of a protein sequence ]\n"; +print "\n"; +print "\n"; + +&print_ATV_JavaScript(); + +print "\n"; +print "\n"; + +&print_navbar(); + + + +# Reads in, cleans up and checks +# ------------------------------ + +if ( ( !defined( $query_seq_file ) && !defined( $query_seq ) ) +|| ( $query_seq_file !~ /\w+/ && $query_seq !~ /\w+/ ) ) { + &nph_user_error( "need to specify a sequence file or submit a sequence directly" ); +} + +if ( $query_seq_file =~ /\w+/ && $query_seq =~ /\w+/ ) { + &nph_user_error( "cannot specify a sequence file and submit a sequence directly" ); +} + + +if ( $query_seq_file =~ /\w+/ ) { + # Reading in from file + &readInFile( $query_seq_file ); +} +else { + # "cut and paste" + @lines = split( /^/, $query_seq ); +} + + +if ( $lines[ 0 ] =~ /^\s*>/ ) { # FASTA + shift( @lines ); +} + + +foreach $oneline ( @lines ) { + $size_d += length( $oneline ); + if ( $size_d > $MAX_SIZE ) { + &nph_user_error( "query sequence is too long (>$MAX_SIZE)" ); + } + $oneline =~ s/[^A-Za-z]//g; + $size_c += length( $oneline ); +} +if ( $size_c < 
$MIN_SIZE ) { + &nph_user_error( "query sequence is too short (<$MIN_SIZE)" ); +} + + +# Writes a temp file for the query sequence +open( PROT, ">$TEMPDIR/$$.query" ) || &nph_fatal_error( "failed to open temp query file" ); +foreach $oneline ( @lines ) { + print PROT $oneline; + $query_sequence .= $oneline; +} +close( PROT ); + +if ( !defined( $species ) || $species !~ /\w+/ || length( $species ) < 2 ) { + &nph_user_error( "need to specify the species of the query sequence" ); +} + +$link_to_hmmsearch = " >> click here to perform hmmsearch on query sequence << "; + +if ( !defined( $pfam_domain ) || $pfam_domain !~ /\w+/ || length( $pfam_domain ) < 1 ) { + &nph_user_error( "need to specify a name for a pfam domain of the query sequence
$link_to_hmmsearch" ); +} + +if ( length( $species ) > 5 ) { + &nph_user_error( "invalid species name" ); +} +$species =~ s/[^A-Za-z0-9]//g; +if ( length( $species ) < 2 ) { + &nph_user_error( "invalid species name" ); +} + +if ( length( $pfam_domain ) > 40 ) { + &nph_user_error( "invalid pfam domain name
$link_to_hmmsearch" ); +} +$pfam_domain =~ s/[\s,;\.><\|\\\/\(\)!@\#\$%&\*\^=]//g; +if ( length( $pfam_domain ) < 1 ) { + &nph_user_error( "invalid pfam domain name
$link_to_hmmsearch" ); +} + +if ( defined( $tree_file ) && $tree_file =~ /\w+/ ) { + $user_defined_tree = 1; +} + +$species =~ tr/a-z/A-Z/; + +if ( $user_defined_tree != 1 ) { + &checkForPresenceOfSpecies( $species ); +} + +$aln = $RIO_ALN_DIRECTORY.$pfam_domain.$ALIGN_FILE_SUFFIX; + +if ( &checkForTextFilePresence( $aln ) != 1 ) { + &nph_user_error( "no pairwise distances precalculated for pfam domain \"$pfam_domain\"
$link_to_hmmsearch" ); +} + + +if ( checkForNumberBetween0and100( $o_threshold ) != 1 ) { + $o_threshold = $O_THRESHOLD_DEFAULT; +} +if ( checkForNumberBetween0and100( $sn_threshold ) != 1 ) { + $sn_threshold = $SN_THRESHOLD_DEFAULT; +} +if ( checkForNumberBetween0and100( $u_threshold ) != 1 ) { + $u_threshold = $U_THRESHOLD_DEFAULT; +} +if ( !defined( $seed_for_random ) || $seed_for_random !~ /\d/ +|| $seed_for_random =~ /\D/ || $seed_for_random > 10000 || $seed_for_random < 0 ) { + $seed_for_random = $SEED_FOR_RANDOM_DEFAULT; +} +if ( !defined( $sort ) +|| $sort > 16 || $sort < 12 ) { + $sort = $SORT_DEFAULT; +} + +if ( defined( $output_up ) && $output_up eq "yes" ) { + $RIO_OPTIONS .= " p"; +} +else { + $u_threshold = -1; +} + + + + + + + + +# User defined species tree is dealt with here +# -------------------------------------------- + +if ( $user_defined_tree == 1 ) { + &readInFile( $tree_file ); + $size_d = 0; + $size_c = 0; + foreach $oneline ( @lines ) { + $size_d += length( $oneline ); + if ( $size_d > $MAX_SIZE ) { + &nph_user_error( "user defined species tree file is too long (>$MAX_SIZE)" ); + } + $oneline =~ s/;\|,<>\s//g; + $oneline =~ tr/a-z/A-Z/; + + $size_c += length( $oneline ); + } + if ( $size_c < $MIN_SIZE ) { + &nph_user_error( "user defined species tree file is too short (<$MIN_SIZE)" ); + } + + open( TREE, ">$TEMPDIR/$$.tree" ) || &nph_fatal_error( "failed to open temp species tree file" ); + foreach $oneline ( @lines ) { + print TREE $oneline; + } + close( TREE ); + + $speciestree = "$TEMPDIR/$$.tree"; + system( "$TEST_NHX $speciestree" ) + && nph_user_error( "user defined species tree is not in proper NHX format (or is unrooted, or contains multifurcations) $!" 
); + +} +else { + $speciestree = $SPECIESTREE; +} + + + +# Join the queue, using queue.pm API +# ---------------------------------- + +$entry_time = time; + +( $njobs, $njobs_thisuser ) = &queue::CheckQueue( "rioqueue", $remote_addr, $TEMPDIR ); +if ( $njobs > 5 ) { + &nph_user_error("The server is currently swamped, with $njobs searches in the queue.
Please come back later! Sorry."); +} +if ( $njobs_thisuser > 0 ) { + &nph_user_error( "We already have $njobs_thisuser searches in the queue from + your IP address ($remote_addr). Please wait until some or all of them + finish.
If you think you got this message in error, wait a minute or so and + resubmit your job. You probably hit your browser's stop button after you + started a search - but that doesn't stop our compute cluster, it just breaks + your connection to us. You won't be able to start a new search until the + cluster's done." ); +} +if ( $njobs > 0 ) { + print_waiting_message( $njobs ); +} +&queue::WaitInQueue( "rioqueue", $remote_addr, $TEMPDIR, $$, 10 ); # wait with ten-second granularity. + + + + +# Prints "waiting" header +# ----------------------- + +my $number_of_seqs = &getNumberOfSeqsFromNBDfile( $RIO_NBD_DIRECTORY.$pfam_domain.$ALIGN_NBD_FILE ); +my $estimated_time = &estimateTime( $number_of_seqs ); + +print( "

RIO: Starting search. Estimated time: $estimated_time seconds per domain (assuming all rio nodes are running). Please wait...

\n" ); + + + + +# Runs RIO +# -------- + +&run_rio( $pfam_domain, # domain + "$TEMPDIR/$$.query", # query file name + "$TEMPDIR/$$.outfile", # output file name + "QUERY_$species", # name for query + $speciestree, # species tree + $RIO_OPTIONS, # more options + "$TEMPDIR/$$", # temp file + $o_threshold, + $sn_threshold, + $u_threshold, + $seed_for_random, + $sort ); + + + +# Done +# ---- + +&showATVlinks(); + + + +# Cleanup +unlink( "$TEMPDIR/$$.query", "$TEMPDIR/$$.tree" ); + +$output .= "

 

\n"; + +# Leaves the queue in an orderly fashion. +&queue::RemoveFromQueue( "rioqueue", $remote_addr, $TEMPDIR, $$ ); + +print( $output ); + +&print_footer(); + +&removeFiles( $DIR_FOR_TREES, $TARGET_FILES_IN_DIR_FOR_TREES, $MAX_FILES_IN_DIR_FOR_TREES ); + +exit( 0 ); + + + + + + + + +# Methods +# ------- + + + +# Last modified: 02/19/02 +sub run_rio { + + my $pfam_name = $_[ 0 ]; + my $query_file = $_[ 1 ]; + my $output_file = $_[ 2 ]; + my $name_for_query = $_[ 3 ]; + my $species_tree_file = $_[ 4 ]; + my $more_options = $_[ 5 ]; + my $tmp_file_rio = $_[ 6 ]; + my $t_o = $_[ 7 ]; + my $t_sn = $_[ 8 ]; + my $t_u = $_[ 9 ]; + my $seed = $_[ 10 ]; + my $sort = $_[ 11 ]; + + my $start_time = time; + + my $options_for_rio = ""; + + $options_for_rio .= ( " A=".$pfam_name ); + $options_for_rio .= ( " Q=".$query_file ); + $options_for_rio .= ( " O=".$output_file ); + $options_for_rio .= ( " N=".$name_for_query ); + $options_for_rio .= ( " S=".$species_tree_file ); + $options_for_rio .= ( " j=".$tmp_file_rio ); + $options_for_rio .= ( " L=".$t_o ); + $options_for_rio .= ( " B=".$t_sn ); + if ( $t_u != -1 ) { + $options_for_rio .= ( " v=".$t_u ); + } + $options_for_rio .= ( " y=".$seed ); + $options_for_rio .= ( " P=".$sort ); + $options_for_rio .= ( " ".$more_options ); + + $output = `$RIOPL 1 $options_for_rio`; + + if ( $? != 0 ) { + nph_rio_error(); + } + my $finish_time = time; + my $wait_time = $finish_time - $entry_time; + my $cpu_time = $finish_time - $start_time; + + + + # Logs the results. 
+ my $date = `date`; + chop( $date ); + open ( LOGFILE, ">>$TEMPDIR/log") || &nph_fatal_error( "could not open log file" ); + flock( LOGFILE, 2 ); + print LOGFILE "$date queue: $njobs wait: $wait_time true_cpu: $cpu_time pid: $$ addr: $ENV{'REMOTE_ADDR'} host: $ENV{'REMOTE_HOST'} pfam: $pfam_name\n"; + flock( LOGFILE, 8 ); + close ( LOGFILE ); + + return; + +} ## run_rio + + + + +# Reads a file into "@lines" +# Last modified: 02/19/02 +sub readInFile { + my $file = $_[ 0 ]; + my $l = 0; + my $s = 0; + @lines = (); + + $file =~ s/;\|,<>&\s//g; + + while( <$file> ) { + $s += length( $_ ); + if ( $s > $MAX_SIZE ) { + &nph_user_error( "query sequence is too long (>$MAX_SIZE)" ); + } + $l++; + if ( $l > $MAX_LINES ) { + &nph_user_error( "file has too many lines (>$MAX_LINES)" ); + } + + push( @lines, $_ ); + + } + +} ## readInFile + + + + +# Reads in (SWISS-PROT) species names from a file. +# Names must be separated by newlines. +# Lines beginning with "#" are ignored. +# A possible "=" and everything after is ignored. 
+# One argument: species-names-file name +# Last modified: 02/19/02 +sub readSpeciesNamesFile { + my $infile = $_[ 0 ]; + my $return_line = ""; + my $species = ""; + + open( IN_RSNF, "$infile" ) || &nph_fatal_error( "could not open species list" ); + while ( $return_line = <IN_RSNF> ) { # read the species list line by line from the handle opened above + if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) { + $species = $1; + $species =~ s/=.+//; + $Species_names_hash{ $species } = ""; + } + } + close( IN_RSNF ); + + return; +} ## readSpeciesNamesFile + + + +# Last modified: 02/19/02 +sub checkForNumberBetween0and100 { + + my $x = $_[ 0 ]; + + if ( !defined( $x ) || $x !~ /\d/ || $x =~ /\D/ || $x > 100 || $x < 0 ) { + return 0; + } + else { + return 1; + } + +} ## checkForNumberBetween0and100 + + + +# Last modified: 02/19/02 +sub getNumberOfSeqsFromNBDfile { + my $infile = $_[ 0 ]; + my $return_line = ""; + my $number_of_seqs = 0; + + open( C, "$infile" ) || &nph_fatal_error( "could not open NBD file" ); + while ( $return_line = <C> ) { # stop at the first line that holds only an integer + if ( $return_line =~ /^\s*(\d+)\s*$/ ) { + $number_of_seqs = $1; + last; + } + } + close( C ); + return $number_of_seqs; + +} ## getNumberOfSeqsFromNBDfile + + + +# Last modified: 02/19/02 +sub print_waiting_message { + + my $njobs = $_[ 0 ]; + + print( "

\n" ); + print( "RIO: There are $njobs searches queued ahead of you on the RIO server. Please wait...\n" ); + print( "

\n" ); + + return; + +} ## print_waiting_message + + + +# Last modified: 02/19/02 +sub checkForPresenceOfSpecies { + + my $species = $_[ 0 ]; + + &readSpeciesNamesFile( $SPECIESLIST ); + unless( exists( $Species_names_hash{ $species } ) ) { + &nph_user_error( "species \"$species\" not present in currently used species tree" ); + } + + return; +} ## checkForPresenceOfSpecies + + + +# Last modified: 02/19/02 +sub checkForTextFilePresence { + + my $file = $_[ 0 ]; + + if ( ( -s $file ) && ( -f $file ) && ( -T $file ) ) { + return 1; + } + else { + return 0; + } + +} ## checkForTextFilePresence + + + + + +# Last modified: 02/19/02 +sub print_footer { + + &print_navbar(); + &print_contact(); + print( "\n" ); + print( "\n" ); + + return; + +} ## print_footer + + + +# Last modified: 02/19/02 +sub print_navbar { + + print( "
\n" ); + print( "

\n" ); + print( "RIO $VERSION \n" ); + print( "phylogenomic analysis of a protein sequence | " ); + print( "help | " ); + print( "forester/rio home | " ); + print( "pfam\n" ); + print( "

\n" ); + print( "
\n" ); + + return; + +} ## print_navbar + + + +# Last modified: 02/19/02 +sub print_contact { + + print( "

comments, questions, flames? email $CONTACT

\n" ); + + return; + +} ## print_contact + + + +# Last modified: 02/19/02 +sub showATVlinks { + + my $domain_no = 0; + + if ( -s "$TEMPDIR/$$.outfile.rio.nhx" ) { + $domain_no = 1; + system( "mv", "$TEMPDIR/$$.outfile.rio.nhx", $DIR_FOR_TREES ) + && &nph_fatal_error( "could not mv $TEMPDIR/$$.outfile.rio.nhx" ); + } + elsif ( -s "$TEMPDIR/$$.outfile.rio-1.nhx" ) { + $domain_no = 1; + while ( -s "$TEMPDIR/$$.outfile.rio-$domain_no.nhx" ) { # one NHX file per domain, numbered from 1 + system( "mv", "$TEMPDIR/$$.outfile.rio-$domain_no.nhx", $DIR_FOR_TREES ) + && &nph_fatal_error( "could not mv $TEMPDIR/$$.outfile.rio-$domain_no.nhx" ); + $domain_no++; + } + + } + + + if ( $domain_no == 1 ) { + $output .= "

 

\n"; + $output .= "\n"; + $output .= "\n"; + $output .= "
\n"; + $output .= "download NHX file describing this tree
\n"; + } + elsif ( $domain_no > 1 ) { + $output .= "

 

\n"; + $output .= "\n"; + $output .= "\n"; + } + $output .= "
\n"; + $output .= "download NHX file for domain #$x
\n"; + } + + return; + +} ## showATVlinks + + +# Removes output tree (NHX) files if more than $_[ 2 ] in $_[ 0 ] +# Removes until $_[ 1 ] are left +# Last modified: 02/19/02 +sub removeFiles { + + my $dir = $_[ 0 ]; + my $target = $_[ 1 ]; + my $max = $_[ 2 ]; + + my $files = &countFilesInDir( $dir ); + + if ( $files > $max ) { + + my $diff = $files - $target; + + for ( my $i = 0; $i < $diff; $i++ ) { + &removeOldestFile( $dir ); + } + } + + return; +} ## removeFiles + + + +# Last modified: 02/19/02 +sub countFilesInDir { + + my $dir = $_[ 0 ]; + my $file = ""; + my $c = 0; + + opendir( DIR, $dir ) || &nph_fatal_error( "could not open dir $dir" ); + while( defined ( $file = readdir( DIR ) ) ) { + unless ( $file =~ /\d/ ) { + next; + } + $c++; + } + closedir( DIR ); + + return( $c ); + +} ## countFilesInDir + + + +# Last modified: 02/19/02 +sub removeOldestFile { + my $dir = $_[ 0 ]; + my $file = ""; + my $oldest = ""; + my $smallest_time = 0; + my $time = 0; + my $first = 1; + + opendir( DIR, $dir ) || &nph_fatal_error( "could not open dir $dir" ); + while( defined ( $file = readdir( DIR ) ) ) { + unless ( $file =~ /\d/ ) { + next; + } + $file =~ /(\d+)/; + $time = $1; + if ( $first == 1 ) { + $first = 0; + $smallest_time = $time; + $oldest = $file + } + elsif ( $time < $smallest_time ) { + $smallest_time = $time; + $oldest = $file; + } + } + closedir( DIR ); + + unlink( $dir.$oldest ) || &nph_fatal_error( "could not delete $oldest" ); + + return; + +} ## removeOldestFile + + + +# Last modified: 02/19/02 +sub print_ATV_JavaScript { + +print < + + + +END + + return; + +} ## print_ATV_JavaScript + + + +# Last modified: 02/19/02 +sub estimateTime { + my $number_of_seqs = $_[ 0 ]; + my $estimated_time = 0; + if ( $number_of_seqs <= 50 ) { + $estimated_time = 15; + } + elsif ( $number_of_seqs <= 100 ) { + $estimated_time = 20; + } + elsif ( $number_of_seqs <= 150 ) { + $estimated_time = 30; + } + elsif ( $number_of_seqs <= 200 ) { + $estimated_time = 35; + } + elsif ( 
$number_of_seqs <= 250 ) { + $estimated_time = 40; + } + elsif ( $number_of_seqs <= 300 ) { + $estimated_time = 60; + } + elsif ( $number_of_seqs <= 400 ) { + $estimated_time = 100; + } + elsif ( $number_of_seqs <= 500 ) { + $estimated_time = 160; + } + elsif ( $number_of_seqs <= 600 ) { + $estimated_time = 390; + } + elsif ( $number_of_seqs <= 700 ) { + $estimated_time = 530; + } + elsif ( $number_of_seqs <= 800 ) { + $estimated_time = 750; + } + elsif ( $number_of_seqs <= 900 ) { + $estimated_time = 850; + } + else { + $estimated_time = $number_of_seqs; + } + return $estimated_time; +} ## estimateTime + + + +# Last modified: 02/19/02 +sub nph_rio_error { + + my $mesg = $_[ 0 ]; + + &queue::RemoveFromQueue( "rioqueue", $remote_addr, $TEMPDIR, $$ ); + + unlink( "$TEMPDIR/$$.query", "$TEMPDIR/$$.tree" ); + + + + if ( $user_defined_tree == 1 ) { + print( "

RIO error

\n" ); + print( "

[the RIO analysis appearently died]

\n" ); + print( "

the most likely source of this error is an invalid user defined species tree

\n" ); + } + else { + print( "

RIO server fatal error

\n" ); + print( "

[the RIO analysis appearently died for unknown reasons]

\n" ); + print( "

This type of error should not happen

\n" ); + print( "

\n" ); + print( "We may have logged it automatically, but we would appreciate it if you would also notify us at\n" ); + print( "$CONTACT\n" ); + print( "

\n" ); + } + print( "

 

\n" ); + + &print_footer(); + system( "rm -r $TEMPDIR/$$"."0" ); + die; + +} ## nph_rio_error + + + +# Last modified: 02/19/02 +sub nph_fatal_error { + + my $mesg = $_[ 0 ]; + + &queue::RemoveFromQueue( "rioqueue", $remote_addr, $TEMPDIR, $$ ); + + unlink( "$TEMPDIR/$$.query", "$TEMPDIR/$$.tree" ); + + print( "

RIO server fatal error

\n" ); + print( "

[$mesg : $!]

\n" ); + print( "

This type of error should not happen

\n" ); + print( "

\n" ); + print( "We may have logged it automatically, but we would appreciate it if you would also notify us at\n" ); + print( "$CONTACT\n" ); + print( "

\n" ); + print( "

 

\n" ); + + + &print_footer(); + die; + +} ## nph_fatal_error + + + +# Last modified: 02/19/02 +sub nph_user_error { + + my $mesg = $_[ 0 ]; + + &queue::RemoveFromQueue( "rioqueue", $remote_addr, $TEMPDIR, $$ ); + + unlink( "$TEMPDIR/$$.query", "$TEMPDIR/$$.tree" ); + + print( "

user error

\n" ); + print( "

\n" ); + print( "$mesg\n" ); + print( "

\n" ); + print( "

 

\n" ); + + + &print_footer(); + + die "nph-riowebserver handled: $mesg"; + +} ## nph_user_error + + + + diff --git a/forester/archive/perl/p7extract.pl b/forester/archive/perl/p7extract.pl new file mode 100755 index 0000000..fe6a69e --- /dev/null +++ b/forester/archive/perl/p7extract.pl @@ -0,0 +1,116 @@ +#! /usr/bin/perl + +# Usage: p7extract.pl +# +# Converts hmmsearch output to GLF or GDF. GLF is the default. +# Order is sorted by bit score +# +# Options: +# -C : extract Pfam coverage statistics (NAR paper) +# -d : extract domains in GDF format +# -t : report only hits better than evalue of +# -s : include scores in output +# -e : include evalues in output +# -l : include negative log evalues in output for easy sorting +# +# Note: p7extract.pl -sel gives the extended GLF format expected by +# the PROFMARK benchmark scripts + +require "getopts.pl"; + +$ethresh = 0; + +&Getopts('Cdt:sel'); +if ($opt_C) { $coverage_mode = 1; $gdfmode = 1;} +if ($opt_d) { $gdfmode = 1; } +if ($opt_t) { $ethresh = $opt_t; } +if ($opt_s) { $do_scores = 1; } +if ($opt_e) { $do_eval = 1; } +if ($opt_l) { $do_log = 1; } + +$status = 1; # -C only: assume we will fail, 'til proven otherwise + +while (<>) +{ + if (/^Query HMM:\s+(\S+)/) {$hmmname = $1;} + if (/^Scores for complete sequences/) {$indom = 0; $inseq = 1;} + if (/^Parsed for domains/) {$indom = 1; $inseq = 0;} + if (/^Histogram of all scores/) {$indom = 0; $inseq = 0;} + if (/^Total sequences searched/) {$status = 0;} # looks like we've seen output + + if ( $inseq && + (($id, $sc, $ev, $nd) = /(\S+).+\s(\S+)\s+(\S+)\s+(\d+)\s*$/)) + { + if (($ethresh == 0 || $ev < $ethresh) && $show_key{$id} == 0) + { + if (! 
$gdfmode) { + $show_key{$id} = 1; # remember this name + $show_sc{$id} = $sc; + $show_ev{$id} = $ev; + } + $numseqs++; + } + } + + if ($gdfmode && $indom && + (($id, $sqfrom, $sqto, $sc, $ev) = + /(\S+)\s+\S+\s+(\d+)\s+(\d+).+\s(\S+)\s+(\S+)\s*$/)) + { + if (($ethresh == 0 || $ev < $ethresh) && $show_key{$id} == 0) + { + $key = "$id/$sqfrom-$sqto"; + $show_key{$key} = 1; + $show_id{$key} = $id; + $show_sqfrom{$key} = $sqfrom; + $show_sqto{$key} = $sqto; + $show_sc{$key} = $sc; + $show_ev{$key} = $ev; + + $numdomains++; + + $domsize = $sqto - $sqfrom + 1; + if ($domsize < 0) { $domsize *= -1; } + $numresidues += $domsize; + } + } + +} + +if ($coverage_mode) +{ + if ($status == 0) { + printf "%-20s %6d %6d %6d\n", $hmmname, $numseqs, $numdomains, $numresidues; + exit 0; + } else { + printf "%-20s [FAILED]\n", $hmmname; + exit 1; + } + +} + + + +foreach $key (sort byscore keys(%show_key)) +{ + if ($gdfmode) + { + printf("%-24s\t%6d\t%6d\t%15s", + $key, $show_sqfrom{$key}, $show_sqto{$key}, $show_id{$key}) + } else { + printf("%-24s", $key); + } + # Optional extensions to GDF/GLF + if ($do_scores) { printf("\t%8s", $show_sc{$key}); } + if ($do_eval) { printf("\t%12s", $show_ev{$key}); } + if ($do_log) { printf("\t%12.1f", -log($show_ev{$key})); } + print "\n"; +} + +sub byscore { + $show_sc{$b} <=> $show_sc{$a}; +} + +sub byevalue { + $show_ev{$a} <=> $show_ev{$b}; +} + diff --git a/forester/archive/perl/pf_cutoff_extract.pl b/forester/archive/perl/pf_cutoff_extract.pl new file mode 100755 index 0000000..1a56d9c --- /dev/null +++ b/forester/archive/perl/pf_cutoff_extract.pl @@ -0,0 +1,73 @@ +#!/usr/bin/perl -W + +# $Id: pf_cutoff_extract.pl,v 1.4 2009/11/11 02:28:19 cmzmasek Exp $ + +# This extracts GA, TC, or NC score cutoff values from +# Pfam HMM files (GA1, TC1, NC1) +# Copyright (C) 2008-2009 Christian M. 
Zmasek +# All rights reserved +# Created 2007-08-01 in Winterthur, Switzerland by CMZ + +# Usage: pf_cutoff_extract.pl + +use strict; + +if ( scalar( @ARGV ) != 3 ) { + print "\npf_cutoff_extract.pl \n\n"; + exit( -1 ); +} + +my $infile = $ARGV[ 0 ]; +my $cutoff_type = uc( $ARGV[ 1 ] ); +my $outfile = $ARGV[ 2 ]; + +my $GA = "GA"; +my $TC = "TC"; +my $NC = "NC"; + +if ( -e $outfile ) { + die "\n$0: \"$outfile\" already exists.\n\n"; +} +unless( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { + die "\n$0: cannot read from \"$infile\".\n\n"; +} +unless( $cutoff_type eq $GA || $cutoff_type eq $TC || $cutoff_type eq $NC ) { + die "\n$0: illegal value \"$cutoff_type\" for cutoff type.\n\n"; +} + +open( IN, "$infile" ) || die "\n$0: Cannot open file \"$infile\": $!\n"; +open( OUT, ">$outfile" ) || die "\n$0: Cannot create file \"$outfile\": $!\n"; + +my $line = ""; +my $name = ""; +my $line_number = 0; +my $n = 0; + +while ( $line = <IN> ) { # each NAME line must be followed by one cutoff line before the next NAME + $line_number++; + if ( $line =~ /^NAME\s+(.+)/ ) { + if ( length( $name ) > 0 ) { + die "\n$0: Unexpected line $line at line $line_number: $!\n"; + } + $name = $1; + } + elsif ( $line =~ /^$cutoff_type\s+(\S+)\s+[^;]+/ ) { + if ( length( $name ) < 1 ) { + die "\n$0: Unexpected line $line at line $line_number: $!\n"; + } + $n++; + print OUT "$name $1\n"; + $name = ""; + } + elsif ( $line =~ /\/\// ) { + $name = ""; + } +} + +close( OUT ) || die "\n$0: Cannot close file \"$outfile\": $!\n"; + +print( "\nExtracted $n $cutoff_type" . "1 values to \"$outfile\"\n" ); +print( "\nOK\n" ); + +exit( 0 ); + diff --git a/forester/archive/perl/pfam2pwd.pl b/forester/archive/perl/pfam2pwd.pl new file mode 100755 index 0000000..73ac495 --- /dev/null +++ b/forester/archive/perl/pfam2pwd.pl @@ -0,0 +1,743 @@ +#!/usr/bin/perl -W + +# pfam2pwd.pl +# ----------- +# Copyright (C) 1999-2002 Washington University School of Medicine +# and Howard Hughes Medical Institute +# All rights reserved +# +# Author: Christian M. 
Zmasek +# zmasek@genetics.wustl.edu +# http://www.genetics.wustl.edu/eddy/people/zmasek/ +# +# Created: 05/17/01 +# +# Last modified 02/20/03 +# +# +# See RIO_INSTALL on how to use this program. +# ------------------------------------------ +# + +use strict; + +use FindBin; +use lib $FindBin::Bin; +use rio_module; + +my $VERSION = "3.002"; + + + +# ============================================================================= +# ============================================================================= +# +# THESE VARIABLES NEED TO BE SET BY THE USER +# ------------------------------------------ +# + + +# Pfam alignments to calculate pairwise distances from: +# ----------------------------------------------------- +my $MY_PFAM_FULL_DIRECTORY = "/path/to/Pfam/Full/"; # must end with "/" + + + +# This file lists all the alignments for which to calculate pairwise distances +# from. If left empty, ALL the alignments in $MY_PFAM_FULL_DIRECTORY +# will be used: +# ---------------------------------------------------------------------------- +my $ALGNS_TO_USE_LIST_FILE = ""; + + + +# This is _VERY IMPORTANT_. It determines the species whose sequences +# are being used (sequences from species not listed in $MY_SPECIES_NAMES_FILE +# are ignored). Normally, one would use the same list as RIO uses +# ($SPECIES_NAMES_FILE in "rio_module.pm") -- currently "tree_of_life_bin_1-6.nhx". +# +# For certain large families (such as protein kinases), one might use a +# species file which contains less species in order to be able to finish +# the calculations in reasonable time. 
+# For example, to exclude most mammals, use: +# my $MY_SPECIES_NAMES_FILE = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6_species_list_NO_RAT_MONKEYS_APES_SHEEP_GOAT_HAMSTER" +# (to only use sequences from SWISS-PROT add this line: +# $TREMBL_ACDEOS_FILE = $PATH_TO_FORESTER."data/NO_TREMBL";) +# ---------------------------------------------------------------------------- +my $MY_SPECIES_NAMES_FILE = $SPECIES_NAMES_FILE; + + + +# This is were the output goes (must end with "/") +# ------------------------------------------------ +my $MY_RIO_PWD_DIRECTORY = "/path/to/pfam2pwd_out/pwd/"; +my $MY_RIO_BSP_DIRECTORY = "/path/to/pfam2pwd_out/bsp/"; +my $MY_RIO_NBD_DIRECTORY = "/path/to/pfam2pwd_out/nbd/"; +my $MY_RIO_ALN_DIRECTORY = "/path/to/pfam2pwd_out/aln/"; +my $MY_RIO_HMM_DIRECTORY = "/path/to/pfam2pwd_out/hmm/"; + + + +# A directory to create temporary files in: +# ----------------------------------------- +my $MY_TEMP_DIR = "/tmp/"; # must end with "/" + + + +# Alignments in which the number of sequences after pruning (determined +# by "$MY_SPECIES_NAMES_FILE") is lower than this, are ignored +# (no calculation of pwds): +# ------------------------------------------------------------------ +my $MIN_SEQS = 5; + + + +# Alignments in which the number of sequences after pruning (determined +# by "$MY_SPECIES_NAMES_FILE") is greater than this, are ignored +# (no calculation of pwds): +# ------------------------------------------------------------------ +my $MAX_SEQS = 700; + + + +# Seed for the random number generator for bootstrapping (must be 4n+1): +# --------------------------------------------------------------------- +my $MY_SEED = 85; + + + +# This is used to choose the model to be used for the (ML) +# distance calculation: +# IMPORTANT: "$MY_MATRIX_FOR_PWD" in "rio_module.pm" needs to +# have the same value, when the pwds calculated are going to +# be used for RIO! 
+# 0 = JTT +# 2 = BLOSUM 62 +# 3 = mtREV24 +# 5 = VT +# 6 = WAG +# PAM otherwise +# -------------------------------------------------------- +my $MY_MATRIX = 2; + + + +# +# End of variables which need to be set by the user. +# +# ============================================================================= +# ============================================================================= + + + + + + + + +my $too_small = 0; +my $too_large = 0; +my $i = 0; +my $seqs = 0; +my $filename = ""; +my $tmp_dir = ""; +my $current_dir = ""; +my $return_line = ""; +my @filenames = (); +my @too_small_names = (); +my @too_large_names = (); +my %Species_names_hash = (); +my %AC_OS = (); # AC -> species name +my %AC_DE = (); # AC -> description +my %ALGNS_TO_USE = (); # name of alignment -> "" +my $use_algns_to_use_list = 0; +my $LOGFILE = "00_pfam2pwd_LOGFILE"; + $HMMBUILD = $HMMBUILD." --amino"; + + +&createTempdir(); + + +&startLogfile(); + + +opendir( DIR, $MY_PFAM_FULL_DIRECTORY ) || die "\n\n$0: Cannot open directory $MY_PFAM_FULL_DIRECTORY: $!\n\n"; +$i = 0; +while( defined( $filename = readdir( DIR ) ) ) { + if ( $filename =~ /^\.\.?$/ ) { + next; + } + $filenames[ $i ] = $filename; + $i++; +} +closedir( DIR ); # DIR is a directory handle (opendir), so it needs closedir, not close + + +&readSpeciesNamesFile( $MY_SPECIES_NAMES_FILE ); + +&readTrEMBL_ACDEOS_FILE(); + +if ( defined( $ALGNS_TO_USE_LIST_FILE ) && $ALGNS_TO_USE_LIST_FILE =~ /\w/ ) { + $use_algns_to_use_list = 1; + &readListFile(); +} + + +$current_dir = `pwd`; +$current_dir =~ s/\s//; +chdir ( $tmp_dir ) +|| die "\n\n$0: Unexpected error: Could not chdir to <<$tmp_dir>>: $!"; + +$i = 0; + +FOREACH_ALIGN: foreach $filename ( @filenames ) { + + # If the corresponding pwd, positions, and aln files seem to already exist, do next one. 
+ if ( ( -e $MY_RIO_PWD_DIRECTORY.$filename.$SUFFIX_PWD ) + && ( -e $MY_RIO_BSP_DIRECTORY.$filename.$SUFFIX_BOOT_STRP_POS ) + && ( -e $MY_RIO_NBD_DIRECTORY.$filename.$SUFFIX_PWD_NOT_BOOTS ) + && ( -e $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX ) + && ( -e $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM ) ) { + next FOREACH_ALIGN; + } + + if ( $use_algns_to_use_list == 1 && !exists( $ALGNS_TO_USE{ $filename } ) ) { + next FOREACH_ALIGN; + } + + + $seqs = &removeSeqsFromPfamAlign( $MY_PFAM_FULL_DIRECTORY.$filename, + "REM_SEQ_OUTFILE", + 1 ); + if ( $seqs < $MIN_SEQS ) { + unlink( "REM_SEQ_OUTFILE" ); + $too_small_names[ $too_small++ ] = $filename; + next FOREACH_ALIGN; + } + elsif ( $seqs > $MAX_SEQS ) { + unlink( "REM_SEQ_OUTFILE" ); + $too_large_names [ $too_large++ ] = $filename; + next FOREACH_ALIGN; + } + + + print "\n\n\n"; + print " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + print " $i: $filename ($seqs seqs)\n"; + print " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"; + print "\n"; + + # If one of the two file exists from a previous (interrupted) run. 
+ unlink( $MY_RIO_PWD_DIRECTORY.$filename.$SUFFIX_PWD ); + unlink( $MY_RIO_BSP_DIRECTORY.$filename.$SUFFIX_BOOT_STRP_POS ); + unlink( $MY_RIO_NBD_DIRECTORY.$filename.$SUFFIX_PWD_NOT_BOOTS ); + unlink( $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX ); + unlink( $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM ); + + + &executeHmmbuild( "REM_SEQ_OUTFILE", + $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX, + "hmm" ); + + if ( unlink( "hmm" ) != 1 ) { + die "\n\n$0: Unexpected error: Could not delete <>: $!"; + } + + if ( unlink( "REM_SEQ_OUTFILE" ) != 1 ) { + die "\n\n$0: Unexpected error: Could not delete <>: $!"; + } + + executeHmmbuildHand( $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX, + $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM ); + + system( $HMMCALIBRATE, $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM ) + && die "\n\n$0: Could not execute \"$HMMCALIBRATE $MY_RIO_HMM_DIRECTORY.$filename.$SUFFIX_HMM\": $!"; + + &pfam2phylipMatchOnly( $MY_RIO_ALN_DIRECTORY.$filename.$ALIGN_FILE_SUFFIX, "infile" ); + + &executePuzzle( "infile", $MY_MATRIX ); + + system( "mv", "infile.dist", $MY_RIO_NBD_DIRECTORY.$filename.$SUFFIX_PWD_NOT_BOOTS ) + && die "\n\n$0: Unexpected error: $!"; + + &executeBootstrap( "infile", + $BOOTSTRAPS, + "BOOTSTRAPPED_ALGN", + $MY_RIO_BSP_DIRECTORY.$filename.$SUFFIX_BOOT_STRP_POS, + $MY_SEED ); + + if ( unlink( "infile" ) != 1 ) { + die "\n\n$0: Unexpected error: Could not delete <>: $!"; + } + + + &executePuzzleBootstrapped( "BOOTSTRAPPED_ALGN", $MY_MATRIX ); + + ##if ( unlink( "outfile" ) != 1 ) { + ## die "\n\n$0: Unexpected error: Could not delete <>: $!"; + ##} + + + system( "mv", "BOOTSTRAPPED_ALGN".".dist", $MY_RIO_PWD_DIRECTORY.$filename.$SUFFIX_PWD ) + && die "\n\n$0: Unexpected error: $!\n\n"; + + if ( unlink( "BOOTSTRAPPED_ALGN" ) != 1 ) { + die "\n\n$0: Unexpected error: Could not delete <>: $!"; + } + + $i++; + +} ## End of FOREACH_ALIGN loop. 
chdir( $current_dir )
|| die "\n\n$0: Unexpected error: Could not chdir to <<$current_dir>>: $!";

rmdir( $tmp_dir );

&finishLogfile();

print "\n\n\n";
print( "pfam2pwd.pl: Done.\n" );
print( "Successfully calculated $i pairwise distance files.\n" );
print( "Too large alignments (>$MAX_SEQS): $too_large\n" );
print( "Too small alignments (<$MIN_SEQS): $too_small\n" );
print( "See the logfile \"$MY_RIO_PWD_DIRECTORY".$LOGFILE."\"\n" );
print "\n\n\n";

exit( 0 );






# Methods
# -------



# Runs "$HMMBUILD" on an alignment, re-using the hmmbuild options that are
# recorded in a comment line of the alignment itself (as returned by
# getHmmbuildOptionsFromPfam).
# Three arguments:
# 1. Stockholm alignment
# 2. Outalignment
# 3. Outhmm
# Returns the options used.
# Dies if the alignment is missing, empty or not a plain text file, or if
# the hmmbuild command returns a non-zero status.
# Last modified: 06/26/01
sub executeHmmbuild {

    my $full         = $_[ 0 ];
    my $outalignment = $_[ 1 ];
    my $outhmm       = $_[ 2 ];
    my $options      = "";
    my $command      = "";

    unless ( ( -s $full ) && ( -f $full ) && ( -T $full ) ) {
        die "\n\n$0: \"$full\" does not exist, is empty, or is not a plain textfile.\n\n";
    }

    $options = getHmmbuildOptionsFromPfam( $full );

    # Drop the short flags -f, -g, -s, -F and -A, but only where they are
    # complete tokens. An unanchored s/-f// would also eat the "-f" inside
    # "--fast" (and "-g" inside "--gapmax", "-s" inside "--swentry").
    foreach my $flag ( "f", "g", "s", "F", "A" ) {
        $options =~ s/(?<!\S)-${flag}(?!\S)//;
    }

    # Drop "-o <file>": the output alignment is set explicitly below.
    $options =~ s/(?<!\S)-o\s+\S+//;

    # Drop every token of two or more characters that does not start with
    # "-" (names of HMM and alignment files, and the old argument of
    # --prior). "\s" is excluded from the first character so that the blank
    # left behind by a removed flag cannot pull the following option into
    # the match.
    # NOTE(review): this also removes multi-character values of other
    # options; confirm that none of the recorded options needs its value.
    $options =~ s/(\s|^)[^-\s]\S+/ /g;

    if ( $options =~ /--prior/ ) {
        # The prior file is expected in $PRIOR_FILE_DIR, named after the
        # alignment with ".PRIOR" appended.
        my $basename = basename( $full );
        $basename .= ".PRIOR";
        $options =~ s/--prior/--prior $PRIOR_FILE_DIR$basename/;
    }

    # Remove for versions of HMMER lower than 2.2.
    if ( $options =~ /--informat\s+\S+/ ) {
        $options =~ s/--informat\s+\S+/-/;
    }

    $command = "$HMMBUILD $options -o $outalignment $outhmm $full";
    system( $command )
    && die "\n\n$0: Could not execute \"$command\".\n\n";

    return $options;

} ## executeHmmbuild.


# Two arguments:
# 1. Stockholm alignment
# 2. Outhmm
# Returns the options used.
# Last modified: 06/26/01
sub executeHmmbuildHand {

    # Same as executeHmmbuild, except that hmmbuild is run with "--hand"
    # and without "-o" (no output alignment is written).

    my $full    = $_[ 0 ];
    my $outhmm  = $_[ 1 ];
    my $options = "";
    my $command = "";

    unless ( ( -s $full ) && ( -f $full ) && ( -T $full ) ) {
        die "\n\n$0: \"$full\" does not exist, is empty, or is not a plain textfile.\n\n";
    }

    $options = getHmmbuildOptionsFromPfam( $full );

    # Drop the short flags -f, -g, -s, -F and -A, but only where they are
    # complete tokens. An unanchored s/-f// would also eat the "-f" inside
    # "--fast" (and "-g" inside "--gapmax", "-s" inside "--swentry").
    foreach my $flag ( "f", "g", "s", "F", "A" ) {
        $options =~ s/(?<!\S)-${flag}(?!\S)//;
    }

    # Drop "-o <file>".
    $options =~ s/(?<!\S)-o\s+\S+//;

    # Drop every token of two or more characters that does not start with
    # "-". "\s" is excluded from the first character so that the blank left
    # behind by a removed flag cannot pull the following option into the
    # match.
    # NOTE(review): this also removes multi-character values of other
    # options; confirm that none of the recorded options needs its value.
    $options =~ s/(\s|^)[^-\s]\S+/ /g;

    if ( $options =~ /--prior/ ) {
        # The prior file is expected in $PRIOR_FILE_DIR, named after the
        # alignment with ".PRIOR" appended.
        my $basename = basename( $full );
        $basename .= ".PRIOR";
        $options =~ s/--prior/--prior $PRIOR_FILE_DIR$basename/;
    }

    # Remove for versions of HMMER lower than 2.2.
    if ( $options =~ /--informat\s+\S+/ ) {
        $options =~ s/--informat\s+\S+/-/;
    }

    # The error message shows exactly the command that was run.
    $command = "$HMMBUILD --hand $options $outhmm $full";
    system( $command )
    && die "\n\n$0: Could not execute \"$command\".\n\n";

    return $options;

} ## executeHmmbuildHand.



# Returns the text following the word "hmmbuild" on the first comment line
# (a line starting with "#") of the file that mentions hmmbuild, or "" if
# there is no such line.
# One argument:
# Pfam align name.
# Dies if the file is missing, empty, not a plain text file, or cannot be
# opened.
# Last modified: 02/26/01
sub getHmmbuildOptionsFromPfam {

    my $infile      = $_[ 0 ];
    my $return_line = "";
    my $result      = "";

    unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
        die "\n\n$0: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n";
    }

    # Three-argument open: the file name is never interpreted as a mode.
    open( my $gho, "<", $infile )
    || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!";

    while ( $return_line = <$gho> ) {
        if ( $return_line =~ /^\s*#.*hmmbuild\s+(.+)\s*$/ ) {
            $result = $1;
            last;    # only the first matching line is used
        }
    }
    close( $gho );

    return $result;

} ## getHmmbuildOptionsFromPfam



# Similar to the method with the same name in "rio.pl".
# Removes sequences from a Pfam flat file.
# Adds species to TrEMBL seqs.
# It can remove all sequences not from species listed in a species names file.
# It can remove all sequences which do not have a SWISS-PROT name (XXXX_XXXXX)
# Three arguments:
# 1. Pfam flat file name
# 2. outfile name
# 3.
#    1 to remove TrEMBL seqs with "(FRAGMENT)" in their DE line.
# Returns the number of sequences in the resulting alignment.
# NOTE: this version takes no query-name argument; the only value it
# returns is the sequence count (it never returns -1 or -10).
# Reads the global hashes %AC_OS, %AC_DE and %Species_names_hash and the
# global $LENGTH_OF_NAME.
# Last modified: 05/24/02
sub removeSeqsFromPfamAlign {
    my $infile            = $_[ 0 ];
    my $outfile           = $_[ 1 ];
    my $remove_frags      = $_[ 2 ];
    my $return_line       = "";
    my $saw_sequence_line = 0;   # 0: before first seq line, 1: in first block, 2: after it
    my $number_of_seqs    = 0;
    my $DE                = "";
    my $OS                = "";
    my $AC                = "";
    my $length            = 0;
    my $seq_name          = "";
    my $seq               = "";

    open( my $out_fh, ">", $outfile )
    || die "\n\n$0: Unexpected error: Cannot create file \"$outfile\": $!";
    open( my $in_fh, "<", $infile )
    || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!";

    while ( $return_line = <$in_fh> ) {

        if ( $saw_sequence_line == 1
        && !&containsPfamNamedSequence( $return_line )
        && !&isPfamCommentLine( $return_line ) ) {
            # This is just for counting purposes.
            $saw_sequence_line = 2;
        }
        if ( &isPfamSequenceLine( $return_line ) ) {
            if ( $saw_sequence_line == 0 ) {
                $saw_sequence_line = 1;
            }
            # Assumes isPfamSequenceLine only accepts lines of the form
            # "name <whitespace> sequence" -- TODO confirm.
            $return_line =~ /^\s*(\S+)\s+(\S+)/;
            $seq_name = $1;
            $seq      = $2;
            if ( !&startsWithSWISS_PROTname( $return_line ) ) {
                # The name up to its last "/" is used as the key into
                # %AC_OS and %AC_DE.
                $seq_name =~ /^(\S+)\//;
                $AC = $1;
                unless( exists( $AC_OS{ $AC } ) ) {
                    #ACs not present in "ACDEOS" file.
                    next;
                }
                $OS = $AC_OS{ $AC };
                if ( !$OS || $OS eq "" ) {
                    die "\n\n$0: Unexpected error: species for \"$AC\" not found.\n\n";
                }
                unless( exists( $Species_names_hash{ $OS } ) ) {
                    next;
                }
                if ( $remove_frags == 1 ) {
                    $DE = $AC_DE{ $AC };
                    if ( $DE && $DE =~ /\(FRAGMENT\)/ ) {
                        next;
                    }
                }
                # Insert "_<species>" in front of the first "/".
                $seq_name =~ s/\//_$OS\//;
            }
            else {
                if ( $return_line =~ /_([A-Z0-9]{1,5})\// ) {
                    unless( exists( $Species_names_hash{ $1 } ) ) {
                        next;
                    }
                }
                # remove everything whose species cannot be determined.
                else {
                    next;
                }
            }
            # Pad the name with blanks up to $LENGTH_OF_NAME characters.
            # NOTE(review): a name of $LENGTH_OF_NAME or more characters is
            # written directly in front of the sequence, with no separator.
            $length = length( $seq_name );
            if ( $length < $LENGTH_OF_NAME ) {
                $seq_name .= " " x ( $LENGTH_OF_NAME - $length );
            }
            $return_line = $seq_name.$seq."\n";
        }

        if ( !&isPfamCommentLine( $return_line ) ) {
            print $out_fh $return_line;
        }

        # Every line that gets this far while still in the first block of
        # sequence lines is counted.
        if ( $saw_sequence_line == 1 ) {
            $number_of_seqs++;
        }
    } ## while ( $return_line = <$in_fh> )
    close( $in_fh );
    close( $out_fh );

    return $number_of_seqs;

} ## removeSeqsFromPfamAlign







# Reads in (SWISS-PROT) species names from a file and stores them as keys
# of the global %Species_names_hash.
# Names must be separated by newlines.
# Lines beginning with "#" are ignored.
# A possible "=" and everything after is ignored.
# One argument: species-names-file name
# Last modified: 04/24/01
sub readSpeciesNamesFile {
    my $infile      = $_[ 0 ];
    my $return_line = "";
    my $species     = "";

    unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
        die "\n\n$0: Error: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n";
    }

    open( my $in_fh, "<", $infile )
    || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!\n\n";
    while ( $return_line = <$in_fh> ) {
        if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) {
            $species = $1;
            # ".*" rather than ".+": a bare trailing "=" is removed as well.
            $species =~ s/=.*//;
            if ( $species ne "" ) {
                $Species_names_hash{ $species } = "";
            }
        }
    }
    close( $in_fh );

    return;
} ## readSpeciesNamesFile



# Reads the file named by the global $TREMBL_ACDEOS_FILE. Each line of the
# form "AC;description;species" fills the global hashes %AC_OS and %AC_DE.
# Last modified: 02/21/03
sub readTrEMBL_ACDEOS_FILE {

    my $return_line = "";

    unless ( ( -s $TREMBL_ACDEOS_FILE ) && ( -f $TREMBL_ACDEOS_FILE ) && ( -T $TREMBL_ACDEOS_FILE ) ) {
        die "\n\n$0: Error: \"$TREMBL_ACDEOS_FILE\" does not exist, is empty, or is not a plain textfile.\n\n";
    }
    # Fill up (huge) hashs.
    open( my $in_fh, "<", $TREMBL_ACDEOS_FILE )
    || die "\n\n$0: Unexpected error: Cannot open file <<$TREMBL_ACDEOS_FILE>>: $!\n\n";
    while ( $return_line = <$in_fh> ) {

        if ( $return_line =~ /(\S+);([^;]*);(\S+)/ ) {
            $AC_OS{ $1 } = $3;
            $AC_DE{ $1 } = $2;
        }
    }
    close( $in_fh );
} ## readTrEMBL_ACDEOS_FILE


# Reads the file named by the global $ALGNS_TO_USE_LIST_FILE and stores the
# alignment names found in it as keys of the global %ALGNS_TO_USE.
# Last modified: 02/21/03
sub readListFile {

    my $return_line = "";

    unless ( ( -s $ALGNS_TO_USE_LIST_FILE ) && ( -f $ALGNS_TO_USE_LIST_FILE ) && ( -T $ALGNS_TO_USE_LIST_FILE ) ) {
        die "\n\n$0: Error: \"$ALGNS_TO_USE_LIST_FILE\" does not exist, is empty, or is not a plain textfile.\n\n";
    }
    # Fill up hash.
    open( my $in_fh, "<", $ALGNS_TO_USE_LIST_FILE )
    || die "\n\n$0: Unexpected error: Cannot open file <<$ALGNS_TO_USE_LIST_FILE>>: $!\n\n";
    while ( $return_line = <$in_fh> ) {
        if ( $return_line =~ /^\s*(\S+)\s*$/ ) { # just a list
            $ALGNS_TO_USE{ $1 } = "";
        }
        elsif ( $return_line =~ /^\s*\S+\s+\S+\s+(\S+)/ ) { # "changes" list from Pfam
            $ALGNS_TO_USE{ $1 } = "";
        }

    }
    close( $in_fh );

} ## readListFile



# Five arguments:
# 1. Name of inputfile
# 2. Bootstraps
# 3. Name of output alignment file
# 4. Name of output positions file
# 5.
#    Seed for random number generator
#
# Runs "$BOOTSTRAP_CZ_PL" in mode 0 and dies if it returns a non-zero
# status.
# Last modified: 06/23/01
sub executeBootstrap {
    my $infile     = $_[ 0 ];
    my $bootstraps = $_[ 1 ];
    my $outalign   = $_[ 2 ];
    my $positions  = $_[ 3 ];
    my $seed       = $_[ 4 ];
    my $command    = "$BOOTSTRAP_CZ_PL 0 $bootstraps $infile $outalign $positions $seed";

    system( $command )
    && die "\n\n$0: executeBootstrap:\nCould not execute \"$command\".\n\n";

} ## executeBootstrap




# Creates a new, not yet existing directory inside $MY_TEMP_DIR and stores
# its name in the global $tmp_dir. The name is the current time followed by
# a counter.
# Last modified: 05/22/02
sub createTempdir {

    my $ii    = 0;
    my $time  = time;
    my $error = "";

    $tmp_dir = $MY_TEMP_DIR.$time.$ii;

    # mkdir itself decides whether a name is free. Testing with -e first
    # and creating afterwards would leave a window in which another process
    # can claim the same name.
    until ( mkdir( $tmp_dir, 0777 ) ) {
        $error = $!;    # saved here because the -e test below may overwrite $!
        unless ( -e $tmp_dir ) {
            die "\n\n$0: Unexpected error: Could not create <<$tmp_dir>>: $error\n\n";
        }
        # The name is taken: try the next counter value.
        $ii++;
        $tmp_dir = $MY_TEMP_DIR.$time.$ii;
    }

    unless ( ( -e $tmp_dir ) && ( -d $tmp_dir ) ) {
        die "\n\n$0: Unexpected error: failed to create <<$tmp_dir>>.\n\n";
    }

} ## createTempdir



# Opens the logfile (filehandle L, closed again by finishLogfile) in
# $MY_RIO_PWD_DIRECTORY and writes the settings of this run into it.
# Exits with -1 if a logfile of that name already exists.
# Last modified: 05/17/01
sub startLogfile {
    if ( -e $MY_RIO_PWD_DIRECTORY.$LOGFILE ) {
        print "\npfam2pwd.pl:\n";
        print "logfile $MY_RIO_PWD_DIRECTORY"."$LOGFILE already exists\n";
        print "rename it or place it in another directory\n";
        exit( -1 );
    }

    open( L, ">", $MY_RIO_PWD_DIRECTORY.$LOGFILE )
    || die "\n\n$0: startLogfile: Cannot create logfile: $!\n\n";
    print L "Min seqs : $MIN_SEQS\n";
    print L "Max seqs : $MAX_SEQS\n";
    print L "Seed : $MY_SEED\n";
    print L "TrEMBL ACDEOS file : $TREMBL_ACDEOS_FILE\n";
    print L "Species names file : $MY_SPECIES_NAMES_FILE\n";
    print L "Pfam directory : $MY_PFAM_FULL_DIRECTORY\n";
    print L "PWD outputdirectory: $MY_RIO_PWD_DIRECTORY\n";
    print L "BSP outputdirectory: $MY_RIO_BSP_DIRECTORY\n";
    print L "NBD outputdirectory: $MY_RIO_NBD_DIRECTORY\n";
    print L "ALN outputdirectory: $MY_RIO_ALN_DIRECTORY\n";
    print L "HMM outputdirectory: $MY_RIO_HMM_DIRECTORY\n";
    print L "Start date : ".`date`;
    if ( $MY_MATRIX == 0 ) {
        print L "Matrix : JTT\n";
    }
    elsif ( $MY_MATRIX == 2 ) {
        print L "Matrix : BLOSUM 62\n";
    }
    elsif ( $MY_MATRIX == 3 ) {
        print L "Matrix : mtREV24\n";
    }
    elsif ( $MY_MATRIX == 5 ) {
        print L "Matrix : VT\n";
    }
    elsif ( $MY_MATRIX == 6 ) {
        print L "Matrix : WAG\n";
    }
    elsif ( $MY_MATRIX == 7 ) {
        print L "Matrix : auto\n";
    }
    else {
        print L "Matrix : PAM\n";
    }
} ## startLogfile



# Writes the summary of the run (counts, finish date, names of the ignored
# alignments) to the logfile opened by startLogfile and closes it.
# Last modified: 05/17/01
sub finishLogfile {
    my $j = 0;
    print L "\n\n";
    print L "Successfully calculated $i pairwise distance files.\n";
    print L "Too large alignments (>$MAX_SEQS): $too_large\n";
    print L "Too small alignments (<$MIN_SEQS): $too_small\n";
    print L "Finish date : ".`date`."\n\n";

    print L "List of the $too_large alignments which were ignored because they\n";
    print L "contained too many sequences (>$MAX_SEQS) after pruning:\n";
    for ( $j = 0; $j < $too_large; ++$j ) {
        print L "$too_large_names[ $j ]\n";
    }
    print L "\n\n";
    print L "List of the $too_small alignments which were ignored because they\n";
    print L "contained not enough sequences (<$MIN_SEQS) after pruning:\n";
    for ( $j = 0; $j < $too_small; ++$j ) {
        print L "$too_small_names[ $j ]\n";
    }
    print L "\n";
    close( L );
} ## finishLogfile




diff --git a/forester/archive/perl/pfam2slx.pl b/forester/archive/perl/pfam2slx.pl new file mode 100755 index 0000000..1d591f5 --- /dev/null +++ b/forester/archive/perl/pfam2slx.pl @@ -0,0 +1,94 @@
#! /usr/bin/perl

# Unpack a pfam flatfile, containing many alignments,
# into separate SELEX-format alignment files.
#
# Assumes that ID is the first line in a record,
# that SQ is the last line before the alignment starts,
# and that there is one aligned sequence per line.
#


################################################################
# PFAMSERVER - The Washington University/St. Louis Pfam web server
# Copyright (C) 1995-1999 Washington University School of Medicine
# Copyright (C) 1995-1999 Sanger Centre/Genome Research Ltd.
# Copyright (C) 1998-1999 Karolinska Institutet Center for Genomics Research
# All Rights Reserved
#
# This source code is distributed under the terms of the
# GNU General Public License. See the files COPYRIGHT and LICENSE
# for details.
#
################################################################

$cpl = 50;      # 50 sequence characters per line
$/ = "\n//";    # paragraph mode on // separators

# One pass of this loop handles one record (the text up to and including
# a "//" at the start of a line) and writes one SELEX file named after the
# record's "#=GF ID" line.
while (<>) {
    $in_alignment = 0;
    $nseq = 0;
    $selex_open = 0;    # true once the ID line of this record has opened SELEX
    @name = ();         # start every record with empty name/sequence lists
    @aseq = ();
    @lines = split(/^/);
    while (defined($line = shift(@lines))) {
        if ($in_alignment) {
            if ($line =~ /^\#/) { next; }
            elsif ($line =~ /^(\S+)\s+(\S+)/) {
                $name[$nseq] = $1;
                $aseq[$nseq] = $2;
                $nseq++;
            }
        }
        elsif ($line =~ /^\#=GF\s+ID\s+(\S+)\s*$/) {
            $root = $1;
            print "working on $root\n";
            if (-e $root) {
                # rename() rather than "mv" through the shell: the name
                # comes from the input file and must not be interpreted
                # by a shell.
                rename($root, "$root.orig")
                    || die "$0: cannot move $root to $root.orig: $!\n";
                print "$root exists -- moved to $root.orig\n";
            }
            open(SELEX, ">", $root) || die "$0: cannot create $root: $!\n";
            $selex_open = 1;
            print SELEX "#=ID $root\n";
        }
        elsif (!$selex_open) {
            # Nothing to write to yet, e.g. the newline left over after
            # the "//" of the previous record.
            next;
        }
        elsif ($line =~ /^\#=GF\s+AC\s+(.+)$/) { print SELEX "#=AC $1\n"; }
        elsif ($line =~ /^\#=GF\s+DE\s+(.+)$/) { print SELEX "#=DE $1\n"; }

        elsif ($line =~ /^\#=GF\s+GA\s+(\S+)\s+(\S+)/)
        { print SELEX "#=GA $1 $2\n"; }

        elsif ($line =~ /^\#=GF\s+TC\s+(\S+)\s+(\S+)/)
        { print SELEX "#=TC $1 $2\n"; }

        elsif ($line =~ /^\#=GF\s+NC\s+(\S+)\s+(\S+)/)
        { print SELEX "#=NC $1 $2\n"; }

        elsif ($line =~ /^\#=GF\s+SQ\s+\d+/) {
            print SELEX "# $line";
            $in_alignment = 1;
        }
        elsif ($line =~ /^\/\//) {
            last;
        }
        else {
            print SELEX "# $line";
        }
    }

    # A chunk without an ID line (such as the text after the last "//")
    # has no output file.
    next unless $selex_open;

    # figure out maximum name length
    $maxnamelen = 0;
    for ($idx = 0; $idx < $nseq; $idx++) {
        if (length($name[$idx]) > $maxnamelen) {
            $maxnamelen = length($name[$idx]);
        }
    }
    # break the alignment across
    # multiple lines
    $alen = $nseq ? length($aseq[0]) : 0;
    for ($pos = 0; $pos < $alen; $pos += $cpl) {
        for ($idx = 0; $idx < $nseq; $idx++) {
            printf(SELEX "%-${maxnamelen}s %s\n",
                   $name[$idx], substr($aseq[$idx], $pos, $cpl));
        }
        print SELEX "\n";
    }
    close(SELEX) || die "$0: error closing $root: $!\n";
}

diff --git
a/forester/archive/perl/phylo_pl.pl b/forester/archive/perl/phylo_pl.pl new file mode 100755 index 0000000..b6b40bd --- /dev/null +++ b/forester/archive/perl/phylo_pl.pl @@ -0,0 +1,1735 @@ +#!/usr/bin/perl -W +# +# $Id: phylo_pl.pl,v 1.32 2010/12/13 19:00:22 cmzmasek Exp $ +# +# FORESTER -- software libraries and applications +# for evolutionary biology research and applications. +# +# Copyright (C) 2008-2009 Christian M. Zmasek +# Copyright (C) 2008-2009 Burnham Institute for Medical Research +# All rights reserved +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +# +# Contact: phylosoft @ gmail . com +# WWW: www.phylosoft.org/forester +# +# +# +# Requirements phylo_pl is part of the FORESTER libraries. +# ------------ Many of its global variables are set via forester.pm. +# +# +# Note. Use xt.pl (for Pfam alignments) or mt.pl (for other alignments) +# to run phylo_pl.pl on whole directories of alignments files. +# +# +# +# +# ========================= +# +# METHOD ORDER (IMPORTANT!) +# 1. FastME +# 2. phylip NJ +# 3. phylip fitch FM +# 4. phylip fitch ME +# 5. BIONJ +# 6. Weighbor +# 7. Raxml +# 8. phyml +# 9. phylip proml +# 10. phylip protpars +# 11. 
all +#========================== + +use strict; + +use FindBin; +use lib $FindBin::Bin; +use forester; + +my $VERSION = "1.0.1"; +my $LAST_MODIFIED = "2009.10.02"; + +my $RAXML_MODEL_BASE = "PROTGAMMA"; +my $RAXML_ALGORITHM = "a"; + +my $TEMP_DIR_DEFAULT = "/tmp/phylo_pl_"; # Where all the infiles, outfiles, etc will be created. + +my $bootstraps = 100; # 0,1: do not bootstrap. Default: 100 +my $matrix = 5; # 0 = JTT + # 1 = PAM + # 2 = BLOSUM 62 + # 3 = mtREV24 + # 5 = VT - default + # 6 = WAG + # 7 = auto in puzzle + # 8 = DCMut in PHYML, VT in TREE-PUZZLE +my $rate_heterogeneity = 0; # 0 = Uniform rate (default) + # 1 = 8 Gamma distributed rates + # 2 = Two rates (1 invariable + 1 variable) + # 3 = Mixed (1 invariable + 8 Gamma rates) +my $seed = 9; # Seed for random number generators. Default: 9 +my $keep_multiple_trees = 0; # 0: delete multiple tree file + # 1: do not delete multiple tree file +my $exact_parameter_est = 0; # 0: no; 1: yes + +my $phyml_rel_substitution_rate_cat = 4; + +my $jumbles = 2; +my $use_fastme = 0; # 0: no; 1: yes +my $use_phylip_nj = 0; # 0: no; 1: yes +my $use_phylip_fitch_fm = 0; # 0: no; 1: yes +my $use_phylip_fitch_me = 0; # 0: no; 1: yes +my $use_bionj = 0; # 0: no; 1: yes +my $use_weighbor = 0; # 0: no; 1: yes +my $use_raxml = 0; # 0: no; 1: yes +my $use_phyml = 0; # 0: no; 1: yes +my $use_proml = 0; # 0: no; 1: yes +my $use_protpars = 0; # 0: no; 1: yes +my $use_global_rearr = 0; # 0: no; 1: yes +my $estimate_invar_sites = 0; # 0: no; 1: yes + +my $fastme_init_tree_opt = "NJ"; + +my %seqnames = (); # number => seqname +my %numbers = (); # seqname => number +my $options = ""; +my $infile = ""; +my $pwdfile = ""; +my $outfile = ""; +my $logfile = ""; +my $multipwdfile = ""; +my $distancefile = ""; +my $log = ""; +my $ii = 0; +my $temp_dir = ""; +my $current_dir = ""; +my @out = (); +my $number_of_seqs = 0; +my $number_of_aa = 0; + +my $use_pwd_based_methods = 0; + +print( "\n"); +print( "phylo_pl $VERSION ($LAST_MODIFIED)\n" ); 
+print( "__________________________________\n"); +print( "\n\n"); + + + +unless ( @ARGV == 2 || @ARGV == 3 || @ARGV == 4 || @ARGV == 5 ) { + &printUsage(); + exit ( -1 ); +} + + + +# Analyzes the options: +# --------------------- + +if ( $ARGV[ 0 ] =~ /^-.+/ ) { + + unless ( @ARGV > 2 ) { + &printUsage(); + exit ( -1 ); + } + $options = $ARGV[ 0 ]; + + if ( @ARGV != 3 && @ARGV != 4 ) { + &printUsage(); + exit ( -1 ); + } + $infile = $ARGV[ 1 ]; + $outfile = $ARGV[ 2 ]; + if ( @ARGV == 4 ) { + $temp_dir = $ARGV[ 3 ]; + } + if ( $options =~ /B(\d+)/ ) { + $bootstraps = $1; + if ( $bootstraps <= 1 ) { + $bootstraps = 0; + } + elsif ( $bootstraps <= 9 ) { + $bootstraps = 0; + print "\n\nphylo_pl: WARNING: Bootstrap number must be devisable by 10,\nno bootstrapping.\n\n"; + } + elsif ( $bootstraps % 10 != 0 ) { + $bootstraps = $bootstraps - $bootstraps % 10; # to ensure $bootstraps % 10 == 0 + print "\n\nphylo_pl: WARNING: Bootstrap number must be devisable by 10,\nset to $bootstraps.\n\n"; + } + } + if ( $options =~ /n/ ) { + $use_phylip_nj = 1; + } + if ( $options =~ /q@(\d)/ ) { + $use_fastme = 1; + my $opt = $1; + if ( $opt == 1 ) { + $fastme_init_tree_opt = "GME"; + } + elsif ( $opt == 2 ) { + $fastme_init_tree_opt = "BME"; + } + elsif ( $opt == 3 ) { + $fastme_init_tree_opt = "NJ"; + } + else { + &printUsage(); + exit ( -1 ); + } + } + if ( $options =~ /f/ ) { + $use_phylip_fitch_fm = 1; + } + if ( $options =~ /e/ ) { + $use_phylip_fitch_me = 1; + } + if ( $options =~ /b/ ) { + $use_bionj = 1; + } + if ( $options =~ /w/ ) { + $use_weighbor = 1; + } + if ( $options =~ /x/ ) { + $use_raxml = 1; + } + if ( $options =~ /y/ ) { + $use_phyml = 1; + } + if ( $options =~ /o/ ) { + $use_proml = 1; + } + if ( $options =~ /p/ ) { + $use_protpars = 1; + } + if ( $options =~ /G/ ) { + $use_global_rearr = 1; + } + if ( $options =~ /I/ ) { + $estimate_invar_sites = 1; + } + if ( $options =~ /j(\d+)/ ) { + $jumbles = $1; + if ( $jumbles < 1 ) { + $jumbles = 0; + } + } + if ( 
$options =~ /r(\d+)/ ) { + $phyml_rel_substitution_rate_cat = $1; + if ( $phyml_rel_substitution_rate_cat < 1 ) { + $phyml_rel_substitution_rate_cat = 1; + } + } + if ( $options =~ /J/ ) { + $matrix = 0; # JTT + } + if ( $options =~ /P/ ) { + $matrix = 1; # PAM + } + if ( $options =~ /L/ ) { + $matrix = 2; # Blosum 62 + } + if ( $options =~ /M/ ) { + $matrix = 3; # mtREV24 + } + if ( $options =~ /W/ ) { + $matrix = 6; # WAG + } + if ( $options =~ /A/ ) { + $matrix = 7; # auto + } + if ( $options =~ /D/ ) { + $matrix = 8; # DCMut in PHYML and RAXML, VT in PUZZLE + } + if ( $options =~ /S(\d+)/ ) { + $seed = $1; + } + if ( $options =~ /X/ ) { + $keep_multiple_trees = 1; + } + if ( $options =~ /E/ ) { + $exact_parameter_est = 1; + } + if ( $options =~ /g/ ) { + $rate_heterogeneity = 1; + } + if ( $options =~ /t/ ) { + $rate_heterogeneity = 2; + } + if ( $options =~ /m/ ) { + $rate_heterogeneity = 3; + } +} +else { + unless ( @ARGV == 2 || @ARGV == 3 ) { + &printUsage(); + exit ( -1 ); + } + $infile = $ARGV[ 0 ]; + $outfile = $ARGV[ 1 ]; + if ( @ARGV == 3 ) { + $temp_dir = $ARGV[ 2 ]; + } +} + +if ( $use_fastme != 1 && + $use_phylip_nj != 1 && + $use_phylip_fitch_fm != 1 && + $use_phylip_fitch_me != 1 && + $use_bionj != 1 && + $use_weighbor != 1 && + $use_raxml != 1 && + $use_phyml != 1 && + $use_proml != 1 && + $use_protpars != 1 ) { + + $use_fastme = 1; + $use_phylip_nj = 1; + $use_phylip_fitch_fm = 1; + $use_phylip_fitch_me = 1; + $use_bionj = 1; + $use_raxml = 1; + $use_weighbor = 1; + $use_phyml = 1; + $use_proml = 1; + $use_protpars = 1; +} + + +if ( $use_fastme == 1 || + $use_phylip_nj == 1 || + $use_phylip_fitch_fm == 1 || + $use_phylip_fitch_me == 1 || + $use_bionj == 1 || + $use_weighbor == 1 ) { + $use_pwd_based_methods = 1; +} +else { + $use_pwd_based_methods = 0; +} + +$current_dir = `pwd`; +$current_dir =~ s/\s//; + +if ( $outfile !~ /^\// ) { + # outfile is not absolute path. 
+ $outfile = $current_dir."/".$outfile; +} + + + +# TREE-PUZZLE sets the option in this way: +# If two rates or mixed, exact parameter estimates are used. +if ( $rate_heterogeneity == 2 +|| $rate_heterogeneity == 3 ) { + $exact_parameter_est = 1 +} + + +if ( $outfile =~ /\.xml$/i ) { + $outfile =~ s/\.xml//i; +} +elsif ( $outfile =~ /\.aln$/i ) { + $outfile =~ s/\.aln//i; +} +elsif ( $outfile =~ /\.fasta$/i ) { + $outfile =~ s/\.fasta//i; +} +elsif ( $outfile =~ /\.fas$/i ) { + $outfile =~ s/\.fas//i; +} +elsif ( $outfile =~ /\.seqs$/i ) { + $outfile =~ s/\.seqs//i; +} + + +$logfile = $outfile.$LOG_FILE_SUFFIX; +$multipwdfile = $outfile.$MULTIPLE_PWD_FILE_SUFFIX; +$distancefile = $outfile.$SUFFIX_PWD_NOT_BOOTS; + +&dieIfFileExists( $logfile ); +&dieIfFileExists( $multipwdfile ); +&dieIfFileExists( $distancefile ); + + +my $fastme_outtree = $outfile."_fme.xml"; +my $phylip_nj_outtree = $outfile."_pnj.xml"; +my $phylip_fm_outtree = $outfile."_pfm.xml"; +my $phylip_me_outtree = $outfile."_pme.xml"; +my $bionj_outtree = $outfile."_bionj.xml"; +my $weighbor_outtree = $outfile."_weigh.xml"; +my $raxml_outtree = $outfile."_raxml.xml"; +my $phyml_outtree = $outfile."_phyml.xml"; +my $proml_outtree = $outfile."_proml.xml"; +my $protpars_outtree = $outfile."_ppp.xml"; +my $all_outtree = $outfile."_comb.xml"; + +my $multitreefile_fastme = $outfile."_fme".$MULTIPLE_TREES_FILE_SUFFIX; +my $multitreefile_phylip_nj = $outfile."_pnj".$MULTIPLE_TREES_FILE_SUFFIX; +my $multitreefile_phylip_fm = $outfile."_pfm".$MULTIPLE_TREES_FILE_SUFFIX; +my $multitreefile_phylip_me = $outfile."_pme".$MULTIPLE_TREES_FILE_SUFFIX; +my $multitreefile_bionj = $outfile."_bionj".$MULTIPLE_TREES_FILE_SUFFIX; +my $multitreefile_weighbor = $outfile."_weigh".$MULTIPLE_TREES_FILE_SUFFIX; +my $multitreefile_raxml = $outfile."_raxml".$MULTIPLE_TREES_FILE_SUFFIX; +my $multitreefile_phyml = $outfile."_phyml".$MULTIPLE_TREES_FILE_SUFFIX; +my $multitreefile_proml = $outfile."_proml".$MULTIPLE_TREES_FILE_SUFFIX; +my 
$multitreefile_protpars = $outfile."_ppp".$MULTIPLE_TREES_FILE_SUFFIX; + +if ( $use_fastme == 1 ) { + &dieIfFileExists( $fastme_outtree ); + if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { + &dieIfFileExists( $multitreefile_fastme ); + } +} +if( $use_phylip_nj == 1 ) { + &dieIfFileExists( $phylip_nj_outtree ); + if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { + &dieIfFileExists( $multitreefile_phylip_nj ); + } +} +if( $use_phylip_fitch_fm == 1 ) { + &dieIfFileExists( $phylip_fm_outtree ); + if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { + &dieIfFileExists( $multitreefile_phylip_fm ); + } +} +if( $use_phylip_fitch_me == 1 ) { + &dieIfFileExists( $phylip_me_outtree ); + if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { + &dieIfFileExists( $multitreefile_phylip_me ); + } +} +if( $use_bionj == 1 ) { + &dieIfFileExists( $bionj_outtree ); + if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { + &dieIfFileExists( $multitreefile_bionj ); + } +} +if( $use_weighbor == 1 ) { + &dieIfFileExists( $weighbor_outtree ); + if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { + &dieIfFileExists( $multitreefile_weighbor ); + } +} +if( $use_raxml == 1 ) { + &dieIfFileExists( $raxml_outtree ); + if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { + &dieIfFileExists( $multitreefile_raxml ); + } +} +if( $use_phyml == 1 ) { + &dieIfFileExists( $phyml_outtree ); + if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { + &dieIfFileExists( $multitreefile_phyml ); + } +} +if( $use_proml == 1 ) { + &dieIfFileExists( $proml_outtree ); + if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { + &dieIfFileExists( $multitreefile_proml ); + } +} +if( $use_protpars == 1 ) { + &dieIfFileExists( $protpars_outtree ); + if ( $keep_multiple_trees == 1 && $bootstraps > 1 ) { + &dieIfFileExists( $multitreefile_protpars ); + } +} +if ( $bootstraps > 1 ) { + &dieIfFileExists( $all_outtree ); +} +if ( $infile ne "" ) { + unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) { + 
die "\n\nphylo_pl: Input alignment file \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n"; + } +} + + + + +# Prints out the options: +# ----------------------- + + +$log = "\n$0 logfile:\n"; +$log = $log."Version: $VERSION\n\n"; + + + +if ( $infile ne "" ) { + $log = $log."Input : $infile\n"; +} + +if ( $keep_multiple_trees == 1 && $bootstraps >= 2 ) { + $log = $log."Multiple distance matrices : $multipwdfile\n"; +} + + +$log = $log."Bootstraps : $bootstraps\n"; + +if ( $use_pwd_based_methods == 1 ) { + $log = $log."Prgrm to calculate pairwise dist. : TREE-PUZZLE (version: $PUZZLE_VERSION)\n"; +} + + +if ( $use_fastme == 1 ) { + $log = $log."Program to calculate tree : FastME (version: $FASTME_VERSION)\n"; + $log = $log."Method for intial tree in FastME : $fastme_init_tree_opt\n"; + $log = $log."Tree swapping (NNI) in FastME : balanced (default)\n"; +} +if ( $use_phylip_nj == 1 ) { + $log = $log."Program to calculate tree : PHYLIP NEIGHBOR NJ (version: $PHYLIP_VERSION)\n"; +} +if ( $use_phylip_fitch_fm == 1 ) { + $log = $log."Program to calculate tree : PHYLIP FITCH Fitch-Margoliash (version: $PHYLIP_VERSION)\n"; +} +if ( $use_phylip_fitch_me == 1 ) { + $log = $log."Program to calculate tree : PHYLIP FITCH Minimal Evolution (version: $PHYLIP_VERSION)\n"; +} +if ( $use_bionj == 1 ) { + $log = $log."Program to calculate tree : BIONJ (version: $BIONJ_VERSION)\n"; +} +if ( $use_weighbor == 1 ) { + $log = $log."Program to calculate tree : Weighbor [no invariable sites, b=14] (version: $WEIGHBOR_VERSION)\n"; +} +if ( $use_raxml == 1 ) { + $log = $log."Program to calculate tree : RAxML [$RAXML_MODEL_BASE] (uses its own bootstraps, if bootstrapped: -f $RAXML_ALGORITHM) (version: $RAXML_VERSION)\n"; +} +if ( $use_phyml == 1 ) { + $log = $log."Program to calculate tree : PHYML (MLE for gamma distr param and proportion of inv sites) (version: $PHYML_VERSION)\n"; + $log = $log."# of rel subst rate categories : $phyml_rel_substitution_rate_cat\n"; +} +if ( 
$use_proml == 1 ) { + $log = $log."Program to calculate tree : PHYLIP PROML (uses PAM unless JTT selected) (version: $PHYLIP_VERSION)\n"; +} +if ( $use_protpars == 1 ) { + $log = $log."Program to calculate tree : PHYLIP PROTPARS (with global rearrangements) (version: $PHYLIP_VERSION)\n"; +} +if ( $use_phylip_fitch_fm == 1 || $use_phylip_fitch_me == 1 || $use_protpars == 1 || $use_proml ) { + $log = $log."Number of jumbles (input order rand): $jumbles\n"; + +} +if ( $use_phylip_fitch_fm == 1 || $use_phylip_fitch_me == 1 || $use_proml ) { + if ( $use_global_rearr == 1 ) { + $log = $log."Global rearrangements : true\n"; + } + else { + $log = $log."Global rearrangements : false\n"; + + } +} + +if ( $bootstraps > 0 ) { + $log = $log."Prgrm to calculate ML branch lenghts: TREE-PUZZLE (version: $PUZZLE_VERSION)\n"; +} + +$log = $log."Model : "; +if ( $matrix == 0 ) { + $log = $log."JTT (Jones et al. 1992)\n"; +} +elsif ( $matrix == 1 ) { + $log = $log."PAM (Dayhoff et al. 1978)\n"; +} +elsif ( $matrix == 2 ) { + $log = $log."BLOSUM 62 (Henikoff-Henikoff 92)\n"; +} +elsif ( $matrix == 3 ) { + $log = $log."mtREV24 (Adachi-Hasegawa 1996)\n"; +} +elsif ( $matrix == 5 ) { + $log = $log."VT (Mueller-Vingron 2000)\n"; +} +elsif ( $matrix == 6 ) { + $log = $log."WAG (Whelan-Goldman 2000)\n"; +} +elsif ( $matrix == 7 ) { + $log = $log."auto in TREE-PUZZLE\n"; +} +elsif ( $matrix == 8 ) { + $log = $log."DCMut (Kosial and Goldman, 2005) in PHYML and RAxML, VT in TREE-PUZZLE\n"; +} +else { + &dieWithUnexpectedError( "Unknown model: matrix=$matrix" ); +} +if ( $use_raxml == 1 || $use_phyml == 1 ) { + if ( $estimate_invar_sites == 1 ) { + $log = $log."Estimate proportion of invariable sites in RAXML and/or PHYML: true\n"; + } + else { + $log = $log."Estimate proportion of invariable sites in RAXML and/or PHYML: false (proportion \"0.0\" is used in PHYML)\n"; + } +} + +$log = $log."Model of rate heterogeneity (PUZZLE): "; +if ( $rate_heterogeneity == 1 ) { + $log = $log."8 Gamma 
distributed rates\n"; +} +elsif ( $rate_heterogeneity == 2 ) { + $log = $log."Two rates (1 invariable + 1 variable)\n"; +} +elsif ( $rate_heterogeneity == 3 ) { + $log = $log."Mixed (1 invariable + 8 Gamma rates)\n"; +} +else { + $log = $log."Uniform rate\n"; +} +$log = $log."Seed for random number generators : $seed\n"; +if ( $exact_parameter_est == 1 ) { + $log = $log."Exact parameter estimates in TREE-PUZZLE\n"; +} + +$log = $log."Start time/date : ".`date`; + + + + +# That's where the mischief starts.... +# ------------------------------------ + +$ii = 0; + +srand(); +my $range = 1000000; +my $random_number = int( rand( $range ) ); + +if ( $temp_dir eq "" ) { + $temp_dir = $TEMP_DIR_DEFAULT; +} + +$temp_dir = $temp_dir.$random_number.$ii; + +while ( -e $temp_dir ) { + $ii++; + $temp_dir = $temp_dir.$random_number.$ii; +} + +mkdir( $temp_dir, 0700 ) +|| die "\n\n$0: Could not create <<$temp_dir>>: $!\n\n"; + +unless ( ( -e $temp_dir ) && ( -d $temp_dir ) ) { + die "\n\n$0: <<$temp_dir>> does not exist, or is not a directory.\n\n"; +} + + +&cp( $infile, $temp_dir."/INFILE" ); +unless ( chmod ( 0600, $temp_dir."/INFILE" ) ) { + warn "\n\n$0: Could not chmod. 
$!\n\n"; +} +$infile = "INFILE"; + +chdir ( $temp_dir ) +|| die "\n\n$0: Could not chdir to <<$temp_dir>>: $!\n\n"; + +&cp( $infile, "infile" ); +@out = &getNumberOfSeqsAndAas( $infile ); +$number_of_seqs = $out[ 0 ]; +$number_of_aa = $out[ 1 ]; + +my $SEQBOOT_OUTFILE = "seqboot_outfile"; + +if ( $bootstraps > 1 && ( $use_pwd_based_methods == 1 + || $use_phyml == 1 + || $use_proml == 1 + || $use_protpars == 1 ) ) { + &executeSeqboot( $seed, $bootstraps ); + &mv( "outfile", $SEQBOOT_OUTFILE ); + &rm( "infile" ); +} + +&cp( $infile, "align" ); + +if ( $use_pwd_based_methods == 1 ) { + # Calculating the pairwise distances (saved in file "infile"): "puzzle" + if ( $bootstraps > 1 ) { + &executePuzzleBootstrapped( $SEQBOOT_OUTFILE, + $matrix, + $number_of_seqs, + $exact_parameter_est, + $rate_heterogeneity ); + + $pwdfile = $SEQBOOT_OUTFILE.".dist"; + } + else { + &executePuzzle( "infile", + $matrix, + $number_of_seqs, + $exact_parameter_est, + $rate_heterogeneity ); + $pwdfile = "infile.dist"; + } +} + +&rm( "infile" ); + +# Methods based on alignment +# -------------------------- +my $OUTTREE_RAXML = "outtree_rax"; +my $OUTTREE_PHYML = "outtree_phyml"; +my $OUTTREE_PROML = "outtree_proml"; +my $OUTTREE_PROTPARS = "outtree_protpars"; + +my $CONSENSUS_RAXML = "consensus_raxml"; +my $CONSENSUS_PHYML = "consensus_phyml"; +my $CONSENSUS_PROML = "consensus_proml"; +my $CONSENSUS_PROTPARS = "consensus_protpars"; + +my $OUTTREES_ALL = "outtrees_all"; +my $all_count = 0; + +if ( $use_raxml == 1 ) { + + my $model = "---"; + if ( $matrix == 0 ) { + $model = "JTT"; + } + elsif ( $matrix == 1 ) { + $model = "DAYHOFF"; + } + elsif ( $matrix == 2 ) { + $model = "BLOSUM62"; + } + elsif ( $matrix == 3 ) { + $model = "MTREV"; + } + elsif ( $matrix == 5 ) { + $model = "VT"; + } + elsif ( $matrix == 6 ) { + $model = "WAG"; + } + elsif ( $matrix == 7 ) { + $model = "VT"; + } + elsif ( $matrix == 8 ) { + $model = "DCMUT"; + } + else { + &dieWithUnexpectedError( "Unknown model: 
matrix=$matrix" ); + } + + print( "\n========== RAxML begin =========\n\n" ); + # Six arguments: + # 1. DNA or Amino-Acids sequence filename (PHYLIP format) + # 2. Model, eg. PROTGAMMAIVT + # 3. Replicates (bootstrap) + # 4. Seed for bootstrap + # 5. Output suffix + # 6. Algorithm (only for bootstrap, default otherwise) + my $invar = ""; + if ( $estimate_invar_sites == 1 ) { + $invar = "I"; + } + + # NOTE. RaxML does its own bootstrapping. + &executeRaxml( "align", $RAXML_MODEL_BASE.$invar.$model."F", $bootstraps, $seed, "xxx", $RAXML_ALGORITHM ); + print( "\n========== RAxML end =========\n\n" ); + + &rm( "RAxML_log.xxx" ); + &rm( "RAxML_parsimonyTree.xxx" ); + &mv( "RAxML_info.xxx", $outfile."_raxml_info" ); + if ( $bootstraps > 1 ) { + &rm( "RAxML_bestTree.xxx" ); + &mv( "RAxML_bipartitions.xxx", $CONSENSUS_RAXML ); + &append( "RAxML_bootstrap.xxx", $OUTTREES_ALL ); + if ( $keep_multiple_trees == 1 ) { + &mv( "RAxML_bootstrap.xxx", $multitreefile_raxml ); + } + else { + &rm( "RAxML_bootstrap.xxx" ); + } + $all_count++; + } + else { + &mv( "RAxML_result.xxx", $OUTTREE_RAXML ); + } +} + + +if ( $use_phyml == 1 ) { + + my $model = "---"; + if ( $matrix == 0 ) { + $model = "JTT"; + } + elsif ( $matrix == 1 ) { + $model = "Dayhoff"; + } + elsif ( $matrix == 2 ) { + $model = "Blosum62"; + } + elsif ( $matrix == 3 ) { + $model = "MtREV"; + } + elsif ( $matrix == 5 ) { + $model = "VT"; + } + elsif ( $matrix == 6 ) { + $model = "WAG"; + } + elsif ( $matrix == 7 ) { + $model = "VT"; + } + elsif ( $matrix == 8 ) { + $model = "DCMut"; + } + else { + &dieWithUnexpectedError( "Unknown model: matrix=$matrix" ); + } + + my $input = ""; + if ( $bootstraps > 1 ) { + $input = $SEQBOOT_OUTFILE; + } + else { + $input = "align"; + } + print( "\n========== PHYML begin =========\n\n" ); + # Six arguments: + # 1. DNA or Amino-Acids sequence filename (PHYLIP format) + # 2. number of data sets to analyse (ex:3) + # 3. 
Model: JTT | MtREV | Dayhoff | WAG | VT | DCMut | Blosum62 (Amino-Acids) + # 4. number of relative substitution rate categories (ex:4), positive integer + # 5. starting tree filename (Newick format), your tree filename | BIONJ for a distance-based tree + # 6. 1 to estimate proportion of invariable sites, otherwise, fixed proportion "0.0" is used + # PHYML produces several results files : + # _phyml_lk.txt : likelihood value(s) + # _phyml_tree.txt : inferred tree(s) + # _phyml_stat.txt : detailed execution stats + &executePhyml( $input, $bootstraps, $model, $phyml_rel_substitution_rate_cat, "BIONJ", $estimate_invar_sites ); + print( "\n========== PHYML end =========\n\n" ); + + &rm( $input."_phyml_lk.txt" ); + &mv( $input."_phyml_tree.txt", $OUTTREE_PHYML ); + if ( -e $outfile."_phyml_stat" ) { + &rm( $outfile."_phyml_stat" ); + } + &mv( $input."_phyml_stat.txt", $outfile."_phyml_stat" ); + if ( $bootstraps > 1 ) { + &append( $OUTTREE_PHYML, $OUTTREES_ALL ); + $all_count++; + } + +} + +if ( $use_proml == 1 ) { + my $input = ""; + if ( $bootstraps > 1 ) { + $input = $SEQBOOT_OUTFILE; + } + else { + $input = "align"; + } + print( "\n========== PHYLIP PROML begin =========\n\n" ); + # Five arguments: + # 1. name of alignment file (in correct format!) + # 2. number of bootstraps + # 3. jumbles: 0: do not jumble; >=1 number of jumbles + # 4. seed for random number generator + # 5. 
1 for PAM instead of JTT + my $use_pam = 1; + if ( $matrix == 0 ) { + $use_pam = 0; + } + &executeProml( $input, $bootstraps, $jumbles, $seed, $use_pam, $use_global_rearr ); + print( "\n========== PHYLIP PROML end =========\n\n" ); + &mv( "outtree", $OUTTREE_PROML ); + &rm( "outfile" ); + if ( $bootstraps > 1 ) { + &append( $OUTTREE_PROML, $OUTTREES_ALL ); + $all_count++; + } +} + + +if ( $use_protpars == 1 ) { + my $input = ""; + if ( $bootstraps > 1 ) { + $input = $SEQBOOT_OUTFILE; + } + else { + $input = "align"; + } + print( "\n========== PHYLIP PROTPARS begin =========\n\n" ); + &executeProtpars( $input, $bootstraps, $jumbles, $seed ); + print( "\n========== PHYLIP PROTPARS end =========\n\n" ); + &mv( "outtree", $OUTTREE_PROTPARS ); + &rm( $outfile."_protpars_outfile" ); + &mv( "outfile", $outfile."_protpars_outfile" ); + if ( $bootstraps > 1 ) { + &append( $OUTTREE_PROTPARS, $OUTTREES_ALL ); + $all_count++; + } +} + + + +# Methods based on PWD +# -------------------- +my $OUTTREE_FASTME = "outtree_fastme"; +my $OUTTREE_PHYLIP_NJ = "outtree_phylip_nj"; +my $OUTTREE_PHYLIP_FM = "outtree_phylip_fm"; +my $OUTTREE_PHYLIP_ME = "outtree_phylip_me"; +my $OUTTREE_BIONJ = "outtree_bionj"; +my $OUTTREE_WEIGHBOR = "outtree_weighbor"; + +my $CONSENSUS_FASTME = "consensus_fastme"; +my $CONSENSUS_PHYLIP_NJ = "consensus_phylip_nj"; +my $CONSENSUS_PHYLIP_FM = "consensus_phylip_fm"; +my $CONSENSUS_PHYLIP_ME = "consensus_phylip_me"; +my $CONSENSUS_BIONJ = "consensus_bionj"; +my $CONSENSUS_WEIGHBOR = "consensus_weighbor"; +my $CONSENSUS_ALL = "consensus_all"; + + +if ( $use_fastme == 1 ) { + print( "\n========== FASTME begin =========\n\n" ); + &executeFastme( $pwdfile, $bootstraps, $fastme_init_tree_opt ); + print( "\n========== FASTME end ===========\n\n" ); + &rm( "output.d" ); + &mv( "output.tre", $OUTTREE_FASTME ); + if ( $bootstraps > 1 ) { + &append( $OUTTREE_FASTME, $OUTTREES_ALL ); + $all_count++; + } +} +if ( $use_phylip_nj ) { + print( "\n========== PHYLIP NEIGHBOR 
begin =========\n\n" ); + &executeNeighbor( $pwdfile, $bootstraps, $seed, 0 ); + print( "\n========== PHYLIP NEIGHBOR end =========\n\n" ); + &mv( "outtree", $OUTTREE_PHYLIP_NJ ); + &rm( "outfile" ); + if ( $bootstraps > 1 ) { + &append( $OUTTREE_PHYLIP_NJ, $OUTTREES_ALL ); + $all_count++; + } +} +if ( $use_phylip_fitch_fm ) { + print( "\n========== PHYLIP FITCH FM begin =========\n\n" ); + &executeFitch( $pwdfile, $bootstraps, $seed, $jumbles, 0, "FM", $use_global_rearr ); + print( "\n========== PHYLIP FITCH FM end =========\n\n" ); + &mv( "outtree", $OUTTREE_PHYLIP_FM ); + &rm( "outfile" ); + if ( $bootstraps > 1 ) { + &append( $OUTTREE_PHYLIP_FM, $OUTTREES_ALL ); + $all_count++; + } +} +if ( $use_phylip_fitch_me ) { + print( "\n========== PHYLIP FITCH ME begin =========\n\n" ); + &executeFitch( $pwdfile, $bootstraps, $seed, $jumbles, 0, "ME", $use_global_rearr ); + print( "\n========== PHYLIP FITCH ME end =========\n\n" ); + &mv( "outtree", $OUTTREE_PHYLIP_ME ); + &rm( "outfile" ); + if ( $bootstraps > 1 ) { + &append( $OUTTREE_PHYLIP_ME, $OUTTREES_ALL ); + $all_count++; + } +} +if ( $use_bionj ) { + print( "\n========== BIONJ begin =========\n\n" ); + &executeBionj( $pwdfile, $OUTTREE_BIONJ ); + print( "\n========== BIONJ end =========\n\n" ); + if ( $bootstraps > 1 ) { + &append( $OUTTREE_BIONJ, $OUTTREES_ALL ); + $all_count++; + } +} +if ( $use_weighbor ) { + print( "\n========== WEIGHBOR begin =========\n\n" ); + &executeWeighbor( $number_of_aa, 14, $pwdfile, $OUTTREE_WEIGHBOR ); + print( "\n========== WEIGHBOR end =========\n\n" ); + if ( $bootstraps > 1 ) { + &append( $OUTTREE_WEIGHBOR, $OUTTREES_ALL ); + $all_count++; + } +} + + + +if ( $bootstraps > 1 ) { + # Consense: + if ( $use_fastme == 1 ) { + &consense( $OUTTREE_FASTME, $CONSENSUS_FASTME ); + } + if ( $use_phylip_nj == 1 ) { + &consense( $OUTTREE_PHYLIP_NJ, $CONSENSUS_PHYLIP_NJ ); + } + if ( $use_phylip_fitch_fm == 1 ) { + &consense( $OUTTREE_PHYLIP_FM, $CONSENSUS_PHYLIP_FM ); + } + if ( 
$use_phylip_fitch_me == 1 ) { + &consense( $OUTTREE_PHYLIP_ME, $CONSENSUS_PHYLIP_ME ); + } + if ( $use_bionj == 1 ) { + &consense( $OUTTREE_BIONJ, $CONSENSUS_BIONJ ); + } + if ( $use_weighbor == 1 ) { + &consense( $OUTTREE_WEIGHBOR, $CONSENSUS_WEIGHBOR ); + } + if ( $use_phyml == 1 ) { + &consense( $OUTTREE_PHYML, $CONSENSUS_PHYML ); + } + if ( $use_proml == 1 ) { + &consense( $OUTTREE_PROML, $CONSENSUS_PROML ); + } + if ( $use_protpars == 1 ) { + &consense( $OUTTREE_PROTPARS, $CONSENSUS_PROTPARS ); + } + if ( $all_count > 1 ) { + &consense( $OUTTREES_ALL, $CONSENSUS_ALL ); + } + else { + &rm( $OUTTREES_ALL ); + } + + my $INTREE_FOR_PUZZLE = "intree"; #why so serious? + &rm( $INTREE_FOR_PUZZLE ); + system( "touch", $INTREE_FOR_PUZZLE ) + && die("\n\n$0: could not \"touch $INTREE_FOR_PUZZLE\": $!\n\n"); + + if ( $use_fastme == 1 ) { + &append( $CONSENSUS_FASTME, $INTREE_FOR_PUZZLE ); + } + if ( $use_phylip_nj == 1 ) { + &append( $CONSENSUS_PHYLIP_NJ, $INTREE_FOR_PUZZLE ); + } + if ( $use_phylip_fitch_fm == 1 ) { + &append( $CONSENSUS_PHYLIP_FM, $INTREE_FOR_PUZZLE ); + } + if ( $use_phylip_fitch_me == 1 ) { + &append( $CONSENSUS_PHYLIP_ME, $INTREE_FOR_PUZZLE ); + } + if ( $use_bionj == 1 ) { + &append( $CONSENSUS_BIONJ, $INTREE_FOR_PUZZLE ); + } + if ( $use_weighbor == 1 ) { + &append( $CONSENSUS_WEIGHBOR, $INTREE_FOR_PUZZLE ); + } + if ( $use_raxml == 1 ) { + # Needed, because TREE-PUZZLE adds internal labels for all subsequent trees + # when evaluating given trees (this seems a strange behaviour). 
+ removeSupportValues( $CONSENSUS_RAXML, $CONSENSUS_RAXML."_support_removed" ); + &append( $CONSENSUS_RAXML."_support_removed", $INTREE_FOR_PUZZLE ); + &rm( $CONSENSUS_RAXML."_support_removed" ); + } + if ( $use_phyml == 1 ) { + &append( $CONSENSUS_PHYML, $INTREE_FOR_PUZZLE ); + } + if ( $use_proml == 1 ) { + &append( $CONSENSUS_PROML, $INTREE_FOR_PUZZLE ); + } + if ( $use_protpars == 1 ) { + &append( $CONSENSUS_PROTPARS, $INTREE_FOR_PUZZLE ); + } + if ( $all_count > 1 ) { + &append( $CONSENSUS_ALL, $INTREE_FOR_PUZZLE ); + } + + + # Puzzle for ML branch lenghts: + # The alignment is read from infile by default. + # The tree is read from intree by default. + &rm( "infile" ); + &mv( "align", "infile" ); # align = original alignment in phylip interleaved. + + &executePuzzleToCalculateBranchLenghts( $matrix, + $exact_parameter_est, + $rate_heterogeneity ); + + my $OUTTREE_PUZZLE = "outtree_puzzle"; + + &rm( $outfile."_puzzle_outfile" ); + + &mv( "outfile", $outfile."_puzzle_outfile" ); + &mv( "outtree", $OUTTREE_PUZZLE ); + &rm( "outdist" ); + &rm( "intree" ); + + + # Transfer + # -------- + my $counter = 0; + if ( $use_fastme == 1 ) { + &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_FASTME, $fastme_outtree, $counter++ ); + &rm( $CONSENSUS_FASTME ); + } + if ( $use_phylip_nj == 1 ) { + &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PHYLIP_NJ, $phylip_nj_outtree, $counter++ ); + &rm( $CONSENSUS_PHYLIP_NJ ); + } + if ( $use_phylip_fitch_fm == 1 ) { + &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PHYLIP_FM, $phylip_fm_outtree, $counter++ ); + &rm( $CONSENSUS_PHYLIP_FM ); + } + if ( $use_phylip_fitch_me == 1 ) { + &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PHYLIP_ME, $phylip_me_outtree, $counter++ ); + &rm( $CONSENSUS_PHYLIP_ME ); + } + if ( $use_bionj == 1 ) { + &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_BIONJ, $bionj_outtree, $counter++ ); + &rm( $CONSENSUS_BIONJ ); + } + if ( $use_weighbor == 1 ) { + &executeSupportTransfer( 
$OUTTREE_PUZZLE, $CONSENSUS_WEIGHBOR, $weighbor_outtree, $counter++ ); + &rm( $CONSENSUS_WEIGHBOR ); + } + if ( $use_raxml == 1 ) { + &to_phyloxml( $CONSENSUS_RAXML, $raxml_outtree, 1, 1 ); + $counter++; + } + if ( $use_phyml == 1 ) { + &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PHYML, $phyml_outtree, $counter++ ); + &rm( $CONSENSUS_PHYML ); + } + if ( $use_proml == 1 ) { + &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PROML, $proml_outtree, $counter++ ); + &rm( $CONSENSUS_PROML ); + } + if ( $use_protpars == 1 ) { + &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_PROTPARS, $protpars_outtree, $counter++ ); + &rm( $CONSENSUS_PROTPARS ); + } + if ( $all_count > 1 ) { + &executeSupportTransfer( $OUTTREE_PUZZLE, $CONSENSUS_ALL, $all_outtree, $counter++ ); + &rm( $CONSENSUS_ALL ); + } + + # Clean up + # -------- + &rm( $OUTTREE_PUZZLE ); + &rm( $SEQBOOT_OUTFILE ); + if ( $keep_multiple_trees == 1 ) { + if ( $use_fastme == 1 ) { + &mv( $OUTTREE_FASTME, $multitreefile_fastme ); + } + if ( $use_phylip_nj == 1 ) { + &mv( $OUTTREE_PHYLIP_NJ, $multitreefile_phylip_nj ); + } + if ( $use_phylip_fitch_fm == 1 ) { + &mv( $OUTTREE_PHYLIP_FM, $multitreefile_phylip_fm ); + } + if ( $use_phylip_fitch_me == 1 ) { + &mv( $OUTTREE_PHYLIP_ME, $multitreefile_phylip_me ); + } + if ( $use_bionj == 1 ) { + &mv( $OUTTREE_BIONJ, $multitreefile_bionj ); + } + if ( $use_weighbor == 1 ) { + &mv( $OUTTREE_WEIGHBOR, $multitreefile_weighbor ); + } + if ( $use_phyml == 1 ) { + &mv( $OUTTREE_PHYML, $multitreefile_phyml ); + } + if ( $use_proml == 1 ) { + &mv( $OUTTREE_PROML, $multitreefile_proml ); + } + if ( $use_protpars == 1 ) { + &mv( $OUTTREE_PROTPARS, $multitreefile_protpars ); + } + &mv( $pwdfile, $multipwdfile ); + } + else { + if ( $use_fastme == 1 ) { + &rm( $OUTTREE_FASTME ); + } + if ( $use_phylip_nj == 1 ) { + &rm( $OUTTREE_PHYLIP_NJ ); + } + if ( $use_phylip_fitch_fm == 1 ) { + &rm( $OUTTREE_PHYLIP_FM ); + } + if ( $use_phylip_fitch_me == 1 ) { + &rm( 
$OUTTREE_PHYLIP_ME ); + } + if ( $use_bionj == 1 ) { + &rm( $OUTTREE_BIONJ ); + } + if ( $use_weighbor == 1 ) { + &rm( $OUTTREE_WEIGHBOR ); + } + if ( $use_phyml == 1 ) { + &rm( $OUTTREE_PHYML ); + } + if ( $use_proml == 1 ) { + &rm( $OUTTREE_PROML ); + } + if ( $use_protpars == 1 ) { + &rm( $OUTTREE_PROTPARS ); + } + &rm( $pwdfile ); + } + if ( $all_count > 1 ) { + &rm( $OUTTREES_ALL ); + } +} # if ( $bootstraps > 1 ) +else { + &rm( "infile.dist" ); + + &rm( "infile.puzzle" ); + if ( $use_fastme == 1 ) { + &to_phyloxml( $OUTTREE_FASTME, $fastme_outtree, 0, 1 ); + } + if ( $use_phylip_nj == 1 ) { + &to_phyloxml( $OUTTREE_PHYLIP_NJ, $phylip_nj_outtree, 0, 1); + } + if ( $use_phylip_fitch_fm == 1 ) { + &to_phyloxml( $OUTTREE_PHYLIP_FM, $phylip_fm_outtree, 0, 1 ); + } + if ( $use_phylip_fitch_me == 1 ) { + &to_phyloxml( $OUTTREE_PHYLIP_ME, $phylip_me_outtree, 0, 1 ); + } + if ( $use_bionj == 1 ) { + &to_phyloxml( $OUTTREE_BIONJ, $bionj_outtree, 0, 1 ); + } + if ( $use_weighbor == 1 ) { + &to_phyloxml( $OUTTREE_WEIGHBOR, $weighbor_outtree, 0, 1 ); + } + if ( $use_raxml == 1 ) { + &to_phyloxml( $OUTTREE_RAXML, $raxml_outtree, 0, 1 ); + } + if ( $use_phyml == 1 ) { + &to_phyloxml( $OUTTREE_PHYML, $phyml_outtree, 0, 1 ); + } + if ( $use_proml == 1 ) { + &to_phyloxml( $OUTTREE_PROML, $proml_outtree, 0, 1 ); + } + if ( $use_protpars == 1 ) { + &to_phyloxml( $OUTTREE_PROTPARS, $protpars_outtree, 0, 1 ); + } +} # if ( $bootstraps > 1 ) + +&rm( $infile ); +&rm( "infile" ); +&rm( "align" ); +&rm( "align.reduced" ); + + +$log = $log."Finish time/date : ".`date`; + +if ( $bootstraps > 1 ) { + $log = $log."Puzzle output file : ".$outfile."_puzzle_outfile\n"; +} +$log = $log."Columns in alignment : $number_of_aa\n"; +$log = $log."Number of sequences in alignment : $number_of_seqs\n"; +if ( $all_count > 1 ) { + $log = $log."Combined consensus : $all_outtree\n"; +} + + +if ( $bootstraps > 1 ) { + $log = $log."\n\n"; + $log = $log."Simple support value statistics (trees are numbered 
the same as for TREE PUZZLE output)\n"; + $log = $log."------------------------------- \n"; + $log = $log."\n"; +} + +open( OUT, ">$logfile" ) || die "\n$0: Cannot create file <<$logfile>>: $!\n"; +print OUT $log; +close( OUT ); + +if ( $bootstraps > 1 ) { + # Simple support statistics + # ------------------------- + my $SS_OUT = $temp_dir."/ss_out"; + my @phylos = (); + my $ounter = 0; + if ( $use_fastme == 1 ) { + $phylos[ $ounter++ ] = $fastme_outtree; + } + if ( $use_phylip_nj == 1 ) { + $phylos[ $ounter++ ] = $phylip_nj_outtree; + } + if ( $use_phylip_fitch_fm == 1 ) { + $phylos[ $ounter++ ] = $phylip_fm_outtree; + } + if ( $use_phylip_fitch_me == 1 ) { + $phylos[ $ounter++ ] = $phylip_me_outtree; + } + if ( $use_bionj == 1 ) { + $phylos[ $ounter++ ] = $bionj_outtree; + } + if ( $use_weighbor == 1 ) { + $phylos[ $ounter++ ] = $weighbor_outtree; + } + if ( $use_raxml == 1 ) { + $phylos[ $ounter++ ] = $raxml_outtree; + } + if ( $use_phyml == 1 ) { + $phylos[ $ounter++ ] = $phyml_outtree; + } + if ( $use_proml == 1 ) { + $phylos[ $ounter++ ] = $proml_outtree; + } + if ( $use_protpars == 1 ) { + $phylos[ $ounter++ ] = $protpars_outtree; + } + if ( $all_count > 1) { + $phylos[ $ounter++ ] = $all_outtree; + } + &executeSupportStatistics( $SS_OUT, @phylos ); + &append( $SS_OUT, $logfile ); + &rm( $SS_OUT ); + + # Append parts of puzzle output file + # ---------------------------------- + if ( $all_count > 1 ) { + &parsePuzzleOutfile( $outfile."_puzzle_outfile", $logfile ); + } +} + +chdir( $current_dir ) +|| die "\n\n$0: Could not chdir to <<$current_dir>>: $!\n\n"; + +rmdir( $temp_dir ) +|| print "\n\n$0: Warning: Could not remove <<$temp_dir>>: $!\n\n"; + +print "\n\n\n$0 successfully comleted.\n\n"; + +exit( 0 ); + + + +# Methods: +# -------- + + +# Six arguments: +# 1. DNA or Amino-Acids sequence filename (PHYLIP format) +# 2. Model, eg. PROTGAMMAIVT +# 3. Replicates (bootstrap) +# 4. Seed for bootstrap +# 5. Output suffix +# 6. 
Algorithm (only for bootstrap, default otherwise)
+# NOTE. RaxML does its own bootstrapping.
+# The command line is printed before it is run; a non-zero result from
+# system() is handed to dieWithUnexpectedError together with the command.
+sub executeRaxml {
+    my ( $msa, $model, $replicates, $seed, $outfile_suffix, $algo ) = @_;
+
+    &testForTextFilePresence( $msa );
+
+    my $command = "$RAXML -m $model -s $msa -n $outfile_suffix";
+
+    # Seed, replicate count and algorithm are only passed along when
+    # more than one replicate is requested.
+    if ( $replicates > 1 ) {
+        $command .= " -x $seed -N $replicates";
+        $command .= " -f $algo" if $algo;
+    }
+
+    print( "\n$command\n" );
+
+    system( $command )
+        and &dieWithUnexpectedError( $command );
+
+}
+
+
+# Runs $NEWICK_TO_PHYLOXML on a tree file and deletes the input afterwards.
+# Four arguments:
+# 1. input tree file (checked with dieIfFileNotExists)
+# 2. output file (checked with dieIfFileExists)
+# 3. 1 to append the " -i" option to the command
+# 4. 1 to append the " -xt" option to the command
+sub to_phyloxml {
+    my ( $from, $to, $internal_names_are_boots, $extract_taxonomy ) = @_;
+
+    &dieIfFileExists( $to );
+    &dieIfFileNotExists( $from );
+
+    my @parts = ( "$NEWICK_TO_PHYLOXML -f=nn $from $to" );
+    push( @parts, " -i" )  if $internal_names_are_boots == 1;
+    push( @parts, " -xt" ) if $extract_taxonomy == 1;
+    my $command = join( "", @parts );
+
+    system( $command )
+        and die "$0: Could not execute \"$command \"";
+    &rm( $from );
+}
+
+
+# Moves file $from to $to with the external "mv" program (no shell involved).
+# Dies if $to already exists or if $from fails dieIfFileNotExists.
+sub mv {
+    my ( $from, $to ) = @_;
+    &dieIfFileExists( $to );
+    &dieIfFileNotExists( $from );
+    my @command = ( "mv", $from, $to );
+    system( @command )
+        and die "\n\n$0: could not move \"$from\" to \"$to\": $!\n\n";
+}
+
+# Copies file $from to $to with the external "cp" program (no shell involved).
+# Dies if $to already exists or if $from fails dieIfFileNotExists.
+sub cp {
+    my ( $from, $to ) = @_;
+    &dieIfFileExists( $to );
+    &dieIfFileNotExists( $from );
+
+    my @command = ( "cp", $from, $to );
+    system( @command )
+        and die "\n\n$0: could not copy \"$from\" to \"$to\": $!\n\n";
+}
+
+# Deletes the given file; a failing unlink is not treated as an error.
+sub rm {
+    my ( $f ) = @_;
+    return unlink( $f );
+}
+
+# Runs executeConsense on $multi_in, renames the resulting "outtree" to
+# $consense_out and deletes "outfile".
+sub consense {
+    my ( $multi_in, $consense_out ) = @_;
+    &executeConsense( $multi_in );
+    &mv( "outtree", $consense_out );
+    &rm( "outfile" );
+
+}
+
+
+
+# 1. file to be appended
+# 2. 
file to append to
+# The copy is done in Perl instead of through "cat ... >> ..." in a shell,
+# so file names containing blanks or shell metacharacters are handled
+# correctly and are never interpreted as shell syntax.
+sub append {
+    my ( $to_be_appended, $append_to ) = @_;
+    &dieIfFileNotExists( $to_be_appended );
+
+    open( my $in, "<", $to_be_appended )
+        or die "\n\n$0: could not open \"$to_be_appended\" for reading: $!\n\n";
+    open( my $out, ">>", $append_to )
+        or die "\n\n$0: could not open \"$append_to\" for appending: $!\n\n";
+
+    while ( my $line = <$in> ) {
+        print {$out} $line
+            or die "\n\n$0: could not append \"$to_be_appended\" to \"$append_to\": $!\n\n";
+    }
+
+    close( $in );
+    # Buffered write errors only surface when the output handle is closed.
+    close( $out )
+        or die "\n\n$0: could not append \"$to_be_appended\" to \"$append_to\": $!\n\n";
+
+}
+
+# Dies if anything (file, directory, ...) already exists under the given name.
+sub dieIfFileExists {
+    my $file = $_[ 0 ];
+    if ( -e $file ) {
+        die "\n\n$0: \"$file\" already exists\n\n";
+    }
+}
+
+# Dies unless the given name is a plain file of non-zero size.
+sub dieIfFileNotExists {
+    my $file = $_[ 0 ];
+    unless ( ( -s $file ) && ( -f $file ) ) {
+        die( "\n\n$0: \"$file\" does not exist or is empty" );
+    }
+}
+
+
+
+
+# Two arguments:
+# 1. seed for random number generator
+# 2. number of bootstraps
+# Reads in "infile" by default.
+# Note that the presence check below is made on the file named by the
+# global $infile, not on the literal name "infile".
+sub executeSeqboot {
+
+    my $s    = $_[ 0 ];
+    my $bs   = $_[ 1 ];
+    # Sent to $SEQBOOT on the line following the number of bootstraps.
+    my $verb = "\n2";
+
+    &testForTextFilePresence( $infile );
+
+    # The answers for $SEQBOOT are supplied through a shell here-document
+    # terminated by "!".
+    system( "$SEQBOOT << !
+r
+$bs$verb
+Y
+$s
+!" )
+    && die "$0: Could not execute \"$SEQBOOT\"";
+
+}
+
+
+
+# One/two/three argument(s):
+# Reads in tree from "intree" by default. (Presence of "intree" automatically
+# switches into "User defined trees" mode.)
+# 1. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
+#    5 = VT; 6 = WAG; 7 = auto; PAM otherwise
+# 2. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
+# 3. 
Model of rate heterogeneity:
+#    1 for "8 Gamma distributed rates"
+#    2 for "Two rates (1 invariable + 1 variable)"
+#    3 for "Mixed (1 invariable + 8 Gamma rates)"
+#    otherwise: Uniform rate
+# Last modified: 09/08/03 (added 2nd and 3rd parameter)
+# Dies if "infile" or "intree" is missing, empty, or not a text file, or if
+# running $PUZZLE returns a non-zero status.
+sub executePuzzleToCalculateBranchLenghts {
+    my $matrix_option              = $_[ 0 ];
+    my $parameter_estimates_option = $_[ 1 ];
+    my $rate_heterogeneity_option  = $_[ 2 ];
+    my $mat           = "";
+    my $est           = "";
+    my $rate          = "";
+
+    # Both input files are read by $PUZZLE under these fixed names, so they
+    # are verified here before the program is started.
+    unless ( ( -s "infile" ) && ( -f "infile" ) && ( -T "infile" ) ) {
+        die "\n$0: executePuzzleToCalculateBranchLenghts: <<infile>> does not exist, is empty, or is not a plain textfile.\n";
+    }
+    unless ( ( -s "intree" ) && ( -f "intree" ) && ( -T "intree" ) ) {
+        die "\n$0: executePuzzleToCalculateBranchLenghts: <<intree>> does not exist, is empty, or is not a plain textfile.\n";
+    }
+
+    # Each helper returns the text to be sent to $PUZZLE for its option;
+    # the two optional settings contribute nothing when their argument is false.
+    $mat = setModelForPuzzle( $matrix_option );
+    if ( $parameter_estimates_option ) {
+        $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option );
+    }
+    if ( $rate_heterogeneity_option ) {
+        $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option );
+    }
+
+
+    # The answers for $PUZZLE are supplied through a shell here-document
+    # terminated by "!".
+    system( "$PUZZLE << !
+$mat$est$rate
+x
+y
+!" )
+    && die "$0: Could not execute \"$PUZZLE\" (mat=$mat est=$est rate=$rate)";
+
+}
+
+# Two arguments:
+# 1. puzzle outfile
+# 2. 
file to append to +sub parsePuzzleOutfile { + my $puzzle_outfile = $_[ 0 ]; + my $file_to_append_to = $_[ 1 ]; + &testForTextFilePresence( $puzzle_outfile ); + open( OUT, ">>$file_to_append_to" ) || &dieWithUnexpectedError( "Cannot open \"$file_to_append_to\"" ); + open( IN, "$puzzle_outfile" ) || &dieWithUnexpectedError( "Cannot open file \"$puzzle_outfile\"" ); + my $return_line; + my $read = 0; + print OUT "\nTREE PUZZLE output\n"; + print OUT "------------------\n"; + while ( $return_line = ) { + if ( $return_line =~/COMPARISON OF USER TREES/ ) { + $read = 1; + } + elsif( $return_line =~/TIME STAMP/ ) { + $read = 0; + } + elsif( $read ) { + print OUT $return_line; + } + } + close( IN ); + close( OUT ); +} + +# Three/four arguments: +# 1. Name of file containing tree with correct branch lengths +# 2. Name of file containing tree with correct bootstraps +# 3. Outputfilename +# 4. Index of tree with correct branch lengths, in case more than one in file +# Last modified: 2007.11.27 +sub executeSupportTransfer { + my $tree_with_bl = $_[ 0 ]; + my $tree_with_bs = $_[ 1 ]; + my $out = $_[ 2 ]; + my $index = $_[ 3 ]; + + &testForTextFilePresence( $tree_with_bl ); + &testForTextFilePresence( $tree_with_bs ); + my $command = "$SUPPORT_TRANSFER $tree_with_bl $tree_with_bs $out $index"; + system( $command ) + && die "$0: Could not execute \"$command\""; +} + + +# Two or more arguments: +# 1. outfile +# 2. phylogeny 1 with support values +# 3. phylogeny 2 with support values +# 4. ... +sub executeSupportStatistics { + my $outfile = $_[ 0 ]; + &dieIfFileExists( $outfile ); + my $phylos = ""; + for( my $i = 1; $i < scalar(@_); ++$i ) { + &testForTextFilePresence( $_[ $i ] ); + $phylos .= $_[ $i ]." 
"; + } + my $command = "$SUPPORT_STATISTICS -o=$outfile $phylos"; + system( "$command" ) + && die "$0: Could not execute \"$command\""; +} + + +sub getNumberOfSeqsAndAas { + my $infile = $_[ 0 ]; + my $seqs = 0; + my $aa = 0; + open( IN, "$infile" ) || die "\n$0: Cannot open file <<$infile>>: $!\n"; + while( ) { + if ( $_ =~ /^\s*(\d+)\s+(\d+)\s*$/ ) { + $seqs = $1; + $aa = $2; + } + } + close( IN ); + + if ( $seqs == 0 || $aa == 0 ) { + die( "\n$0: Could not get number of seqs and aa from: $infile" ); + } + return $seqs, $aa; +} + + + +sub removeSupportValues { + my $infile = $_[ 0 ]; + my $outfile = $_[ 1 ]; + &testForTextFilePresence( $infile ); + open( OUT, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" ); + open( IN, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); + while ( my $line = ) { + $line =~ s/\)\d+\.?\d*:/\):/g; + print OUT "$line"; + } + close( OUT ); + close( IN ); +} + + + + +# Six arguments: +# 1. name of alignment file (in correct format!) +# 2. number of bootstraps +# 3. jumbles: 0: do not jumble; >=1 number of jumbles +# 4. seed for random number generator +# 5. 1 for PAM instead of JTT +# 6. 1 to use globale rearragements +sub executeProml { + my $align = $_[ 0 ]; + my $bs = $_[ 1 ]; + my $rand = $_[ 2 ]; + my $s = $_[ 3 ]; + my $use_pam = $_[ 4 ]; + my $use_global_rearr = $_[ 5 ]; + my $jumble = ""; + my $multi = ""; + my $pam = ""; + + &testForTextFilePresence( $align ); + + if ( $bs > 1 && $rand < 1 ) { + $rand = 1; + } + + if ( $rand >= 1 ) { + $jumble = " +J +$s +$rand"; + } + + if ( $bs > 1 ) { + $multi = " +M +D +$bs"; + } + + + if ( $use_pam == 1 ) { + $pam = " +P +P"; + } + + my $global = ""; + if ( $use_global_rearr == 1 ) { + $global = " +G"; + } + + system( "$PROML 2>&1 << ! +$align$jumble$multi$pam$global +3 +Y +!" 
) + && &dieWithUnexpectedError( "Could not execute \"$PROML $align$jumble$multi$pam$global\"" ); + # 3: Do NOT print out tree + + return; + +} ## executeProml + + +sub printUsage { + + print < + [path/name for temporary directory to be created] + + Example: + "% phylo_pl.pl -B100q\@1nbS9X IL5.aln IL5_tree" + + Options + ------- + + Bx : Number of bootstraps. B0: do not bootstrap. Default is 100 bootstrapps. + The number of bootstrapps should be divisible by 10. + J : Use JTT matrix (Jones et al. 1992) in TREE-PUZZLE and/or PHYML, RAXML, default: VT (Mueller-Vingron 2000). + L : Use BLOSUM 62 matrix (Henikoff-Henikoff 92) in TREE-PUZZLE and/or PHYML, RAXML, default: VT. + M : Use mtREV24 matrix (Adachi-Hasegawa 1996) in TREE-PUZZLE and/or PHYML, default: VT. + W : Use WAG matrix (Whelan-Goldman 2000) in TREE-PUZZLE and/or PHYML, RAXML, default: VT. + P : Use PAM matrix (Dayhoff et al. 1978) in TREE-PUZZLE and/or PHYML, RAXML, default: VT. + D : Use DCMut matrix (Kosial and Goldman, 2005) in PHYML, RAXML, VT in TREE-PUZZLE. + A : Let TREE-PUZZLE choose which matrix to use, default: VT + E : Exact parameter estimates in TREE-PUZZLE, default: Approximate. + Model of rate heterogeneity in TREE-PUZZLE (default: Uniform rate): + g : 8 Gamma distributed rates + t : Two rates (1 invariable + 1 variable) + m : Mixed (1 invariable + 8 Gamma rates) + q\@x: Use FastME, x: 1: GME + 2: BME + 3: NJ + n : Use PHYLIP Neighbor (NJ). + f : Use PHYLIP Fitch. + e : Use PHYLIP Minimal Evolution. + b : Use BIONJ. + w : Use Weighbor. + x : Use RAxML. + y : Use PHYML. + o : Use PHYLIP proml. + p : Use PHYLIP protpars. + rx : Number of relative substitution rate categories in PHYML (default is 4). + jx : Number of jumbles (input order randomization) for PHYLIP FM, ME, PROTPARS, and PROML (default is 2) (random seed set with Sx). 
+ I : Estimate proportion of invariable sites in RAXML and/or PHYML (otherwise, proportion "0.0" is used in PHYML) + G : to turn on global rearrangements in PHYLIP FM, ME, and PROML + Sx : Seed for random number generator(s). Must be 4n+1. Default is 9. + X : To keep multiple tree file (=trees from bootstrap resampled alignments) and + pairwise distance matrix file (in case of bootstrap analysis). + +END + +} ## printUsage diff --git a/forester/archive/perl/queue.pm b/forester/archive/perl/queue.pm new file mode 100755 index 0000000..a7374fe --- /dev/null +++ b/forester/archive/perl/queue.pm @@ -0,0 +1,150 @@ +package queue; + +# Process queueing +# SRE, Wed Sep 2 14:37:14 1998 +# CVS $Id: queue.pm,v 1.1.1.1 2005/03/22 08:35:51 cmzmasek Exp $ +# Master copy: see src/queue (CVS controlled, separate from pfamserver) +# +# Written for Pfam web server; suited for queuing any set of commands. +# +# API: +# +# $mypid = $$; +# $delay_in_seconds = 2; +# +# $nqueued = &queue::CheckQueue("pfamqueue", "username", "/tmp"); +# print ("There are $nqueued jobs ahead of you in line\n"); +# &queue::WaitInQueue("pfamqueue", "username", "/tmp", $mypid, $delay_in_seconds); +# print ("Our turn! Working...\n"); +# (do stuff) +# &queue::RemoveFromQueue("pfamqueue", "username", "/tmp", $mypid); +# +# queuedir is a directory where the script has write permissions; +# typically a tmp directory of some sort. +# + + +################################################################ +# PFAMSERVER - The Washington University/St. Louis Pfam web server +# Copyright (C) 1995-1999 Washington University School of Medicine +# Copyright (C) 1995-1999 Sanger Centre/Genome Research Ltd. +# Copyright (C) 1998-1999 Karolinska Institutet Center for Genomics Research +# All Rights Reserved +# +# This source code is distributed under the terms of the +# GNU General Public License. See the files COPYRIGHT and LICENSE +# for details. 
+#
+################################################################
+# RCS $Id: queue.pm,v 1.1.1.1 2005/03/22 08:35:51 cmzmasek Exp $
+
+
+# WaitInQueue() - add a process id to a queue, wait for turn
+#
+# Arguments: queue    - name of queue (prefix of queue stamp)
+#            username - name of user (middle part of queue stamp);
+#                       "" is replaced by "unknown"
+#            queuedir - directory to keep queue stamps in
+#            mypid    - our process id
+#            delay    - number of seconds between checking queue status
+#
+# Note: When it checks the queue, if a stamp is present that
+#       doesn't seem to correspond to a running process (ps -ax),
+#       it deletes the stamp. This protects against crashed processes
+#       freezing all subsequent jobs.
+#
+# example: &WaitInQueue("pfamqueue", "username", "/tmp", $mypid, 2);
+#
+# Returns 1 on success, 0 on failure (stamp file could not be created).
+#
+# NOTE: You may have to set the ps command in WaitInQueue.
+#       It must return all running processes.
+#
+sub WaitInQueue
+{
+    local($queue, $username, $queuedir, $mypid, $delay) = @_;
+    local(@newqueue, @queuelist, %mark);
+    local(*STAMP, *QUEUEDIR);
+    local(%is_running);
+    local(@output, $line, $pid, $waiting, $name);
+
+    # get list of other guys who are working
+    # (\Q..\E: the queue name is matched literally, as it is used literally
+    # when the stamp file names are built)
+    opendir(QUEUEDIR, $queuedir);
+    @queuelist = grep(/\Q$queue\E\.\S*\.\d+/, readdir(QUEUEDIR));
+    closedir(QUEUEDIR);
+    # make stamp for our pid
+    if ($username eq "") { $username = "unknown"; }
+    open(STAMP, ">$queuedir/$queue.$username.$mypid") || return 0;
+    close(STAMP);
+    # wait for our turn
+    while (1)
+    {
+        if ($#queuelist == -1) { last; } # nobody ahead of us; our turn!
+        sleep($delay);
+        # get list of running processes
+        %is_running = ();
+        @output = split(/^/, `ps -ax`);
+        foreach $line (@output) {
+            # only a number leading the line is taken as a pid; lines
+            # without one (e.g. the ps header) are skipped instead of
+            # re-using $1 from an earlier match
+            if ($line =~ /^\s*(\d+)/) { $is_running{$1} = 1; }
+        }
+        # verify that the guys we're waiting for
+        # are still running, and haven't crashed.
+        # if they have, reap their stamps.
+        foreach $waiting (@queuelist) {
+            ($name, $pid) = ($waiting =~ /\Q$queue\E\.(\S*)\.(\d+)/);
+            if (! $is_running{$pid}) { unlink "$queuedir/$queue.$name.$pid"; }
+        }
+
+        # get new list of queued jobs ahead of us.
+        # ignore guys who came in after we grabbed our initial queue list;
+        # they're waiting for *us*. The crazed greps are the Perl-y
+        # way of computing an intersection between two arrays.
+        #
+        opendir(QUEUEDIR, $queuedir);
+        @newqueue = grep(/\Q$queue\E\.\S*\.\d+/, readdir(QUEUEDIR));
+        closedir(QUEUEDIR);
+        %mark = ();
+        grep($mark{$_}++,@queuelist);
+        @queuelist = grep($mark{$_},@newqueue);
+    }
+
+    1;    # time to run!
+}
+
+
+# CheckQueue() - return total number of stamps found for this queue
+#                and the number of those that belong to this
+#                particular username.
+#
+# Arguments: queue, username, queuedir
+#
+sub CheckQueue
+{
+    local($queue, $username, $queuedir) = @_;
+    local(*QUEUEDIR, @allqueue, $nall, $nuser);
+    local($waiting, $name, $pid);
+
+    opendir(QUEUEDIR, $queuedir);
+    @allqueue = grep(/\Q$queue\E\.\S*\.\d+/, readdir(QUEUEDIR));
+    closedir(QUEUEDIR);
+
+    if ($username eq "") {$username = "unknown"; }
+    $nall = $nuser = 0;
+    foreach $waiting (@allqueue) {
+        ($name, $pid) = ($waiting =~ /\Q$queue\E\.(\S*)\.(\d+)/);
+        $nall++;
+        if ($name eq $username) { $nuser++; }
+    }
+    return ($nall, $nuser);
+}
+
+
+# RemoveFromQueue() - remove a pid from a queue
+#
+# Arguments: queue, username, queuedir, pid
+# Deletes the stamp file "queuedir/queue.username.pid"; an empty
+# username is replaced by "unknown", as in WaitInQueue.
+#
+sub RemoveFromQueue
+{
+    local($queue, $username, $queuedir, $pid) = @_;
+    if ($username eq "") {$username = "unknown"; }
+    unlink "$queuedir/$queue.$username.$pid";
+}
+
+1;
diff --git a/forester/archive/perl/rio.pl b/forester/archive/perl/rio.pl
new file mode 100755
index 0000000..7594587
--- /dev/null
+++ b/forester/archive/perl/rio.pl
@@ -0,0 +1,3391 @@
+#!/usr/bin/perl -W
+
+# rio.pl
+# ------
+#
+# Copyright (C) 2000-2002 Washington University School of Medicine
+# and Howard Hughes Medical Institute
+# All rights reserved
+#
+# Created: 11/25/00
+# Author: Christian M. 
Zmasek +# zmasek@genetics.wustl.edu +# http://www.genetics.wustl.edu/eddy/people/zmasek/ +# +# Last modified 09/06/03 +# + +# +# +# Available at: http://www.genetics.wustl.edu/eddy/forester/ +# RIO webserver: http://www.rio.wustl.edu/ +# +# Reference: +# Zmasek C.M. and Eddy S.R. (2002) +# RIO: Analyzing proteomes by automated phylogenomics using +# resampled inference of orthologs. +# BMC Bioinformatics 3:14 +# http://www.biomedcentral.com/1471-2105/3/14/ +# +# It is highly recommended that you read this paper before +# installing and/or using RIO. (Included in the RIO +# distribution as PDF: "RIO.pdf".) +# +# +# Before rio.pl can be used, some variables in rio_module.pm need to be set, +# as described in RIO_INSTALL. +# +# Usage: rio.pl <mode: 1, 2, 3, or 4> <tagged arguments, single-letter options> +# ----- +# +# +# Examples: +# -------- +# % RIO1.1/perl/rio.pl 1 A=aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out1 p I C E +# +# % RIO1.1/perl/rio.pl 2 A=aconitase N=LEU2_LACLA/5-449 O=out2 p I C E +# +# % RIO1.1/perl/rio.pl 3 A=/path/to/my/pfam/Full/aconitase H=aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out3 p I C E +# +# % RIO1.1/perl/rio.pl 4 A=/path/to/my/pfam/Full/aconitase N=LEU2_LACLA/5-449 O=out4 p I C E +# +# % RIO1.1/perl/rio.pl 3 A=/path/to/my/pfam/Full/aconitase b=/path/to/my/pfam/Seed/aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out5 p I C E +# +# +# Modes: +# ------ +# +# 1: RIO analysis based on precalculated pairwise distances +# alignment does not contain query sequence +# +# 2: RIO analysis based on precalculated pairwise distances +# alignment does contain query sequence +# +# 3: RIO analysis based on Pfam alignments, +# alignment does not contain query sequence +# +# 4: RIO analysis based on Pfam alignments, +# alignment does contain query sequence +# +# +# +# Tagged arguments: +# ----------------- +# +# No "G=", "H=", "F=", "T=", "a=", "b=", "s", "f" in modes 1 and 2. +# +# +# A= Pfam alignment name (mandatory). This specifies the alignment +# against which the RIO analysis is to be performed. 
+# In modes 1 and 2: Pfam model (alignment) name +# (e.g. "A=aconitase"). +# In modes 3 and 4: Pfam alignment path/name +# (e.g. "A=/path/to/your/pfam/Full/aconitase"). +# +# Q= Path/name of file containing the query sequence +# (in FASTA format or raw sequence) (mandatory in modes 1 and 3). +# +# N= Query name (mandatory). This must include the SWISS-PROT code +# for the species of the query after a "_" (e.g. "N=QUERY_HAEIN"). +# If the query sequence is already in the alignment (modes 2 and 4) +# the complete name needs to be specified -- including "/xxx-xxx". +# +# O= Output file path/name (mandatory). +# +# T= Model for pairwise distance calculation: +# J=JTT, B=BLOSUM 62, M=mtREV24, V=VT, W=WAG, P=PAM. +# BLOSUM 62 is default. +# (Not in modes 1 and 2; these modes use $MATRIX_FOR_PWD instead.) +# +# In modes 1 and 3, a HMM is needed to align the query sequence to +# the alignment and either one of the following options must be +# employed: +# H= HMM name: This uses hmmfetch to retrieve a HMM from +# $PFAM_HMM_DB. +# F= HMM file: This directly reads the HMM from a file. +# +# S= Species tree file path/name (in NHX format) (optional). +# If not specified, $SPECIES_TREE_FILE_DEFAULT is used. +# +# G= Species names file (optional). Only sequences associated with +# species found in this file are used. +# In the species names file, individual species names must be +# separated by newlines and lines starting with "#" are ignored. +# While only sequences associated with species found in the species +# tree ("S=") are used for the actual RIO analysis, this allows +# removing sequences prior to tree calculation (which is the most +# time consuming step). 
+# +# P= Sort priority (default is 12): +# 0 : Ortholog +# 1 : Ortholog, Super ortholog +# 2 : Super ortholog, Ortholog +# 3 : Ortholog, Distance +# 4 : Distance, Ortholog +# 5 : Ortholog, Super ortholog, Distance +# 6 : Ortholog, Distance, Super ortholog +# 7 : Super ortholog, Ortholog, Distance +# 8 : Super ortholog, Distance, Ortholog +# 9 : Distance, Ortholog, Super ortholog +# 10 : Distance, Super ortholog, Ortholog +# 11 : Ortholog, Subtree neighbor, Distance +# 12 : Ortholog, Subtree neighbor, Super ortholog, Distance (default) +# 13 : Ortholog, Super ortholog, Subtree neighbor, Distance +# 14 : Subtree neighbor, Ortholog, Super ortholog, Distance +# 15 : Subtree neighbor, Distance, Ortholog, Super ortholog +# 16 : Ortholog, Distance, Subtree neighbor, Super ortholog +# 17 : Ortholog, Subtree neighbor, Distance, Super ortholog +# +# a= Bootstraps for tree construction (not in modes 1 and 2). +# Default is 100. +# +# L= Threshold for orthologies for output. Default is 0. +# v= Threshold for ultra-paralogies for output. Default is 50. +# +# U= Threshold for orthologies for distance calculation. Default is 60. +# +# X= In case of more than one putative orthologs: +# number of sd the distance query - LCA has to differ +# from the mean to generate a warning. Default is 2. +# +# Y= In case of no putative orthologs: +# number of sd the distance query - root has to differ +# from mean to generate a warning. Default is 2. +# +# Z= In case of one putative ortholog: +# threshold for factor between the two distances to their +# LCA (larger/smaller) to generate a warning. Default is 2. +# +# B= Threshold for subtree-neighborings. Default is 0. +# +# b= Build HMM from seed alignment with "hmmbuild -s" (optional). +# This is to prevent from finding multiple domains per sequence +# (i.e. prevents "cutting" the query sequence). Give path/name to +# Seed with this. +# +# j= Name for temporary directory (optional). +# +# y= Seed for random number generator. Default is 41. 
#
#  I    Create and save a rooted, with duplication vs speciation,
#       and orthology information annotated gene tree.
#       If precalculated distances are used (modes 1 and 2): this gene
#       tree is a NJ tree calculated based on the non-bootstrap resampled
#       (original) pairwise distances.
#       If precalculated distances are not used (modes 3 and 4): this gene
#       is a consensus tree with ML branch length values and is also
#       annotated with bootstrap values for each node.
#
#       Options for output:
#  p    Output ultra-paralogs.
#  D    Description from SWISS-PROT and TrEMBL.
#  C    Complete description from SWISS-PROT and TrEMBL.
#  E    118 character output instead of 78 character output.
#
#  K    Keep intermediate files (they will go into the same directory
#       as the output file, their names are the same as of the output
#       file, with various suffixes added).
#
#  s    Ignore non SWISS-PROT sequences (i.e. sequences from TrEMBL)
#       in the Pfam alignment.
#
#  f    Try to ignore TrEMBL "fragments" (sequences with "fragment" in
#       their description).
#
#  +    Parallel, use machines listed in file $NODE_LIST.
#
#  x    RIO used as web server -- HTML output.
#
#
#
#
# History
# -------
# 09/06/03: Removal of minor bug. Only create consensus tree with ML branch length
#           values if "I" option used (in modes 3 or 4) -- the problem/bug was that
#           this tree was always created whether "I" was used or not.
#


use strict;

use FindBin;
use lib $FindBin::Bin;
use Net::Ping;
use rio_module;

use File::Basename;


my $VERSION = "5.010";

my $E_VALUE_THRESHOLD = 0.01; # For HMMSEARCH.
my $SORT_DEFAULT = 12;
my $THRESHOLD_ORTHOLOGS_DEFAULT = 0;
my $THRESHOLD_SN_DEFAULT = 0;
my $THRESHOLD_ORTHOLOGS_DEFAULT_DC = 60;
my $T_ULTRA_PARALOGS_DEFAULT = 50;
my $WARN_NO_ORTHOS_DEFAULT = 2;
my $WARN_MORE_THAN_ONE_ORTHO_DEFAULT = 2;
my $WARN_ONE_ORTHO_DEFAULT = 2;
my $MIN_NUMBER_OF_SEQS_IN_ALN = 4;
my $BOOSTRAPS_FOR_MAKETREE_DEFAULT = 100;
my $SEED_FOR_MAKETREE_DEFAULT = 41;
my $MATRIX_DEFAULT = 2; # 2=BLOSUM62

my $DO_RIO_TEMP_OUTFILE = "DoRIO_OUTFILE";
my $TEMP_HMM_FILE = "HMMFILE";

my $DEFAULT_OPTIONS_FOR_MAKETREE = "XR";


# I/O files, names:
my $alignment = "";
my $hmm_file = "";
my $hmm_name = "";
my $seqX_file = "";
my $species_tree_file = "";
my $outfile = "";
my $outfile_annot_nhx_tree = "";
my $query_name = "";
my $multiple_trees_file = "";
my $distance_matrix_file = "";
my $maketree_out_tree_file = "";
my $seed_aln_for_hmmbuild = "";
my $temp_dir = "";
my $bsp_file = "";
my $pwd_file = "";
my $nbd_file = "";
my $output_dir = "";
my $species_names_file = " "; # Must be " ".
my $options_for_makeTree = "";


# multiple choice options:
my $mode = 0;
my $sort = $SORT_DEFAULT;
my $matrix_n = $MATRIX_DEFAULT; # 0=JTT 1=PAM 2=BLOSUM62 3=mtREV24 5=VT 6=WAG


# yes/no options:
my $description = 0;
my $complete_description = 0;
my $long_output = 0;
my $keep = 0;
my $non_sp = 1; # 0 to remove non SP seqs.
my $safe_nhx = 0;
my $no_frags = 0;
my $output_ultraparalogs = 0;
my $parallel = 0;
my $output_HTML = 0;


# numerical options:
my $warn_no_orthos = $WARN_NO_ORTHOS_DEFAULT;
my $warn_more_than_one_ortho = $WARN_MORE_THAN_ONE_ORTHO_DEFAULT;
my $warn_one_ortho = $WARN_ONE_ORTHO_DEFAULT;
my $boostraps_for_makeTree = $BOOSTRAPS_FOR_MAKETREE_DEFAULT;
my $seed_for_makeTree = $SEED_FOR_MAKETREE_DEFAULT;
my $t_orthologs = $THRESHOLD_ORTHOLOGS_DEFAULT;
my $t_sn = $THRESHOLD_SN_DEFAULT;
my $t_orthologs_dc = $THRESHOLD_ORTHOLOGS_DEFAULT_DC;
my $t_ultra_paralogs = $T_ULTRA_PARALOGS_DEFAULT;


# internal variables:
my $print_header_for_orthologies = 0;
my $print_header_for_s_paralogs = 0;
my $length_of_alignment = 0;
my $length_of_orig_alignment = 0;
my $time = 0;
my $ii = 0;
my $j = 0;
my $jj = 0;
my $number_of_seqs_in_aln = 0;
my $f = 0;
my $saw_distance_values = 0;
my $saw_ultra_paralogs = 0;
my $bootstraps = 0;
my $ext_nodes_in_trees_analyzed = 0;
my $time_total = 0;
my $time_tree_calc = 0;
my $time_tree_calcT = 0;
my $time_rio = 0;
my $time_rioT = 0;
my $time_dqopuzzle = 0;
my $time_dqopuzzleT = 0;
my $time_addingdists = 0;
my $time_addingdistsT = 0;
my $processors = 0;
my $block_size = 0;
my $larger_blocks = 0;
my $printed_ultra_paralogs = 0;

my $dorio_outfile = "";
my $options_for_DoRIO = "";
my $ortho_name = "";
my $orthos = 0;
my $s_orthos = 0;
my $subtree_neighbors = 0;
my $dist = 0;
my $s_para_name = "";
my $s_paras = 0;
my $sort_priority = "";
my $return_line = "";
my $matrix = "";
my $command_line = "";
my $command_line_for_hmmbuild = "";
my $current_dir = "";
my @complete_names = ();
my @temp_array = ();
my %Species_names_hash = ();
my %AC_DE = (); # AC => DE from "ACDEOS" TrEMBL file.
my %SP_AC_DE = (); # ID => DE from "ACIDOS" SWISS-PROT file.
my %names_in_pwd_file = ();
my @nodelist = ();

my $start_date = `date`;




# This analyzes the options:
# --------------------------

$time_total = time;

if ( @ARGV < 4 ) {
    &printHelp();
}

$command_line = "$0 ";
for ( $j = 0; $j < @ARGV; ++$j ) {
    $command_line .= "$ARGV[ $j ] ";
}

&analyzeCommandLine( @ARGV );

if ( $species_tree_file eq "" ) {
    $species_tree_file = $SPECIES_TREE_FILE_DEFAULT;
}

&CheckArguments;

$options_for_makeTree = $DEFAULT_OPTIONS_FOR_MAKETREE;
$options_for_makeTree .= "S".$seed_for_makeTree;


if ( $mode == 1 || $mode == 2 ) {

    if ( $mode == 1 ) {
        $hmm_file = $RIO_HMM_DIRECTORY.$alignment.$SUFFIX_HMM;
        $bsp_file = $RIO_BSP_DIRECTORY.$alignment.$SUFFIX_BOOT_STRP_POS;
        &userErrorCheckForTextFileExistence( $hmm_file );
        &userErrorCheckForTextFileExistence( $bsp_file );
    }

    $pwd_file = $RIO_PWD_DIRECTORY.$alignment.$SUFFIX_PWD;
    $nbd_file = $RIO_NBD_DIRECTORY.$alignment.$SUFFIX_PWD_NOT_BOOTS;
    $alignment = $RIO_ALN_DIRECTORY.$alignment.$ALIGN_FILE_SUFFIX;
    &userErrorCheckForTextFileExistence( $pwd_file );
    &userErrorCheckForTextFileExistence( $nbd_file );
    &userErrorCheckForTextFileExistence( $alignment );
    $no_frags = 0;
    $non_sp = 1;

    $options_for_makeTree .= "F";
}
elsif ( $mode == 3 || $mode == 4 ) {
    if ( $safe_nhx == 1 ) {
        $options_for_makeTree .= "U";
    }
    else {
        $options_for_makeTree .= "#";
    }
    $options_for_makeTree .= "D"; # To calc. and keep pairwise distances.
    $options_for_makeTree .= "B".$boostraps_for_makeTree;

}

if ( $output_HTML == 1 ) {
    $| = 1;
    $complete_description = 1;
    $long_output = 1;

}

if ( $mode == 1 || $mode == 3 || $mode == 4 ) {

    if ( $mode == 1 ) {
        $matrix_n = $MATRIX_FOR_PWD;
    }

    if ( $matrix_n == 0 ) {
        $options_for_makeTree .= "J";
        $matrix = "JTT (Jones et al. 1992)";
    }
    elsif ( $matrix_n == 1 ) { # PAM is makeTree's default.
        $matrix = "PAM (Dayhoff et al. 1978)";
    }
    elsif ( $matrix_n == 2 ) {
        $options_for_makeTree .= "L";
        $matrix = "BLOSUM 62 (Henikoff-Henikoff 92)";
    }
    elsif ( $matrix_n == 3 ) {
        $options_for_makeTree .= "M";
        $matrix = "mtREV24 (Adachi-Hasegawa 1996)";
    }
    elsif ( $matrix_n == 5 ) {
        $options_for_makeTree .= "T";
        $matrix = "VT (Mueller-Vingron 2000)";
    }
    elsif ( $matrix_n == 6 ) {
        $options_for_makeTree .= "W";
        $matrix = "WAG (Whelan-Goldman 2000)";
    }
    else {
        &dieWithUnexpectedError( "Failed sanity check" );
    }
}


# This creates the temp directory:
# --------------------------------

$ii = 0;

$time = time;

if ( $temp_dir eq "" ) {
    $temp_dir = $TEMP_DIR_DEFAULT.$time.$ii;
}
else {
    $temp_dir = $temp_dir.$ii;
}

# If the chosen name is taken, fall back to the default naming scheme
# with an increasing counter until an unused name is found.
while ( -e $temp_dir ) {
    $ii++;
    $temp_dir = $TEMP_DIR_DEFAULT.$time.$ii;
}

mkdir( $temp_dir, 0700 ) || &dieWithUnexpectedError( "Could not create \"$temp_dir\"" );

unless ( ( -e $temp_dir ) && ( -d $temp_dir ) ) {
    &dieWithUnexpectedError( "\"$temp_dir\" does not exist, or is not a directory" );
}



# The analysis starts here:
# -------------------------

$dorio_outfile = $temp_dir."/".$DO_RIO_TEMP_OUTFILE;

$output_dir = dirname( $outfile );

unless ( ( -e $output_dir ) && ( -d $output_dir ) ) {
    &userError( "Outfile directory (\"$output_dir\") does not exist,\n or is not a directory." );
}

if ( $mode == 1 || $mode == 3 ) {
    $query_name = substr( $query_name, 0, $LENGTH_OF_NAME - 10 );
}


if ( $mode == 1 || $mode == 3 ) {

    # Prepares the query file:
    # ------------------------
    $query_name = &seqFile2CleanedUpFastaFile( $seqX_file,
                                               "$temp_dir/QUERY_SEQ",
                                               $query_name );
    if ( $query_name eq "" ) {
        &userError( "Query file \"$seqX_file\") does not appear to contain a valid name\n and/or \"-N\" option has not been used." );
    }

    if ( $mode == 3 ) {
        # Prepares the HMM:
        # -----------------
        if ( $hmm_file eq "" ) {
            $hmm_file = $temp_dir."/".$TEMP_HMM_FILE;
            if ( $hmm_name ne "" ) {
                &executeHmmfetch( $PFAM_HMM_DB, $hmm_name, $hmm_file );
            }
            elsif ( $seed_aln_for_hmmbuild ne "" ) {
                $command_line_for_hmmbuild = &executeHmmbuild( $seed_aln_for_hmmbuild, $hmm_file );
            }
        }

    }
}




# This might remove non SWISS PROT seqs, TreMBL fragments,
# and seqs from species not in $species_names_file from the alignment:
# --------------------------------------------------------------------
if ( $mode == 3 || $mode == 4 ) {
    #if ( $do_not_removeSeqsFromPfamAlign != 1 ) {

    if ( $mode == 3 ) {
        &removeSeqsFromPfamAlign( $alignment,
                                  $temp_dir."/ALIGN2",
                                  " ",
                                  $species_names_file,
                                  $non_sp,
                                  $no_frags );
    }
    else {
        &removeSeqsFromPfamAlign( $alignment,
                                  $temp_dir."/ALIGN2",
                                  $query_name,
                                  $species_names_file,
                                  $non_sp,
                                  $no_frags );
    }

}



# If necessary, this aligns the query to the pfam alignment
# using hmmsearch, p7extract.pl, multifetch.pl, and hmmalign
# from the HMMER package:
# ----------------------------------------------------------
if ( $mode == 1 || $mode == 3 ) {
    if ( $mode == 1 ) {

        $f = &alignWithHmmalign( $alignment,
                                 $temp_dir."/QUERY_SEQ",
                                 $hmm_file,
                                 $temp_dir."/HMMALIGNOUT",
                                 1 ); # --mapali

    }
    else {

        $f = &alignWithHmmalign( $temp_dir."/ALIGN2",
                                 $temp_dir."/QUERY_SEQ",
                                 $hmm_file,
                                 $temp_dir."/HMMALIGNOUT",
                                 0 ); # --withali

    }
    if ( $f != 1 ) {
        # Reduce $alignment to its base name without suffix for the message.
        if ( $alignment =~ /.+\/(.+)/ ) {
            $alignment = $1;
        }
        if ( $alignment =~ /(.+)\..+/ ) {
            $alignment = $1;
        }
        &cleanUpTempDir();
        if ( $output_HTML == 1 ) {
            &exitWithWarning( "query sequence does not contain sufficient similarity to the \"$alignment\" domain", 1 );
        }
        else {
            &exitWithWarning( "Query sequence does not contain sufficient similarity to the \"$alignment\" domain" );
        }
    }


    # In case query contains more than one of the same domain:

    @complete_names = &getCompleteName( $temp_dir."/HMMALIGNOUT", $query_name );

    if ( @complete_names < 1 ) {
        &dieWithUnexpectedError( "Could not find \"$query_name in $temp_dir"."/HMMALIGNOUT\"" );
    }
}
elsif ( $mode == 2 || $mode == 4 ) {
    # Here, this is just for checking:
    if ( $mode == 2 ) {
        @complete_names = &getCompleteName( $alignment, $query_name );
    }
    elsif ( $mode == 4 ) {
        @complete_names = &getCompleteName( $temp_dir."/ALIGN2", $query_name );
    }
    if ( @complete_names < 1 ) {
        # Report the file that was actually searched in this mode
        # (HMMALIGNOUT is not produced in modes 2 and 4).
        my $searched_file = ( $mode == 2 ) ? $alignment : $temp_dir."/ALIGN2";
        &dieWithUnexpectedError( "Could not find \"$query_name\" in \"$searched_file\"" );
    }
    @complete_names = ();
    $complete_names[ 0 ] = $query_name;
}

if ( $parallel == 1 ) {
    &readInNodesList();
    &pingNodes();
    $processors = scalar( @nodelist );
    if ( $processors < 2 ) {
        # Not enough usable nodes: fall back to the serial code path.
        # The block sizes are not needed then, and computing them with
        # zero processors would be a division by zero.
        $parallel = 0;
    }
    else {
        if ( $processors > $BOOTSTRAPS ) {
            $processors = $BOOTSTRAPS;
        }
        # Computed after the cap above so that the block size is never
        # left at 0 when there are more nodes than bootstraps.
        $block_size    = int( $BOOTSTRAPS / $processors );
        $larger_blocks = $BOOTSTRAPS - ( $block_size * $processors ); # number of blocks which have a size of
                                                                       # block_size + 1
    }
}


# This opens the output file:
# ---------------------------
if ( $output_HTML != 1 ) {
    open( OUT, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
}

# This starts printing to the output file:
# ----------------------------------------
&printHeader();



# This loop goes through the different domains of the query
# which aligned to the alignment (in modes 2 and 4 this can
# obviously be only one):
# -----------------------------------------------------------
for ( $jj = 0; $jj < @complete_names; ++$jj ) {

    if ( $mode == 1 ) {
        # Moves the query to the last line(s) of the alignment.
        # Removes other querie domains $complete_names[i]-- for which i != $jj
        # --------------------------------------------------------------------

        &moveToLast( $complete_names[ $jj ],
                     $temp_dir."/HMMALIGNOUT",
                     $temp_dir."/MOVETOLASTOUT",
                     \@complete_names );

    }

    if ( $mode == 1 || $mode == 3 ) {
        if ( $mode == 1 ) {
            @temp_array = &pfam2phylipMatchOnly( $temp_dir."/MOVETOLASTOUT",
                                                 $temp_dir."/ALIGN2_PHYLIP_MO",
                                                 0 );
        }
        else {
            @temp_array = &pfam2phylipMatchOnly( $temp_dir."/HMMALIGNOUT",
                                                 $temp_dir."/ALIGN2",
                                                 1 );
        }
        $length_of_alignment      = $temp_array[ 0 ];
        $length_of_orig_alignment = $temp_array[ 1 ];
        $number_of_seqs_in_aln    = $temp_array[ 2 ];
    }
    elsif ( $mode == 2 || $mode == 4 ) {

        $query_name = $complete_names[ 0 ];

        if ( $mode == 4 ) {
            if ( !&startsWithSWISS_PROTname( $query_name ) ) {
                # Query is not a SWISS-PROT sequence.
                $query_name = &getCompleteNameForTrEMBLquerySeq( $temp_dir."/ALIGN2",
                                                                 $query_name );
            }

            $number_of_seqs_in_aln = &countSeqsInPfamAlign( $temp_dir."/ALIGN2" );
        }
        else {
            if ( !&startsWithSWISS_PROTname( $query_name ) ) {
                # Query is not a SWISS-PROT sequence.
                $query_name = &getCompleteNameForTrEMBLquerySeq( $alignment,
                                                                 $query_name );
            }
            $number_of_seqs_in_aln = &countSeqsInPfamAlign( $alignment );
        }

    }

    if ( $number_of_seqs_in_aln < $MIN_NUMBER_OF_SEQS_IN_ALN ) {
        &cleanUpTempDir();
        if ( $output_HTML == 1 ) {
            &exitWithWarning( "Removal of sequences resulted in an alignment with less than $MIN_NUMBER_OF_SEQS_IN_ALN sequences ($number_of_seqs_in_aln)", 1 );
        }
        else {
            &exitWithWarning( "Removal of sequences resulted in an alignment with less than $MIN_NUMBER_OF_SEQS_IN_ALN sequences ($number_of_seqs_in_aln)" );
        }
    }


    if ( $mode == 1 ) {

        unlink( $temp_dir."/ALIGN2_BOOTSTRAPPED" );

        if ( $parallel == 1 ) {
            &executeBootstrap_cz( $BOOTSTRAPS,
                                  $bsp_file,
                                  $temp_dir."/ALIGN2_PHYLIP_MO",
                                  $temp_dir."/ALIGN2_BOOTSTRAPPED",
                                  $processors );

        }
        else {

            &executeBootstrap_cz( $BOOTSTRAPS,
                                  $bsp_file,
                                  $temp_dir."/ALIGN2_PHYLIP_MO",
                                  $temp_dir."/ALIGN2_BOOTSTRAPPED" );

        }


        # Remember where we are so that we can return after the tree
        # calculation. chomp removes only the trailing newline of the
        # `pwd` output; directory names containing blanks stay intact.
        $current_dir = `pwd`;
        chomp( $current_dir );

        chdir ( $temp_dir ) || &dieWithUnexpectedError( "Could not chdir to \"$temp_dir\"" );


        if ( $parallel == 1 ) {

            my $number = 0;

            system( $RIO_SLAVE_DRIVER,
                    $block_size,
                    $larger_blocks,
                    $temp_dir."/ALIGN2_BOOTSTRAPPED",
                    $matrix_n,
                    $complete_names[ $jj ],
                    $pwd_file,
                    $temp_dir,
                    $seed_for_makeTree,
                    @nodelist )
            && &dieWithUnexpectedError( "Could not execute \"$RIO_SLAVE_DRIVER\"" );

            # Wait until a "FINISHED_<n>" file exists for each of the
            # $processors jobs. Poll once per second instead of spinning
            # on the file system.
            while ( grep { !( -e "FINISHED_$_" ) } ( 0 .. $processors - 1 ) ) {
                sleep( 1 );
            }

            sleep( 1 );

            system( "mv",
                    "MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX."0",
                    "MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX )
            && &dieWithUnexpectedError( "$!" );

            # Append the tree files of the remaining jobs to the first one.
            for ( $number = 1; $number < $processors; $number++ ) {
                system( "cat MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX$number >> MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX" )
                && &dieWithUnexpectedError( "$!" );
                if ( unlink( "MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX$number" ) != 1 ) {
                    &dieWithUnexpectedError( "Could not delete \"MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX$number" );
                }
            }

            # Sanity check: Counts ";" in "MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX".
            if ( `grep -c ';' MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX` != $BOOTSTRAPS ) {
                &dieWithUnexpectedError( "\"MAKETREEOUT$MULTIPLE_TREES_FILE_SUFFIX\" does not contain $BOOTSTRAPS \";\"" );
            }

            for ( $number = 0; $number < $processors; $number++ ) {
                if ( unlink( "FINISHED_$number" ) != 1 ) {
                    &dieWithUnexpectedError( "Could not delete \"FINISHED_$number\"" );
                }
            }

            &executeConsense( "MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX );
            unlink( "outfile", "intree" );

            system( "mv", "outtree", "MAKETREEOUT.nhx" )
            && &dieWithUnexpectedError( "$!" );


        }
        else {
            $time_dqopuzzle = time; #time
            &executePuzzleDQObootstrapped( "ALIGN2_BOOTSTRAPPED", $matrix_n );
            $time_dqopuzzle = time - $time_dqopuzzle; #time
            $time_dqopuzzleT += $time_dqopuzzle; #time

            system( "mv", "ALIGN2_BOOTSTRAPPED.dist", "DISTs_TO_QUERY" )
            && &dieWithUnexpectedError( "$!" );
        }


        &executePuzzleDQO( "ALIGN2_PHYLIP_MO", $matrix_n );

        unlink( "ALIGN2_PHYLIP_MO" );

        system( "mv", "ALIGN2_PHYLIP_MO.dist", "DIST_TO_QUERY" )
        && &dieWithUnexpectedError( "$!" );

        if ( $parallel != 1 ) {
            $time_addingdists = time;
            &addDistsToQueryToPWDfile( $pwd_file,
                                       $temp_dir."/DISTs_TO_QUERY",
                                       $temp_dir."/PWD_INC_QUERY",
                                       $complete_names[ $jj ] );


            $time_addingdists = time - $time_addingdists;
            $time_addingdistsT += $time_addingdists;
        }
        &addDistsToQueryToPWDfile( $nbd_file,
                                   $temp_dir."/DIST_TO_QUERY",
                                   $temp_dir."/NBD_INC_QUERY",
                                   $complete_names[ $jj ] );

    }

    if ( $mode == 2 ) {
        $current_dir = `pwd`;
        chomp( $current_dir );
        chdir ( $temp_dir )
        || &dieWithUnexpectedError( "Could not chdir to \"$temp_dir\"" );

    }


    if ( $parallel != 1 ) {
        unlink( $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX );
    }

    $time_tree_calc = time;

    # This calculates the trees
    # -------------------------

    if ( $mode == 1 || $mode == 2 ) {

        if ( $mode == 1 ) {

            &executeNeighbor( $temp_dir."/NBD_INC_QUERY",
                              0,
                              0,
                              0,
                              1 );

            unlink( "outfile" );
            system( "mv", "outtree", "NBD_NJ_TREE" )
            && &dieWithUnexpectedError( "$!" );
            if ( $parallel != 1 ) {
                &executeMakeTree( $options_for_makeTree,
                                  $temp_dir."/PWD_INC_QUERY",
                                  $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX,
                                  $temp_dir."/maketree_tempdir" );
            }

        }
        else {
            &executeNeighbor( $nbd_file,
                              0,
                              0,
                              0,
                              1 );

            unlink( "outfile" );
            system( "mv", "outtree", "NBD_NJ_TREE" )
            && &dieWithUnexpectedError( "$!" );

            &executeMakeTree( $options_for_makeTree,
                              $pwd_file,
                              $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX,
                              $temp_dir."/maketree_tempdir" );

        }

        chdir( $current_dir )
        || &dieWithUnexpectedError( "Could not chdir to \"$current_dir\"" );


    }
    elsif ( $mode == 3 || $mode == 4 ) {
        &executeMakeTree( $options_for_makeTree,
                          $temp_dir."/ALIGN2",
                          $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX,
                          $temp_dir."/maketree_tempdir" );

        unlink( $temp_dir."/MAKETREEOUT".$ALIGN_FILE_SUFFIX );
    }


    $time_tree_calc = time - $time_tree_calc;
    $time_tree_calcT += $time_tree_calc;

    if ( $keep == 1 ) {

        system( "cp", $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX, $outfile.$TREE_FILE_SUFFIX );
        system( "cp", $temp_dir."/MAKETREEOUT".$LOG_FILE_SUFFIX, $outfile.$LOG_FILE_SUFFIX );
        system( "cp", $temp_dir."/MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX, $outfile.$MULTIPLE_TREES_FILE_SUFFIX );
        if ( $mode == 1 || $mode == 2 ) {
            system( "cp", $temp_dir."/NBD_NJ_TREE", $outfile."-NJ".$TREE_FILE_SUFFIX );
        }

    }

    unlink( $temp_dir."/ALIGN2" );

    $multiple_trees_file    = $temp_dir."/MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX;
    $maketree_out_tree_file = $temp_dir."/MAKETREEOUT".$TREE_FILE_SUFFIX;
    $distance_matrix_file   = $temp_dir."/MAKETREEOUT".$SUFFIX_PWD_NOT_BOOTS;


    if ( $mode == 1 || $mode == 3 ) {
        $query_name = $complete_names[ $jj ];
    }

    $options_for_DoRIO = "";

    # This will result in saving of the annotated consensus tree:
    # -----------------------------------------------------------
    if ( $safe_nhx == 1 ) {
        my $number = $jj + 1;
        if ( @complete_names > 1 ) {
            $outfile_annot_nhx_tree = $outfile.$ADDITION_FOR_RIO_ANNOT_TREE."-".$number.$TREE_FILE_SUFFIX;
        }
        else {
            $outfile_annot_nhx_tree = $outfile.$ADDITION_FOR_RIO_ANNOT_TREE.$TREE_FILE_SUFFIX;
        }
    }



    if ( $sort > 2 ) {
        if ( $mode == 3 || $mode == 4 ) {
            $options_for_DoRIO .= " D=".$distance_matrix_file;
        }
        elsif ( $mode == 1 ) {
            $options_for_DoRIO .= " d=".$temp_dir."/DIST_TO_QUERY";
        }
        elsif ( $mode == 2 ) {
            $options_for_DoRIO .= " D=".$nbd_file;
        }
    }
    $options_for_DoRIO .= " M=".$multiple_trees_file;
    $options_for_DoRIO .= " 'N=".$query_name."'";
    $options_for_DoRIO .= " S=".$species_tree_file;
    $options_for_DoRIO .= " O=".$dorio_outfile;
    $options_for_DoRIO .= " P=".$sort;
    $options_for_DoRIO .= " L=".$t_orthologs;
    $options_for_DoRIO .= " B=".$t_sn;
    $options_for_DoRIO .= " U=".$t_orthologs_dc;
    $options_for_DoRIO .= " X=".$warn_more_than_one_ortho;
    $options_for_DoRIO .= " Y=".$warn_no_orthos;
    $options_for_DoRIO .= " Z=".$warn_one_ortho;

    if ( $mode == 1 || $mode == 2 ) {
        $options_for_DoRIO .= " T=".$temp_dir."/NBD_NJ_TREE";
        $options_for_DoRIO .= " t=".$maketree_out_tree_file;
    }
    elsif ( $mode == 3 || $mode == 4 ) {
        if ( $safe_nhx == 1 ) { # Added 09/04/03.
            $options_for_DoRIO .= " T=".$maketree_out_tree_file;
        }
    }

    if ( $safe_nhx == 1 ) {
        $options_for_DoRIO .= " I";
    }
    if ( $output_ultraparalogs == 1 ) {
        $options_for_DoRIO .= " p";
        $options_for_DoRIO .= " v=".$t_ultra_paralogs;
    }

    $time_rio = time;

    &executeDoRIO( $options_for_DoRIO );

    $time_rio = time - $time_rio;
    $time_rioT += $time_rio;

    unless ( ( -s $dorio_outfile ) && ( -f $dorio_outfile ) && ( -T $dorio_outfile ) ) {
        close( OUT );
        unlink( $outfile );
        &dieWithUnexpectedError( "failure during execution of RIO (no output generated)" );
    }

    if ( $safe_nhx == 1 ) {
        system( "mv",
                $temp_dir."/".$DO_RIO_TEMP_OUTFILE.$ADDITION_FOR_RIO_ANNOT_TREE.$TREE_FILE_SUFFIX,
                $outfile_annot_nhx_tree )
        && &dieWithUnexpectedError( "$!" );
    }


    open( IN, "$dorio_outfile" )
    || &dieWithUnexpectedError( "Cannot open file \"$dorio_outfile\"" );

    $saw_distance_values = 0;
    $saw_ultra_paralogs = 0;
    $printed_ultra_paralogs = 0;
    $print_header_for_orthologies = 1;
    $print_header_for_s_paralogs = 1;




    # This generates the report
    # -------------------------
    # The DoRIO output is read line by line; the $saw_* flags record which
    # section of that output the current line belongs to.

    W: while ( $return_line = <IN> ) {

        if ( $return_line =~ /distance values:/i ) {
            $saw_distance_values = 1;
            &printTitleForDistanceValues();
        }
        elsif ( $return_line =~ /ultra paralogs/i ) {
            $saw_ultra_paralogs = 1;
        }
        elsif ( $return_line =~ /^mean bootstrap/i ) {
            &printMeanBootstraps();
        }
        elsif ( $return_line =~ /sort priority\s*:\s*(.+)/i ) {
            $sort_priority = $1;
        }
        elsif ( $return_line =~ /ext nodes\s*:\s*(.+)/i ) {
            $ext_nodes_in_trees_analyzed = $1 - 1; # One seq is query.
        }
        elsif ( $return_line =~ /bootstraps\s*:\s*(\S+)/i ) {
            # The options are printed once, after the last domain.
            if ( $jj == @complete_names - 1 ) {
                $bootstraps = $1;
                if ( $output_HTML == 1 ) {
                    $| = 1;
                }
                &printOptions();
                last W;
            }
        }
        elsif ( $saw_distance_values != 1
        && $saw_ultra_paralogs != 1
        && $return_line =~ /(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s*(\S*)/ ) {
            $ortho_name = $1;
            $orthos = $2;
            $subtree_neighbors = $3;
            $s_orthos = $4;
            $dist = $5;

            if ( $print_header_for_orthologies == 1 ) {
                &printHeaderForOrthologies();
                $print_header_for_orthologies = 0;
            }
            &printOrthologies();
        }
        elsif ( $saw_distance_values != 1
        && $saw_ultra_paralogs != 1
        && $return_line =~ /^\s*-\s*$/ ) {
            $ortho_name = "-";
            $orthos = 0;
            $s_orthos = 0;
            $dist = 0;
            if ( $print_header_for_orthologies == 1 ) {
                &printHeaderForOrthologies();
                $print_header_for_orthologies = 0;
            }
            &printOrthologies();
        }
        elsif ( $output_ultraparalogs == 1
        && $saw_ultra_paralogs == 1
        && $return_line =~ /(\S+)\s+(\S+)\s+(\S+)/ ) {
            $s_para_name = $1;
            $s_paras = $2;
            $dist = $3;
            if ( $print_header_for_s_paralogs == 1 ) {
                &printHeaderForSparalogs();
                $print_header_for_s_paralogs = 0;
            }
            &printUltraParlogs();
            $printed_ultra_paralogs = 1;
        }
        elsif ( $output_ultraparalogs == 1
        && $saw_ultra_paralogs == 1
        && $return_line =~ /^\s*-\s*$/ ) {
            &printNoUltraParalogs();
        }
        elsif ( $return_line =~ /Bootstraps/ ) {
            $saw_distance_values = 0;
        }
        elsif ( $saw_distance_values == 1 && $saw_ultra_paralogs != 1 ) {
            &printDistanceValues();
        }

    }
    close( IN );

} # End of for loop going through possible
  # multiple matches to the same alignment/model.

if ( $output_HTML != 1 ) {
    close( OUT );
}

&cleanUpTempDir();

if ( $output_HTML != 1 ) {
    print( "\n\nrio.pl successfully terminated.\nOutput written to: $outfile\n\n" );
}

exit( 0 );









# ===========================================================
# Methods
# -----------------------------------------------------------




# -----------------------------------------------------------
# Parallization related
# -----------------------------------------------------------



# Reads the node names from file $NODE_LIST into @nodelist:
# the first whitespace-delimited word of every non-blank line is
# appended to the list.
# Last modified: 02/02/02
sub readInNodesList {

    &testForTextFilePresence( $NODE_LIST );

    open( NIN, "$NODE_LIST" ) || &dieWithUnexpectedError( "Cannot open file \"$NODE_LIST\"" );

    while ( <NIN> ) {
        if ( $_ =~ /(\S+)/ ) {
            push( @nodelist, $1 );
        }
    }
    close( NIN );
    return;
}



# Reduces @nodelist to the nodes which answer a TCP ping within
# two seconds.
# Last modified: 02/02/02
sub pingNodes {
    my @temp_node_list = ();
    my $p = Net::Ping->new( "tcp", 2 ); # or "udp"

    foreach my $n ( @nodelist ) {
        # ping() returns 1 for a reachable host, 0 for an unreachable one
        # and undef if the name cannot be resolved. Only a true value means
        # the node is usable; a defined() test would also accept the 0.
        if ( $p->ping( $n ) ) {
            push( @temp_node_list, $n );
        }
    }
    $p->close();
    @nodelist = @temp_node_list;
    return;

}




# -----------------------------------------------------------
# Output related
# -----------------------------------------------------------


# Prints the report header (program version and the input files used in
# the current mode) to OUT. Prints nothing for HTML output.
# Last modified: 03/07/01
sub printHeader {

    if ( $output_HTML != 1 ) {
        print OUT "RIO - Resampled Inference of Orthologs\n";
        print OUT "Version: $VERSION\n";
        print OUT "------------------------------------------------------------------------------\n";

        print OUT "Pfam alignment file : $alignment\n";
        if ( $mode == 3 ) {
            print OUT "Pfam alignment description : ".&getDescriptionFromPfam( $alignment )."\n";
        }
        if ( $mode == 1 || $mode == 2 ) {
            print OUT "Bootstrapped pairwise distances file : $pwd_file\n";
            print OUT "Not bootstrapped pairwise distances file: $nbd_file\n";
            print OUT "Bootstrap positions file : $bsp_file\n";
        }
        if ( $mode == 1 || $mode == 3 ) {
            if ( $seed_aln_for_hmmbuild ne "" ) {
                print OUT "HMM : built based on $seed_aln_for_hmmbuild\n";
            }
            elsif ( $hmm_name ne "" ) {
                print OUT "HMM : $hmm_name\n";
            }
            else {
                print OUT "HMM : $hmm_file\n";
            }
            print OUT "Query file : $seqX_file\n";
        }
        print OUT "==============================================================================\n\n";
    }

} ## printHeader




# Last modified: 03/07/01
sub printHeaderForOrthologies {

    if ( $output_HTML != 1 ) {
        if ( $jj > 0 ) {
            print OUT "\n\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n\n";
        }

        print OUT "Query : $query_name\n\n";

        if ( @complete_names > 1 ) {
            my $size = @complete_names;
            my $number = $jj + 1;
            print OUT "More than one region of the query were aligned to the profile HMM.\n";
            print OUT "This is for domain #$number out of $size.\n\n";
        }

        print OUT "Number (in %) of observed orthologies (o), \"subtree-neighborings\" (n),\n";
        print OUT "and super-orthologies (s) to query in bootstrapped trees, evolutionary\n";
        print OUT "distance to query (as average number of aa replacements per residue):\n\n";
        if ( $long_output != 1 ) {
            print OUT "Sequence Description o[%] n[%] s[%] distance\n";
            print OUT "-------- ----------- ---- ---- ---- --------\n";
        }
        else {
            print OUT "Sequence Description o[%] n[%] s[%] distance\n";
            print OUT "-------- ----------- ---- ---- ---- --------\n";
        }
    }
    else {
        if ( $jj > 0 ) {
print "\n"; + print "

 

\n"; + print "
\n"; + print "

 

\n"; + } + + if ( @complete_names > 1 ) { + my $size = @complete_names; + my $number = $jj + 1; + print "

More than one region of the query were aligned to the profile HMM. \n"; + print "This is for domain #$number out of $size.

\n"; + } + print "

Query : $query_name

\n"; + print "

Orthologies, subtree-neighborings, super-orthologies

\n"; + + print "

Number (in %) of observed orthologies (o), \"subtree-neighborings\" (n), \n"; + print "and super-orthologies (s) to query in bootstrapped trees, evolutionary \n"; + print "distance to query (as average number of aa replacements per residue):

\n"; + if ( $ortho_name ne "-" ) { + print "\n"; + + print "\n"; + } + } + +} ## printHeaderForOrthologies + + + +# Last modified: 10/15/01 +sub printHeaderForSparalogs { + + if ( $output_HTML != 1 ) { + print OUT "\nUltra-paralogs\n"; + print OUT "--------------\n"; + print OUT "Number (in %) of observed ultra-paralogies (up) to query\n"; + print OUT "in bootstrapped trees, evolutionary distance to query (as average number\n"; + print OUT "of aa replacements per residue):\n\n"; + if ( $long_output != 1 ) { + print OUT "Sequence Description up[%] distance\n"; + print OUT "-------- ----------- ----- --------\n"; + } + else { + print OUT "Sequence Description up[%] distance\n"; + print OUT "-------- ----------- ----- --------\n"; + } + } + else { + print "

Ultra-paralogs

\n"; + print "

Number (in %) of observed ultra-paralogies (up) to query \n"; + print "in bootstrapped trees, evolutionary distance to query (as average number \n"; + print "of aa replacements per residue):

\n"; + print "
Sequence Description o[%] n[%] s[%]   distance
\n"; + print "\n"; + + } + +} ## printHeaderForSparalogs + + + +# Last modified: 03/07/01 +sub printOrthologies { + my @cut = (); + my $i = 0; + my $descp = ""; + $orthos = &roundToInt( $orthos ); + $s_orthos = &roundToInt( $s_orthos ); + + if ( $sort > 10 ) { + $subtree_neighbors = &roundToInt( $subtree_neighbors ); + } + + if ( ( $description == 1 || $complete_description == 1 ) + && $ortho_name ne "-" ) { + + if ( $non_sp != 1 ) { + if ( &startsWithSWISS_PROTname( $ortho_name ) ) { + $descp = &getDescriptionFromSWISSPROT_ACDEOSfile( $SWISSPROT_ACDEOS_FILE, $ortho_name ); + } + else { + $descp = "-"; + } + } + else { + if ( &startsWithSWISS_PROTname( $ortho_name ) ) { + $descp = &getDescriptionFromSWISSPROT_ACDEOSfile( $SWISSPROT_ACDEOS_FILE, $ortho_name ); + } + else { + $descp = &getDescriptionFromTrEMBL_ACDEOSfile( $TREMBL_ACDEOS_FILE, $ortho_name ); + } + } + + if ( $output_HTML != 1 ) { + if ( $long_output == 1 ) { + @cut = &cutDescription( $descp, 73 ); + } + else { + @cut = &cutDescription( $descp, 33 ); + } + $descp = $cut[ 0 ]; + } + } + if ( $descp eq "" ) { + $descp = "-"; + } + + if ( $output_HTML != 1 ) { + + if ( $ortho_name eq "-" ) { + print OUT "\nNO ORTHOLOGS in alignment with the current thresholds for output\n"; + } + elsif ( $dist ne "-" ) { + if ( $long_output == 1 ) { + print OUT sprintf "%-24.24s%-74.74s%5s%5s%5s%10.6f", $ortho_name,$descp,$orthos,$subtree_neighbors,$s_orthos,$dist; + } + else { + print OUT sprintf "%-24.24s%-34.34s%5s%5s%5s%10.6f", $ortho_name,$descp,$orthos,$subtree_neighbors,$s_orthos,$dist; + } + } + else { + if ( $long_output == 1 ) { + print OUT sprintf "%-24.24s%-74.74s%5s%5s%5s%10.10s", $ortho_name,$descp,$orthos,$subtree_neighbors,$s_orthos,$dist; + } + else { + print OUT sprintf "%-24.24s%-34.34s%5s%5s%5s%10.10s", $ortho_name,$descp,$orthos,$subtree_neighbors,$s_orthos,$dist; + } + } + if ( $complete_description == 1 ) { + for ( $i = 1; $i < @cut; ++$i ) { + print OUT "\n"; + if ( $long_output == 1 ) { + print 
OUT sprintf " %-74.74s", $cut[ $i ]; + } + else { + print OUT sprintf " %-34.34s", $cut[ $i ]; + } + } + } + print OUT "\n"; + } + else { + if ( $ortho_name eq "-" ) { + print "

NO ORTHOLOGS in alignment with the current thresholds for output

\n"; + } + else { + $ortho_name = &replaceNameWithLinkToExpasy( $ortho_name ); + print "\n"; + } + } + +} ## printOrthologies + + + +sub replaceNameWithLinkToExpasy { + my $name = $_[ 0 ]; + + if ( $name =~ /(.+)_(.+)\/(.+)/ ) { + my $desc = $1; + my $spec = $2; + my $numbers = $3; + if ( length( $desc ) <= 4 ) { + $name = "".$desc."_".$spec."\/".$numbers; + } + else { + $name = "".$desc."_".$spec."\/".$numbers; + } + } + + return $name; + +} ## replaceNameWithLinkToExpasy + + + + +# Last modified: 10/15/01 +sub printUltraParlogs { + my @cut = (); + my $i = 0; + my $descp = ""; + $s_paras = &roundToInt( $s_paras ); + + if ( ( $description == 1 || $complete_description == 1 ) + && $s_para_name ne "-" ) { + + if ( $non_sp != 1 ) { + if ( &startsWithSWISS_PROTname( $s_para_name ) ) { + $descp = &getDescriptionFromSWISSPROT_ACDEOSfile( $SWISSPROT_ACDEOS_FILE, $s_para_name ); + } + else { + $descp = "-"; + } + } + else { + if ( &startsWithSWISS_PROTname( $s_para_name ) ) { + $descp = &getDescriptionFromSWISSPROT_ACDEOSfile( $SWISSPROT_ACDEOS_FILE, $s_para_name ); + } + else { + $descp = &getDescriptionFromTrEMBL_ACDEOSfile( $TREMBL_ACDEOS_FILE, $s_para_name ); + } + } + + if ( $output_HTML != 1 ) { + if ( $long_output == 1 ) { + @cut = &cutDescription( $descp, 73 ); + } + else { + @cut = &cutDescription( $descp, 33 ); + } + $descp = $cut[ 0 ]; + } + } + if ( $descp eq "" ) { + $descp = "-"; + } + + if ( $output_HTML != 1 ) { + + if ( $dist ne "-" ) { + if ( $long_output == 1 ) { + print OUT sprintf "%-24.24s%-74.74s%5s%10.6f", $s_para_name,$descp,$s_paras,$dist; + } + else { + print OUT sprintf "%-24.24s%-34.34s%5s%10.6f", $s_para_name,$descp,$s_paras,$dist; + } + } + else { + if ( $long_output == 1 ) { + print OUT sprintf "%-24.24s%-74.74s%5s%10.10s", $s_para_name,$descp,$s_paras,$dist; + } + else { + print OUT sprintf "%-24.24s%-34.34s%5s%10.10s", $s_para_name,$descp,$s_paras,$dist; + } + } + if ( $complete_description == 1 ) { + for ( $i = 1; $i < @cut; ++$i ) { + 
print OUT "\n"; + if ( $long_output == 1 ) { + print OUT sprintf " %-74.74s", $cut[ $i ]; + } + else { + print OUT sprintf " %-34.34s", $cut[ $i ]; + } + } + } + print OUT "\n"; + + } + else { + $s_para_name = &replaceNameWithLinkToExpasy( $s_para_name ); + print "\n"; + } + +} ## printUltraParlogs + + + +sub printNoUltraParalogs { + if ( $output_HTML != 1 ) { + print OUT "\nUltra-paralogs\n"; + print OUT "--------------\n"; + print OUT "\nNO ULTRA-PARALOGS in alignment with the current threshold of $t_ultra_paralogs%\n"; + } + else { + print "

Ultra-paralogs

\n"; + print "

NO ULTRA-PARALOGS in alignment with the current threshold of $t_ultra_paralogs%

\n"; + } +} ## printNoUltraParalogs + + + +# Called by method "printOrthologies". +# Last modified: 02/27/01 +sub cutDescription { + my $line = $_[ 0 ]; + my $size = $_[ 1 ]; + my @cut = (); + my $i = 0; + + while ( ( length( $line ) ) > $size ) { + $cut[ $i++ ] = substr( $line, 0, $size ); + $line = substr( $line, $size ); + } + $cut[ $i++ ] = $line; + return @cut; +} ## cutDescription + + + + +# Last modified: 02/27/01 +sub printTitleForDistanceValues { + if ( $output_HTML != 1 ) { + if ( $mode == 1 || $mode == 2 ) { + print OUT "\n\nDistance values (based on NJ tree of original alignment)\n"; + print OUT "--------------------------------------------------------\n"; + } + elsif ( $mode == 3 || $mode == 4 ) { + print OUT "\n\nDistance values (based on ML branch length values on consensus tree)\n"; + print OUT "--------------------------------------------------------------------\n"; + } + } + else { + print "

Distance values (based on NJ tree of original alignment)

\n"; + } + +} ## printTitleForDistanceValues + + + + +# Last modified: 02/27/01 +sub printDistanceValues { + if ( $output_HTML != 1 ) { + print OUT "$return_line"; + } + else { + chomp( $return_line ); + if ( $return_line =~ /WARNING/ ) { + $return_line =~ s/\+\/-/ ± /; + $return_line =~ s/\*/ × /; + print "

$return_line

\n"; + } + elsif ( $return_line =~ /lca\s+is/i ) { + print "

$return_line

\n"; + } + elsif ( $return_line =~ /orthologous/i ) { + print "

$return_line

\n"; + } + elsif ( $return_line =~ /distance\s+of\s+query/i ) { + print "
Sequence Description up[%]   distance
$ortho_name $descp $orthos $subtree_neighbors $s_orthos   $dist
$s_para_name $descp $s_paras   $dist
\n"; + } + if ( $return_line =~ /(.+)=(.+)/ ) { + print "\n"; + } + if ( $return_line =~ /sum\s+/i || $return_line =~ /distance\s+of\s+ortholog\s+to\s+LCA/i ) { + print "
$1 = $2
\n"; + } + } +} ## printDistanceValues + + + + +# Last modified: 02/27/01 +sub printMeanBootstraps { + if ( $output_HTML != 1 ) { + print OUT "\n\n$return_line"; + } + else { + chomp( $return_line ); + $return_line =~ s/\+\/-/ ± /; + print "\n"; + print "

$return_line

\n"; + } +} ## printMeanBootstraps + + + + +# Last modified: 02/12/02 +sub printOptions { + + if ( $output_HTML != 1 ) { + print OUT "\n\n\n==============================================================================\n"; + if ( $number_of_seqs_in_aln >= $MIN_NUMBER_OF_SEQS_IN_ALN ) { + print OUT "RIO options\n"; + print OUT "-----------\n"; + print OUT "Mode : "; + if ( $mode == 1 ) { + print OUT "precalc. pwd files with alignment not containing query (1)\n"; + } + elsif ( $mode == 2 ) { + print OUT "precalc. pwd files with alignment containing query (2)\n"; + } + elsif ( $mode == 3 ) { + print OUT "alignment not containing query (3)\n"; + } + elsif ( $mode == 4 ) { + print OUT "alignment containing query (4)\n"; + } + print OUT "Bootstraps : $bootstraps\n"; + print OUT "Species tree : $species_tree_file\n"; + if ( $safe_nhx == 1 ) { + if ( $mode == 3 || $mode == 4 ) { + if ( @complete_names > 1 ) { + $outfile_annot_nhx_tree =~ s/-\d+\.nhx/-X.nhx/; + print OUT "Saved annotated consensus trees (ML branch lengths) : $outfile_annot_nhx_tree\n"; + } + else { + print OUT "Saved annotated consensus tree (ML branch lengths) : $outfile_annot_nhx_tree\n"; + } + } + elsif ( $mode == 1 || $mode == 2 ) { + if ( @complete_names > 1 ) { + $outfile_annot_nhx_tree =~ s/-\d+\.nhx/-X.nhx/; + print OUT "Saved annotated NJ trees (based on original alignment) : $outfile_annot_nhx_tree\n"; + } + else { + print OUT "Saved annotated NJ tree (based on original alignment) : $outfile_annot_nhx_tree\n"; + } + } + } + print OUT "Threshold for output for orthologies (L=) : $t_orthologs\n"; + print OUT "Threshold for output for subtree-neighborings (B=) : $t_sn\n"; + print OUT "Threshold for distance calc for orthologies (U=) : $t_orthologs_dc\n"; + + print OUT "When to generate warnings:\n"; + print OUT "More than one ortholog: diff. in standard deviations (X=): $warn_more_than_one_ortho\n"; + print OUT "No orthologs : diff. 
in standard deviations (Y=): $warn_no_orthos\n"; + print OUT "One ortholog : factor (Z=): $warn_one_ortho\n"; + if ( $output_ultraparalogs == 1 ) { + print OUT "Output ultra-paralogs (p)\n"; + print OUT "Threshold for ultra-paralogies (v=) : $t_ultra_paralogs\n"; + } + print OUT "Sort priority: $sort_priority\n"; + } + + print OUT "\nOptions for the calculation of the phylgenetic trees\n"; + print OUT "----------------------------------------------------\n"; + if ( $mode == 1 ) { + print OUT "Model for pairwise distance calculations : $matrix\n"; + } + elsif ( $mode == 3 || $mode == 4 ) { + print OUT "Model for pairwise dist and ML branch length calc. : $matrix\n"; + } + if ( $mode == 1 || $mode == 3 || $mode == 4 ) { + print OUT "Columns in alignment used for tree calc : $length_of_alignment\n"; + print OUT "Columns in original alignment : $length_of_orig_alignment\n"; + } + print OUT "Sequences in alignment used for trees (incl query) : $number_of_seqs_in_aln\n"; + + if ( $mode == 3 || $mode == 4 ) { + print OUT "Removed non-SWISS-PROT sequences : "; + if ( $non_sp == 1 ) { + print OUT "no\n"; + } + else { + print OUT "yes\n"; + } + if ( $non_sp == 1 ) { + print OUT "Removed \"TrEMBL fragments\" : "; + if ( $no_frags == 1 ) { + print OUT "yes\n"; + } + else { + print OUT "no\n"; + } + } + } + if ( $mode == 1 || $mode == 2 ) { + print OUT "Prgrm to calc. 
branch lengths for distance values : PHYLIP NEIGHBOR (NJ)\n"; + } + elsif ( $mode == 3 || $mode == 4 ) { + print OUT "Prgrm to calc branch lengths for distance values : TREE-PUZZLE\n"; + } + if ( $seed_aln_for_hmmbuild ne "" ) { + print OUT "HMM was built with hmmbuild using options : $command_line_for_hmmbuild\n"; + } + if ( ( $mode == 3 || $mode == 4 ) && $species_names_file =~ /\S/ ) { + print OUT "File listing species used for tree calculation (G=): $species_names_file\n"; + } + print OUT "Seed for random number generator : $seed_for_makeTree\n"; + print OUT "Options for makeTree : $options_for_makeTree\n"; + + $time_total = time - $time_total; + + print OUT "\nTime and date\n"; + print OUT "-------------\n"; + if ( $mode == 1 ) { + print OUT "Time requirement dqo puzzle : $time_dqopuzzleT s\n"; + } + + print OUT "Time requirement for tree calculation: $time_tree_calcT s\n"; + print OUT "Time requirement for SDI and RIO : $time_rioT s\n"; + print OUT "Total time requirement : $time_total s\n"; + print OUT "Date started : $start_date"; + print OUT ( "Date finished : ".`date` ); + + print OUT "\nCommand line\n"; + print OUT "------------\n"; + print OUT "$command_line\n"; + if ( $parallel == 1 ) { + print OUT "\nProcessors used: @nodelist\n"; + } + } + else { + if ( $printed_ultra_paralogs == 1 ) { + print "\n"; + } + if ( $species_tree_file =~ /.+\/(.+)/ ) { + $species_tree_file = $1; + } + print "

Options

\n"; + print "\n"; + print "\n"; + print "\n"; + print "\n"; + print "\n"; + print "\n"; + print "\n"; + print "\n"; + print "\n"; + print "\n"; + if ( $output_ultraparalogs == 1 ) { + print "\n"; + print "\n"; + } + print "\n"; + print "\n"; + print "\n"; + print "\n"; + print "\n"; + print "\n"; + print "
Bootstraps: $bootstraps
Species tree: $species_tree_file
Threshold for output for orthologies: $t_orthologs
Threshold for output for subtree-neighborings: $t_sn
Threshold for distance calc for orthologies: $t_orthologs_dc
When to generate warnings
More than one ortholog [diff in standard deviations]: $warn_more_than_one_ortho
No orthologs [diff in standard deviations]: $warn_no_orthos
One ortholog [factor]: $warn_one_ortho
Output ultra-paralogs
Threshold for ultra-paralogies: $t_ultra_paralogs
Sort priority: $sort_priority
Model for pairwise distance calculations: $matrix
Columns in alignment used for tree calc: $length_of_alignment
Columns in original alignment: $length_of_orig_alignment
Sequences in alignment used for trees (incl query): $number_of_seqs_in_aln
Seed for random number generator: $seed_for_makeTree
\n"; + + $time_total = time - $time_total; + + print "

 

\n"; + print "\n"; + print "\n"; + print "\n"; + print ( "\n" ); + if ( $parallel == 1 ) { + print "\n"; + } + print "
Time requirement: $time_total s
Date started: $start_date
Date finished: ".`date`."
Number of processors used: ".scalar( @nodelist )."
\n"; + } + +} ## printOptions + + + + + + + + + + +# ----------------------------------------------------------- +# Execution of other programs +# ----------------------------------------------------------- + + + + + +# Two arguments: +# 1. seed +# 2. outfile +# Returns the options used. +# Last modified: 05/11/01 +sub executeHmmbuild { + + my $seed = $_[ 0 ]; + my $outfile = $_[ 1 ]; + my $options = ""; + + &testForTextFilePresence( $seed ); + + $options = getHmmbuildOptionsFromPfam( $seed ); + + $options =~ s/-f//; + $options =~ s/-g//; + $options =~ s/-s//; + $options =~ s/-F//; + $options =~ s/-A//; + $options =~ s/-o\s+\S+//; + $options =~ s/(\s|^)[^-]\S+/ /g; + + if ( $options =~ /--prior/ ) { + my $basename = basename( $seed ); + $basename .= ".PRIOR"; + $options =~ s/--prior/--prior $PRIOR_FILE_DIR$basename/; + } + + # Remove for versions of HMMER lower than 2.2. + if ( $options =~ /--informat\s+\S+/ ) { + $options =~ s/--informat\s+\S+/--informat SELEX/; + } + else { + $options = "--informat SELEX ".$options; + } + + system( "$HMMBUILD $options $outfile $seed" ) + && &dieWithUnexpectedError( "Could not execute \"$HMMBUILD $options $outfile $seed\"" ); + return $options; + +} ## executeHmmbuild. + + + + +# One argument: +# Pfam align name. +# Last modified: 02/26/01 +sub getHmmbuildOptionsFromPfam { + + my $infile = $_[ 0 ]; + my $return_line = ""; + my $result = ""; + + &testForTextFilePresence( $infile ); + + open( GHO, $infile ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" ); + while ( $return_line = ) { + if ( $return_line =~ /^\s*#.*hmmbuild\s+(.+)\s*$/ ) { + $result = $1; + close( GHO ); + return $result; + } + } + close( GHO ); + return $result; + +} ## getHmmbuildOptionsFromPfam + + + + +# Purpose. Aligns a FASTA file to a Pfam alignment using an HMM profile. +# Five arguemnts: +# 1. Pfam flat file name +# 2. Name of FASTA file to append +# 3. HMM profile file name +# 4. outputfile name +# 5. 
#    1 use --mapali, --withali otherwise (in hmmalign)
# Returns 1 if successful, -1 if no alignment was made, which happens when
#   - the E value found in the HMMSEARCH output is larger than
#     $E_VALUE_THRESHOLD, or
#   - the MULTIFETCH output is empty (score too low for a reasonable
#     alignment).
# Dies (via dieWithUnexpectedError) if one of the external programs returns
# a non-zero status, if no E value can be found, or if the intermediate
# files cannot be deleted.
# Intermediate files HMMSEARCHOUT, GDF and MULTIFETCHOUT are written to
# $temp_dir and removed again before returning.
# Last modified: 07/11/01
sub alignWithHmmalign {
    my $alignment  = $_[ 0 ];
    my $query      = $_[ 1 ];
    my $hmm        = $_[ 2 ];
    my $outfile    = $_[ 3 ];
    my $use_mapali = $_[ 4 ];
    my $E          = 2000;
    my $ali        = "--withali";

    if ( $use_mapali == 1 ) {
        $ali = "--mapali";
    }

    &testForTextFilePresence( $alignment );
    &testForTextFilePresence( $query );
    &testForTextFilePresence( $hmm );

    # system() returns 0 on success, hence the "&&" before the error handler.
    system( "$HMMSEARCH $hmm $query > $temp_dir/HMMSEARCHOUT" )
    && &dieWithUnexpectedError( "Could not execute \"$HMMSEARCH $hmm $query > $temp_dir/HMMSEARCHOUT\"" );

    # 2000 is the "nothing found" sentinel returned by getEvalue.
    $E = &getEvalue( "$temp_dir/HMMSEARCHOUT" );
    if ( $E == 2000 ) {
        &dieWithUnexpectedError( "No E-value found in \"$temp_dir/HMMSEARCHOUT\"" );
    }
    elsif ( $E > $E_VALUE_THRESHOLD ) {
        unlink( "$temp_dir/HMMSEARCHOUT" );
        return ( -1 );
    }

    system( "$P7EXTRACT -d $temp_dir/HMMSEARCHOUT > $temp_dir/GDF" )
    && &dieWithUnexpectedError( "Could not execute \"$P7EXTRACT -d $temp_dir/HMMSEARCHOUT > $temp_dir/GDF\"" );

    system( "$MULTIFETCH -d -g $query $temp_dir/GDF > $temp_dir/MULTIFETCHOUT" )
    && &dieWithUnexpectedError( "Could not execute \"$MULTIFETCH -d -g $query $temp_dir/GDF > $temp_dir/MULTIFETCHOUT\"" );

    # Checks if score was too low to have made a reasonable alignment.
    unless ( -s "$temp_dir/MULTIFETCHOUT" ) {
        unlink( "$temp_dir/HMMSEARCHOUT", "$temp_dir/GDF", "$temp_dir/MULTIFETCHOUT" );
        return ( -1 );
    }

    system( "$HMMALIGN -o $outfile $ali $alignment $hmm $temp_dir/MULTIFETCHOUT >/dev/null 2>&1" )
    && &dieWithUnexpectedError( "Could not execute \"$HMMALIGN -o $outfile $ali $alignment $hmm $temp_dir/MULTIFETCHOUT\"" );

    # unlink returns the number of files actually removed.
    if ( unlink( "$temp_dir/HMMSEARCHOUT", "$temp_dir/GDF","$temp_dir/MULTIFETCHOUT" ) != 3 ) {
        &dieWithUnexpectedError( "Could not delete (a) file(s)" );
    }

    return 1;
} ## alignWithHmmalign




# Gets the E value for complete sequences (score includes all domains)
# from a HMMSEARCH output file.
# One argument: the HMMSEARCH output file name
# Returns the E value, 2000 if no E value found
# The E value is taken from the first line after the
# "Sequence Description Score E-value N" header that ends in
# "<non-blank token> <integer>"; the non-blank token is returned.
# Last modified: 07/11/01
sub getEvalue {

    my $infile      = $_[ 0 ];
    my $return_line = "";
    my $flag        = 0;      # 1 once the table header has been seen
    my $E           = 2000;   # sentinel: no E value found

    &testForTextFilePresence( $infile );

    open( E, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
    while ( $return_line = <E> ) {

        # "Sequence Description Score E-value N"
        if ( $return_line =~ /Sequence.+Description.+Score.+E.+value.+N/ ) {
            $flag = 1;
        }
        # "QUERY_HUMAN 657.4 1.3e-198 1"
        elsif ( $flag == 1 && $return_line =~ /\s+(\S+)\s+\d+\s*$/ ) {
            $E = $1;
            close( E );
            return $E;
        }

    }
    close( E );
    return $E;

} ## getEvalue



# Four/Five arguments:
# 1. Number of bootstraps
# 2. bsp (bootstrap positions) file
# 3. Infile (alignment)
# 4. Outfile (bootstrapped according to bsp file)
# 5.
#    Number of processors
# Runs the external program $BOOTSTRAP_CZ. The number of processors is
# appended to the command line only if it is defined and larger than 1.
# Dies (via dieWithUnexpectedError) if the program returns a non-zero status.
# Last modified: 01/30/02
sub executeBootstrap_cz {
    my ( $boots, $bsp_file, $infile, $outfile, $processors ) = @_;

    # Note the argument order expected by the program: infile before bsp file.
    my $command = "$BOOTSTRAP_CZ $boots $infile $bsp_file $outfile";

    if ( defined( $processors ) && ( $processors > 1 ) ) {
        $command .= " $processors";
    }

    system( $command )
    && &dieWithUnexpectedError( "Could not execute \"$command\"" );

} ## executeBootstrap_cz





# One argument:
# options for DoRIO.main.
# Runs $DORIO with the given options; its standard output and standard
# error are discarded. Dies (via dieWithUnexpectedError) on non-zero status.
# Last modified: 02/26/01
sub executeDoRIO {
    my ( $options ) = @_;

    if ( system( "$DORIO $options >/dev/null 2>&1" ) ) {
        &dieWithUnexpectedError( "Could not execute \"$DORIO $options\"" );
    }

    return;

} ## executeDoRIO











# -----------------------------------------------------------
# These deal with the alignment
# -----------------------------------------------------------




# Counts the sequences in a Pfam flat file or
# in a PHYLIP interleaved alignment.
# One argument: Pfam flat file name.
# Returns the number of sequences.
# Last modified: 07/10/01
sub countSeqsInPfamAlign {
    my $infile            = $_[ 0 ];
    my $return_line       = "";
    my $saw_sequence_line = 0;
    my $number_of_seqs    = 0;

    &testForTextFilePresence( $infile );

    open( C, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
    while ( $return_line = <C> ) {

        # Stop at the first line after the first sequence line that is
        # neither a named sequence nor a comment, so that only the first
        # block is counted.
        if ( $saw_sequence_line == 1
        && !&containsPfamNamedSequence( $return_line )
        && !&isPfamCommentLine( $return_line ) ) {
            last;
        }
        # Lines starting with two integers are not counted; this looks like
        # the PHYLIP header line -- TODO confirm against isPfamSequenceLine.
        if ( &isPfamSequenceLine( $return_line )
        && $return_line !~ /^\s*\d+\s+\d+/ ) {
            if ( $saw_sequence_line == 0 ) {
                $saw_sequence_line = 1;
            }
            $number_of_seqs++;
        }
    }
    close( C );
    return $number_of_seqs;

} ## countSeqsInPfamAlign




# This gets the complete name(s) of a sequence from a Pfam alignment.
# I.e. it adds "/xxx-xxx".
# 2 arguments:
# 1. Infile (alignment)
# 2. Name of query (anything from the first "/" on is ignored)
# Returns a String-array of all the complete names found.
# Reading stops as soon as the first complete name is seen a second time.
# Last modified: 03/04/01
sub getCompleteName {

    my $infile         = $_[ 0 ];
    my $query_name     = $_[ 1 ];
    my $return_line    = "";
    my @complete_names = ();
    my $complete_name  = "";
    my $i              = 0;

    &testForTextFilePresence( $infile );

    $query_name =~ s/\/.*//;

    open( INGCN, $infile ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
    while ( $return_line = <INGCN> ) {
        # \Q...\E: the name is matched literally, not as a regular expression.
        if ( $return_line =~ /^\s*\Q$query_name\E(\S+)\s+.+/ ) {
            $complete_name = $query_name.$1;
            if ( $i > 0 && $complete_names[ 0 ] eq $complete_name ) {
                # Now we have seen all of them.
                last;
            }
            $complete_names[ $i++ ] = $complete_name;
        }
    }

    close( INGCN );
    return @complete_names;
} ## getCompleteName




# Removes sequences from a Pfam flat file.
# It can remove all sequences not from species listed in a species names file.
# It can remove all sequences which do not have a SWISS-PROT name (XXXX_XXXXX)
# It can remove all sequences which are "TrEMBL" fragments.
# Six arguments:
# 1. Pfam flat file name
# 2. outfile name
# 3.
#    Name of the query - not to be removed
#    (use " " to not use this functionality)
# 4. species names file (will be ignored if " ")
# 5. 1 to NOT remove non-SWISS_PROT seqs.
# 6. 1 to remove TrEMBL seqs with "(FRAGMENT)" in their DE line.
#    (Only used if non SWISS_PROT seqs will not be removed)
# Returns the number of sequences in the resulting alignment.
# If a query name is given, it returns -1 if query is not found in alignment,
# -10 if the name is not unique.
# Side effects visible here: writes the outfile, fills %Species_names_hash
# through readSpeciesNamesFile, and reads $TREMBL_ACDEOS_FILE when non
# SWISS-PROT sequences are kept or the query has no SWISS-PROT name.
# Names of kept non SWISS-PROT sequences get "_<species>" inserted before
# the "/", and all names are padded with blanks to $LENGTH_OF_NAME characters.
# Last modified: 05/11/01
sub removeSeqsFromPfamAlign {
    my $infile                   = $_[ 0 ];
    my $outfile                  = $_[ 1 ];
    my $query                    = $_[ 2 ];
    my $species_names_file       = $_[ 3 ];
    my $keep_non_sp              = $_[ 4 ];
    my $remove_frags             = $_[ 5 ];
    my $return_line              = "";
    my $name                     = "";
    my $seq                      = "";
    my $saw_sequence_line        = 0;
    my $number_of_seqs           = 0;
    my $saw_query                = 0;
    my $query_given              = 0;
    my $species_names_file_given = 0;
    my $length_of_name           = 0;
    my %AC_OS                    = (); # AC -> species name (TrEMBL)
    my %AC_DE                    = (); # AC -> description (TrEMBL)
    my $AC                       = "";
    my $DE                       = "";
    my $OS                       = "";

    &testForTextFilePresence( $infile );

    if ( $query =~ /\S/ ) {
        $query_given = 1;
    }
    if ( $species_names_file =~ /\S/ ) {
        $species_names_file_given = 1;
        &readSpeciesNamesFile( $species_names_file );
    }

    if ( $keep_non_sp == 1
    || ( $query_given == 1 && !&startsWithSWISS_PROTname( $query ) ) ) {

        &testForTextFilePresence( $TREMBL_ACDEOS_FILE );

        # Fill up hash $AC_OS and $AC_DE.
        # Each usable line has the form "AC;DE;OS".
        open( HH, "$TREMBL_ACDEOS_FILE" ) || &dieWithUnexpectedError( "Cannot open file \"$TREMBL_ACDEOS_FILE\"" );
        while ( $return_line = <HH> ) {
            if ( $return_line =~ /(\S+);([^;]*);(\S+)/ ) {
                $AC_OS{ $1 } = $3;
                if ( $remove_frags == 1 ) {
                    $AC_DE{ $1 } = $2;
                }
            }
        }
        close( HH );
    }

    open( OUT_RNSP, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
    open( IN_RNSP, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
    while ( $return_line = <IN_RNSP> ) {

        if ( $saw_sequence_line == 1
        && !&containsPfamNamedSequence( $return_line )
        && !&isPfamCommentLine( $return_line ) ) {
            # This is just for counting purposes.
            $saw_sequence_line = 2;
        }
        if ( &isPfamSequenceLine( $return_line ) ) {
            if ( $saw_sequence_line == 0 ) {
                $saw_sequence_line = 1;
            }
            $return_line =~ /(\S+)\s+(\S+)/;
            $name = $1;
            $seq  = $2;
            if ( $query_given == 1 && $name eq $query ) {
                $saw_query++;
            }
            if ( ( $query_given == 1 && $name ne $query )
            || $query_given != 1 ) {
                # Any sequence that is not the query: apply the filters.
                if ( !&startsWithSWISS_PROTname( $name ) ) {
                    if ( $keep_non_sp != 1 ) {
                        next;
                    }
                    else {
                        $name =~ /(\S+)\//;
                        $AC = $1;
                        unless( exists( $AC_OS{ $AC } ) ) {
                            # ACs not present in "ACDEOS" file.
                            next;
                        }
                        $OS = $AC_OS{ $AC };
                        if ( !$OS || $OS eq "" ) {
                            &dieWithUnexpectedError( "species for \"$AC\" not found" );
                        }
                        if ( $species_names_file_given == 1 ) {
                            unless( exists( $Species_names_hash{ $OS } ) ) {
                                next;
                            }
                        }
                        if ( $remove_frags == 1 ) {
                            $DE = $AC_DE{ $AC };
                            if ( $DE && $DE =~ /\(FRAGMENT\)/ ) {
                                next;
                            }
                        }
                        $name =~ s/\//_$OS\//;
                    }
                }
                else {
                    if ( $species_names_file_given == 1 ) {
                        if ( $name =~ /_([A-Z0-9]{1,5})/ ) {
                            unless( exists( $Species_names_hash{ $1 } ) ) {
                                next;
                            }
                        }
                        else {
                            # remove everything whose species cannot be determined.
                            next;
                        }
                    }
                }
            }
            elsif ( $query_given == 1 && $name eq $query
            && !&startsWithSWISS_PROTname( $query ) ) {
                # Adding species to non SWISS-PROT query
                $name =~ /(\S+)\//;
                $AC = $1;
                unless( exists( $AC_OS{ $AC } ) ) {
                    # ACs not present in "ACDEOS" file.
                    &userError( "Could not establish species of query.\n Check file \"$TREMBL_ACDEOS_FILE\"." );
                }
                $OS = $AC_OS{ $AC };
                if ( !$OS || $OS eq "" ) {
                    &dieWithUnexpectedError( "species for \"$AC\" not found" );
                }
                $name =~ s/\//_$OS\//;
            }

            $length_of_name = length( $name );

            if ( $length_of_name > ( $LENGTH_OF_NAME - 1 ) ) {
                &userError( "Name \"$name\" is too long." );
            }

            # Pad the name with blanks up to $LENGTH_OF_NAME characters.
            for ( my $j = 0; $j <= ( $LENGTH_OF_NAME - $length_of_name - 1 ); ++$j ) {
                $name .= " ";
            }

            $return_line = $name.$seq."\n";
        }

        print OUT_RNSP $return_line;
        # Counts every line written while still inside the first block.
        if ( $saw_sequence_line == 1 ) {
            $number_of_seqs++;
        }
    }
    close( IN_RNSP );
    close( OUT_RNSP );
    if ( $query_given == 1 ) {
        if ( $saw_query < 1 ) {
            return -1;
        }
        elsif ( $saw_query > 1 ) {
            return -10;
        }
    }
    return $number_of_seqs;

} ## removeSeqsFromPfamAlign




# One argument:
# 1.
#    PWD file
# "Returns" a Hash of Strings (=keys) containing all the names found in PWD file
# (the names are stored as keys, with value 0, in %names_in_pwd_file;
# the sub itself returns nothing).
# Only the first matrix is read: reading stops at the first line that holds
# nothing but an integer after at least one name line was seen. That integer
# must equal the number of names read, otherwise the program dies.
# Last modified: 05/29/01
sub getNamesFromPWDFile {
    my $infile        = $_[ 0 ];
    my $return_line   = "";
    my $i             = 0;   # number of names read so far
    my $saw_dist_line = 0;

    &testForTextFilePresence( $infile );

    open( GN_IN, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );

    while ( $return_line = <GN_IN> ) {
        if ( $saw_dist_line == 1 && $return_line =~ /^\s*(\d+)\s*$/ ) {
            if ( $1 != $i ) {
                &dieWithUnexpectedError( "Failed sanity check" );
            }
            last;
        }
        elsif ( $return_line =~ /^\s*(\S+)\s+\S+/ ) {
            $names_in_pwd_file{ $1 } = 0;
            $i++;
            $saw_dist_line = 1;
        }
    }
    close( GN_IN );
    return;
} ## getNamesFromPWDFile




# Moves sequences which start with query name (argument 1)
# to the last positions in pfam alignment specified by argument 2.
# Removes seqs present in argument 4, unless for query name.
# Comment lines are dropped, except RF lines when $mode is 1.
# Four arguments:
# 1. Query name
# 2. Infile (alignment)
# 3. Outfile (=infile with query seq moved to the bottom)
# 4. Array of seq names to remove, unless for query name
# Last modified: 06/25/01
sub moveToLast {
    my $query       = $_[ 0 ];
    my $infile      = $_[ 1 ];
    my $outfile     = $_[ 2 ];
    my @to_remove   = @{ $_[ 3 ] }; # @{} tells Perl that this is a list.
    my $return_line = "";
    my $query_line  = "";           # held back until the end of its block
    my $n           = "";

    &testForTextFilePresence( $infile );

    open( MTL_IN, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
    open( MTL_OUT, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );

    W: while ( $return_line = <MTL_IN> ) {
        if ( &isPfamCommentLine( $return_line )
        && ( !isRFline( $return_line ) || $mode != 1 ) ) {
            next W;
        }
        # NOTE(review): a list with exactly one name is not applied ("> 1");
        # kept as in the original -- confirm whether "> 0" was intended.
        if ( @to_remove > 1 ) {
            foreach $n ( @to_remove ) {
                # \Q...\E: names are matched literally, not as patterns.
                if ( $n ne $query && $return_line =~ /^\s*\Q$n\E\s+/ ) {
                    next W;
                }
            }
        }
        if ( $return_line =~ /^\s*\Q$query\E\s+/ ) {
            $query_line = $return_line;
        }
        elsif ( $query_line ne ""
        && ( $return_line !~ /\S+/ || isRFline( $return_line ) ) ) {
            # End of the current block (blank line or RF line):
            # emit the held-back query line first.
            print MTL_OUT $query_line;
            print MTL_OUT $return_line;
            $query_line = "";
        }
        else {
            print MTL_OUT $return_line;
        }
    }
    if ( $query_line ne "" ) {
        print MTL_OUT $query_line;
    }

    close( MTL_IN );
    close( MTL_OUT );

    return;

} ## moveToLast










# -----------------------------------------------------------
# Others
# -----------------------------------------------------------




# This gets the complete name of a TrEMBL sequence from a Pfam alignment.
# I.e. it adds the species between "_" and "/XXX-XXX".
# 2 arguments:
# 1. Infile (alignment)
# 2. Name of query
# Returns the complete name found.
# Last modified: 04/25/01
#
# Scans the alignment line by line and returns the first leading token
# that starts with the part of the query name before its last "/" and
# ends with "/" plus the part after it.
# Exits through userError() if the query name contains no "/" or if no
# matching name is found.
sub getCompleteNameForTrEMBLquerySeq {

    my $infile        = $_[ 0 ];
    my $query_name    = $_[ 1 ];
    my $return_line   = "";
    my $complete_name = "";
    my $before_slash  = "";
    my $after_slash   = "";

    &testForTextFilePresence( $infile );

    # Split at the last "/". Without this check a name lacking "/" would
    # silently reuse whatever an earlier, unrelated match left in $1.
    unless ( $query_name =~ /(.+)\/(.+)/ ) {
        &userError( "Query name \"$query_name\" does not contain \"/\"." );
    }
    $before_slash = $1;
    $after_slash  = $2;

    open( INGCN, $infile ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
    while ( $return_line = <INGCN> ) {
        # \Q..\E: the name parts are literal text, not regex fragments.
        if ( $return_line =~ /^\s*(\Q$before_slash\E.+\/\Q$after_slash\E)/ ) {
            $complete_name = $1;
            last;
        }
    }
    close( INGCN );
    if ( $complete_name eq "" ) {
        # NOTE(review): reports the global $alignment, not $infile -- presumably
        # the user-supplied alignment name; confirm against the caller.
        &userError( "Could not find \"$query_name\" in \"$alignment\"." );
    }
    return $complete_name;
} ## getCompleteNameForTrEMBLquerySeq




# One argument:
# Pfam align name.
# Returns the text following the first "#=DE" tag in the file,
# or "" if the file contains no such line.
# Last modified: 02/26/01
sub getDescriptionFromPfam {

    my $infile      = $_[ 0 ];
    my $return_line = "";
    my $result      = "";

    &testForTextFilePresence( $infile );

    open( INGDPF, $infile ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
    while ( $return_line = <INGDPF> ) {
        if ( $return_line =~ /^\s*#=DE\s+(.+)/ ) {
            $result = $1;
            last;    # Only the first description line is used.
        }
    }
    close( INGDPF );
    return $result;

} ## getDescriptionFromPfam



# Reads in (SWISS-PROT) species names from a file.
# Names must be separated by newlines.
# Lines beginning with "#" are ignored.
# A possible "=" and everything after is ignored.
# One argument: species-names-file name
# Stores every name found as a key (with value "") in the global
# %Species_names_hash.
# Last modified: 04/24/01
sub readSpeciesNamesFile {
    my $infile      = $_[ 0 ];
    my $return_line = "";
    my $species     = "";

    &testForTextFilePresence( $infile );

    open( IN_RSNF, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
    while ( $return_line = <IN_RSNF> ) {
        if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) {
            $species = $1;
            # ".*" (not ".+") so that a bare trailing "=" is removed as well.
            $species =~ s/=.*//;
            $Species_names_hash{ $species } = "";
        }
    }
    close( IN_RSNF );

    return;
} ## readSpeciesNamesFile




# This reads a raw sequence file or a FASTA sequence file
# and saves it as a "cleaned up" FASTA sequence file.
# If no > line is in the file, it creates one with new sequence name.
# If a > line is in the file, it modifies it:
# white space -> _, ";" ":" "," or "|" -> "~", deletes everything after ( or [;
# the line (including the leading ">") is cut to $LENGTH_OF_NAME - 1 characters.
# Lines without any word character and lines starting with "#" are skipped.
# Error if $new_seq_name is "" and no > line in the file.
# Two/three arguments:
# 1. infile name
# 2. outfile name
# 3. new sequence name for > line (will be ignored if "" or omitted)
# If new sequence name is "":
# returns the contents of the ">" line after modification.
# If new sequence name is specified:
# return new sequence name.
# Last modified: 03/04/01
sub seqFile2CleanedUpFastaFile {
    my $infile        = $_[ 0 ];
    my $outfile       = $_[ 1 ];
    # The third argument is optional; treat "not given" like "".
    my $new_seq_name  = defined( $_[ 2 ] ) ? $_[ 2 ] : "";
    my $return_line   = "";
    my $mod_desc      = "";
    my $saw_desc_line = 0;

    &testForTextFilePresence( $infile );

    open( IN_CUFF, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );
    open( OUT_CUFF, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );

    while ( $return_line = <IN_CUFF> ) {
        if ( $return_line =~ /\w/ && $return_line !~ /^\s*#/ ) {
            if ( $return_line =~ /^\s*>/ ) {
                if ( $new_seq_name eq "" && $return_line !~ /_/ ) {
                    &userError( "Description line of query file appears not to\n contain any species information. Use \"N=\" option." );
                }
                elsif ( $new_seq_name eq "" ) {
                    $return_line =~ s/^\s*>\s*(.*?)\s*/>$1/;     # Removes spaces before and after >.
                    $return_line = substr( $return_line, 0, $LENGTH_OF_NAME - 1 );
                    $return_line =~ s/[\(\[].*//;                # Removes "(" or "[" and everything after.
                    $return_line =~ s/\s+$//;                    # Removes spaces at end.
                    $return_line =~ s/\s+/_/g;                   # Replaces all white spaces with "_".
                    $return_line =~ s/[;:,\|]/~/g;               # Replaces all ";", ":", ",", or "|" with "~".
                    # Do not read $1 unless this match succeeded.
                    $mod_desc = ( $return_line =~ />\s*(\S+)/ ) ? $1 : "";
                    $return_line .= "\n";
                }
                else {
                    $return_line = ">".$new_seq_name."\n";
                    $mod_desc = $new_seq_name;
                }
                $saw_desc_line = 1;
            }
            else {
                if ( $saw_desc_line != 1 ) {
                    # Sequence data before any ">" line: synthesize the description line.
                    if ( $new_seq_name ne "" ) {
                        print OUT_CUFF ( ">".$new_seq_name."\n" );
                        $mod_desc = $new_seq_name;
                    }
                    else {
                        &userError( "Query file is not a FASTA file\n and option \"N=\" has not been used." );
                    }
                    $saw_desc_line = 1;
                }
                $return_line =~ s/[^a-zA-Z\r\n\f]//g;            # Removes non-letters from sequence.
            }

            if ( $return_line =~ /\w/ ) {
                print OUT_CUFF $return_line;
            }
        }
    }
    close( IN_CUFF );
    close( OUT_CUFF );

    return $mod_desc;
} ## seqFile2CleanedUpFastaFile




# Purpose. Gets description for TrEMBL seqs,
# from a file which contains the AC, DE, and OS
# and which has to be generated from a TrEMBL flat file db
# using "extractTrembl.pl".
# The same file is used in method "addSpeciesToNonSPseqs".
# Two arguments:
# 1. "ACDEOS" file (AC, DE, OS from TrEMBL db)
# 2. AC ("_species/..." is removed)
# Format: AC;DE;OS\n
# Returns the DE, or "-" if the AC is unknown or its DE is empty.
# The file is read only on the first call; the result is kept in the
# global %AC_DE.
# Last modified: 02/14/02
sub getDescriptionFromTrEMBL_ACDEOSfile {
    my $ACDEOS = $_[ 0 ];
    my $AC     = $_[ 1 ];
    my $DE     = "";

    # Fill up (huge) hash, if not already done.
    unless ( %AC_DE ) {
        my $return_line = "";
        &testForTextFilePresence( $ACDEOS );
        open( ACDEOS, "$ACDEOS" ) || &dieWithUnexpectedError( "Cannot open file \"$ACDEOS\"" );
        while ( $return_line = <ACDEOS> ) {
            if ( $return_line =~ /(\S+);([^;]+);/ ) {
                $AC_DE{ $1 } = $2;
            }
        }
        close( ACDEOS );
    }

    $AC =~ s/_.+//;

    unless( exists( $AC_DE{ $AC } ) ) {
        #AC not present in "ACDEOS" file.
        return "-";
    }

    $DE = $AC_DE{ $AC };

    if ( !$DE || $DE eq "" ) {
        $DE = "-";
    }

    return $DE;

} ## getDescriptionFromTrEMBL_ACDEOSfile



# Purpose. Gets description for SP seqs,
# from a file which contains the AC, DE, and OS
# and which has to be generated from a sprot.dat flat file db
# using "extractSWISS-PROT.pl".
# Two arguments:
# 1. "ACDEOS" file (AC, DE, OS from SWISS-PROT db)
# 2. SWISS-PROT AC (XXXX_XXXX); a "/..." suffix is removed
# Format: AC;DE;OS\n
# Returns the DE, or "-" if the AC is unknown or its DE is empty.
# The file is read only on the first call; the result is kept in the
# global %SP_AC_DE.
# Last modified: 02/12/02
sub getDescriptionFromSWISSPROT_ACDEOSfile {
    my $SPACDEOS = $_[ 0 ];
    my $AC       = $_[ 1 ];
    my $DE       = "";

    # Fill up (huge) hash, if not already done.
    unless ( %SP_AC_DE ) {
        my $return_line = "";
        &testForTextFilePresence( $SPACDEOS );
        open( ACDEOS, "$SPACDEOS" ) || &dieWithUnexpectedError( "Cannot open file \"$SPACDEOS\"" );
        while ( $return_line = <ACDEOS> ) {
            if ( $return_line =~ /(\S+);([^;]+);/ ) {
                $SP_AC_DE{ $1 } = $2;
            }
        }
        close( ACDEOS );
    }

    $AC =~ s/\/.+//;

    unless( exists( $SP_AC_DE{ $AC } ) ) {
        #AC not present in "ACDEOS" file.
        return "-";
    }

    $DE = $SP_AC_DE{ $AC };

    if ( !$DE || $DE eq "" ) {
        $DE = "-";
    }

    return $DE;

} ## getDescriptionFromSWISSPROT_ACDEOSfile









# -----------------------------------------------------------
#                                                     Helpers
# -----------------------------------------------------------



# One argument:
# Numeric value to be rounded to int.
# The placeholder "-" is returned unchanged.
# Halves are rounded away from zero (2.5 -> 3, -2.5 -> -3).
# Last modified: 10/17/01
sub roundToInt {
    my $x = $_[ 0 ];
    unless ( $x eq "-" ) {
        # int() truncates towards zero, so negative values need the
        # mirrored form; "int( $x + 0.5 )" would turn -1.6 into -1.
        $x = ( $x >= 0 ) ? int( $x + 0.5 ) : -int( 0.5 - $x );
    }
    return $x;
} ## roundToInt



# Removes files.
# Last modified: 03/10/01
#
# Deletes the fixed set of intermediate files from $temp_dir and then
# removes the directory itself. Does nothing while $temp_dir is unset.
sub cleanUpTempDir {

    # userError() calls this even before a temp dir has been chosen (e.g. for
    # a bad file given on the command line). With an empty $temp_dir every
    # path below would resolve to "/ALIGN1", "/HMMFILE", ... in the root dir.
    if ( !defined( $temp_dir ) || $temp_dir eq "" ) {
        return;
    }

    my @temp_files = ( "MAKETREEOUT".$TREE_FILE_SUFFIX,
                       "MAKETREEOUT".$LOG_FILE_SUFFIX,
                       "MAKETREEOUT".$ALIGN_FILE_SUFFIX,
                       "MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX,
                       "MAKETREEOUT".$SUFFIX_PWD_NOT_BOOTS,
                       $DO_RIO_TEMP_OUTFILE,
                       "ALIGN1",
                       "ALIGN2",
                       "QUERY_SEQ",
                       "NBD_NJ_TREE",
                       "ALIGN2_BOOTSTRAPPED",
                       "ALIGN2_PROCESSED",
                       "DIST_TO_QUERY",
                       "DISTs_TO_QUERY",
                       "HMMALIGNOUT",
                       "NBD_INC_QUERY",
                       "PWD_INC_QUERY",
                       "HMMFILE",
                       "MOVETOLASTOUT" );

    unlink( map { $temp_dir."/".$_ } @temp_files );
    rmdir( $temp_dir );
} ## cleanUpTempDir












# -----------------------------------------------------------
#                          Command line and arguments, Errors
# -----------------------------------------------------------



# One argument:
# the command line.
# The first element is the mode (1-4); every further element is either a
# one-letter flag or "<letter>=<value>". The results are stored in the
# script's global option variables. Any problem ends the program through
# errorInCommandLine() or userError().
# Last modified: 03/08/01
sub analyzeCommandLine {

    my $args = "";
    my $arg  = "";
    my $char = "";

    $mode = shift( @_ );

    if ( $mode != 1 && $mode != 2 && $mode != 3 && $mode != 4 ) {
        &errorInCommandLine( "Mode can only be: 1, 2, 3, or 4." );
    }

    foreach $args ( @_ ) {

        $args =~ s/\s//g;

        $char = substr( $args, 0, 1 );

        # Value is everything after "<letter>=". It is reset for every option
        # so that a bare letter never inherits the previous option's value.
        $arg = ( length( $args ) > 1 ) ? substr( $args, 2 ) : "";

        if ( $char eq "A" ) {
            if ( $alignment ne "" ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            if ( $mode == 3 || $mode == 4 ) {
                &userErrorCheckForTextFileExistence( $arg );
            }
            $alignment = $arg;
        }
        elsif ( $char eq "B" ) {
            if ( $t_sn != $THRESHOLD_SN_DEFAULT ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $t_sn = $arg;
        }
        elsif ( $char eq "C" ) {
            if ( $description == 1 || $complete_description == 1 ) {
                &errorInCommandLine( "Entered same argument twice or conflicting arguments: \"D\" and \"C\"." );
            }
            $complete_description = 1;
        }
        elsif ( $char eq "D" ) {
            if ( $description == 1 || $complete_description == 1 ) {
                &errorInCommandLine( "Entered same argument twice or conflicting arguments: \"D\" and \"C\"." );
            }
            $description = 1;
        }
        elsif ( $char eq "E" ) {
            if ( $long_output != 0 ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $long_output = 1;
        }
        elsif ( $char eq "F" ) {
            if ( $hmm_file ne "" || $hmm_name ne "" || $seed_aln_for_hmmbuild ne "" ) {
                &errorInCommandLine( "Entered same argument twice or conflicting arguments: \"F=\", \"H=\" and \"b=\"." );
            }
            if ( $mode == 1 || $mode == 2 ) {
                &errorInCommandLine( "Can not use \"F=\" in modes 1 or 2." );
            }
            &userErrorCheckForTextFileExistence( $arg );
            $hmm_file = $arg;
        }
        elsif ( $char eq "G" ) {
            # NOTE(review): compared against a single blank, not "" -- presumably
            # the variable's initial value (CheckArguments tests it with /\S/); confirm.
            if ( $species_names_file ne " " ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            &userErrorCheckForTextFileExistence( $arg );
            $species_names_file = $arg;
        }
        elsif ( $char eq "H" ) {
            if ( $hmm_name ne "" || $hmm_file ne "" || $seed_aln_for_hmmbuild ne "" ) {
                &errorInCommandLine( "Entered same argument twice or conflicting arguments: \"F=\", \"H=\" and \"b=\"." );
            }
            if ( $mode == 1 || $mode == 2 ) {
                &errorInCommandLine( "Can not use \"H=\" in modes 1 or 2." );
            }
            $hmm_name = $arg;
        }
        elsif ( $char eq "I" ) {
            if ( $safe_nhx != 0 ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $safe_nhx = 1;
        }
        elsif ( $char eq "K" ) {
            if ( $keep != 0 ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $keep = 1;
        }
        elsif ( $char eq "L" ) {
            if ( $t_orthologs != $THRESHOLD_ORTHOLOGS_DEFAULT ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $t_orthologs = $arg;
        }
        elsif ( $char eq "N" ) {
            if ( $query_name ne "" ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $query_name = $arg;
        }
        elsif ( $char eq "O" ) {
            if ( $outfile ne "" ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $outfile = $arg;
        }
        elsif ( $char eq "P" ) {
            if ( $sort != $SORT_DEFAULT ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $sort = $arg;
        }
        elsif ( $char eq "Q" ) {
            if ( $seqX_file ne "" ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            &userErrorCheckForTextFileExistence( $arg );
            $seqX_file = $arg;
        }
        elsif ( $char eq "S" ) {
            if ( $species_tree_file ne "" ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            &userErrorCheckForTextFileExistence( $arg );
            $species_tree_file = $arg;
        }
        elsif ( $char eq "T" ) {
            if ( $mode == 1 || $mode == 2 ) {
                &errorInCommandLine( "Matrix cannot be changed in modes 1 and 2 (is dictated by \"\$MATRIX_FOR_PWD\" for mode 1)." );
            }
            if ( $arg eq "J" ) {
                $matrix_n = 0;
            }
            elsif ( $arg eq "P" ) {
                $matrix_n = 1;
            }
            elsif ( $arg eq "B" ) {
                $matrix_n = 2;
            }
            elsif ( $arg eq "M" ) {
                $matrix_n = 3;
            }
            elsif ( $arg eq "V" ) {
                $matrix_n = 5;
            }
            elsif ( $arg eq "W" ) {
                $matrix_n = 6;
            }
            else {
                &errorInCommandLine( "Use T=J for JTT, P for PAM, B for BLOSUM62, M for mtREV24, V for VT, W for WAG." );
            }
        }
        elsif ( $char eq "U" ) {
            if ( $t_orthologs_dc != $THRESHOLD_ORTHOLOGS_DEFAULT_DC ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $t_orthologs_dc = $arg;
        }
        elsif ( $char eq "X" ) {
            if ( $warn_more_than_one_ortho
                 != $WARN_MORE_THAN_ONE_ORTHO_DEFAULT ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $warn_more_than_one_ortho = $arg;
        }
        elsif ( $char eq "Y" ) {
            if ( $warn_no_orthos != $WARN_NO_ORTHOS_DEFAULT ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $warn_no_orthos = $arg;
        }
        elsif ( $char eq "Z" ) {
            if ( $warn_one_ortho != $WARN_ONE_ORTHO_DEFAULT ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $warn_one_ortho = $arg;
        }
        elsif ( $char eq "a" ) {
            if ( $boostraps_for_makeTree != $BOOSTRAPS_FOR_MAKETREE_DEFAULT ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            if ( $mode == 1 || $mode == 2 ) {
                &errorInCommandLine( "Modes 1 and 2: Cannot change bootstrap value. Do not use \"a=\"." );
            }
            $boostraps_for_makeTree = $arg;
            if ( $boostraps_for_makeTree < 10 ) {
                &errorInCommandLine( "Bootsraps cannot be smaller than 10." );
            }
        }
        elsif ( $char eq "b" ) {
            if ( $hmm_name ne "" || $hmm_file ne "" || $seed_aln_for_hmmbuild ne "" ) {
                &errorInCommandLine( "Entered same argument twice or conflicting arguments: \"F=\", \"H=\" and \"b=\"." );
            }
            if ( $mode == 1 || $mode == 2 ) {
                &errorInCommandLine( "Can not use \"b=\" in modes 1 or 2." );
            }
            &userErrorCheckForTextFileExistence( $arg );
            $seed_aln_for_hmmbuild = $arg;
        }
        elsif ( $char eq "f" ) {
            if ( $no_frags ne 0 ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $no_frags = 1;
        }
        elsif ( $char eq "j" ) {
            if ( $temp_dir ne "" ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $temp_dir = $arg;
        }
        elsif ( $char eq "p" ) {
            if ( $output_ultraparalogs != 0 ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $output_ultraparalogs = 1;
        }
        elsif ( $char eq "s" ) {
            if ( $non_sp != 1 ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $non_sp = 0;
        }
        elsif ( $char eq "v" ) {
            $t_ultra_paralogs = $arg;
        }
        elsif ( $char eq "x" ) {
            if ( $output_HTML == 1 ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $output_HTML = 1;
        }
        elsif ( $char eq "y" ) {
            if ( $seed_for_makeTree != $SEED_FOR_MAKETREE_DEFAULT ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $seed_for_makeTree = $arg;
        }
        elsif ( $char eq "+" ) {
            if ( $parallel != 0 ) {
                &errorInCommandLine( "Entered same argument twice." );
            }
            $parallel = 1;
        }
        else {
            &errorInCommandLine( "Unknown option: \"$args\"." );
        }
    }
} ## analyzeCommandLine




# Cross-checks the global option variables set by analyzeCommandLine
# (mandatory options, mode-specific restrictions, presence of required
# data files). Ends the program with a message on the first violation.
# Last modified: 03/08/01
sub CheckArguments {

    if ( $outfile eq "" ) {
        &errorInCommandLine( "Outfile not specified. Use \"O=\"." );
    }
    if ( $alignment eq "" ) {
        &errorInCommandLine( "Need to specify a Pfam alignment file. Use \"A=\"." );
    }
    if ( -e $outfile ) {
        &userError( "\"$outfile\" already exists." );
    }

    if ( $sort < 0 || $sort > 17 ) {
        # Upper bound in the message now agrees with the check above.
        &errorInCommandLine( "Sort priority (\"P=\") must be between 0 and 17." );
    }

    if ( $parallel == 1 && $mode != 1 ) {
        &errorInCommandLine( "Parallelization only implemented for mode 1." );
    }

    if ( $mode == 1 || $mode == 2 ) {

        if ( $species_names_file =~ /\S/ ) {
            &errorInCommandLine( "Modes 1 and 2: Cannot use species names file. Do not use \"G=\"." );
        }
        if ( $non_sp == 0 ) {
            &errorInCommandLine( "Can not use \"s\" in modes 1 or 2." );
        }
        if ( $no_frags == 1 ) {
            &errorInCommandLine( "Can not use \"f\" in modes 1 or 2." );
        }
    }

    if ( $mode == 1 || $mode == 3 ) {
        if ( $seqX_file eq "" ) {
            &errorInCommandLine( "Modes 1 and 3: Need to specify a query file. Use \"Q=\"." );
        }
    }

    if ( $mode == 3 ) {
        if ( $hmm_name eq "" && $hmm_file eq "" && $seed_aln_for_hmmbuild eq "" ) {
            &errorInCommandLine( "Mode 3: Need to specify either a HMM name (\"H=\"), a HMM file (\"F=\") or build a HMM (\"b=\")." );
        }
    }

    if ( $mode == 1 ) {
        if ( $hmm_name ne "" || $hmm_file ne "" || $seed_aln_for_hmmbuild ne "" ) {
            &errorInCommandLine( "Mode 1: Must not specify a HMM name (\"H=\"), a HMM file (\"F=\") or build a HMM (\"b=\")." );
        }
    }

    if ( $mode == 2 || $mode == 4 ) {
        if ( $seqX_file ne "" ) {
            &errorInCommandLine( "Modes 2 and 4: Must not specify a query file. Do not use \"Q=\".\n" );
        }
        if ( $query_name eq "" ) {
            &errorInCommandLine( "Modes 2 and 4: Must specify a query name. Use \"N=\"." );
        }
        if ( $hmm_name ne "" || $hmm_file ne "" || $seed_aln_for_hmmbuild ne "" ) {
            &errorInCommandLine( "Modes 2 and 4: Cannot specify a HMM name (\"H=\"), a HMM file (\"F=\") or build a HMM (\"b=\")." );
        }

    }

    if ( $non_sp != 1 && $no_frags == 1 ) {
        &errorInCommandLine( "\"Fragments\" are assumed to be only found in non SWISS-PROT seqs.\n Do not use \"f\" together with \"s\"." );
    }

    if ( $output_HTML == 1 ) {
        if ( $mode != 1 ) {
            &errorInCommandLine( "Output in HTML (for web server) only for mode 1." );
        }
    }

    if ( $output_ultraparalogs == 0 && $t_ultra_paralogs != $T_ULTRA_PARALOGS_DEFAULT ) {
        &errorInCommandLine( "Use \"p\" to output ultra paralogs (cannot use \"v=\" without \"p\")." );
    }

    if ( $non_sp == 1 && ( $mode == 3 || $mode == 4 ) ) {
        unless ( ( -s $TREMBL_ACDEOS_FILE ) && ( -f $TREMBL_ACDEOS_FILE ) && ( -T $TREMBL_ACDEOS_FILE ) ) {
            my $message = "AC, DE, and OS-file not found.\n";
            $message .= " If non SWISS-PROT sequences are not to be removed from the\n";
            $message .= " Pfam alignment (\"s\" option), variable \"\$TREMBL_ACDEOS_FILE\" needs\n";
            $message .= " to point to a file containing AC, DE, and OS from TrEMBL. Such a\n";
            $message .= " file can be generated with \"extractTrembl.pl\".\n";
            $message .= " Currently, \"TREMBL_ACDEOS_FILE\" points to:\n";
            $message .= " $TREMBL_ACDEOS_FILE";
            &userError( $message );
        }
    }

    unless ( ( -s $species_tree_file ) && ( -f $species_tree_file ) && ( -T $species_tree_file ) ) {
        my $message = "Species tree file not found.\n";
        $message .= " A valid species tree must be specified.\n";
        $message .= " Either, use \"S=\" option, or set variable\n";
        $message .= " \"\$SPECIES_TREE_FILE_DEFAULT\".\n";
        $message .= " Currently, this program looks for a species tree at:\n";
        $message .= " $species_tree_file";
        &userError( $message );
    }

    if ( $hmm_name ne "" ) {
        unless ( ( -s $PFAM_HMM_DB ) && ( -f $PFAM_HMM_DB ) ) {
            my $message = "HMMER model db file not found.\n";
            $message .= " If \"H=\" option is used, a valid HMMER model db needs\n";
            $message .= " to be specified with variable \"\$PFAM_HMM_DB\".\n";
            $message .= " Currently, \"\$PFAM_HMM_DB\" points to:\n";
            $message .= " $PFAM_HMM_DB";
            &userError( $message );
        }
    }
} ## CheckArguments



# One argument: file name.
# Ends the program through userError() unless the file is a non-empty,
# plain text file.
# Last modified: 06/25/01
sub userErrorCheckForTextFileExistence {
    my $file = $_[ 0 ];
    unless ( ( -s $file ) && ( -f $file ) && ( -T $file ) ) {
        &userError( "\"$file\" does not exist or is not a plain text file." );
    }
} ## userErrorCheckForTextFileExistence



# One argument: the error message.
# Prints the message in a small banner and exits with status -1.
# Unlike userError(), this does not clean up the temporary directory.
# Last modified: 04/26/01
sub errorInCommandLine {

    my $error = $_[ 0 ];

    print " \n";
    print " rio.pl version: $VERSION\n";
    print " ------\n";
    print " \n";
    print " Error in command line:\n";
    if ( $error ne "" ) {
        print " $error";
    }
    print " \n\n";
    print " Type \"rio.pl\" (no arguments) for more information.\n";
    print " \n";
    exit( -1 );
} ## errorInCommandLine




# One argument: the error message.
+# Last modified: 04/26/01 +sub userError { + + my $error = $_[ 0 ]; + + print " \n"; + print " rio.pl version: $VERSION\n"; + print " ------\n"; + print " \n"; + print " Error:\n"; + if ( $error ne "" ) { + print " $error"; + } + print " \n\n"; + print " Type \"rio.pl\" (no arguments) for more information.\n"; + print " \n"; + &cleanUpTempDir(); + exit( -1 ); +} ## UserError + + + + + + +# Last modified: 04/26/01 +sub printHelp { + + print " \n"; + print " rio.pl version: $VERSION\n"; + print " ------\n\n"; + + print < + ----- + + + Examples: + --------- + + % RIO1.1/perl/rio.pl 1 A=aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out1 p I C E + + % RIO1.1/perl/rio.pl 2 A=aconitase N=LEU2_LACLA/5-449 O=out2 p I C E + + % RIO1.1/perl/rio.pl 3 A=/path/to/my/pfam/Full/aconitase H=aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out3 p I C E + + % RIO1.1/perl/rio.pl 4 A=/path/to/my/pfam/Full/aconitase N=LEU2_LACLA/5-449 O=out4 p I C E + + % RIO1.1/perl/rio.pl 3 A=/path/to/my/pfam/Full/aconitase b=/path/to/my/pfam/Seed/aconitase Q=RIO1.1/LEU2_HAEIN N=QUERY_HAEIN O=out5 p I C E + + + + Modes: + ------ + + 1: RIO analysis based on precalculated pairwise distances + alignment does not contain query sequence + + 2: RIO analysis based on precalculated pairwise distances + alignment does contain query sequence + + 3: RIO analysis based on Pfam alignments, + alignment does not contain query sequence + + 4: RIO analysis based on Pfam alignments, + alignment does contain query sequence + + + + Tagged arguments: + ----------------- + + No "G=", "H=", "F=", "T=", "a=", "b=", "s", "f" in modes 1 and 2. + + + A= Pfam alignment name (mandatory). This specifies the alignment + against which the RIO analysis is to be performed. + In modes 1 and 2: Pfam model (alignment) name + (e.g. "A=aconitase"). + In modes 3 and 4: Pfam alignment path/name + (e.g. "A=/path/to/your/pfam/Full/aconitase"). 
+ + Q= Path/name of file containing the query sequence + (in FASTA format or raw sequence) (mandatory in modes 1 and 3). + + N= Query name (mandatory). This must include the SWISS-PROT code + for the species of the query after a "_" (e.g. "N=QUERY_HAEIN"). + If the query sequence is already in the alignment (modes 2 and 4) + the complete name needs to be specified -- including "/xxx-xxx". + + O= Output file path/name (mandatory). + + T= Model for pairwaise distance calculation: + J=JTT, B=BLOSUM 62, M=mtREV24, V=VT, W=WAG, P=PAM. + BLOSUM 62 is default. + (Not in modes 1 and 2; these modes use \$MATRIX_FOR_PWD instead.) + + In modes 1 and 3, a HMM is needed to align the query sequence to + the alignment and either one of the following options must be + employed: + H= HMM name: This uses hmmfetch to retrieve a HMM from + \$PFAM_HMM_DB. + F= HMM file: This directly reads the HMM from a file. + + S= Species tree file path/name (in NHX format) (optional). + If not specified, \$SPECIES_TREE_FILE_DEFAULT is used. + + G= Species names file (optional). Only sequences associated with + species found in this file are used. + In the species names file, individual species names must be + separated by newlines and lines starting with "#" are ignored. + While only sequences associated with species found in the species + tree ("S=") are used for the actual RIO analysis, this allows to + remove sequences prior to tree calculation (which is the most + time consuming step). 
+ + P= Sort priority (default is 12): + 0 : Ortholog + 1 : Ortholog, Super ortholog + 2 : Super ortholog, Ortholog + 3 : Ortholog, Distance + 4 : Distance, Ortholog + 5 : Ortholog, Super ortholog, Distance + 6 : Ortholog, Distance, Super ortholog + 7 : Super ortholog, Ortholog, Distance + 8 : Super ortholog, Distance, Ortholog + 9 : Distance, Ortholog, Super ortholog + 10 : Distance, Super ortholog, Ortholog + 11 : Ortholog, Subtree neighbor, Distance + 12 : Ortholog, Subtree neighbor, Super ortholog, Distance (default) + 13 : Ortholog, Super ortholog, Subtree neighbor, Distance + 14 : Subtree neighbor, Ortholog, Super ortholog, Distance + 15 : Subtree neighbor, Distance, Ortholog, Super ortholog + 16 : Ortholog, Distance, Subtree neighbor, Super ortholog + 17 : Ortholog, Subtree neighbor, Distance, Super ortholog + + a= Bootstraps for tree construction (not in modes 1 and 2). + Default is 100. + + L= Threshold for orthologies for output. Default is 0. + v= Threshold for ultra-paralogies for output. Default is 50. + + U= Threshold for orthologies for distance calculation. Default is 60. + + X= In case of more than one putative orthologs: + number of sd the distance query - LCA has to differ + from the mean to generate a warning. Default is 2. + + Y= In case of no putative orthologs: + number of sd the distance query - root has to differ + from mean to generate a warning. Default is 2. + + Z= In case of one putative ortholog: + threshold for factor between the two distances to their + LCA (larger/smaller) to generate a warning. Default is 2. + + B= Threshold for subtree-neighborings. Default is 0. + + b= Build HMM from seed alignment with "hmmbuild -s" (optional). + This is to prevent from finding multiple domains per sequence + (i.e. prevents "cutting" the query sequence). Give path/name to + Seed with this. + + j= Name for temporary directory (optional). + + y= Seed for random number generator. Default is 41. 
+ + I Create and save a rooted, with duplication vs speciation, + and orthology information annotated gene tree. + If precalculated distances are used (modes 1 and 2): this gene + tree is a NJ tree calculated based on the non-bootstrap resampled + (original) pairwise distances. + If precalculated distances are not used (modes 3 and 4): this gene + is a consenus tree with ML branch length values and is also + annotated with bootstrap values for each node. + + Options for output: + p Output ultra-paralogs. + D Description from SWISS-PROT and TrEMBL. + C Complete description from SWISS-PROT and TrEMBL. + E 118 character output instead of 78 character output. + + K Keep intermediate files (they will go into the same directory + as the output file, their names are the same as of the output + file, with various suffixes added). + + s Ignore non SWISS-PROT sequences (i.e. sequences from TrEMBL) + in the Pfam alignment. + + f Try to ignore TrEMBL "fragments" (sequences with "fragment" in + their description). + + + Parallel, use machines listed in file \$NODE_LIST. + + x RIO used as web server -- HTML output. + + +END + exit( 0 ); + +} ## printHelp + diff --git a/forester/archive/perl/rio_module.pm b/forester/archive/perl/rio_module.pm new file mode 100755 index 0000000..59b15e2 --- /dev/null +++ b/forester/archive/perl/rio_module.pm @@ -0,0 +1,1108 @@ +# Copyright (C) 2002-2003 Washington University School of Medicine +# and Howard Hughes Medical Institute +# All rights reserved +# +# Author: Christian M. 
Zmasek +# zmasek@genetics.wustl.edu +# http://www.genetics.wustl.edu/eddy/people/zmasek/ +# +# Last modified 03/13/03 + + +package rio_module2; +use strict; +require Exporter; + +our $VERSION = 3.20; + +our @ISA = qw( Exporter ); + +our @EXPORT = qw( executeConsense + executeMakeTree + executePuzzleDQO + executePuzzleDQObootstrapped + pfam2phylipMatchOnly + startsWithSWISS_PROTname + isPfamSequenceLine + isPfamCommentLine + containsPfamNamedSequence + isRFline + executeNeighbor + executeProtpars + setModelForPuzzle + setRateHeterogeneityOptionForPuzzle + setParameterEstimatesOptionForPuzzle + executePuzzleBootstrapped + executePuzzle + executeHmmfetch + addDistsToQueryToPWDfile + testForTextFilePresence + exitWithWarning + dieWithUnexpectedError + addSlashAtEndIfNotPresent + $LENGTH_OF_NAME + $MIN_NUMBER_OF_AA + $TREMBL_ACDEOS_FILE + $SWISSPROT_ACDEOS_FILE + $SPECIES_NAMES_FILE + $SPECIES_TREE_FILE_DEFAULT + $MULTIPLE_TREES_FILE_SUFFIX + $LOG_FILE_SUFFIX + $ALIGN_FILE_SUFFIX + $TREE_FILE_SUFFIX + $ADDITION_FOR_RIO_ANNOT_TREE + $SUFFIX_PWD + $SUFFIX_BOOT_STRP_POS + $SUFFIX_PWD_NOT_BOOTS + $SUFFIX_HMM + $MATRIX_FOR_PWD + $RIO_PWD_DIRECTORY + $RIO_BSP_DIRECTORY + $RIO_NBD_DIRECTORY + $RIO_ALN_DIRECTORY + $RIO_HMM_DIRECTORY + $PFAM_FULL_DIRECTORY + $PFAM_SEED_DIRECTORY + $PRIOR_FILE_DIR + $PFAM_HMM_DB + $SEQBOOT + $NEIGHBOR + $PROTPARS + $CONSENSE + $PUZZLE + $HMMALIGN + $HMMSEARCH + $HMMBUILD + $HMMFETCH + $SFE + $HMMCALIBRATE + $P7EXTRACT + $MULTIFETCH + $BOOTSTRAP_CZ + $BOOTSTRAP_CZ_PL + $TRANSFERSBRANCHLENGHTS + $MAKETREE + $RIO_PL + $DORIO + $PUZZLE_DQO + $BOOTSTRAPS + $PATH_TO_FORESTER + $JAVA + $NODE_LIST + $RIO_SLAVE_DRIVER + $RIO_SLAVE + $TEMP_DIR_DEFAULT + $EXPASY_SPROT_SEARCH_DE + $EXPASY_SPROT_SEARCH_AC + ); + + + + +# ============================================================================= +# ============================================================================= +# +# THESE VARIABLES ARE ENVIRONMENT DEPENDENT, AND NEED TO BE SET ACCORDINGLY +# 
BY THE USER +# ------------------------------------------------------------------------- +# + + + +# RIO itself: +# ----------- +our $PATH_TO_FORESTER = "/nfs/dm3/homedir1/czmasek/RIO1.24/"; + + +# Java virtual machine: +# --------------------- +our $JAVA = "/usr/local/java/jdk/bin/java"; + + + +# Where all the temporary files can be created: +# --------------------------------------------- +our $TEMP_DIR_DEFAULT = "/tmp/"; + + + +# Pfam data: +# ---------- +our $PFAM_FULL_DIRECTORY = "/path/to/Pfam/Full/"; +our $PFAM_SEED_DIRECTORY = "/path/to/Pfam/Seed/"; +our $PFAM_HMM_DB = "/path/to/Pfam/Pfam_ls"; # Need to run "hmmindex" on this + # to produce .ssi file. + # Then, for example + # "setenv HMMERDB /home/rio/pfam-6.6/" + + +$PATH_TO_FORESTER = &addSlashAtEndIfNotPresent( $PATH_TO_FORESTER ); + + +# Description lines and species from SWISS-PROT and TrEMBL: +# --------------------------------------------------------- +our $TREMBL_ACDEOS_FILE = $PATH_TO_FORESTER."data/trembl22_ACDEOS_1-6"; + +our $SWISSPROT_ACDEOS_FILE = $PATH_TO_FORESTER."data/sp40_ACDEOS_1-6"; + + + +# Names of species which can be analyzed and analyzed +# against (must also be in tree $SPECIES_TREE_FILE_DEFAULT). +# By using a list with less species, RIO analyses become faster +# but lose phylogenetic resolution. +# For many purposes, list "tree_of_life_bin_1-6_species_list" +# in "data/species/" might be sufficient: +# -------------------------------------------------------------- +our $SPECIES_NAMES_FILE = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6_species_list"; + + + +# A default species tree in NHX format. 
+# For many purposes, tree "tree_of_life_bin_1-6.nhx" +# in "data/species/" might be fine: +# -------------------------------------------------- +our $SPECIES_TREE_FILE_DEFAULT = $PATH_TO_FORESTER."data/species/tree_of_life_bin_1-6.nhx"; + + + +# Data for using precalculated distances: +# --------------------------------------- +our $MATRIX_FOR_PWD = 2; # The matrix which has been used for the pwd in $RIO_PWD_DIRECTORY. + # 0=JTT, 1=PAM, 2=BLOSUM 62, 3=mtREV24, 5=VT, 6=WAG. + +our $RIO_PWD_DIRECTORY = $PATH_TO_FORESTER."example_data/"; # all must end with "/" +our $RIO_BSP_DIRECTORY = $PATH_TO_FORESTER."example_data/"; +our $RIO_NBD_DIRECTORY = $PATH_TO_FORESTER."example_data/"; +our $RIO_ALN_DIRECTORY = $PATH_TO_FORESTER."example_data/"; +our $RIO_HMM_DIRECTORY = $PATH_TO_FORESTER."example_data/"; + + + +# +# End of variables which need to be set by the user. +# +# ============================================================================= +# ============================================================================= + + + + + +$TEMP_DIR_DEFAULT = &addSlashAtEndIfNotPresent( $TEMP_DIR_DEFAULT ); +$PFAM_FULL_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_FULL_DIRECTORY ); +$PFAM_SEED_DIRECTORY = &addSlashAtEndIfNotPresent( $PFAM_SEED_DIRECTORY ); + + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# These variables should normally not be changed: +# + +our $PRIOR_FILE_DIR = $PATH_TO_FORESTER."data/priors_for_hmmbuild/"; + # Directory containing dirichlet prior + # files needed for certain aligments + # by hmmbuild (e.g. Collagen). 
+ + +# PHYLIP: +our $SEQBOOT = $PATH_TO_FORESTER."phylip_mod/exe/seqboot"; +our $NEIGHBOR = $PATH_TO_FORESTER."phylip_mod/exe/neighbor"; +our $PROTPARS = $PATH_TO_FORESTER."phylip_mod/exe/protpars"; +our $CONSENSE = $PATH_TO_FORESTER."phylip_mod/exe/consense"; + +# TREE-PUZZLE: +our $PUZZLE = $PATH_TO_FORESTER."puzzle_mod/src/puzzle"; +our $PUZZLE_DQO = $PATH_TO_FORESTER."puzzle_dqo/src/puzzle"; + +# HMMER: +our $HMMALIGN = $PATH_TO_FORESTER."hmmer/binaries/hmmalign"; +our $HMMSEARCH = $PATH_TO_FORESTER."hmmer/binaries/hmmsearch"; +our $HMMBUILD = $PATH_TO_FORESTER."hmmer/binaries/hmmbuild"; +our $HMMFETCH = $PATH_TO_FORESTER."hmmer/binaries/hmmfetch"; +our $SFE = $PATH_TO_FORESTER."hmmer/binaries/sfetch"; +our $HMMCALIBRATE = $PATH_TO_FORESTER."hmmer/binaries/hmmcalibrate"; + +our $P7EXTRACT = $PATH_TO_FORESTER."perl/p7extract.pl"; +our $MULTIFETCH = $PATH_TO_FORESTER."perl/multifetch.pl"; + + +# RIO/FORESTER: +our $BOOTSTRAP_CZ = $PATH_TO_FORESTER."C/bootstrap_cz"; +our $BOOTSTRAP_CZ_PL = $PATH_TO_FORESTER."perl/bootstrap_cz.pl"; +our $TRANSFERSBRANCHLENGHTS = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.transfersBranchLenghts"; +our $MAKETREE = $PATH_TO_FORESTER."perl/makeTree.pl"; +our $RIO_PL = $PATH_TO_FORESTER."perl/rio.pl"; +our $DORIO = $JAVA." -cp $PATH_TO_FORESTER"."java forester.tools.DoRIO"; +# parallel RIO: +our $RIO_SLAVE_DRIVER = $PATH_TO_FORESTER."perl/rio_slave_driver.pl"; +our $RIO_SLAVE = $PATH_TO_FORESTER."perl/rio_slave.pl"; +our $NODE_LIST = $PATH_TO_FORESTER."data/node_list.dat"; + +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +our $BOOTSTRAPS = 100; +our $MIN_NUMBER_OF_AA = 20; # After removal of gaps, if less, gaps are not removed. 
our $LENGTH_OF_NAME   = 26;  # Sequence names are cut/padded to this many characters.




# File name suffixes:
our $MULTIPLE_TREES_FILE_SUFFIX  = ".mlt";
our $LOG_FILE_SUFFIX             = ".log";
our $ALIGN_FILE_SUFFIX           = ".aln";
our $TREE_FILE_SUFFIX            = ".nhx";
our $ADDITION_FOR_RIO_ANNOT_TREE = ".rio";
our $SUFFIX_PWD                  = ".pwd";
our $SUFFIX_BOOT_STRP_POS        = ".bsp";
our $SUFFIX_PWD_NOT_BOOTS        = ".nbd";
our $SUFFIX_HMM                  = ".hmm";

our $EXPASY_SPROT_SEARCH_DE      = "http://www.expasy.org/cgi-bin/sprot-search-de?";
our $EXPASY_SPROT_SEARCH_AC      = "http://www.expasy.org/cgi-bin/sprot-search-ac?";



# Runs $CONSENSE on a multiple trees file, feeding the file name and "Y"
# to the program's interactive menu on stdin.
# One argument: input multiple trees file
# Dies (via dieWithUnexpectedError) if the input file is not a non-empty
# text file or if the program returns a non-zero status.
# Last modified: 07/05/01
sub executeConsense {
    my $in = $_[ 0 ];

    &testForTextFilePresence( "$in" );

    # The text between the two "!" is the shell here-document sent to consense.
    system( "$CONSENSE >/dev/null 2>&1 << !
$in
Y
!" )
    && &dieWithUnexpectedError( "Could not execute \"$CONSENSE \"" );

    return;
}



# Runs $MAKETREE as "$MAKETREE -<options> <infile> <outfile> <temp dir>".
# Four arguments:
# 1. options ("-" is not necessary)
# 2. alignment or pwd file
# 3. outfile
# 4. temp dir
# Last modified: 07/05/01
sub executeMakeTree {

    my $opts = $_[ 0 ];
    my $B    = $_[ 1 ];
    my $C    = $_[ 2 ];
    my $D    = $_[ 3 ];

    &testForTextFilePresence( $B );

    # A "-" is always prepended, so callers pass the bare option letters.
    $opts = "-".$opts;

    system( "$MAKETREE $opts $B $C $D" )
    && &dieWithUnexpectedError( "Could not execute \"$MAKETREE $opts $B $C $D\"" );

} ## executeMakeTree




# Runs $PUZZLE_DQO on one input file; the substitution model is selected by
# sending the key strokes produced by setModelForPuzzle, followed by "y".
# Two arguments:
# 1. Name of inputfile
# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
#    5 = VT; 6 = WAG; 7 = auto; PAM otherwise
sub executePuzzleDQO {
    my $in            = $_[ 0 ];
    my $matrix_option = $_[ 1 ];
    my $mat           = "";

    &testForTextFilePresence( $in );

    $mat = setModelForPuzzle( $matrix_option );

    system( "$PUZZLE_DQO $in >/dev/null 2>&1 << !$mat
y
!" )
    && &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO\"" );

    return;

} ## executePuzzleDQO




# Two arguments:
# 1. Name of inputfile
# 2.
#    matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
#    5 = VT; 6 = WAG; 7 = auto; PAM otherwise
#
# Runs $PUZZLE_DQO on every alignment of a file holding several concatenated
# PHYLIP alignments: the file is cut into equally sized pieces with split(1),
# each piece is run separately and the resulting ".dist" files are appended
# to "<inputfile>.dist". The pieces and their ".dist" files are deleted.
# Last modified: 01/28/02
sub executePuzzleDQObootstrapped {
    my $in            = $_[ 0 ];
    my $matrix_option = $_[ 1 ];


    my $l       = 0;
    my $slen    = 0;
    my $counter = 0;
    my $mat     = "";
    my $a       = "";
    my @a       = ();

    &testForTextFilePresence( $in );

    # Count the alignments: each starts with a "<number> <number>" header line.
    open( GRP, "<$in" ) || &dieWithUnexpectedError( "Cannot open file \"$in\"" );
    while( <GRP> ) {
        if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) {
            $counter++;
        }
    }
    close( GRP );

    # Without at least one header line the division below would be by zero.
    if ( $counter < 1 ) {
        &dieWithUnexpectedError( "No alignment found in \"$in\"" );
    }

    # Lines per alignment = total number of lines / number of alignments.
    $l    = `cat $in | wc -l`;
    $slen = $l / $counter;

    system( "split -$slen $in $in.splt." )
    && &dieWithUnexpectedError( "Could not execute \"split -$slen $in $in.splt.\"" );

    # File glob: all pieces written by split.
    @a = <$in.splt.*>;

    $mat = setModelForPuzzle( $matrix_option );

    foreach $a ( @a ) {

        system( "$PUZZLE_DQO $a >/dev/null 2>&1 << !$mat
y
!" )
        && &dieWithUnexpectedError( "Could not execute \"$PUZZLE_DQO $a\"" );

        system( "cat $a.dist >> $in.dist" )
        && &dieWithUnexpectedError( "Could not execute \"cat outdist >> $in.dist\"" );

        unlink( $a, $a.".dist" );
    }

    return;

} ## executePuzzleDQObootstrapped



# Transfers a Pfam (SELEX) alignment to a
# PHYLIP sequential style alignment.
# It only writes "match columns" as indicated by the
# "# RF" line ('x' means match).
#
# Three arguments:
# 1. infile name
# 2. outfile name
# 3.
#    1 to NOT ensure that match states contain only 'A'-'Z' or '-'
#
# Returns the number of match states (=length of output alignment),
#         the length of the input alignment,
#         the number of seqs in the input alignment
#
# The alignment is held in @seq_array as [ column ][ row ]; the RF line of
# each block is stored in row $rf_y (= number of sequences).
#
# Last modified: 07/07/01
#
sub pfam2phylipMatchOnly {

    my $infile       = $_[ 0 ];
    my $outfile      = $_[ 1 ];
    my $ne           = $_[ 2 ];
    my @seq_name     = ();
    my @seq_array    = ();
    my $return_line  = "";
    my $seq          = "";
    my $x            = 0;
    my $y            = 0;
    my $i            = 0;
    my $x_offset     = 0;
    my $max_x        = 0;
    my $rf_y         = 0;
    my $number_colum = 0;
    my $not_ensure   = 0;
    my $saw_rf_line  = 0;

    if ( $ne && $ne == 1 ) {
        $not_ensure = 1;
    }

    &testForTextFilePresence( $infile );

    open( INPP, "$infile" ) || &dieWithUnexpectedError( "Cannot open file \"$infile\"" );

    # This reads in the first block. It reads in the seq names.
    while ( 1 ) {
        if ( &isPfamSequenceLine( $return_line ) ) {
            $return_line =~ /^(\S+)\s+(\S+)/;
            $seq_name[ $y ] = substr( $1, 0, $LENGTH_OF_NAME );
            $seq = $2;
            for ( $x = 0; $x < length( $seq ); $x++ ) {
                $seq_array[ $x ][ $y ] = substr( $seq, $x, 1 );
            }
            $y++;
        }
        elsif ( &isRFline( $return_line ) ) {
            # The RF line ends the first block; $y is now the number of seqs.
            $saw_rf_line = 1;
            $return_line =~ /\s+(\S+)\s*$/;
            $seq = $1;
            $x_offset = length( $seq );
            $rf_y = $y;
            for ( $x = 0; $x < $x_offset; $x++ ) {
                $seq_array[ $x ][ $rf_y ] = substr( $seq, $x, 1 );
            }
            last;
        }

        $return_line = <INPP>;

        if ( !$return_line ) {
            &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" );
        }
    }

    if ( $saw_rf_line != 1 ) {
        &dieWithUnexpectedError( "Alignment not in expected format (no RF line)" );
    }

    $y = 0;
    $max_x = 0;

    # This reads all blocks after the 1st one.
    while ( $return_line = <INPP> ) {
        if ( &isPfamSequenceLine( $return_line ) ) {
            $return_line =~ /^\S+\s+(\S+)/;
            $seq = $1;
            for ( $x = 0; $x < length( $seq ); $x++ ) {
                $seq_array[ $x + $x_offset ][ $y % $rf_y ] = substr( $seq, $x, 1 );
            }
            $y++;
        }
        elsif ( &isRFline( $return_line ) ) {
            # Every block must contain the same number of sequence lines.
            if ( $y != $rf_y ) {
                &dieWithUnexpectedError( "Alignment not in expected format" );
            }

            $return_line =~ /\s+(\S+)\s*$/;
            $seq = $1;
            $max_x = length( $seq );

            for ( $x = 0; $x < length( $seq ); $x++ ) {
                $seq_array[ $x + $x_offset ][ $rf_y ] = substr( $seq, $x, 1 );
            }

            $y = 0;
            $x_offset = $x_offset + $max_x;
            $max_x = 0;
        }
    }

    close( INPP );

    # Counts the match states, and hence the number of aa in the alignment:
    for ( $x = 0; $x < $x_offset; $x++ ) {
        if ( !$seq_array[ $x ][ $rf_y ] ) {
            &dieWithUnexpectedError( "Alignment not in expected format" );
        }
        if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) {
            $number_colum++;
        }
    }

    # Writes the file:

    open( OUTPP, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
    print OUTPP "$rf_y $number_colum\n";
    for ( $y = 0; $y < $rf_y; $y++ ) {
        print OUTPP "$seq_name[ $y ]";
        # Pads the name with spaces to $LENGTH_OF_NAME characters.
        for ( $i = 0; $i < ( $LENGTH_OF_NAME - length( $seq_name[ $y ] ) ); $i++ ) {
            print OUTPP " ";
        }
        for ( $x = 0; $x < $x_offset; $x++ ) {
            if ( $seq_array[ $x ][ $rf_y ] eq 'x' ) {
                if ( !$seq_array[ $x ][ $y ] ) {
                    &dieWithUnexpectedError( "Alignment not in expected format" );
                }
                if ( $not_ensure != 1 && $seq_array[ $x ][ $y ] !~ /[A-Z]|-/ ) {
                    &dieWithUnexpectedError( "Alignment not in expected format (match states must only contain 'A'-'Z' or '-')" );
                }
                print OUTPP "$seq_array[ $x ][ $y ]";
            }
        }
        print OUTPP "\n";
    }
    close( OUTPP );

    return $number_colum, $x_offset, $rf_y;

} ## pfam2phylipMatchOnly



# Returns whether the argument (a String)
# starts with a SWISS-PROT name (SEQN_SPECI).
# Last modified: 06/21/01
sub startsWithSWISS_PROTname {
    return ( $_[ 0 ] =~ /^[A-Z0-9]{1,4}_[A-Z0-9]{1,5}/ );
}



# Returns whether the argument starts with XXX.. XXXXX.. and the first
# character is not a "#".
# Last modified: 06/21/01
sub isPfamSequenceLine {
    return( !&isPfamCommentLine( $_[ 0 ] )
            && &containsPfamNamedSequence( $_[ 0 ] ) );
}



# Returns whether the argument does start with a "#".
# Last modified: 06/21/01
sub isPfamCommentLine {
    return ( $_[ 0 ] =~ /^#/ );
}



# Returns whether the argument starts with XXX XXXXX.
# Last modified: 06/21/01
sub containsPfamNamedSequence {
    return ( $_[ 0 ] =~ /^\S+\s+\S+/ );
}


# Returns whether the argument starts with a "#" which is followed,
# anywhere on the line, by "RF".
# Last modified: 06/21/01
sub isRFline {
    return ( $_[ 0 ] =~ /^#.*RF/ );
}




# Runs $NEIGHBOR on a pairwise distance file. The menu choices are sent
# to the program on stdin as a shell here-document.
# Five arguments:
# 1. pairwise distance file
# 2. number of bootstraps
# 3. randomize_input_order: 0: do not randomize input order; >=1 jumble
# 4. seed for random number generator
# 5. lower-triangular data matrix? 1: yes; no, otherwise
# Last modified: 06/08/01
sub executeNeighbor {
    my $inpwd  = $_[ 0 ];
    my $bs     = $_[ 1 ];
    my $rand   = $_[ 2 ];
    my $s      = $_[ 3 ];
    my $l      = $_[ 4 ];
    my $jumble = "";
    my $multi  = "";
    my $lower  = "";


    &testForTextFilePresence( $inpwd );

    # Each option string starts with a newline, so that it can be appended
    # directly after the file name in the here-document below.
    if ( $rand >= 1 ) {
        $jumble = "
J
$s";
    }

    if (  $bs >= 2 ) {
        $multi = "
M
$bs
$s";
    }
    if ( $l == 1 ) {
        $lower = "
L";
    }


    system( "$NEIGHBOR >/dev/null 2>&1 << !
$inpwd$jumble$multi$lower
2
3
Y
!" )
    && &dieWithUnexpectedError( "Could not execute \"$NEIGHBOR $inpwd$jumble$multi$lower\"" );
    # 3: Do NOT print out tree


    return;

} ## executeNeighbor



# Four arguments:
# 1. name of alignment file (in correct format!)
# 2. number of bootstraps
# 3. jumbles: 0: do not jumble; >=1 number of jumbles
# 4.
#    seed for random number generator
#
# Runs $PROTPARS on an alignment; the menu choices are sent to the program
# on stdin as a shell here-document. If bootstrapping (>= 2) is requested,
# at least one jumble is forced.
# Last modified: 03/13/04
sub executeProtpars {
    my $alin   = $_[ 0 ];
    my $bs     = $_[ 1 ];
    my $rand   = $_[ 2 ];
    my $s      = $_[ 3 ];
    my $jumble = "";
    my $multi  = "";


    &testForTextFilePresence( $alin );

    if ( $bs >= 2 && $rand < 1 ) {
        $rand = 1;
    }

    if ( $rand >= 1 ) {
        $jumble = "
J
$s
$rand";
    }

    if (  $bs >= 2 ) {
        $multi = "
M
D
$bs";
    }



    system( "$PROTPARS 2>&1 << !
$alin$jumble$multi
I
3
Y
!" )
    && &dieWithUnexpectedError( "Could not execute \"$PROTPARS $alin$jumble$multi\"" );
    # 3: Do NOT print out tree
    # I: Interleaved

    return;

} ## executeProtpars



# "Model of substitution" order for DQO TREE-PUZZLE 5.0:
# Auto
# m -> Dayhoff (Dayhoff et al. 1978)
# m -> JTT (Jones et al. 1992)
# m -> mtREV24 (Adachi-Hasegawa 1996)
# m -> BLOSUM62 (Henikoff-Henikoff 92)
# m -> VT (Mueller-Vingron 2000)
# m -> WAG (Whelan-Goldman 2000)
# m -> Auto
#
# Returns the key strokes (one "m" per line, each preceded by a newline)
# which select the requested model in the TREE-PUZZLE menu.
# One argument:
# matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
# 5 = VT; 6 = WAG; 7 = auto; PAM otherwise
# Last modified: 07/07/01
sub setModelForPuzzle {
    my $matrix_option = $_[ 0 ];
    my $matr          = "";

    if ( $matrix_option == 0 ) { # JTT
        $matr = "
m
m";
    }
    elsif ( $matrix_option == 2 ) { # BLOSUM 62
        $matr = "
m
m
m
m";
    }
    elsif ( $matrix_option == 3 ) { # mtREV24
        $matr = "
m
m
m";
    }
    elsif ( $matrix_option == 5 ) { # VT
        $matr = "
m
m
m
m
m";
    }
    elsif ( $matrix_option == 6 ) { # WAG
        $matr = "
m
m
m
m
m
m";
    }
    elsif ( $matrix_option == 7 ) { # auto
        $matr = "";
    }
    else { # PAM
        $matr = "
m"
    }

    return $matr;

} ## setModelForPuzzle

# Returns the key strokes (one "w" per line, each preceded by a newline)
# which select the requested model of rate heterogeneity.
# One argument:
# Model of rate heterogeneity:
#    1 for "8 Gamma distributed rates"
#    2 for "Two rates (1 invariable + 1 variable)"
#    3 for "Mixed (1 invariable + 8 Gamma rates)"
#    otherwise: Uniform rate
# Last modified: 09/08/03
sub setRateHeterogeneityOptionForPuzzle {
    my $rate_heterogeneity_option = $_[ 0 ];
    my $opt                       = "";

    if ( $rate_heterogeneity_option == 1 ) {
        $opt = "
w";
    }
    elsif ( $rate_heterogeneity_option == 2 ) {
        $opt = "
w
w";
    }
    elsif ( $rate_heterogeneity_option == 3 ) {
        $opt = "
w
w
w";
    }
    else {
        $opt = "";
    }

    return $opt;
} ## setRateHeterogeneityOptionForPuzzle


# Returns the key stroke ("e", preceded by a newline) which selects exact
# parameter estimates, or the empty string.
# One argument:
# Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
# Last modified: 09/08/03
sub setParameterEstimatesOptionForPuzzle {
    my $parameter_estimates_option = $_[ 0 ];
    my $opt                        = "";

    if ( $parameter_estimates_option == 1 ) {
        $opt = "
e";
    }
    else {
        $opt = "";
    }

    return $opt;
} ## setParameterEstimatesOptionForPuzzle



# Runs $PUZZLE on every alignment of a file holding several concatenated
# PHYLIP alignments: the file is cut into equally sized pieces with split(1),
# each piece is run separately and the resulting ".dist" files are appended
# to "<inputfile>.dist". The pieces and their output files are deleted.
# Two/three/four arguments:
# 1. Name of inputfile
# 2. matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
#    5 = VT; 6 = WAG; 7 = auto; PAM otherwise
# 3. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
# 4. Model of rate heterogeneity:
#    1 for "8 Gamma distributed rates"
#    2 for "Two rates (1 invariable + 1 variable)"
#    3 for "Mixed (1 invariable + 8 Gamma rates)"
#    otherwise: Uniform rate
# Last modified: 09/08/03 (added 3rd and 4th parameter)
sub executePuzzleBootstrapped {
    my $in                         = $_[ 0 ];
    my $matrix_option              = $_[ 1 ];
    my $parameter_estimates_option = $_[ 2 ];
    my $rate_heterogeneity_option  = $_[ 3 ];

    my $l       = 0;
    my $slen    = 0;
    my $counter = 0;
    my $mat     = "";
    my $est     = "";
    my $rate    = "";
    my $a       = "";
    my @a       = ();

    &testForTextFilePresence( $in );

    # Count the alignments: each starts with a "<number> <number>" header line.
    open( GRP, "<$in" ) || die "\n\n$0: Unexpected error: Cannot open file <<$in>>: $!";
    while( <GRP> ) {
        if ( $_ =~ /^\s*\d+\s+\d+\s*$/ ) {
            $counter++;
        }
    }
    close( GRP );

    # Without at least one header line the division below would be by zero.
    if ( $counter < 1 ) {
        die "\n\n$0: executePuzzleBootstrapped: No alignment found in \"$in\"";
    }

    # Lines per alignment = total number of lines / number of alignments.
    $l    = `cat $in | wc -l`;
    $slen = $l / $counter;

    system( "split -$slen $in $in.splt." )
    && die "\n\n$0: executePuzzleBootstrapped: Could not execute \"split -$slen $in $in.splt.\": $!";

    # File glob: all pieces written by split.
    @a = <$in.splt.*>;

    $mat = setModelForPuzzle( $matrix_option );
    if ( $parameter_estimates_option ) {
        $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option );
    }
    if ( $rate_heterogeneity_option ) {
        $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option );
    }

    foreach $a ( @a ) {
        print "-".$a."\n";
        system( "$PUZZLE $a << !
k
k$mat$est$rate
y
!" )
        && die "$0: Could not execute \"$PUZZLE $a\"";

        system( "cat $a.dist >> $in.dist" )
        && die "$0: Could not execute \"cat outdist >> $in.dist\"";

        unlink( $a, $a.".dist", $a.".tree" );
    }

    return;

} ## executePuzzleBootstrapped





# Runs $PUZZLE on one input file.
# Two/three/four arguments:
# 1. Name of inputfile
# 2. Matrix option: 0 = JTT; 2 = BLOSUM 62; 3 = mtREV24;
#    5 = VT; 6 = WAG; 7 = auto; PAM otherwise
# 3. Parameter estimates: 1 for "Exact (slow)"; "Approximate (faster)" otherwise
# 4. Model of rate heterogeneity:
#    1 for "8 Gamma distributed rates"
#    2 for "Two rates (1 invariable + 1 variable)"
#    3 for "Mixed (1 invariable + 8 Gamma rates)"
#    otherwise: Uniform rate
# Last modified: 09/08/03 (added 3rd and 4th parameter)
sub executePuzzle {
    my $in                         = $_[ 0 ];
    my $matrix_option              = $_[ 1 ];
    my $parameter_estimates_option = $_[ 2 ];
    my $rate_heterogeneity_option  = $_[ 3 ];
    my $mat                        = "";
    my $est                        = "";
    my $rate                       = "";

    &testForTextFilePresence( $in );

    $mat = &setModelForPuzzle( $matrix_option );
    if ( $parameter_estimates_option ) {
        $est = &setParameterEstimatesOptionForPuzzle( $parameter_estimates_option );
    }
    if ( $rate_heterogeneity_option ) {
        $rate = &setRateHeterogeneityOptionForPuzzle( $rate_heterogeneity_option );
    }


    system( "$PUZZLE $in << !
k
k$mat$est$rate
y
!"
) && die "$0: Could not execute \"$PUZZLE\"";

    return;

} ## executePuzzle




# Preparation of the pwd file:
# Appends, to every distance matrix in the pwd file, one row for the query
# holding the distances read from the "distances to query" file, followed
# by "0.0". The number of sequences in each matrix header is incremented.
# Four arguments:
# 1. pwd file
# 2. file with the distances to the query
# 3. outfile
# 4. name of the query
# Returns the number of matrices (blocks) read from the pwd file.
sub addDistsToQueryToPWDfile {
    my $pwd_file          = $_[ 0 ];
    my $disttoquery_file  = $_[ 1 ];
    my $outfile           = $_[ 2 ];
    my $name_of_query     = $_[ 3 ];
    my $name_of_query_    = "";
    my $return_line_pwd   = "";
    my $return_line_dq    = "";
    my $num_of_sqs        = 0;
    my $block             = 0;
    my $name_from_pwd     = "X";
    my $name_from_dq      = "Y";
    my @dists_to_query    = ();
    my $i                 = 0;

    &testForTextFilePresence( $pwd_file );
    &testForTextFilePresence( $disttoquery_file );

    # Pads the query name with spaces to $LENGTH_OF_NAME characters.
    $name_of_query_ = $name_of_query;
    for ( my $j = 0; $j <= ( $LENGTH_OF_NAME - length( $name_of_query ) - 1 ); ++$j ) {
        $name_of_query_ .= " ";
    }

    open( OUT_AD, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
    open( IN_PWD, "$pwd_file" ) || &dieWithUnexpectedError( "Cannot open file \"$pwd_file\"" );
    open( IN_DQ, "$disttoquery_file" ) || &dieWithUnexpectedError( "Cannot open file \"$disttoquery_file\"" );

    W: while ( $return_line_pwd = <IN_PWD> ) {


        # A line holding just a number starts a new matrix.
        if ( $return_line_pwd =~ /^\s*(\d+)\s*$/ ) {
            $num_of_sqs = $1;
            $num_of_sqs++;
            if ( $block > 0 ) {
                # Finishes the previous matrix with the row for the query.
                print OUT_AD "$name_of_query_  ";
                for ( my $j = 0; $j < $i; ++$j ) {
                    print OUT_AD "$dists_to_query[ $j ]  ";
                }
                print OUT_AD "0.0\n";
            }
            print OUT_AD "  $num_of_sqs\n";
            $block++;
            @dists_to_query = ();
            $i = 0;
        }

        # Only in the first matrix are the sequence names compared.
        if ( $block == 1
        && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) {
            $name_from_pwd = $1;

            if ( !defined( $return_line_dq = <IN_DQ> ) ) {
                &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
            }

            # Skips (at most) one empty line.
            if ( $return_line_dq !~ /\S/ ) {
                if ( !defined( $return_line_dq = <IN_DQ> ) ) {
                    &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
                }
            }
            $return_line_dq =~ /^\s*(\S+)\s+(\S+)/;
            $name_from_dq = $1;
            $dists_to_query[ $i++ ] = $2;


            if ( $name_from_pwd ne $name_from_dq ) {
                &dieWithUnexpectedError( "Order of sequence names in \"$pwd_file\" and \"$disttoquery_file\" is not the same" );
            }
            print OUT_AD $return_line_pwd;

        }
        elsif ( $block > 1
        && $return_line_pwd =~ /^\s*(\S+)\s+\S+/ ) {
            $name_from_pwd = $1;
            if ( !defined( $return_line_dq = <IN_DQ> ) ) {
                &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
            }
            if ( $return_line_dq !~ /\S/ ) {
                if ( !defined( $return_line_dq = <IN_DQ> ) ) {
                    &dieWithUnexpectedError( "\"$disttoquery_file\" seems too short" );
                }
            }
            $return_line_dq =~ /^\s*\S+\s+(\S+)/;
            $dists_to_query[ $i++ ] = $1;
            print OUT_AD $return_line_pwd;
        }
    }
    # Finishes the last matrix with the row for the query.
    print OUT_AD "$name_of_query_  ";
    for ( my $j = 0; $j < $i; ++$j ) {
        print OUT_AD "$dists_to_query[ $j ]  ";
    }
    print OUT_AD "0.0\n";

    close( OUT_AD );
    close( IN_PWD );
    close( IN_DQ );
    return $block;

} ## addDistsToQueryToPWDfile




# Runs "$HMMFETCH <db> <name>" and redirects its output to the outfile.
# Three arguments:
# 1. HMMER model db
# 2. name of HMM
# 3. outputfile name
# Last modified: 02/27/01
sub executeHmmfetch {

    my $db      = $_[ 0 ];
    my $name    = $_[ 1 ];
    my $outfile = $_[ 2 ];

    system( "$HMMFETCH $db $name > $outfile" )
    && &dieWithUnexpectedError( "Could not execute \"$HMMFETCH $db $name > $outfile\"" );
    return;

} ## executeHmmfetch



# Checks whether a file is present, not empty and a plain textfile.
# Dies (via dieWithUnexpectedError) if this is not the case.
# One argument: name of file.
# Last modified: 07/07/01
sub testForTextFilePresence {
    my $file = $_[ 0 ];
    unless ( ( -s $file ) && ( -f $file ) && ( -T $file ) ) {
        dieWithUnexpectedError( "File \"$file\" does not exist, is empty, or is not a plain textfile" );
    }
} ## testForTextFilePresence


# Removes all whitespace from the argument and appends a "/" unless the
# result already ends with one. Returns the resulting string.
# Last modified: 02/21/03
sub addSlashAtEndIfNotPresent {
    my $filename = $_[ 0 ];
    $filename =~ s/\s+//g;
    unless ( $filename =~ /\/$/ ) {
       $filename = $filename."/";
    }
    return $filename;
} ## addSlashAtEndIfNotPresent



# Last modified: 02/15/02
sub exitWithWarning {

    my $text = $_[ 0 ];
    if ( defined( $_[ 1 ] ) && $_[ 1 ] == 1 ) {
        print( "

user error

\n" ); + print( "

\n" ); + print( "$text\n" ); + print( "

\n" ); + print( "

 

\n" );
    }
    else {
        print( "\n\n$text\n\n" );
    }

    exit( 0 );

} ## exitWithWarning



# Dies with the given text, preceded by the program name ($0) and followed
# by the current value of $!.
# One argument: text describing the error.
# Last modified: 02/15/02
sub dieWithUnexpectedError {

    my $text = $_[ 0 ];

    die( "\n\n$0:\nUnexpected error (should not have happened):\n$text\n$!\n\n" );

} ## dieWithUnexpectedError



1;
diff --git a/forester/archive/perl/rio_slave.pl b/forester/archive/perl/rio_slave.pl
new file mode 100755
index 0000000..94a0e56
--- /dev/null
+++ b/forester/archive/perl/rio_slave.pl
@@ -0,0 +1,160 @@
#!/usr/bin/perl -W

# rio_slave.pl
# ------------
#
# Copyright (C) 2002 Washington University School of Medicine
# and Howard Hughes Medical Institute
# All rights reserved
#
# Created: 01/18/02
# Author: Christian M. Zmasek
# zmasek@genetics.wustl.edu
# http://www.genetics.wustl.edu/eddy/people/zmasek/
#
# Last modified: 02/20/02


# Arguments:

# 0: first block in multiple alignment to process
# 1: last block in multiple alignment to process
# 2: name of resampled alignment, inc.
#    query
# 3: matrix number
# 4: name of query
# 5: PWD file
# 6: seed for random number generator for neighbor
# 7: node number
# 8: temp dir


use strict;

use FindBin;
use lib $FindBin::Bin;
use rio_module;

if ( @ARGV != 9 ) {
    &dieWithUnexpectedError( "argument count is off" );
}

my $start       = $ARGV[ 0 ];
my $end         = $ARGV[ 1 ];
my $align       = $ARGV[ 2 ];
my $matrix_n    = $ARGV[ 3 ];
my $name        = $ARGV[ 4 ];
my $pwd_file    = $ARGV[ 5 ];
my $seed        = $ARGV[ 6 ];
my $number      = $ARGV[ 7 ];
my $temp_dir    = $ARGV[ 8 ];

my $b           = 0;
my $outfile     = "";
# Private working directory of this node.
my $mytemp_dir  = $temp_dir."/dir_".$number;

mkdir(  $mytemp_dir, 0700 )
|| &dieWithUnexpectedError( "Could not create \"$mytemp_dir\"" );

unless ( ( -e $mytemp_dir ) && ( -d $mytemp_dir ) ) {
    &dieWithUnexpectedError( "\"$mytemp_dir\" does not exist, or is not a directory" );
}


# Distances of all sequences to the query, for each resampled alignment.
&executePuzzleDQObootstrapped( $align, $matrix_n );

system( "mv", $align.".dist", $mytemp_dir."/DISTs_TO_QUERY" )
&& &dieWithUnexpectedError( "could not mv" );

unlink( $align );

sleep( 2 );

# Extracts the matrices $start .. $end from the PWD file.
&dividePWDfile( $pwd_file,
                $mytemp_dir."/DIVIDED",
                $start,
                $end );

&addDistsToQueryToPWDfile( $mytemp_dir."/DIVIDED",
                           $mytemp_dir."/DISTs_TO_QUERY",
                           $mytemp_dir."/PWD_INC_QUERY",
                           $name );

unlink( $mytemp_dir."/DIVIDED" );

# Number of matrices handled by this node.
$b = $end - $start + 1;

chdir ( $mytemp_dir )
|| &dieWithUnexpectedError( "Could not chdir to \"$mytemp_dir\"" );

&executeNeighbor( $mytemp_dir."/PWD_INC_QUERY",
                  $b,
                  1, # randomize input order
                  $seed,
                  1 ); # lower-triangular data matrix


unlink( "outfile", $mytemp_dir."/PWD_INC_QUERY", $mytemp_dir."/DISTs_TO_QUERY" );

system( "mv", "outtree", "../MAKETREEOUT".$MULTIPLE_TREES_FILE_SUFFIX.$number )
&& &dieWithUnexpectedError( "could not mv" );

sleep( 1 );

chdir( ".." )
|| &dieWithUnexpectedError( "Could not chdir to \"..\"" );

rmdir( $mytemp_dir ) || &dieWithUnexpectedError( "Could not delete \"$mytemp_dir\"" );

# An empty "FINISHED_<node number>" file is created once everything is done.
$outfile = "FINISHED_$number";

open( OUT, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );
close( OUT );

exit( 0 );




# Copies the distance matrices number $start to $end (counted from 0,
# both inclusive) from the PWD file to the outfile. A line holding just a
# number is taken as the start of a matrix.
# Four arguments:
# 1. PWD file
# 2. outfile
# 3. number of first matrix to copy
# 4. number of last matrix to copy
sub dividePWDfile {
    my $pwd_file    = $_[ 0 ];
    my $outfile     = $_[ 1 ];
    my $start       = $_[ 2 ]; # e.g. 0
    my $end         = $_[ 3 ]; # e.g. 9

    my $c           = 0;
    my $write       = 0;
    my $return_line = "";

    &testForTextFilePresence( $pwd_file );

    open( IN_PWD, "$pwd_file" ) || &dieWithUnexpectedError( "Cannot open file \"$pwd_file\"" );
    open( OUT_PWD, ">$outfile" ) || &dieWithUnexpectedError( "Cannot create file \"$outfile\"" );

    while ( $return_line = <IN_PWD> ) {
        if ( $return_line =~ /^\s*(\d+)\s*$/ ) {
            if ( $c >= $start && $c <= $end ) {
                $write = 1;
            }
            elsif ( $c > $end ) {
                # Everything wanted has been written.
                last;
            }
            $c++;
        }
        if ( $write == 1 ) {
            print OUT_PWD $return_line;
        }
    }

    close( IN_PWD );
    close( OUT_PWD );

    return;

} ## dividePWDfile







diff --git a/forester/archive/perl/rio_slave_driver.pl b/forester/archive/perl/rio_slave_driver.pl
new file mode 100755
index 0000000..3e82e4e
--- /dev/null
+++ b/forester/archive/perl/rio_slave_driver.pl
@@ -0,0 +1,108 @@
#!/usr/bin/perl -W

# rio_slave_driver.pl
# -------------------
#
# Copyright (C) 2002 Washington University School of Medicine
# and Howard Hughes Medical Institute
# All rights reserved
#
# Created: 01/18/02
# Author: Christian M. Zmasek
# zmasek@genetics.wustl.edu
# http://www.genetics.wustl.edu/eddy/people/zmasek/
#
# Last modified: 02/20/02


# 0: block size
# 1: number of blocks which have a size of block size + 1
# 2: name of resampled alignment, inc.
#    query
# 3: matrix number
# 4: name of query
# 5: PWD file
# 6: temp dir
# 7: seed for random number generator for neighbor
# 8...: list of node names
#
# For every node in the list, a child process is forked which starts
# $RIO_SLAVE on that node via ssh, handing it a range of blocks
# ($start .. $end) to process.




use strict;

use FindBin;
use lib $FindBin::Bin;
use rio_module;

if ( @ARGV < 9 ) {
    &dieWithUnexpectedError( "argumnet count off" );
}


my $block_size    = shift( @ARGV );
my $larger_blocks = shift( @ARGV );
my $align         = shift( @ARGV );
my $matrix_n      = shift( @ARGV );
my $name          = shift( @ARGV );
my $pwd_file      = shift( @ARGV );
my $temp_dir      = shift( @ARGV );
my $seed          = shift( @ARGV );
my @nodelist      = @ARGV;
my $start         = 0;
my $end           = 0;
my $x             = 0;
my $node          = "";


$start = 0;

# The first $larger_blocks nodes get one block more than the others.
if ( $larger_blocks > 0 ) {
    $end = $block_size;
}
else {
    $end = $block_size - 1;
}

for ( $x = 0; $x < scalar( @nodelist ); $x++ ) {
    my $child_pid;
    $node = $nodelist[ $x ];

    if ( !defined( $child_pid = fork() ) ) {
        &dieWithUnexpectedError( "cannot fork" );
    }
    elsif ( $child_pid ) {
        # I'm the parent, forking off $nodelist number of children
    }
    else {
        # Child: replaced by the ssh process; exec only returns on failure.
        exec( "ssh",
              $node,
              "/usr/bin/perl",
              $RIO_SLAVE,
              $start,
              $end,
              $align.$x,
              $matrix_n,
              $name,
              $pwd_file,
              $seed,
              $x,
              $temp_dir )
        || &dieWithUnexpectedError( "could not \"exec ssh $node /usr/bin/perl $RIO_SLAVE\"" );

    }
    # Range of blocks for the next node.
    $larger_blocks--;
    if ( $larger_blocks > 0 ) {
        $start += ( $block_size + 1 );
        $end   += ( $block_size + 1 );
    }
    elsif ( $larger_blocks == 0 ) {
        # The next node is the first one to get a block of normal size.
        $start += ( $block_size + 1 );
        $end   += $block_size;
    }
    else {
        $start += $block_size;
        $end   += $block_size;
    }
}

exit( 0 );
diff --git a/forester/archive/perl/xt.pl b/forester/archive/perl/xt.pl
new file mode 100755
index 0000000..4fc5da3
--- /dev/null
+++ b/forester/archive/perl/xt.pl
@@ -0,0 +1,640 @@
#!/usr/bin/perl -W

# xt.pl
# -----
#
# Copyright (C) 2003 Christian M. Zmasek
# All rights reserved
#
# Author: Christian M.
#         Zmasek
#         zmasek@genetics.wustl.edu
#         http://www.genetics.wustl.edu/eddy/people/zmasek/
#
# Version: 1.010
# Last modified 03/25/03
#
#
#
# Calculates trees based on Pfam alignments or precalculated distances using
# makeTree.pl.


use strict;
use FindBin;
use lib $FindBin::Bin;
use rio_module;


# To use _your_ species list make $MY_SPECIES_NAMES_FILE point to it.
# To use _your_ TrEMBL ACDEOS make $MY_TREMBL_ACDEOS_FILE point to it.

my $MY_SPECIES_NAMES_FILE = $SPECIES_NAMES_FILE; # $SPECIES_NAMES_FILE is inherited
                                                 # from rio_module.pm

my $MY_TREMBL_ACDEOS_FILE = $TREMBL_ACDEOS_FILE; # $TREMBL_ACDEOS_FILE is inherited
                                                 # from rio_module.pm

my $MY_TEMP_DIR           = $TEMP_DIR_DEFAULT;   # $TEMP_DIR_DEFAULT is inherited
                                                 # from rio_module.pm

my $LOGFILE               = "00_xt_logfile";
my $PWD_SUFFIX            = ".pwd";
my $ALN_SUFFIX            = ".aln";

my $use_precalc_pwd         = 0;  # 0: input is Pfam alignments ($input_dir must point to "/Pfam/Full/").
                                  # 1: input is precalculated pairwise distances ($input_dir must point to "").
my $use_precalc_pwd_and_aln = 0;  # 0: otherwise
                                  # 1: input is precalculated pairwise distances
                                  # _and_ alns,$use_precalc_pwd = 1 ($input_dir must point to alns).
my $add_species             = 0;  # "I": 0: do nothing with species information.
                                  # "S": 1: add species code to TrEMBL sequences and ignore sequences from
                                  #         species not in $MY_SPECIES_NAMES_FILE (only if input is Pfam alignments).
my $options                 = ""; # Options for makeTree.pl, see makeTree.pl.
                                  # Do not use F [Pairwise distance (pwd) file as input (instead of alignment)]
                                  # since this is determined with $USE_PRECALC_PWD
my $min_seqs                = 0;  # Minimal number of sequences (TREE-PUZZLE needs at least four seqs).
                                  # Ignored if $USE_PRECALC_PWD = 1
my $max_seqs                = 0;  # Maximal number of sequences.
# Ignored if $USE_PRECALC_PWD = 1 (refers to $max_seqs)
my $input_dir               = "";
my $input_dir_aln           = ""; # for .aln files
my $output_dir              = "";

my $i                       = 0;
my $seqs                    = 0;
my $filename                = "";
my @filenames               = ();
my %AC_OS                   = (); # AC -> species name
my %Species_names_hash      = ();
my $too_small               = 0;
my $too_large               = 0;
my $already_present         = 0;
my @too_small_names         = ();
my @too_large_names         = ();
my @already_present_names   = ();


# Analyzes the options:
# ---------------------
# 3 arguments: options, pwd input dir, output dir
# 4 arguments: options, pwd input dir, aln input dir, output dir
# 6 arguments: "I" or "S", options, min seqs, max seqs, Pfam input dir, output dir

unless ( @ARGV == 3 || @ARGV == 4 || @ARGV == 6 ) {
    &printUsage();
}

if ( @ARGV == 3 ) {
    $use_precalc_pwd         = 1;
    $use_precalc_pwd_and_aln = 0;
    $options                 = $ARGV[ 0 ];
    $input_dir               = $ARGV[ 1 ];
    $output_dir              = $ARGV[ 2 ];
    $add_species             = 0;
}
elsif ( @ARGV == 4 ) {
    $use_precalc_pwd         = 1;
    $use_precalc_pwd_and_aln = 1;
    $options                 = $ARGV[ 0 ];
    $input_dir               = $ARGV[ 1 ];
    $input_dir_aln           = $ARGV[ 2 ];
    $output_dir              = $ARGV[ 3 ];
    $add_species             = 0;
    $input_dir_aln = &addSlashAtEndIfNotPresent( $input_dir_aln );
}
else {
    $use_precalc_pwd         = 0;
    $use_precalc_pwd_and_aln = 0;
    $add_species             = $ARGV[ 0 ];
    $options                 = $ARGV[ 1 ];
    $min_seqs                = $ARGV[ 2 ];
    $max_seqs                = $ARGV[ 3 ];
    $input_dir               = $ARGV[ 4 ];
    $output_dir              = $ARGV[ 5 ];
    # TREE-PUZZLE needs at least four seqs.
    if ( $min_seqs < 4 ) {
        $min_seqs = 4;
    }
    if ( $add_species eq "I" ) {
        $add_species = 0;
    }
    elsif ( $add_species eq "S" ) {
        $add_species = 1;
    }
    else {
        print( "\nFirst must be either \"I\" [Ignore species] or\n\"S\" [add Species code to TrEMBL sequences and ignore sequences from species not in $MY_SPECIES_NAMES_FILE].\n\n" );
        &printUsage();
    }
}



$input_dir   = &addSlashAtEndIfNotPresent( $input_dir );
$output_dir  = &addSlashAtEndIfNotPresent( $output_dir );
$MY_TEMP_DIR = &addSlashAtEndIfNotPresent( $MY_TEMP_DIR );




# This adds a "-" before the options for makeTree:
# ------------------------------------------------
unless ( $options =~ /^-/ ) {
    $options = "-".$options;
}



# If based on pwd, species are "fixed" and
# certain options for makeTree
# are not applicable and option "F" is mandatory:
# ---------------------------------------------------------------------
if ( $use_precalc_pwd == 1 ) {
    $options =~ s/D//g;
    $options =~ s/C//g;
    $options =~ s/N//g;
    unless ( $options =~ /F/ ) {
        $options = $options."F";
    }
}
else {
    $options =~ s/F//g;
}

if ( $use_precalc_pwd_and_aln == 1 ) {
    unless ( $options =~ /U/ ) {
        $options = $options."U";
    }
}
if ( $use_precalc_pwd_and_aln == 0 && $use_precalc_pwd == 1 ) {
    $options =~ s/U//g;
}




# If species are to be considered, species names file and TrEMBL ACDEOS
# files need to be read in:
# ---------------------------------------------------------------------
if ( $add_species == 1 ) {
    print "\nXT.PL: Reading species names file...\n";
    &readSpeciesNamesFile( $MY_SPECIES_NAMES_FILE );
    print "\nXT.PL: Reading TrEMBL ACDEOS file...\n";
    &readTrEMBL_ACDEOS_FILE( $MY_TREMBL_ACDEOS_FILE );
}



# This creates the temp file:
# --------------------------

my $time = time;
my $ii   = 0;

my $temp_file = $MY_TEMP_DIR."xt".$time.$ii;

# Increments the number at the end until the name is not yet in use.
while ( -e $temp_file ) {
    $ii++;
    $temp_file = $MY_TEMP_DIR."xt".$time.$ii;
}



&startLogfile();

opendir( DIR, $input_dir ) || error( "Cannot open directory \"$input_dir\": $!" );

$i = 0;

# Collects the names of the input files ("." and ".." are skipped; if input
# is pwd files, only names ending with $PWD_SUFFIX are used).
while( defined( $filename = readdir( DIR ) ) ) {
    if ( $filename =~ /^\.\.?$/ ) {
        next;
    }
    if ( $use_precalc_pwd == 1 && $filename !~ /$PWD_SUFFIX$/ ) {
        next
    }
    $filenames[ $i ] = $filename;
    $i++;
}

# Handles opened with opendir must be closed with closedir, not close.
closedir( DIR );

$i = 0;

FOREACH: foreach $filename ( @filenames ) {

    # If the corresponding tree seems to already exists, do next one.
    my $fn = $filename;
    if ( $use_precalc_pwd == 1 ) {
        $fn =~ s/$PWD_SUFFIX$//;
    }
    if ( -e "$output_dir$fn.nhx" ) {
        $already_present_names[ $already_present++ ] = $fn;
        next FOREACH;
    }

    if ( $use_precalc_pwd != 1 ) {

        if ( $add_species == 1 ) {

            # 1. Pfam flat file name
            # 2. outfile name
            # Returns the number of sequences in the resulting alignment.
            $seqs = &removeSeqsFromPfamAlign( $input_dir.$filename, $temp_file );

        }
        else {
            # Gets the number of seqs in the alignment.
            open( F, "$input_dir"."$filename" )
            || &error( "Cannot open file \"$input_dir$filename\": $!" );
            while( <F> ) {
                if ( $_ =~/^#.+SQ\s+(\d+)\s*$/ ) {
                    $seqs = $1;
                    last;
                }
            }
            close( F );
        }

        if ( $seqs < $min_seqs ) {
            $too_small_names[ $too_small++ ] = $filename;
            next FOREACH;
        }
        if ( $seqs > $max_seqs ) {
            $too_large_names [ $too_large++ ] = $filename;
            next FOREACH;
        }
    }

    print "\n\n\n\n";
    print "XT.PL\n";
    if ( $use_precalc_pwd == 1 ) {
        print "working on: $filename\n";
    }
    else {
        print "working on: $filename [$seqs seqs]\n";
    }
    print "[tree calculation $i]\n";
    print "=====================================================================\n\n\n";


    unlink( "$output_dir$filename.aln", "$output_dir$filename.log" );

    print( "XT.PL: executing:\n" );

    my $inputfile = "";

    if ( $add_species == 1 ) {
        $inputfile = $temp_file;
    }
    else {
        $inputfile = $input_dir.$filename;
    }

    if ( $use_precalc_pwd == 1 ) {
        $filename =~ s/$PWD_SUFFIX$//;
    }

    if ( $use_precalc_pwd_and_aln == 1 ) {
        $inputfile = $inputfile." ".$input_dir_aln.$filename.$ALN_SUFFIX;
    }

    my $command = "$MAKETREE $options $inputfile $output_dir$filename.nhx";

    print( "$command\n" );
    system( $command ) && &error( "Could not execute \"$command\"" );

    if ( $add_species == 1 ) {
        if ( unlink( $temp_file ) != 1 ) {
            &error( "Unexpected: Could not delete \"$temp_file\"" );
        }
    }

    $i++;

}

&finishLogfile();

print( "\n\n\nXT.PL: Done!\n" );
print( "Wrote \"$LOGFILE\".\n\n" );

exit( 0 );






# Prints the given text as an error message and exits with status -1.
# One argument: text describing the error.
sub error{

    my $text = $_[ 0 ];

    print( "\nxt.pl: ERROR:\n" );
    print( "$text\n\n" );

    exit( -1 );

} ## error

# Similar to the method with the same name in "rio.pl".
# Removes sequences from a Pfam flat file.
# Adds species to TrEMBL seqs.
# It can remove all sequences not from species listed in a species names file.
# Two arguments:
# 1. Pfam flat file name
# 2. outfile name
# Returns the number of sequences in the resulting alignment.
# Uses the global hashes %AC_OS and %Species_names_hash (filled by
# readTrEMBL_ACDEOS_FILE and readSpeciesNamesFile) and $LENGTH_OF_NAME.
# Sequence lines are dropped if
# - the name does not start with a SWISS-PROT name and its accession
#   (the part before the first "/") is not in %AC_OS, or its species
#   is not in %Species_names_hash,
# - the name starts with a SWISS-PROT name and no species code can be
#   found before a "/", or the code is not in %Species_names_hash.
# Lines recognized as Pfam comment lines are not written to the outfile.
# Last modified: 02/22/03
sub removeSeqsFromPfamAlign {
    my $infile            = $_[ 0 ];
    my $outfile           = $_[ 1 ];
    my $return_line       = "";
    my $saw_sequence_line = 0;
    my $number_of_seqs    = 0;
    my $OS                = "";
    my $AC                = "";
    my $length            = 0;
    my $seq_name          = "";
    my $seq               = "";


    open( OUT_RNSP, ">$outfile" ) || die "\n\n$0: Unexpected error: Cannot create file \"$outfile\": $!";
    open( IN_RNSP, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!";
    while ( $return_line = <IN_RNSP> ) {

        # $saw_sequence_line: 0 = no sequence line seen yet,
        # 1 = inside the first run of sequence lines (lines are counted),
        # 2 = first run is over (counting has stopped).
        if ( $saw_sequence_line == 1
        && !&containsPfamNamedSequence( $return_line )
        && !&isPfamCommentLine( $return_line ) ) {
            # This is just for counting purposes.
            $saw_sequence_line = 2;
        }
        if ( &isPfamSequenceLine( $return_line ) ) {
            if ( $saw_sequence_line == 0 ) {
                $saw_sequence_line = 1;
            }
            $return_line =~ /^\s*(\S+)\s+(\S+)/;
            $seq_name = $1;
            $seq      = $2;
            if ( !&startsWithSWISS_PROTname( $return_line ) ) {
                $seq_name =~ /^(\S+)\//;
                $AC = $1;
                unless( exists( $AC_OS{ $AC } ) ) {
                    #ACs not present in "ACDEOS" file.
                    next;
                }
                $OS = $AC_OS{ $AC };
                if ( !$OS || $OS eq "" ) {
                    die "\n\n$0: Unexpected error: species for \"$AC\" not found.\n\n";
                }
                unless( exists( $Species_names_hash{ $OS } ) ) {
                    next;
                }
                # Inserts "_<species>" in front of the first "/".
                $seq_name =~ s/\//_$OS\//;
            }
            else {
                if ( $return_line =~ /_([A-Z0-9]{1,5})\// ) {
                    unless( exists( $Species_names_hash{ $1 } ) ) {
                        next;
                    }
                }
                # remove everything whose species cannot be determined.
                else {
                    next;
                }
            }
            # Pads the name with spaces up to $LENGTH_OF_NAME characters,
            # longer names are left as they are.
            $length = length( $seq_name );
            if ( $length < $LENGTH_OF_NAME ) {
                $seq_name .= " " x ( $LENGTH_OF_NAME - $length );
            }
            $return_line = $seq_name.$seq."\n";
        }

        if ( !&isPfamCommentLine( $return_line ) ) {
            print OUT_RNSP $return_line;
        }

        # Only lines which were not skipped above get here and are counted.
        if ( $saw_sequence_line == 1 ) {
            $number_of_seqs++;
        }
    } ## while ( $return_line = <IN_RNSP> )
    close( IN_RNSP );
    close( OUT_RNSP );

    return $number_of_seqs;

} ## removeSeqsFromPfamAlign







# Reads in (SWISS-PROT) species names from a file.
# Names must be separated by newlines.
# Lines beginning with "#" are ignored.
# A possible "=" and everything after is ignored.
# The names become the keys of the global hash %Species_names_hash
# (the values are empty strings).
# One argument: species-names-file name
# Last modified: 04/24/01
sub readSpeciesNamesFile {
    my $infile      = $_[ 0 ];
    my $return_line = "";
    my $species     = "";

    unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
        die "\n\n$0: Error: \"$infile\" does not exist, is empty, or is not a plain textfile.\n\n";
    }

    open( IN_RSNF, "$infile" ) || die "\n\n$0: Unexpected error: Cannot open file <<$infile>>: $!\n\n";
    while ( $return_line = <IN_RSNF> ) {
        # Only the first whitespace-free token of a line is used.
        if ( $return_line !~ /^\s*#/ && $return_line =~ /(\S+)/ ) {
            $species = $1;
            $species =~ s/=.+//;
            $Species_names_hash{ $species } = "";
        }
    }
    close( IN_RSNF );

    return;
} ## readSpeciesNamesFile



# Reads in a TrEMBL ACDEOS file and fills the global hash %AC_OS.
# For each line matching "<token>;<anything without ';'>;<token>" the first
# token becomes the key and the last token the value.
# (Presumably: accession -> species code, see the use of %AC_OS in
# removeSeqsFromPfamAlign.)
# One argument: ACDEOS file name
# Last modified: 05/18/01
sub readTrEMBL_ACDEOS_FILE {
    my $infile      = $_[ 0 ];
    my $return_line = "";

    unless ( ( -s $infile ) && ( -f $infile ) && ( -T $infile ) ) {
        &error( "\"$infile\" does not exist, is empty, or is not a plain textfile" );
    }
    # Fill up (huge) hashs.
    open( HH, "$infile" ) || &error( "Unexpected error: Cannot open file \"$infile\"" );
    while ( $return_line = <HH> ) {

        if ( $return_line =~ /(\S+);[^;]*;(\S+)/ ) {
            $AC_OS{ $1 } = $2;
        }
    }
    close( HH );
} ## readTrEMBL_ACDEOS_FILE



# Creates the logfile $LOGFILE and writes the settings of this run to it.
# Stops with an error if $LOGFILE already exists or cannot be created.
# The filehandle L is left open, it is closed by finishLogfile.
# Last modified: 05/17/01
sub startLogfile {
    if ( -e "$LOGFILE" ) {
        &error( "logfile \"$LOGFILE\" already exists, rename it or place it in another directory" );
    }

    open( L, ">$LOGFILE" ) || &error( "Cannot create logfile: $!" );
    if ( $use_precalc_pwd != 1 ) {
        print L "Trees are based directly on Pfam alignments\n";
        if ( $add_species == 1 ) {
            print L "Add species code to TrEMBL sequences and ignore sequences\nfrom species not in $MY_SPECIES_NAMES_FILE\n";
        }
        else {
            print L "Do nothing with species information\n";
        }
    }
    else {
        print L "Trees are based on precalculated pairwise distances\n";
    }
    if ( $use_precalc_pwd_and_aln == 1 ) {
        print L "and the matching alignments\n";
    }
    print L "Options for makeTree: $options\n";
    if ( $use_precalc_pwd != 1 ) {
        print L "Min seqs : $min_seqs\n";
        print L "Max seqs : $max_seqs\n";
    }
    if ( $add_species == 1 ) {
        print L "TrEMBL ACDEOS file : $MY_TREMBL_ACDEOS_FILE\n";
        print L "Species names file : $MY_SPECIES_NAMES_FILE\n";
    }
    print L "Input directory : $input_dir\n";
    if ( $use_precalc_pwd_and_aln == 1 ) {
        print L "Input directory aln : $input_dir_aln\n";
    }
    print L "Output directory : $output_dir\n";
    # The output of "date" already ends with a newline.
    print L "Start date : ".`date`;

} ## startLogfile



# Writes the summary of this run (number of trees, lists of the ignored
# alignments) to the filehandle L opened by startLogfile and closes it.
# Reads the globals $i, $too_large, $too_small, $already_present and
# the corresponding name arrays.
# Last modified: 05/17/01
sub finishLogfile {
    my $j = 0;
    print L "\n\n";
    print L "Successfully calculated $i trees.\n";
    if ( $use_precalc_pwd != 1 ) {
        print L "Too large alignments (>$max_seqs): $too_large\n";
        print L "Too small alignments (<$min_seqs): $too_small\n";
    }
    print L "Alignments for which a tree appears to already exist: $already_present\n";
    print L "Finish date : ".`date`."\n\n";
    if ( $use_precalc_pwd != 1 ) {
        print L "List of the $too_large alignments which were ignored because they\n";
        print L "contained too many sequences (>$max_seqs) [after pruning]:\n";
        for ( $j = 0; $j < $too_large; ++$j ) {
            print L "$too_large_names[ $j ]\n";
        }
        print L "\n\n";
        print L "List of the $too_small alignments which were ignored because they\n";
        print L "contained not enough sequences (<$min_seqs) [after pruning]:\n";
        for ( $j = 0; $j < $too_small; ++$j ) {
            print L "$too_small_names[ $j ]\n";
        }
    }
    print L "\n\n";
    print L "List of the $already_present alignments which were ignored because\n";
    print L "a tree appears to already exist:\n";
    for ( $j = 0; $j < $already_present; ++$j ) {
        print L "$already_present_names[ $j ]\n";
    }
    print L "\n";
    close( L );
} ## finishLogfile


# Prints the usage information to STDOUT and terminates the program
# with exit status -1.
# NOTE(review): the three "xt.pl" lines under "Usage" (and the otherwise
# empty lines following them) look like they lost their "<...>" argument
# placeholders - TODO restore them from the upstream source.
sub printUsage {
    print "\n";
    print " xt.pl\n";
    print " _____\n";
    print " \n";
    print " Copyright (C) 2003 Christian M. Zmasek\n";
    print " All rights reserved\n";
    print "\n";
    print " Author: Christian M. Zmasek\n";
    print " zmasek\@genetics.wustl.edu\n";
    print " http://www.genetics.wustl.edu/eddy/forester/\n";
    print "\n";
    print "\n";
    print " Purpose\n";
    print " -------\n";
    print "\n";
    print " Tree construction using makeTree.pl based on directories\n";
    print " of Pfam alignments or precalculated pairwise distances.\n";
    print "\n";
    print "\n";
    print " Usage\n";
    print " -----\n";
    print "\n";
    print " Input is Pfam aligments:\n";
    print " xt.pl \n";
    print " \n";
    print "\n";
    print " Input is precalculated pairwise distancs:\n";
    print " xt.pl \n";
    print "\n";
    print " Input is precalculated pairwise distancs and corresponding alignment files:\n";
    print " xt.pl \n";
    print " \n";
    print "\n";
    print "\n";
    print " Examples\n";
    print " --------\n";
    print "\n";
    print " \"xt.pl S NS21UTRB100DX 4 200 DB/PFAM/Full/ trees/\"\n";
    print "\n";
    print " \"xt.pl FLB100R /pfam2pwd_out/ trees/\"\n";
    print "\n";
    print " \"xt.pl FULB100R /pfam2pwd_out/ /pfam2pwd_out/ trees/\"\n";
    print "\n";
    print "\n";
    print " Options\n";
    print " -------\n";
    print "\n";
    print " I: ignore species information (use all sequences)\n";
    print " S: add species codes to TrEMBL sequences and ignore sequences\n";
    print " from species not in $MY_SPECIES_NAMES_FILE,\n";
    print " species codes are extracted from $MY_TREMBL_ACDEOS_FILE\n";
    print "\n";
    print "\n";
    print " Options for makeTree\n";
    print " --------------------\n";
    print "\n";
    print " N : Suggestion to remove columns in the alignment which contain gaps.\n";
    print " Gaps are not removed, if, after removal of gaps, the resulting\n";
    print " alignment would be shorter than $MIN_NUMBER_OF_AA aa (\$MIN_NUMBER_OF_AA).\n";
    print " Default is not to remove gaps.\n";
    print " Bx : Number of bootstrapps. B0: do not bootstrap. Default: 100 bootstrapps.\n";
    print " The number of bootstrapps should be divisible by 10.\n";
    print " U : Use TREE-PUZZLE to calculate ML branchlengths for consesus tree, in case of\n";
    print " bootstrapped analysis.\n";
    print " J : Use JTT matrix (Jones et al. 1992) in TREE-PUZZLE, default: PAM.\n";
    print " L : Use BLOSUM 62 matrix (Henikoff-Henikoff 92) in TREE-PUZZLE, default: PAM.\n";
    print " M : Use mtREV24 matrix (Adachi-Hasegawa 1996) in TREE-PUZZLE, default: PAM.\n";
    print " W : Use WAG matrix (Whelan-Goldman 2000) in TREE-PUZZLE, default: PAM.\n";
    print " T : Use VT matrix (Mueller-Vingron 2000) in TREE-PUZZLE, default: PAM.\n";
    print " P : Let TREE-PUZZLE choose which matrix to use, default: PAM.\n";
    print " R : Randomize input order in PHYLIP NEIGHBOR.\n";
    print " Sx : Seed for random number generator(s). Must be 4n+1. Default is 9.\n";
    print " X : To keep multiple tree file (=trees from bootstrap resampled alignments).\n";
    print " D : To keep (and create, in case of bootstrap analysis) pairwise distance\n";
    print " matrix file. This is created form the not resampled aligment.\n";
    print " C : Calculate pairwise distances only (no tree). Bootstrap is always 1.\n";
    print " No other files are generated.\n";
    print " F : Pairwise distance (pwd) file as input (instead of alignment).\n";
    print " No -D, -C, and -N options available in this case.\n";
    print " V : Verbose\n";
    print "\n";
    exit( -1 );

} ## printUsage