+++ /dev/null
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <stdarg.h>
-
-#include "io_lib_header.h"
-#include "util_lib_header.h"
-#include "dp_lib_header.h"
-#include "define_header.h"
-
-static int first_seq, last_seq;
-
-static int nseqs;
-static char **names; /* the seq. names */
-
-static double **tmat;
-static double *av;
-static double *left_branch, *right_branch;
-static int *tkill;
-
-
-/*!
- * \file util_make_tree.c
- * \brief Source code tree algorithms
- */
-
-NT_node ** seq2cw_tree ( Sequence *S, char *tree)
-{
- Alignment *A;
- char *aln,command[1000];
- int tot_node;
-
-
-
- A=seq2clustalw_aln (S);
- aln=vtmpnam (NULL); if (!tree)tree=vtmpnam (NULL);
-
- output_fasta_aln (aln, A);
-
- sprintf ( command, "clustalw -infile=%s -newtree=%s -tree %s", aln, tree, TO_NULL_DEVICE);
- my_system (command);
- return read_tree (tree, &tot_node, S->nseq, S->name);
-}
-
-NT_node ** make_upgma_tree ( Alignment *A,int **distances,int gop, int gep, char **out_seq, char **out_seq_name, int out_nseq, char *tree_file, char *tree_mode)
- {
-
- if ( distances==NULL && A==NULL)
- {
- fprintf ( stderr, "\nError: make_nj_tree, must provide an alignment or a distance matrix [FATAL:%s]", PROGRAM);
- myexit (EXIT_FAILURE);
- }
- else if ( distances==NULL)
- {
-
- distances=get_dist_aln_array (A, "idmat");
- }
- return int_dist2upgma_tree (distances,A, A->nseq,tree_file);
- }
-NT_node ** make_nj_tree ( Alignment *A,int **distances,int gop, int gep, char **out_seq, char **out_seq_name, int out_nseq, char *tree_file, char *tree_mode)
- {
-
- if ( distances==NULL && A==NULL)
- {
- fprintf ( stderr, "\nError: make_nj_tree, must provide an alignment or a distance matrix [FATAL:%s]", PROGRAM);
- myexit (EXIT_FAILURE);
- }
- else if ( distances==NULL)
- {
- distances=get_dist_aln_array (A, "idmat");
- }
- return int_dist2nj_tree (distances,out_seq_name, out_nseq,tree_file);
- }
-
-NT_node ** int_dist2nj_tree (int **distances, char **out_seq_name, int out_nseq, char *tree_file)
- {
- int a, b;
- double **d;
- NT_node **T;
-
- d=declare_double( out_nseq, out_nseq);
- for ( a=0; a< out_nseq; a++)
- for ( b=0; b< out_nseq; b++)
- d[a][b]=distances[a][b];
-
- T=dist2nj_tree ( d, out_seq_name, out_nseq, tree_file);
-
- free_double (d, -1);
- return T;
- }
-NT_node ** float_dist2nj_tree (float **distances, char **out_seq_name, int out_nseq, char *tree_file)
- {
- int a, b;
- double **d;
- NT_node **T;
-
- d=declare_double( out_nseq, out_nseq);
- for ( a=0; a< out_nseq; a++)
- for ( b=0; b< out_nseq; b++)
- d[a][b]=distances[a][b];
- T=dist2nj_tree ( d, out_seq_name, out_nseq, tree_file);
- free_double (d, -1);
- return T;
- }
-NT_node ** dist2nj_tree (double **distances, char **out_seq_name, int out_nseq, char *tree_file)
- {
- int a, b;
- double **d_dist;
- int tot_node=0;
- NT_node **T;
-
- if ( !tree_file)tree_file=vtmpnam(NULL);
- d_dist=declare_double( out_nseq+1, out_nseq+1);
-
- for ( a=0; a<out_nseq; a++)
- for ( b=0; b< out_nseq; b++)
- {
- if ( a!=b)
- d_dist[a+1][b+1]=distances[a][b]/MAXID;
- else d_dist[a+1][b+1]=0;
- }
-
- guide_tree ( tree_file, d_dist, out_seq_name, out_nseq);
- free_double (d_dist,-1);
- T=read_tree (tree_file,&tot_node,out_nseq, out_seq_name);
-
- return T;
- }
-
-
-void guide_tree(char *fname, double **saga_tmat, char **saga_seq_name, int saga_nseq)
-/*
- Routine for producing unrooted NJ trees from seperately aligned
- pairwise distances. This produces the GUIDE DENDROGRAMS in
- PHYLIP format.
-*/
-{
-
- int i;
- FILE *fp;
- char **standard_tree;
-
-
-
- nseqs=saga_nseq;
- first_seq=1;
- last_seq=nseqs;
-
- names=saga_seq_name;
- tmat=saga_tmat;
-
- standard_tree = (char **) vmalloc( (nseqs+1) * sizeof (char *) );
- for(i=0; i<nseqs+1; i++)
- standard_tree[i] = (char *) vmalloc( (nseqs+1) * sizeof(char));
-
-
- nj_tree(standard_tree, nseqs);
-
- fp=fopen ( fname, "w");
- print_phylip_tree(standard_tree,fp,FALSE);
-
- vfree(left_branch);
- vfree(right_branch);
- vfree(tkill);
- vfree(av);
-
- for (i=0;i<nseqs+1;i++)
- vfree(standard_tree[i]);
-
-
- fclose(fp);
-
-
-}
-
-
-void nj_tree(char **tree_description, int nseq)
-{
- if ( nseq<100)
- slow_nj_tree (tree_description);
- else
- fast_nj_tree (tree_description);
-
-}
-
-
-
-void slow_nj_tree(char **tree_description)
-{
- int i;
- int l[4],nude,k;
- int nc,mini,minj,j,ii,jj;
- double fnseqs,fnseqs2=0,sumd;
- double diq,djq,dij,d2r,dr,dio,djo,da;
- double tmin,total,dmin;
- double bi,bj,b1,b2,b3,branch[4];
- int typei,typej; /* 0 = node; 1 = OTU */
-
- fnseqs = (double)nseqs;
-
-/*********************** First initialisation ***************************/
-
-
-
- mini = minj = 0;
-
- left_branch = (double *) vcalloc( (nseqs+2),sizeof (double) );
- right_branch = (double *) vcalloc( (nseqs+2),sizeof (double) );
- tkill = (int *) vcalloc( (nseqs+1),sizeof (int) );
- av = (double *) vcalloc( (nseqs+1),sizeof (double) );
-
- for(i=1;i<=nseqs;++i)
- {
- tmat[i][i] = av[i] = 0.0;
- tkill[i] = 0;
- }
-
-/*********************** Enter The Main Cycle ***************************/
-
- /**start main cycle**/
- for(nc=1; nc<=(nseqs-3); ++nc)
- {
-
- sumd = 0.0;
- for(j=2; j<=nseqs; ++j)
- for(i=1; i<j; ++i)
- {
- tmat[j][i] = tmat[i][j];
- sumd = sumd + tmat[i][j];
- }
- tmin = 99999.0;
-
-/*.................compute SMATij values and find the smallest one ........*/
-
- for(jj=2; jj<=nseqs; ++jj)
- if(tkill[jj] != 1)
- for(ii=1; ii<jj; ++ii)
- if(tkill[ii] != 1)
- {
- diq = djq = 0.0;
-
- for(i=1; i<=nseqs; ++i)
- {
- diq = diq + tmat[i][ii];
- djq = djq + tmat[i][jj];
- }
- dij = tmat[ii][jj];
- d2r = diq + djq - (2.0*dij);
- dr = sumd - dij -d2r;
- fnseqs2 = fnseqs - 2.0;
- total= d2r+ fnseqs2*dij +dr*2.0;
- total= total / (2.0*fnseqs2);
-
- /* commented out to have consistent results with CYGWIN: if(total < tmin)"*/
- if(total < tmin)
- {
- if ( tmin-total>MY_EPS)
- tmin = total;
- mini = ii;
- minj = jj;
- }
- }
-
-
-/*.................compute branch lengths and print the results ........*/
-
-
- dio = djo = 0.0;
- for(i=1; i<=nseqs; ++i) {
- dio = dio + tmat[i][mini];
- djo = djo + tmat[i][minj];
- }
-
- dmin = tmat[mini][minj];
- dio = (dio - dmin) / fnseqs2;
- djo = (djo - dmin) / fnseqs2;
- bi = (dmin + dio - djo) * 0.5;
- bj = dmin - bi;
- bi = bi - av[mini];
- bj = bj - av[minj];
-
- if( av[mini] > 0.0 )
- typei = 0;
- else
- typei = 1;
- if( av[minj] > 0.0 )
- typej = 0;
- else
- typej = 1;
-
-
-
-/*
- set negative branch lengths to zero. Also set any tiny positive
- branch lengths to zero.
-*/
- if( fabs(bi) < 0.0001) bi = 0.0;
- if( fabs(bj) < 0.0001) bj = 0.0;
-
-
-
-
- left_branch[nc] = bi;
- right_branch[nc] = bj;
-
- for(i=1; i<=nseqs; i++)
- tree_description[nc][i] = 0;
-
- if(typei == 0) {
- for(i=nc-1; i>=1; i--)
- if(tree_description[i][mini] == 1) {
- for(j=1; j<=nseqs; j++)
- if(tree_description[i][j] == 1)
- tree_description[nc][j] = 1;
- break;
- }
- }
- else
- tree_description[nc][mini] = 1;
-
- if(typej == 0) {
- for(i=nc-1; i>=1; i--)
- if(tree_description[i][minj] == 1) {
- for(j=1; j<=nseqs; j++)
- if(tree_description[i][j] == 1)
- tree_description[nc][j] = 1;
- break;
- }
- }
- else
- tree_description[nc][minj] = 1;
-
-
-/*
- Here is where the -0.00005 branch lengths come from for 3 or more
- identical seqs.
-*/
-/* if(dmin <= 0.0) dmin = 0.0001; */
- if(dmin <= 0.0) dmin = 0.000001;
- av[mini] = dmin * 0.5;
-
- /*........................Re-initialisation................................*/
-
- fnseqs = fnseqs - 1.0;
- tkill[minj] = 1;
-
- for(j=1; j<=nseqs; ++j)
- if( tkill[j] != 1 ) {
- da = ( tmat[mini][j] + tmat[minj][j] ) * 0.5;
- if( (mini - j) < 0 )
- tmat[mini][j] = da;
- if( (mini - j) > 0)
- tmat[j][mini] = da;
- }
-
- for(j=1; j<=nseqs; ++j)
- tmat[minj][j] = tmat[j][minj] = 0.0;
-
-
-
- }
- /*end main cycle**/
-
-/******************************Last Cycle (3 Seqs. left)********************/
-
- nude = 1;
-
-
- for(i=1; i<=nseqs; ++i)
- if( tkill[i] != 1 ) {
- l[nude] = i;
- nude = nude + 1;
- }
-
- b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5;
- b2 = tmat[l[1]][l[2]] - b1;
- b3 = tmat[l[1]][l[3]] - b1;
-
- branch[1] = b1 - av[l[1]];
- branch[2] = b2 - av[l[2]];
- branch[3] = b3 - av[l[3]];
-
-/* Reset tiny negative and positive branch lengths to zero */
- if( fabs(branch[1]) < 0.0001) branch[1] = 0.0;
- if( fabs(branch[2]) < 0.0001) branch[2] = 0.0;
- if( fabs(branch[3]) < 0.0001) branch[3] = 0.0;
-
- left_branch[nseqs-2] = branch[1];
- left_branch[nseqs-1] = branch[2];
- left_branch[nseqs] = branch[3];
-
- for(i=1; i<=nseqs; i++)
- tree_description[nseqs-2][i] = 0;
-
-
-
- for(i=1; i<=3; ++i) {
- if( av[l[i]] > 0.0) {
-
- for(k=nseqs-3; k>=1; k--)
- if(tree_description[k][l[i]] == 1) {
- for(j=1; j<=nseqs; j++)
- if(tree_description[k][j] == 1)
- tree_description[nseqs-2][j] = i;
- break;
- }
- }
- else {
-
- tree_description[nseqs-2][l[i]] = i;
- }
- if(i < 3) {
- }
- }
-}
-
-void print_phylip_tree(char **tree_description, FILE *tree, int bootstrap)
-{
-
- fprintf(tree,"(\n");
-
- two_way_split(tree_description, tree, nseqs-2,1,bootstrap);
- fprintf(tree,":%7.5f,\n",left_branch[nseqs-2]);
- two_way_split(tree_description, tree, nseqs-2,2,bootstrap);
- fprintf(tree,":%7.5f,\n",left_branch[nseqs-1]);
- two_way_split(tree_description, tree, nseqs-2,3,bootstrap);
-
- fprintf(tree,":%7.5f)",left_branch[nseqs]);
- if (bootstrap) fprintf(tree,"TRICHOTOMY");
- fprintf(tree,";\n");
-}
-
-
-void two_way_split
-(char **tree_description, FILE *tree, int start_row, int flag, int bootstrap)
-{
- int row, new_row, col, test_col=0;
- int single_seq;
-
- if(start_row != nseqs-2) fprintf(tree,"(\n");
-
- for(col=1; col<=nseqs; col++) {
- if(tree_description[start_row][col] == flag) {
- test_col = col;
- break;
- }
- }
-
- single_seq = TRUE;
- for(row=start_row-1; row>=1; row--)
- if(tree_description[row][test_col] == 1) {
- single_seq = FALSE;
- new_row = row;
- break;
- }
-
- if(single_seq) {
- tree_description[start_row][test_col] = 0;
- fprintf(tree,"%s",names[test_col+0-1]);
- }
- else {
- for(col=1; col<=nseqs; col++) {
- if((tree_description[start_row][col]==1)&&
- (tree_description[new_row][col]==1))
- tree_description[start_row][col] = 0;
- }
- two_way_split(tree_description, tree, new_row, (int)1, bootstrap);
- }
-
- if(start_row == nseqs-2) {
-/* if (bootstrap && (flag==1)) fprintf(tree,"[TRICHOTOMY]");
-*/
- return;
- }
-
- fprintf(tree,":%7.5f,\n",left_branch[start_row]);
-
- for(col=1; col<=nseqs; col++)
- if(tree_description[start_row][col] == flag) {
- test_col = col;
- break;
- }
-
- single_seq = TRUE;
- for(row=start_row-1; row>=1; row--)
- if(tree_description[row][test_col] == 1) {
- single_seq = FALSE;
- new_row = row;
- break;
- }
-
- if(single_seq) {
- tree_description[start_row][test_col] = 0;
- fprintf(tree,"%s",names[test_col+0-1]);
- }
- else {
- for(col=1; col<=nseqs; col++) {
- if((tree_description[start_row][col]==1)&&
- (tree_description[new_row][col]==1))
- tree_description[start_row][col] = 0;
- }
- two_way_split(tree_description, tree, new_row, (int)1, bootstrap);
- }
-
- fprintf(tree,":%7.5f)\n",right_branch[start_row]);
-
-
-}
-
-
-
-
-/****************************************************************************
- * [ Improvement ideas in fast_nj_tree() ] by DDBJ & FUJITSU Limited.
- * written by Tadashi Koike
- * (takoike@genes.nig.ac.jp)
- *******************
- * <IMPROVEMENT 1> : Store the value of sum of the score to temporary array,
- * and use again and again.
- *
- * In the main cycle, these are calculated again and again :
- * diq = sum of tmat[n][ii] (n:1 to last_seq-first_seq+1),
- * djq = sum of tmat[n][jj] (n:1 to last_seq-first_seq+1),
- * dio = sum of tmat[n][mini] (n:1 to last_seq-first_seq+1),
- * djq = sum of tmat[n][minj] (n:1 to last_seq-first_seq+1)
- * // 'last_seq' and 'first_seq' are both constant values //
- * and the result of above calculations is always same until
- * a best pair of neighbour nodes is joined.
- *
- * So, we change the logic to calculate the sum[i] (=sum of tmat[n][i]
- * (n:1 to last_seq-first_seq+1)) and store it to array, before
- * beginning to find a best pair of neighbour nodes, and after that
- * we use them again and again.
- *
- * tmat[i][j]
- * 1 2 3 4 5
- * +---+---+---+---+---+
- * 1 | | | | | |
- * +---+---+---+---+---+
- * 2 | | | | | | 1) calculate sum of tmat[n][i]
- * +---+---+---+---+---+ (n: 1 to last_seq-first_seq+1)
- * 3 | | | | | | 2) store that sum value to sum[i]
- * +---+---+---+---+---+
- * 4 | | | | | | 3) use sum[i] during finding a best
- * +---+---+---+---+---+ pair of neibour nodes.
- * 5 | | | | | |
- * +---+---+---+---+---+
- * | | | | |
- * V V V V V Calculate sum , and store it to sum[i]
- * +---+---+---+---+---+
- * sum[i] | | | | | |
- * +---+---+---+---+---+
- *
- * At this time, we thought that we use upper triangle of the matrix
- * because tmat[i][j] is equal to tmat[j][i] and tmat[i][i] is equal
- * to zero. Therefore, we prepared sum_rows[i] and sum_cols[i] instead
- * of sum[i] for storing the sum value.
- *
- * tmat[i][j]
- * 1 2 3 4 5 sum_cols[i]
- * +---+---+---+---+---+ +---+
- * 1 | # | # | # | # | --> | | ... sum of tmat[1][2..5]
- * + - +---+---+---+---+ +---+
- * 2 | # | # | # | --> | | ... sum of tmat[2][3..5]
- * + - + - +---+---+---+ +---+
- * 3 | # | # | --> | | ... sum of tmat[3][4..5]
- * + - + - + - +---+---+ +---+
- * 4 | # | --> | | ... sum of tmat[4][5]
- * + - + - + - + - +---+ +---+
- * 5 | --> | | ... zero
- * + - + - + - + - + - + +---+
- * | | | | |
- * V V V V V Calculate sum , sotre to sum[i]
- * +---+---+---+---+---+
- * sum_rows[i] | | | | | |
- * +---+---+---+---+---+
- * | | | | |
- * | | | | +----- sum of tmat[1..4][5]
- * | | | +--------- sum of tmat[1..3][4]
- * | | +------------- sum of tmat[1..2][3]
- * | +----------------- sum of tmat[1][2]
- * +--------------------- zero
- *
- * And we use (sum_rows[i] + sum_cols[i]) instead of sum[i].
- *
- *******************
- * <IMPROVEMENT 2> : We manage valid nodes with chain list, instead of
- * tkill[i] flag array.
- *
- * In original logic, invalid(killed?) nodes after nodes-joining
- * are managed with tkill[i] flag array (set to 1 when killed).
- * By this method, it is conspicuous to try next node but skip it
- * at the latter of finding a best pair of neighbor nodes.
- *
- * So, we thought that we managed valid nodes by using a chain list
- * as below:
- *
- * 1) declare the list structure.
- * struct {
- * int n; // entry number of node.
- * void *prev; // pointer to previous entry.
- * void *next; // pointer to next entry.
- * }
- * 2) construct a valid node list.
- *
- * +-----+ +-----+ +-----+ +-----+ +-----+
- * NULL<-|prev |<---|prev |<---|prev |<---|prev |<- - - -|prev |
- * | 0 | | 1 | | 2 | | 3 | | n |
- * | next|--->| next|--->| next|--->| next|- - - ->| next|->NULL
- * +-----+ +-----+ +-----+ +-----+ +-----+
- *
- * 3) when finding a best pair of neighbor nodes, we use
- * this chain list as loop counter.
- *
- * 4) If an entry was killed by node-joining, this chain list is
- * modified to remove that entry.
- *
- * EX) remove the entry No 2.
- * +-----+ +-----+ +-----+ +-----+
- * NULL<-|prev |<---|prev |<--------------|prev |<- - - -|prev |
- * | 0 | | 1 | | 3 | | n |
- * | next|--->| next|-------------->| next|- - - ->| next|->NULL
- * +-----+ +-----+ +-----+ +-----+
- * +-----+
- * NULL<-|prev |
- * | 2 |
- * | next|->NULL
- * +-----+
- *
- * By this method, speed is up at the latter of finding a best pair of
- * neighbor nodes.
- *
- *******************
- * <IMPROVEMENT 3> : Cut the frequency of division.
- *
- * At comparison between 'total' and 'tmin' in the main cycle, total is
- * divided by (2.0*fnseqs2) before comparison. If N nodes are available,
- * that division happen (N*(N-1))/2 order.
- *
- * We thought that the comparison relation between tmin and total/(2.0*fnseqs2)
- * is equal to the comparison relation between (tmin*2.0*fnseqs2) and total.
- * Calculation of (tmin*2.0*fnseqs2) is only one time. so we stop dividing
- * a total value and multiply tmin and (tmin*2.0*fnseqs2) instead.
- *
- *******************
- * <IMPROVEMENT 4> : some transformation of the equation (to cut operations).
- *
- * We transform an equation of calculating 'total' in the main cycle.
- *
- */
-
-
-void fast_nj_tree(char **tree_description)
-{
- register int i;
- int l[4],nude,k;
- int nc,mini,minj,j,ii,jj;
- double fnseqs,fnseqs2=0,sumd;
- double diq,djq,dij,dio,djo,da;
- double tmin,total,dmin;
- double bi,bj,b1,b2,b3,branch[4];
- int typei,typej; /* 0 = node; 1 = OTU */
-
- /* IMPROVEMENT 1, STEP 0 : declare variables */
- double *sum_cols, *sum_rows, *join;
-
- /* IMPROVEMENT 2, STEP 0 : declare variables */
- int loop_limit;
- typedef struct _ValidNodeID {
- int n;
- struct _ValidNodeID *prev;
- struct _ValidNodeID *next;
- } ValidNodeID;
- ValidNodeID *tvalid, *lpi, *lpj, *lpii, *lpjj, *lp_prev, *lp_next;
-
- /*
- * correspondence of the loop counter variables.
- * i .. lpi->n, ii .. lpii->n
- * j .. lpj->n, jj .. lpjj->n
- */
-
- fnseqs = (double)last_seq-first_seq+1;
-
-/*********************** First initialisation ***************************/
-
-
- if (fnseqs == 2) {
- return;
- }
-
- mini = minj = 0;
-
- left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
- right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
- tkill = (int *) ckalloc( (nseqs+1) * sizeof (int) );
- av = (double *) ckalloc( (nseqs+1) * sizeof (double) );
-
- /* IMPROVEMENT 1, STEP 1 : Allocate memory */
- sum_cols = (double *) ckalloc( (nseqs+1) * sizeof (double) );
- sum_rows = (double *) ckalloc( (nseqs+1) * sizeof (double) );
- join = (double *) ckalloc( (nseqs+1) * sizeof (double) );
-
- /* IMPROVEMENT 2, STEP 1 : Allocate memory */
- tvalid = (ValidNodeID *) ckalloc( (nseqs+1) * sizeof (ValidNodeID) );
- /* tvalid[0] is special entry in array. it points a header of valid entry list */
- tvalid[0].n = 0;
- tvalid[0].prev = NULL;
- tvalid[0].next = &tvalid[1];
-
- /* IMPROVEMENT 2, STEP 2 : Construct and initialize the entry chain list */
- for(i=1, loop_limit = last_seq-first_seq+1,
- lpi=&tvalid[1], lp_prev=&tvalid[0], lp_next=&tvalid[2] ;
- i<=loop_limit ;
- ++i, ++lpi, ++lp_prev, ++lp_next)
- {
- tmat[i][i] = av[i] = 0.0;
- tkill[i] = 0;
- lpi->n = i;
- lpi->prev = lp_prev;
- lpi->next = lp_next;
-
- /* IMPROVEMENT 1, STEP 2 : Initialize arrays */
- sum_cols[i] = sum_rows[i] = join[i] = 0.0;
- }
- tvalid[loop_limit].next = NULL;
-
- /*
- * IMPROVEMENT 1, STEP 3 : Calculate the sum of score value that
- * is sequence[i] to others.
- */
- sumd = 0.0;
- for (lpj=tvalid[0].next ; lpj!=NULL ; lpj = lpj->next) {
- double tmp_sum = 0.0;
- j = lpj->n;
- /* calculate sum_rows[j] */
- for (lpi=tvalid[0].next ; lpi->n < j ; lpi = lpi->next) {
- i = lpi->n;
- tmp_sum += tmat[i][j];
- /* tmat[j][i] = tmat[i][j]; */
- }
- sum_rows[j] = tmp_sum;
-
- tmp_sum = 0.0;
- /* Set lpi to that lpi->n is greater than j */
- if ((lpi != NULL) && (lpi->n == j)) {
- lpi = lpi->next;
- }
- /* calculate sum_cols[j] */
- for( ; lpi!=NULL ; lpi = lpi->next) {
- i = lpi->n;
- tmp_sum += tmat[j][i];
- /* tmat[i][j] = tmat[j][i]; */
- }
- sum_cols[j] = tmp_sum;
- }
-
-/*********************** Enter The Main Cycle ***************************/
-
- for(nc=1, loop_limit = (last_seq-first_seq+1-3); nc<=loop_limit; ++nc) {
-
- sumd = 0.0;
- /* IMPROVEMENT 1, STEP 4 : use sum value */
- for(lpj=tvalid[0].next ; lpj!=NULL ; lpj = lpj->next) {
- sumd += sum_cols[lpj->n];
- }
-
- /* IMPROVEMENT 3, STEP 0 : multiply tmin and 2*fnseqs2 */
- fnseqs2 = fnseqs - 2.0; /* Set fnseqs2 at this point. */
- tmin = 99999.0 * 2.0 * fnseqs2;
-
-
-/*.................compute SMATij values and find the smallest one ........*/
-
- mini = minj = 0;
-
- /* jj must starts at least 2 */
- if ((tvalid[0].next != NULL) && (tvalid[0].next->n == 1)) {
- lpjj = tvalid[0].next->next;
- } else {
- lpjj = tvalid[0].next;
- }
-
- for( ; lpjj != NULL; lpjj = lpjj->next) {
- jj = lpjj->n;
- for(lpii=tvalid[0].next ; lpii->n < jj ; lpii = lpii->next) {
- ii = lpii->n;
- diq = djq = 0.0;
-
- /* IMPROVEMENT 1, STEP 4 : use sum value */
- diq = sum_cols[ii] + sum_rows[ii];
- djq = sum_cols[jj] + sum_rows[jj];
- /*
- * always ii < jj in this point. Use upper
- * triangle of score matrix.
- */
- dij = tmat[ii][jj];
-
- /*
- * IMPROVEMENT 3, STEP 1 : fnseqs2 is
- * already calculated.
- */
- /* fnseqs2 = fnseqs - 2.0 */
-
- /* IMPROVEMENT 4 : transform the equation */
- /*-------------------------------------------------------------------*
- * OPTIMIZE of expression 'total = d2r + fnseqs2*dij + dr*2.0' *
- * total = d2r + fnseq2*dij + 2.0*dr *
- * = d2r + fnseq2*dij + 2(sumd - dij - d2r) *
- * = d2r + fnseq2*dij + 2*sumd - 2*dij - 2*d2r *
- * = fnseq2*dij + 2*sumd - 2*dij - 2*d2r + d2r *
- * = fnseq2*dij + 2*sumd - 2*dij - d2r *
- * = fnseq2*dij + 2*sumd - 2*dij - (diq + djq - 2*dij) *
- * = fnseq2*dij + 2*sumd - 2*dij - diq - djq + 2*dij *
- * = fnseq2*dij + 2*sumd - 2*dij + 2*dij - diq - djq *
- * = fnseq2*dij + 2*sumd - diq - djq *
- *-------------------------------------------------------------------*/
- total = fnseqs2*dij + 2.0*sumd - diq - djq;
-
- /*
- * IMPROVEMENT 3, STEP 2 : abbrevlate
- * the division on comparison between
- * total and tmin.
- */
- /* total = total / (2.0*fnseqs2); */
-
- if(total < tmin) {
- tmin = total;
- mini = ii;
- minj = jj;
- }
- }
- }
-
- /* MEMO: always ii < jj in avobe loop, so mini < minj */
-
-/*.................compute branch lengths and print the results ........*/
-
-
- dio = djo = 0.0;
-
- /* IMPROVEMENT 1, STEP 4 : use sum value */
- dio = sum_cols[mini] + sum_rows[mini];
- djo = sum_cols[minj] + sum_rows[minj];
-
- dmin = tmat[mini][minj];
- dio = (dio - dmin) / fnseqs2;
- djo = (djo - dmin) / fnseqs2;
- bi = (dmin + dio - djo) * 0.5;
- bj = dmin - bi;
- bi = bi - av[mini];
- bj = bj - av[minj];
-
- if( av[mini] > 0.0 )
- typei = 0;
- else
- typei = 1;
- if( av[minj] > 0.0 )
- typej = 0;
- else
- typej = 1;
-
-
-/*
- set negative branch lengths to zero. Also set any tiny positive
- branch lengths to zero.
-*/ if( fabs(bi) < 0.0001) bi = 0.0;
- if( fabs(bj) < 0.0001) bj = 0.0;
-
-
- left_branch[nc] = bi;
- right_branch[nc] = bj;
-
- for(i=1; i<=last_seq-first_seq+1; i++)
- tree_description[nc][i] = 0;
-
- if(typei == 0) {
- for(i=nc-1; i>=1; i--)
- if(tree_description[i][mini] == 1) {
- for(j=1; j<=last_seq-first_seq+1; j++)
- if(tree_description[i][j] == 1)
- tree_description[nc][j] = 1;
- break;
- }
- }
- else
- tree_description[nc][mini] = 1;
-
- if(typej == 0) {
- for(i=nc-1; i>=1; i--)
- if(tree_description[i][minj] == 1) {
- for(j=1; j<=last_seq-first_seq+1; j++)
- if(tree_description[i][j] == 1)
- tree_description[nc][j] = 1;
- break;
- }
- }
- else
- tree_description[nc][minj] = 1;
-
-
-/*
- Here is where the -0.00005 branch lengths come from for 3 or more
- identical seqs.
-*/
-/* if(dmin <= 0.0) dmin = 0.0001; */
- if(dmin <= 0.0) dmin = 0.000001;
- av[mini] = dmin * 0.5;
-
-/*........................Re-initialisation................................*/
-
- fnseqs = fnseqs - 1.0;
- tkill[minj] = 1;
-
- /* IMPROVEMENT 2, STEP 3 : Remove tvalid[minj] from chain list. */
- /* [ Before ]
- * +---------+ +---------+ +---------+
- * |prev |<-------|prev |<-------|prev |<---
- * | n | | n(=minj)| | n |
- * | next|------->| next|------->| next|----
- * +---------+ +---------+ +---------+
- *
- * [ After ]
- * +---------+ +---------+
- * |prev |<--------------------------|prev |<---
- * | n | | n |
- * | next|-------------------------->| next|----
- * +---------+ +---------+
- * +---------+
- * NULL---|prev |
- * | n(=minj)|
- * | next|---NULL
- * +---------+
- */
- (tvalid[minj].prev)->next = tvalid[minj].next;
- if (tvalid[minj].next != NULL) {
- (tvalid[minj].next)->prev = tvalid[minj].prev;
- }
- tvalid[minj].prev = tvalid[minj].next = NULL;
-
- /* IMPROVEMENT 1, STEP 5 : re-calculate sum values. */
- for(lpj=tvalid[0].next ; lpj != NULL ; lpj = lpj->next) {
- double tmp_di = 0.0;
- double tmp_dj = 0.0;
- j = lpj->n;
-
- /*
- * subtrace a score value related with 'minj' from
- * sum arrays .
- */
- if (j < minj) {
- tmp_dj = tmat[j][minj];
- sum_cols[j] -= tmp_dj;
- } else if (j > minj) {
- tmp_dj = tmat[minj][j];
- sum_rows[j] -= tmp_dj;
- } /* nothing to do when j is equal to minj. */
-
-
- /*
- * subtrace a score value related with 'mini' from
- * sum arrays .
- */
- if (j < mini) {
- tmp_di = tmat[j][mini];
- sum_cols[j] -= tmp_di;
- } else if (j > mini) {
- tmp_di = tmat[mini][j];
- sum_rows[j] -= tmp_di;
- } /* nothing to do when j is equal to mini. */
-
- /*
- * calculate a score value of the new inner node.
- * then, store it temporary to join[] array.
- */
- join[j] = (tmp_dj + tmp_di) * 0.5;
- }
-
- /*
- * 1)
- * Set the score values (stored in join[]) into the matrix,
- * row/column position is 'mini'.
- * 2)
- * Add a score value of the new inner node to sum arrays.
- */
- for(lpj=tvalid[0].next ; lpj != NULL; lpj = lpj->next) {
- j = lpj->n;
- if (j < mini) {
- tmat[j][mini] = join[j];
- sum_cols[j] += join[j];
- } else if (j > mini) {
- tmat[mini][j] = join[j];
- sum_rows[j] += join[j];
- } /* nothing to do when j is equal to mini. */
- }
-
- /* Re-calculate sum_rows[mini],sum_cols[mini]. */
- sum_cols[mini] = sum_rows[mini] = 0.0;
-
- /* calculate sum_rows[mini] */
- da = 0.0;
- for(lpj=tvalid[0].next ; lpj->n < mini ; lpj = lpj->next) {
- da += join[lpj->n];
- }
- sum_rows[mini] = da;
-
- /* skip if 'lpj->n' is equal to 'mini' */
- if ((lpj != NULL) && (lpj->n == mini)) {
- lpj = lpj->next;
- }
-
- /* calculate sum_cols[mini] */
- da = 0.0;
- for( ; lpj != NULL; lpj = lpj->next) {
- da += join[lpj->n];
- }
- sum_cols[mini] = da;
-
- /*
- * Clean up sum_rows[minj], sum_cols[minj] and score matrix
- * related with 'minj'.
- */
- sum_cols[minj] = sum_rows[minj] = 0.0;
- for(j=1; j<=last_seq-first_seq+1; ++j)
- tmat[minj][j] = tmat[j][minj] = join[j] = 0.0;
-
-
-/****/ } /*end main cycle**/
-
-/******************************Last Cycle (3 Seqs. left)********************/
-
- nude = 1;
-
- for(lpi=tvalid[0].next; lpi != NULL; lpi = lpi->next) {
- l[nude] = lpi->n;
- ++nude;
- }
-
- b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5;
- b2 = tmat[l[1]][l[2]] - b1;
- b3 = tmat[l[1]][l[3]] - b1;
-
- branch[1] = b1 - av[l[1]];
- branch[2] = b2 - av[l[2]];
- branch[3] = b3 - av[l[3]];
-
-/* Reset tiny negative and positive branch lengths to zero */
- if( fabs(branch[1]) < 0.0001) branch[1] = 0.0;
- if( fabs(branch[2]) < 0.0001) branch[2] = 0.0;
- if( fabs(branch[3]) < 0.0001) branch[3] = 0.0;
-
- left_branch[last_seq-first_seq+1-2] = branch[1];
- left_branch[last_seq-first_seq+1-1] = branch[2];
- left_branch[last_seq-first_seq+1] = branch[3];
-
- for(i=1; i<=last_seq-first_seq+1; i++)
- tree_description[last_seq-first_seq+1-2][i] = 0;
-
-
- for(i=1; i<=3; ++i) {
- if( av[l[i]] > 0.0) {
-
- for(k=last_seq-first_seq+1-3; k>=1; k--)
- if(tree_description[k][l[i]] == 1) {
- for(j=1; j<=last_seq-first_seq+1; j++)
- if(tree_description[k][j] == 1)
- tree_description[last_seq-first_seq+1-2][j] = i;
- break;
- }
- }
- else {
- tree_description[last_seq-first_seq+1-2][l[i]] = i;
- }
- if(i < 3) {;
- }
- }
- ckfree(sum_cols);
- ckfree(sum_rows);
- ckfree(join);
- ckfree(tvalid);
-}
-//////////////////////////////////////////////////////////////////////////////
-//
-// UPGMA_aln
-//////////////////////////////////////////////////////////////////////////////
-
-Alignment * upgma_merge_aln_rows (Alignment *A, int *ns, int **ls,int N, int**mat,int *used, int *n, Constraint_list *CL);
-int upgma_pair_wise (Alignment *A, int *ls0, int ns0, int *ls2, int ns2, Constraint_list *CL);
-
-
-Alignment * upgma_tree_aln ( Alignment*A, int nseq, Constraint_list *CL)
-{
- int a, b,n, *used;
- static int **mat;
- int **ls;
- int *ns;
- nseq=(CL->S)->nseq;
- mat=declare_int (nseq, nseq);
- ls=declare_int (nseq,nseq);
- ns=vcalloc (nseq,sizeof (int));
-
- for (a=0; a<nseq-1; a++)
- for (b=a+1; b<nseq; b++)
- {
- ns[a]=ns[b]=1;
- ls[a][0]=a;
- ls[b][0]=b;
- mat[a][b]=mat[b][a]=upgma_pair_wise(A,ls[a],ns[a],ls[b],ns[b],CL);
- HERE ("%d %d", a, b, mat[a][b]);
- }
-
- used=vcalloc (nseq, sizeof (int));
- n=nseq;
- while (n>1)
- {
- upgma_merge_aln_rows (A,ns, ls,nseq, mat, used, &n,CL);
- }
- print_aln (A);
- HERE ("finished");
- free_int ( mat, -1);
- free_int (ls, -1);
- vfree (ns);
- return A;
-}
-
-Alignment * upgma_merge_aln_rows (Alignment *A, int *ns, int **ls,int N, int**mat,int *used, int *n, Constraint_list *CL)
-{
-
- int a, b, w, best_a, best_b, set;
- float best_s;
-
- for (set=0,a=0; a<N-1; a++)
- {
- if (used[a])continue;
- for (b=a+1; b<N; b++)
- {
- if (used[b])continue;
- w=mat[a][b];
- if ( !set || w>best_s)
- {
- best_s=w;
- best_a=a;
- best_b=b;
- set=1;
- }
- }
- }
- used[best_b]=1;
-
- //merge a and b
- mat[best_a][best_b]=upgma_pair_wise (A, ls[best_a], ns[best_a], ls[best_b], ns[best_b], CL);
- for (a=0; a<ns[best_b]; a++)ls[best_a][ns[best_a]++]=ls[best_b][a];
- ns[best_b]=0;
-
- //update the a row
- for (a=0; a< A->nseq; a++)
- {
- if (a!=best_a && !used[a])
- mat[best_a][a]=mat[a][best_a]=upgma_pair_wise (A, ls[best_a], ns[best_a], ls[a], ns[a], CL);
- }
-
- HERE ("DONE");
-
- n[0]--;
- return A;
-}
-int upgma_pair_wise (Alignment *A, int *ls0, int ns0, int *ls1, int ns1, Constraint_list *CL)
-{
- static int **ls;
- static int *ns;
- static int *fl;
- int a, b, n;
-
- if ( !ls )
- {
- ls=vcalloc (2, sizeof (int*));
- ns=vcalloc (2, sizeof (int));
- fl=vcalloc ((CL->S)->nseq, sizeof (int));
- }
- ls[0]=ls0;
- ls[1]=ls1;
- ns[0]=ns0; ns[1]=ns1;
-
- fprintf ( stderr, "\n");
- for (a=0; a<ns0; a++)
- fprintf ( stderr, "%d", ls0[a]);
- fprintf ( stderr, "\n");
- for (a=0; a<ns1; a++)
- fprintf ( stderr, "%d", ls1[a]);
-
- ungap_sub_aln (A, ns[0], ls[0]);
- ungap_sub_aln (A, ns[1], ls[1]);
- HERE ("Align");
- pair_wise (A, ns, ls, CL);
- HERE ("Done");
-
- for (n=0,a=0;a<2; a++)
- for (b=0; b<ns[a]; b++)
- fl[n++]=ls[a][b];
-
- return sub_aln2ecl_raw_score (A,CL,n, fl);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// UPGMA
-///////////////////////////////////////////////////////////////////////////////
-
-
-NT_node ** int_dist2upgma_tree (int **mat, Alignment *A, int nseq, char *fname)
-{
- NT_node *NL, T;
- int a, n, *used;
- int tot_node;
- if (upgma_node_heap (NULL))
- {
- printf_exit ( EXIT_FAILURE,stderr, "\nERROR: non empty heap in upgma [FATAL]");
- }
- NL=vcalloc (nseq, sizeof (NT_node));
-
- for (a=0; a<A->nseq; a++)
- {
- NL[a]=new_declare_tree_node ();
- upgma_node_heap (NL[a]);
- sprintf (NL[a]->name, "%s", A->name[a]);
- NL[a]->isseq=1;
- NL[a]->leaf=1;
- }
-
- used=vcalloc ( A->nseq, sizeof (int));
- n=A->nseq;
- while (n>1)
- {
- T=upgma_merge (mat, NL,used, &n, A->nseq);
- }
-
- vfree (used);
- vfclose (print_tree (T, "newick", vfopen (fname, "w")));
- upgma_node_heap (NULL);
- vfree (NL);
-
- return read_tree (fname,&tot_node,A->nseq, A->name);
-}
-NT_node upgma_merge (int **mat, NT_node *NL, int *used, int *n, int N)
-{
- NT_node P, LC, RC;
- int a, b, w, best_a, best_b, set;
- float best_s;
- P=new_declare_tree_node();
- upgma_node_heap (P);
-
- for (set=0,a=0; a<N-1; a++)
- {
- if (used[a])continue;
- for (b=a+1; b<N; b++)
- {
- if (used[b])continue;
- w=mat[a][b];
- if ( !set || w<best_s)
- {
- best_s=w;
- best_a=a;
- best_b=b;
- set=1;
- }
- }
- }
-
- for (a=0; a<N; a++)
- {
- if (!used[a])mat[best_a][a]=mat[a][best_a]=(mat[best_b][a]+mat[best_a][a])/2;
- }
- used[best_b]=1;
-
- LC=NL[best_a];
- RC=NL[best_b];
- best_s/=(float)100;
- LC->dist=RC->dist=best_s;
- LC->parent=RC->parent=P;
- P->left=LC;
- P->right=RC;
- NL[best_a]=P;
- n[0]--;
- return P;
-
-}
-
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// SPLIT UPGMA
-///////////////////////////////////////////////////////////////////////////////
-
-int upgma_node_heap (NT_node X)
-{
- static int n;
- static NT_node *h;
- if ( X==NULL)
- {
- int a,r;
- if (n==0) return 0;
- for (a=0; a<n; a++)
- {
- free_tree_node (h[a]);
- }
- vfree (h);
- h=NULL;
- r=n;
- n=0;
- return r;
- }
- else
- {
- h=vrealloc (h, sizeof (NT_node)*(n+1));
- h[n++]=X;
- }
- return n;
-}
-NT_node split2upgma_tree (Split **S, Alignment *A, int nseq, char *fname)
-{
- NT_node *NL, T;
- int a, n, *used;
-
- NL=vcalloc (nseq, sizeof (NT_node));
- for (a=0; a<A->nseq; a++)
- {
-
- NL[a]=new_declare_tree_node ();
- NL[a]->lseq2=vcalloc (A->nseq+1, sizeof (int));
- NL[a]->lseq2[a]=1;
- sprintf (NL[a]->name, "%s", A->name[a]);
- NL[a]->isseq=1;
- NL[a]->leaf=1;
- NL[a]->dist=1;
- upgma_node_heap (NL[a]);
- }
- used=vcalloc ( A->nseq, sizeof (int));
- n=A->nseq;
- while (n>1)
- {
- T=split_upgma_merge (A,S, NL,used, &n, A->nseq);
- }
- vfree (used);
- fprintf ( stdout, "\n");
- vfclose (print_tree (T, "newick", vfopen (fname, "w")));
- upgma_node_heap (NULL);
- return T;
-}
-NT_node split_upgma_merge (Alignment *A, Split **S, NT_node *NL, int *used, int *n, int N)
-{
- NT_node P, LC, RC;
- int a, b, w, best_a, best_b, set;
- float best_s;
- static int **mat;
-
- if (!mat)
- {
- mat=declare_int (N, N);
- for (a=0; a<N-1; a++)
- for ( b=a+1; b<N; b++)
- {
- mat[a][b]=get_split_dist (A, NL[a], NL[b], S);
- }
- }
-
- P=new_declare_tree_node();
- upgma_node_heap (P);
- P->lseq2=vcalloc (N, sizeof (int));
- for (set=0,a=0; a<N-1; a++)
- {
- if (used[a])continue;
- for (b=a+1; b<N; b++)
- {
- if (used[b])continue;
- w=mat[a][b];
- if ( !set || w<best_s)
- {
- best_s=w;
- best_a=a;
- best_b=b;
- set=1;
- }
- }
- }
-
-
-
-
- LC=NL[best_a];
- RC=NL[best_b];
- best_s/=100;
-
- P->dist=1-best_s;
- LC->parent=RC->parent=P;
- P->left=LC;
- P->right=RC;
- P->bootstrap=best_s*100;
- used[best_b]=1;
-
- n[0]--;
-
- for (a=0; a<A->nseq; a++)
- {
- P->lseq2[a]=(LC->lseq2[a] || RC->lseq2[a])?1:0;
- }
-
- for (a=0; a<N; a++)
- {
- if (!used[a])mat[best_a][a]=mat[a][best_a]=(int)get_split_dist(A,P, NL[a], S);
- }
- NL[best_a]=P;
-
- return P;
-
-}
-float get_split_dist ( Alignment *A, NT_node L, NT_node R, Split **S)
-{
- static char *split;
-
-
- int n,a;
-
- if (!split)
- {
- split=vcalloc ( A->nseq+1, sizeof (char));
- }
-
-
-
- for ( a=0; a<A->nseq; a++)
- split[a]=((L->lseq2[a] || R->lseq2[a])?1:0)+'0';
-
- n=0;
- while (S[n])
- {
- float score;
- if ( strm (S[n]->split,split))
- {
- return score=100-S[n]->score;
- }
- n++;
- }
- return 100;
-}
-/******************************COPYRIGHT NOTICE*******************************/
-/*© Centro de Regulacio Genomica */
-/*and */
-/*Cedric Notredame */
-/*Fri Feb 18 08:27:45 CET 2011 - Revision 596. */
-/*All rights reserved.*/
-/*This file is part of T-COFFEE.*/
-/**/
-/* T-COFFEE is free software; you can redistribute it and/or modify*/
-/* it under the terms of the GNU General Public License as published by*/
-/* the Free Software Foundation; either version 2 of the License, or*/
-/* (at your option) any later version.*/
-/**/
-/* T-COFFEE is distributed in the hope that it will be useful,*/
-/* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
-/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
-/* GNU General Public License for more details.*/
-/**/
-/* You should have received a copy of the GNU General Public License*/
-/* along with Foobar; if not, write to the Free Software*/
-/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
-/*............................................... |*/
-/* If you need some more information*/
-/* cedric.notredame@europe.com*/
-/*............................................... |*/
-/**/
-/**/
-/* */
-/******************************COPYRIGHT NOTICE*******************************/