6 #include "io_lib_header.h"
7 #include "util_lib_header.h"
8 #include "dp_lib_header.h"
9 #include "define_header.h"
11 static int first_seq, last_seq;
14 static char **names; /* the seq. names */
18 static double *left_branch, *right_branch;
23 * \file util_make_tree.c
24 * \brief Source code tree algorithms
27 NT_node ** seq2cw_tree ( Sequence *S, char *tree)
30 char *aln,command[1000];
35 A=seq2clustalw_aln (S);
36 aln=vtmpnam (NULL); if (!tree)tree=vtmpnam (NULL);
38 output_fasta_aln (aln, A);
40 sprintf ( command, "clustalw -infile=%s -newtree=%s -tree %s", aln, tree, TO_NULL_DEVICE);
42 return read_tree (tree, &tot_node, S->nseq, S->name);
45 NT_node ** make_upgma_tree ( Alignment *A,int **distances,int gop, int gep, char **out_seq, char **out_seq_name, int out_nseq, char *tree_file, char *tree_mode)
48 if ( distances==NULL && A==NULL)
50 fprintf ( stderr, "\nError: make_nj_tree, must provide an alignment or a distance matrix [FATAL:%s]", PROGRAM);
51 myexit (EXIT_FAILURE);
53 else if ( distances==NULL)
56 distances=get_dist_aln_array (A, "idmat");
58 return int_dist2upgma_tree (distances,A, A->nseq,tree_file);
60 NT_node ** make_nj_tree ( Alignment *A,int **distances,int gop, int gep, char **out_seq, char **out_seq_name, int out_nseq, char *tree_file, char *tree_mode)
63 if ( distances==NULL && A==NULL)
65 fprintf ( stderr, "\nError: make_nj_tree, must provide an alignment or a distance matrix [FATAL:%s]", PROGRAM);
66 myexit (EXIT_FAILURE);
68 else if ( distances==NULL)
70 distances=get_dist_aln_array (A, "idmat");
72 return int_dist2nj_tree (distances,out_seq_name, out_nseq,tree_file);
75 NT_node ** int_dist2nj_tree (int **distances, char **out_seq_name, int out_nseq, char *tree_file)
81 d=declare_double( out_nseq, out_nseq);
82 for ( a=0; a< out_nseq; a++)
83 for ( b=0; b< out_nseq; b++)
84 d[a][b]=distances[a][b];
86 T=dist2nj_tree ( d, out_seq_name, out_nseq, tree_file);
91 NT_node ** float_dist2nj_tree (float **distances, char **out_seq_name, int out_nseq, char *tree_file)
97 d=declare_double( out_nseq, out_nseq);
98 for ( a=0; a< out_nseq; a++)
99 for ( b=0; b< out_nseq; b++)
100 d[a][b]=distances[a][b];
101 T=dist2nj_tree ( d, out_seq_name, out_nseq, tree_file);
105 NT_node ** dist2nj_tree (double **distances, char **out_seq_name, int out_nseq, char *tree_file)
112 if ( !tree_file)tree_file=vtmpnam(NULL);
113 d_dist=declare_double( out_nseq+1, out_nseq+1);
115 for ( a=0; a<out_nseq; a++)
116 for ( b=0; b< out_nseq; b++)
119 d_dist[a+1][b+1]=distances[a][b]/MAXID;
120 else d_dist[a+1][b+1]=0;
123 guide_tree ( tree_file, d_dist, out_seq_name, out_nseq);
124 free_double (d_dist,-1);
125 T=read_tree (tree_file,&tot_node,out_nseq, out_seq_name);
131 void guide_tree(char *fname, double **saga_tmat, char **saga_seq_name, int saga_nseq)
133 Routine for producing unrooted NJ trees from seperately aligned
134 pairwise distances. This produces the GUIDE DENDROGRAMS in
141 char **standard_tree;
152 standard_tree = (char **) vmalloc( (nseqs+1) * sizeof (char *) );
153 for(i=0; i<nseqs+1; i++)
154 standard_tree[i] = (char *) vmalloc( (nseqs+1) * sizeof(char));
157 nj_tree(standard_tree, nseqs);
159 fp=fopen ( fname, "w");
160 print_phylip_tree(standard_tree,fp,FALSE);
167 for (i=0;i<nseqs+1;i++)
168 vfree(standard_tree[i]);
177 void nj_tree(char **tree_description, int nseq)
180 slow_nj_tree (tree_description);
182 fast_nj_tree (tree_description);
188 void slow_nj_tree(char **tree_description)
192 int nc,mini,minj,j,ii,jj;
193 double fnseqs,fnseqs2=0,sumd;
194 double diq,djq,dij,d2r,dr,dio,djo,da;
195 double tmin,total,dmin;
196 double bi,bj,b1,b2,b3,branch[4];
197 int typei,typej; /* 0 = node; 1 = OTU */
199 fnseqs = (double)nseqs;
201 /*********************** First initialisation ***************************/
207 left_branch = (double *) vcalloc( (nseqs+2),sizeof (double) );
208 right_branch = (double *) vcalloc( (nseqs+2),sizeof (double) );
209 tkill = (int *) vcalloc( (nseqs+1),sizeof (int) );
210 av = (double *) vcalloc( (nseqs+1),sizeof (double) );
212 for(i=1;i<=nseqs;++i)
214 tmat[i][i] = av[i] = 0.0;
218 /*********************** Enter The Main Cycle ***************************/
220 /**start main cycle**/
221 for(nc=1; nc<=(nseqs-3); ++nc)
225 for(j=2; j<=nseqs; ++j)
228 tmat[j][i] = tmat[i][j];
229 sumd = sumd + tmat[i][j];
233 /*.................compute SMATij values and find the smallest one ........*/
235 for(jj=2; jj<=nseqs; ++jj)
237 for(ii=1; ii<jj; ++ii)
242 for(i=1; i<=nseqs; ++i)
244 diq = diq + tmat[i][ii];
245 djq = djq + tmat[i][jj];
248 d2r = diq + djq - (2.0*dij);
249 dr = sumd - dij -d2r;
250 fnseqs2 = fnseqs - 2.0;
251 total= d2r+ fnseqs2*dij +dr*2.0;
252 total= total / (2.0*fnseqs2);
254 /* commented out to have consistent results with CYGWIN: if(total < tmin)"*/
257 if ( tmin-total>MY_EPS)
265 /*.................compute branch lengths and print the results ........*/
269 for(i=1; i<=nseqs; ++i) {
270 dio = dio + tmat[i][mini];
271 djo = djo + tmat[i][minj];
274 dmin = tmat[mini][minj];
275 dio = (dio - dmin) / fnseqs2;
276 djo = (djo - dmin) / fnseqs2;
277 bi = (dmin + dio - djo) * 0.5;
294 set negative branch lengths to zero. Also set any tiny positive
295 branch lengths to zero.
297 if( fabs(bi) < 0.0001) bi = 0.0;
298 if( fabs(bj) < 0.0001) bj = 0.0;
303 left_branch[nc] = bi;
304 right_branch[nc] = bj;
306 for(i=1; i<=nseqs; i++)
307 tree_description[nc][i] = 0;
310 for(i=nc-1; i>=1; i--)
311 if(tree_description[i][mini] == 1) {
312 for(j=1; j<=nseqs; j++)
313 if(tree_description[i][j] == 1)
314 tree_description[nc][j] = 1;
319 tree_description[nc][mini] = 1;
322 for(i=nc-1; i>=1; i--)
323 if(tree_description[i][minj] == 1) {
324 for(j=1; j<=nseqs; j++)
325 if(tree_description[i][j] == 1)
326 tree_description[nc][j] = 1;
331 tree_description[nc][minj] = 1;
335 Here is where the -0.00005 branch lengths come from for 3 or more
338 /* if(dmin <= 0.0) dmin = 0.0001; */
339 if(dmin <= 0.0) dmin = 0.000001;
340 av[mini] = dmin * 0.5;
342 /*........................Re-initialisation................................*/
344 fnseqs = fnseqs - 1.0;
347 for(j=1; j<=nseqs; ++j)
348 if( tkill[j] != 1 ) {
349 da = ( tmat[mini][j] + tmat[minj][j] ) * 0.5;
356 for(j=1; j<=nseqs; ++j)
357 tmat[minj][j] = tmat[j][minj] = 0.0;
364 /******************************Last Cycle (3 Seqs. left)********************/
369 for(i=1; i<=nseqs; ++i)
370 if( tkill[i] != 1 ) {
375 b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5;
376 b2 = tmat[l[1]][l[2]] - b1;
377 b3 = tmat[l[1]][l[3]] - b1;
379 branch[1] = b1 - av[l[1]];
380 branch[2] = b2 - av[l[2]];
381 branch[3] = b3 - av[l[3]];
383 /* Reset tiny negative and positive branch lengths to zero */
384 if( fabs(branch[1]) < 0.0001) branch[1] = 0.0;
385 if( fabs(branch[2]) < 0.0001) branch[2] = 0.0;
386 if( fabs(branch[3]) < 0.0001) branch[3] = 0.0;
388 left_branch[nseqs-2] = branch[1];
389 left_branch[nseqs-1] = branch[2];
390 left_branch[nseqs] = branch[3];
392 for(i=1; i<=nseqs; i++)
393 tree_description[nseqs-2][i] = 0;
397 for(i=1; i<=3; ++i) {
398 if( av[l[i]] > 0.0) {
400 for(k=nseqs-3; k>=1; k--)
401 if(tree_description[k][l[i]] == 1) {
402 for(j=1; j<=nseqs; j++)
403 if(tree_description[k][j] == 1)
404 tree_description[nseqs-2][j] = i;
410 tree_description[nseqs-2][l[i]] = i;
417 void print_phylip_tree(char **tree_description, FILE *tree, int bootstrap)
422 two_way_split(tree_description, tree, nseqs-2,1,bootstrap);
423 fprintf(tree,":%7.5f,\n",left_branch[nseqs-2]);
424 two_way_split(tree_description, tree, nseqs-2,2,bootstrap);
425 fprintf(tree,":%7.5f,\n",left_branch[nseqs-1]);
426 two_way_split(tree_description, tree, nseqs-2,3,bootstrap);
428 fprintf(tree,":%7.5f)",left_branch[nseqs]);
429 if (bootstrap) fprintf(tree,"TRICHOTOMY");
435 (char **tree_description, FILE *tree, int start_row, int flag, int bootstrap)
437 int row, new_row, col, test_col=0;
440 if(start_row != nseqs-2) fprintf(tree,"(\n");
442 for(col=1; col<=nseqs; col++) {
443 if(tree_description[start_row][col] == flag) {
450 for(row=start_row-1; row>=1; row--)
451 if(tree_description[row][test_col] == 1) {
458 tree_description[start_row][test_col] = 0;
459 fprintf(tree,"%s",names[test_col+0-1]);
462 for(col=1; col<=nseqs; col++) {
463 if((tree_description[start_row][col]==1)&&
464 (tree_description[new_row][col]==1))
465 tree_description[start_row][col] = 0;
467 two_way_split(tree_description, tree, new_row, (int)1, bootstrap);
470 if(start_row == nseqs-2) {
471 /* if (bootstrap && (flag==1)) fprintf(tree,"[TRICHOTOMY]");
476 fprintf(tree,":%7.5f,\n",left_branch[start_row]);
478 for(col=1; col<=nseqs; col++)
479 if(tree_description[start_row][col] == flag) {
485 for(row=start_row-1; row>=1; row--)
486 if(tree_description[row][test_col] == 1) {
493 tree_description[start_row][test_col] = 0;
494 fprintf(tree,"%s",names[test_col+0-1]);
497 for(col=1; col<=nseqs; col++) {
498 if((tree_description[start_row][col]==1)&&
499 (tree_description[new_row][col]==1))
500 tree_description[start_row][col] = 0;
502 two_way_split(tree_description, tree, new_row, (int)1, bootstrap);
505 fprintf(tree,":%7.5f)\n",right_branch[start_row]);
513 /****************************************************************************
514 * [ Improvement ideas in fast_nj_tree() ] by DDBJ & FUJITSU Limited.
515 * written by Tadashi Koike
516 * (takoike@genes.nig.ac.jp)
518 * <IMPROVEMENT 1> : Store the value of sum of the score to temporary array,
519 * and use again and again.
521 * In the main cycle, these are calculated again and again :
522 * diq = sum of tmat[n][ii] (n:1 to last_seq-first_seq+1),
523 * djq = sum of tmat[n][jj] (n:1 to last_seq-first_seq+1),
524 * dio = sum of tmat[n][mini] (n:1 to last_seq-first_seq+1),
525 * djq = sum of tmat[n][minj] (n:1 to last_seq-first_seq+1)
526 * // 'last_seq' and 'first_seq' are both constant values //
527 * and the result of above calculations is always same until
528 * a best pair of neighbour nodes is joined.
530 * So, we change the logic to calculate the sum[i] (=sum of tmat[n][i]
531 * (n:1 to last_seq-first_seq+1)) and store it to array, before
532 * beginning to find a best pair of neighbour nodes, and after that
533 * we use them again and again.
537 * +---+---+---+---+---+
539 * +---+---+---+---+---+
540 * 2 | | | | | | 1) calculate sum of tmat[n][i]
541 * +---+---+---+---+---+ (n: 1 to last_seq-first_seq+1)
542 * 3 | | | | | | 2) store that sum value to sum[i]
543 * +---+---+---+---+---+
544 * 4 | | | | | | 3) use sum[i] during finding a best
545 * +---+---+---+---+---+ pair of neibour nodes.
547 * +---+---+---+---+---+
549 * V V V V V Calculate sum , and store it to sum[i]
550 * +---+---+---+---+---+
552 * +---+---+---+---+---+
554 * At this time, we thought that we use upper triangle of the matrix
555 * because tmat[i][j] is equal to tmat[j][i] and tmat[i][i] is equal
556 * to zero. Therefore, we prepared sum_rows[i] and sum_cols[i] instead
557 * of sum[i] for storing the sum value.
560 * 1 2 3 4 5 sum_cols[i]
561 * +---+---+---+---+---+ +---+
562 * 1 | # | # | # | # | --> | | ... sum of tmat[1][2..5]
563 * + - +---+---+---+---+ +---+
564 * 2 | # | # | # | --> | | ... sum of tmat[2][3..5]
565 * + - + - +---+---+---+ +---+
566 * 3 | # | # | --> | | ... sum of tmat[3][4..5]
567 * + - + - + - +---+---+ +---+
568 * 4 | # | --> | | ... sum of tmat[4][5]
569 * + - + - + - + - +---+ +---+
570 * 5 | --> | | ... zero
571 * + - + - + - + - + - + +---+
573 * V V V V V Calculate sum , sotre to sum[i]
574 * +---+---+---+---+---+
575 * sum_rows[i] | | | | | |
576 * +---+---+---+---+---+
578 * | | | | +----- sum of tmat[1..4][5]
579 * | | | +--------- sum of tmat[1..3][4]
580 * | | +------------- sum of tmat[1..2][3]
581 * | +----------------- sum of tmat[1][2]
582 * +--------------------- zero
584 * And we use (sum_rows[i] + sum_cols[i]) instead of sum[i].
587 * <IMPROVEMENT 2> : We manage valid nodes with chain list, instead of
588 * tkill[i] flag array.
590 * In original logic, invalid(killed?) nodes after nodes-joining
591 * are managed with tkill[i] flag array (set to 1 when killed).
592 * By this method, it is conspicuous to try next node but skip it
593 * at the latter of finding a best pair of neighbor nodes.
595 * So, we thought that we managed valid nodes by using a chain list
598 * 1) declare the list structure.
600 * int n; // entry number of node.
601 * void *prev; // pointer to previous entry.
602 * void *next; // pointer to next entry.
604 * 2) construct a valid node list.
606 * +-----+ +-----+ +-----+ +-----+ +-----+
607 * NULL<-|prev |<---|prev |<---|prev |<---|prev |<- - - -|prev |
608 * | 0 | | 1 | | 2 | | 3 | | n |
609 * | next|--->| next|--->| next|--->| next|- - - ->| next|->NULL
610 * +-----+ +-----+ +-----+ +-----+ +-----+
612 * 3) when finding a best pair of neighbor nodes, we use
613 * this chain list as loop counter.
615 * 4) If an entry was killed by node-joining, this chain list is
616 * modified to remove that entry.
618 * EX) remove the entry No 2.
619 * +-----+ +-----+ +-----+ +-----+
620 * NULL<-|prev |<---|prev |<--------------|prev |<- - - -|prev |
621 * | 0 | | 1 | | 3 | | n |
622 * | next|--->| next|-------------->| next|- - - ->| next|->NULL
623 * +-----+ +-----+ +-----+ +-----+
630 * By this method, speed is up at the latter of finding a best pair of
634 * <IMPROVEMENT 3> : Cut the frequency of division.
636 * At comparison between 'total' and 'tmin' in the main cycle, total is
637 * divided by (2.0*fnseqs2) before comparison. If N nodes are available,
638 * that division happen (N*(N-1))/2 order.
640 * We thought that the comparison relation between tmin and total/(2.0*fnseqs2)
641 * is equal to the comparison relation between (tmin*2.0*fnseqs2) and total.
642 * Calculation of (tmin*2.0*fnseqs2) is only one time. so we stop dividing
643 * a total value and multiply tmin and (tmin*2.0*fnseqs2) instead.
646 * <IMPROVEMENT 4> : some transformation of the equation (to cut operations).
648 * We transform an equation of calculating 'total' in the main cycle.
653 void fast_nj_tree(char **tree_description)
657 int nc,mini,minj,j,ii,jj;
658 double fnseqs,fnseqs2=0,sumd;
659 double diq,djq,dij,dio,djo,da;
660 double tmin,total,dmin;
661 double bi,bj,b1,b2,b3,branch[4];
662 int typei,typej; /* 0 = node; 1 = OTU */
664 /* IMPROVEMENT 1, STEP 0 : declare variables */
665 double *sum_cols, *sum_rows, *join;
667 /* IMPROVEMENT 2, STEP 0 : declare variables */
669 typedef struct _ValidNodeID {
671 struct _ValidNodeID *prev;
672 struct _ValidNodeID *next;
674 ValidNodeID *tvalid, *lpi, *lpj, *lpii, *lpjj, *lp_prev, *lp_next;
677 * correspondence of the loop counter variables.
678 * i .. lpi->n, ii .. lpii->n
679 * j .. lpj->n, jj .. lpjj->n
682 fnseqs = (double)last_seq-first_seq+1;
684 /*********************** First initialisation ***************************/
693 left_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
694 right_branch = (double *) ckalloc( (nseqs+2) * sizeof (double) );
695 tkill = (int *) ckalloc( (nseqs+1) * sizeof (int) );
696 av = (double *) ckalloc( (nseqs+1) * sizeof (double) );
698 /* IMPROVEMENT 1, STEP 1 : Allocate memory */
699 sum_cols = (double *) ckalloc( (nseqs+1) * sizeof (double) );
700 sum_rows = (double *) ckalloc( (nseqs+1) * sizeof (double) );
701 join = (double *) ckalloc( (nseqs+1) * sizeof (double) );
703 /* IMPROVEMENT 2, STEP 1 : Allocate memory */
704 tvalid = (ValidNodeID *) ckalloc( (nseqs+1) * sizeof (ValidNodeID) );
705 /* tvalid[0] is special entry in array. it points a header of valid entry list */
707 tvalid[0].prev = NULL;
708 tvalid[0].next = &tvalid[1];
710 /* IMPROVEMENT 2, STEP 2 : Construct and initialize the entry chain list */
711 for(i=1, loop_limit = last_seq-first_seq+1,
712 lpi=&tvalid[1], lp_prev=&tvalid[0], lp_next=&tvalid[2] ;
714 ++i, ++lpi, ++lp_prev, ++lp_next)
716 tmat[i][i] = av[i] = 0.0;
722 /* IMPROVEMENT 1, STEP 2 : Initialize arrays */
723 sum_cols[i] = sum_rows[i] = join[i] = 0.0;
725 tvalid[loop_limit].next = NULL;
728 * IMPROVEMENT 1, STEP 3 : Calculate the sum of score value that
729 * is sequence[i] to others.
732 for (lpj=tvalid[0].next ; lpj!=NULL ; lpj = lpj->next) {
733 double tmp_sum = 0.0;
735 /* calculate sum_rows[j] */
736 for (lpi=tvalid[0].next ; lpi->n < j ; lpi = lpi->next) {
738 tmp_sum += tmat[i][j];
739 /* tmat[j][i] = tmat[i][j]; */
741 sum_rows[j] = tmp_sum;
744 /* Set lpi to that lpi->n is greater than j */
745 if ((lpi != NULL) && (lpi->n == j)) {
748 /* calculate sum_cols[j] */
749 for( ; lpi!=NULL ; lpi = lpi->next) {
751 tmp_sum += tmat[j][i];
752 /* tmat[i][j] = tmat[j][i]; */
754 sum_cols[j] = tmp_sum;
757 /*********************** Enter The Main Cycle ***************************/
759 for(nc=1, loop_limit = (last_seq-first_seq+1-3); nc<=loop_limit; ++nc) {
762 /* IMPROVEMENT 1, STEP 4 : use sum value */
763 for(lpj=tvalid[0].next ; lpj!=NULL ; lpj = lpj->next) {
764 sumd += sum_cols[lpj->n];
767 /* IMPROVEMENT 3, STEP 0 : multiply tmin and 2*fnseqs2 */
768 fnseqs2 = fnseqs - 2.0; /* Set fnseqs2 at this point. */
769 tmin = 99999.0 * 2.0 * fnseqs2;
772 /*.................compute SMATij values and find the smallest one ........*/
776 /* jj must starts at least 2 */
777 if ((tvalid[0].next != NULL) && (tvalid[0].next->n == 1)) {
778 lpjj = tvalid[0].next->next;
780 lpjj = tvalid[0].next;
783 for( ; lpjj != NULL; lpjj = lpjj->next) {
785 for(lpii=tvalid[0].next ; lpii->n < jj ; lpii = lpii->next) {
789 /* IMPROVEMENT 1, STEP 4 : use sum value */
790 diq = sum_cols[ii] + sum_rows[ii];
791 djq = sum_cols[jj] + sum_rows[jj];
793 * always ii < jj in this point. Use upper
794 * triangle of score matrix.
799 * IMPROVEMENT 3, STEP 1 : fnseqs2 is
800 * already calculated.
802 /* fnseqs2 = fnseqs - 2.0 */
804 /* IMPROVEMENT 4 : transform the equation */
805 /*-------------------------------------------------------------------*
806 * OPTIMIZE of expression 'total = d2r + fnseqs2*dij + dr*2.0' *
807 * total = d2r + fnseq2*dij + 2.0*dr *
808 * = d2r + fnseq2*dij + 2(sumd - dij - d2r) *
809 * = d2r + fnseq2*dij + 2*sumd - 2*dij - 2*d2r *
810 * = fnseq2*dij + 2*sumd - 2*dij - 2*d2r + d2r *
811 * = fnseq2*dij + 2*sumd - 2*dij - d2r *
812 * = fnseq2*dij + 2*sumd - 2*dij - (diq + djq - 2*dij) *
813 * = fnseq2*dij + 2*sumd - 2*dij - diq - djq + 2*dij *
814 * = fnseq2*dij + 2*sumd - 2*dij + 2*dij - diq - djq *
815 * = fnseq2*dij + 2*sumd - diq - djq *
816 *-------------------------------------------------------------------*/
817 total = fnseqs2*dij + 2.0*sumd - diq - djq;
820 * IMPROVEMENT 3, STEP 2 : abbrevlate
821 * the division on comparison between
824 /* total = total / (2.0*fnseqs2); */
834 /* MEMO: always ii < jj in avobe loop, so mini < minj */
836 /*.................compute branch lengths and print the results ........*/
841 /* IMPROVEMENT 1, STEP 4 : use sum value */
842 dio = sum_cols[mini] + sum_rows[mini];
843 djo = sum_cols[minj] + sum_rows[minj];
845 dmin = tmat[mini][minj];
846 dio = (dio - dmin) / fnseqs2;
847 djo = (djo - dmin) / fnseqs2;
848 bi = (dmin + dio - djo) * 0.5;
864 set negative branch lengths to zero. Also set any tiny positive
865 branch lengths to zero.
866 */ if( fabs(bi) < 0.0001) bi = 0.0;
867 if( fabs(bj) < 0.0001) bj = 0.0;
870 left_branch[nc] = bi;
871 right_branch[nc] = bj;
873 for(i=1; i<=last_seq-first_seq+1; i++)
874 tree_description[nc][i] = 0;
877 for(i=nc-1; i>=1; i--)
878 if(tree_description[i][mini] == 1) {
879 for(j=1; j<=last_seq-first_seq+1; j++)
880 if(tree_description[i][j] == 1)
881 tree_description[nc][j] = 1;
886 tree_description[nc][mini] = 1;
889 for(i=nc-1; i>=1; i--)
890 if(tree_description[i][minj] == 1) {
891 for(j=1; j<=last_seq-first_seq+1; j++)
892 if(tree_description[i][j] == 1)
893 tree_description[nc][j] = 1;
898 tree_description[nc][minj] = 1;
902 Here is where the -0.00005 branch lengths come from for 3 or more
905 /* if(dmin <= 0.0) dmin = 0.0001; */
906 if(dmin <= 0.0) dmin = 0.000001;
907 av[mini] = dmin * 0.5;
909 /*........................Re-initialisation................................*/
911 fnseqs = fnseqs - 1.0;
914 /* IMPROVEMENT 2, STEP 3 : Remove tvalid[minj] from chain list. */
916 * +---------+ +---------+ +---------+
917 * |prev |<-------|prev |<-------|prev |<---
918 * | n | | n(=minj)| | n |
919 * | next|------->| next|------->| next|----
920 * +---------+ +---------+ +---------+
923 * +---------+ +---------+
924 * |prev |<--------------------------|prev |<---
926 * | next|-------------------------->| next|----
927 * +---------+ +---------+
934 (tvalid[minj].prev)->next = tvalid[minj].next;
935 if (tvalid[minj].next != NULL) {
936 (tvalid[minj].next)->prev = tvalid[minj].prev;
938 tvalid[minj].prev = tvalid[minj].next = NULL;
940 /* IMPROVEMENT 1, STEP 5 : re-calculate sum values. */
941 for(lpj=tvalid[0].next ; lpj != NULL ; lpj = lpj->next) {
947 * subtrace a score value related with 'minj' from
951 tmp_dj = tmat[j][minj];
952 sum_cols[j] -= tmp_dj;
953 } else if (j > minj) {
954 tmp_dj = tmat[minj][j];
955 sum_rows[j] -= tmp_dj;
956 } /* nothing to do when j is equal to minj. */
960 * subtrace a score value related with 'mini' from
964 tmp_di = tmat[j][mini];
965 sum_cols[j] -= tmp_di;
966 } else if (j > mini) {
967 tmp_di = tmat[mini][j];
968 sum_rows[j] -= tmp_di;
969 } /* nothing to do when j is equal to mini. */
972 * calculate a score value of the new inner node.
973 * then, store it temporary to join[] array.
975 join[j] = (tmp_dj + tmp_di) * 0.5;
980 * Set the score values (stored in join[]) into the matrix,
981 * row/column position is 'mini'.
983 * Add a score value of the new inner node to sum arrays.
985 for(lpj=tvalid[0].next ; lpj != NULL; lpj = lpj->next) {
988 tmat[j][mini] = join[j];
989 sum_cols[j] += join[j];
990 } else if (j > mini) {
991 tmat[mini][j] = join[j];
992 sum_rows[j] += join[j];
993 } /* nothing to do when j is equal to mini. */
996 /* Re-calculate sum_rows[mini],sum_cols[mini]. */
997 sum_cols[mini] = sum_rows[mini] = 0.0;
999 /* calculate sum_rows[mini] */
1001 for(lpj=tvalid[0].next ; lpj->n < mini ; lpj = lpj->next) {
1004 sum_rows[mini] = da;
1006 /* skip if 'lpj->n' is equal to 'mini' */
1007 if ((lpj != NULL) && (lpj->n == mini)) {
1011 /* calculate sum_cols[mini] */
1013 for( ; lpj != NULL; lpj = lpj->next) {
1016 sum_cols[mini] = da;
1019 * Clean up sum_rows[minj], sum_cols[minj] and score matrix
1020 * related with 'minj'.
1022 sum_cols[minj] = sum_rows[minj] = 0.0;
1023 for(j=1; j<=last_seq-first_seq+1; ++j)
1024 tmat[minj][j] = tmat[j][minj] = join[j] = 0.0;
1027 /****/ } /*end main cycle**/
1029 /******************************Last Cycle (3 Seqs. left)********************/
1033 for(lpi=tvalid[0].next; lpi != NULL; lpi = lpi->next) {
1038 b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5;
1039 b2 = tmat[l[1]][l[2]] - b1;
1040 b3 = tmat[l[1]][l[3]] - b1;
1042 branch[1] = b1 - av[l[1]];
1043 branch[2] = b2 - av[l[2]];
1044 branch[3] = b3 - av[l[3]];
1046 /* Reset tiny negative and positive branch lengths to zero */
1047 if( fabs(branch[1]) < 0.0001) branch[1] = 0.0;
1048 if( fabs(branch[2]) < 0.0001) branch[2] = 0.0;
1049 if( fabs(branch[3]) < 0.0001) branch[3] = 0.0;
1051 left_branch[last_seq-first_seq+1-2] = branch[1];
1052 left_branch[last_seq-first_seq+1-1] = branch[2];
1053 left_branch[last_seq-first_seq+1] = branch[3];
1055 for(i=1; i<=last_seq-first_seq+1; i++)
1056 tree_description[last_seq-first_seq+1-2][i] = 0;
1059 for(i=1; i<=3; ++i) {
1060 if( av[l[i]] > 0.0) {
1062 for(k=last_seq-first_seq+1-3; k>=1; k--)
1063 if(tree_description[k][l[i]] == 1) {
1064 for(j=1; j<=last_seq-first_seq+1; j++)
1065 if(tree_description[k][j] == 1)
1066 tree_description[last_seq-first_seq+1-2][j] = i;
1071 tree_description[last_seq-first_seq+1-2][l[i]] = i;
1081 //////////////////////////////////////////////////////////////////////////////
1084 //////////////////////////////////////////////////////////////////////////////
1086 Alignment * upgma_merge_aln_rows (Alignment *A, int *ns, int **ls,int N, int**mat,int *used, int *n, Constraint_list *CL);
1087 int upgma_pair_wise (Alignment *A, int *ls0, int ns0, int *ls2, int ns2, Constraint_list *CL);
1090 Alignment * upgma_tree_aln ( Alignment*A, int nseq, Constraint_list *CL)
1097 mat=declare_int (nseq, nseq);
1098 ls=declare_int (nseq,nseq);
1099 ns=vcalloc (nseq,sizeof (int));
1101 for (a=0; a<nseq-1; a++)
1102 for (b=a+1; b<nseq; b++)
1107 mat[a][b]=mat[b][a]=upgma_pair_wise(A,ls[a],ns[a],ls[b],ns[b],CL);
1108 HERE ("%d %d", a, b, mat[a][b]);
1111 used=vcalloc (nseq, sizeof (int));
1115 upgma_merge_aln_rows (A,ns, ls,nseq, mat, used, &n,CL);
1119 free_int ( mat, -1);
1125 Alignment * upgma_merge_aln_rows (Alignment *A, int *ns, int **ls,int N, int**mat,int *used, int *n, Constraint_list *CL)
1128 int a, b, w, best_a, best_b, set;
1131 for (set=0,a=0; a<N-1; a++)
1133 if (used[a])continue;
1134 for (b=a+1; b<N; b++)
1136 if (used[b])continue;
1138 if ( !set || w>best_s)
1150 mat[best_a][best_b]=upgma_pair_wise (A, ls[best_a], ns[best_a], ls[best_b], ns[best_b], CL);
1151 for (a=0; a<ns[best_b]; a++)ls[best_a][ns[best_a]++]=ls[best_b][a];
1155 for (a=0; a< A->nseq; a++)
1157 if (a!=best_a && !used[a])
1158 mat[best_a][a]=mat[a][best_a]=upgma_pair_wise (A, ls[best_a], ns[best_a], ls[a], ns[a], CL);
1166 int upgma_pair_wise (Alignment *A, int *ls0, int ns0, int *ls1, int ns1, Constraint_list *CL)
1175 ls=vcalloc (2, sizeof (int*));
1176 ns=vcalloc (2, sizeof (int));
1177 fl=vcalloc ((CL->S)->nseq, sizeof (int));
1181 ns[0]=ns0; ns[1]=ns1;
1183 fprintf ( stderr, "\n");
1184 for (a=0; a<ns0; a++)
1185 fprintf ( stderr, "%d", ls0[a]);
1186 fprintf ( stderr, "\n");
1187 for (a=0; a<ns1; a++)
1188 fprintf ( stderr, "%d", ls1[a]);
1190 ungap_sub_aln (A, ns[0], ls[0]);
1191 ungap_sub_aln (A, ns[1], ls[1]);
1193 pair_wise (A, ns, ls, CL);
1196 for (n=0,a=0;a<2; a++)
1197 for (b=0; b<ns[a]; b++)
1200 return sub_aln2ecl_raw_score (A,CL,n, fl);
1203 //////////////////////////////////////////////////////////////////////////////
1206 ///////////////////////////////////////////////////////////////////////////////
1209 NT_node ** int_dist2upgma_tree (int **mat, Alignment *A, int nseq, char *fname)
1214 if (upgma_node_heap (NULL))
1216 printf_exit ( EXIT_FAILURE,stderr, "\nERROR: non empty heap in upgma [FATAL]");
1218 NL=vcalloc (nseq, sizeof (NT_node));
1220 for (a=0; a<A->nseq; a++)
1222 NL[a]=new_declare_tree_node ();
1223 upgma_node_heap (NL[a]);
1224 sprintf (NL[a]->name, "%s", A->name[a]);
1229 used=vcalloc ( A->nseq, sizeof (int));
1233 T=upgma_merge (mat, NL,used, &n, A->nseq);
1237 vfclose (print_tree (T, "newick", vfopen (fname, "w")));
1238 upgma_node_heap (NULL);
1241 return read_tree (fname,&tot_node,A->nseq, A->name);
1243 NT_node upgma_merge (int **mat, NT_node *NL, int *used, int *n, int N)
1246 int a, b, w, best_a, best_b, set;
1248 P=new_declare_tree_node();
1249 upgma_node_heap (P);
1251 for (set=0,a=0; a<N-1; a++)
1253 if (used[a])continue;
1254 for (b=a+1; b<N; b++)
1256 if (used[b])continue;
1258 if ( !set || w<best_s)
1270 if (!used[a])mat[best_a][a]=mat[a][best_a]=(mat[best_b][a]+mat[best_a][a])/2;
1277 LC->dist=RC->dist=best_s;
1278 LC->parent=RC->parent=P;
1288 //////////////////////////////////////////////////////////////////////////////
1291 ///////////////////////////////////////////////////////////////////////////////
1292 int upgma_node_heap (NT_node X);
1293 int upgma_node_heap (NT_node X)
1303 free_tree_node (h[a]);
1313 h=vrealloc (h, sizeof (NT_node)*(n+1));
1318 NT_node split2upgma_tree (Split **S, Alignment *A, int nseq, char *fname)
1323 NL=vcalloc (nseq, sizeof (NT_node));
1324 for (a=0; a<A->nseq; a++)
1327 NL[a]=new_declare_tree_node ();
1328 NL[a]->lseq2=vcalloc (A->nseq+1, sizeof (int));
1330 sprintf (NL[a]->name, "%s", A->name[a]);
1334 upgma_node_heap (NL[a]);
1336 used=vcalloc ( A->nseq, sizeof (int));
1340 T=split_upgma_merge (A,S, NL,used, &n, A->nseq);
1343 fprintf ( stdout, "\n");
1344 vfclose (print_tree (T, "newick", vfopen (fname, "w")));
1345 upgma_node_heap (NULL);
1348 NT_node split_upgma_merge (Alignment *A, Split **S, NT_node *NL, int *used, int *n, int N)
1351 int a, b, w, best_a, best_b, set;
1357 mat=declare_int (N, N);
1358 for (a=0; a<N-1; a++)
1359 for ( b=a+1; b<N; b++)
1361 mat[a][b]=get_split_dist (A, NL[a], NL[b], S);
1365 P=new_declare_tree_node();
1366 upgma_node_heap (P);
1367 P->lseq2=vcalloc (N, sizeof (int));
1368 for (set=0,a=0; a<N-1; a++)
1370 if (used[a])continue;
1371 for (b=a+1; b<N; b++)
1373 if (used[b])continue;
1375 if ( !set || w<best_s)
1393 LC->parent=RC->parent=P;
1396 P->bootstrap=best_s*100;
1401 for (a=0; a<A->nseq; a++)
1403 P->lseq2[a]=(LC->lseq2[a] || RC->lseq2[a])?1:0;
1408 if (!used[a])mat[best_a][a]=mat[a][best_a]=(int)get_split_dist(A,P, NL[a], S);
1415 float get_split_dist ( Alignment *A, NT_node L, NT_node R, Split **S)
1424 split=vcalloc ( A->nseq+1, sizeof (char));
1429 for ( a=0; a<A->nseq; a++)
1430 split[a]=((L->lseq2[a] || R->lseq2[a])?1:0)+'0';
1436 if ( strm (S[n]->split,split))
1438 return score=100-S[n]->score;
1444 /*********************************COPYRIGHT NOTICE**********************************/
1445 /*© Centro de Regulacio Genomica */
1447 /*Cedric Notredame */
1448 /*Tue Oct 27 10:12:26 WEST 2009. */
1449 /*All rights reserved.*/
1450 /*This file is part of T-COFFEE.*/
1452 /* T-COFFEE is free software; you can redistribute it and/or modify*/
1453 /* it under the terms of the GNU General Public License as published by*/
1454 /* the Free Software Foundation; either version 2 of the License, or*/
1455 /* (at your option) any later version.*/
1457 /* T-COFFEE is distributed in the hope that it will be useful,*/
1458 /* but WITHOUT ANY WARRANTY; without even the implied warranty of*/
1459 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the*/
1460 /* GNU General Public License for more details.*/
1462 /* You should have received a copy of the GNU General Public License*/
1463 /* along with Foobar; if not, write to the Free Software*/
1464 /* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*/
1465 /*............................................... |*/
1466 /* If you need some more information*/
1467 /* cedric.notredame@europe.com*/
1468 /*............................................... |*/
1472 /*********************************COPYRIGHT NOTICE**********************************/