binaries/src/tcoffee/t_coffee_source/util_make_tree.c

   1 #include <stdio.h>
   2 #include <stdlib.h>
   3 #include <math.h>
   4 #include <stdarg.h>
   5
   6 #include "io_lib_header.h"
   7 #include "util_lib_header.h"
   8 #include "dp_lib_header.h"
   9 #include "define_header.h"
  10
  11 static int first_seq, last_seq;
  12
  13 static int nseqs;
  14 static char **names;       /* the seq. names */
  15
  16 static double   **tmat;
  17 static double   *av;
  18 static double   *left_branch, *right_branch;
  19 static int      *tkill;
  20
  21
  22 /*!
  23  *      \file util_make_tree.c
  24  *      \brief Source code tree algorithms
  25  */
  26
  27 NT_node ** seq2cw_tree ( Sequence *S, char *tree)
  28 {
  29   Alignment *A;
  30   char *aln,command[1000];
  31   int tot_node;
  32
  33
  34
  35   A=seq2clustalw_aln (S);
  36   aln=vtmpnam (NULL); if (!tree)tree=vtmpnam (NULL);
  37
  38   output_fasta_aln (aln, A);
  39
  40   sprintf ( command, "clustalw -infile=%s -newtree=%s -tree %s", aln, tree, TO_NULL_DEVICE);
  41   my_system (command);
  42   return read_tree (tree, &tot_node, S->nseq, S->name);
  43 }
  44
  45 NT_node ** make_upgma_tree (  Alignment *A,int **distances,int gop, int gep, char **out_seq, char **out_seq_name, int out_nseq, char *tree_file, char *tree_mode)
  46   {
  47
  48     if ( distances==NULL && A==NULL)
  49       {
  50         fprintf ( stderr, "\nError: make_nj_tree, must provide an alignment or a distance matrix [FATAL:%s]", PROGRAM);
  51         myexit (EXIT_FAILURE);
  52       }
  53     else if ( distances==NULL)
  54       {
  55
  56         distances=get_dist_aln_array (A, "idmat");
  57       }
  58     return int_dist2upgma_tree (distances,A, A->nseq,tree_file);
  59   }
  60 NT_node ** make_nj_tree (  Alignment *A,int **distances,int gop, int gep, char **out_seq, char **out_seq_name, int out_nseq, char *tree_file, char *tree_mode)
  61   {
  62
  63     if ( distances==NULL && A==NULL)
  64       {
  65         fprintf ( stderr, "\nError: make_nj_tree, must provide an alignment or a distance matrix [FATAL:%s]", PROGRAM);
  66         myexit (EXIT_FAILURE);
  67       }
  68     else if ( distances==NULL)
  69       {
  70         distances=get_dist_aln_array (A, "idmat");
  71       }
  72     return int_dist2nj_tree (distances,out_seq_name, out_nseq,tree_file);
  73   }
  74
  75 NT_node ** int_dist2nj_tree (int **distances, char **out_seq_name, int out_nseq,  char *tree_file)
  76         {
  77           int a, b;
  78           double **d;
  79           NT_node **T;
  80
  81           d=declare_double( out_nseq, out_nseq);
  82           for ( a=0; a< out_nseq; a++)
  83             for ( b=0; b< out_nseq; b++)
  84               d[a][b]=distances[a][b];
  85
  86           T=dist2nj_tree ( d, out_seq_name, out_nseq, tree_file);
  87
  88           free_double (d, -1);
  89           return T;
  90         }
  91 NT_node ** float_dist2nj_tree (float **distances, char **out_seq_name, int out_nseq,  char *tree_file)
  92         {
  93           int a, b;
  94           double **d;
  95           NT_node **T;
  96
  97           d=declare_double( out_nseq, out_nseq);
  98           for ( a=0; a< out_nseq; a++)
  99             for ( b=0; b< out_nseq; b++)
 100               d[a][b]=distances[a][b];
 101           T=dist2nj_tree ( d, out_seq_name, out_nseq, tree_file);
 102           free_double (d, -1);
 103           return T;
 104         }
 105 NT_node ** dist2nj_tree (double **distances, char **out_seq_name, int out_nseq,  char *tree_file)
 106         {
 107         int a, b;
 108         double **d_dist;
 109         int tot_node=0;
 110         NT_node **T;
 111
 112         if ( !tree_file)tree_file=vtmpnam(NULL);
 113         d_dist=declare_double( out_nseq+1, out_nseq+1);
 114
 115         for ( a=0; a<out_nseq; a++)
 116                 for ( b=0; b< out_nseq; b++)
 117                         {
 118                           if ( a!=b)
 119                                 d_dist[a+1][b+1]=distances[a][b]/MAXID;
 120                           else d_dist[a+1][b+1]=0;
 121                         }
 122
 123         guide_tree ( tree_file, d_dist, out_seq_name, out_nseq);
 124         free_double (d_dist,-1);
 125         T=read_tree (tree_file,&tot_node,out_nseq,  out_seq_name);
 126
 127         return T;
 128         }
 129
 130
 131 void guide_tree(char *fname, double **saga_tmat, char **saga_seq_name, int saga_nseq)
 132 /*
 133    Routine for producing unrooted NJ trees from seperately aligned
 134    pairwise distances.  This produces the GUIDE DENDROGRAMS in
 135    PHYLIP format.
 136 */
 137 {
 138
 139         int i;
 140         FILE *fp;
 141         char **standard_tree;
 142
 143
 144
 145         nseqs=saga_nseq;
 146         first_seq=1;
 147         last_seq=nseqs;
 148
 149         names=saga_seq_name;
 150         tmat=saga_tmat;
 151
 152         standard_tree   = (char **) vmalloc( (nseqs+1) * sizeof (char *) );
 153         for(i=0; i<nseqs+1; i++)
 154                 standard_tree[i]  = (char *) vmalloc( (nseqs+1) * sizeof(char));
 155
 156
 157         nj_tree(standard_tree, nseqs);
 158
 159         fp=fopen ( fname, "w");
 160         print_phylip_tree(standard_tree,fp,FALSE);
 161
 162         vfree(left_branch);
 163         vfree(right_branch);
 164         vfree(tkill);
 165         vfree(av);
 166
 167         for (i=0;i<nseqs+1;i++)
 168                 vfree(standard_tree[i]);
 169
 170
 171         fclose(fp);
 172
 173
 174 }
 175
 176
 177 void nj_tree(char **tree_description, int nseq)
 178 {
 179   if ( nseq<100)
 180     slow_nj_tree (tree_description);
 181   else
 182     fast_nj_tree (tree_description);
 183
 184 }
 185
 186
 187
 188 void slow_nj_tree(char **tree_description)
 189 {
 190         int i;
 191         int l[4],nude,k;
 192         int nc,mini,minj,j,ii,jj;
 193         double fnseqs,fnseqs2=0,sumd;
 194         double diq,djq,dij,d2r,dr,dio,djo,da;
 195         double tmin,total,dmin;
 196         double bi,bj,b1,b2,b3,branch[4];
 197         int typei,typej;             /* 0 = node; 1 = OTU */
 198
 199         fnseqs = (double)nseqs;
 200
 201 /*********************** First initialisation ***************************/
 202
 203
 204
 205         mini = minj = 0;
 206
 207         left_branch     = (double *) vcalloc( (nseqs+2),sizeof (double)   );
 208         right_branch    = (double *) vcalloc( (nseqs+2),sizeof (double)   );
 209         tkill           = (int *)    vcalloc( (nseqs+1),sizeof (int) );
 210         av              = (double *) vcalloc( (nseqs+1),sizeof (double)   );
 211
 212         for(i=1;i<=nseqs;++i)
 213                 {
 214                   tmat[i][i] = av[i] = 0.0;
 215                   tkill[i] = 0;
 216                 }
 217
 218 /*********************** Enter The Main Cycle ***************************/
 219
 220         /**start main cycle**/
 221         for(nc=1; nc<=(nseqs-3); ++nc)
 222           {
 223
 224             sumd = 0.0;
 225             for(j=2; j<=nseqs; ++j)
 226               for(i=1; i<j; ++i)
 227                 {
 228                 tmat[j][i] = tmat[i][j];
 229                 sumd = sumd + tmat[i][j];
 230                 }
 231             tmin = 99999.0;
 232
 233 /*.................compute SMATij values and find the smallest one ........*/
 234
 235             for(jj=2; jj<=nseqs; ++jj)
 236               if(tkill[jj] != 1)
 237                 for(ii=1; ii<jj; ++ii)
 238                   if(tkill[ii] != 1)
 239                     {
 240                       diq = djq = 0.0;
 241
 242                       for(i=1; i<=nseqs; ++i)
 243                         {
 244                           diq = diq + tmat[i][ii];
 245                           djq = djq + tmat[i][jj];
 246                         }
 247                       dij = tmat[ii][jj];
 248                       d2r = diq + djq - (2.0*dij);
 249                       dr  = sumd - dij -d2r;
 250                       fnseqs2 = fnseqs - 2.0;
 251                       total= d2r+ fnseqs2*dij +dr*2.0;
 252                       total= total / (2.0*fnseqs2);
 253
 254                       /* commented out to have consistent results with CYGWIN: if(total < tmin)"*/
 255                       if(total < tmin)
 256                         {
 257                           if ( tmin-total>MY_EPS)
 258                           tmin = total;
 259                           mini = ii;
 260                           minj = jj;
 261                         }
 262                     }
 263
 264
 265 /*.................compute branch lengths and print the results ........*/
 266
 267
 268             dio = djo = 0.0;
 269             for(i=1; i<=nseqs; ++i) {
 270               dio = dio + tmat[i][mini];
 271               djo = djo + tmat[i][minj];
 272             }
 273
 274             dmin = tmat[mini][minj];
 275             dio = (dio - dmin) / fnseqs2;
 276             djo = (djo - dmin) / fnseqs2;
 277             bi = (dmin + dio - djo) * 0.5;
 278             bj = dmin - bi;
 279             bi = bi - av[mini];
 280             bj = bj - av[minj];
 281
 282             if( av[mini] > 0.0 )
 283               typei = 0;
 284             else
 285               typei = 1;
 286             if( av[minj] > 0.0 )
 287               typej = 0;
 288             else
 289               typej = 1;
 290
 291
 292
 293 /*
 294    set negative branch lengths to zero.  Also set any tiny positive
 295    branch lengths to zero.
 296 */
 297             if( fabs(bi) < 0.0001) bi = 0.0;
 298             if( fabs(bj) < 0.0001) bj = 0.0;
 299
 300
 301
 302
 303             left_branch[nc] = bi;
 304             right_branch[nc] = bj;
 305
 306             for(i=1; i<=nseqs; i++)
 307               tree_description[nc][i] = 0;
 308
 309             if(typei == 0) {
 310               for(i=nc-1; i>=1; i--)
 311                 if(tree_description[i][mini] == 1) {
 312                   for(j=1; j<=nseqs; j++)
 313                     if(tree_description[i][j] == 1)
 314                       tree_description[nc][j] = 1;
 315                   break;
 316                 }
 317             }
 318             else
 319               tree_description[nc][mini] = 1;
 320
 321             if(typej == 0) {
 322               for(i=nc-1; i>=1; i--)
 323                 if(tree_description[i][minj] == 1) {
 324                   for(j=1; j<=nseqs; j++)
 325                     if(tree_description[i][j] == 1)
 326                       tree_description[nc][j] = 1;
 327                   break;
 328                 }
 329             }
 330             else
 331               tree_description[nc][minj] = 1;
 332
 333
 334 /*
 335    Here is where the -0.00005 branch lengths come from for 3 or more
 336    identical seqs.
 337 */
 338 /*              if(dmin <= 0.0) dmin = 0.0001; */
 339             if(dmin <= 0.0) dmin = 0.000001;
 340             av[mini] = dmin * 0.5;
 341
 342  /*........................Re-initialisation................................*/
 343
 344             fnseqs = fnseqs - 1.0;
 345             tkill[minj] = 1;
 346
 347             for(j=1; j<=nseqs; ++j)
 348               if( tkill[j] != 1 ) {
 349                 da = ( tmat[mini][j] + tmat[minj][j] ) * 0.5;
 350                 if( (mini - j) < 0 )
 351                   tmat[mini][j] = da;
 352                 if( (mini - j) > 0)
 353                   tmat[j][mini] = da;
 354               }
 355
 356             for(j=1; j<=nseqs; ++j)
 357               tmat[minj][j] = tmat[j][minj] = 0.0;
 358
 359
 360
 361           }
 362         /*end main cycle**/
 363
 364 /******************************Last Cycle (3 Seqs. left)********************/
 365
 366         nude = 1;
 367
 368
 369         for(i=1; i<=nseqs; ++i)
 370                 if( tkill[i] != 1 ) {
 371                         l[nude] = i;
 372                         nude = nude + 1;
 373                 }
 374
 375         b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5;
 376         b2 =  tmat[l[1]][l[2]] - b1;
 377         b3 =  tmat[l[1]][l[3]] - b1;
 378
 379         branch[1] = b1 - av[l[1]];
 380         branch[2] = b2 - av[l[2]];
 381         branch[3] = b3 - av[l[3]];
 382
 383 /* Reset tiny negative and positive branch lengths to zero */
 384         if( fabs(branch[1]) < 0.0001) branch[1] = 0.0;
 385         if( fabs(branch[2]) < 0.0001) branch[2] = 0.0;
 386         if( fabs(branch[3]) < 0.0001) branch[3] = 0.0;
 387
 388         left_branch[nseqs-2] = branch[1];
 389         left_branch[nseqs-1] = branch[2];
 390         left_branch[nseqs]   = branch[3];
 391
 392         for(i=1; i<=nseqs; i++)
 393                 tree_description[nseqs-2][i] = 0;
 394
 395
 396
 397         for(i=1; i<=3; ++i) {
 398            if( av[l[i]] > 0.0) {
 399
 400                 for(k=nseqs-3; k>=1; k--)
 401                         if(tree_description[k][l[i]] == 1) {
 402                                 for(j=1; j<=nseqs; j++)
 403                                         if(tree_description[k][j] == 1)
 404                                             tree_description[nseqs-2][j] = i;
 405                                 break;
 406                         }
 407            }
 408            else  {
 409
 410                 tree_description[nseqs-2][l[i]] = i;
 411            }
 412            if(i < 3) {
 413            }
 414         }
 415 }
 416
 417 void print_phylip_tree(char **tree_description, FILE *tree, int bootstrap)
 418 {
 419
 420         fprintf(tree,"(\n");
 421
 422         two_way_split(tree_description, tree, nseqs-2,1,bootstrap);
 423         fprintf(tree,":%7.5f,\n",left_branch[nseqs-2]);
 424         two_way_split(tree_description, tree, nseqs-2,2,bootstrap);
 425         fprintf(tree,":%7.5f,\n",left_branch[nseqs-1]);
 426         two_way_split(tree_description, tree, nseqs-2,3,bootstrap);
 427
 428         fprintf(tree,":%7.5f)",left_branch[nseqs]);
 429         if (bootstrap) fprintf(tree,"TRICHOTOMY");
 430         fprintf(tree,";\n");
 431 }
 432
 433
 434 void two_way_split
 435 (char **tree_description, FILE *tree, int start_row, int flag, int bootstrap)
 436 {
 437         int row, new_row, col, test_col=0;
 438         int single_seq;
 439
 440         if(start_row != nseqs-2) fprintf(tree,"(\n");
 441
 442         for(col=1; col<=nseqs; col++) {
 443                 if(tree_description[start_row][col] == flag) {
 444                         test_col = col;
 445                         break;
 446                 }
 447         }
 448
 449         single_seq = TRUE;
 450         for(row=start_row-1; row>=1; row--)
 451                 if(tree_description[row][test_col] == 1) {
 452                         single_seq = FALSE;
 453                         new_row = row;
 454                         break;
 455                 }
 456
 457         if(single_seq) {
 458                 tree_description[start_row][test_col] = 0;
 459                 fprintf(tree,"%s",names[test_col+0-1]);
 460         }
 461         else {
 462                 for(col=1; col<=nseqs; col++) {
 463                     if((tree_description[start_row][col]==1)&&
 464                        (tree_description[new_row][col]==1))
 465                                 tree_description[start_row][col] = 0;
 466                 }
 467                 two_way_split(tree_description, tree, new_row, (int)1, bootstrap);
 468         }
 469
 470         if(start_row == nseqs-2) {
 471 /*              if (bootstrap && (flag==1)) fprintf(tree,"[TRICHOTOMY]");
 472 */
 473                 return;
 474         }
 475
 476         fprintf(tree,":%7.5f,\n",left_branch[start_row]);
 477
 478         for(col=1; col<=nseqs; col++)
 479                 if(tree_description[start_row][col] == flag) {
 480                         test_col = col;
 481                         break;
 482                 }
 483
 484         single_seq = TRUE;
 485         for(row=start_row-1; row>=1; row--)
 486                 if(tree_description[row][test_col] == 1) {
 487                         single_seq = FALSE;
 488                         new_row = row;
 489                         break;
 490                 }
 491
 492         if(single_seq) {
 493                 tree_description[start_row][test_col] = 0;
 494                 fprintf(tree,"%s",names[test_col+0-1]);
 495         }
 496         else {
 497                 for(col=1; col<=nseqs; col++) {
 498                     if((tree_description[start_row][col]==1)&&
 499                        (tree_description[new_row][col]==1))
 500                                 tree_description[start_row][col] = 0;
 501                 }
 502                 two_way_split(tree_description, tree, new_row, (int)1, bootstrap);
 503         }
 504
 505         fprintf(tree,":%7.5f)\n",right_branch[start_row]);
 506
 507
 508 }
 509
 510
 511
 512
 513 /****************************************************************************
 514  * [ Improvement ideas in fast_nj_tree() ] by DDBJ & FUJITSU Limited.
 515  *                                              written by Tadashi Koike
 516  *                                              (takoike@genes.nig.ac.jp)
 517  *******************
 518  * <IMPROVEMENT 1> : Store the value of sum of the score to temporary array,
 519  *                   and use again and again.
 520  *
 521  *      In the main cycle, these are calculated again and again :
 522  *          diq = sum of tmat[n][ii]   (n:1 to last_seq-first_seq+1),
 523  *          djq = sum of tmat[n][jj]   (n:1 to last_seq-first_seq+1),
 524  *          dio = sum of tmat[n][mini] (n:1 to last_seq-first_seq+1),
 525  *          djq = sum of tmat[n][minj] (n:1 to last_seq-first_seq+1)
 526  *              // 'last_seq' and 'first_seq' are both constant values //
 527  *      and the result of above calculations is always same until
 528  *      a best pair of neighbour nodes is joined.
 529  *
 530  *      So, we change the logic to calculate the sum[i] (=sum of tmat[n][i]
 531  *      (n:1 to last_seq-first_seq+1)) and store it to array, before
 532  *      beginning to find a best pair of neighbour nodes, and after that
 533  *      we use them again and again.
 534  *
 535  *          tmat[i][j]
 536  *                    1   2   3   4   5
 537  *                  +---+---+---+---+---+
 538  *                1 |   |   |   |   |   |
 539  *                  +---+---+---+---+---+
 540  *                2 |   |   |   |   |   |  1) calculate sum of tmat[n][i]
 541  *                  +---+---+---+---+---+        (n: 1 to last_seq-first_seq+1)
 542  *                3 |   |   |   |   |   |  2) store that sum value to sum[i]
 543  *                  +---+---+---+---+---+
 544  *                4 |   |   |   |   |   |  3) use sum[i] during finding a best
 545  *                  +---+---+---+---+---+     pair of neibour nodes.
 546  *                5 |   |   |   |   |   |
 547  *                  +---+---+---+---+---+
 548  *                    |   |   |   |   |
 549  *                    V   V   V   V   V  Calculate sum , and store it to sum[i]
 550  *                  +---+---+---+---+---+
 551  *           sum[i] |   |   |   |   |   |
 552  *                  +---+---+---+---+---+
 553  *
 554  *      At this time, we thought that we use upper triangle of the matrix
 555  *      because tmat[i][j] is equal to tmat[j][i] and tmat[i][i] is equal
 556  *      to zero. Therefore, we prepared sum_rows[i] and sum_cols[i] instead
 557  *      of sum[i] for storing the sum value.
 558  *
 559  *          tmat[i][j]
 560  *                    1   2   3   4   5     sum_cols[i]
 561  *                  +---+---+---+---+---+     +---+
 562  *                1     | # | # | # | # | --> |   | ... sum of tmat[1][2..5]
 563  *                  + - +---+---+---+---+     +---+
 564  *                2         | # | # | # | --> |   | ... sum of tmat[2][3..5]
 565  *                  + - + - +---+---+---+     +---+
 566  *                3             | # | # | --> |   | ... sum of tmat[3][4..5]
 567  *                  + - + - + - +---+---+     +---+
 568  *                4                 | # | --> |   | ... sum of tmat[4][5]
 569  *                  + - + - + - + - +---+     +---+
 570  *                5                     | --> |   | ... zero
 571  *                  + - + - + - + - + - +     +---+
 572  *                    |   |   |   |   |
 573  *                    V   V   V   V   V  Calculate sum , sotre to sum[i]
 574  *                  +---+---+---+---+---+
 575  *      sum_rows[i] |   |   |   |   |   |
 576  *                  +---+---+---+---+---+
 577  *                    |   |   |   |   |
 578  *                    |   |   |   |   +----- sum of tmat[1..4][5]
 579  *                    |   |   |   +--------- sum of tmat[1..3][4]
 580  *                    |   |   +------------- sum of tmat[1..2][3]
 581  *                    |   +----------------- sum of tmat[1][2]
 582  *                    +--------------------- zero
 583  *
 584  *      And we use (sum_rows[i] + sum_cols[i]) instead of sum[i].
 585  *
 586  *******************
 587  * <IMPROVEMENT 2> : We manage valid nodes with chain list, instead of
 588  *                   tkill[i] flag array.
 589  *
 590  *      In original logic, invalid(killed?) nodes after nodes-joining
 591  *      are managed with tkill[i] flag array (set to 1 when killed).
 592  *      By this method, it is conspicuous to try next node but skip it
 593  *      at the latter of finding a best pair of neighbor nodes.
 594  *
 595  *      So, we thought that we managed valid nodes by using a chain list
 596  *      as below:
 597  *
 598  *      1) declare the list structure.
 599  *              struct {
 600  *                  int n;              // entry number of node.
 601  *                  void *prev;         // pointer to previous entry.
 602  *                  void *next;         // pointer to next entry.
 603  *              }
 604  *      2) construct a valid node list.
 605  *
 606  *       +-----+    +-----+    +-----+    +-----+        +-----+
 607  * NULL<-|prev |<---|prev |<---|prev |<---|prev |<- - - -|prev |
 608  *       |  0  |    |  1  |    |  2  |    |  3  |        |  n  |
 609  *       | next|--->| next|--->| next|--->| next|- - - ->| next|->NULL
 610  *       +-----+    +-----+    +-----+    +-----+        +-----+
 611  *
 612  *      3) when finding a best pair of neighbor nodes, we use
 613  *         this chain list as loop counter.
 614  *
 615  *      4) If an entry was killed by node-joining, this chain list is
 616  *         modified to remove that entry.
 617  *
 618  *         EX) remove the entry No 2.
 619  *       +-----+    +-----+               +-----+        +-----+
 620  * NULL<-|prev |<---|prev |<--------------|prev |<- - - -|prev |
 621  *       |  0  |    |  1  |               |  3  |        |  n  |
 622  *       | next|--->| next|-------------->| next|- - - ->| next|->NULL
 623  *       +-----+    +-----+               +-----+        +-----+
 624  *                             +-----+
 625  *                       NULL<-|prev |
 626  *                             |  2  |
 627  *                             | next|->NULL
 628  *                             +-----+
 629  *
 630  *      By this method, speed is up at the latter of finding a best pair of
 631  *      neighbor nodes.
 632  *
 633  *******************
 634  * <IMPROVEMENT 3> : Cut the frequency of division.
 635  *
 636  * At comparison between 'total' and 'tmin' in the main cycle, total is
 637  * divided by (2.0*fnseqs2) before comparison.  If N nodes are available,
 638  * that division happen (N*(N-1))/2 order.
 639  *
 640  * We thought that the comparison relation between tmin and total/(2.0*fnseqs2)
 641  * is equal to the comparison relation between (tmin*2.0*fnseqs2) and total.
 642  * Calculation of (tmin*2.0*fnseqs2) is only one time. so we stop dividing
 643  * a total value and multiply tmin and (tmin*2.0*fnseqs2) instead.
 644  *
 645  *******************
 646  * <IMPROVEMENT 4> : some transformation of the equation (to cut operations).
 647  *
 648  * We transform an equation of calculating 'total' in the main cycle.
 649  *
 650  */
 651
 652
 653 void fast_nj_tree(char **tree_description)
 654 {
 655         register int i;
 656         int l[4],nude,k;
 657         int nc,mini,minj,j,ii,jj;
 658         double fnseqs,fnseqs2=0,sumd;
 659         double diq,djq,dij,dio,djo,da;
 660         double tmin,total,dmin;
 661         double bi,bj,b1,b2,b3,branch[4];
 662         int typei,typej;             /* 0 = node; 1 = OTU */
 663
 664         /* IMPROVEMENT 1, STEP 0 : declare  variables */
 665         double *sum_cols, *sum_rows, *join;
 666
 667         /* IMPROVEMENT 2, STEP 0 : declare  variables */
 668         int loop_limit;
 669         typedef struct _ValidNodeID {
 670             int n;
 671             struct _ValidNodeID *prev;
 672             struct _ValidNodeID *next;
 673         } ValidNodeID;
 674         ValidNodeID *tvalid, *lpi, *lpj, *lpii, *lpjj, *lp_prev, *lp_next;
 675
 676         /*
 677          * correspondence of the loop counter variables.
 678          *   i .. lpi->n,       ii .. lpii->n
 679          *   j .. lpj->n,       jj .. lpjj->n
 680          */
 681
 682         fnseqs = (double)last_seq-first_seq+1;
 683
 684 /*********************** First initialisation ***************************/
 685
 686
 687         if (fnseqs == 2) {
 688                 return;
 689         }
 690
 691         mini = minj = 0;
 692
 693         left_branch     = (double *) ckalloc( (nseqs+2) * sizeof (double)   );
 694         right_branch    = (double *) ckalloc( (nseqs+2) * sizeof (double)   );
 695         tkill           = (int *) ckalloc( (nseqs+1) * sizeof (int) );
 696         av              = (double *) ckalloc( (nseqs+1) * sizeof (double)   );
 697
 698         /* IMPROVEMENT 1, STEP 1 : Allocate memory */
 699         sum_cols        = (double *) ckalloc( (nseqs+1) * sizeof (double)   );
 700         sum_rows        = (double *) ckalloc( (nseqs+1) * sizeof (double)   );
 701         join            = (double *) ckalloc( (nseqs+1) * sizeof (double)   );
 702
 703         /* IMPROVEMENT 2, STEP 1 : Allocate memory */
 704         tvalid  = (ValidNodeID *) ckalloc( (nseqs+1) * sizeof (ValidNodeID) );
 705         /* tvalid[0] is special entry in array. it points a header of valid entry list */
 706         tvalid[0].n = 0;
 707         tvalid[0].prev = NULL;
 708         tvalid[0].next = &tvalid[1];
 709
 710         /* IMPROVEMENT 2, STEP 2 : Construct and initialize the entry chain list */
 711         for(i=1, loop_limit = last_seq-first_seq+1,
 712                 lpi=&tvalid[1], lp_prev=&tvalid[0], lp_next=&tvalid[2] ;
 713                 i<=loop_limit ;
 714                 ++i, ++lpi, ++lp_prev, ++lp_next)
 715                 {
 716                 tmat[i][i] = av[i] = 0.0;
 717                 tkill[i] = 0;
 718                 lpi->n = i;
 719                 lpi->prev = lp_prev;
 720                 lpi->next = lp_next;
 721
 722                 /* IMPROVEMENT 1, STEP 2 : Initialize arrays */
 723                 sum_cols[i] = sum_rows[i] = join[i] = 0.0;
 724                 }
 725         tvalid[loop_limit].next = NULL;
 726
 727         /*
 728          * IMPROVEMENT 1, STEP 3 : Calculate the sum of score value that
 729          * is sequence[i] to others.
 730          */
 731         sumd = 0.0;
 732         for (lpj=tvalid[0].next ; lpj!=NULL ; lpj = lpj->next) {
 733                 double tmp_sum = 0.0;
 734                 j = lpj->n;
 735                 /* calculate sum_rows[j] */
 736                 for (lpi=tvalid[0].next ; lpi->n < j ; lpi = lpi->next) {
 737                         i = lpi->n;
 738                         tmp_sum += tmat[i][j];
 739                         /* tmat[j][i] = tmat[i][j]; */
 740                 }
 741                 sum_rows[j] = tmp_sum;
 742
 743                 tmp_sum = 0.0;
 744                 /* Set lpi to that lpi->n is greater than j */
 745                 if ((lpi != NULL) && (lpi->n == j)) {
 746                         lpi = lpi->next;
 747                 }
 748                 /* calculate sum_cols[j] */
 749                 for( ; lpi!=NULL ; lpi = lpi->next) {
 750                         i = lpi->n;
 751                         tmp_sum += tmat[j][i];
 752                         /* tmat[i][j] = tmat[j][i]; */
 753                 }
 754                 sum_cols[j] = tmp_sum;
 755         }
 756
 757 /*********************** Enter The Main Cycle ***************************/
 758
 759         for(nc=1, loop_limit = (last_seq-first_seq+1-3); nc<=loop_limit; ++nc) {
 760
 761                 sumd = 0.0;
 762                 /* IMPROVEMENT 1, STEP 4 : use sum value */
 763                 for(lpj=tvalid[0].next ; lpj!=NULL ; lpj = lpj->next) {
 764                         sumd += sum_cols[lpj->n];
 765                 }
 766
 767                 /* IMPROVEMENT 3, STEP 0 : multiply tmin and 2*fnseqs2 */
 768                 fnseqs2 = fnseqs - 2.0;         /* Set fnseqs2 at this point. */
 769                 tmin = 99999.0 * 2.0 * fnseqs2;
 770
 771
 772 /*.................compute SMATij values and find the smallest one ........*/
 773
 774                 mini = minj = 0;
 775
 776                 /* jj must starts at least 2 */
 777                 if ((tvalid[0].next != NULL) && (tvalid[0].next->n == 1)) {
 778                         lpjj = tvalid[0].next->next;
 779                 } else {
 780                         lpjj = tvalid[0].next;
 781                 }
 782
 783                 for( ; lpjj != NULL; lpjj = lpjj->next) {
 784                         jj = lpjj->n;
 785                         for(lpii=tvalid[0].next ; lpii->n < jj ; lpii = lpii->next) {
 786                                 ii = lpii->n;
 787                                 diq = djq = 0.0;
 788
 789                                 /* IMPROVEMENT 1, STEP 4 : use sum value */
 790                                 diq = sum_cols[ii] + sum_rows[ii];
 791                                 djq = sum_cols[jj] + sum_rows[jj];
 792                                 /*
 793                                  * always ii < jj in this point. Use upper
 794                                  * triangle of score matrix.
 795                                  */
 796                                 dij = tmat[ii][jj];
 797
 798                                 /*
 799                                  * IMPROVEMENT 3, STEP 1 : fnseqs2 is
 800                                  * already calculated.
 801                                  */
 802                                 /* fnseqs2 = fnseqs - 2.0 */
 803
 804                                 /* IMPROVEMENT 4 : transform the equation */
 805   /*-------------------------------------------------------------------*
 806    * OPTIMIZE of expression 'total = d2r + fnseqs2*dij + dr*2.0'       *
 807    * total = d2r + fnseq2*dij + 2.0*dr                                 *
 808    *       = d2r + fnseq2*dij + 2(sumd - dij - d2r)                    *
 809    *       = d2r + fnseq2*dij + 2*sumd - 2*dij - 2*d2r                 *
 810    *       =       fnseq2*dij + 2*sumd - 2*dij - 2*d2r + d2r           *
 811    *       = fnseq2*dij + 2*sumd - 2*dij - d2r                         *
 812    *       = fnseq2*dij + 2*sumd - 2*dij - (diq + djq - 2*dij)         *
 813    *       = fnseq2*dij + 2*sumd - 2*dij - diq - djq + 2*dij           *
 814    *       = fnseq2*dij + 2*sumd - 2*dij + 2*dij - diq - djq           *
 815    *       = fnseq2*dij + 2*sumd  - diq - djq                          *
 816    *-------------------------------------------------------------------*/
 817                                 total = fnseqs2*dij + 2.0*sumd  - diq - djq;
 818
 819                                 /*
 820                                  * IMPROVEMENT 3, STEP 2 : abbrevlate
 821                                  * the division on comparison between
 822                                  * total and tmin.
 823                                  */
 824                                 /* total = total / (2.0*fnseqs2); */
 825
 826                                 if(total < tmin) {
 827                                         tmin = total;
 828                                         mini = ii;
 829                                         minj = jj;
 830                                 }
 831                         }
 832                 }
 833
 834                 /* MEMO: always ii < jj in avobe loop, so mini < minj */
 835
 836 /*.................compute branch lengths and print the results ........*/
 837
 838
 839                 dio = djo = 0.0;
 840
 841                 /* IMPROVEMENT 1, STEP 4 : use sum value */
 842                 dio = sum_cols[mini] + sum_rows[mini];
 843                 djo = sum_cols[minj] + sum_rows[minj];
 844
 845                 dmin = tmat[mini][minj];
 846                 dio = (dio - dmin) / fnseqs2;
 847                 djo = (djo - dmin) / fnseqs2;
 848                 bi = (dmin + dio - djo) * 0.5;
 849                 bj = dmin - bi;
 850                 bi = bi - av[mini];
 851                 bj = bj - av[minj];
 852
 853                 if( av[mini] > 0.0 )
 854                         typei = 0;
 855                 else
 856                         typei = 1;
 857                 if( av[minj] > 0.0 )
 858                         typej = 0;
 859                 else
 860                         typej = 1;
 861
 862
 863 /*
 864    set negative branch lengths to zero.  Also set any tiny positive
 865    branch lengths to zero.
 866 */              if( fabs(bi) < 0.0001) bi = 0.0;
 867                 if( fabs(bj) < 0.0001) bj = 0.0;
 868
 869
 870                 left_branch[nc] = bi;
 871                 right_branch[nc] = bj;
 872
 873                 for(i=1; i<=last_seq-first_seq+1; i++)
 874                         tree_description[nc][i] = 0;
 875
 876                 if(typei == 0) {
 877                         for(i=nc-1; i>=1; i--)
 878                                 if(tree_description[i][mini] == 1) {
 879                                         for(j=1; j<=last_seq-first_seq+1; j++)
 880                                              if(tree_description[i][j] == 1)
 881                                                     tree_description[nc][j] = 1;
 882                                         break;
 883                                 }
 884                 }
 885                 else
 886                         tree_description[nc][mini] = 1;
 887
 888                 if(typej == 0) {
 889                         for(i=nc-1; i>=1; i--)
 890                                 if(tree_description[i][minj] == 1) {
 891                                         for(j=1; j<=last_seq-first_seq+1; j++)
 892                                              if(tree_description[i][j] == 1)
 893                                                     tree_description[nc][j] = 1;
 894                                         break;
 895                                 }
 896                 }
 897                 else
 898                         tree_description[nc][minj] = 1;
 899
 900
 901 /*
 902    Here is where the -0.00005 branch lengths come from for 3 or more
 903    identical seqs.
 904 */
 905 /*              if(dmin <= 0.0) dmin = 0.0001; */
 906                 if(dmin <= 0.0) dmin = 0.000001;
 907                 av[mini] = dmin * 0.5;
 908
 909 /*........................Re-initialisation................................*/
 910
 911                 fnseqs = fnseqs - 1.0;
 912                 tkill[minj] = 1;
 913
 914                 /* IMPROVEMENT 2, STEP 3 : Remove tvalid[minj] from chain list. */
 915                 /* [ Before ]
 916                  *  +---------+        +---------+        +---------+
 917                  *  |prev     |<-------|prev     |<-------|prev     |<---
 918                  *  |    n    |        | n(=minj)|        |    n    |
 919                  *  |     next|------->|     next|------->|     next|----
 920                  *  +---------+        +---------+        +---------+
 921                  *
 922                  * [ After ]
 923                  *  +---------+                           +---------+
 924                  *  |prev     |<--------------------------|prev     |<---
 925                  *  |    n    |                           |    n    |
 926                  *  |     next|-------------------------->|     next|----
 927                  *  +---------+                           +---------+
 928                  *                     +---------+
 929                  *              NULL---|prev     |
 930                  *                     | n(=minj)|
 931                  *                     |     next|---NULL
 932                  *                     +---------+
 933                  */
 934                 (tvalid[minj].prev)->next = tvalid[minj].next;
 935                 if (tvalid[minj].next != NULL) {
 936                         (tvalid[minj].next)->prev = tvalid[minj].prev;
 937                 }
 938                 tvalid[minj].prev = tvalid[minj].next = NULL;
 939
 940                 /* IMPROVEMENT 1, STEP 5 : re-calculate sum values. */
 941                 for(lpj=tvalid[0].next ; lpj != NULL ; lpj = lpj->next) {
 942                         double tmp_di = 0.0;
 943                         double tmp_dj = 0.0;
 944                         j = lpj->n;
 945
 946                         /*
 947                          * subtrace a score value related with 'minj' from
 948                          * sum arrays .
 949                          */
 950                         if (j < minj) {
 951                                 tmp_dj = tmat[j][minj];
 952                                 sum_cols[j] -= tmp_dj;
 953                         } else if (j > minj) {
 954                                 tmp_dj = tmat[minj][j];
 955                                 sum_rows[j] -= tmp_dj;
 956                         } /* nothing to do when j is equal to minj. */
 957
 958
 959                         /*
 960                          * subtrace a score value related with 'mini' from
 961                          * sum arrays .
 962                          */
 963                         if (j < mini) {
 964                                 tmp_di = tmat[j][mini];
 965                                 sum_cols[j] -= tmp_di;
 966                         } else if (j > mini) {
 967                                 tmp_di = tmat[mini][j];
 968                                 sum_rows[j] -= tmp_di;
 969                         } /* nothing to do when j is equal to mini. */
 970
 971                         /*
 972                          * calculate a score value of the new inner node.
 973                          * then, store it temporary to join[] array.
 974                          */
 975                         join[j] = (tmp_dj + tmp_di) * 0.5;
 976                 }
 977
 978                 /*
 979                  * 1)
 980                  * Set the score values (stored in join[]) into the matrix,
 981                  * row/column position is 'mini'.
 982                  * 2)
 983                  * Add a score value of the new inner node to sum arrays.
 984                  */
 985                 for(lpj=tvalid[0].next ; lpj != NULL; lpj = lpj->next) {
 986                         j = lpj->n;
 987                         if (j < mini) {
 988                                 tmat[j][mini] = join[j];
 989                                 sum_cols[j] += join[j];
 990                         } else if (j > mini) {
 991                                 tmat[mini][j] = join[j];
 992                                 sum_rows[j] += join[j];
 993                         } /* nothing to do when j is equal to mini. */
 994                 }
 995
 996                 /* Re-calculate sum_rows[mini],sum_cols[mini]. */
 997                 sum_cols[mini] = sum_rows[mini] = 0.0;
 998
 999                 /* calculate sum_rows[mini] */
1000                 da = 0.0;
1001                 for(lpj=tvalid[0].next ; lpj->n < mini ; lpj = lpj->next) {
1002                       da += join[lpj->n];
1003                 }
1004                 sum_rows[mini] = da;
1005
1006                 /* skip if 'lpj->n' is equal to 'mini' */
1007                 if ((lpj != NULL) && (lpj->n == mini)) {
1008                         lpj = lpj->next;
1009                 }
1010
1011                 /* calculate sum_cols[mini] */
1012                 da = 0.0;
1013                 for( ; lpj != NULL; lpj = lpj->next) {
1014                       da += join[lpj->n];
1015                 }
1016                 sum_cols[mini] = da;
1017
1018                 /*
1019                  * Clean up sum_rows[minj], sum_cols[minj] and score matrix
1020                  * related with 'minj'.
1021                  */
1022                 sum_cols[minj] = sum_rows[minj] = 0.0;
1023                 for(j=1; j<=last_seq-first_seq+1; ++j)
1024                         tmat[minj][j] = tmat[j][minj] = join[j] = 0.0;
1025
1026
1027 /****/  }                                               /*end main cycle**/
1028
1029 /******************************Last Cycle (3 Seqs. left)********************/
1030
1031         nude = 1;
1032
1033         for(lpi=tvalid[0].next; lpi != NULL; lpi = lpi->next) {
1034                 l[nude] = lpi->n;
1035                 ++nude;
1036         }
1037
1038         b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5;
1039         b2 =  tmat[l[1]][l[2]] - b1;
1040         b3 =  tmat[l[1]][l[3]] - b1;
1041
1042         branch[1] = b1 - av[l[1]];
1043         branch[2] = b2 - av[l[2]];
1044         branch[3] = b3 - av[l[3]];
1045
1046 /* Reset tiny negative and positive branch lengths to zero */
1047         if( fabs(branch[1]) < 0.0001) branch[1] = 0.0;
1048         if( fabs(branch[2]) < 0.0001) branch[2] = 0.0;
1049         if( fabs(branch[3]) < 0.0001) branch[3] = 0.0;
1050
1051         left_branch[last_seq-first_seq+1-2] = branch[1];
1052         left_branch[last_seq-first_seq+1-1] = branch[2];
1053         left_branch[last_seq-first_seq+1]   = branch[3];
1054
1055         for(i=1; i<=last_seq-first_seq+1; i++)
1056                 tree_description[last_seq-first_seq+1-2][i] = 0;
1057
1058
1059         for(i=1; i<=3; ++i) {
1060            if( av[l[i]] > 0.0) {
1061
1062                 for(k=last_seq-first_seq+1-3; k>=1; k--)
1063                         if(tree_description[k][l[i]] == 1) {
1064                                 for(j=1; j<=last_seq-first_seq+1; j++)
1065                                         if(tree_description[k][j] == 1)
1066                                             tree_description[last_seq-first_seq+1-2][j] = i;
1067                                 break;
1068                         }
1069            }
1070            else  {
1071                 tree_description[last_seq-first_seq+1-2][l[i]] = i;
1072            }
1073            if(i < 3) {;
1074            }
1075         }
1076         ckfree(sum_cols);
1077         ckfree(sum_rows);
1078         ckfree(join);
1079         ckfree(tvalid);
1080 }
1081 //////////////////////////////////////////////////////////////////////////////
1082 //
1083 //                              UPGMA_aln
1084 //////////////////////////////////////////////////////////////////////////////
1085
1086 Alignment * upgma_merge_aln_rows (Alignment *A, int *ns, int **ls,int N, int**mat,int *used, int *n,  Constraint_list *CL);
1087 int upgma_pair_wise (Alignment *A, int *ls0, int ns0, int *ls2, int ns2, Constraint_list *CL);
1088
1089
1090 Alignment * upgma_tree_aln  ( Alignment*A, int nseq, Constraint_list *CL)
1091 {
1092   int a, b,n, *used;
1093   static int **mat;
1094   int **ls;
1095   int *ns;
1096   nseq=(CL->S)->nseq;
1097   mat=declare_int (nseq, nseq);
1098   ls=declare_int  (nseq,nseq);
1099   ns=vcalloc (nseq,sizeof (int));
1100
1101   for (a=0; a<nseq-1; a++)
1102     for (b=a+1; b<nseq; b++)
1103       {
1104         ns[a]=ns[b]=1;
1105         ls[a][0]=a;
1106         ls[b][0]=b;
1107         mat[a][b]=mat[b][a]=upgma_pair_wise(A,ls[a],ns[a],ls[b],ns[b],CL);
1108         HERE ("%d %d", a, b, mat[a][b]);
1109       }
1110
1111   used=vcalloc (nseq, sizeof (int));
1112   n=nseq;
1113   while (n>1)
1114     {
1115       upgma_merge_aln_rows (A,ns, ls,nseq, mat, used, &n,CL);
1116     }
1117   print_aln (A);
1118   HERE ("finished");
1119   free_int ( mat, -1);
1120   free_int (ls, -1);
1121   vfree (ns);
1122   return A;
1123 }
1124
1125 Alignment * upgma_merge_aln_rows (Alignment *A, int *ns, int **ls,int N, int**mat,int *used, int *n,  Constraint_list *CL)
1126 {
1127
1128   int a, b, w, best_a, best_b, set;
1129   float best_s;
1130
1131   for (set=0,a=0; a<N-1; a++)
1132     {
1133       if (used[a])continue;
1134       for (b=a+1; b<N; b++)
1135         {
1136           if (used[b])continue;
1137           w=mat[a][b];
1138           if ( !set || w>best_s)
1139             {
1140               best_s=w;
1141               best_a=a;
1142               best_b=b;
1143               set=1;
1144             }
1145         }
1146     }
1147   used[best_b]=1;
1148
1149   //merge a and b
1150   mat[best_a][best_b]=upgma_pair_wise (A, ls[best_a], ns[best_a], ls[best_b], ns[best_b], CL);
1151   for (a=0; a<ns[best_b]; a++)ls[best_a][ns[best_a]++]=ls[best_b][a];
1152   ns[best_b]=0;
1153
1154   //update the a row
1155   for (a=0; a< A->nseq; a++)
1156     {
1157       if (a!=best_a && !used[a])
1158         mat[best_a][a]=mat[a][best_a]=upgma_pair_wise (A, ls[best_a], ns[best_a], ls[a], ns[a], CL);
1159     }
1160
1161   HERE ("DONE");
1162
1163   n[0]--;
1164   return A;
1165 }
1166 int upgma_pair_wise (Alignment *A, int *ls0, int ns0, int *ls1, int ns1, Constraint_list *CL)
1167 {
1168   static int **ls;
1169   static int *ns;
1170   static int *fl;
1171   int a, b, n;
1172
1173   if ( !ls )
1174     {
1175       ls=vcalloc (2, sizeof (int*));
1176       ns=vcalloc (2, sizeof (int));
1177       fl=vcalloc ((CL->S)->nseq, sizeof (int));
1178     }
1179   ls[0]=ls0;
1180   ls[1]=ls1;
1181   ns[0]=ns0; ns[1]=ns1;
1182
1183   fprintf ( stderr, "\n");
1184   for (a=0; a<ns0; a++)
1185     fprintf ( stderr, "%d", ls0[a]);
1186   fprintf ( stderr, "\n");
1187   for (a=0; a<ns1; a++)
1188     fprintf ( stderr, "%d", ls1[a]);
1189
1190   ungap_sub_aln (A, ns[0], ls[0]);
1191   ungap_sub_aln (A, ns[1], ls[1]);
1192   HERE ("Align");
1193   pair_wise (A, ns, ls, CL);
1194   HERE ("Done");
1195
1196   for (n=0,a=0;a<2; a++)
1197     for (b=0; b<ns[a]; b++)
1198       fl[n++]=ls[a][b];
1199
1200   return sub_aln2ecl_raw_score (A,CL,n, fl);
1201 }
1202
1203 //////////////////////////////////////////////////////////////////////////////
1204 //
1205 //                              UPGMA
1206 ///////////////////////////////////////////////////////////////////////////////
1207
1208
1209 NT_node ** int_dist2upgma_tree (int **mat, Alignment *A, int nseq, char *fname)
1210 {
1211   NT_node *NL, T;
1212   int a, n, *used;
1213   int tot_node;
1214   if (upgma_node_heap (NULL))
1215     {
1216       printf_exit ( EXIT_FAILURE,stderr, "\nERROR: non empty heap in upgma [FATAL]");
1217     }
1218   NL=vcalloc (nseq, sizeof (NT_node));
1219
1220   for (a=0; a<A->nseq; a++)
1221     {
1222       NL[a]=new_declare_tree_node ();
1223       upgma_node_heap (NL[a]);
1224       sprintf (NL[a]->name, "%s", A->name[a]);
1225       NL[a]->isseq=1;
1226       NL[a]->leaf=1;
1227     }
1228
1229   used=vcalloc ( A->nseq, sizeof (int));
1230   n=A->nseq;
1231   while (n>1)
1232     {
1233       T=upgma_merge (mat, NL,used, &n, A->nseq);
1234     }
1235
1236   vfree (used);
1237   vfclose (print_tree (T, "newick", vfopen (fname, "w")));
1238   upgma_node_heap (NULL);
1239   vfree (NL);
1240
1241   return read_tree (fname,&tot_node,A->nseq,  A->name);
1242 }
1243 NT_node upgma_merge (int **mat, NT_node *NL, int *used, int *n, int N)
1244 {
1245   NT_node P, LC, RC;
1246   int a, b, w, best_a, best_b, set;
1247   float best_s;
1248   P=new_declare_tree_node();
1249   upgma_node_heap (P);
1250
1251   for (set=0,a=0; a<N-1; a++)
1252     {
1253       if (used[a])continue;
1254       for (b=a+1; b<N; b++)
1255         {
1256           if (used[b])continue;
1257           w=mat[a][b];
1258           if ( !set || w<best_s)
1259             {
1260               best_s=w;
1261               best_a=a;
1262               best_b=b;
1263               set=1;
1264             }
1265         }
1266     }
1267
1268   for (a=0; a<N; a++)
1269     {
1270       if (!used[a])mat[best_a][a]=mat[a][best_a]=(mat[best_b][a]+mat[best_a][a])/2;
1271     }
1272   used[best_b]=1;
1273
1274   LC=NL[best_a];
1275   RC=NL[best_b];
1276   best_s/=(float)100;
1277   LC->dist=RC->dist=best_s;
1278   LC->parent=RC->parent=P;
1279   P->left=LC;
1280   P->right=RC;
1281   NL[best_a]=P;
1282   n[0]--;
1283   return P;
1284
1285 }
1286
1287
1288 //////////////////////////////////////////////////////////////////////////////
1289 //
1290 //                              SPLIT UPGMA
1291 ///////////////////////////////////////////////////////////////////////////////
1292 int upgma_node_heap (NT_node X);
1293 int upgma_node_heap (NT_node X)
1294 {
1295   static int n;
1296   static NT_node *h;
1297   if ( X==NULL)
1298     {
1299       int a,r;
1300       if (n==0) return 0;
1301       for (a=0; a<n; a++)
1302         {
1303           free_tree_node (h[a]);
1304         }
1305       vfree (h);
1306       h=NULL;
1307       r=n;
1308       n=0;
1309       return r;
1310     }
1311   else
1312     {
1313       h=vrealloc (h, sizeof (NT_node)*(n+1));
1314       h[n++]=X;
1315     }
1316   return n;
1317 }
1318 NT_node split2upgma_tree (Split **S, Alignment *A, int nseq, char *fname)
1319 {
1320   NT_node *NL, T;
1321   int a, n, *used;
1322
1323   NL=vcalloc (nseq, sizeof (NT_node));
1324   for (a=0; a<A->nseq; a++)
1325     {
1326
1327       NL[a]=new_declare_tree_node ();
1328       NL[a]->lseq2=vcalloc (A->nseq+1, sizeof (int));
1329       NL[a]->lseq2[a]=1;
1330       sprintf (NL[a]->name, "%s", A->name[a]);
1331       NL[a]->isseq=1;
1332       NL[a]->leaf=1;
1333       NL[a]->dist=1;
1334       upgma_node_heap (NL[a]);
1335     }
1336   used=vcalloc ( A->nseq, sizeof (int));
1337   n=A->nseq;
1338   while (n>1)
1339     {
1340       T=split_upgma_merge (A,S, NL,used, &n, A->nseq);
1341     }
1342   vfree (used);
1343   fprintf ( stdout, "\n");
1344   vfclose (print_tree (T, "newick", vfopen (fname, "w")));
1345   upgma_node_heap (NULL);
1346   return T;
1347 }
1348 NT_node split_upgma_merge (Alignment *A, Split **S, NT_node *NL, int *used, int *n, int N)
1349 {
1350   NT_node P, LC, RC;
1351   int a, b, w, best_a, best_b, set;
1352   float best_s;
1353   static int **mat;
1354
1355   if (!mat)
1356     {
1357       mat=declare_int (N, N);
1358       for (a=0; a<N-1; a++)
1359         for ( b=a+1; b<N; b++)
1360           {
1361             mat[a][b]=get_split_dist (A, NL[a], NL[b], S);
1362           }
1363     }
1364
1365   P=new_declare_tree_node();
1366   upgma_node_heap (P);
1367   P->lseq2=vcalloc (N, sizeof (int));
1368   for (set=0,a=0; a<N-1; a++)
1369     {
1370       if (used[a])continue;
1371       for (b=a+1; b<N; b++)
1372         {
1373           if (used[b])continue;
1374           w=mat[a][b];
1375           if ( !set || w<best_s)
1376             {
1377               best_s=w;
1378               best_a=a;
1379               best_b=b;
1380               set=1;
1381             }
1382         }
1383     }
1384
1385
1386
1387
1388   LC=NL[best_a];
1389   RC=NL[best_b];
1390   best_s/=100;
1391
1392   P->dist=1-best_s;
1393   LC->parent=RC->parent=P;
1394   P->left=LC;
1395   P->right=RC;
1396   P->bootstrap=best_s*100;
1397   used[best_b]=1;
1398
1399   n[0]--;
1400
1401   for (a=0; a<A->nseq; a++)
1402     {
1403       P->lseq2[a]=(LC->lseq2[a] || RC->lseq2[a])?1:0;
1404     }
1405
1406   for (a=0; a<N; a++)
1407     {
1408       if (!used[a])mat[best_a][a]=mat[a][best_a]=(int)get_split_dist(A,P, NL[a], S);
1409     }
1410   NL[best_a]=P;
1411
1412   return P;
1413
1414 }
1415 float get_split_dist ( Alignment *A, NT_node L, NT_node R, Split **S)
1416 {
1417   static char *split;
1418
1419
1420   int n,a;
1421
1422   if (!split)
1423     {
1424       split=vcalloc ( A->nseq+1, sizeof (char));
1425     }
1426
1427
1428
1429   for ( a=0; a<A->nseq; a++)
1430     split[a]=((L->lseq2[a] || R->lseq2[a])?1:0)+'0';
1431
1432   n=0;
1433   while (S[n])
1434     {
1435       float score;
1436       if ( strm (S[n]->split,split))
1437         {
1438           return score=100-S[n]->score;
1439         }
1440       n++;
1441     }
1442   return 100;
1443 }
1444 /*********************************COPYRIGHT NOTICE**********************************/
1445 /*© Centro de Regulacio Genomica */
1446 /*and */
1447 /*Cedric Notredame */
1448 /*Tue Oct 27 10:12:26 WEST 2009. */
1449 /*All rights reserved.*/
1450 /*This file is part of T-COFFEE.*/
1451 /**/
1452 /*    T-COFFEE is free software; you can redistribute it and/or modify*/
1453 /*    it under the terms of the GNU General Public License as published by*/
1454 /*    the Free Software Foundation; either version 2 of the License, or*/
1455 /*    (at your option) any later version.*/
1456 /**/
1457 /*    T-COFFEE is distributed in the hope that it will be useful,*/
1458 /*    but WITHOUT ANY WARRANTY; without even the implied warranty of*/
1459 /*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the*/
1460 /*    GNU General Public License for more details.*/
1461 /**/
1462 /*    You should have received a copy of the GNU General Public License*/
1463 /*    along with Foobar; if not, write to the Free Software*/
1464 /*    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/
1465 /*...............................................                                                                                      |*/
1466 /*  If you need some more information*/
1467 /*  cedric.notredame@europe.com*/
1468 /*...............................................                                                                                                                                     |*/
1469 /**/
1470 /**/
1471 /*      */
1472 /*********************************COPYRIGHT NOTICE**********************************/