4 * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
15 /****************************************************************************
16 * [ Improvement ideas in fast_nj_tree() ] by DDBJ & FUJITSU Limited.
17 * written by Tadashi Koike
18 * (takoike@genes.nig.ac.jp)
20 * <IMPROVEMENT 1> : Store the value of sum of the score to temporary array,
21 * and use again and again.
23 * In the main cycle, these are calculated again and again :
24 * diq = sum of distMat[n][ii] (n:1 to lastSeq-firstSeq+1),
25 * djq = sum of distMat[n][jj] (n:1 to lastSeq-firstSeq+1),
26 * dio = sum of distMat[n][mini] (n:1 to lastSeq-firstSeq+1),
27 * djq = sum of distMat[n][minj] (n:1 to lastSeq-firstSeq+1)
28 * // 'lastSeq' and 'firstSeq' are both constant values //
29 * and the result of above calculations is always same until
30 * a best pair of neighbour nodes is joined.
32 * So, we change the logic to calculate the sum[i] (=sum of distMat[n][i]
33 * (n:1 to lastSeq-firstSeq+1)) and store it to array, before
34 * beginning to find a best pair of neighbour nodes, and after that
35 * we use them again and again.
39 * +---+---+---+---+---+
41 * +---+---+---+---+---+
42 * 2 | | | | | | 1) calculate sum of distMat[n][i]
43 * +---+---+---+---+---+ (n: 1 to lastSeq-firstSeq+1)
44 * 3 | | | | | | 2) store that sum value to sum[i]
45 * +---+---+---+---+---+
46 * 4 | | | | | | 3) use sum[i] during finding a best
47 * +---+---+---+---+---+ pair of neibour nodes.
49 * +---+---+---+---+---+
51 * V V V V V Calculate sum , and store it to sum[i]
52 * +---+---+---+---+---+
54 * +---+---+---+---+---+
56 * At this time, we thought that we use upper triangle of the matrix
57 * because distMat[i][j] is equal to distMat[j][i] and distMat[i][i] is equal
58 * to zero. Therefore, we prepared sum_rows[i] and sum_cols[i] instead
59 * of sum[i] for storing the sum value.
62 * 1 2 3 4 5 sum_cols[i]
63 * +---+---+---+---+---+ +---+
64 * 1 | # | # | # | # | --> | | ... sum of distMat[1][2..5]
65 * + - +---+---+---+---+ +---+
66 * 2 | # | # | # | --> | | ... sum of distMat[2][3..5]
67 * + - + - +---+---+---+ +---+
68 * 3 | # | # | --> | | ... sum of distMat[3][4..5]
69 * + - + - + - +---+---+ +---+
70 * 4 | # | --> | | ... sum of distMat[4][5]
71 * + - + - + - + - +---+ +---+
72 * 5 | --> | | ... zero
73 * + - + - + - + - + - + +---+
75 * V V V V V Calculate sum , sotre to sum[i]
76 * +---+---+---+---+---+
77 * sum_rows[i] | | | | | |
78 * +---+---+---+---+---+
80 * | | | | +----- sum of distMat[1..4][5]
81 * | | | +--------- sum of distMat[1..3][4]
82 * | | +------------- sum of distMat[1..2][3]
83 * | +----------------- sum of distMat[1][2]
84 * +--------------------- zero
86 * And we use (sum_rows[i] + sum_cols[i]) instead of sum[i].
89 * <IMPROVEMENT 2> : We manage valid nodes with chain list, instead of
90 * tkill[i] flag array.
92 * In original logic, invalid(killed?) nodes after nodes-joining
93 * are managed with tkill[i] flag array (set to 1 when killed).
94 * By this method, it is conspicuous to try next node but skip it
95 * at the latter of finding a best pair of neighbor nodes.
97 * So, we thought that we managed valid nodes by using a chain list
100 * 1) declare the list structure.
102 * sint n; // entry number of node.
103 * void *prev; // pointer to previous entry.
104 * void *next; // pointer to next entry.
106 * 2) construct a valid node list.
108 * +-----+ +-----+ +-----+ +-----+ +-----+
109 * NULL<-|prev |<---|prev |<---|prev |<---|prev |<- - - -|prev |
110 * | 0 | | 1 | | 2 | | 3 | | n |
111 * | next|--->| next|--->| next|--->| next|- - - ->| next|->NULL
112 * +-----+ +-----+ +-----+ +-----+ +-----+
114 * 3) when finding a best pair of neighbor nodes, we use
115 * this chain list as loop counter.
117 * 4) If an entry was killed by node-joining, this chain list is
118 * modified to remove that entry.
120 * EX) remove the entry No 2.
121 * +-----+ +-----+ +-----+ +-----+
122 * NULL<-|prev |<---|prev |<--------------|prev |<- - - -|prev |
123 * | 0 | | 1 | | 3 | | n |
124 * | next|--->| next|-------------->| next|- - - ->| next|->NULL
125 * +-----+ +-----+ +-----+ +-----+
132 * By this method, speed is up at the latter of finding a best pair of
136 * <IMPROVEMENT 3> : Cut the frequency of division.
138 * At comparison between 'total' and 'tmin' in the main cycle, total is
139 * divided by (2.0*fnseqs2) before comparison. If N nodes are available,
140 * that division happen (N*(N-1))/2 order.
142 * We thought that the comparison relation between tmin and total/(2.0*fnseqs2)
143 * is equal to the comparison relation between (tmin*2.0*fnseqs2) and total.
144 * Calculation of (tmin*2.0*fnseqs2) is only one time. so we stop dividing
145 * a total value and multiply tmin and (tmin*2.0*fnseqs2) instead.
148 * <IMPROVEMENT 4> : some transformation of the equation (to cut operations).
150 * We transform an equation of calculating 'total' in the main cycle.
154 void NJTree::generateTree(clustalw::PhyloTree* phyTree,
155 clustalw::DistMatrix* distMat,
156 clustalw::SeqInfo* seqInfo,
166 int nc, mini, minj, j, ii, jj;
167 double fnseqs, fnseqs2 = 0, sumd;
168 double diq, djq, dij, dio, djo, da;
169 double tmin, total, dmin;
170 double bi, bj, b1, b2, b3, branch[4];
171 int typei, typej; /* 0 = node; 1 = OTU */
173 int firstSeq = seqInfo->firstSeq;
174 int lastSeq = seqInfo->lastSeq;
175 int numSeqs = seqInfo->numSeqs;
177 /* IMPROVEMENT 1, STEP 0 : declare variables */
178 double *sumCols, *sumRows, *join;
180 sumCols = new double[numSeqs + 1];
181 sumRows = new double[numSeqs + 1];
182 join = new double[numSeqs + 1];
184 /* IMPROVEMENT 2, STEP 0 : declare variables */
186 typedef struct _ValidNodeID
189 struct _ValidNodeID *prev;
190 struct _ValidNodeID *next;
192 ValidNodeID *tvalid, *lpi, *lpj, *lpii, *lpjj, *lp_prev, *lp_next;
195 * correspondence of the loop counter variables.
196 * i .. lpi->n, ii .. lpii->n
197 * j .. lpj->n, jj .. lpjj->n
200 fnseqs = (double)lastSeq - firstSeq + 1;
202 /*********************** First initialisation ***************************/
206 (*log) << "\n\n\t\t\tNeighbor-joining Method\n"
207 << "\n Saitou, N. and Nei, M. (1987)" << " The Neighbor-joining Method:"
208 << "\n A New Method for Reconstructing Phylogenetic Trees."
209 << "\n Mol. Biol. Evol., 4(4), 406-425\n" << "\n\n This is an UNROOTED tree\n"
210 << "\n Numbers in parentheses are branch lengths\n\n";
217 (*log) << "Cycle 1 = SEQ: 1 (" << setw(9) << setprecision(5)
218 << (*distMat)(firstSeq, firstSeq + 1)
219 << ") joins SEQ: 2 ("
220 << setw(9) << setprecision(5)
221 << (*distMat)(firstSeq, firstSeq + 1) << ")";
228 /* IMPROVEMENT 1, STEP 1 : Allocate memory */
229 /* IMPROVEMENT 1, STEP 2 : Initialize arrays to 0 */
230 phyTree->leftBranch.resize(numSeqs + 2, 0.0);
231 phyTree->rightBranch.resize(numSeqs + 2, 0.0);
232 tkill.resize(numSeqs + 1, 0);
233 av.resize(numSeqs + 1, 0.0);
235 /* IMPROVEMENT 2, STEP 1 : Allocate memory */
237 tvalid = new ValidNodeID[numSeqs + 1];
239 /* tvalid[0] is special entry in array. it points a header of valid entry list */
241 tvalid[0].prev = NULL;
242 tvalid[0].next = &tvalid[1];
244 /* IMPROVEMENT 2, STEP 2 : Construct and initialize the entry chain list */
245 for (i = 1, loop_limit = lastSeq - firstSeq + 1, lpi = &tvalid[1],
246 lp_prev = &tvalid[0], lp_next = &tvalid[2]; i <= loop_limit; ++i,
247 ++lpi, ++lp_prev, ++lp_next)
249 (*distMat)(i, i) = av[i] = 0.0;
255 tvalid[loop_limit].next = NULL;
258 * IMPROVEMENT 1, STEP 3 : Calculate the sum of score value that
259 * is sequence[i] to others.
263 for (lpj = tvalid[0].next; lpj != NULL; lpj = lpj->next)
265 double tmp_sum = 0.0;
267 /* calculate sumRows[j] */
268 for (lpi = tvalid[0].next; lpi->n < j; lpi = lpi->next)
271 matValue = (*distMat)(i, j);
272 tmp_sum = tmp_sum + matValue;
274 sumRows[j] = tmp_sum;
277 /* Set lpi to that lpi->n is greater than j */
278 if ((lpi != NULL) && (lpi->n == j))
282 /* calculate sumCols[j] */
283 for (; lpi != NULL; lpi = lpi->next)
286 tmp_sum += (*distMat)(j, i);
288 sumCols[j] = tmp_sum;
291 /*********************** Enter The Main Cycle ***************************/
293 for (nc = 1, loop_limit = (lastSeq - firstSeq + 1-3); nc <= loop_limit; ++nc)
296 /* IMPROVEMENT 1, STEP 4 : use sum value */
297 for (lpj = tvalid[0].next; lpj != NULL; lpj = lpj->next)
299 sumd += sumCols[lpj->n];
302 /* IMPROVEMENT 3, STEP 0 : multiply tmin and 2*fnseqs2 */
303 fnseqs2 = fnseqs - 2.0; /* Set fnseqs2 at this point. */
304 tmin = 99999.0 * 2.0 * fnseqs2;
306 /*.................compute SMATij values and find the smallest one ........*/
310 /* jj must starts at least 2 */
311 if ((tvalid[0].next != NULL) && (tvalid[0].next->n == 1))
313 lpjj = tvalid[0].next->next;
317 lpjj = tvalid[0].next;
320 for (; lpjj != NULL; lpjj = lpjj->next)
323 for (lpii = tvalid[0].next; lpii->n < jj; lpii = lpii->next)
328 /* IMPROVEMENT 1, STEP 4 : use sum value */
329 diq = sumCols[ii] + sumRows[ii];
330 djq = sumCols[jj] + sumRows[jj];
332 * always ii < jj in this point. Use upper
333 * triangle of score matrix.
335 dij = (*distMat)(ii, jj);
337 * IMPROVEMENT 3, STEP 1 : fnseqs2 is
338 * already calculated.
340 /* fnseqs2 = fnseqs - 2.0 */
342 /* IMPROVEMENT 4 : transform the equation */
343 /*-------------------------------------------------------------------*
344 * OPTIMIZE of expression 'total = d2r + fnseqs2*dij + dr*2.0' *
345 * total = d2r + fnseq2*dij + 2.0*dr *
346 * = d2r + fnseq2*dij + 2(sumd - dij - d2r) *
347 * = d2r + fnseq2*dij + 2*sumd - 2*dij - 2*d2r *
348 * = fnseq2*dij + 2*sumd - 2*dij - 2*d2r + d2r *
349 * = fnseq2*dij + 2*sumd - 2*dij - d2r *
350 * = fnseq2*dij + 2*sumd - 2*dij - (diq + djq - 2*dij) *
351 * = fnseq2*dij + 2*sumd - 2*dij - diq - djq + 2*dij *
352 * = fnseq2*dij + 2*sumd - 2*dij + 2*dij - diq - djq *
353 * = fnseq2*dij + 2*sumd - diq - djq *
354 *-------------------------------------------------------------------*/
355 total = fnseqs2 * dij + 2.0 * sumd - diq - djq;
357 * IMPROVEMENT 3, STEP 2 : abbrevlate
358 * the division on comparison between
361 /* total = total / (2.0*fnseqs2); */
372 /* MEMO: always ii < jj in avobe loop, so mini < minj */
374 /*.................compute branch lengths and print the results ........*/
379 /* IMPROVEMENT 1, STEP 4 : use sum value */
380 dio = sumCols[mini] + sumRows[mini];
381 djo = sumCols[minj] + sumRows[minj];
383 dmin = (*distMat)(mini, minj);
384 dio = (dio - dmin) / fnseqs2;
385 djo = (djo - dmin) / fnseqs2;
386 bi = (dmin + dio - djo) *0.5;
392 (*log) << endl << "*** cycle " << nc << endl;
393 (*log) << "dmin = " << setw(9) << right << setprecision(9) << dmin << endl;
394 (*log) << "dio = " << setw(9) << right << setprecision(9) << dio << endl;
395 (*log) << "djo = " << setw(9) << right << setprecision(9) << djo << endl;
396 (*log) << "bi = " << setw(9) << right << setprecision(9) << bi << endl;
397 (*log) << "bj = " << setw(9) << right << setprecision(9) << bj << endl;
398 (*log) << "mini = " << setw(9) << mini << endl;
399 (*log) << "minj = " << setw(9) << minj << endl;
400 (*log) << "av[minj] = " << setw(9) << right << setprecision(9) << av[minj] << endl;
401 (*log) << "av[minj] = " << setw(9) << right << setprecision(9) << av[minj] << endl;
422 (*log) << "\n Cycle" << setw(4) << nc << " = ";
426 set (tiny? (AW&FS)) negative branch lengths to zero. Also set any tiny positive
427 branch lengths to zero.
429 if (fabs(bi) < 0.0001)
433 if (fabs(bj) < 0.0001)
442 (*log) << "Node:" << setw(4) << mini << " (" << setw(9) << setprecision(5)
447 (*log) << " SEQ:" << setw(4) << mini << " (" << setw(9) << setprecision(5)
453 (*log) << "Node:" << setw(4) << minj << " (" << setw(9) << setprecision(5)
458 (*log) << " SEQ:" << setw(4) << minj << " (" << setw(9) << setprecision(5)
465 phyTree->leftBranch[nc] = bi;
466 phyTree->rightBranch[nc] = bj;
468 for (i = 1; i <= lastSeq - firstSeq + 1; i++)
470 phyTree->treeDesc[nc][i] = 0;
475 for (i = nc - 1; i >= 1; i--)
476 if (phyTree->treeDesc[i][mini] == 1)
478 for (j = 1; j <= lastSeq - firstSeq + 1; j++)
479 if (phyTree->treeDesc[i][j] == 1)
481 phyTree->treeDesc[nc][j] = 1;
488 phyTree->treeDesc[nc][mini] = 1;
493 for (i = nc - 1; i >= 1; i--)
494 if (phyTree->treeDesc[i][minj] == 1)
496 for (j = 1; j <= lastSeq - firstSeq + 1; j++)
497 if (phyTree->treeDesc[i][j] == 1)
499 phyTree->treeDesc[nc][j] = 1;
506 phyTree->treeDesc[nc][minj] = 1;
511 Here is where the -0.00005 branch lengths come from for 3 or more
514 /* if(dmin <= 0.0) dmin = 0.0001; */
519 av[mini] = dmin * 0.5;
521 /*........................Re-initialisation................................*/
523 fnseqs = fnseqs - 1.0;
526 /* IMPROVEMENT 2, STEP 3 : Remove tvalid[minj] from chain list. */
528 * +---------+ +---------+ +---------+
529 * |prev |<-------|prev |<-------|prev |<---
530 * | n | | n(=minj)| | n |
531 * | next|------->| next|------->| next|----
532 * +---------+ +---------+ +---------+
535 * +---------+ +---------+
536 * |prev |<--------------------------|prev |<---
538 * | next|-------------------------->| next|----
539 * +---------+ +---------+
546 (tvalid[minj].prev)->next = tvalid[minj].next;
547 if (tvalid[minj].next != NULL)
549 (tvalid[minj].next)->prev = tvalid[minj].prev;
551 tvalid[minj].prev = tvalid[minj].next = NULL;
553 /* IMPROVEMENT 1, STEP 5 : re-calculate sum values. */
554 for (lpj = tvalid[0].next; lpj != NULL; lpj = lpj->next)
561 * subtrace a score value related with 'minj' from
566 tmp_dj = (*distMat)(j, minj);
567 sumCols[j] -= tmp_dj;
571 tmp_dj = (*distMat)(minj, j);
572 sumRows[j] -= tmp_dj;
573 } /* nothing to do when j is equal to minj. */
577 * subtrace a score value related with 'mini' from
582 tmp_di = (*distMat)(j, mini);
583 sumCols[j] -= tmp_di;
587 tmp_di = (*distMat)(mini, j);
588 sumRows[j] -= tmp_di;
589 } /* nothing to do when j is equal to mini. */
592 * calculate a score value of the new inner node.
593 * then, store it temporary to join[] array.
595 join[j] = (tmp_dj + tmp_di) *0.5;
600 * Set the score values (stored in join[]) into the matrix,
601 * row/column position is 'mini'.
603 * Add a score value of the new inner node to sum arrays.
605 for (lpj = tvalid[0].next; lpj != NULL; lpj = lpj->next)
610 distMat->SetAt(j, mini, join[j]);
611 sumCols[j] += join[j];
615 distMat->SetAt(mini, j, join[j]);
616 sumRows[j] += join[j];
617 } /* nothing to do when j is equal to mini. */
620 /* Re-calculate sumRows[mini],sumCols[mini]. */
621 sumCols[mini] = sumRows[mini] = 0.0;
623 /* calculate sumRows[mini] */
625 for (lpj = tvalid[0].next; lpj->n < mini; lpj = lpj->next)
627 da = da + join[lpj->n];
631 /* skip if 'lpj->n' is equal to 'mini' */
632 if ((lpj != NULL) && (lpj->n == mini))
637 /* calculate sumCols[mini] */
639 for (; lpj != NULL; lpj = lpj->next)
641 da = da + join[lpj->n];
646 * Clean up sumRows[minj], sumCols[minj] and score matrix
647 * related with 'minj'.
649 sumCols[minj] = sumRows[minj] = 0.0;
650 for (j = 1; j <= lastSeq - firstSeq + 1; ++j)
652 distMat->SetAt(minj, j, 0.0);
653 distMat->SetAt(j, minj, 0.0);
657 } /** end main cycle **/
659 /******************************Last Cycle (3 Seqs. left)********************/
663 for (lpi = tvalid[0].next; lpi != NULL; lpi = lpi->next)
669 b1 = ((*distMat)(l[1], l[2]) + (*distMat)(l[1], l[3]) - (*distMat)(l[2], l[3])) *0.5;
670 b2 = (*distMat)(l[1], l[2]) - b1;
671 b3 = (*distMat)(l[1], l[3]) - b1;
673 branch[1] = b1 - av[l[1]];
674 branch[2] = b2 - av[l[2]];
675 branch[3] = b3 - av[l[3]];
677 /* Reset tiny negative and positive branch lengths to zero */
678 if (fabs(branch[1]) < 0.0001)
682 if (fabs(branch[2]) < 0.0001)
686 if (fabs(branch[3]) < 0.0001)
691 phyTree->leftBranch[lastSeq - firstSeq + 1-2] = branch[1];
692 phyTree->leftBranch[lastSeq - firstSeq + 1-1] = branch[2];
693 phyTree->leftBranch[lastSeq - firstSeq + 1] = branch[3];
695 for (i = 1; i <= lastSeq - firstSeq + 1; i++)
697 phyTree->treeDesc[lastSeq - firstSeq + 1-2][i] = 0;
702 (*log) << "\n Cycle" << setw(4) << nc << " (Last cycle, trichotomy):\n";
705 for (i = 1; i <= 3; ++i)
711 (*log) << "\n\t\t Node:" << setw(4) << l[i] <<" (" << setw(9)
712 << setprecision(5) << branch[i] << ") ";
714 for (k = lastSeq - firstSeq + 1-3; k >= 1; k--)
715 if (phyTree->treeDesc[k][l[i]] == 1)
717 for (j = 1; j <= lastSeq - firstSeq + 1; j++)
718 if (phyTree->treeDesc[k][j] == 1)
720 phyTree->treeDesc[lastSeq - firstSeq + 1-2][j] = i;
729 (*log) << "\n\t\t SEQ:" << setw(4) << l[i] << " (" << setw(9)
730 << setprecision(5) << branch[i] << ") ";
732 phyTree->treeDesc[lastSeq - firstSeq + 1-2][l[i]] = i;
748 /* IMPROVEMENT 2, STEP 4 : release memory area */