1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /* This the fast UPGMA algorithm (O(N^2)) as implemented in Bob Edgar's
4 * Muscle (UPGMA2.cpp; version 3.7) ported to pure C.
6 * Muscle's code is public domain and so is this code here.
8 * From http://www.drive5.com/muscle/license.htm:
10 * MUSCLE is public domain software
12 * The MUSCLE software, including object and source code and
13 * documentation, is hereby donated to the public domain.
15 * Disclaimer of warranty
16 * THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
17 * EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * RCS $Id: muscle_upgma.c 230 2011-04-09 15:37:50Z andreas $
29 * LINKAGE become linkage_t here
31 * Replaced the the following member functions for DistCalc DC:
32 * DC.GetId = sequence id as int
33 * DC.GetName = sequence name
34 * DC.GetCount = matrix dim
35 * DC.DistRange = vector / matrix row for object i with index j<i
37 * Log() has been replaced with Clustal's Info(), Quiet() with Log(&rLog, LOG_FATAL)
39 * Made TriangleSubscript() and g_ulTriangleSize ulong to prevent overflow for many sequences
43 /* limit use of unsigned vars (see coding_style_guideline.txt) */
44 typedef unsigned long int ulong;
55 #include "symmatrix.h"
57 #include "muscle_tree.h"
58 #include "muscle_upgma.h"
62 static const dist_t BIG_DIST = (dist_t) 1e29;
64 static const unsigned uInsane = 8888888;
70 ulong TriangleSubscript(uint uIndex1, uint uIndex2);
78 #define MIN(x, y) ((x) < (y) ? (x) : (y))
81 #define MAX(x, y) ((x) > (y) ? (x) : (y))
83 #define AVG(x, y) (((x) + (y))/2)
85 static uint g_uLeafCount;
86 static ulong g_ulTriangleSize;
87 static uint g_uInternalNodeCount;
88 static uint g_uInternalNodeIndex;
90 /* Triangular distance matrix is g_Dist, which is allocated
91 * as a one-dimensional vector of length g_ulTriangleSize.
92 * TriangleSubscript(i,j) maps row,column=i,j to the subscript
94 * Row / column coordinates are a bit messy.
95 * Initially they are leaf indexes 0..N-1.
96 * But each time we create a new node (=new cluster, new subtree),
97 * we re-use one of the two rows that become available (the children
98 * of the new node). This saves memory.
99 * We keep track of this through the g_uNodeIndex vector.
101 static dist_t *g_Dist;
103 /* Distance to nearest neighbor in row i of distance matrix.
104 * Subscript is distance matrix row.
106 static dist_t *g_MinDist;
108 /* Nearest neighbor to row i of distance matrix.
109 * Subscript is distance matrix row.
111 static uint *g_uNearestNeighbor;
113 /* Node index of row i in distance matrix.
114 * Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes.
115 * Subscript is distance matrix row.
117 static uint *g_uNodeIndex;
119 /* The following vectors are defined on internal nodes,
120 * subscripts are internal node index 0..N-2.
121 * For g_uLeft/Right, value is the node index 0 .. 2N-2
122 * because a child can be internal or leaf.
124 static uint *g_uLeft;
125 static uint *g_uRight;
126 static dist_t *g_Height;
127 static dist_t *g_LeftLength;
128 static dist_t *g_RightLength;
133 * Imitation of DistCalc.DistRange
135 * Sets values of row (vector / matrix row) to distances for object i with index j<i
137 * row must be preallocated
139 void CalcDistRange(symmatrix_t *distmat, uint i, dist_t *row)
142 for (j = 0; j < i; ++j) {
143 row[j] = SymMatrixGetValue(distmat, i, j);
146 /* end of CalcDistRange */
152 TriangleSubscript(uint uIndex1, uint uIndex2)
156 if (uIndex1 >= g_uLeafCount || uIndex2 >= g_uLeafCount)
157 Log(&rLog, LOG_FATAL, "TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount);
159 if (uIndex1 >= uIndex2)
160 v = uIndex2 + (uIndex1*(uIndex1 - 1))/2;
162 v = uIndex1 + (uIndex2*(uIndex2 - 1))/2;
163 assert(v < (g_uLeafCount*(g_uLeafCount - 1))/2);
168 static void ListState()
171 Info("Dist matrix\n");
173 for (i = 0; i < g_uLeafCount; ++i)
175 if (uInsane == g_uNodeIndex[i])
177 Info(" %5u", g_uNodeIndex[i]);
181 for (i = 0; i < g_uLeafCount; ++i)
183 if (uInsane == g_uNodeIndex[i])
185 Info("%5u ", g_uNodeIndex[i]);
186 for (j = 0; j < g_uLeafCount; ++j)
188 if (uInsane == g_uNodeIndex[j])
194 ulong v = TriangleSubscript(i, j);
195 Info("%5.2g ", g_Dist[v]);
202 Info(" i Node NrNb Dist\n");
203 Info("----- ----- ----- --------\n");
204 for (i = 0; i < g_uLeafCount; ++i)
206 if (uInsane == g_uNodeIndex[i])
208 Info("%5u %5u %5u %8.3f\n",
211 g_uNearestNeighbor[i],
216 Info(" Node L R Height LLength RLength\n");
217 Info("----- ----- ----- ------ ------- -------\n");
218 for (i = 0; i <= g_uInternalNodeIndex; ++i)
219 Info("%5u %5u %5u %6.2g %6.2g %6.2g\n",
231 * @brief Creates a UPGMA in O(N^2) tree from given distmat
234 * newly created rooted UPGMA tree
236 * distance matrix to be clustered
240 * leaf names, will be copied
242 * @note called UPGMA2() in Muscle3.7.
243 * caller has to free with FreeMuscleTree()
245 * @see FreeMuscleTree()
248 MuscleUpgma2(tree_t *tree, symmatrix_t *distmat, linkage_t linkage, char **names)
253 /* only works on full symmetric matrices */
254 assert (distmat->nrows==distmat->ncols);
256 g_uLeafCount = distmat->ncols;
257 g_ulTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2;
258 g_uInternalNodeCount = g_uLeafCount - 1;
260 g_Dist = (dist_t *) CKMALLOC(g_ulTriangleSize * sizeof(dist_t));
262 g_uNodeIndex = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount);
263 g_uNearestNeighbor = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount);
264 g_MinDist = (dist_t *) CKMALLOC(sizeof(dist_t) * g_uLeafCount);
265 Ids = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount);
266 /* NOTE: we replaced Names with argument names */
269 * left and right node indices, as well as left and right
270 * branch-length and height for for internal nodes
272 g_uLeft = (uint*) CKMALLOC(sizeof(uint) * g_uInternalNodeCount);
273 g_uRight = (uint*) CKMALLOC(sizeof(uint) * g_uInternalNodeCount);
274 g_Height = (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount);
275 g_LeftLength = (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount);
276 g_RightLength = (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount);
278 for (i = 0; i < g_uLeafCount; ++i) {
279 g_MinDist[i] = BIG_DIST;
281 g_uNearestNeighbor[i] = uInsane;
285 for (i = 0; i < g_uInternalNodeCount; ++i) {
286 g_uLeft[i] = uInsane;
287 g_uRight[i] = uInsane;
288 g_LeftLength[i] = BIG_DIST;
289 g_RightLength[i] = BIG_DIST;
290 g_Height[i] = BIG_DIST;
293 /* Compute initial NxN triangular distance matrix.
294 * Store minimum distance for each full (not triangular) row.
295 * Loop from 1, not 0, because "row" is 0, 1 ... i-1,
296 * so nothing to do when i=0.
298 for (i = 1; i < g_uLeafCount; ++i) {
299 dist_t *Row = g_Dist + TriangleSubscript(i, 0);
300 CalcDistRange(distmat, i, Row);
301 for (j = 0; j < i; ++j) {
302 const dist_t d = Row[j];
303 if (d < g_MinDist[i]) {
305 g_uNearestNeighbor[i] = j;
307 if (d < g_MinDist[j]) {
309 g_uNearestNeighbor[j] = i;
315 Info("Initial state:\n");
319 for (g_uInternalNodeIndex = 0;
320 g_uInternalNodeIndex < g_uLeafCount - 1;
321 ++g_uInternalNodeIndex) {
323 dist_t dtNewMinDist = BIG_DIST;
324 uint uNewNearestNeighbor = uInsane;
328 Info("Internal node index %5u\n", g_uInternalNodeIndex);
329 Info("-------------------------\n");
332 /* Find nearest neighbors */
335 dist_t dtMinDist = BIG_DIST;
336 for (j = 0; j < g_uLeafCount; ++j) {
338 if (uInsane == g_uNodeIndex[j])
345 Rmin = g_uNearestNeighbor[j];
346 assert(uInsane != Rmin);
347 assert(uInsane != g_uNodeIndex[Rmin]);
351 assert(Lmin != uInsane);
352 assert(Rmin != uInsane);
353 assert(dtMinDist != BIG_DIST);
356 Info("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n",
364 /* Compute distances to new node
365 * New node overwrites row currently assigned to Lmin
367 for ( j = 0; j < g_uLeafCount; ++j) {
372 if (j == Lmin || j == Rmin)
374 if (uInsane == g_uNodeIndex[j])
377 vL = TriangleSubscript(Lmin, j);
378 vR = TriangleSubscript(Rmin, j);
385 dtNewDist = AVG(dL, dR);
389 dtNewDist = MIN(dL, dR);
393 dtNewDist = MAX(dL, dR);
395 /* couldn't be arsed to figure out proper usage of g_dSUEFF */
398 dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR);
402 Log(&rLog, LOG_FATAL, "UPGMA2: Invalid LINKAGE_%u", linkage);
405 /* Nasty special case.
406 * If nearest neighbor of j is Lmin or Rmin, then make the new
407 * node (which overwrites the row currently occupied by Lmin)
408 * the nearest neighbor. This situation can occur when there are
409 * equal distances in the matrix. If we don't make this fix,
410 * the nearest neighbor pointer for j would become invalid.
411 * (We don't need to test for == Lmin, because in that case
412 * the net change needed is zero due to the change in row
415 if (g_uNearestNeighbor[j] == Rmin)
416 g_uNearestNeighbor[j] = Lmin;
419 Info("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n",
420 j, Lmin, dL, Rmin, dR, dtNewDist);
422 g_Dist[vL] = dtNewDist;
423 if (dtNewDist < dtNewMinDist) {
424 dtNewMinDist = dtNewDist;
425 uNewNearestNeighbor = j;
429 assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist);
430 assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor);
432 const ulong v = TriangleSubscript(Lmin, Rmin);
433 const dist_t dLR = g_Dist[v];
434 const dist_t dHeightNew = dLR/2;
435 const uint uLeft = g_uNodeIndex[Lmin];
436 const uint uRight = g_uNodeIndex[Rmin];
437 const dist_t HeightLeft =
438 uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount];
439 const dist_t HeightRight =
440 uRight < g_uLeafCount ? 0 : g_Height[uRight - g_uLeafCount];
442 g_uLeft[g_uInternalNodeIndex] = uLeft;
443 g_uRight[g_uInternalNodeIndex] = uRight;
444 g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft;
445 g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight;
446 g_Height[g_uInternalNodeIndex] = dHeightNew;
448 /* Row for left child overwritten by row for new node */
449 g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex;
450 g_uNearestNeighbor[Lmin] = uNewNearestNeighbor;
451 g_MinDist[Lmin] = dtNewMinDist;
453 /* Delete row for right child */
454 g_uNodeIndex[Rmin] = uInsane;
457 Info("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n",
458 g_uInternalNodeIndex, Lmin, Rmin);
463 uint uRoot = g_uLeafCount - 2;
466 Log(&rLog, LOG_FORCED_DEBUG, "uRoot=%d g_uLeafCount=%d g_uInternalNodeCount=%d", uRoot, g_uLeafCount, g_uInternalNodeCount);
467 for (i=0; i<g_uInternalNodeCount; i++) {
468 Log(&rLog, LOG_FORCED_DEBUG, "internal node=%d: g_uLeft=%d g_uRight=%d g_LeftLength=%f g_RightLength=%f g_Height=%f",
469 i, g_uLeft[i], g_uRight[i],
470 g_LeftLength[i], g_RightLength[i],
473 for (i=0; i<g_uLeafCount; i++) {
474 Log(&rLog, LOG_FORCED_DEBUG, "leaf node=%d: Ids=%d names=%s",
475 i, Ids[i], names[i]);
479 MuscleTreeCreate(tree, g_uLeafCount, uRoot,
481 g_LeftLength, g_RightLength,
490 free(g_uNearestNeighbor);
499 /* NOTE: Muscle's "Names" variable is here the argument "names" */
502 /*** end of UPGMA2 ***/