3 #include "distcalc.h"
\r
5 // UPGMA clustering in O(N^2) time and space.
\r
9 #define MIN(x, y) ((x) < (y) ? (x) : (y))
\r
10 #define MAX(x, y) ((x) > (y) ? (x) : (y))
\r
11 #define AVG(x, y) (((x) + (y))/2)
\r
13 static unsigned g_uLeafCount;
\r
14 static unsigned g_uTriangleSize;
\r
15 static unsigned g_uInternalNodeCount;
\r
16 static unsigned g_uInternalNodeIndex;
\r
18 // Triangular distance matrix is g_Dist, which is allocated
\r
19 // as a one-dimensional vector of length g_uTriangleSize.
\r
20 // TriangleSubscript(i,j) maps row,column=i,j to the subscript
\r
21 // into this vector.
\r
22 // Row / column coordinates are a bit messy.
\r
23 // Initially they are leaf indexes 0..N-1.
\r
24 // But each time we create a new node (=new cluster, new subtree),
\r
25 // we re-use one of the two rows that become available (the children
\r
26 // of the new node). This saves memory.
\r
27 // We keep track of this through the g_uNodeIndex vector.
\r
28 static dist_t *g_Dist;
\r
30 // Distance to nearest neighbor in row i of distance matrix.
\r
31 // Subscript is distance matrix row.
\r
32 static dist_t *g_MinDist;
\r
34 // Nearest neighbor to row i of distance matrix.
\r
35 // Subscript is distance matrix row.
\r
36 static unsigned *g_uNearestNeighbor;
\r
38 // Node index of row i in distance matrix.
\r
39 // Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes.
\r
40 // Subscript is distance matrix row.
\r
41 static unsigned *g_uNodeIndex;
\r
43 // The following vectors are defined on internal nodes,
\r
44 // subscripts are internal node index 0..N-2.
\r
45 // For g_uLeft/Right, value is the node index 0 .. 2N-2
\r
46 // because a child can be internal or leaf.
\r
47 static unsigned *g_uLeft;
\r
48 static unsigned *g_uRight;
\r
49 static dist_t *g_Height;
\r
50 static dist_t *g_LeftLength;
\r
51 static dist_t *g_RightLength;
\r
53 static inline unsigned TriangleSubscript(unsigned uIndex1, unsigned uIndex2)
\r
56 if (uIndex1 >= g_uLeafCount || uIndex2 >= g_uLeafCount)
\r
57 Quit("TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount);
\r
60 if (uIndex1 >= uIndex2)
\r
61 v = uIndex2 + (uIndex1*(uIndex1 - 1))/2;
\r
63 v = uIndex1 + (uIndex2*(uIndex2 - 1))/2;
\r
64 assert(v < (g_uLeafCount*(g_uLeafCount - 1))/2);
\r
68 static void ListState()
\r
70 Log("Dist matrix\n");
\r
72 for (unsigned i = 0; i < g_uLeafCount; ++i)
\r
74 if (uInsane == g_uNodeIndex[i])
\r
76 Log(" %5u", g_uNodeIndex[i]);
\r
80 for (unsigned i = 0; i < g_uLeafCount; ++i)
\r
82 if (uInsane == g_uNodeIndex[i])
\r
84 Log("%5u ", g_uNodeIndex[i]);
\r
85 for (unsigned j = 0; j < g_uLeafCount; ++j)
\r
87 if (uInsane == g_uNodeIndex[j])
\r
93 unsigned v = TriangleSubscript(i, j);
\r
94 Log("%5.2g ", g_Dist[v]);
\r
101 Log(" i Node NrNb Dist\n");
\r
102 Log("----- ----- ----- --------\n");
\r
103 for (unsigned i = 0; i < g_uLeafCount; ++i)
\r
105 if (uInsane == g_uNodeIndex[i])
\r
107 Log("%5u %5u %5u %8.3f\n",
\r
110 g_uNearestNeighbor[i],
\r
115 Log(" Node L R Height LLength RLength\n");
\r
116 Log("----- ----- ----- ------ ------- -------\n");
\r
117 for (unsigned i = 0; i <= g_uInternalNodeIndex; ++i)
\r
118 Log("%5u %5u %5u %6.2g %6.2g %6.2g\n",
\r
127 void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage)
\r
129 g_uLeafCount = DC.GetCount();
\r
131 g_uTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2;
\r
132 g_uInternalNodeCount = g_uLeafCount - 1;
\r
134 g_Dist = new dist_t[g_uTriangleSize];
\r
136 g_uNodeIndex = new unsigned[g_uLeafCount];
\r
137 g_uNearestNeighbor = new unsigned[g_uLeafCount];
\r
138 g_MinDist = new dist_t[g_uLeafCount];
\r
139 unsigned *Ids = new unsigned [g_uLeafCount];
\r
140 char **Names = new char *[g_uLeafCount];
\r
142 g_uLeft = new unsigned[g_uInternalNodeCount];
\r
143 g_uRight = new unsigned[g_uInternalNodeCount];
\r
144 g_Height = new dist_t[g_uInternalNodeCount];
\r
145 g_LeftLength = new dist_t[g_uInternalNodeCount];
\r
146 g_RightLength = new dist_t[g_uInternalNodeCount];
\r
148 for (unsigned i = 0; i < g_uLeafCount; ++i)
\r
150 g_MinDist[i] = BIG_DIST;
\r
151 g_uNodeIndex[i] = i;
\r
152 g_uNearestNeighbor[i] = uInsane;
\r
153 Ids[i] = DC.GetId(i);
\r
154 Names[i] = strsave(DC.GetName(i));
\r
157 for (unsigned i = 0; i < g_uInternalNodeCount; ++i)
\r
159 g_uLeft[i] = uInsane;
\r
160 g_uRight[i] = uInsane;
\r
161 g_LeftLength[i] = BIG_DIST;
\r
162 g_RightLength[i] = BIG_DIST;
\r
163 g_Height[i] = BIG_DIST;
\r
166 // Compute initial NxN triangular distance matrix.
\r
167 // Store minimum distance for each full (not triangular) row.
\r
168 // Loop from 1, not 0, because "row" is 0, 1 ... i-1,
\r
169 // so nothing to do when i=0.
\r
170 for (unsigned i = 1; i < g_uLeafCount; ++i)
\r
172 dist_t *Row = g_Dist + TriangleSubscript(i, 0);
\r
173 DC.CalcDistRange(i, Row);
\r
174 for (unsigned j = 0; j < i; ++j)
\r
176 const dist_t d = Row[j];
\r
177 if (d < g_MinDist[i])
\r
180 g_uNearestNeighbor[i] = j;
\r
182 if (d < g_MinDist[j])
\r
185 g_uNearestNeighbor[j] = i;
\r
191 Log("Initial state:\n");
\r
195 for (g_uInternalNodeIndex = 0; g_uInternalNodeIndex < g_uLeafCount - 1;
\r
196 ++g_uInternalNodeIndex)
\r
200 Log("Internal node index %5u\n", g_uInternalNodeIndex);
\r
201 Log("-------------------------\n");
\r
204 // Find nearest neighbors
\r
205 unsigned Lmin = uInsane;
\r
206 unsigned Rmin = uInsane;
\r
207 dist_t dtMinDist = BIG_DIST;
\r
208 for (unsigned j = 0; j < g_uLeafCount; ++j)
\r
210 if (uInsane == g_uNodeIndex[j])
\r
213 dist_t d = g_MinDist[j];
\r
218 Rmin = g_uNearestNeighbor[j];
\r
219 assert(uInsane != Rmin);
\r
220 assert(uInsane != g_uNodeIndex[Rmin]);
\r
224 assert(Lmin != uInsane);
\r
225 assert(Rmin != uInsane);
\r
226 assert(dtMinDist != BIG_DIST);
\r
229 Log("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n",
\r
231 g_uNodeIndex[Lmin],
\r
233 g_uNodeIndex[Rmin],
\r
237 // Compute distances to new node
\r
238 // New node overwrites row currently assigned to Lmin
\r
239 dist_t dtNewMinDist = BIG_DIST;
\r
240 unsigned uNewNearestNeighbor = uInsane;
\r
241 for (unsigned j = 0; j < g_uLeafCount; ++j)
\r
243 if (j == Lmin || j == Rmin)
\r
245 if (uInsane == g_uNodeIndex[j])
\r
248 const unsigned vL = TriangleSubscript(Lmin, j);
\r
249 const unsigned vR = TriangleSubscript(Rmin, j);
\r
250 const dist_t dL = g_Dist[vL];
\r
251 const dist_t dR = g_Dist[vR];
\r
257 dtNewDist = AVG(dL, dR);
\r
261 dtNewDist = MIN(dL, dR);
\r
265 dtNewDist = MAX(dL, dR);
\r
268 case LINKAGE_Biased:
\r
269 dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR);
\r
273 Quit("UPGMA2: Invalid LINKAGE_%u", Linkage);
\r
276 // Nasty special case.
\r
277 // If nearest neighbor of j is Lmin or Rmin, then make the new
\r
278 // node (which overwrites the row currently occupied by Lmin)
\r
279 // the nearest neighbor. This situation can occur when there are
\r
280 // equal distances in the matrix. If we don't make this fix,
\r
281 // the nearest neighbor pointer for j would become invalid.
\r
282 // (We don't need to test for == Lmin, because in that case
\r
283 // the net change needed is zero due to the change in row
\r
285 if (g_uNearestNeighbor[j] == Rmin)
\r
286 g_uNearestNeighbor[j] = Lmin;
\r
289 Log("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n",
\r
290 j, Lmin, dL, Rmin, dR, dtNewDist);
\r
292 g_Dist[vL] = dtNewDist;
\r
293 if (dtNewDist < dtNewMinDist)
\r
295 dtNewMinDist = dtNewDist;
\r
296 uNewNearestNeighbor = j;
\r
300 assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist);
\r
301 assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor);
\r
303 const unsigned v = TriangleSubscript(Lmin, Rmin);
\r
304 const dist_t dLR = g_Dist[v];
\r
305 const dist_t dHeightNew = dLR/2;
\r
306 const unsigned uLeft = g_uNodeIndex[Lmin];
\r
307 const unsigned uRight = g_uNodeIndex[Rmin];
\r
308 const dist_t HeightLeft =
\r
309 uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount];
\r
310 const dist_t HeightRight =
\r
311 uRight < g_uLeafCount ? 0 : g_Height[uRight - g_uLeafCount];
\r
313 g_uLeft[g_uInternalNodeIndex] = uLeft;
\r
314 g_uRight[g_uInternalNodeIndex] = uRight;
\r
315 g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft;
\r
316 g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight;
\r
317 g_Height[g_uInternalNodeIndex] = dHeightNew;
\r
319 // Row for left child overwritten by row for new node
\r
320 g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex;
\r
321 g_uNearestNeighbor[Lmin] = uNewNearestNeighbor;
\r
322 g_MinDist[Lmin] = dtNewMinDist;
\r
324 // Delete row for right child
\r
325 g_uNodeIndex[Rmin] = uInsane;
\r
328 Log("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n",
\r
329 g_uInternalNodeIndex, Lmin, Rmin);
\r
334 unsigned uRoot = g_uLeafCount - 2;
\r
335 tree.Create(g_uLeafCount, uRoot, g_uLeft, g_uRight, g_LeftLength, g_RightLength,
\r
344 delete[] g_uNodeIndex;
\r
345 delete[] g_uNearestNeighbor;
\r
346 delete[] g_MinDist;
\r
351 delete[] g_LeftLength;
\r
352 delete[] g_RightLength;
\r
354 for (unsigned i = 0; i < g_uLeafCount; ++i)
\r
360 class DistCalcTest : public DistCalc
\r
362 virtual void CalcDistRange(unsigned i, dist_t Dist[]) const
\r
364 static dist_t TestDist[5][5] =
\r
372 for (unsigned j = 0; j < i; ++j)
\r
373 Dist[j] = TestDist[i][j];
\r
375 virtual unsigned GetCount() const
\r
379 virtual unsigned GetId(unsigned i) const
\r
383 virtual const char *GetName(unsigned i) const
\r
391 SetListFileName("c:\\tmp\\lobster.log", false);
\r
394 UPGMA2(DC, tree, LINKAGE_Avg);
\r