Next version of JABA
[jabaws.git] / binaries / src / muscle / upgma2.cpp
1 #include "muscle.h"\r
2 #include "tree.h"\r
3 #include "distcalc.h"\r
4 \r
5 // UPGMA clustering in O(N^2) time and space.\r
6 \r
7 #define TRACE   0\r
8 \r
9 #define MIN(x, y)       ((x) < (y) ? (x) : (y))\r
10 #define MAX(x, y)       ((x) > (y) ? (x) : (y))\r
11 #define AVG(x, y)       (((x) + (y))/2)\r
12 \r
13 static unsigned g_uLeafCount;\r
14 static unsigned g_uTriangleSize;\r
15 static unsigned g_uInternalNodeCount;\r
16 static unsigned g_uInternalNodeIndex;\r
17 \r
18 // Triangular distance matrix is g_Dist, which is allocated\r
19 // as a one-dimensional vector of length g_uTriangleSize.\r
20 // TriangleSubscript(i,j) maps row,column=i,j to the subscript\r
21 // into this vector.\r
22 // Row / column coordinates are a bit messy.\r
23 // Initially they are leaf indexes 0..N-1.\r
24 // But each time we create a new node (=new cluster, new subtree),\r
25 // we re-use one of the two rows that become available (the children\r
26 // of the new node). This saves memory.\r
27 // We keep track of this through the g_uNodeIndex vector.\r
28 static dist_t *g_Dist;\r
29 \r
30 // Distance to nearest neighbor in row i of distance matrix.\r
31 // Subscript is distance matrix row.\r
32 static dist_t *g_MinDist;\r
33 \r
34 // Nearest neighbor to row i of distance matrix.\r
35 // Subscript is distance matrix row.\r
36 static unsigned *g_uNearestNeighbor;\r
37 \r
38 // Node index of row i in distance matrix.\r
39 // Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes.\r
40 // Subscript is distance matrix row.\r
41 static unsigned *g_uNodeIndex;\r
42 \r
43 // The following vectors are defined on internal nodes,\r
44 // subscripts are internal node index 0..N-2.\r
45 // For g_uLeft/Right, value is the node index 0 .. 2N-2\r
46 // because a child can be internal or leaf.\r
47 static unsigned *g_uLeft;\r
48 static unsigned *g_uRight;\r
49 static dist_t *g_Height;\r
50 static dist_t *g_LeftLength;\r
51 static dist_t *g_RightLength;\r
52 \r
53 static inline unsigned TriangleSubscript(unsigned uIndex1, unsigned uIndex2)\r
54         {\r
55 #if     DEBUG\r
56         if (uIndex1 >= g_uLeafCount || uIndex2 >= g_uLeafCount)\r
57                 Quit("TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount);\r
58 #endif\r
59         unsigned v;\r
60         if (uIndex1 >= uIndex2)\r
61                 v = uIndex2 + (uIndex1*(uIndex1 - 1))/2;\r
62         else\r
63                 v = uIndex1 + (uIndex2*(uIndex2 - 1))/2;\r
64         assert(v < (g_uLeafCount*(g_uLeafCount - 1))/2);\r
65         return v;\r
66         }\r
67 \r
68 static void ListState()\r
69         {\r
70         Log("Dist matrix\n");\r
71         Log("     ");\r
72         for (unsigned i = 0; i < g_uLeafCount; ++i)\r
73                 {\r
74                 if (uInsane == g_uNodeIndex[i])\r
75                         continue;\r
76                 Log("  %5u", g_uNodeIndex[i]);\r
77                 }\r
78         Log("\n");\r
79 \r
80         for (unsigned i = 0; i < g_uLeafCount; ++i)\r
81                 {\r
82                 if (uInsane == g_uNodeIndex[i])\r
83                         continue;\r
84                 Log("%5u  ", g_uNodeIndex[i]);\r
85                 for (unsigned j = 0; j < g_uLeafCount; ++j)\r
86                         {\r
87                         if (uInsane == g_uNodeIndex[j])\r
88                                 continue;\r
89                         if (i == j)\r
90                                 Log("       ");\r
91                         else\r
92                                 {\r
93                                 unsigned v = TriangleSubscript(i, j);\r
94                                 Log("%5.2g  ", g_Dist[v]);\r
95                                 }\r
96                         }\r
97                 Log("\n");\r
98                 }\r
99 \r
100         Log("\n");\r
101         Log("    i   Node   NrNb      Dist\n");\r
102         Log("-----  -----  -----  --------\n");\r
103         for (unsigned i = 0; i < g_uLeafCount; ++i)\r
104                 {\r
105                 if (uInsane == g_uNodeIndex[i])\r
106                         continue;\r
107                 Log("%5u  %5u  %5u  %8.3f\n",\r
108                   i,\r
109                   g_uNodeIndex[i],\r
110                   g_uNearestNeighbor[i],\r
111                   g_MinDist[i]);\r
112                 }\r
113 \r
114         Log("\n");\r
115         Log(" Node      L      R  Height  LLength  RLength\n");\r
116         Log("-----  -----  -----  ------  -------  -------\n");\r
117         for (unsigned i = 0; i <= g_uInternalNodeIndex; ++i)\r
118                 Log("%5u  %5u  %5u  %6.2g  %6.2g  %6.2g\n",\r
119                   i,\r
120                   g_uLeft[i],\r
121                   g_uRight[i],\r
122                   g_Height[i],\r
123                   g_LeftLength[i],\r
124                   g_RightLength[i]);\r
125         }\r
126 \r
127 void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage)\r
128         {\r
129         g_uLeafCount = DC.GetCount();\r
130 \r
131         g_uTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2;\r
132         g_uInternalNodeCount = g_uLeafCount - 1;\r
133 \r
134         g_Dist = new dist_t[g_uTriangleSize];\r
135 \r
136         g_uNodeIndex = new unsigned[g_uLeafCount];\r
137         g_uNearestNeighbor = new unsigned[g_uLeafCount];\r
138         g_MinDist = new dist_t[g_uLeafCount];\r
139         unsigned *Ids = new unsigned [g_uLeafCount];\r
140         char **Names = new char *[g_uLeafCount];\r
141 \r
142         g_uLeft = new unsigned[g_uInternalNodeCount];\r
143         g_uRight = new unsigned[g_uInternalNodeCount];\r
144         g_Height = new dist_t[g_uInternalNodeCount];\r
145         g_LeftLength = new dist_t[g_uInternalNodeCount];\r
146         g_RightLength = new dist_t[g_uInternalNodeCount];\r
147 \r
148         for (unsigned i = 0; i < g_uLeafCount; ++i)\r
149                 {\r
150                 g_MinDist[i] = BIG_DIST;\r
151                 g_uNodeIndex[i] = i;\r
152                 g_uNearestNeighbor[i] = uInsane;\r
153                 Ids[i] = DC.GetId(i);\r
154                 Names[i] = strsave(DC.GetName(i));\r
155                 }\r
156 \r
157         for (unsigned i = 0; i < g_uInternalNodeCount; ++i)\r
158                 {\r
159                 g_uLeft[i] = uInsane;\r
160                 g_uRight[i] = uInsane;\r
161                 g_LeftLength[i] = BIG_DIST;\r
162                 g_RightLength[i] = BIG_DIST;\r
163                 g_Height[i] = BIG_DIST;\r
164                 }\r
165 \r
166 // Compute initial NxN triangular distance matrix.\r
167 // Store minimum distance for each full (not triangular) row.\r
168 // Loop from 1, not 0, because "row" is 0, 1 ... i-1,\r
169 // so nothing to do when i=0.\r
170         for (unsigned i = 1; i < g_uLeafCount; ++i)\r
171                 {\r
172                 dist_t *Row = g_Dist + TriangleSubscript(i, 0);\r
173                 DC.CalcDistRange(i, Row);\r
174                 for (unsigned j = 0; j < i; ++j)\r
175                         {\r
176                         const dist_t d = Row[j];\r
177                         if (d < g_MinDist[i])\r
178                                 {\r
179                                 g_MinDist[i] = d;\r
180                                 g_uNearestNeighbor[i] = j;\r
181                                 }\r
182                         if (d < g_MinDist[j])\r
183                                 {\r
184                                 g_MinDist[j] = d;\r
185                                 g_uNearestNeighbor[j] = i;\r
186                                 }\r
187                         }\r
188                 }\r
189 \r
190 #if     TRACE\r
191         Log("Initial state:\n");\r
192         ListState();\r
193 #endif\r
194 \r
195         for (g_uInternalNodeIndex = 0; g_uInternalNodeIndex < g_uLeafCount - 1;\r
196           ++g_uInternalNodeIndex)\r
197                 {\r
198 #if     TRACE\r
199                 Log("\n");\r
200                 Log("Internal node index %5u\n", g_uInternalNodeIndex);\r
201                 Log("-------------------------\n");\r
202 #endif\r
203 \r
204         // Find nearest neighbors\r
205                 unsigned Lmin = uInsane;\r
206                 unsigned Rmin = uInsane;\r
207                 dist_t dtMinDist = BIG_DIST;\r
208                 for (unsigned j = 0; j < g_uLeafCount; ++j)\r
209                         {\r
210                         if (uInsane == g_uNodeIndex[j])\r
211                                 continue;\r
212 \r
213                         dist_t d = g_MinDist[j];\r
214                         if (d < dtMinDist)\r
215                                 {\r
216                                 dtMinDist = d;\r
217                                 Lmin = j;\r
218                                 Rmin = g_uNearestNeighbor[j];\r
219                                 assert(uInsane != Rmin);\r
220                                 assert(uInsane != g_uNodeIndex[Rmin]);\r
221                                 }\r
222                         }\r
223 \r
224                 assert(Lmin != uInsane);\r
225                 assert(Rmin != uInsane);\r
226                 assert(dtMinDist != BIG_DIST);\r
227 \r
228 #if     TRACE\r
229                 Log("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n",\r
230                   Lmin,\r
231                   g_uNodeIndex[Lmin],\r
232                   Rmin,\r
233                   g_uNodeIndex[Rmin],\r
234                   dtMinDist);\r
235 #endif\r
236 \r
237         // Compute distances to new node\r
238         // New node overwrites row currently assigned to Lmin\r
239                 dist_t dtNewMinDist = BIG_DIST;\r
240                 unsigned uNewNearestNeighbor = uInsane;\r
241                 for (unsigned j = 0; j < g_uLeafCount; ++j)\r
242                         {\r
243                         if (j == Lmin || j == Rmin)\r
244                                 continue;\r
245                         if (uInsane == g_uNodeIndex[j])\r
246                                 continue;\r
247 \r
248                         const unsigned vL = TriangleSubscript(Lmin, j);\r
249                         const unsigned vR = TriangleSubscript(Rmin, j);\r
250                         const dist_t dL = g_Dist[vL];\r
251                         const dist_t dR = g_Dist[vR];\r
252                         dist_t dtNewDist;\r
253 \r
254                         switch (Linkage)\r
255                                 {\r
256                         case LINKAGE_Avg:\r
257                                 dtNewDist = AVG(dL, dR);\r
258                                 break;\r
259 \r
260                         case LINKAGE_Min:\r
261                                 dtNewDist = MIN(dL, dR);\r
262                                 break;\r
263 \r
264                         case LINKAGE_Max:\r
265                                 dtNewDist = MAX(dL, dR);\r
266                                 break;\r
267 \r
268                         case LINKAGE_Biased:\r
269                                 dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR);\r
270                                 break;\r
271 \r
272                         default:\r
273                                 Quit("UPGMA2: Invalid LINKAGE_%u", Linkage);\r
274                                 }\r
275 \r
276                 // Nasty special case.\r
277                 // If nearest neighbor of j is Lmin or Rmin, then make the new\r
278                 // node (which overwrites the row currently occupied by Lmin)\r
279                 // the nearest neighbor. This situation can occur when there are\r
280                 // equal distances in the matrix. If we don't make this fix,\r
281                 // the nearest neighbor pointer for j would become invalid.\r
282                 // (We don't need to test for == Lmin, because in that case\r
283                 // the net change needed is zero due to the change in row\r
284                 // numbering).\r
285                         if (g_uNearestNeighbor[j] == Rmin)\r
286                                 g_uNearestNeighbor[j] = Lmin;\r
287 \r
288 #if     TRACE\r
289                         Log("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n",\r
290                           j, Lmin, dL, Rmin, dR, dtNewDist);\r
291 #endif\r
292                         g_Dist[vL] = dtNewDist;\r
293                         if (dtNewDist < dtNewMinDist)\r
294                                 {\r
295                                 dtNewMinDist = dtNewDist;\r
296                                 uNewNearestNeighbor = j;\r
297                                 }\r
298                         }\r
299 \r
300                 assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist);\r
301                 assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor);\r
302 \r
303                 const unsigned v = TriangleSubscript(Lmin, Rmin);\r
304                 const dist_t dLR = g_Dist[v];\r
305                 const dist_t dHeightNew = dLR/2;\r
306                 const unsigned uLeft = g_uNodeIndex[Lmin];\r
307                 const unsigned uRight = g_uNodeIndex[Rmin];\r
308                 const dist_t HeightLeft =\r
309                   uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount];\r
310                 const dist_t HeightRight =\r
311                   uRight < g_uLeafCount ? 0 : g_Height[uRight - g_uLeafCount];\r
312 \r
313                 g_uLeft[g_uInternalNodeIndex] = uLeft;\r
314                 g_uRight[g_uInternalNodeIndex] = uRight;\r
315                 g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft;\r
316                 g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight;\r
317                 g_Height[g_uInternalNodeIndex] = dHeightNew;\r
318 \r
319         // Row for left child overwritten by row for new node\r
320                 g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex;\r
321                 g_uNearestNeighbor[Lmin] = uNewNearestNeighbor;\r
322                 g_MinDist[Lmin] = dtNewMinDist;\r
323 \r
324         // Delete row for right child\r
325                 g_uNodeIndex[Rmin] = uInsane;\r
326 \r
327 #if     TRACE\r
328                 Log("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n",\r
329                   g_uInternalNodeIndex, Lmin, Rmin);\r
330                 ListState();\r
331 #endif\r
332                 }\r
333 \r
334         unsigned uRoot = g_uLeafCount - 2;\r
335         tree.Create(g_uLeafCount, uRoot, g_uLeft, g_uRight, g_LeftLength, g_RightLength,\r
336           Ids, Names);\r
337 \r
338 #if     TRACE\r
339         tree.LogMe();\r
340 #endif\r
341 \r
342         delete[] g_Dist;\r
343 \r
344         delete[] g_uNodeIndex;\r
345         delete[] g_uNearestNeighbor;\r
346         delete[] g_MinDist;\r
347         delete[] g_Height;\r
348 \r
349         delete[] g_uLeft;\r
350         delete[] g_uRight;\r
351         delete[] g_LeftLength;\r
352         delete[] g_RightLength;\r
353         \r
354         for (unsigned i = 0; i < g_uLeafCount; ++i)\r
355                 free(Names[i]);\r
356         delete[] Names;\r
357         delete[] Ids;\r
358         }\r
359 \r
360 class DistCalcTest : public DistCalc\r
361         {\r
362         virtual void CalcDistRange(unsigned i, dist_t Dist[]) const\r
363                 {\r
364                 static dist_t TestDist[5][5] =\r
365                         {\r
366                         0,  2, 14, 14, 20,\r
367                         2,  0, 14, 14, 20,\r
368                         14, 14,  0,  4, 20,\r
369                         14, 14,  4,  0, 20,\r
370                         20, 20, 20, 20,  0,\r
371                         };\r
372                 for (unsigned j = 0; j < i; ++j)\r
373                         Dist[j] = TestDist[i][j];\r
374                 }\r
375         virtual unsigned GetCount() const\r
376                 {\r
377                 return 5;\r
378                 }\r
379         virtual unsigned GetId(unsigned i) const\r
380                 {\r
381                 return i;\r
382                 }\r
383         virtual const char *GetName(unsigned i) const\r
384                 {\r
385                 return "name";\r
386                 }\r
387         };\r
388 \r
389 void Test()\r
390         {\r
391         SetListFileName("c:\\tmp\\lobster.log", false);\r
392         DistCalcTest DC;\r
393         Tree tree;\r
394         UPGMA2(DC, tree, LINKAGE_Avg);\r
395         }\r