X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=website%2Farchive%2Fbinaries%2Fmac%2Fsrc%2Fmuscle%2Fupgma2.cpp;fp=website%2Farchive%2Fbinaries%2Fmac%2Fsrc%2Fmuscle%2Fupgma2.cpp;h=941b9d27a803c358405c46bda25e10b022026c96;hb=dbde3fb6f00b9bb770343631a517c0e599db8528;hp=0000000000000000000000000000000000000000;hpb=85f830bbd51a7277994bd4233141016304e210c9;p=jabaws.git diff --git a/website/archive/binaries/mac/src/muscle/upgma2.cpp b/website/archive/binaries/mac/src/muscle/upgma2.cpp new file mode 100644 index 0000000..941b9d2 --- /dev/null +++ b/website/archive/binaries/mac/src/muscle/upgma2.cpp @@ -0,0 +1,395 @@ +#include "muscle.h" +#include "tree.h" +#include "distcalc.h" + +// UPGMA clustering in O(N^2) time and space. + +#define TRACE 0 + +#define MIN(x, y) ((x) < (y) ? (x) : (y)) +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#define AVG(x, y) (((x) + (y))/2) + +static unsigned g_uLeafCount; +static unsigned g_uTriangleSize; +static unsigned g_uInternalNodeCount; +static unsigned g_uInternalNodeIndex; + +// Triangular distance matrix is g_Dist, which is allocated +// as a one-dimensional vector of length g_uTriangleSize. +// TriangleSubscript(i,j) maps row,column=i,j to the subscript +// into this vector. +// Row / column coordinates are a bit messy. +// Initially they are leaf indexes 0..N-1. +// But each time we create a new node (=new cluster, new subtree), +// we re-use one of the two rows that become available (the children +// of the new node). This saves memory. +// We keep track of this through the g_uNodeIndex vector. +static dist_t *g_Dist; + +// Distance to nearest neighbor in row i of distance matrix. +// Subscript is distance matrix row. +static dist_t *g_MinDist; + +// Nearest neighbor to row i of distance matrix. +// Subscript is distance matrix row. +static unsigned *g_uNearestNeighbor; + +// Node index of row i in distance matrix. +// Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes. +// Subscript is distance matrix row. +static unsigned *g_uNodeIndex; + +// The following vectors are defined on internal nodes, +// subscripts are internal node index 0..N-2. +// For g_uLeft/Right, value is the node index 0 .. 2N-2 +// because a child can be internal or leaf. +static unsigned *g_uLeft; +static unsigned *g_uRight; +static dist_t *g_Height; +static dist_t *g_LeftLength; +static dist_t *g_RightLength; + +static inline unsigned TriangleSubscript(unsigned uIndex1, unsigned uIndex2) + { +#if DEBUG + if (uIndex1 >= g_uLeafCount || uIndex2 >= g_uLeafCount) + Quit("TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount); +#endif + unsigned v; + if (uIndex1 >= uIndex2) + v = uIndex2 + (uIndex1*(uIndex1 - 1))/2; + else + v = uIndex1 + (uIndex2*(uIndex2 - 1))/2; + assert(v < (g_uLeafCount*(g_uLeafCount - 1))/2); + return v; + } + +static void ListState() + { + Log("Dist matrix\n"); + Log(" "); + for (unsigned i = 0; i < g_uLeafCount; ++i) + { + if (uInsane == g_uNodeIndex[i]) + continue; + Log(" %5u", g_uNodeIndex[i]); + } + Log("\n"); + + for (unsigned i = 0; i < g_uLeafCount; ++i) + { + if (uInsane == g_uNodeIndex[i]) + continue; + Log("%5u ", g_uNodeIndex[i]); + for (unsigned j = 0; j < g_uLeafCount; ++j) + { + if (uInsane == g_uNodeIndex[j]) + continue; + if (i == j) + Log(" "); + else + { + unsigned v = TriangleSubscript(i, j); + Log("%5.2g ", g_Dist[v]); + } + } + Log("\n"); + } + + Log("\n"); + Log(" i Node NrNb Dist\n"); + Log("----- ----- ----- --------\n"); + for (unsigned i = 0; i < g_uLeafCount; ++i) + { + if (uInsane == g_uNodeIndex[i]) + continue; + Log("%5u %5u %5u %8.3f\n", + i, + g_uNodeIndex[i], + g_uNearestNeighbor[i], + g_MinDist[i]); + } + + Log("\n"); + Log(" Node L R Height LLength RLength\n"); + Log("----- ----- ----- ------ ------- -------\n"); + for (unsigned i = 0; i <= g_uInternalNodeIndex; ++i) + Log("%5u %5u %5u %6.2g %6.2g %6.2g\n", + i, + g_uLeft[i], + g_uRight[i], + g_Height[i], + g_LeftLength[i], + g_RightLength[i]); + } + +void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage) + { + g_uLeafCount = DC.GetCount(); + + g_uTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2; + g_uInternalNodeCount = g_uLeafCount - 1; + + g_Dist = new dist_t[g_uTriangleSize]; + + g_uNodeIndex = new unsigned[g_uLeafCount]; + g_uNearestNeighbor = new unsigned[g_uLeafCount]; + g_MinDist = new dist_t[g_uLeafCount]; + unsigned *Ids = new unsigned [g_uLeafCount]; + char **Names = new char *[g_uLeafCount]; + + g_uLeft = new unsigned[g_uInternalNodeCount]; + g_uRight = new unsigned[g_uInternalNodeCount]; + g_Height = new dist_t[g_uInternalNodeCount]; + g_LeftLength = new dist_t[g_uInternalNodeCount]; + g_RightLength = new dist_t[g_uInternalNodeCount]; + + for (unsigned i = 0; i < g_uLeafCount; ++i) + { + g_MinDist[i] = BIG_DIST; + g_uNodeIndex[i] = i; + g_uNearestNeighbor[i] = uInsane; + Ids[i] = DC.GetId(i); + Names[i] = strsave(DC.GetName(i)); + } + + for (unsigned i = 0; i < g_uInternalNodeCount; ++i) + { + g_uLeft[i] = uInsane; + g_uRight[i] = uInsane; + g_LeftLength[i] = BIG_DIST; + g_RightLength[i] = BIG_DIST; + g_Height[i] = BIG_DIST; + } + +// Compute initial NxN triangular distance matrix. +// Store minimum distance for each full (not triangular) row. +// Loop from 1, not 0, because "row" is 0, 1 ... i-1, +// so nothing to do when i=0. + for (unsigned i = 1; i < g_uLeafCount; ++i) + { + dist_t *Row = g_Dist + TriangleSubscript(i, 0); + DC.CalcDistRange(i, Row); + for (unsigned j = 0; j < i; ++j) + { + const dist_t d = Row[j]; + if (d < g_MinDist[i]) + { + g_MinDist[i] = d; + g_uNearestNeighbor[i] = j; + } + if (d < g_MinDist[j]) + { + g_MinDist[j] = d; + g_uNearestNeighbor[j] = i; + } + } + } + +#if TRACE + Log("Initial state:\n"); + ListState(); +#endif + + for (g_uInternalNodeIndex = 0; g_uInternalNodeIndex < g_uLeafCount - 1; + ++g_uInternalNodeIndex) + { +#if TRACE + Log("\n"); + Log("Internal node index %5u\n", g_uInternalNodeIndex); + Log("-------------------------\n"); +#endif + + // Find nearest neighbors + unsigned Lmin = uInsane; + unsigned Rmin = uInsane; + dist_t dtMinDist = BIG_DIST; + for (unsigned j = 0; j < g_uLeafCount; ++j) + { + if (uInsane == g_uNodeIndex[j]) + continue; + + dist_t d = g_MinDist[j]; + if (d < dtMinDist) + { + dtMinDist = d; + Lmin = j; + Rmin = g_uNearestNeighbor[j]; + assert(uInsane != Rmin); + assert(uInsane != g_uNodeIndex[Rmin]); + } + } + + assert(Lmin != uInsane); + assert(Rmin != uInsane); + assert(dtMinDist != BIG_DIST); + +#if TRACE + Log("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n", + Lmin, + g_uNodeIndex[Lmin], + Rmin, + g_uNodeIndex[Rmin], + dtMinDist); +#endif + + // Compute distances to new node + // New node overwrites row currently assigned to Lmin + dist_t dtNewMinDist = BIG_DIST; + unsigned uNewNearestNeighbor = uInsane; + for (unsigned j = 0; j < g_uLeafCount; ++j) + { + if (j == Lmin || j == Rmin) + continue; + if (uInsane == g_uNodeIndex[j]) + continue; + + const unsigned vL = TriangleSubscript(Lmin, j); + const unsigned vR = TriangleSubscript(Rmin, j); + const dist_t dL = g_Dist[vL]; + const dist_t dR = g_Dist[vR]; + dist_t dtNewDist; + + switch (Linkage) + { + case LINKAGE_Avg: + dtNewDist = AVG(dL, dR); + break; + + case LINKAGE_Min: + dtNewDist = MIN(dL, dR); + break; + + case LINKAGE_Max: + dtNewDist = MAX(dL, dR); + break; + + case LINKAGE_Biased: + dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR); + break; + + default: + Quit("UPGMA2: Invalid LINKAGE_%u", Linkage); + } + + // Nasty special case. + // If nearest neighbor of j is Lmin or Rmin, then make the new + // node (which overwrites the row currently occupied by Lmin) + // the nearest neighbor. This situation can occur when there are + // equal distances in the matrix. If we don't make this fix, + // the nearest neighbor pointer for j would become invalid. + // (We don't need to test for == Lmin, because in that case + // the net change needed is zero due to the change in row + // numbering). + if (g_uNearestNeighbor[j] == Rmin) + g_uNearestNeighbor[j] = Lmin; + +#if TRACE + Log("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n", + j, Lmin, dL, Rmin, dR, dtNewDist); +#endif + g_Dist[vL] = dtNewDist; + if (dtNewDist < dtNewMinDist) + { + dtNewMinDist = dtNewDist; + uNewNearestNeighbor = j; + } + } + + assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist); + assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor); + + const unsigned v = TriangleSubscript(Lmin, Rmin); + const dist_t dLR = g_Dist[v]; + const dist_t dHeightNew = dLR/2; + const unsigned uLeft = g_uNodeIndex[Lmin]; + const unsigned uRight = g_uNodeIndex[Rmin]; + const dist_t HeightLeft = + uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount]; + const dist_t HeightRight = + uRight < g_uLeafCount ? 0 : g_Height[uRight - g_uLeafCount]; + + g_uLeft[g_uInternalNodeIndex] = uLeft; + g_uRight[g_uInternalNodeIndex] = uRight; + g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft; + g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight; + g_Height[g_uInternalNodeIndex] = dHeightNew; + + // Row for left child overwritten by row for new node + g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex; + g_uNearestNeighbor[Lmin] = uNewNearestNeighbor; + g_MinDist[Lmin] = dtNewMinDist; + + // Delete row for right child + g_uNodeIndex[Rmin] = uInsane; + +#if TRACE + Log("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n", + g_uInternalNodeIndex, Lmin, Rmin); + ListState(); +#endif + } + + unsigned uRoot = g_uLeafCount - 2; + tree.Create(g_uLeafCount, uRoot, g_uLeft, g_uRight, g_LeftLength, g_RightLength, + Ids, Names); + +#if TRACE + tree.LogMe(); +#endif + + delete[] g_Dist; + + delete[] g_uNodeIndex; + delete[] g_uNearestNeighbor; + delete[] g_MinDist; + delete[] g_Height; + + delete[] g_uLeft; + delete[] g_uRight; + delete[] g_LeftLength; + delete[] g_RightLength; + + for (unsigned i = 0; i < g_uLeafCount; ++i) + free(Names[i]); + delete[] Names; + delete[] Ids; + } + +class DistCalcTest : public DistCalc + { + virtual void CalcDistRange(unsigned i, dist_t Dist[]) const + { + static dist_t TestDist[5][5] = + { + 0, 2, 14, 14, 20, + 2, 0, 14, 14, 20, + 14, 14, 0, 4, 20, + 14, 14, 4, 0, 20, + 20, 20, 20, 20, 0, + }; + for (unsigned j = 0; j < i; ++j) + Dist[j] = TestDist[i][j]; + } + virtual unsigned GetCount() const + { + return 5; + } + virtual unsigned GetId(unsigned i) const + { + return i; + } + virtual const char *GetName(unsigned i) const + { + return "name"; + } + }; + +void Test() + { + SetListFileName("c:\\tmp\\lobster.log", false); + DistCalcTest DC; + Tree tree; + UPGMA2(DC, tree, LINKAGE_Avg); + }