/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ /* This the fast UPGMA algorithm (O(N^2)) as implemented in Bob Edgar's * Muscle (UPGMA2.cpp; version 3.7) ported to pure C. * * Muscle's code is public domain and so is this code here. * * From http://www.drive5.com/muscle/license.htm: * """ * MUSCLE is public domain software * * The MUSCLE software, including object and source code and * documentation, is hereby donated to the public domain. * * Disclaimer of warranty * THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND, * EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * """ * */ /* * RCS $Id: muscle_upgma.c 230 2011-04-09 15:37:50Z andreas $ * * * Notes: * ------ * LINKAGE become linkage_t here * * Replaced the the following member functions for DistCalc DC: * DC.GetId = sequence id as int * DC.GetName = sequence name * DC.GetCount = matrix dim * DC.DistRange = vector / matrix row for object i with index j #include #include #include "util.h" #include "log.h" #include "symmatrix.h" #include "muscle_tree.h" #include "muscle_upgma.h" /* from distcalc.h */ typedef float dist_t; static const dist_t BIG_DIST = (dist_t) 1e29; /* from muscle.h */ static const unsigned uInsane = 8888888; /*static inline*/ ulong TriangleSubscript(uint uIndex1, uint uIndex2); #define TRACE 0 #ifndef MIN #define MIN(x, y) ((x) < (y) ? (x) : (y)) #endif #ifndef MIN #define MAX(x, y) ((x) > (y) ? (x) : (y)) #endif #define AVG(x, y) (((x) + (y))/2) static uint g_uLeafCount; static ulong g_ulTriangleSize; static uint g_uInternalNodeCount; static uint g_uInternalNodeIndex; /* Triangular distance matrix is g_Dist, which is allocated * as a one-dimensional vector of length g_ulTriangleSize. * TriangleSubscript(i,j) maps row,column=i,j to the subscript * into this vector. * Row / column coordinates are a bit messy. * Initially they are leaf indexes 0..N-1. * But each time we create a new node (=new cluster, new subtree), * we re-use one of the two rows that become available (the children * of the new node). This saves memory. * We keep track of this through the g_uNodeIndex vector. */ static dist_t *g_Dist; /* Distance to nearest neighbor in row i of distance matrix. * Subscript is distance matrix row. */ static dist_t *g_MinDist; /* Nearest neighbor to row i of distance matrix. * Subscript is distance matrix row. */ static uint *g_uNearestNeighbor; /* Node index of row i in distance matrix. * Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes. * Subscript is distance matrix row. */ static uint *g_uNodeIndex; /* The following vectors are defined on internal nodes, * subscripts are internal node index 0..N-2. * For g_uLeft/Right, value is the node index 0 .. 2N-2 * because a child can be internal or leaf. */ static uint *g_uLeft; static uint *g_uRight; static dist_t *g_Height; static dist_t *g_LeftLength; static dist_t *g_RightLength; /*** CalcDistRange * * Imitation of DistCalc.DistRange * * Sets values of row (vector / matrix row) to distances for object i with index j= g_uLeafCount || uIndex2 >= g_uLeafCount) Log(&rLog, LOG_FATAL, "TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount); #endif if (uIndex1 >= uIndex2) v = uIndex2 + (uIndex1*(uIndex1 - 1))/2; else v = uIndex1 + (uIndex2*(uIndex2 - 1))/2; assert(v < (g_uLeafCount*(g_uLeafCount - 1))/2); return v; } #ifdef UNUSED static void ListState() { uint i, j; Info("Dist matrix\n"); Info(" "); for (i = 0; i < g_uLeafCount; ++i) { if (uInsane == g_uNodeIndex[i]) continue; Info(" %5u", g_uNodeIndex[i]); } Info("\n"); for (i = 0; i < g_uLeafCount; ++i) { if (uInsane == g_uNodeIndex[i]) continue; Info("%5u ", g_uNodeIndex[i]); for (j = 0; j < g_uLeafCount; ++j) { if (uInsane == g_uNodeIndex[j]) continue; if (i == j) Info(" "); else { ulong v = TriangleSubscript(i, j); Info("%5.2g ", g_Dist[v]); } } Info("\n"); } Info("\n"); Info(" i Node NrNb Dist\n"); Info("----- ----- ----- --------\n"); for (i = 0; i < g_uLeafCount; ++i) { if (uInsane == g_uNodeIndex[i]) continue; Info("%5u %5u %5u %8.3f\n", i, g_uNodeIndex[i], g_uNearestNeighbor[i], g_MinDist[i]); } Info("\n"); Info(" Node L R Height LLength RLength\n"); Info("----- ----- ----- ------ ------- -------\n"); for (i = 0; i <= g_uInternalNodeIndex; ++i) Info("%5u %5u %5u %6.2g %6.2g %6.2g\n", i, g_uLeft[i], g_uRight[i], g_Height[i], g_LeftLength[i], g_RightLength[i]); } #endif /* ifdef UNUSED */ /** * @brief Creates a UPGMA in O(N^2) tree from given distmat * * @param[out] tree * newly created rooted UPGMA tree * @param[in] distmat * distance matrix to be clustered * @param[in] linkage * linkage type * @param[in] names * leaf names, will be copied * * @note called UPGMA2() in Muscle3.7. * caller has to free with FreeMuscleTree() * * @see FreeMuscleTree() */ void MuscleUpgma2(tree_t *tree, symmatrix_t *distmat, linkage_t linkage, char **names) { int i, j; uint *Ids; /* only works on full symmetric matrices */ assert (distmat->nrows==distmat->ncols); g_uLeafCount = distmat->ncols; g_ulTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2; g_uInternalNodeCount = g_uLeafCount - 1; g_Dist = (dist_t *) CKMALLOC(g_ulTriangleSize * sizeof(dist_t)); g_uNodeIndex = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount); g_uNearestNeighbor = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount); g_MinDist = (dist_t *) CKMALLOC(sizeof(dist_t) * g_uLeafCount); Ids = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount); /* NOTE: we replaced Names with argument names */ /** * left and right node indices, as well as left and right * branch-length and height for for internal nodes */ g_uLeft = (uint*) CKMALLOC(sizeof(uint) * g_uInternalNodeCount); g_uRight = (uint*) CKMALLOC(sizeof(uint) * g_uInternalNodeCount); g_Height = (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount); g_LeftLength = (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount); g_RightLength = (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount); for (i = 0; i < g_uLeafCount; ++i) { g_MinDist[i] = BIG_DIST; g_uNodeIndex[i] = i; g_uNearestNeighbor[i] = uInsane; Ids[i] = i; } for (i = 0; i < g_uInternalNodeCount; ++i) { g_uLeft[i] = uInsane; g_uRight[i] = uInsane; g_LeftLength[i] = BIG_DIST; g_RightLength[i] = BIG_DIST; g_Height[i] = BIG_DIST; } /* Compute initial NxN triangular distance matrix. * Store minimum distance for each full (not triangular) row. * Loop from 1, not 0, because "row" is 0, 1 ... i-1, * so nothing to do when i=0. */ for (i = 1; i < g_uLeafCount; ++i) { dist_t *Row = g_Dist + TriangleSubscript(i, 0); CalcDistRange(distmat, i, Row); for (j = 0; j < i; ++j) { const dist_t d = Row[j]; if (d < g_MinDist[i]) { g_MinDist[i] = d; g_uNearestNeighbor[i] = j; } if (d < g_MinDist[j]) { g_MinDist[j] = d; g_uNearestNeighbor[j] = i; } } } #if TRACE Info("Initial state:\n"); ListState(); #endif for (g_uInternalNodeIndex = 0; g_uInternalNodeIndex < g_uLeafCount - 1; ++g_uInternalNodeIndex) { dist_t dtNewMinDist = BIG_DIST; uint uNewNearestNeighbor = uInsane; #if TRACE Info("\n"); Info("Internal node index %5u\n", g_uInternalNodeIndex); Info("-------------------------\n"); #endif /* Find nearest neighbors */ uint Lmin = uInsane; uint Rmin = uInsane; dist_t dtMinDist = BIG_DIST; for (j = 0; j < g_uLeafCount; ++j) { dist_t d; if (uInsane == g_uNodeIndex[j]) continue; d = g_MinDist[j]; if (d < dtMinDist) { dtMinDist = d; Lmin = j; Rmin = g_uNearestNeighbor[j]; assert(uInsane != Rmin); assert(uInsane != g_uNodeIndex[Rmin]); } } assert(Lmin != uInsane); assert(Rmin != uInsane); assert(dtMinDist != BIG_DIST); #if TRACE Info("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n", Lmin, g_uNodeIndex[Lmin], Rmin, g_uNodeIndex[Rmin], dtMinDist); #endif /* Compute distances to new node * New node overwrites row currently assigned to Lmin */ for ( j = 0; j < g_uLeafCount; ++j) { ulong vL, vR; dist_t dL, dR; dist_t dtNewDist; if (j == Lmin || j == Rmin) continue; if (uInsane == g_uNodeIndex[j]) continue; vL = TriangleSubscript(Lmin, j); vR = TriangleSubscript(Rmin, j); dL = g_Dist[vL]; dR = g_Dist[vR]; dtNewDist = 0.0; switch (linkage) { case LINKAGE_AVG: dtNewDist = AVG(dL, dR); break; case LINKAGE_MIN: dtNewDist = MIN(dL, dR); break; case LINKAGE_MAX: dtNewDist = MAX(dL, dR); break; /* couldn't be arsed to figure out proper usage of g_dSUEFF */ #if 0 case LINKAGE_BIASED: dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR); break; #endif default: Log(&rLog, LOG_FATAL, "UPGMA2: Invalid LINKAGE_%u", linkage); } /* Nasty special case. * If nearest neighbor of j is Lmin or Rmin, then make the new * node (which overwrites the row currently occupied by Lmin) * the nearest neighbor. This situation can occur when there are * equal distances in the matrix. If we don't make this fix, * the nearest neighbor pointer for j would become invalid. * (We don't need to test for == Lmin, because in that case * the net change needed is zero due to the change in row * numbering). */ if (g_uNearestNeighbor[j] == Rmin) g_uNearestNeighbor[j] = Lmin; #if TRACE Info("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n", j, Lmin, dL, Rmin, dR, dtNewDist); #endif g_Dist[vL] = dtNewDist; if (dtNewDist < dtNewMinDist) { dtNewMinDist = dtNewDist; uNewNearestNeighbor = j; } } assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist); assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor); const ulong v = TriangleSubscript(Lmin, Rmin); const dist_t dLR = g_Dist[v]; const dist_t dHeightNew = dLR/2; const uint uLeft = g_uNodeIndex[Lmin]; const uint uRight = g_uNodeIndex[Rmin]; const dist_t HeightLeft = uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount]; const dist_t HeightRight = uRight < g_uLeafCount ? 0 : g_Height[uRight - g_uLeafCount]; g_uLeft[g_uInternalNodeIndex] = uLeft; g_uRight[g_uInternalNodeIndex] = uRight; g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft; g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight; g_Height[g_uInternalNodeIndex] = dHeightNew; /* Row for left child overwritten by row for new node */ g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex; g_uNearestNeighbor[Lmin] = uNewNearestNeighbor; g_MinDist[Lmin] = dtNewMinDist; /* Delete row for right child */ g_uNodeIndex[Rmin] = uInsane; #if TRACE Info("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n", g_uInternalNodeIndex, Lmin, Rmin); ListState(); #endif } uint uRoot = g_uLeafCount - 2; #if TRACE Log(&rLog, LOG_FORCED_DEBUG, "uRoot=%d g_uLeafCount=%d g_uInternalNodeCount=%d", uRoot, g_uLeafCount, g_uInternalNodeCount); for (i=0; i