--- /dev/null
+/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+
+/* This the fast UPGMA algorithm (O(N^2)) as implemented in Bob Edgar's
+ * Muscle (UPGMA2.cpp; version 3.7) ported to pure C.
+ *
+ * Muscle's code is public domain and so is this code here.
+ *
+ * From http://www.drive5.com/muscle/license.htm:
+ * """
+ * MUSCLE is public domain software
+ *
+ * The MUSCLE software, including object and source code and
+ * documentation, is hereby donated to the public domain.
+ *
+ * Disclaimer of warranty
+ * THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
+ * EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * """
+ *
+ */
+
+/*
+ * RCS $Id: muscle_upgma.c 230 2011-04-09 15:37:50Z andreas $
+ *
+ *
+ * Notes:
+ * ------
+ * LINKAGE become linkage_t here
+ *
+ * Replaced the the following member functions for DistCalc DC:
+ * DC.GetId = sequence id as int
+ * DC.GetName = sequence name
+ * DC.GetCount = matrix dim
+ * DC.DistRange = vector / matrix row for object i with index j<i
+ *
+ * Log() has been replaced with Clustal's Info(), Quiet() with Log(&rLog, LOG_FATAL)
+ *
+ * Made TriangleSubscript() and g_ulTriangleSize ulong to prevent overflow for many sequences
+ */
+
+#ifndef ulint
+/* limit use of unsigned vars (see coding_style_guideline.txt) */
+typedef unsigned long int ulong;
+#endif
+
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "util.h"
+#include "log.h"
+#include "symmatrix.h"
+
+#include "muscle_tree.h"
+#include "muscle_upgma.h"
+
+/* from distcalc.h */
+typedef float dist_t;
+static const dist_t BIG_DIST = (dist_t) 1e29;
+/* from muscle.h */
+static const unsigned uInsane = 8888888;
+
+
+
+
+/*static inline*/
+ulong TriangleSubscript(uint uIndex1, uint uIndex2);
+
+
+
+
+#define TRACE 0
+
+#ifndef MIN
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+#endif
+#ifndef MIN
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+#endif
+#define AVG(x, y) (((x) + (y))/2)
+
+static uint g_uLeafCount;
+static ulong g_ulTriangleSize;
+static uint g_uInternalNodeCount;
+static uint g_uInternalNodeIndex;
+
+/* Triangular distance matrix is g_Dist, which is allocated
+ * as a one-dimensional vector of length g_ulTriangleSize.
+ * TriangleSubscript(i,j) maps row,column=i,j to the subscript
+ * into this vector.
+ * Row / column coordinates are a bit messy.
+ * Initially they are leaf indexes 0..N-1.
+ * But each time we create a new node (=new cluster, new subtree),
+ * we re-use one of the two rows that become available (the children
+ * of the new node). This saves memory.
+ * We keep track of this through the g_uNodeIndex vector.
+ */
+static dist_t *g_Dist;
+
+/* Distance to nearest neighbor in row i of distance matrix.
+ * Subscript is distance matrix row.
+ */
+static dist_t *g_MinDist;
+
+/* Nearest neighbor to row i of distance matrix.
+ * Subscript is distance matrix row.
+ */
+static uint *g_uNearestNeighbor;
+
+/* Node index of row i in distance matrix.
+ * Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes.
+ * Subscript is distance matrix row.
+ */
+static uint *g_uNodeIndex;
+
+/* The following vectors are defined on internal nodes,
+ * subscripts are internal node index 0..N-2.
+ * For g_uLeft/Right, value is the node index 0 .. 2N-2
+ * because a child can be internal or leaf.
+ */
+static uint *g_uLeft;
+static uint *g_uRight;
+static dist_t *g_Height;
+static dist_t *g_LeftLength;
+static dist_t *g_RightLength;
+
+
+/*** CalcDistRange
+ *
+ * Imitation of DistCalc.DistRange
+ *
+ * Sets values of row (vector / matrix row) to distances for object i with index j<i
+ *
+ * row must be preallocated
+ */
+void CalcDistRange(symmatrix_t *distmat, uint i, dist_t *row)
+{
+ uint j;
+ for (j = 0; j < i; ++j) {
+ row[j] = SymMatrixGetValue(distmat, i, j);
+ }
+}
+/* end of CalcDistRange */
+
+
+
+/*static inline*/
+ulong
+TriangleSubscript(uint uIndex1, uint uIndex2)
+{
+ ulong v;
+#ifndef NDEBUG
+ if (uIndex1 >= g_uLeafCount || uIndex2 >= g_uLeafCount)
+ Log(&rLog, LOG_FATAL, "TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount);
+#endif
+ if (uIndex1 >= uIndex2)
+ v = uIndex2 + (uIndex1*(uIndex1 - 1))/2;
+ else
+ v = uIndex1 + (uIndex2*(uIndex2 - 1))/2;
+ assert(v < (g_uLeafCount*(g_uLeafCount - 1))/2);
+ return v;
+}
+
+#ifdef UNUSED
+static void ListState()
+{
+ uint i, j;
+ Info("Dist matrix\n");
+ Info(" ");
+ for (i = 0; i < g_uLeafCount; ++i)
+ {
+ if (uInsane == g_uNodeIndex[i])
+ continue;
+ Info(" %5u", g_uNodeIndex[i]);
+ }
+ Info("\n");
+
+ for (i = 0; i < g_uLeafCount; ++i)
+ {
+ if (uInsane == g_uNodeIndex[i])
+ continue;
+ Info("%5u ", g_uNodeIndex[i]);
+ for (j = 0; j < g_uLeafCount; ++j)
+ {
+ if (uInsane == g_uNodeIndex[j])
+ continue;
+ if (i == j)
+ Info(" ");
+ else
+ {
+ ulong v = TriangleSubscript(i, j);
+ Info("%5.2g ", g_Dist[v]);
+ }
+ }
+ Info("\n");
+ }
+
+ Info("\n");
+ Info(" i Node NrNb Dist\n");
+ Info("----- ----- ----- --------\n");
+ for (i = 0; i < g_uLeafCount; ++i)
+ {
+ if (uInsane == g_uNodeIndex[i])
+ continue;
+ Info("%5u %5u %5u %8.3f\n",
+ i,
+ g_uNodeIndex[i],
+ g_uNearestNeighbor[i],
+ g_MinDist[i]);
+ }
+
+ Info("\n");
+ Info(" Node L R Height LLength RLength\n");
+ Info("----- ----- ----- ------ ------- -------\n");
+ for (i = 0; i <= g_uInternalNodeIndex; ++i)
+ Info("%5u %5u %5u %6.2g %6.2g %6.2g\n",
+ i,
+ g_uLeft[i],
+ g_uRight[i],
+ g_Height[i],
+ g_LeftLength[i],
+ g_RightLength[i]);
+}
+#endif
+/* ifdef UNUSED */
+
+/**
+ * @brief Creates a UPGMA in O(N^2) tree from given distmat
+ *
+ * @param[out] tree
+ * newly created rooted UPGMA tree
+ * @param[in] distmat
+ * distance matrix to be clustered
+ * @param[in] linkage
+ * linkage type
+ * @param[in] names
+ * leaf names, will be copied
+ *
+ * @note called UPGMA2() in Muscle3.7.
+ * caller has to free with FreeMuscleTree()
+ *
+ * @see FreeMuscleTree()
+ */
+void
+MuscleUpgma2(tree_t *tree, symmatrix_t *distmat, linkage_t linkage, char **names)
+{
+ int i, j;
+ uint *Ids;
+
+ /* only works on full symmetric matrices */
+ assert (distmat->nrows==distmat->ncols);
+
+ g_uLeafCount = distmat->ncols;
+ g_ulTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2;
+ g_uInternalNodeCount = g_uLeafCount - 1;
+
+ g_Dist = (dist_t *) CKMALLOC(g_ulTriangleSize * sizeof(dist_t));
+
+ g_uNodeIndex = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount);
+ g_uNearestNeighbor = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount);
+ g_MinDist = (dist_t *) CKMALLOC(sizeof(dist_t) * g_uLeafCount);
+ Ids = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount);
+ /* NOTE: we replaced Names with argument names */
+
+ /**
+ * left and right node indices, as well as left and right
+ * branch-length and height for for internal nodes
+ */
+ g_uLeft = (uint*) CKMALLOC(sizeof(uint) * g_uInternalNodeCount);
+ g_uRight = (uint*) CKMALLOC(sizeof(uint) * g_uInternalNodeCount);
+ g_Height = (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount);
+ g_LeftLength = (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount);
+ g_RightLength = (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount);
+
+ for (i = 0; i < g_uLeafCount; ++i) {
+ g_MinDist[i] = BIG_DIST;
+ g_uNodeIndex[i] = i;
+ g_uNearestNeighbor[i] = uInsane;
+ Ids[i] = i;
+ }
+
+ for (i = 0; i < g_uInternalNodeCount; ++i) {
+ g_uLeft[i] = uInsane;
+ g_uRight[i] = uInsane;
+ g_LeftLength[i] = BIG_DIST;
+ g_RightLength[i] = BIG_DIST;
+ g_Height[i] = BIG_DIST;
+ }
+
+/* Compute initial NxN triangular distance matrix.
+ * Store minimum distance for each full (not triangular) row.
+ * Loop from 1, not 0, because "row" is 0, 1 ... i-1,
+ * so nothing to do when i=0.
+ */
+ for (i = 1; i < g_uLeafCount; ++i) {
+ dist_t *Row = g_Dist + TriangleSubscript(i, 0);
+ CalcDistRange(distmat, i, Row);
+ for (j = 0; j < i; ++j) {
+ const dist_t d = Row[j];
+ if (d < g_MinDist[i]) {
+ g_MinDist[i] = d;
+ g_uNearestNeighbor[i] = j;
+ }
+ if (d < g_MinDist[j]) {
+ g_MinDist[j] = d;
+ g_uNearestNeighbor[j] = i;
+ }
+ }
+ }
+
+#if TRACE
+ Info("Initial state:\n");
+ ListState();
+#endif
+
+ for (g_uInternalNodeIndex = 0;
+ g_uInternalNodeIndex < g_uLeafCount - 1;
+ ++g_uInternalNodeIndex) {
+
+ dist_t dtNewMinDist = BIG_DIST;
+ uint uNewNearestNeighbor = uInsane;
+
+#if TRACE
+ Info("\n");
+ Info("Internal node index %5u\n", g_uInternalNodeIndex);
+ Info("-------------------------\n");
+#endif
+
+ /* Find nearest neighbors */
+ uint Lmin = uInsane;
+ uint Rmin = uInsane;
+ dist_t dtMinDist = BIG_DIST;
+ for (j = 0; j < g_uLeafCount; ++j) {
+ dist_t d;
+ if (uInsane == g_uNodeIndex[j])
+ continue;
+
+ d = g_MinDist[j];
+ if (d < dtMinDist) {
+ dtMinDist = d;
+ Lmin = j;
+ Rmin = g_uNearestNeighbor[j];
+ assert(uInsane != Rmin);
+ assert(uInsane != g_uNodeIndex[Rmin]);
+ }
+ }
+
+ assert(Lmin != uInsane);
+ assert(Rmin != uInsane);
+ assert(dtMinDist != BIG_DIST);
+
+#if TRACE
+ Info("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n",
+ Lmin,
+ g_uNodeIndex[Lmin],
+ Rmin,
+ g_uNodeIndex[Rmin],
+ dtMinDist);
+#endif
+
+ /* Compute distances to new node
+ * New node overwrites row currently assigned to Lmin
+ */
+ for ( j = 0; j < g_uLeafCount; ++j) {
+ ulong vL, vR;
+ dist_t dL, dR;
+ dist_t dtNewDist;
+
+ if (j == Lmin || j == Rmin)
+ continue;
+ if (uInsane == g_uNodeIndex[j])
+ continue;
+
+ vL = TriangleSubscript(Lmin, j);
+ vR = TriangleSubscript(Rmin, j);
+ dL = g_Dist[vL];
+ dR = g_Dist[vR];
+ dtNewDist = 0.0;
+
+ switch (linkage) {
+ case LINKAGE_AVG:
+ dtNewDist = AVG(dL, dR);
+ break;
+
+ case LINKAGE_MIN:
+ dtNewDist = MIN(dL, dR);
+ break;
+
+ case LINKAGE_MAX:
+ dtNewDist = MAX(dL, dR);
+ break;
+/* couldn't be arsed to figure out proper usage of g_dSUEFF */
+#if 0
+ case LINKAGE_BIASED:
+ dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR);
+ break;
+#endif
+ default:
+ Log(&rLog, LOG_FATAL, "UPGMA2: Invalid LINKAGE_%u", linkage);
+ }
+
+ /* Nasty special case.
+ * If nearest neighbor of j is Lmin or Rmin, then make the new
+ * node (which overwrites the row currently occupied by Lmin)
+ * the nearest neighbor. This situation can occur when there are
+ * equal distances in the matrix. If we don't make this fix,
+ * the nearest neighbor pointer for j would become invalid.
+ * (We don't need to test for == Lmin, because in that case
+ * the net change needed is zero due to the change in row
+ * numbering).
+ */
+ if (g_uNearestNeighbor[j] == Rmin)
+ g_uNearestNeighbor[j] = Lmin;
+
+#if TRACE
+ Info("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n",
+ j, Lmin, dL, Rmin, dR, dtNewDist);
+#endif
+ g_Dist[vL] = dtNewDist;
+ if (dtNewDist < dtNewMinDist) {
+ dtNewMinDist = dtNewDist;
+ uNewNearestNeighbor = j;
+ }
+ }
+
+ assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist);
+ assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor);
+
+ const ulong v = TriangleSubscript(Lmin, Rmin);
+ const dist_t dLR = g_Dist[v];
+ const dist_t dHeightNew = dLR/2;
+ const uint uLeft = g_uNodeIndex[Lmin];
+ const uint uRight = g_uNodeIndex[Rmin];
+ const dist_t HeightLeft =
+ uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount];
+ const dist_t HeightRight =
+ uRight < g_uLeafCount ? 0 : g_Height[uRight - g_uLeafCount];
+
+ g_uLeft[g_uInternalNodeIndex] = uLeft;
+ g_uRight[g_uInternalNodeIndex] = uRight;
+ g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft;
+ g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight;
+ g_Height[g_uInternalNodeIndex] = dHeightNew;
+
+ /* Row for left child overwritten by row for new node */
+ g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex;
+ g_uNearestNeighbor[Lmin] = uNewNearestNeighbor;
+ g_MinDist[Lmin] = dtNewMinDist;
+
+ /* Delete row for right child */
+ g_uNodeIndex[Rmin] = uInsane;
+
+#if TRACE
+ Info("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n",
+ g_uInternalNodeIndex, Lmin, Rmin);
+ ListState();
+#endif
+ }
+
+ uint uRoot = g_uLeafCount - 2;
+
+#if TRACE
+ Log(&rLog, LOG_FORCED_DEBUG, "uRoot=%d g_uLeafCount=%d g_uInternalNodeCount=%d", uRoot, g_uLeafCount, g_uInternalNodeCount);
+ for (i=0; i<g_uInternalNodeCount; i++) {
+ Log(&rLog, LOG_FORCED_DEBUG, "internal node=%d: g_uLeft=%d g_uRight=%d g_LeftLength=%f g_RightLength=%f g_Height=%f",
+ i, g_uLeft[i], g_uRight[i],
+ g_LeftLength[i], g_RightLength[i],
+ g_Height[i]);
+ }
+ for (i=0; i<g_uLeafCount; i++) {
+ Log(&rLog, LOG_FORCED_DEBUG, "leaf node=%d: Ids=%d names=%s",
+ i, Ids[i], names[i]);
+ }
+#endif
+
+ MuscleTreeCreate(tree, g_uLeafCount, uRoot,
+ g_uLeft, g_uRight,
+ g_LeftLength, g_RightLength,
+ Ids, names);
+#if TRACE
+ tree.LogMe();
+#endif
+
+ free(g_Dist);
+
+ free(g_uNodeIndex);
+ free(g_uNearestNeighbor);
+ free(g_MinDist);
+ free(g_Height);
+
+ free(g_uLeft);
+ free(g_uRight);
+ free(g_LeftLength);
+ free(g_RightLength);
+
+ /* NOTE: Muscle's "Names" variable is here the argument "names" */
+ free(Ids);
+}
+/*** end of UPGMA2 ***/