Delete unneeded directory
[jabaws.git] / website / archive / binaries / mac / src / clustalo / src / clustal / muscle_upgma.c
diff --git a/website/archive/binaries/mac/src/clustalo/src/clustal/muscle_upgma.c b/website/archive/binaries/mac/src/clustalo/src/clustal/muscle_upgma.c
deleted file mode 100644 (file)
index 7e4b5df..0000000
+++ /dev/null
@@ -1,502 +0,0 @@
-/* -*- mode: c; tab-width: 4; c-basic-offset: 4;  indent-tabs-mode: nil -*- */
-
-/* This the fast UPGMA algorithm (O(N^2)) as implemented in Bob Edgar's
- * Muscle (UPGMA2.cpp; version 3.7) ported to pure C.
- *
- * Muscle's code is public domain and so is this code here.
- *
- * From http://www.drive5.com/muscle/license.htm:
- * """
- * MUSCLE is public domain software
- *
- * The MUSCLE software, including object and source code and
- * documentation, is hereby donated to the public domain.
- *
- * Disclaimer of warranty
- * THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND,
- * EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * """
- *
- */
-
-/*
- *  RCS $Id: muscle_upgma.c 230 2011-04-09 15:37:50Z andreas $
- *
- *
- * Notes:
- * ------
- * LINKAGE become linkage_t here
- *
- * Replaced the the following member functions for DistCalc DC:
- * DC.GetId = sequence id as int
- * DC.GetName = sequence name
- * DC.GetCount = matrix dim
- * DC.DistRange = vector / matrix row for object i with index j<i
- *
- * Log() has been replaced with Clustal's Info(), Quiet() with Log(&rLog, LOG_FATAL)
- *
- * Made TriangleSubscript() and g_ulTriangleSize ulong to prevent overflow for many sequences
- */
-
-#ifndef ulint
-/* limit use of unsigned vars (see coding_style_guideline.txt) */
-typedef unsigned long int ulong;
-#endif
-
-
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <assert.h>
-
-#include "util.h"
-#include "log.h"
-#include "symmatrix.h"
-
-#include "muscle_tree.h"
-#include "muscle_upgma.h"
-
-/* from distcalc.h */
-typedef float dist_t;
-static const dist_t BIG_DIST = (dist_t) 1e29;
-/* from muscle.h */
-static const unsigned uInsane = 8888888;
-
-
-
-
-/*static inline*/
-ulong TriangleSubscript(uint uIndex1, uint uIndex2);
-
-
-
-
-#define TRACE   0
-
-#ifndef MIN
-#define MIN(x, y)   ((x) < (y) ? (x) : (y))
-#endif
-#ifndef MIN
-#define MAX(x, y)   ((x) > (y) ? (x) : (y))
-#endif
-#define AVG(x, y)   (((x) + (y))/2)
-
-static uint g_uLeafCount;
-static ulong g_ulTriangleSize;
-static uint g_uInternalNodeCount;
-static uint g_uInternalNodeIndex;
-
-/* Triangular distance matrix is g_Dist, which is allocated
- * as a one-dimensional vector of length g_ulTriangleSize.
- * TriangleSubscript(i,j) maps row,column=i,j to the subscript
- * into this vector.
- * Row / column coordinates are a bit messy.
- * Initially they are leaf indexes 0..N-1.
- * But each time we create a new node (=new cluster, new subtree),
- * we re-use one of the two rows that become available (the children
- * of the new node). This saves memory.
- * We keep track of this through the g_uNodeIndex vector.
- */
-static dist_t *g_Dist;
-
-/* Distance to nearest neighbor in row i of distance matrix.
- * Subscript is distance matrix row.
- */
-static dist_t *g_MinDist;
-
-/* Nearest neighbor to row i of distance matrix.
- * Subscript is distance matrix row.
- */
-static uint *g_uNearestNeighbor;
-
-/* Node index of row i in distance matrix.
- * Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes.
- * Subscript is distance matrix row.
- */
-static uint *g_uNodeIndex;
-
-/* The following vectors are defined on internal nodes,
- * subscripts are internal node index 0..N-2.
- * For g_uLeft/Right, value is the node index 0 .. 2N-2
- * because a child can be internal or leaf.
- */
-static uint *g_uLeft;
-static uint *g_uRight;
-static dist_t *g_Height;
-static dist_t *g_LeftLength;
-static dist_t *g_RightLength;
-
-
-/***   CalcDistRange
- *
- * Imitation of DistCalc.DistRange
- *
- * Sets values of row (vector / matrix row) to distances for object i with index j<i
- *
- * row must be preallocated
- */
-void CalcDistRange(symmatrix_t *distmat, uint i, dist_t *row)
-{
-    uint j;
-    for (j = 0; j < i; ++j) {
-        row[j] = SymMatrixGetValue(distmat, i, j);
-    }
-}
-/* end of CalcDistRange */
-
-
-
-/*static inline*/
-ulong
-TriangleSubscript(uint uIndex1, uint uIndex2)
-{
-    ulong v;
-#ifndef NDEBUG
-    if (uIndex1 >= g_uLeafCount || uIndex2 >= g_uLeafCount)
-        Log(&rLog, LOG_FATAL, "TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount);
-#endif
-    if (uIndex1 >= uIndex2)
-        v = uIndex2 + (uIndex1*(uIndex1 - 1))/2;
-    else
-        v = uIndex1 + (uIndex2*(uIndex2 - 1))/2;
-    assert(v < (g_uLeafCount*(g_uLeafCount - 1))/2);
-    return v;
-}
-
-#ifdef UNUSED
-static void ListState()
-{
-    uint i, j;
-    Info("Dist matrix\n");
-    Info("     ");
-    for (i = 0; i < g_uLeafCount; ++i)
-    {
-        if (uInsane == g_uNodeIndex[i])
-            continue;
-        Info("  %5u", g_uNodeIndex[i]);
-    }
-    Info("\n");
-
-    for (i = 0; i < g_uLeafCount; ++i)
-    {
-        if (uInsane == g_uNodeIndex[i])
-            continue;
-        Info("%5u  ", g_uNodeIndex[i]);
-        for (j = 0; j < g_uLeafCount; ++j)
-        {
-            if (uInsane == g_uNodeIndex[j])
-                continue;
-            if (i == j)
-                Info("       ");
-            else
-            {
-                ulong v = TriangleSubscript(i, j);
-                Info("%5.2g  ", g_Dist[v]);
-            }
-        }
-        Info("\n");
-    }
-
-    Info("\n");
-    Info("    i   Node   NrNb      Dist\n");
-    Info("-----  -----  -----  --------\n");
-    for (i = 0; i < g_uLeafCount; ++i)
-    {
-        if (uInsane == g_uNodeIndex[i])
-            continue;
-        Info("%5u  %5u  %5u  %8.3f\n",
-             i,
-             g_uNodeIndex[i],
-             g_uNearestNeighbor[i],
-             g_MinDist[i]);
-    }
-
-    Info("\n");
-    Info(" Node      L      R  Height  LLength  RLength\n");
-    Info("-----  -----  -----  ------  -------  -------\n");
-    for (i = 0; i <= g_uInternalNodeIndex; ++i)
-        Info("%5u  %5u  %5u  %6.2g  %6.2g  %6.2g\n",
-             i,
-             g_uLeft[i],
-             g_uRight[i],
-             g_Height[i],
-             g_LeftLength[i],
-             g_RightLength[i]);
-}
-#endif
-/* ifdef UNUSED */
-
-/**
- * @brief Creates a UPGMA in O(N^2) tree from given distmat
- *
- * @param[out] tree
- * newly created rooted UPGMA tree
- * @param[in] distmat
- * distance matrix to be clustered
- * @param[in] linkage
- * linkage type
- * @param[in] names
- * leaf names, will be copied
- *
- * @note called UPGMA2() in Muscle3.7.
- * caller has to free with FreeMuscleTree()
- *
- * @see FreeMuscleTree()
- */
-void
-MuscleUpgma2(tree_t *tree, symmatrix_t *distmat, linkage_t linkage, char **names)
-{
-    int i, j;
-    uint *Ids;
-
-    /* only works on full symmetric matrices */
-    assert (distmat->nrows==distmat->ncols);
-   
-    g_uLeafCount = distmat->ncols;    
-    g_ulTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2;
-    g_uInternalNodeCount = g_uLeafCount - 1;
-
-    g_Dist = (dist_t *) CKMALLOC(g_ulTriangleSize * sizeof(dist_t));
-
-    g_uNodeIndex = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount);
-    g_uNearestNeighbor = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount);
-    g_MinDist = (dist_t *) CKMALLOC(sizeof(dist_t) * g_uLeafCount);
-    Ids = (uint*) CKMALLOC(sizeof(uint) * g_uLeafCount);
-    /* NOTE: we replaced Names with argument names */
-
-    /**
-     * left and right node indices, as well as left and right
-     * branch-length and height for for internal nodes
-     */
-    g_uLeft =  (uint*) CKMALLOC(sizeof(uint) * g_uInternalNodeCount);
-    g_uRight =  (uint*) CKMALLOC(sizeof(uint) * g_uInternalNodeCount);
-    g_Height =  (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount);
-    g_LeftLength =  (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount);
-    g_RightLength =  (dist_t*) CKMALLOC(sizeof(dist_t) * g_uInternalNodeCount);
-    
-    for (i = 0; i < g_uLeafCount; ++i) {
-        g_MinDist[i] = BIG_DIST;
-        g_uNodeIndex[i] = i;
-        g_uNearestNeighbor[i] = uInsane;
-        Ids[i] = i;
-    }
-    
-    for (i = 0; i < g_uInternalNodeCount; ++i) {
-        g_uLeft[i] = uInsane;
-        g_uRight[i] = uInsane;
-        g_LeftLength[i] = BIG_DIST;
-        g_RightLength[i] = BIG_DIST;
-        g_Height[i] = BIG_DIST;
-    }
-    
-/* Compute initial NxN triangular distance matrix.
- * Store minimum distance for each full (not triangular) row.
- * Loop from 1, not 0, because "row" is 0, 1 ... i-1,
- * so nothing to do when i=0.
- */
-    for (i = 1; i < g_uLeafCount; ++i) {
-        dist_t *Row = g_Dist + TriangleSubscript(i, 0);
-        CalcDistRange(distmat, i, Row);
-        for (j = 0; j < i; ++j) {
-            const dist_t d = Row[j];
-            if (d < g_MinDist[i]) {
-                g_MinDist[i] = d;
-                g_uNearestNeighbor[i] = j;
-            }
-            if (d < g_MinDist[j]) {
-                g_MinDist[j] = d;
-                g_uNearestNeighbor[j] = i;
-            }
-        }
-    }
-
-#if TRACE
-    Info("Initial state:\n");
-    ListState();
-#endif
-
-    for (g_uInternalNodeIndex = 0;
-         g_uInternalNodeIndex < g_uLeafCount - 1;
-         ++g_uInternalNodeIndex) {
-
-        dist_t dtNewMinDist = BIG_DIST;
-        uint uNewNearestNeighbor = uInsane;
-
-#if TRACE
-        Info("\n");
-        Info("Internal node index %5u\n", g_uInternalNodeIndex);
-        Info("-------------------------\n");
-#endif
-
-        /* Find nearest neighbors */
-        uint Lmin = uInsane;
-        uint Rmin = uInsane;
-        dist_t dtMinDist = BIG_DIST;
-        for (j = 0; j < g_uLeafCount; ++j) {
-            dist_t d;
-            if (uInsane == g_uNodeIndex[j])
-                continue;
-
-            d = g_MinDist[j];
-            if (d < dtMinDist) {
-                dtMinDist = d;
-                Lmin = j;
-                Rmin = g_uNearestNeighbor[j];
-                assert(uInsane != Rmin);
-                assert(uInsane != g_uNodeIndex[Rmin]);
-            }
-        }
-
-        assert(Lmin != uInsane);
-        assert(Rmin != uInsane);
-        assert(dtMinDist != BIG_DIST);
-
-#if TRACE
-        Info("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n",
-             Lmin,
-             g_uNodeIndex[Lmin],
-             Rmin,
-             g_uNodeIndex[Rmin],
-             dtMinDist);
-#endif
-
-        /* Compute distances to new node
-         * New node overwrites row currently assigned to Lmin
-         */
-        for ( j = 0; j < g_uLeafCount; ++j) {
-            ulong vL, vR;
-            dist_t dL, dR;
-            dist_t dtNewDist;
-            
-            if (j == Lmin || j == Rmin)
-                continue;
-            if (uInsane == g_uNodeIndex[j])
-                continue;
-
-            vL = TriangleSubscript(Lmin, j);
-            vR = TriangleSubscript(Rmin, j);
-            dL = g_Dist[vL];
-            dR = g_Dist[vR];
-            dtNewDist = 0.0;
-
-            switch (linkage) {
-            case LINKAGE_AVG:
-                dtNewDist = AVG(dL, dR);
-                break;
-
-            case LINKAGE_MIN:
-                dtNewDist = MIN(dL, dR);
-                break;
-
-            case LINKAGE_MAX:
-                dtNewDist = MAX(dL, dR);
-                break;
-/* couldn't be arsed to figure out proper usage of g_dSUEFF */
-#if 0
-            case LINKAGE_BIASED:
-                dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR);
-                break;
-#endif
-            default:
-                Log(&rLog, LOG_FATAL, "UPGMA2: Invalid LINKAGE_%u", linkage);
-            }
-
-            /* Nasty special case.
-             * If nearest neighbor of j is Lmin or Rmin, then make the new
-             * node (which overwrites the row currently occupied by Lmin)
-             * the nearest neighbor. This situation can occur when there are
-             * equal distances in the matrix. If we don't make this fix,
-             * the nearest neighbor pointer for j would become invalid.
-             * (We don't need to test for == Lmin, because in that case
-             * the net change needed is zero due to the change in row
-             * numbering).
-             */
-            if (g_uNearestNeighbor[j] == Rmin)
-                g_uNearestNeighbor[j] = Lmin;
-
-#if TRACE
-            Info("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n",
-                 j, Lmin, dL, Rmin, dR, dtNewDist);
-#endif
-            g_Dist[vL] = dtNewDist;
-            if (dtNewDist < dtNewMinDist) {
-                dtNewMinDist = dtNewDist;
-                uNewNearestNeighbor = j;
-            }
-        }
-
-        assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist);
-        assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor);
-
-        const ulong v = TriangleSubscript(Lmin, Rmin);
-        const dist_t dLR = g_Dist[v];
-        const dist_t dHeightNew = dLR/2;
-        const uint uLeft = g_uNodeIndex[Lmin];
-        const uint uRight = g_uNodeIndex[Rmin];
-        const dist_t HeightLeft =
-            uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount];
-        const dist_t HeightRight =
-            uRight < g_uLeafCount ? 0 : g_Height[uRight - g_uLeafCount];
-
-        g_uLeft[g_uInternalNodeIndex] = uLeft;
-        g_uRight[g_uInternalNodeIndex] = uRight;
-        g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft;
-        g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight;
-        g_Height[g_uInternalNodeIndex] = dHeightNew;
-
-        /* Row for left child overwritten by row for new node */
-        g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex;
-        g_uNearestNeighbor[Lmin] = uNewNearestNeighbor;
-        g_MinDist[Lmin] = dtNewMinDist;
-
-        /* Delete row for right child */
-        g_uNodeIndex[Rmin] = uInsane;
-
-#if TRACE
-        Info("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n",
-             g_uInternalNodeIndex, Lmin, Rmin);
-        ListState();
-#endif
-    }
-
-    uint uRoot = g_uLeafCount - 2;
-
-#if TRACE
-    Log(&rLog, LOG_FORCED_DEBUG, "uRoot=%d g_uLeafCount=%d g_uInternalNodeCount=%d", uRoot, g_uLeafCount, g_uInternalNodeCount);
-    for (i=0; i<g_uInternalNodeCount; i++) {
-        Log(&rLog, LOG_FORCED_DEBUG, "internal node=%d:  g_uLeft=%d g_uRight=%d g_LeftLength=%f g_RightLength=%f g_Height=%f",
-                  i, g_uLeft[i], g_uRight[i],
-                  g_LeftLength[i], g_RightLength[i],
-                  g_Height[i]);
-    }
-    for (i=0; i<g_uLeafCount; i++) {
-        Log(&rLog, LOG_FORCED_DEBUG, "leaf node=%d:  Ids=%d names=%s",
-                  i, Ids[i], names[i]);
-    }
-#endif
-    
-    MuscleTreeCreate(tree, g_uLeafCount, uRoot,
-                      g_uLeft, g_uRight,
-                      g_LeftLength, g_RightLength,
-                      Ids, names);
-#if TRACE
-    tree.LogMe();
-#endif
-
-    free(g_Dist);
-
-    free(g_uNodeIndex);
-    free(g_uNearestNeighbor);
-    free(g_MinDist);
-    free(g_Height);
-
-    free(g_uLeft);
-    free(g_uRight);
-    free(g_LeftLength);
-    free(g_RightLength);
-
-    /* NOTE: Muscle's "Names" variable is here the argument "names" */
-    free(Ids);
-}
-/***   end of UPGMA2   ***/