Delete unneeded directory

[jabaws.git] / website / archive / binaries / mac / src / clustalo / src / clustal / mbed.c
diff --git a/website/archive/binaries/mac/src/clustalo/src/clustal/mbed.c b/website/archive/binaries/mac/src/clustalo/src/clustal/mbed.c

deleted file mode 100644 (file)

index f143fe0..0000000
--- a/website/archive/binaries/mac/src/clustalo/src/clustal/mbed.c
+++ /dev/null
@@ -1,1574 +0,0 @@
-/* -*- mode: c; tab-width: 4; c-basic-offset: 4;  indent-tabs-mode: nil -*- */
-
-/*********************************************************************
- * Clustal Omega - Multiple sequence alignment
- *
- * Copyright (C) 2010 University College Dublin
- *
- * Clustal-Omega is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This file is part of Clustal-Omega.
- *
- ********************************************************************/
-
-/*
- *  RCS $Id: mbed.c 254 2011-06-21 13:07:50Z andreas $
- *
- *
- * Reimplementation from scratch of mBed:
- * Blackshields et al. (2010); PMID 20470396
- *
- *
- */
-
-#ifdef HAVE_CONFIG_H
-#include "config.h"
-#endif
-
-#include <assert.h>
-#include <float.h>
-
-#include "mbed.h"
-#include "pair_dist.h"
-#include "symmatrix.h"
-#include "ktuple_pair.h"
-#include "tree.h"
-#include "util.h"
-#include "progress.h"
-#include "queue.h"
-
-#include "log.h"
-#include "kmpp/KMeans.h"
-#include "mbed.h"
-
-#define TIMING 0
-#if TIMING
-#include "squid/stopwatch.h"
-#endif
-
-
-/* If FULL_WITHIN_CLUSTER_DISTANCES is not 0, distances within each
- * bisecting kmeans subcluster are not estimated using the vectors,
- * but calculated normally (using ktuple or kimura). Surprisingly this
- * results in 3% loss on a Homfam p24-h2010-08-09 subset (100-5000
- * seqs in test, at least 5 ref seqs; MAX_SEQ 100 vs 10000; NUM_SEEDS
- * log2 instead of log2^2). And of course it slows things down.
- */
-#define FULL_WITHIN_CLUSTER_DISTANCES 1
-
-#define COMPUTE_WITHIN_SUBCLUSTER_AVERAGE 0
-
-/* Cluster size limits. Maximum is a soft limit, which might be
- * exceeded if a K-Means split was unsuccesful, where unsuccesful
- * might also mean that the minimum required number seqs. was not
- * reached */
-#if FULL_WITHIN_CLUSTER_DISTANCES
-static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 100;
-static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 1;
-#else
-static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 10000;
-static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 100;
-#endif
-
-/* How many restarts per bisecting kmeans split. 10 give 0.5% increase
- * in quality on original HOMFAM over just 1. It also increases kmeans
- * time by a factor of ~3, but overall time is insignificant
- * compared to pairdist/progressive alignment part.
- */
-static const int RESTARTS_PER_SPLIT = 10;
-
-
-/* Use standard kmeans (lloyds) or kmeans++. Both are almost
- * indistinguishable here, although kmeans++ is slightly ahead the
- * fewer seeds you pick (and it should be better in theory)
- */
-#define USE_KMEANS_LLOYDS 0
-
-
-#ifndef HAVE_LOG2
-#define log2(x)  (log(x) / 0.69314718055994530942)
-#endif
-#define NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2)
-
-
-/* Seed selection method: SAMPLE_SEEDS_BY_LENGTH is the original mBed
- * approach: Sample iSeeds sequence with constant stride from length-sorted X.
- * It might be better to pick the seeds randomly, because length sorting might
- * be more prone to including outliers (e.g. very long and very short seqs).
- * However, we could never observer such a thing. So just stick to the
- * original version as this also removes the random element.
- */
-enum SEED_SELECTION_TYPE {
-    SELECT_SEEDS_RANDOMLY,
-    SELECT_SEEDS_BY_LENGTH
-};
-#define SEED_SELECTION SELECT_SEEDS_BY_LENGTH
-
-
-/* Tests on BAliBase (RV11,12,20,30,40,50; 10 runs each) show there is
- * no difference between mbed-trees created from cosine or euclidean
- * distances (simple version, just using disparities).
- */
-#define USE_EUCLIDEAN_DISTANCE 1
-
-
-/* print some mbed pre-cluster usage to screen */
-#define PRINT_CLUSTER_DISTRIBUTION 0
-
-
-#define TRACE 0
-
-
-typedef struct {
-    /* Number of final clusters
-     */
-    int iNClusters;
-
-    /* Coordinates (columns) for each cluster (rows)
-     * valid indices: [0...iNClusters][0...dim-1]
-     */
-    double **ppdClusterCenters;
-
-    /* Dimensionality of input data and cluster center coordinates
-     */
-    int iDim;
-
-    /* Number of objects per cluster
-     */
-    int *piNObjsPerCluster;
-
-    /* Objects (indices) for each cluster. [i][j] will point to (index
-     * of) object j in cluster i
-     */
-    int **ppiObjIndicesPerCluster;
-} bisecting_kmeans_result_t;
-
-
-
-/**
- * @brief Free KMeans result structure.
- *
- * @param[out]  prResult_p
- * K-Means result to free
- *
- * @see NewKMeansResult()
- */
-void
-FreeKMeansResult(bisecting_kmeans_result_t **prResult_p)
-{
-    int iAux;
-    CKFREE((*prResult_p)->piNObjsPerCluster);
-    for (iAux=0; iAux<(*prResult_p)->iNClusters; iAux++) {
-        CKFREE((*prResult_p)->ppiObjIndicesPerCluster[iAux]);
-        CKFREE((*prResult_p)->ppdClusterCenters[iAux]);
-    }
-    CKFREE((*prResult_p)->ppiObjIndicesPerCluster);
-    CKFREE((*prResult_p)->ppdClusterCenters);
-    (*prResult_p)->iNClusters = 0;
-    (*prResult_p)->iDim = 0;
-    CKFREE(*prResult_p);
-}
-/***   end: FreeKMeansResult()   ***/
-
-
-
-/**
- * @brief Allocate new KMeans result
- *
- * @param[out] prKMeansResult_p
- * K-Means result to free
- *
- * @see NewKMeansResult()
- */
-void
-NewKMeansResult(bisecting_kmeans_result_t **prKMeansResult_p)
-{
-    (*prKMeansResult_p) = (bisecting_kmeans_result_t *)
-        CKCALLOC(1, sizeof(bisecting_kmeans_result_t));
-    (*prKMeansResult_p)->iNClusters = 0;
-    (*prKMeansResult_p)->iDim = 0;
-    (*prKMeansResult_p)->ppdClusterCenters = NULL;
-    (*prKMeansResult_p)->piNObjsPerCluster = NULL;
-    (*prKMeansResult_p)->ppiObjIndicesPerCluster = NULL;
-
-}
-/***   end: NewKMeansResult()   ***/
-
-
-
-/**
- * @brief Calculate the euclidean distance between two vectors
- *
- * @param[in] v1
- * First vector with dim dimensions
- * @param[in] v2
- * Second vector with dim dimensions
- * @param[in] dim
- * Dimensionality of v1 and v2
- *
- * @return euclidean distance as double
- *
- * @note Could probably be inlined. Also perfect for SSE
- */
-double
-EuclDist(const double *v1, const double *v2, const int dim)
-{
-    int i; /* aux */
-    double dist; /* returned distance */
-
-    assert(v1!=NULL);
-    assert(v2!=NULL);
-
-    dist = 0.0;
-    for (i=0; i<dim; i++) {
-        dist += pow(v1[i]-v2[i], 2.0);
-    }
-
-    return sqrt(dist);
-}
-/***   end: EuclDist()   ***/
-
-
-
-/**
- * @brief Calculate the cosine distance between two vectors
- *
- * @param[in] v1
- * First vector with dim dimensions
- * @param[in] v2
- * Second vector with dim dimensions
- * @param[in] dim
- * Dimensionality of v1 and v2
- *
- * @return cosine distance as double
- *
- * @note Could probably be inlined. Also perfect for SSE
- */
-double
-CosDist(const double *v1, const double *v2, const int dim)
-{
-    int i; /* aux */
-    double dist; /* returned distance */
-    double s, sq1, sq2;
-
-    assert(v1!=NULL);
-    assert(v2!=NULL);
-
-    s = 0.0;
-    sq1 = 0.0;
-    sq2 = 0.0;
-    for (i=0; i<dim; i++) {
-        s += (v1[i]*v2[i]);
-        sq1 += pow(v1[i], 2.0);
-        sq2 += pow(v2[i], 2.0);
-    }
-    sq1 = sqrt(sq1);
-    sq2 = sqrt(sq2);
-
-    if ((sq1 * sq2) < DBL_EPSILON) {
-        dist = 1 - s / (sq1 * sq2);
-    } else {
-        /* if the denominator gets small, the fraction gets big, hence dist
-           gets small: */
-        dist = 0.0;
-    }
-    return dist;
-}
-/***   end: CosDist()   ***/
-
-
-
-/**
- * @brief convert sequences into mbed-like (distance) vector
- * representation. Seeds (prMSeq sequence indices) have to be picked before
- *
- * @param[out] ppdSeqVec
- * Pointer to preallocated matrix of size nseqs x iSeeds
- * @param[in] prMSeq
- * Sequences which are to be converted
- * @param[in] piSeeds
- * Array of sequences in prMSeq which are to be used as seeds.
- * @param[in] iNumSeeds
- * Number of seeds/elements in piSeeds
- * @param[in] iPairDistType
- * Type of pairwise distance comparison
- *
- */
-int
-SeqToVec(double **ppdSeqVec, mseq_t *prMSeq, 
-         int *piSeeds, const int iNumSeeds,
-         const int iPairDistType)
-{
-    symmatrix_t *prDistmat = NULL;
-    int iSeqIndex;
-    int iSeedIndex;
-   /* indices for restoring order */
-    int *restore; 
-    /* sorted copy of piSeeds */
-    int *piSortedSeeds; 
-
-#if TIMING
-    Stopwatch_t *stopwatch = StopwatchCreate();
-    StopwatchZero(stopwatch);
-    StopwatchStart(stopwatch);
-#endif
-    
-    assert(NULL != ppdSeqVec);
-    assert(iNumSeeds>0 && iNumSeeds<=prMSeq->nseqs);
-
-    piSortedSeeds = CKMALLOC(iNumSeeds * sizeof(int));
-    memcpy(piSortedSeeds, piSeeds, iNumSeeds*sizeof(int));
-
-    /* need them sorted, otherwise the swapping approach below might
-     * break 
-     */
-    qsort(piSortedSeeds, iNumSeeds, sizeof(int), IntCmp);
-
-
-    /* rearrange mseq order so that we can use ktuple_pairdist code as
-     * is. That is, swap seeds and non-seeds such that the seeds end
-     * up in front of mseq. This way we can use the KTuplePairDist()
-     * code, without making a copy of mseq. Also, keep an array of
-     * indices which serves to restore the original sequence order
-     * after putting the seeds in front
-     *
-     */
-    restore = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
-    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
-        restore[iSeqIndex] = iSeqIndex;
-    }
-    for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
-#if TRACE
-        Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %u", piSortedSeeds[iSeedIndex], iSeedIndex);
-#endif
-        if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
-            int swap;
-            SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
-
-            swap = restore[iSeedIndex];
-            restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
-            restore[piSortedSeeds[iSeedIndex]] = swap;
-        }
-    }
-#if TRACE
-    printf("DEBUG(%s|%s():%d): restore indices =",
-           __FILE__, __FUNCTION__, __LINE__);
-    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
-        printf(" %u:%u", iSeqIndex, restore[iSeqIndex]);
-    }
-    printf("\n");
-
-    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
-        Log(&rLog, LOG_FORCED_DEBUG, "swapped seq no %d now:  seq = %s",
-                  iSeqIndex, prMSeq->sqinfo[iSeqIndex].name);
-    }
-#endif
-
-
-    /* convert sequences into vectors of distances by calling pairwise
-     * distance function for each seq/seed pair
-     *
-     */
-    if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType,
-                           0, iNumSeeds, 0, prMSeq->nseqs,
-                           NULL, NULL)) {
-        Log(&rLog, LOG_ERROR, "Could not compute pairwise distances for mbed.");
-        FreeSymMatrix(&prDistmat);
-        CKFREE(piSortedSeeds);
-        CKFREE(restore);
-        return -1;
-    }
-#if TRACE
-    {
-        char **labels;
-        labels = (char **) CKMALLOC(prMSeq->nseqs * sizeof(char *));
-        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
-            labels[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].name;
-        }
-        SymMatrixPrint(prDistmat, labels, NULL);
-        CKFREE(labels);
-    }
-#endif
-
-
-    /* update ppdSeqVec according to prDistmat. keep in mind that we
-     * changed mseq order
-     *
-     */
-    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
-        for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
-            ppdSeqVec[restore[iSeqIndex]][iSeedIndex] =
-                SymMatrixGetValue(prDistmat, iSeqIndex, iSeedIndex);
-        }
-    }
-
-
-    /* restore mseq order by reverse swapping
-     *
-     * need reverse order now, so start from top index
-     */
-    iSeedIndex=iNumSeeds-1;
-    do {
-#if TRACE
-        Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %d", piSortedSeeds[iSeedIndex], iSeedIndex);
-#endif
-        if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
-            int swap;
-
-            SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
-
-            swap = restore[iSeedIndex];
-            restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
-            restore[piSortedSeeds[iSeedIndex]] = swap;
-        }
-    } while (0 != iSeedIndex--);
-#if TRACE
-     for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
-         Log(&rLog, LOG_FORCED_DEBUG, "restored seq no %d:  seq = %s %s",
-                   iSeqIndex, prMSeq->sqinfo[iSeqIndex].name, prMSeq->seq[iSeqIndex]);
-     }
-#endif
-
-#if TRACE
-    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
-        printf("DEBUG: seq %-4u as distance vector =", iSeqIndex);
-        for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
-            printf(" %f", ppdSeqVec[iSeqIndex][iSeedIndex]);
-        }
-        printf("\n");
-    }
-#endif
-
-    
-    FreeSymMatrix(&prDistmat);
-    CKFREE(restore);
-#if TIMING
-    StopwatchStop(stopwatch);
-    StopwatchDisplay(stdout, "Total time for SeqToVec(): ", stopwatch);
-    StopwatchFree(stopwatch);
-#endif
-
-
-    return 0;
-}
-/***   end: SeqToVec()   ***/
-
-
-
-/**
- * @brief Select seeds to be used from an prMSeq
- *
- * @param[out] piSeeds
- * Will store the indices of prMSeqs seqs used to be as seeds here. Must be preallocated.
- * @param[in] iNumSeeds
- * Number of seeds to be picked
- * @param[in] iSelectionMethod
- * Seed selection method to be used
- * @param[in] prMSeq
- * The prMSeq structure to pick sequences from
- *
- * @return: Non-zero on failure
- *
- */    
-int
-SeedSelection(int *piSeeds, int iNumSeeds, int iSelectionMethod,  mseq_t *prMSeq)
-{
-    /* seed iterator */
-    int iSeedIndex;
-    /* sequence iterator */
-    int iSeqIndex;
-
-    if (SELECT_SEEDS_RANDOMLY == iSelectionMethod) {
-        int *piPermArray;
-
-        Log(&rLog, LOG_INFO,
-             "Using %d seeds (randomly chosen) for mBed (from a total of %d sequences)",
-             iNumSeeds, prMSeq->nseqs);
-
-        PermutationArray(&piPermArray, iNumSeeds);
-        /* copy to piSeeds */
-        for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
-            piSeeds[iSeedIndex] = piPermArray[iSeedIndex];
-        }
-        CKFREE(piPermArray);
-        
-    }  else if (SELECT_SEEDS_BY_LENGTH == iSelectionMethod) {
-
-        /* step size for picking with constant stride */
-        int iStep;
-        int *piSeqLen =  CKMALLOC(prMSeq->nseqs * sizeof(int));
-        int *piOrder = CKMALLOC(prMSeq->nseqs * sizeof(int));
-
-        Log(&rLog, LOG_INFO,
-             "Using %d seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of %d sequences)",
-             iNumSeeds, prMSeq->nseqs);
-
-        iStep = prMSeq->nseqs / iNumSeeds; /* iStep will never get too big
-                                            * due to rounding */
-        /* first get an array of seq indices order according to
-           corresponding sequence length: piOrder */
-        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
-            piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len;
-        }
-        QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, 'd', FALSE);
-#if 0
-        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
-            Log(&rLog, LOG_FORCED_DEBUG, "Descending order (no %d): seq %d has len %d)",
-                      iSeqIndex, piOrder[iSeqIndex], piSeqLen[piOrder[iSeqIndex]]);
-        }
-#endif
-        CKFREE(piSeqLen);
-        iSeedIndex = 0;
-        for (iSeqIndex=0; iSeedIndex<iNumSeeds; iSeqIndex+=iStep) {
-            piSeeds[iSeedIndex++] = piOrder[iSeqIndex];
-        }
-        CKFREE(piOrder);
-
-    } else {
-        
-        Log(&rLog, LOG_ERROR, "Internal error: unknown seed selection type");
-        return -1;
-    }
-
-
-#ifdef PICK_SEEDS_FROM_KMEANS_CLUSTERS
-    /* do initial mbedding and kmeans. then pick seeds from each cluster and
-       do full mbed with these seeds. idea is that those seeds represent the
-       sequence space better than random seeds. however, this reduces the
-       quality slightly. random is almost always better.
-    */
-
-    if (1) {
-        /* seqs represented as distance vectors */
-        double **ppdSeqVec;
-        double dCost = -1.0;
-        /* assignments for each seq to the K newly created clusters */
-        int *piKMeansClusterAssignments = CKMALLOC(prMSeq->nseqs * sizeof(int));
-        double *pdKMeansClusterCenters = CKCALLOC(iNumSeeds * iNumSeeds, sizeof(double));
-        /* create a copy of ppdSeqVec suitable for KMeans */
-        double *pdKMeansVectors = CKMALLOC(prMSeq->nseqs * iNumSeeds * sizeof(double));
-        bool *pbClusterUsed = CKCALLOC(iNumSeeds, sizeof(bool));
-
-        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Experimental feature: K-means on first round embedding to get better seeds");
-        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Reuse seeds from clusters");
-        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Could try log(n) instead of log(n)^2 in first round");
-        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME hardcoded iPairDistType PAIRDIST_KTUPLE");
-        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Show that this is fast *and* good");
-
-        /*
-          Log(&rLog, LOG_FORCED_DEBUG, "%s", "Overriding iNumSeeds");
-          iNumSeeds = (int) pow(log2((double)prMSeq->nseqs), 2);
-        */
-
-        ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
-        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
-            ppdSeqVec[iSeqIndex] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
-        }
-        if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, PAIRDIST_KTUPLE)) {
-            Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
-            return -1;
-        }
-
-        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
-            int iDim;
-            for (iDim=0; iDim<iNumSeeds; ++iDim) {
-                pdKMeansVectors[iDim*iSeqIndex + iDim] = ppdSeqVec[iSeqIndex][iDim];
-            }
-        }
-
-
-        Log(&rLog, LOG_FORCED_DEBUG, "%s\n", "FIXME hardcoded RESTARTS_PER_SPLIT");
-        dCost = KMeans(prMSeq->nseqs, iNumSeeds, iNumSeeds,
-                       pdKMeansVectors,
-                       RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
-                       pdKMeansClusterCenters, piKMeansClusterAssignments);
-        Log(&rLog, LOG_FORCED_DEBUG, "Best split cost = %f", dCost);
-        
-        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Check for Nan in cluster centers");
-
-#if TRACE
-        Log(&rLog, LOG_FORCED_DEBUG, "%s", "K-Means output:");
-        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
-            Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u (%s) to cluster %d",
-                      iSeqIndex,  prMSeq->sqinfo[iSeqIndex].name,
-                      piKMeansClusterAssignments[iSeqIndex]);
-        }
-#endif
-
-        Log(&rLog, LOG_FORCED_DEBUG, "FIXME %s", "proof of concept implementation: Pick first sequences from each clusters instead of reusing seeds");
-        iSeedIndex = 0;
-        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
-            int iAssignedCluster = piKMeansClusterAssignments[iSeqIndex];
-            if (pbClusterUsed[iAssignedCluster]) {
-                continue;
-            } else {
-                /*LOG_DEBUG("Picked seed %d from cluster %d", iSeqIndex,iAssignedCluster);*/
-                piSeeds[iSeedIndex++] = iSeqIndex;
-                pbClusterUsed[iAssignedCluster] = TRUE;
-            }
-        }
-        CKFREE(pbClusterUsed);
-
-        CKFREE(pdKMeansVectors);
-        CKFREE(pdKMeansClusterCenters);
-        CKFREE(piKMeansClusterAssignments);
-        
-    }
-    /* if (1) */
-#endif
-
-
-    if (LOG_DEBUG <= rLog.iLogLevelEnabled) {
-        for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
-            Log(&rLog, LOG_DEBUG, "Picked sequence %d (%s) as seed no %d",
-                 piSeeds[iSeedIndex], prMSeq->sqinfo[piSeeds[iSeedIndex]].name, iSeedIndex);
-        }
-    }
-    
-    return 0; 
-}
-/* end of SeedSelection() */
-
-
-
-
-/**
- * @brief Bisecting K-Means clustering. Repeatedly calls K-Means with
- * a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster.
- *
- * @param[out] prKMeansResult_p
- * Result of Bisecting KMeans. Will be allocated here.
- * Caller has to free. See @see FreeKMeansResult()
- * @param[in] iNObjs
- * Number of objects/sequences to cluster
- * @param[in] iDim
- * Dimensionality of input data
- * @param[in] ppdVectors
- * each row holds iDim points for this object's coordinates
- * @param[in] iMinRequiredObjsPerCluster
- * Minimum number of objects per Cluster (inclusive)/
- * @param[in] iMaxAllowedObjsPerCluster
- * Maximum number of objects per Cluster (inclusive). Function returns once no
- * cluster contains more then this number of objects. Soft limit!
- *
- * @note Convoluted code. Could use some restructuring. My apologies.
- * AW
- *
- */
-void
-BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
-                const int iNObjs, const int iDim, double **ppdVectors,
-                const int iMinRequiredObjsPerCluster,
-                const int iMaxAllowedObjsPerCluster)
-{
-    int iN, iD;
-    /* cluster centers for each cluster created at each split */
-    double *pdKClusterCenters;
-    /* keep track of updated object indices per newly created
-     * cluster */
-    int *piCurObjToUpdate;
-    /* number of times we called 2-means */
-    int iNRounds;
-    /* flag for unsuccessful cluster split */
-    bool bNaNDetected = FALSE;
-    /* flag for detected small cluster after split  */
-    bool bSmallClusterDetected;
-    /* queue of clusters which are to be split */
-    int_queue_t rClusterSplitQueue;
-    
-#if TIMING
-    Stopwatch_t *stopwatch = StopwatchCreate();
-#endif
-
-
-    piCurObjToUpdate = (int *) CKMALLOC(2 * sizeof(int));
-
-    NewKMeansResult(prKMeansResult_p);
-
-    /* new cluster centers created at each split/KMeans run
-     */
-    pdKClusterCenters = (double *) CKCALLOC(2 * iDim, sizeof(double));
-
-    /* init results by setting a first cluster that contains all objects
-     */
-    (*prKMeansResult_p)->iNClusters = 1;
-    (*prKMeansResult_p)->iDim = iDim;
-    /* fake center coordinates of first cluster */
-    (*prKMeansResult_p)->ppdClusterCenters =
-        (double **) CKMALLOC(1 * sizeof(double *));
-    (*prKMeansResult_p)->ppdClusterCenters[0] =
-        (double *) CKMALLOC(iDim * sizeof(double));
-    /* objects per cluster */
-    (*prKMeansResult_p)->piNObjsPerCluster =
-        (int *) CKMALLOC(1 * sizeof(int));
-    (*prKMeansResult_p)->piNObjsPerCluster[0] = iNObjs;
-    /* object indices per cluster */
-    (*prKMeansResult_p)->ppiObjIndicesPerCluster =
-        (int **) CKMALLOC(1 * sizeof(int *));
-    (*prKMeansResult_p)->ppiObjIndicesPerCluster[0] = (int *)
-        CKMALLOC(iNObjs * sizeof(int));
-    for (iN=0; iN<iNObjs; iN++) {
-        (*prKMeansResult_p)->ppiObjIndicesPerCluster[0][iN] = iN;
-    }
-
-
-
-    /* Starting with the first cluster that now contains all the
-     * sequences keep splitting until no cluster contains more than
-     * iMaxAllowedObjsPerCluster
-     *
-     * Keep a queue of clusters (rClusterSplitQueue) to split
-     *
-     * At each split values/memory of the just split cluster will be
-     * reused and exactly one new only allocated.
-     *
-     * Example:
-     * Given the following cluster assignments
-     * 0 0 0 0 1 1 2 2 2 3 3 3 3 3 3 3 4 4
-     * and a K-Means split in cluster 3 at |:
-     * 0 0 0 0 1 1 2 2 2 3 3 3 | 3 3 3 3 4 4
-     * The result should be this:
-     * 0 0 0 0 1 1 2 2 2 3 3 3 | 5 5 5 5 4 4
-     *
-     *
-     */
-    INT_QUEUE_INIT(&rClusterSplitQueue);
-    if (iNObjs>iMaxAllowedObjsPerCluster) {
-        /* pish fake first cluster index */
-        INT_QUEUE_PUSH(&rClusterSplitQueue, 0); 
-    }
-    iNRounds = 0;
-    while (! INT_QUEUE_EMPTY(&rClusterSplitQueue)) {
-        /* assignments for each seq to the K newly created clusters */
-        int *piKClusterAssignments;
-        /* number of objects in cluster that is to be split */
-        int iNObjsInClusterToSplit;
-        /* coordinates of points in cluster that is to be split
-         * array of size n*d where [d*i + j] gives coordinate j of point i
-         */
-        double *pdVectorsInClusterToSplit;
-        /* copy of object indices in split cluster */
-        int *piObjIndicesOfSplitCluster;
-        /* best cost of kmeans rounds */
-        double dCost = -1.0;
-
-        /*  indices for the two created clusters
-         */
-        /* index of cluster to split */
-        int iClusterToSplot;
-        /* index of newly created cluster at each round. data for the
-           other created cluster goes to the one just split,
-           i.e. (*piClusterToSplot) */
-        int iNewClusterIdx;
-
-#if TIMING
-        StopwatchZero(stopwatch);
-        StopwatchStart(stopwatch);
-#endif
-
-        INT_QUEUE_POP(&rClusterSplitQueue, &iClusterToSplot);
-        
-        iNObjsInClusterToSplit = (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot];
-        piKClusterAssignments = (int *)
-            CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
-
-        pdVectorsInClusterToSplit = (double *)
-            CKMALLOC(iNObjsInClusterToSplit * iDim * sizeof(double));
-        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
-            for (iD=0; iD<iDim; ++iD) {
-                int iThisObjIdx =
-                    (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN];
-                pdVectorsInClusterToSplit[iDim*iN + iD] = ppdVectors[iThisObjIdx][iD];
-            }
-        }
-
-#if TRACE
-        Log(&rLog, LOG_FOCRED_DEBUG, "Round %d: Will split cluster %d which has %d objects",
-                  iNRounds, iClusterToSplot, iNObjsInClusterToSplit);
-        fprintf(stderr, "DEBUG(%s|%s():%d): Object indices in cluster to split are:",
-                __FILE__, __FUNCTION__, __LINE__);
-        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
-            fprintf(stderr, " %u",
-                    (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
-        }
-        fprintf(stderr, "\n");
-        (void) fflush(stderr);
-#endif
-
-#if TRACE
-        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
-            fprintf(stderr,
-                    "DEBUG(%s|%s():%d): Coordinate of object %u (real index %u) in cluster to split:",
-                    __FILE__, __FUNCTION__, __LINE__, iN,
-                    (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
-            for (iD=0; iD<iDim; iD++) {
-                fprintf(stderr, " %f", pdVectorsInClusterToSplit[iDim*iN + iD]);
-            }
-            fprintf(stderr, "\n");
-            (void) fflush(stderr);
-        }
-#endif
-        
-       /* KMeans(1 "The number of points in the data set",
-        *        2 "The number of clusters to look for",
-        *        3 "The number of dimensions that the data set lives in",
-        *        4 "points: An array of size n*d where points[d*i + j] gives coordinate j of point i",
-        *        5 "attempts: The number of times to independently run k-means",
-        *        6 "use_lloyds_method: uses kmpp if false, otherwise lloyds method",
-        *        7 "centers: This can either be null or an array of size k*d.
-        *           In the latter case, centers[d*i + j] will give coordinate j of center i.
-        *           If the cluster is unused, it will contain NaN instead.",
-        *        8 "assignments: This can either be null or an array of size n.
-        *           In the latter case, it will be filled with the cluster that each point is assigned to
-        *           (an integer between 0 and k-1 inclusive).");
-        */
-        dCost = KMeans(iNObjsInClusterToSplit, 2, iDim,
-                       pdVectorsInClusterToSplit,
-                       RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
-                       pdKClusterCenters, piKClusterAssignments);
-
-#if TRACE
-        Log(&rLog, LOG_FORCED_DEBUG, "%s", "Raw K-Means output:");
-        for (iN=0; iN<2; iN++) {
-            fprintf(stderr, "DEBUG(%s|%s():%d): Cluster Center %u =",
-                    __FILE__, __FUNCTION__, __LINE__, iN);
-            for (iD=0; iD<iDim; iD++) {
-                fprintf(stderr, " %f", pdKClusterCenters[iN*iDim+iD]);
-            }
-            fprintf(stderr, "\n");
-            (void) fflush(stderr);
-        }
-        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
-            Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u to cluster %d (of #%u)",
-                      iN,  piKClusterAssignments[iN], 2);
-        }
-#endif
-        
-        /* real index of one of the newly created clusters. the other
-         * one is iClusterToSplot */
-        iNewClusterIdx = (*prKMeansResult_p)->iNClusters;
-
-        
-        /* We don't want Clusters which are too small. Check here if a
-         * split created a small cluster and if yes, discard the
-         * solution. Because the cluster has already been removed from
-         * the queue, we can just continue.
-         */
-        bSmallClusterDetected = FALSE;
-        if (iMinRequiredObjsPerCluster>1) {
-            int iNObjsCluster[2];
-            iNObjsCluster[0] = 0; /* first cluster */
-            iNObjsCluster[1] = 0; /* second cluster */
-            for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
-                iNObjsCluster[piKClusterAssignments[iN]]+=1;
-            }
-            
-            if (iNObjsCluster[0]<iMinRequiredObjsPerCluster
-                ||
-                iNObjsCluster[1]<iMinRequiredObjsPerCluster) {
-                bSmallClusterDetected = TRUE;
-                Log(&rLog, LOG_FORCED_DEBUG, "Skipping this split because objs in 1st/2nd cluster = %d/%d < %d",
-                          iNObjsCluster[0], iNObjsCluster[1], iMinRequiredObjsPerCluster);
-            }
-        }
-        
-        /* Same logic as for small clusters applies if KMeans couldn't
-         * split the cluster. In this case one of its center
-         * coordinates will be NaN, which we check for in the
-         * following.
-         */
-        if (! bSmallClusterDetected) {
-            for (iN=0; iN<2; iN++) {
-                bNaNDetected = FALSE;
-                for (iD=0; iD<iDim; iD++) {
-                    if (0 != isnan(pdKClusterCenters[iN*iDim+iN])) {
-                        /* Got NaN as coordinate after splitting */
-                        Log(&rLog, LOG_WARN, "%s(): Can't split cluster no. %d which has %d objects any further. %s",
-                             __FUNCTION__,
-                             iClusterToSplot, iNObjsInClusterToSplit,
-                         "Hope it's not too big and doesn't slow things down."
-                            );
-                        bNaNDetected = TRUE;
-                        break;
-                    }
-                }
-                if (bNaNDetected) {
-                    break;
-                }
-            }
-        }
-        
-        /* Discarding split of this cluster. It has been removed from the
-         * queue already. so just continue
-         */
-        if (bNaNDetected || bSmallClusterDetected) {
-            CKFREE(piKClusterAssignments);
-            CKFREE(pdVectorsInClusterToSplit);
-            continue;
-        }
-
-
-        /* update cluster centers: pdClusterCenters
-         *
-         */
-        /* reuse memory of existing/old/split cluster
-         */
-        for (iN=0; iN<iDim; iN++) {
-            double dCoord = pdKClusterCenters[0*iDim+iN];
-            (*prKMeansResult_p)->ppdClusterCenters[iClusterToSplot][iN] = dCoord;
-        }
-        /* realloc and set new one
-         */
-        (*prKMeansResult_p)->ppdClusterCenters = (double **)
-            CKREALLOC((*prKMeansResult_p)->ppdClusterCenters,
-                      ((*prKMeansResult_p)->iNClusters+1)  * sizeof(double *));
-        (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx] = (double *)
-            CKMALLOC(iDim * sizeof(double));
-        for (iD=0; iD<iDim; iD++) {
-            double dCoord = pdKClusterCenters[1*iDim+iD];
-            (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx][iD] = dCoord;
-        }
-#if TRACE
-        Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of cluster centers done. Cluster centers so far:");
-        for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
-            fprintf(stderr, "DEBUG(%s|%s():%d): Center %u =",
-                    __FILE__, __FUNCTION__, __LINE__, iN);
-            for (iD=0; iD<iDim; iD++) {
-                fprintf(stderr, " %f", (*prKMeansResult_p)->ppdClusterCenters[iN][iD]);
-            }
-            fprintf(stderr, "\n");
-            (void) fflush(stderr);
-        }
-#endif
-
-
-        /* update #seqs per cluster: piNObjsPerCluster
-         *
-         */
-        (*prKMeansResult_p)->piNObjsPerCluster = (int *)
-            CKREALLOC((*prKMeansResult_p)->piNObjsPerCluster,
-                      ((*prKMeansResult_p)->iNClusters+1) * sizeof(int));
-        /* init new and old one to zero */
-        (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] = 0;
-        (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] = 0;
-        /* now update values */
-        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
-            if (0 == piKClusterAssignments[iN]) {
-                (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] += 1;
-            } else if (1 == piKClusterAssignments[iN]) {
-                (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] += 1;
-            } else {
-                /* there used to be code for iK>=2 in r101 */
-                Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
-                      piKClusterAssignments[iN]);
-            }
-        }
-#if TRACE
-        Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of NObjs per cluster done:");
-        for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
-            Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d contains %d sequences",
-                      iN, (*prKMeansResult_p)->piNObjsPerCluster[iN]);
-        }
-#endif
-        /* queue clusters if still they are still too big
-         */
-        if ((*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] > iMaxAllowedObjsPerCluster) {
-            INT_QUEUE_PUSH(&rClusterSplitQueue, iClusterToSplot);
-        }
-        if ((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] > iMaxAllowedObjsPerCluster) {
-            INT_QUEUE_PUSH(&rClusterSplitQueue, iNewClusterIdx);
-        }
-        
-        
-
-        /* update which objects belong to which cluster:
-         *
-         * note:  piNObjsPerCluster needs to be already updated
-         *
-         */
-        /* create a copy of the object indices in the split cluster
-         * (original will be overwritten)
-         */
-        piObjIndicesOfSplitCluster = (int *) CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
-        memcpy(piObjIndicesOfSplitCluster,
-               (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
-               iNObjsInClusterToSplit * sizeof(int));
-
-        (*prKMeansResult_p)->ppiObjIndicesPerCluster = (int **)
-            CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster,
-                      ((*prKMeansResult_p)->iNClusters+1) * sizeof(int *));
-
-
-        (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot] =
-            (int *) CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
-                              (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] * sizeof(int));
-
-        (*prKMeansResult_p)->ppiObjIndicesPerCluster[iNewClusterIdx] =
-            (int *) CKMALLOC((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] * sizeof(int));
-
-
-        /* now reassign the object indices to the assigned cluster
-         */
-        piCurObjToUpdate[0] = 0;
-        piCurObjToUpdate[1] = 0;
-        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
-            int iThisObjIdx = piObjIndicesOfSplitCluster[iN];
-            int iThisClusterAssignment =  piKClusterAssignments[iN];
-            int iThisOffset = piCurObjToUpdate[iThisClusterAssignment];
-            int iThisClusterIndex = 0;
-
-            if (0 == iThisClusterAssignment) {
-                iThisClusterIndex = iClusterToSplot;
-            } else if (1 == iThisClusterAssignment) {
-                iThisClusterIndex = iNewClusterIdx;
-            } else {
-                /* there used to be code for iK>=2 in r101 */
-                Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
-                      piKClusterAssignments[iN]);
-            }
-#if 0
-            Log(&rLog, LOG_FORCED_DEBUG, "Setting (*prKMeansResult_p)->ppiObjIndicesPerCluster[%d][%d] = %d",
-                      iThisClusterIndex, iThisOffset, iThisObjIdx);
-#endif
-            (*prKMeansResult_p)->ppiObjIndicesPerCluster[iThisClusterIndex][iThisOffset] = iThisObjIdx;
-            piCurObjToUpdate[iThisClusterAssignment]+=1;
-        }
-        CKFREE(piObjIndicesOfSplitCluster);
-#if TRACE
-        for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
-            int iObj;
-            fprintf(stderr, "DEBUG(%s|%s():%d): Objects in cluster %u: ",
-                    __FILE__, __FUNCTION__, __LINE__, iN);
-            for (iObj=0; iObj<(*prKMeansResult_p)->piNObjsPerCluster[iN]; iObj++) {
-                fprintf(stderr, " %u", (*prKMeansResult_p)->ppiObjIndicesPerCluster[iN][iObj]);
-            }
-            fprintf(stderr, "\n");
-            (void) fflush(stderr);
-        }
-#endif
-
-
-#if TIMING
-        StopwatchStop(stopwatch);
-        StopwatchDisplay(stdout, "Total time after next round in Bisecting-KMeans: ", stopwatch);
-#endif
-
-
-        /* finally: increase number of clusters
-         */
-        (*prKMeansResult_p)->iNClusters += 1;
-        iNRounds += 1;
-        CKFREE(piKClusterAssignments);
-        CKFREE(pdVectorsInClusterToSplit);
-
-    } /* while */
-    INT_QUEUE_DESTROY(&rClusterSplitQueue);
-
-
-    Log(&rLog, LOG_DEBUG,
-         "Bisecting K-means finished after %d rounds (no more clusters to split)",
-         iNRounds);
-         
-#if TIMING
-    StopwatchFree(stopwatch);
-#endif
-
-    /* @note could use progress/timer */
-
-    CKFREE(pdKClusterCenters);
-    CKFREE(piCurObjToUpdate);
-
-    return;
-}
-/***   end: BisectingKmeans()   ***/
-
-
-
-/**
- *
- * @brief From scratch reimplementation of mBed: Blackshields et al.
- * (2010); PMID 20470396.
- *
- * Idea is a follows:
- * - convert sequences into vectors of distances
- * - cluster the vectors using k-means
- * - cluster each of the k clusters using upgma (used cached distances
- *    from above?)
- * - join the sub-clusters to create on tree (use UPGMA on k-means
- *   medoids)
- *
- *
- * @param[out] prMbedTree_p
- * Created upgma tree. will be allocated here. use FreeMuscleTree()
- * to free
- * @param[in] prMSeq
- * Multiple sequences
- * @param[in] iPairDistType
- * Distance measure for pairwise alignments
- * @param[in] pcGuidetreeOut
- * Passed down to GuideTreeUpgma()
- *
- * @return Zero on success, non-zero on error
- *
- */
-int
-Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
-     const char *pcGuidetreeOut)
-{
-    /* number of seeds */
-    int iNumSeeds;
-    /* seed indices matching prMSeq */
-    int *piSeeds; 
-    /* seqs represented as distance vectors */
-    double **ppdSeqVec;
-    /* kmeans result */
-    bisecting_kmeans_result_t *prKMeansResult = NULL;
-    /* distance matrix of kmeans (pre-)cluster centers */
-    symmatrix_t *prPreClusterDistmat = NULL;
-    /* auxiliary for symmetric matrix output; tree routines etc */
-    char **ppcLabels;
-    int iNodeIndex;
-    /* mapping of cluster-center tree node indices to corresponding
-       cluster */
-    int *piClusterToTreeNode;
-    int iClusterIndex;
-    int iI, iJ;
-    FILE *pfOut;
-    progress_t *prSubClusterDistanceProgress;
-    bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
-
-#if FULL_WITHIN_CLUSTER_DISTANCES
-    Log(&rLog, LOG_DEBUG, "Computing real distances within subclusters for mBed.");
-#else
-    Log(&rLog, LOG_DEBUG, "Computing vector distances within subclusters for mBed.");
-#endif
-    
-#if MBED_TIMING
-    Stopwatch_t *stopwatch = StopwatchCreate();
-#endif
-
-    assert(NULL != prMbedTree_p);
-    assert(NULL != prMSeq);
-
-
-    iNumSeeds = (int) NUMBER_OF_SEEDS(prMSeq->nseqs);
-    if (iNumSeeds >= prMSeq->nseqs) {
-        /* -1 is condition for RandomUniqueIntArray */
-        iNumSeeds = prMSeq->nseqs-1; 
-        Log(&rLog, LOG_DEBUG,
-             "Automatically determined number of seeds is bigger (or equal) the number of sequences. Will set it to %d",
-             iNumSeeds);
-    }
-
-       
-    /* Turn sequences into vectors of distances to the seeds
-     *
-     */
-    piSeeds = (int *) CKMALLOC(iNumSeeds * sizeof(int));
-    if (0 != SeedSelection(piSeeds, iNumSeeds, SEED_SELECTION, prMSeq)) {
-        Log(&rLog, LOG_ERROR, "Something went wrong during seed selection for mbed");
-        return -1;
-    }
-    ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
-    for (iI=0; iI<prMSeq->nseqs; iI++) {
-        ppdSeqVec[iI] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
-    }
-    if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, iPairDistType)) {
-        Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
-        return -1;
-    }
-    CKFREE(piSeeds);
-
-    
-    /* Calculate (pre-)clusters of sequence vectors by applying
-     * bisecting kmeans
-     *
-     */
-#if MBED_TIMING
-    /* start clock only here, to make sure we don't include pairwise
-     * distance computation */
-    StopwatchZero(stopwatch);
-    StopwatchStart(stopwatch);
-#endif
-
-    BisectingKmeans(&prKMeansResult, prMSeq->nseqs, iNumSeeds, ppdSeqVec,
-                    MIN_REQUIRED_SEQ_PER_PRECLUSTER,
-                    MAX_ALLOWED_SEQ_PER_PRECLUSTER);
-    Log(&rLog, LOG_INFO,
-         "mBed created %u cluster/s (with a minimum of %d and a soft maximum of %d sequences each)",
-         prKMeansResult->iNClusters,
-         MIN_REQUIRED_SEQ_PER_PRECLUSTER,
-         MAX_ALLOWED_SEQ_PER_PRECLUSTER);
-
-
-#if PRINT_CLUSTER_DISTRIBUTION
-    Log(&rLog, LOG_FORCED_DEBUG, "Bisecting Kmeans returned %d clusters", prKMeansResult->iNClusters);
-    for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
-#if TRACE
-        int iD;
-        Log(&rLog, LOG_FORCED_DEBUG, "Diagnostic output for cluster %d follows:", iI);
-        fprintf(stderr, "DEBUG(%s|%s():%d): center coordinates =",
-                __FILE__, __FUNCTION__, __LINE__);
-        for (iD=0; iD<iNumSeeds; iD++) {
-            fprintf(stderr, " %f", prKMeansResult->ppdClusterCenters[iI][iD]);
-        }
-        fprintf(stderr, "\n");
-        fflush(stderr);
-#endif
-        Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d has %d objects assigned",
-                  iI, prKMeansResult->piNObjsPerCluster[iI]);
-#if TRACE
-        for (iJ=0; iJ<prKMeansResult->piNObjsPerCluster[iI]; iJ++) {
-            int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][iJ];
-            
-            Log(&rLog, LOG_FORCED_DEBUG, "Cluster %u: object %u has index %u (= seq %s)",
-                      iI, iJ,  iRealIndex, prMSeq->sqinfo[iRealIndex].name);
-        }
-#endif
-    }
-#endif
-
-    
-    /* Cluster pre-clusters produced by k-means.
-     *
-     * Do this by calculating the vector distances of the cluster
-     * centers and applying UPGMA.
-     *
-     * @note could try to force-balance the tree here
-     *
-     */
-    if (0 != NewSymMatrix(&prPreClusterDistmat,
-                     prKMeansResult->iNClusters, prKMeansResult->iNClusters)) {
-        Log(&rLog, LOG_FATAL, "%s", "Memory allocation for pre-cluster distance-matrix failed");
-    }
-    for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
-        for (iJ=iI+1; iJ<prKMeansResult->iNClusters; iJ++) {
-            double dDist;
-            dDist = EuclDist(prKMeansResult->ppdClusterCenters[iI],
-                             prKMeansResult->ppdClusterCenters[iJ],
-                             iNumSeeds);
-            SymMatrixSetValue(prPreClusterDistmat, iI, iJ, dDist);
-            /* Log(&rLog, LOG_FORCED_DEBUG, "Euclidean distance between clusters %d and %d = %f",
-               iI, iJ, dDist); */
-        }
-    }
-    
-
-    /* labels needed for the guide tree building routine only */
-    ppcLabels = (char **) CKMALLOC(prKMeansResult->iNClusters * sizeof(char*));
-    for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
-        ppcLabels[iI] = (char *) CKMALLOC(32 * sizeof(char));
-        (void) snprintf(ppcLabels[iI], 32, "Subcluster-%u", iI);
-    }
-#if TRACE
-    Log(&rLog, LOG_FORCED_DEBUG, "%s", "Distance matrix for pre-cluster centers:");
-    SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL);
-#endif
-
-    GuideTreeUpgma(prMbedTree_p,
-                   ppcLabels, prPreClusterDistmat, NULL);
-
-    for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
-        CKFREE(ppcLabels[iI]);
-    }
-    CKFREE(ppcLabels);
-#if TRACE
-    Log(&rLog, LOG_FORCED_DEBUG, "%s", "Cluster-center guide-tree:");
-    LogTree(*prMbedTree_p);
-#endif
-
-
-
-    /* Now replace each leaf in the pre-cluster-center tree
-     * appropriately, i.e. with the corresponding sub-cluster.
-     *
-     * For each leaf/sub-cluster, create a distance matrix for the
-     * corresponding sequences. Use distances between vectors as
-     * approximated distances between sequences. Then create a
-     * guide-tree by applying UPGMA.
-     *
-     */
-
-    /* Get a mapping of (pre)cluster number and leaf-node indices in
-     * the cluster-center tree. We can add trees to prMbedTree_p
-     * because AppendTrees() guarantees that no other than the node to
-     * append to changes.
-     */
-    piClusterToTreeNode = (int*)
-        CKMALLOC(prKMeansResult->iNClusters * sizeof(int));
-    iNodeIndex = FirstDepthFirstNode(*prMbedTree_p);
-    do {
-        if (IsLeaf(iNodeIndex, *prMbedTree_p)) {
-            int iLeafId = GetLeafId(iNodeIndex, *prMbedTree_p);
-            piClusterToTreeNode[iLeafId] = iNodeIndex;
-        }
-        iNodeIndex =  NextDepthFirstNode(iNodeIndex, *prMbedTree_p);
-    } while (NULL_NEIGHBOR != iNodeIndex);
-
-
-
-
-    /* Now step through all the leafs and replace them with the
-     * corresponding sub-trees
-     */
-    NewProgress(&prSubClusterDistanceProgress, LogGetFP(&rLog, LOG_INFO),
-                "Distance calculation within sub-clusters", bPrintCR);
-    /* for each cluster */
-    for (iClusterIndex=0;
-         iClusterIndex < prKMeansResult->iNClusters; iClusterIndex++) {
-        /* distance matrix for the sub-cluster */
-        symmatrix_t *prWithinClusterDistances = NULL;
-        int iNSeqInCluster;
-        tree_t *prSubClusterTree = NULL;
-        
-        ProgressLog(prSubClusterDistanceProgress,
-                    iClusterIndex, prKMeansResult->iNClusters, FALSE);
-
-#if FULL_WITHIN_CLUSTER_DISTANCES
-        mseq_t *prSubClusterMSeq;
-        int iPairDistType;
-        int iSeqIndex;
-        int iOldLogLevel;
-
-        Log(&rLog, LOG_DEBUG, 
-            "%s\n", "Calling new Mbed use makes only sense if nseq>MAX_ALLOWED_SEQ_PER_PRECLUSTER");
-
-        if (TRUE == prMSeq->aligned)  {
-            iPairDistType = PAIRDIST_SQUIDID_KIMURA;
-        } else {
-            iPairDistType = PAIRDIST_KTUPLE;
-        }
-#endif
-            
-        iNSeqInCluster = prKMeansResult->piNObjsPerCluster[iClusterIndex];
-#if TRACE
-        Log(&rLog, LOG_FORCED_DEBUG, "#seqs in subcluster no %d = %d",
-                  iClusterIndex, iNSeqInCluster);
-#endif
-
-#if FULL_WITHIN_CLUSTER_DISTANCES
-        /* create an mseq structure for sequences in this cluster
-         * don't need most of the members (e.g. orig_seq) but copy
-         * them anyway for the sake of completeness
-         */
-        NewMSeq(&prSubClusterMSeq);
-    
-        prSubClusterMSeq->nseqs = iNSeqInCluster;
-        prSubClusterMSeq->seqtype = prMSeq->seqtype;
-        if (NULL!=prMSeq->filename) {
-            prSubClusterMSeq->filename = CkStrdup(prMSeq->filename);
-        }
-        prSubClusterMSeq->aligned = prMSeq->aligned; /* FS, r252 */
-        prSubClusterMSeq->seq =  (char **)
-            CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
-        prSubClusterMSeq->orig_seq =  (char **)
-            CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
-        prSubClusterMSeq->sqinfo =  (SQINFO *)
-            CKMALLOC(prSubClusterMSeq->nseqs * sizeof(SQINFO));
-        
-        for (iSeqIndex=0; iSeqIndex<iNSeqInCluster; iSeqIndex++) {
-            int iRealSeqIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iSeqIndex];
-            prSubClusterMSeq->seq[iSeqIndex] = CkStrdup(prMSeq->seq[iRealSeqIndex]);
-            prSubClusterMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeq->orig_seq[iRealSeqIndex]);
-            SeqinfoCopy(&prSubClusterMSeq->sqinfo[iSeqIndex], &prMSeq->sqinfo[iRealSeqIndex]);
-#if TRACE
-            Log(&rLog, LOG_DEBUG, "seq no %d in cluster %d is %s (real index = %d)",
-                iSeqIndex, iClusterIndex, prSubClusterMSeq->sqinfo[iSeqIndex].name,
-                iRealSeqIndex);
-#endif
-        }
-#endif
-
-        
-        /* Create a distance matrix for this sub-cluster
-         * (prWithinClusterDistances) by using the vector distances or
-         * ktuple distances.
-         *
-         * Then apply UPGMA to get a subcluster tree
-         * (prSubClusterTree) and append created tree to the
-         * pre-cluster-tree (prMbedTree_p)
-         *
-         */
-#if FULL_WITHIN_CLUSTER_DISTANCES
-        
-        iOldLogLevel = rLog.iLogLevelEnabled;
-        rLog.iLogLevelEnabled = LOG_WARN;
-        /* compute distances, but be quiet */
-        if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType,
-                          0, prSubClusterMSeq->nseqs, 0, prSubClusterMSeq->nseqs,
-                          NULL, NULL)) {
-            Log(&rLog, LOG_ERROR, "Couldn't compute pair distances");
-            return -1;
-        }
-        rLog.iLogLevelEnabled = iOldLogLevel;
-
-#if COMPUTE_WITHIN_SUBCLUSTER_AVERAGE
-        {
-            double dSum = 0.0;
-            for (iI=0; iI<prSubClusterMSeq->nseqs; iI++) {
-                for (iJ=iI+1; iJ<prSubClusterMSeq->nseqs; iJ++) {
-                    dSum += SymMatrixGetValue(prWithinClusterDistances, iI, iJ);
-                }
-            }
-            Log(&rLog, LOG_FORCED_DEBUG,
-                "mean pair-wise distance within subcluster %d of %d = %f", 
-                iClusterIndex, prKMeansResult->iNClusters, dSum/prSubClusterMSeq->nseqs);
-        }
-#endif
-
-#else
-        if (NewSymMatrix(&prWithinClusterDistances, iNSeqInCluster, iNSeqInCluster)!=0) {
-            Log(&rLog, LOG_FATAL, "%s", "Memory allocation for disparity matrix failed");
-        }
-        for (iI=0; iI<iNSeqInCluster; iI++) {
-            int iRealIndexI = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
-
-            for (iJ=iI+1; iJ<iNSeqInCluster; iJ++) {
-                int iRealIndexJ = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iJ];
-                double dist;
-
-                /* Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d: compute distance between %d:%s and %d:%s",
-                   iClusterIndex, i, prMSeq->sqinfo[iRealIndexI].name,
-                   iJ, prMSeq->sqinfo[iRealIndexJ].name); */
-
-                if (1 == USE_EUCLIDEAN_DISTANCE) {
-                    dist = EuclDist(ppdSeqVec[iRealIndexI],
-                                    ppdSeqVec[iRealIndexJ], iNumSeeds);
-                } else {
-                    dist = CosDist(ppdSeqVec[iRealIndexI],
-                                   ppdSeqVec[iRealIndexJ], iNumSeeds);
-                }
-                SymMatrixSetValue(prWithinClusterDistances, iI, iJ, dist);
-            }
-        }
-#endif
-
-        /* labels needed for the guide tree building routine only */
-        ppcLabels = (char**) CKMALLOC(iNSeqInCluster * sizeof(char*));
-        for (iI=0; iI<iNSeqInCluster; iI++) {
-#if FULL_WITHIN_CLUSTER_DISTANCES
-            ppcLabels[iI] = prSubClusterMSeq->sqinfo[iI].name;
-#else       
-            int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
-            ppcLabels[iI] = prMSeq->sqinfo[iRealIndex].name;
-#endif
-        }        
-#if TRACE
-        Log(&rLog, LOG_FORCED_DEBUG, "Distance matrix for seqs within sub cluster %d/%d",
-                  iClusterIndex, prKMeansResult->iNClusters);
-        SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL);
-#endif
-
-        
-        GuideTreeUpgma(&prSubClusterTree, ppcLabels,
-                       prWithinClusterDistances, NULL);
-
-        CKFREE(ppcLabels); /* don't free members, they just point */
-#if 0
-        Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d guide-tree:", iClusterIndex);
-        LogTree(prSubClusterTree);
-#endif
-
-        
-        /* The guide tree id's (that point to the sequences) now start
-         * from 0, i.e. the association with the prMSeq numbering is
-         * broken and fixed in the following
-         */
-        for (iNodeIndex = 0; iNodeIndex < (int)GetNodeCount(prSubClusterTree); iNodeIndex++) {
-            if (IsLeaf(iNodeIndex, prSubClusterTree)) {
-                int iLeafId = GetLeafId(iNodeIndex, prSubClusterTree);
-                int iRealId = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iLeafId];
-#if 0
-                Log(&rLog, LOG_FORCED_DEBUG, "Correcting leaf node %d which has (wrong) id %d and name %s to id %d (prMSeq name %s)",
-                          iNodeIndex, iLeafId,
-                          GetLeafName(iNodeIndex, prSubClusterTree),
-                          iRealId, prMSeq->sqinfo[iRealId].name);
-#endif
-                SetLeafId(prSubClusterTree, iNodeIndex, iRealId);
-            }
-        }
-
-
-        /* Append the newly created tree (prSubClusterTree) to the
-         * corresponding node index of prMbedTree_p.
-         */
-#if TRACE
-        Log(&rLog, LOG_FORCED_DEBUG, "Will join trees at leaf node %d = %s",
-                  piClusterToTreeNode[iClusterIndex],
-                  GetLeafName(piClusterToTreeNode[iClusterIndex], *prMbedTree_p));
-#endif
-
-        AppendTree(*prMbedTree_p,
-                   piClusterToTreeNode[iClusterIndex],
-                   prSubClusterTree);
-        /* Note: piClusterToTreeNode is still valid, because
-         * AppendTrees() guarantees that no other than the node to
-         * append to changes. */
-
-#if 0
-        Log(&rLog, LOG_FORCED_DEBUG, "%s", "prMbedTree_p after cluster %d has appended:", iClusterIndex);
-        LogTree(*prMbedTree_p);
-
-        if (0) {
-            char fname[] = "mbed-joined-tree.dnd";
-            FILE *pfOut;
-            if (NULL == (pfOut = fopen(fname, "w"))) {
-                Log(&rLog, LOG_FATAL, "Couldn't open %s for writing", fname);
-            }
-            MuscleTreeToFile(pfOut, *prMbedTree_p);
-            Log(&rLog, LOG_FORCED_DEBUG, "Joined tree written to %s", fname);
-            fclose(pfOut);
-            Log(&rLog, LOG_FATAL, "DEBUG EXIT");
-        }
-#endif
-
-        /* cleanup
-         */
-        FreeMuscleTree(prSubClusterTree);
-        FreeSymMatrix(&prWithinClusterDistances);
-#if FULL_WITHIN_CLUSTER_DISTANCES
-        FreeMSeq(&prSubClusterMSeq);
-#endif
-    } /* end for each cluster */
-    ProgressDone(prSubClusterDistanceProgress);
-    FreeProgress(&prSubClusterDistanceProgress);
-
-    
-    if (NULL != pcGuidetreeOut) {
-        if (NULL == (pfOut = fopen(pcGuidetreeOut, "w"))) {
-            Log(&rLog, LOG_ERROR, "Couldn't open %s for writing", pcGuidetreeOut);
-        } else {
-            MuscleTreeToFile(pfOut, *prMbedTree_p);
-            Log(&rLog, LOG_INFO, "Guide tree written to %s", pcGuidetreeOut);
-            (void) fclose(pfOut);
-        }
-    }
-
-    
-    /* cleanup
-     *
-     */
-#if MBED_TIMING
-    StopwatchStop(stopwatch);
-    StopwatchDisplay(stdout, "mBed time (without pairwise distance computation): ", stopwatch);
-    StopwatchFree(stopwatch);
-#endif
-
-    FreeKMeansResult(&prKMeansResult);
-    FreeSymMatrix(&prPreClusterDistmat);
-    for (iI=0; iI<prMSeq->nseqs; iI++) {
-        CKFREE(ppdSeqVec[iI]);
-    }
-    CKFREE(ppdSeqVec);
-    CKFREE(piClusterToTreeNode);
-
-#ifndef NDEBUG
-    TreeValidate(*prMbedTree_p);
-#endif
-    
-    return 0;
-}
-/***   end: Mbed()   ***/