/* -*- mode: c; tab-width: 4; c-basic-offset: 4;  indent-tabs-mode: nil -*- */

/*********************************************************************
 * Clustal Omega - Multiple sequence alignment
 *
 * Copyright (C) 2010 University College Dublin
 *
 * Clustal-Omega is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This file is part of Clustal-Omega.
 *
 ********************************************************************/

/*
 *  RCS $Id: mbed.c 254 2011-06-21 13:07:50Z andreas $
 *
 *
 * Reimplementation from scratch of mBed:
 * Blackshields et al. (2010); PMID 20470396
 *
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <assert.h>
#include <float.h>

#include "mbed.h"
#include "pair_dist.h"
#include "symmatrix.h"
#include "ktuple_pair.h"
#include "tree.h"
#include "util.h"
#include "progress.h"
#include "queue.h"

#include "log.h"
#include "kmpp/KMeans.h"
#include "mbed.h"

#define TIMING 0
#if TIMING
#include "squid/stopwatch.h"
#endif


/* If FULL_WITHIN_CLUSTER_DISTANCES is not 0, distances within each
 * bisecting kmeans subcluster are not estimated using the vectors,
 * but calculated normally (using ktuple or kimura). Surprisingly this
 * results in 3% loss on a Homfam p24-h2010-08-09 subset (100-5000
 * seqs in test, at least 5 ref seqs; MAX_SEQ 100 vs 10000; NUM_SEEDS
 * log2 instead of log2^2). And of course it slows things down.
 */
#define FULL_WITHIN_CLUSTER_DISTANCES 1

#define COMPUTE_WITHIN_SUBCLUSTER_AVERAGE 0

/* Cluster size limits. Maximum is a soft limit, which might be
 * exceeded if a K-Means split was unsuccesful, where unsuccesful
 * might also mean that the minimum required number seqs. was not
 * reached */
#if FULL_WITHIN_CLUSTER_DISTANCES
static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 100;
static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 1;
#else
static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 10000;
static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 100;
#endif

/* How many restarts per bisecting kmeans split. 10 give 0.5% increase
 * in quality on original HOMFAM over just 1. It also increases kmeans
 * time by a factor of ~3, but overall time is insignificant
 * compared to pairdist/progressive alignment part.
 */
static const int RESTARTS_PER_SPLIT = 10;


/* Use standard kmeans (lloyds) or kmeans++. Both are almost
 * indistinguishable here, although kmeans++ is slightly ahead the
 * fewer seeds you pick (and it should be better in theory)
 */
#define USE_KMEANS_LLOYDS 0


#ifndef HAVE_LOG2
#define log2(x)  (log(x) / 0.69314718055994530942)
#endif
#define NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2)


/* Seed selection method: SAMPLE_SEEDS_BY_LENGTH is the original mBed
 * approach: Sample iSeeds sequence with constant stride from length-sorted X.
 * It might be better to pick the seeds randomly, because length sorting might
 * be more prone to including outliers (e.g. very long and very short seqs).
 * However, we could never observer such a thing. So just stick to the
 * original version as this also removes the random element.
 */
enum SEED_SELECTION_TYPE {
    SELECT_SEEDS_RANDOMLY,
    SELECT_SEEDS_BY_LENGTH
};
#define SEED_SELECTION SELECT_SEEDS_BY_LENGTH


/* Tests on BAliBase (RV11,12,20,30,40,50; 10 runs each) show there is
 * no difference between mbed-trees created from cosine or euclidean
 * distances (simple version, just using disparities).
 */
#define USE_EUCLIDEAN_DISTANCE 1


/* print some mbed pre-cluster usage to screen */
#define PRINT_CLUSTER_DISTRIBUTION 0


#define TRACE 0


typedef struct {
    /* Number of final clusters
     */
    int iNClusters;

    /* Coordinates (columns) for each cluster (rows)
     * valid indices: [0...iNClusters][0...dim-1]
     */
    double **ppdClusterCenters;

    /* Dimensionality of input data and cluster center coordinates
     */
    int iDim;

    /* Number of objects per cluster
     */
    int *piNObjsPerCluster;

    /* Objects (indices) for each cluster. [i][j] will point to (index
     * of) object j in cluster i
     */
    int **ppiObjIndicesPerCluster;
} bisecting_kmeans_result_t;


/**
 * @brief Free KMeans result structure.
 *
 * @param[out]  prResult_p
 * K-Means result to free
 *
 * @see NewKMeansResult()
 */
void
FreeKMeansResult(bisecting_kmeans_result_t **prResult_p)
{
    int iAux;
    CKFREE((*prResult_p)->piNObjsPerCluster);
    for (iAux=0; iAux<(*prResult_p)->iNClusters; iAux++) {
        CKFREE((*prResult_p)->ppiObjIndicesPerCluster[iAux]);
        CKFREE((*prResult_p)->ppdClusterCenters[iAux]);
    }
    CKFREE((*prResult_p)->ppiObjIndicesPerCluster);
    CKFREE((*prResult_p)->ppdClusterCenters);
    (*prResult_p)->iNClusters = 0;
    (*prResult_p)->iDim = 0;
    CKFREE(*prResult_p);
}
/***   end: FreeKMeansResult()   ***/


/**
 * @brief Allocate new KMeans result
 *
 * @param[out] prKMeansResult_p
 * K-Means result to free
 *
 * @see NewKMeansResult()
 */
void
NewKMeansResult(bisecting_kmeans_result_t **prKMeansResult_p)
{
    (*prKMeansResult_p) = (bisecting_kmeans_result_t *)
        CKCALLOC(1, sizeof(bisecting_kmeans_result_t));
    (*prKMeansResult_p)->iNClusters = 0;
    (*prKMeansResult_p)->iDim = 0;
    (*prKMeansResult_p)->ppdClusterCenters = NULL;
    (*prKMeansResult_p)->piNObjsPerCluster = NULL;
    (*prKMeansResult_p)->ppiObjIndicesPerCluster = NULL;

}
/***   end: NewKMeansResult()   ***/


/**
 * @brief Calculate the euclidean distance between two vectors
 *
 * @param[in] v1
 * First vector with dim dimensions
 * @param[in] v2
 * Second vector with dim dimensions
 * @param[in] dim
 * Dimensionality of v1 and v2
 *
 * @return euclidean distance as double
 *
 * @note Could probably be inlined. Also perfect for SSE
 */
double
EuclDist(const double *v1, const double *v2, const int dim)
{
    int i; /* aux */
    double dist; /* returned distance */

    assert(v1!=NULL);
    assert(v2!=NULL);

    dist = 0.0;
    for (i=0; i<dim; i++) {
        dist += pow(v1[i]-v2[i], 2.0);
    }

    return sqrt(dist);
}
/***   end: EuclDist()   ***/


/**
 * @brief Calculate the cosine distance between two vectors
 *
 * @param[in] v1
 * First vector with dim dimensions
 * @param[in] v2
 * Second vector with dim dimensions
 * @param[in] dim
 * Dimensionality of v1 and v2
 *
 * @return cosine distance as double
 *
 * @note Could probably be inlined. Also perfect for SSE
 */
double
CosDist(const double *v1, const double *v2, const int dim)
{
    int i; /* aux */
    double dist; /* returned distance */
    double s, sq1, sq2;

    assert(v1!=NULL);
    assert(v2!=NULL);

    s = 0.0;
    sq1 = 0.0;
    sq2 = 0.0;
    for (i=0; i<dim; i++) {
        s += (v1[i]*v2[i]);
        sq1 += pow(v1[i], 2.0);
        sq2 += pow(v2[i], 2.0);
    }
    sq1 = sqrt(sq1);
    sq2 = sqrt(sq2);

    if ((sq1 * sq2) < DBL_EPSILON) {
        dist = 1 - s / (sq1 * sq2);
    } else {
        /* if the denominator gets small, the fraction gets big, hence dist
           gets small: */
        dist = 0.0;
    }
    return dist;
}
/***   end: CosDist()   ***/


/**
 * @brief convert sequences into mbed-like (distance) vector
 * representation. Seeds (prMSeq sequence indices) have to be picked before
 *
 * @param[out] ppdSeqVec
 * Pointer to preallocated matrix of size nseqs x iSeeds
 * @param[in] prMSeq
 * Sequences which are to be converted
 * @param[in] piSeeds
 * Array of sequences in prMSeq which are to be used as seeds.
 * @param[in] iNumSeeds
 * Number of seeds/elements in piSeeds
 * @param[in] iPairDistType
 * Type of pairwise distance comparison
 *
 */
int
SeqToVec(double **ppdSeqVec, mseq_t *prMSeq, 
         int *piSeeds, const int iNumSeeds,
         const int iPairDistType)
{
    symmatrix_t *prDistmat = NULL;
    int iSeqIndex;
    int iSeedIndex;
   /* indices for restoring order */
    int *restore; 
    /* sorted copy of piSeeds */
    int *piSortedSeeds; 

#if TIMING
    Stopwatch_t *stopwatch = StopwatchCreate();
    StopwatchZero(stopwatch);
    StopwatchStart(stopwatch);
#endif
    
    assert(NULL != ppdSeqVec);
    assert(iNumSeeds>0 && iNumSeeds<=prMSeq->nseqs);

    piSortedSeeds = CKMALLOC(iNumSeeds * sizeof(int));
    memcpy(piSortedSeeds, piSeeds, iNumSeeds*sizeof(int));

    /* need them sorted, otherwise the swapping approach below might
     * break 
     */
    qsort(piSortedSeeds, iNumSeeds, sizeof(int), IntCmp);


    /* rearrange mseq order so that we can use ktuple_pairdist code as
     * is. That is, swap seeds and non-seeds such that the seeds end
     * up in front of mseq. This way we can use the KTuplePairDist()
     * code, without making a copy of mseq. Also, keep an array of
     * indices which serves to restore the original sequence order
     * after putting the seeds in front
     *
     */
    restore = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
        restore[iSeqIndex] = iSeqIndex;
    }
    for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
#if TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %u", piSortedSeeds[iSeedIndex], iSeedIndex);
#endif
        if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
            int swap;
            SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);

            swap = restore[iSeedIndex];
            restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
            restore[piSortedSeeds[iSeedIndex]] = swap;
        }
    }
#if TRACE
    printf("DEBUG(%s|%s():%d): restore indices =",
           __FILE__, __FUNCTION__, __LINE__);
    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
        printf(" %u:%u", iSeqIndex, restore[iSeqIndex]);
    }
    printf("\n");

    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
        Log(&rLog, LOG_FORCED_DEBUG, "swapped seq no %d now:  seq = %s",
                  iSeqIndex, prMSeq->sqinfo[iSeqIndex].name);
    }
#endif


    /* convert sequences into vectors of distances by calling pairwise
     * distance function for each seq/seed pair
     *
     */
    if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType,
                           0, iNumSeeds, 0, prMSeq->nseqs,
                           NULL, NULL)) {
        Log(&rLog, LOG_ERROR, "Could not compute pairwise distances for mbed.");
        FreeSymMatrix(&prDistmat);
        CKFREE(piSortedSeeds);
        CKFREE(restore);
        return -1;
    }
#if TRACE
    {
        char **labels;
        labels = (char **) CKMALLOC(prMSeq->nseqs * sizeof(char *));
        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
            labels[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].name;
        }
        SymMatrixPrint(prDistmat, labels, NULL);
        CKFREE(labels);
    }
#endif


    /* update ppdSeqVec according to prDistmat. keep in mind that we
     * changed mseq order
     *
     */
    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
        for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
            ppdSeqVec[restore[iSeqIndex]][iSeedIndex] =
                SymMatrixGetValue(prDistmat, iSeqIndex, iSeedIndex);
        }
    }


    /* restore mseq order by reverse swapping
     *
     * need reverse order now, so start from top index
     */
    iSeedIndex=iNumSeeds-1;
    do {
#if TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %d", piSortedSeeds[iSeedIndex], iSeedIndex);
#endif
        if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
            int swap;

            SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);

            swap = restore[iSeedIndex];
            restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
            restore[piSortedSeeds[iSeedIndex]] = swap;
        }
    } while (0 != iSeedIndex--);
#if TRACE
     for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
         Log(&rLog, LOG_FORCED_DEBUG, "restored seq no %d:  seq = %s %s",
                   iSeqIndex, prMSeq->sqinfo[iSeqIndex].name, prMSeq->seq[iSeqIndex]);
     }
#endif

#if TRACE
    for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
        printf("DEBUG: seq %-4u as distance vector =", iSeqIndex);
        for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
            printf(" %f", ppdSeqVec[iSeqIndex][iSeedIndex]);
        }
        printf("\n");
    }
#endif

    
    FreeSymMatrix(&prDistmat);
    CKFREE(restore);
#if TIMING
    StopwatchStop(stopwatch);
    StopwatchDisplay(stdout, "Total time for SeqToVec(): ", stopwatch);
    StopwatchFree(stopwatch);
#endif


    return 0;
}
/***   end: SeqToVec()   ***/


/**
 * @brief Select seeds to be used from an prMSeq
 *
 * @param[out] piSeeds
 * Will store the indices of prMSeqs seqs used to be as seeds here. Must be preallocated.
 * @param[in] iNumSeeds
 * Number of seeds to be picked
 * @param[in] iSelectionMethod
 * Seed selection method to be used
 * @param[in] prMSeq
 * The prMSeq structure to pick sequences from
 *
 * @return: Non-zero on failure
 *
 */    
int
SeedSelection(int *piSeeds, int iNumSeeds, int iSelectionMethod,  mseq_t *prMSeq)
{
    /* seed iterator */
    int iSeedIndex;
    /* sequence iterator */
    int iSeqIndex;

    if (SELECT_SEEDS_RANDOMLY == iSelectionMethod) {
        int *piPermArray;

        Log(&rLog, LOG_INFO,
             "Using %d seeds (randomly chosen) for mBed (from a total of %d sequences)",
             iNumSeeds, prMSeq->nseqs);

        PermutationArray(&piPermArray, iNumSeeds);
        /* copy to piSeeds */
        for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
            piSeeds[iSeedIndex] = piPermArray[iSeedIndex];
        }
        CKFREE(piPermArray);
        
    }  else if (SELECT_SEEDS_BY_LENGTH == iSelectionMethod) {

        /* step size for picking with constant stride */
        int iStep;
        int *piSeqLen =  CKMALLOC(prMSeq->nseqs * sizeof(int));
        int *piOrder = CKMALLOC(prMSeq->nseqs * sizeof(int));

        Log(&rLog, LOG_INFO,
             "Using %d seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of %d sequences)",
             iNumSeeds, prMSeq->nseqs);

        iStep = prMSeq->nseqs / iNumSeeds; /* iStep will never get too big
                                            * due to rounding */
        /* first get an array of seq indices order according to
           corresponding sequence length: piOrder */
        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
            piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len;
        }
        QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, 'd', FALSE);
#if 0
        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
            Log(&rLog, LOG_FORCED_DEBUG, "Descending order (no %d): seq %d has len %d)",
                      iSeqIndex, piOrder[iSeqIndex], piSeqLen[piOrder[iSeqIndex]]);
        }
#endif
        CKFREE(piSeqLen);
        iSeedIndex = 0;
        for (iSeqIndex=0; iSeedIndex<iNumSeeds; iSeqIndex+=iStep) {
            piSeeds[iSeedIndex++] = piOrder[iSeqIndex];
        }
        CKFREE(piOrder);

    } else {
        
        Log(&rLog, LOG_ERROR, "Internal error: unknown seed selection type");
        return -1;
    }


#ifdef PICK_SEEDS_FROM_KMEANS_CLUSTERS
    /* do initial mbedding and kmeans. then pick seeds from each cluster and
       do full mbed with these seeds. idea is that those seeds represent the
       sequence space better than random seeds. however, this reduces the
       quality slightly. random is almost always better.
    */

    if (1) {
        /* seqs represented as distance vectors */
        double **ppdSeqVec;
        double dCost = -1.0;
        /* assignments for each seq to the K newly created clusters */
        int *piKMeansClusterAssignments = CKMALLOC(prMSeq->nseqs * sizeof(int));
        double *pdKMeansClusterCenters = CKCALLOC(iNumSeeds * iNumSeeds, sizeof(double));
        /* create a copy of ppdSeqVec suitable for KMeans */
        double *pdKMeansVectors = CKMALLOC(prMSeq->nseqs * iNumSeeds * sizeof(double));
        bool *pbClusterUsed = CKCALLOC(iNumSeeds, sizeof(bool));

        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Experimental feature: K-means on first round embedding to get better seeds");
        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Reuse seeds from clusters");
        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Could try log(n) instead of log(n)^2 in first round");
        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME hardcoded iPairDistType PAIRDIST_KTUPLE");
        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Show that this is fast *and* good");

        /*
          Log(&rLog, LOG_FORCED_DEBUG, "%s", "Overriding iNumSeeds");
          iNumSeeds = (int) pow(log2((double)prMSeq->nseqs), 2);
        */

        ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
            ppdSeqVec[iSeqIndex] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
        }
        if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, PAIRDIST_KTUPLE)) {
            Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
            return -1;
        }

        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
            int iDim;
            for (iDim=0; iDim<iNumSeeds; ++iDim) {
                pdKMeansVectors[iDim*iSeqIndex + iDim] = ppdSeqVec[iSeqIndex][iDim];
            }
        }


        Log(&rLog, LOG_FORCED_DEBUG, "%s\n", "FIXME hardcoded RESTARTS_PER_SPLIT");
        dCost = KMeans(prMSeq->nseqs, iNumSeeds, iNumSeeds,
                       pdKMeansVectors,
                       RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
                       pdKMeansClusterCenters, piKMeansClusterAssignments);
        Log(&rLog, LOG_FORCED_DEBUG, "Best split cost = %f", dCost);
        
        Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Check for Nan in cluster centers");

#if TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "%s", "K-Means output:");
        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
            Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u (%s) to cluster %d",
                      iSeqIndex,  prMSeq->sqinfo[iSeqIndex].name,
                      piKMeansClusterAssignments[iSeqIndex]);
        }
#endif

        Log(&rLog, LOG_FORCED_DEBUG, "FIXME %s", "proof of concept implementation: Pick first sequences from each clusters instead of reusing seeds");
        iSeedIndex = 0;
        for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
            int iAssignedCluster = piKMeansClusterAssignments[iSeqIndex];
            if (pbClusterUsed[iAssignedCluster]) {
                continue;
            } else {
                /*LOG_DEBUG("Picked seed %d from cluster %d", iSeqIndex,iAssignedCluster);*/
                piSeeds[iSeedIndex++] = iSeqIndex;
                pbClusterUsed[iAssignedCluster] = TRUE;
            }
        }
        CKFREE(pbClusterUsed);

        CKFREE(pdKMeansVectors);
        CKFREE(pdKMeansClusterCenters);
        CKFREE(piKMeansClusterAssignments);
        
    }
    /* if (1) */
#endif


    if (LOG_DEBUG <= rLog.iLogLevelEnabled) {
        for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
            Log(&rLog, LOG_DEBUG, "Picked sequence %d (%s) as seed no %d",
                 piSeeds[iSeedIndex], prMSeq->sqinfo[piSeeds[iSeedIndex]].name, iSeedIndex);
        }
    }
    
    return 0; 
}
/* end of SeedSelection() */


/**
 * @brief Bisecting K-Means clustering. Repeatedly calls K-Means with
 * a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster.
 *
 * @param[out] prKMeansResult_p
 * Result of Bisecting KMeans. Will be allocated here.
 * Caller has to free. See @see FreeKMeansResult()
 * @param[in] iNObjs
 * Number of objects/sequences to cluster
 * @param[in] iDim
 * Dimensionality of input data
 * @param[in] ppdVectors
 * each row holds iDim points for this object's coordinates
 * @param[in] iMinRequiredObjsPerCluster
 * Minimum number of objects per Cluster (inclusive)/
 * @param[in] iMaxAllowedObjsPerCluster
 * Maximum number of objects per Cluster (inclusive). Function returns once no
 * cluster contains more then this number of objects. Soft limit!
 *
 * @note Convoluted code. Could use some restructuring. My apologies.
 * AW
 *
 */
void
BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
                const int iNObjs, const int iDim, double **ppdVectors,
                const int iMinRequiredObjsPerCluster,
                const int iMaxAllowedObjsPerCluster)
{
    int iN, iD;
    /* cluster centers for each cluster created at each split */
    double *pdKClusterCenters;
    /* keep track of updated object indices per newly created
     * cluster */
    int *piCurObjToUpdate;
    /* number of times we called 2-means */
    int iNRounds;
    /* flag for unsuccessful cluster split */
    bool bNaNDetected = FALSE;
    /* flag for detected small cluster after split  */
    bool bSmallClusterDetected;
    /* queue of clusters which are to be split */
    int_queue_t rClusterSplitQueue;
    
#if TIMING
    Stopwatch_t *stopwatch = StopwatchCreate();
#endif


    piCurObjToUpdate = (int *) CKMALLOC(2 * sizeof(int));

    NewKMeansResult(prKMeansResult_p);

    /* new cluster centers created at each split/KMeans run
     */
    pdKClusterCenters = (double *) CKCALLOC(2 * iDim, sizeof(double));

    /* init results by setting a first cluster that contains all objects
     */
    (*prKMeansResult_p)->iNClusters = 1;
    (*prKMeansResult_p)->iDim = iDim;
    /* fake center coordinates of first cluster */
    (*prKMeansResult_p)->ppdClusterCenters =
        (double **) CKMALLOC(1 * sizeof(double *));
    (*prKMeansResult_p)->ppdClusterCenters[0] =
        (double *) CKMALLOC(iDim * sizeof(double));
    /* objects per cluster */
    (*prKMeansResult_p)->piNObjsPerCluster =
        (int *) CKMALLOC(1 * sizeof(int));
    (*prKMeansResult_p)->piNObjsPerCluster[0] = iNObjs;
    /* object indices per cluster */
    (*prKMeansResult_p)->ppiObjIndicesPerCluster =
        (int **) CKMALLOC(1 * sizeof(int *));
    (*prKMeansResult_p)->ppiObjIndicesPerCluster[0] = (int *)
        CKMALLOC(iNObjs * sizeof(int));
    for (iN=0; iN<iNObjs; iN++) {
        (*prKMeansResult_p)->ppiObjIndicesPerCluster[0][iN] = iN;
    }


    /* Starting with the first cluster that now contains all the
     * sequences keep splitting until no cluster contains more than
     * iMaxAllowedObjsPerCluster
     *
     * Keep a queue of clusters (rClusterSplitQueue) to split
     *
     * At each split values/memory of the just split cluster will be
     * reused and exactly one new only allocated.
     *
     * Example:
     * Given the following cluster assignments
     * 0 0 0 0 1 1 2 2 2 3 3 3 3 3 3 3 4 4
     * and a K-Means split in cluster 3 at |:
     * 0 0 0 0 1 1 2 2 2 3 3 3 | 3 3 3 3 4 4
     * The result should be this:
     * 0 0 0 0 1 1 2 2 2 3 3 3 | 5 5 5 5 4 4
     *
     *
     */
    INT_QUEUE_INIT(&rClusterSplitQueue);
    if (iNObjs>iMaxAllowedObjsPerCluster) {
        /* pish fake first cluster index */
        INT_QUEUE_PUSH(&rClusterSplitQueue, 0); 
    }
    iNRounds = 0;
    while (! INT_QUEUE_EMPTY(&rClusterSplitQueue)) {
        /* assignments for each seq to the K newly created clusters */
        int *piKClusterAssignments;
        /* number of objects in cluster that is to be split */
        int iNObjsInClusterToSplit;
        /* coordinates of points in cluster that is to be split
         * array of size n*d where [d*i + j] gives coordinate j of point i
         */
        double *pdVectorsInClusterToSplit;
        /* copy of object indices in split cluster */
        int *piObjIndicesOfSplitCluster;
        /* best cost of kmeans rounds */
        double dCost = -1.0;

        /*  indices for the two created clusters
         */
        /* index of cluster to split */
        int iClusterToSplot;
        /* index of newly created cluster at each round. data for the
           other created cluster goes to the one just split,
           i.e. (*piClusterToSplot) */
        int iNewClusterIdx;

#if TIMING
        StopwatchZero(stopwatch);
        StopwatchStart(stopwatch);
#endif

        INT_QUEUE_POP(&rClusterSplitQueue, &iClusterToSplot);
        
        iNObjsInClusterToSplit = (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot];
        piKClusterAssignments = (int *)
            CKMALLOC(iNObjsInClusterToSplit * sizeof(int));

        pdVectorsInClusterToSplit = (double *)
            CKMALLOC(iNObjsInClusterToSplit * iDim * sizeof(double));
        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
            for (iD=0; iD<iDim; ++iD) {
                int iThisObjIdx =
                    (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN];
                pdVectorsInClusterToSplit[iDim*iN + iD] = ppdVectors[iThisObjIdx][iD];
            }
        }

#if TRACE
        Log(&rLog, LOG_FOCRED_DEBUG, "Round %d: Will split cluster %d which has %d objects",
                  iNRounds, iClusterToSplot, iNObjsInClusterToSplit);
        fprintf(stderr, "DEBUG(%s|%s():%d): Object indices in cluster to split are:",
                __FILE__, __FUNCTION__, __LINE__);
        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
            fprintf(stderr, " %u",
                    (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
        }
        fprintf(stderr, "\n");
        (void) fflush(stderr);
#endif

#if TRACE
        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
            fprintf(stderr,
                    "DEBUG(%s|%s():%d): Coordinate of object %u (real index %u) in cluster to split:",
                    __FILE__, __FUNCTION__, __LINE__, iN,
                    (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
            for (iD=0; iD<iDim; iD++) {
                fprintf(stderr, " %f", pdVectorsInClusterToSplit[iDim*iN + iD]);
            }
            fprintf(stderr, "\n");
            (void) fflush(stderr);
        }
#endif
        
       /* KMeans(1 "The number of points in the data set",
        *        2 "The number of clusters to look for",
        *        3 "The number of dimensions that the data set lives in",
        *        4 "points: An array of size n*d where points[d*i + j] gives coordinate j of point i",
        *        5 "attempts: The number of times to independently run k-means",
        *        6 "use_lloyds_method: uses kmpp if false, otherwise lloyds method",
        *        7 "centers: This can either be null or an array of size k*d.
        *           In the latter case, centers[d*i + j] will give coordinate j of center i.
        *           If the cluster is unused, it will contain NaN instead.",
        *        8 "assignments: This can either be null or an array of size n.
        *           In the latter case, it will be filled with the cluster that each point is assigned to
        *           (an integer between 0 and k-1 inclusive).");
        */
        dCost = KMeans(iNObjsInClusterToSplit, 2, iDim,
                       pdVectorsInClusterToSplit,
                       RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
                       pdKClusterCenters, piKClusterAssignments);

#if TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "%s", "Raw K-Means output:");
        for (iN=0; iN<2; iN++) {
            fprintf(stderr, "DEBUG(%s|%s():%d): Cluster Center %u =",
                    __FILE__, __FUNCTION__, __LINE__, iN);
            for (iD=0; iD<iDim; iD++) {
                fprintf(stderr, " %f", pdKClusterCenters[iN*iDim+iD]);
            }
            fprintf(stderr, "\n");
            (void) fflush(stderr);
        }
        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
            Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u to cluster %d (of #%u)",
                      iN,  piKClusterAssignments[iN], 2);
        }
#endif
        
        /* real index of one of the newly created clusters. the other
         * one is iClusterToSplot */
        iNewClusterIdx = (*prKMeansResult_p)->iNClusters;

        
        /* We don't want Clusters which are too small. Check here if a
         * split created a small cluster and if yes, discard the
         * solution. Because the cluster has already been removed from
         * the queue, we can just continue.
         */
        bSmallClusterDetected = FALSE;
        if (iMinRequiredObjsPerCluster>1) {
            int iNObjsCluster[2];
            iNObjsCluster[0] = 0; /* first cluster */
            iNObjsCluster[1] = 0; /* second cluster */
            for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
                iNObjsCluster[piKClusterAssignments[iN]]+=1;
            }
            
            if (iNObjsCluster[0]<iMinRequiredObjsPerCluster
                ||
                iNObjsCluster[1]<iMinRequiredObjsPerCluster) {
                bSmallClusterDetected = TRUE;
                Log(&rLog, LOG_FORCED_DEBUG, "Skipping this split because objs in 1st/2nd cluster = %d/%d < %d",
                          iNObjsCluster[0], iNObjsCluster[1], iMinRequiredObjsPerCluster);
            }
        }
        
        /* Same logic as for small clusters applies if KMeans couldn't
         * split the cluster. In this case one of its center
         * coordinates will be NaN, which we check for in the
         * following.
         */
        if (! bSmallClusterDetected) {
            for (iN=0; iN<2; iN++) {
                bNaNDetected = FALSE;
                for (iD=0; iD<iDim; iD++) {
                    if (0 != isnan(pdKClusterCenters[iN*iDim+iN])) {
                        /* Got NaN as coordinate after splitting */
                        Log(&rLog, LOG_WARN, "%s(): Can't split cluster no. %d which has %d objects any further. %s",
                             __FUNCTION__,
                             iClusterToSplot, iNObjsInClusterToSplit,
                         "Hope it's not too big and doesn't slow things down."
                            );
                        bNaNDetected = TRUE;
                        break;
                    }
                }
                if (bNaNDetected) {
                    break;
                }
            }
        }
        
        /* Discarding split of this cluster. It has been removed from the
         * queue already. so just continue
         */
        if (bNaNDetected || bSmallClusterDetected) {
            CKFREE(piKClusterAssignments);
            CKFREE(pdVectorsInClusterToSplit);
            continue;
        }


        /* update cluster centers: pdClusterCenters
         *
         */
        /* reuse memory of existing/old/split cluster
         */
        for (iN=0; iN<iDim; iN++) {
            double dCoord = pdKClusterCenters[0*iDim+iN];
            (*prKMeansResult_p)->ppdClusterCenters[iClusterToSplot][iN] = dCoord;
        }
        /* realloc and set new one
         */
        (*prKMeansResult_p)->ppdClusterCenters = (double **)
            CKREALLOC((*prKMeansResult_p)->ppdClusterCenters,
                      ((*prKMeansResult_p)->iNClusters+1)  * sizeof(double *));
        (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx] = (double *)
            CKMALLOC(iDim * sizeof(double));
        for (iD=0; iD<iDim; iD++) {
            double dCoord = pdKClusterCenters[1*iDim+iD];
            (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx][iD] = dCoord;
        }
#if TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of cluster centers done. Cluster centers so far:");
        for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
            fprintf(stderr, "DEBUG(%s|%s():%d): Center %u =",
                    __FILE__, __FUNCTION__, __LINE__, iN);
            for (iD=0; iD<iDim; iD++) {
                fprintf(stderr, " %f", (*prKMeansResult_p)->ppdClusterCenters[iN][iD]);
            }
            fprintf(stderr, "\n");
            (void) fflush(stderr);
        }
#endif


        /* update #seqs per cluster: piNObjsPerCluster
         *
         */
        (*prKMeansResult_p)->piNObjsPerCluster = (int *)
            CKREALLOC((*prKMeansResult_p)->piNObjsPerCluster,
                      ((*prKMeansResult_p)->iNClusters+1) * sizeof(int));
        /* init new and old one to zero */
        (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] = 0;
        (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] = 0;
        /* now update values */
        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
            if (0 == piKClusterAssignments[iN]) {
                (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] += 1;
            } else if (1 == piKClusterAssignments[iN]) {
                (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] += 1;
            } else {
                /* there used to be code for iK>=2 in r101 */
                Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
                      piKClusterAssignments[iN]);
            }
        }
#if TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of NObjs per cluster done:");
        for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
            Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d contains %d sequences",
                      iN, (*prKMeansResult_p)->piNObjsPerCluster[iN]);
        }
#endif
        /* queue clusters if still they are still too big
         */
        if ((*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] > iMaxAllowedObjsPerCluster) {
            INT_QUEUE_PUSH(&rClusterSplitQueue, iClusterToSplot);
        }
        if ((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] > iMaxAllowedObjsPerCluster) {
            INT_QUEUE_PUSH(&rClusterSplitQueue, iNewClusterIdx);
        }
        
        
        /* update which objects belong to which cluster:
         *
         * note:  piNObjsPerCluster needs to be already updated
         *
         */
        /* create a copy of the object indices in the split cluster
         * (original will be overwritten)
         */
        piObjIndicesOfSplitCluster = (int *) CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
        memcpy(piObjIndicesOfSplitCluster,
               (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
               iNObjsInClusterToSplit * sizeof(int));

        (*prKMeansResult_p)->ppiObjIndicesPerCluster = (int **)
            CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster,
                      ((*prKMeansResult_p)->iNClusters+1) * sizeof(int *));


        (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot] =
            (int *) CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
                              (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] * sizeof(int));

        (*prKMeansResult_p)->ppiObjIndicesPerCluster[iNewClusterIdx] =
            (int *) CKMALLOC((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] * sizeof(int));


        /* now reassign the object indices to the assigned cluster
         */
        piCurObjToUpdate[0] = 0;
        piCurObjToUpdate[1] = 0;
        for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
            int iThisObjIdx = piObjIndicesOfSplitCluster[iN];
            int iThisClusterAssignment =  piKClusterAssignments[iN];
            int iThisOffset = piCurObjToUpdate[iThisClusterAssignment];
            int iThisClusterIndex = 0;

            if (0 == iThisClusterAssignment) {
                iThisClusterIndex = iClusterToSplot;
            } else if (1 == iThisClusterAssignment) {
                iThisClusterIndex = iNewClusterIdx;
            } else {
                /* there used to be code for iK>=2 in r101 */
                Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
                      piKClusterAssignments[iN]);
            }
#if 0
            Log(&rLog, LOG_FORCED_DEBUG, "Setting (*prKMeansResult_p)->ppiObjIndicesPerCluster[%d][%d] = %d",
                      iThisClusterIndex, iThisOffset, iThisObjIdx);
#endif
            (*prKMeansResult_p)->ppiObjIndicesPerCluster[iThisClusterIndex][iThisOffset] = iThisObjIdx;
            piCurObjToUpdate[iThisClusterAssignment]+=1;
        }
        CKFREE(piObjIndicesOfSplitCluster);
#if TRACE
        for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
            int iObj;
            fprintf(stderr, "DEBUG(%s|%s():%d): Objects in cluster %u: ",
                    __FILE__, __FUNCTION__, __LINE__, iN);
            for (iObj=0; iObj<(*prKMeansResult_p)->piNObjsPerCluster[iN]; iObj++) {
                fprintf(stderr, " %u", (*prKMeansResult_p)->ppiObjIndicesPerCluster[iN][iObj]);
            }
            fprintf(stderr, "\n");
            (void) fflush(stderr);
        }
#endif


#if TIMING
        StopwatchStop(stopwatch);
        StopwatchDisplay(stdout, "Total time after next round in Bisecting-KMeans: ", stopwatch);
#endif


        /* finally: increase number of clusters
         */
        (*prKMeansResult_p)->iNClusters += 1;
        iNRounds += 1;
        CKFREE(piKClusterAssignments);
        CKFREE(pdVectorsInClusterToSplit);

    } /* while */
    INT_QUEUE_DESTROY(&rClusterSplitQueue);


    Log(&rLog, LOG_DEBUG,
         "Bisecting K-means finished after %d rounds (no more clusters to split)",
         iNRounds);
         
#if TIMING
    StopwatchFree(stopwatch);
#endif

    /* @note could use progress/timer */

    CKFREE(pdKClusterCenters);
    CKFREE(piCurObjToUpdate);

    return;
}
/***   end: BisectingKmeans()   ***/


/**
 *
 * @brief From scratch reimplementation of mBed: Blackshields et al.
 * (2010); PMID 20470396.
 *
 * Idea is a follows:
 * - convert sequences into vectors of distances
 * - cluster the vectors using k-means
 * - cluster each of the k clusters using upgma (used cached distances
 *    from above?)
 * - join the sub-clusters to create on tree (use UPGMA on k-means
 *   medoids)
 *
 *
 * @param[out] prMbedTree_p
 * Created upgma tree. will be allocated here. use FreeMuscleTree()
 * to free
 * @param[in] prMSeq
 * Multiple sequences
 * @param[in] iPairDistType
 * Distance measure for pairwise alignments
 * @param[in] pcGuidetreeOut
 * Passed down to GuideTreeUpgma()
 *
 * @return Zero on success, non-zero on error
 *
 */
int
Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
     const char *pcGuidetreeOut)
{
    /* number of seeds */
    int iNumSeeds;
    /* seed indices matching prMSeq */
    int *piSeeds; 
    /* seqs represented as distance vectors */
    double **ppdSeqVec;
    /* kmeans result */
    bisecting_kmeans_result_t *prKMeansResult = NULL;
    /* distance matrix of kmeans (pre-)cluster centers */
    symmatrix_t *prPreClusterDistmat = NULL;
    /* auxiliary for symmetric matrix output; tree routines etc */
    char **ppcLabels;
    int iNodeIndex;
    /* mapping of cluster-center tree node indices to corresponding
       cluster */
    int *piClusterToTreeNode;
    int iClusterIndex;
    int iI, iJ;
    FILE *pfOut;
    progress_t *prSubClusterDistanceProgress;
    bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;

#if FULL_WITHIN_CLUSTER_DISTANCES
    Log(&rLog, LOG_DEBUG, "Computing real distances within subclusters for mBed.");
#else
    Log(&rLog, LOG_DEBUG, "Computing vector distances within subclusters for mBed.");
#endif
    
#if MBED_TIMING
    Stopwatch_t *stopwatch = StopwatchCreate();
#endif

    assert(NULL != prMbedTree_p);
    assert(NULL != prMSeq);


    iNumSeeds = (int) NUMBER_OF_SEEDS(prMSeq->nseqs);
    if (iNumSeeds >= prMSeq->nseqs) {
        /* -1 is condition for RandomUniqueIntArray */
        iNumSeeds = prMSeq->nseqs-1; 
        Log(&rLog, LOG_DEBUG,
             "Automatically determined number of seeds is bigger (or equal) the number of sequences. Will set it to %d",
             iNumSeeds);
    }

       
    /* Turn sequences into vectors of distances to the seeds
     *
     */
    piSeeds = (int *) CKMALLOC(iNumSeeds * sizeof(int));
    if (0 != SeedSelection(piSeeds, iNumSeeds, SEED_SELECTION, prMSeq)) {
        Log(&rLog, LOG_ERROR, "Something went wrong during seed selection for mbed");
        return -1;
    }
    ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
    for (iI=0; iI<prMSeq->nseqs; iI++) {
        ppdSeqVec[iI] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
    }
    if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, iPairDistType)) {
        Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
        return -1;
    }
    CKFREE(piSeeds);

    
    /* Calculate (pre-)clusters of sequence vectors by applying
     * bisecting kmeans
     *
     */
#if MBED_TIMING
    /* start clock only here, to make sure we don't include pairwise
     * distance computation */
    StopwatchZero(stopwatch);
    StopwatchStart(stopwatch);
#endif

    BisectingKmeans(&prKMeansResult, prMSeq->nseqs, iNumSeeds, ppdSeqVec,
                    MIN_REQUIRED_SEQ_PER_PRECLUSTER,
                    MAX_ALLOWED_SEQ_PER_PRECLUSTER);
    Log(&rLog, LOG_INFO,
         "mBed created %u cluster/s (with a minimum of %d and a soft maximum of %d sequences each)",
         prKMeansResult->iNClusters,
         MIN_REQUIRED_SEQ_PER_PRECLUSTER,
         MAX_ALLOWED_SEQ_PER_PRECLUSTER);


#if PRINT_CLUSTER_DISTRIBUTION
    Log(&rLog, LOG_FORCED_DEBUG, "Bisecting Kmeans returned %d clusters", prKMeansResult->iNClusters);
    for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
#if TRACE
        int iD;
        Log(&rLog, LOG_FORCED_DEBUG, "Diagnostic output for cluster %d follows:", iI);
        fprintf(stderr, "DEBUG(%s|%s():%d): center coordinates =",
                __FILE__, __FUNCTION__, __LINE__);
        for (iD=0; iD<iNumSeeds; iD++) {
            fprintf(stderr, " %f", prKMeansResult->ppdClusterCenters[iI][iD]);
        }
        fprintf(stderr, "\n");
        fflush(stderr);
#endif
        Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d has %d objects assigned",
                  iI, prKMeansResult->piNObjsPerCluster[iI]);
#if TRACE
        for (iJ=0; iJ<prKMeansResult->piNObjsPerCluster[iI]; iJ++) {
            int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][iJ];
            
            Log(&rLog, LOG_FORCED_DEBUG, "Cluster %u: object %u has index %u (= seq %s)",
                      iI, iJ,  iRealIndex, prMSeq->sqinfo[iRealIndex].name);
        }
#endif
    }
#endif

    
    /* Cluster pre-clusters produced by k-means.
     *
     * Do this by calculating the vector distances of the cluster
     * centers and applying UPGMA.
     *
     * @note could try to force-balance the tree here
     *
     */
    if (0 != NewSymMatrix(&prPreClusterDistmat,
                     prKMeansResult->iNClusters, prKMeansResult->iNClusters)) {
        Log(&rLog, LOG_FATAL, "%s", "Memory allocation for pre-cluster distance-matrix failed");
    }
    for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
        for (iJ=iI+1; iJ<prKMeansResult->iNClusters; iJ++) {
            double dDist;
            dDist = EuclDist(prKMeansResult->ppdClusterCenters[iI],
                             prKMeansResult->ppdClusterCenters[iJ],
                             iNumSeeds);
            SymMatrixSetValue(prPreClusterDistmat, iI, iJ, dDist);
            /* Log(&rLog, LOG_FORCED_DEBUG, "Euclidean distance between clusters %d and %d = %f",
               iI, iJ, dDist); */
        }
    }
    

    /* labels needed for the guide tree building routine only */
    ppcLabels = (char **) CKMALLOC(prKMeansResult->iNClusters * sizeof(char*));
    for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
        ppcLabels[iI] = (char *) CKMALLOC(32 * sizeof(char));
        (void) snprintf(ppcLabels[iI], 32, "Subcluster-%u", iI);
    }
#if TRACE
    Log(&rLog, LOG_FORCED_DEBUG, "%s", "Distance matrix for pre-cluster centers:");
    SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL);
#endif

    GuideTreeUpgma(prMbedTree_p,
                   ppcLabels, prPreClusterDistmat, NULL);

    for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
        CKFREE(ppcLabels[iI]);
    }
    CKFREE(ppcLabels);
#if TRACE
    Log(&rLog, LOG_FORCED_DEBUG, "%s", "Cluster-center guide-tree:");
    LogTree(*prMbedTree_p);
#endif


    /* Now replace each leaf in the pre-cluster-center tree
     * appropriately, i.e. with the corresponding sub-cluster.
     *
     * For each leaf/sub-cluster, create a distance matrix for the
     * corresponding sequences. Use distances between vectors as
     * approximated distances between sequences. Then create a
     * guide-tree by applying UPGMA.
     *
     */

    /* Get a mapping of (pre)cluster number and leaf-node indices in
     * the cluster-center tree. We can add trees to prMbedTree_p
     * because AppendTrees() guarantees that no other than the node to
     * append to changes.
     */
    piClusterToTreeNode = (int*)
        CKMALLOC(prKMeansResult->iNClusters * sizeof(int));
    iNodeIndex = FirstDepthFirstNode(*prMbedTree_p);
    do {
        if (IsLeaf(iNodeIndex, *prMbedTree_p)) {
            int iLeafId = GetLeafId(iNodeIndex, *prMbedTree_p);
            piClusterToTreeNode[iLeafId] = iNodeIndex;
        }
        iNodeIndex =  NextDepthFirstNode(iNodeIndex, *prMbedTree_p);
    } while (NULL_NEIGHBOR != iNodeIndex);


    /* Now step through all the leafs and replace them with the
     * corresponding sub-trees
     */
    NewProgress(&prSubClusterDistanceProgress, LogGetFP(&rLog, LOG_INFO),
                "Distance calculation within sub-clusters", bPrintCR);
    /* for each cluster */
    for (iClusterIndex=0;
         iClusterIndex < prKMeansResult->iNClusters; iClusterIndex++) {
        /* distance matrix for the sub-cluster */
        symmatrix_t *prWithinClusterDistances = NULL;
        int iNSeqInCluster;
        tree_t *prSubClusterTree = NULL;
        
        ProgressLog(prSubClusterDistanceProgress,
                    iClusterIndex, prKMeansResult->iNClusters, FALSE);

#if FULL_WITHIN_CLUSTER_DISTANCES
        mseq_t *prSubClusterMSeq;
        int iPairDistType;
        int iSeqIndex;
        int iOldLogLevel;

        Log(&rLog, LOG_DEBUG, 
            "%s\n", "Calling new Mbed use makes only sense if nseq>MAX_ALLOWED_SEQ_PER_PRECLUSTER");

        if (TRUE == prMSeq->aligned)  {
            iPairDistType = PAIRDIST_SQUIDID_KIMURA;
        } else {
            iPairDistType = PAIRDIST_KTUPLE;
        }
#endif
            
        iNSeqInCluster = prKMeansResult->piNObjsPerCluster[iClusterIndex];
#if TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "#seqs in subcluster no %d = %d",
                  iClusterIndex, iNSeqInCluster);
#endif

#if FULL_WITHIN_CLUSTER_DISTANCES
        /* create an mseq structure for sequences in this cluster
         * don't need most of the members (e.g. orig_seq) but copy
         * them anyway for the sake of completeness
         */
        NewMSeq(&prSubClusterMSeq);
    
        prSubClusterMSeq->nseqs = iNSeqInCluster;
        prSubClusterMSeq->seqtype = prMSeq->seqtype;
        if (NULL!=prMSeq->filename) {
            prSubClusterMSeq->filename = CkStrdup(prMSeq->filename);
        }
        prSubClusterMSeq->aligned = prMSeq->aligned; /* FS, r252 */
        prSubClusterMSeq->seq =  (char **)
            CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
        prSubClusterMSeq->orig_seq =  (char **)
            CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
        prSubClusterMSeq->sqinfo =  (SQINFO *)
            CKMALLOC(prSubClusterMSeq->nseqs * sizeof(SQINFO));
        
        for (iSeqIndex=0; iSeqIndex<iNSeqInCluster; iSeqIndex++) {
            int iRealSeqIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iSeqIndex];
            prSubClusterMSeq->seq[iSeqIndex] = CkStrdup(prMSeq->seq[iRealSeqIndex]);
            prSubClusterMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeq->orig_seq[iRealSeqIndex]);
            SeqinfoCopy(&prSubClusterMSeq->sqinfo[iSeqIndex], &prMSeq->sqinfo[iRealSeqIndex]);
#if TRACE
            Log(&rLog, LOG_DEBUG, "seq no %d in cluster %d is %s (real index = %d)",
                iSeqIndex, iClusterIndex, prSubClusterMSeq->sqinfo[iSeqIndex].name,
                iRealSeqIndex);
#endif
        }
#endif

        
        /* Create a distance matrix for this sub-cluster
         * (prWithinClusterDistances) by using the vector distances or
         * ktuple distances.
         *
         * Then apply UPGMA to get a subcluster tree
         * (prSubClusterTree) and append created tree to the
         * pre-cluster-tree (prMbedTree_p)
         *
         */
#if FULL_WITHIN_CLUSTER_DISTANCES
        
        iOldLogLevel = rLog.iLogLevelEnabled;
        rLog.iLogLevelEnabled = LOG_WARN;
        /* compute distances, but be quiet */
        if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType,
                          0, prSubClusterMSeq->nseqs, 0, prSubClusterMSeq->nseqs,
                          NULL, NULL)) {
            Log(&rLog, LOG_ERROR, "Couldn't compute pair distances");
            return -1;
        }
        rLog.iLogLevelEnabled = iOldLogLevel;

#if COMPUTE_WITHIN_SUBCLUSTER_AVERAGE
        {
            double dSum = 0.0;
            for (iI=0; iI<prSubClusterMSeq->nseqs; iI++) {
                for (iJ=iI+1; iJ<prSubClusterMSeq->nseqs; iJ++) {
                    dSum += SymMatrixGetValue(prWithinClusterDistances, iI, iJ);
                }
            }
            Log(&rLog, LOG_FORCED_DEBUG,
                "mean pair-wise distance within subcluster %d of %d = %f", 
                iClusterIndex, prKMeansResult->iNClusters, dSum/prSubClusterMSeq->nseqs);
        }
#endif

#else
        if (NewSymMatrix(&prWithinClusterDistances, iNSeqInCluster, iNSeqInCluster)!=0) {
            Log(&rLog, LOG_FATAL, "%s", "Memory allocation for disparity matrix failed");
        }
        for (iI=0; iI<iNSeqInCluster; iI++) {
            int iRealIndexI = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];

            for (iJ=iI+1; iJ<iNSeqInCluster; iJ++) {
                int iRealIndexJ = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iJ];
                double dist;

                /* Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d: compute distance between %d:%s and %d:%s",
                   iClusterIndex, i, prMSeq->sqinfo[iRealIndexI].name,
                   iJ, prMSeq->sqinfo[iRealIndexJ].name); */

                if (1 == USE_EUCLIDEAN_DISTANCE) {
                    dist = EuclDist(ppdSeqVec[iRealIndexI],
                                    ppdSeqVec[iRealIndexJ], iNumSeeds);
                } else {
                    dist = CosDist(ppdSeqVec[iRealIndexI],
                                   ppdSeqVec[iRealIndexJ], iNumSeeds);
                }
                SymMatrixSetValue(prWithinClusterDistances, iI, iJ, dist);
            }
        }
#endif

        /* labels needed for the guide tree building routine only */
        ppcLabels = (char**) CKMALLOC(iNSeqInCluster * sizeof(char*));
        for (iI=0; iI<iNSeqInCluster; iI++) {
#if FULL_WITHIN_CLUSTER_DISTANCES
            ppcLabels[iI] = prSubClusterMSeq->sqinfo[iI].name;
#else       
            int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
            ppcLabels[iI] = prMSeq->sqinfo[iRealIndex].name;
#endif
        }        
#if TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "Distance matrix for seqs within sub cluster %d/%d",
                  iClusterIndex, prKMeansResult->iNClusters);
        SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL);
#endif

        
        GuideTreeUpgma(&prSubClusterTree, ppcLabels,
                       prWithinClusterDistances, NULL);

        CKFREE(ppcLabels); /* don't free members, they just point */
#if 0
        Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d guide-tree:", iClusterIndex);
        LogTree(prSubClusterTree);
#endif

        
        /* The guide tree id's (that point to the sequences) now start
         * from 0, i.e. the association with the prMSeq numbering is
         * broken and fixed in the following
         */
        for (iNodeIndex = 0; iNodeIndex < (int)GetNodeCount(prSubClusterTree); iNodeIndex++) {
            if (IsLeaf(iNodeIndex, prSubClusterTree)) {
                int iLeafId = GetLeafId(iNodeIndex, prSubClusterTree);
                int iRealId = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iLeafId];
#if 0
                Log(&rLog, LOG_FORCED_DEBUG, "Correcting leaf node %d which has (wrong) id %d and name %s to id %d (prMSeq name %s)",
                          iNodeIndex, iLeafId,
                          GetLeafName(iNodeIndex, prSubClusterTree),
                          iRealId, prMSeq->sqinfo[iRealId].name);
#endif
                SetLeafId(prSubClusterTree, iNodeIndex, iRealId);
            }
        }


        /* Append the newly created tree (prSubClusterTree) to the
         * corresponding node index of prMbedTree_p.
         */
#if TRACE
        Log(&rLog, LOG_FORCED_DEBUG, "Will join trees at leaf node %d = %s",
                  piClusterToTreeNode[iClusterIndex],
                  GetLeafName(piClusterToTreeNode[iClusterIndex], *prMbedTree_p));
#endif

        AppendTree(*prMbedTree_p,
                   piClusterToTreeNode[iClusterIndex],
                   prSubClusterTree);
        /* Note: piClusterToTreeNode is still valid, because
         * AppendTrees() guarantees that no other than the node to
         * append to changes. */

#if 0
        Log(&rLog, LOG_FORCED_DEBUG, "%s", "prMbedTree_p after cluster %d has appended:", iClusterIndex);
        LogTree(*prMbedTree_p);

        if (0) {
            char fname[] = "mbed-joined-tree.dnd";
            FILE *pfOut;
            if (NULL == (pfOut = fopen(fname, "w"))) {
                Log(&rLog, LOG_FATAL, "Couldn't open %s for writing", fname);
            }
            MuscleTreeToFile(pfOut, *prMbedTree_p);
            Log(&rLog, LOG_FORCED_DEBUG, "Joined tree written to %s", fname);
            fclose(pfOut);
            Log(&rLog, LOG_FATAL, "DEBUG EXIT");
        }
#endif

        /* cleanup
         */
        FreeMuscleTree(prSubClusterTree);
        FreeSymMatrix(&prWithinClusterDistances);
#if FULL_WITHIN_CLUSTER_DISTANCES
        FreeMSeq(&prSubClusterMSeq);
#endif
    } /* end for each cluster */
    ProgressDone(prSubClusterDistanceProgress);
    FreeProgress(&prSubClusterDistanceProgress);

    
    if (NULL != pcGuidetreeOut) {
        if (NULL == (pfOut = fopen(pcGuidetreeOut, "w"))) {
            Log(&rLog, LOG_ERROR, "Couldn't open %s for writing", pcGuidetreeOut);
        } else {
            MuscleTreeToFile(pfOut, *prMbedTree_p);
            Log(&rLog, LOG_INFO, "Guide tree written to %s", pcGuidetreeOut);
            (void) fclose(pfOut);
        }
    }

    
    /* cleanup
     *
     */
#if MBED_TIMING
    StopwatchStop(stopwatch);
    StopwatchDisplay(stdout, "mBed time (without pairwise distance computation): ", stopwatch);
    StopwatchFree(stopwatch);
#endif

    FreeKMeansResult(&prKMeansResult);
    FreeSymMatrix(&prPreClusterDistmat);
    for (iI=0; iI<prMSeq->nseqs; iI++) {
        CKFREE(ppdSeqVec[iI]);
    }
    CKFREE(ppdSeqVec);
    CKFREE(piClusterToTreeNode);

#ifndef NDEBUG
    TreeValidate(*prMbedTree_p);
#endif
    
    return 0;
}
/***   end: Mbed()   ***/