--- /dev/null
+/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+
+/*********************************************************************
+ * Clustal Omega - Multiple sequence alignment
+ *
+ * Copyright (C) 2010 University College Dublin
+ *
+ * Clustal-Omega is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This file is part of Clustal-Omega.
+ *
+ ********************************************************************/
+
+/*
+ * RCS $Id: mbed.c 254 2011-06-21 13:07:50Z andreas $
+ *
+ *
+ * Reimplementation from scratch of mBed:
+ * Blackshields et al. (2010); PMID 20470396
+ *
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <assert.h>
+#include <float.h>
+
+#include "mbed.h"
+#include "pair_dist.h"
+#include "symmatrix.h"
+#include "ktuple_pair.h"
+#include "tree.h"
+#include "util.h"
+#include "progress.h"
+#include "queue.h"
+
+#include "log.h"
+#include "kmpp/KMeans.h"
+#include "mbed.h"
+
+#define TIMING 0
+#if TIMING
+#include "squid/stopwatch.h"
+#endif
+
+
+/* If FULL_WITHIN_CLUSTER_DISTANCES is not 0, distances within each
+ * bisecting kmeans subcluster are not estimated using the vectors,
+ * but calculated normally (using ktuple or kimura). Surprisingly this
+ * results in 3% loss on a Homfam p24-h2010-08-09 subset (100-5000
+ * seqs in test, at least 5 ref seqs; MAX_SEQ 100 vs 10000; NUM_SEEDS
+ * log2 instead of log2^2). And of course it slows things down.
+ */
+#define FULL_WITHIN_CLUSTER_DISTANCES 1
+
+#define COMPUTE_WITHIN_SUBCLUSTER_AVERAGE 0
+
+/* Cluster size limits. Maximum is a soft limit, which might be
+ * exceeded if a K-Means split was unsuccesful, where unsuccesful
+ * might also mean that the minimum required number seqs. was not
+ * reached */
+#if FULL_WITHIN_CLUSTER_DISTANCES
+static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 100;
+static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 1;
+#else
+static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 10000;
+static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 100;
+#endif
+
+/* How many restarts per bisecting kmeans split. 10 give 0.5% increase
+ * in quality on original HOMFAM over just 1. It also increases kmeans
+ * time by a factor of ~3, but overall time is insignificant
+ * compared to pairdist/progressive alignment part.
+ */
+static const int RESTARTS_PER_SPLIT = 10;
+
+
+/* Use standard kmeans (lloyds) or kmeans++. Both are almost
+ * indistinguishable here, although kmeans++ is slightly ahead the
+ * fewer seeds you pick (and it should be better in theory)
+ */
+#define USE_KMEANS_LLOYDS 0
+
+
+#ifndef HAVE_LOG2
+#define log2(x) (log(x) / 0.69314718055994530942)
+#endif
+#define NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2)
+
+
+/* Seed selection method: SAMPLE_SEEDS_BY_LENGTH is the original mBed
+ * approach: Sample iSeeds sequence with constant stride from length-sorted X.
+ * It might be better to pick the seeds randomly, because length sorting might
+ * be more prone to including outliers (e.g. very long and very short seqs).
+ * However, we could never observer such a thing. So just stick to the
+ * original version as this also removes the random element.
+ */
+enum SEED_SELECTION_TYPE {
+ SELECT_SEEDS_RANDOMLY,
+ SELECT_SEEDS_BY_LENGTH
+};
+#define SEED_SELECTION SELECT_SEEDS_BY_LENGTH
+
+
+/* Tests on BAliBase (RV11,12,20,30,40,50; 10 runs each) show there is
+ * no difference between mbed-trees created from cosine or euclidean
+ * distances (simple version, just using disparities).
+ */
+#define USE_EUCLIDEAN_DISTANCE 1
+
+
+/* print some mbed pre-cluster usage to screen */
+#define PRINT_CLUSTER_DISTRIBUTION 0
+
+
+#define TRACE 0
+
+
+typedef struct {
+ /* Number of final clusters
+ */
+ int iNClusters;
+
+ /* Coordinates (columns) for each cluster (rows)
+ * valid indices: [0...iNClusters][0...dim-1]
+ */
+ double **ppdClusterCenters;
+
+ /* Dimensionality of input data and cluster center coordinates
+ */
+ int iDim;
+
+ /* Number of objects per cluster
+ */
+ int *piNObjsPerCluster;
+
+ /* Objects (indices) for each cluster. [i][j] will point to (index
+ * of) object j in cluster i
+ */
+ int **ppiObjIndicesPerCluster;
+} bisecting_kmeans_result_t;
+
+
+
+/**
+ * @brief Free KMeans result structure.
+ *
+ * @param[out] prResult_p
+ * K-Means result to free
+ *
+ * @see NewKMeansResult()
+ */
+void
+FreeKMeansResult(bisecting_kmeans_result_t **prResult_p)
+{
+ int iAux;
+ CKFREE((*prResult_p)->piNObjsPerCluster);
+ for (iAux=0; iAux<(*prResult_p)->iNClusters; iAux++) {
+ CKFREE((*prResult_p)->ppiObjIndicesPerCluster[iAux]);
+ CKFREE((*prResult_p)->ppdClusterCenters[iAux]);
+ }
+ CKFREE((*prResult_p)->ppiObjIndicesPerCluster);
+ CKFREE((*prResult_p)->ppdClusterCenters);
+ (*prResult_p)->iNClusters = 0;
+ (*prResult_p)->iDim = 0;
+ CKFREE(*prResult_p);
+}
+/*** end: FreeKMeansResult() ***/
+
+
+
+/**
+ * @brief Allocate new KMeans result
+ *
+ * @param[out] prKMeansResult_p
+ * K-Means result to free
+ *
+ * @see NewKMeansResult()
+ */
+void
+NewKMeansResult(bisecting_kmeans_result_t **prKMeansResult_p)
+{
+ (*prKMeansResult_p) = (bisecting_kmeans_result_t *)
+ CKCALLOC(1, sizeof(bisecting_kmeans_result_t));
+ (*prKMeansResult_p)->iNClusters = 0;
+ (*prKMeansResult_p)->iDim = 0;
+ (*prKMeansResult_p)->ppdClusterCenters = NULL;
+ (*prKMeansResult_p)->piNObjsPerCluster = NULL;
+ (*prKMeansResult_p)->ppiObjIndicesPerCluster = NULL;
+
+}
+/*** end: NewKMeansResult() ***/
+
+
+
+/**
+ * @brief Calculate the euclidean distance between two vectors
+ *
+ * @param[in] v1
+ * First vector with dim dimensions
+ * @param[in] v2
+ * Second vector with dim dimensions
+ * @param[in] dim
+ * Dimensionality of v1 and v2
+ *
+ * @return euclidean distance as double
+ *
+ * @note Could probably be inlined. Also perfect for SSE
+ */
+double
+EuclDist(const double *v1, const double *v2, const int dim)
+{
+ int i; /* aux */
+ double dist; /* returned distance */
+
+ assert(v1!=NULL);
+ assert(v2!=NULL);
+
+ dist = 0.0;
+ for (i=0; i<dim; i++) {
+ dist += pow(v1[i]-v2[i], 2.0);
+ }
+
+ return sqrt(dist);
+}
+/*** end: EuclDist() ***/
+
+
+
+/**
+ * @brief Calculate the cosine distance between two vectors
+ *
+ * @param[in] v1
+ * First vector with dim dimensions
+ * @param[in] v2
+ * Second vector with dim dimensions
+ * @param[in] dim
+ * Dimensionality of v1 and v2
+ *
+ * @return cosine distance as double
+ *
+ * @note Could probably be inlined. Also perfect for SSE
+ */
+double
+CosDist(const double *v1, const double *v2, const int dim)
+{
+ int i; /* aux */
+ double dist; /* returned distance */
+ double s, sq1, sq2;
+
+ assert(v1!=NULL);
+ assert(v2!=NULL);
+
+ s = 0.0;
+ sq1 = 0.0;
+ sq2 = 0.0;
+ for (i=0; i<dim; i++) {
+ s += (v1[i]*v2[i]);
+ sq1 += pow(v1[i], 2.0);
+ sq2 += pow(v2[i], 2.0);
+ }
+ sq1 = sqrt(sq1);
+ sq2 = sqrt(sq2);
+
+ if ((sq1 * sq2) < DBL_EPSILON) {
+ dist = 1 - s / (sq1 * sq2);
+ } else {
+ /* if the denominator gets small, the fraction gets big, hence dist
+ gets small: */
+ dist = 0.0;
+ }
+ return dist;
+}
+/*** end: CosDist() ***/
+
+
+
+/**
+ * @brief convert sequences into mbed-like (distance) vector
+ * representation. Seeds (prMSeq sequence indices) have to be picked before
+ *
+ * @param[out] ppdSeqVec
+ * Pointer to preallocated matrix of size nseqs x iSeeds
+ * @param[in] prMSeq
+ * Sequences which are to be converted
+ * @param[in] piSeeds
+ * Array of sequences in prMSeq which are to be used as seeds.
+ * @param[in] iNumSeeds
+ * Number of seeds/elements in piSeeds
+ * @param[in] iPairDistType
+ * Type of pairwise distance comparison
+ *
+ */
+int
+SeqToVec(double **ppdSeqVec, mseq_t *prMSeq,
+ int *piSeeds, const int iNumSeeds,
+ const int iPairDistType)
+{
+ symmatrix_t *prDistmat = NULL;
+ int iSeqIndex;
+ int iSeedIndex;
+ /* indices for restoring order */
+ int *restore;
+ /* sorted copy of piSeeds */
+ int *piSortedSeeds;
+
+#if TIMING
+ Stopwatch_t *stopwatch = StopwatchCreate();
+ StopwatchZero(stopwatch);
+ StopwatchStart(stopwatch);
+#endif
+
+ assert(NULL != ppdSeqVec);
+ assert(iNumSeeds>0 && iNumSeeds<=prMSeq->nseqs);
+
+ piSortedSeeds = CKMALLOC(iNumSeeds * sizeof(int));
+ memcpy(piSortedSeeds, piSeeds, iNumSeeds*sizeof(int));
+
+ /* need them sorted, otherwise the swapping approach below might
+ * break
+ */
+ qsort(piSortedSeeds, iNumSeeds, sizeof(int), IntCmp);
+
+
+ /* rearrange mseq order so that we can use ktuple_pairdist code as
+ * is. That is, swap seeds and non-seeds such that the seeds end
+ * up in front of mseq. This way we can use the KTuplePairDist()
+ * code, without making a copy of mseq. Also, keep an array of
+ * indices which serves to restore the original sequence order
+ * after putting the seeds in front
+ *
+ */
+ restore = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
+ for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+ restore[iSeqIndex] = iSeqIndex;
+ }
+ for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
+#if TRACE
+ Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %u", piSortedSeeds[iSeedIndex], iSeedIndex);
+#endif
+ if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
+ int swap;
+ SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
+
+ swap = restore[iSeedIndex];
+ restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
+ restore[piSortedSeeds[iSeedIndex]] = swap;
+ }
+ }
+#if TRACE
+ printf("DEBUG(%s|%s():%d): restore indices =",
+ __FILE__, __FUNCTION__, __LINE__);
+ for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+ printf(" %u:%u", iSeqIndex, restore[iSeqIndex]);
+ }
+ printf("\n");
+
+ for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+ Log(&rLog, LOG_FORCED_DEBUG, "swapped seq no %d now: seq = %s",
+ iSeqIndex, prMSeq->sqinfo[iSeqIndex].name);
+ }
+#endif
+
+
+ /* convert sequences into vectors of distances by calling pairwise
+ * distance function for each seq/seed pair
+ *
+ */
+ if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType,
+ 0, iNumSeeds, 0, prMSeq->nseqs,
+ NULL, NULL)) {
+ Log(&rLog, LOG_ERROR, "Could not compute pairwise distances for mbed.");
+ FreeSymMatrix(&prDistmat);
+ CKFREE(piSortedSeeds);
+ CKFREE(restore);
+ return -1;
+ }
+#if TRACE
+ {
+ char **labels;
+ labels = (char **) CKMALLOC(prMSeq->nseqs * sizeof(char *));
+ for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+ labels[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].name;
+ }
+ SymMatrixPrint(prDistmat, labels, NULL);
+ CKFREE(labels);
+ }
+#endif
+
+
+ /* update ppdSeqVec according to prDistmat. keep in mind that we
+ * changed mseq order
+ *
+ */
+ for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+ for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
+ ppdSeqVec[restore[iSeqIndex]][iSeedIndex] =
+ SymMatrixGetValue(prDistmat, iSeqIndex, iSeedIndex);
+ }
+ }
+
+
+ /* restore mseq order by reverse swapping
+ *
+ * need reverse order now, so start from top index
+ */
+ iSeedIndex=iNumSeeds-1;
+ do {
+#if TRACE
+ Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %d", piSortedSeeds[iSeedIndex], iSeedIndex);
+#endif
+ if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
+ int swap;
+
+ SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
+
+ swap = restore[iSeedIndex];
+ restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
+ restore[piSortedSeeds[iSeedIndex]] = swap;
+ }
+ } while (0 != iSeedIndex--);
+#if TRACE
+ for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+ Log(&rLog, LOG_FORCED_DEBUG, "restored seq no %d: seq = %s %s",
+ iSeqIndex, prMSeq->sqinfo[iSeqIndex].name, prMSeq->seq[iSeqIndex]);
+ }
+#endif
+
+#if TRACE
+ for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+ printf("DEBUG: seq %-4u as distance vector =", iSeqIndex);
+ for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
+ printf(" %f", ppdSeqVec[iSeqIndex][iSeedIndex]);
+ }
+ printf("\n");
+ }
+#endif
+
+
+ FreeSymMatrix(&prDistmat);
+ CKFREE(restore);
+#if TIMING
+ StopwatchStop(stopwatch);
+ StopwatchDisplay(stdout, "Total time for SeqToVec(): ", stopwatch);
+ StopwatchFree(stopwatch);
+#endif
+
+
+ return 0;
+}
+/*** end: SeqToVec() ***/
+
+
+
+/**
+ * @brief Select seeds to be used from an prMSeq
+ *
+ * @param[out] piSeeds
+ * Will store the indices of prMSeqs seqs used to be as seeds here. Must be preallocated.
+ * @param[in] iNumSeeds
+ * Number of seeds to be picked
+ * @param[in] iSelectionMethod
+ * Seed selection method to be used
+ * @param[in] prMSeq
+ * The prMSeq structure to pick sequences from
+ *
+ * @return: Non-zero on failure
+ *
+ */
+int
+SeedSelection(int *piSeeds, int iNumSeeds, int iSelectionMethod, mseq_t *prMSeq)
+{
+ /* seed iterator */
+ int iSeedIndex;
+ /* sequence iterator */
+ int iSeqIndex;
+
+ if (SELECT_SEEDS_RANDOMLY == iSelectionMethod) {
+ int *piPermArray;
+
+ Log(&rLog, LOG_INFO,
+ "Using %d seeds (randomly chosen) for mBed (from a total of %d sequences)",
+ iNumSeeds, prMSeq->nseqs);
+
+ PermutationArray(&piPermArray, iNumSeeds);
+ /* copy to piSeeds */
+ for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
+ piSeeds[iSeedIndex] = piPermArray[iSeedIndex];
+ }
+ CKFREE(piPermArray);
+
+ } else if (SELECT_SEEDS_BY_LENGTH == iSelectionMethod) {
+
+ /* step size for picking with constant stride */
+ int iStep;
+ int *piSeqLen = CKMALLOC(prMSeq->nseqs * sizeof(int));
+ int *piOrder = CKMALLOC(prMSeq->nseqs * sizeof(int));
+
+ Log(&rLog, LOG_INFO,
+ "Using %d seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of %d sequences)",
+ iNumSeeds, prMSeq->nseqs);
+
+ iStep = prMSeq->nseqs / iNumSeeds; /* iStep will never get too big
+ * due to rounding */
+ /* first get an array of seq indices order according to
+ corresponding sequence length: piOrder */
+ for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+ piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len;
+ }
+ QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, 'd', FALSE);
+#if 0
+ for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+ Log(&rLog, LOG_FORCED_DEBUG, "Descending order (no %d): seq %d has len %d)",
+ iSeqIndex, piOrder[iSeqIndex], piSeqLen[piOrder[iSeqIndex]]);
+ }
+#endif
+ CKFREE(piSeqLen);
+ iSeedIndex = 0;
+ for (iSeqIndex=0; iSeedIndex<iNumSeeds; iSeqIndex+=iStep) {
+ piSeeds[iSeedIndex++] = piOrder[iSeqIndex];
+ }
+ CKFREE(piOrder);
+
+ } else {
+
+ Log(&rLog, LOG_ERROR, "Internal error: unknown seed selection type");
+ return -1;
+ }
+
+
+#ifdef PICK_SEEDS_FROM_KMEANS_CLUSTERS
+ /* do initial mbedding and kmeans. then pick seeds from each cluster and
+ do full mbed with these seeds. idea is that those seeds represent the
+ sequence space better than random seeds. however, this reduces the
+ quality slightly. random is almost always better.
+ */
+
+ if (1) {
+ /* seqs represented as distance vectors */
+ double **ppdSeqVec;
+ double dCost = -1.0;
+ /* assignments for each seq to the K newly created clusters */
+ int *piKMeansClusterAssignments = CKMALLOC(prMSeq->nseqs * sizeof(int));
+ double *pdKMeansClusterCenters = CKCALLOC(iNumSeeds * iNumSeeds, sizeof(double));
+ /* create a copy of ppdSeqVec suitable for KMeans */
+ double *pdKMeansVectors = CKMALLOC(prMSeq->nseqs * iNumSeeds * sizeof(double));
+ bool *pbClusterUsed = CKCALLOC(iNumSeeds, sizeof(bool));
+
+ Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Experimental feature: K-means on first round embedding to get better seeds");
+ Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Reuse seeds from clusters");
+ Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Could try log(n) instead of log(n)^2 in first round");
+ Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME hardcoded iPairDistType PAIRDIST_KTUPLE");
+ Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Show that this is fast *and* good");
+
+ /*
+ Log(&rLog, LOG_FORCED_DEBUG, "%s", "Overriding iNumSeeds");
+ iNumSeeds = (int) pow(log2((double)prMSeq->nseqs), 2);
+ */
+
+ ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
+ for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+ ppdSeqVec[iSeqIndex] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
+ }
+ if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, PAIRDIST_KTUPLE)) {
+ Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
+ return -1;
+ }
+
+ for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+ int iDim;
+ for (iDim=0; iDim<iNumSeeds; ++iDim) {
+ pdKMeansVectors[iDim*iSeqIndex + iDim] = ppdSeqVec[iSeqIndex][iDim];
+ }
+ }
+
+
+ Log(&rLog, LOG_FORCED_DEBUG, "%s\n", "FIXME hardcoded RESTARTS_PER_SPLIT");
+ dCost = KMeans(prMSeq->nseqs, iNumSeeds, iNumSeeds,
+ pdKMeansVectors,
+ RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
+ pdKMeansClusterCenters, piKMeansClusterAssignments);
+ Log(&rLog, LOG_FORCED_DEBUG, "Best split cost = %f", dCost);
+
+ Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Check for Nan in cluster centers");
+
+#if TRACE
+ Log(&rLog, LOG_FORCED_DEBUG, "%s", "K-Means output:");
+ for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+ Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u (%s) to cluster %d",
+ iSeqIndex, prMSeq->sqinfo[iSeqIndex].name,
+ piKMeansClusterAssignments[iSeqIndex]);
+ }
+#endif
+
+ Log(&rLog, LOG_FORCED_DEBUG, "FIXME %s", "proof of concept implementation: Pick first sequences from each clusters instead of reusing seeds");
+ iSeedIndex = 0;
+ for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
+ int iAssignedCluster = piKMeansClusterAssignments[iSeqIndex];
+ if (pbClusterUsed[iAssignedCluster]) {
+ continue;
+ } else {
+ /*LOG_DEBUG("Picked seed %d from cluster %d", iSeqIndex,iAssignedCluster);*/
+ piSeeds[iSeedIndex++] = iSeqIndex;
+ pbClusterUsed[iAssignedCluster] = TRUE;
+ }
+ }
+ CKFREE(pbClusterUsed);
+
+ CKFREE(pdKMeansVectors);
+ CKFREE(pdKMeansClusterCenters);
+ CKFREE(piKMeansClusterAssignments);
+
+ }
+ /* if (1) */
+#endif
+
+
+ if (LOG_DEBUG <= rLog.iLogLevelEnabled) {
+ for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
+ Log(&rLog, LOG_DEBUG, "Picked sequence %d (%s) as seed no %d",
+ piSeeds[iSeedIndex], prMSeq->sqinfo[piSeeds[iSeedIndex]].name, iSeedIndex);
+ }
+ }
+
+ return 0;
+}
+/* end of SeedSelection() */
+
+
+
+
+/**
+ * @brief Bisecting K-Means clustering. Repeatedly calls K-Means with
+ * a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster.
+ *
+ * @param[out] prKMeansResult_p
+ * Result of Bisecting KMeans. Will be allocated here.
+ * Caller has to free. See @see FreeKMeansResult()
+ * @param[in] iNObjs
+ * Number of objects/sequences to cluster
+ * @param[in] iDim
+ * Dimensionality of input data
+ * @param[in] ppdVectors
+ * each row holds iDim points for this object's coordinates
+ * @param[in] iMinRequiredObjsPerCluster
+ * Minimum number of objects per Cluster (inclusive)/
+ * @param[in] iMaxAllowedObjsPerCluster
+ * Maximum number of objects per Cluster (inclusive). Function returns once no
+ * cluster contains more then this number of objects. Soft limit!
+ *
+ * @note Convoluted code. Could use some restructuring. My apologies.
+ * AW
+ *
+ */
+void
+BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
+ const int iNObjs, const int iDim, double **ppdVectors,
+ const int iMinRequiredObjsPerCluster,
+ const int iMaxAllowedObjsPerCluster)
+{
+ int iN, iD;
+ /* cluster centers for each cluster created at each split */
+ double *pdKClusterCenters;
+ /* keep track of updated object indices per newly created
+ * cluster */
+ int *piCurObjToUpdate;
+ /* number of times we called 2-means */
+ int iNRounds;
+ /* flag for unsuccessful cluster split */
+ bool bNaNDetected = FALSE;
+ /* flag for detected small cluster after split */
+ bool bSmallClusterDetected;
+ /* queue of clusters which are to be split */
+ int_queue_t rClusterSplitQueue;
+
+#if TIMING
+ Stopwatch_t *stopwatch = StopwatchCreate();
+#endif
+
+
+ piCurObjToUpdate = (int *) CKMALLOC(2 * sizeof(int));
+
+ NewKMeansResult(prKMeansResult_p);
+
+ /* new cluster centers created at each split/KMeans run
+ */
+ pdKClusterCenters = (double *) CKCALLOC(2 * iDim, sizeof(double));
+
+ /* init results by setting a first cluster that contains all objects
+ */
+ (*prKMeansResult_p)->iNClusters = 1;
+ (*prKMeansResult_p)->iDim = iDim;
+ /* fake center coordinates of first cluster */
+ (*prKMeansResult_p)->ppdClusterCenters =
+ (double **) CKMALLOC(1 * sizeof(double *));
+ (*prKMeansResult_p)->ppdClusterCenters[0] =
+ (double *) CKMALLOC(iDim * sizeof(double));
+ /* objects per cluster */
+ (*prKMeansResult_p)->piNObjsPerCluster =
+ (int *) CKMALLOC(1 * sizeof(int));
+ (*prKMeansResult_p)->piNObjsPerCluster[0] = iNObjs;
+ /* object indices per cluster */
+ (*prKMeansResult_p)->ppiObjIndicesPerCluster =
+ (int **) CKMALLOC(1 * sizeof(int *));
+ (*prKMeansResult_p)->ppiObjIndicesPerCluster[0] = (int *)
+ CKMALLOC(iNObjs * sizeof(int));
+ for (iN=0; iN<iNObjs; iN++) {
+ (*prKMeansResult_p)->ppiObjIndicesPerCluster[0][iN] = iN;
+ }
+
+
+
+ /* Starting with the first cluster that now contains all the
+ * sequences keep splitting until no cluster contains more than
+ * iMaxAllowedObjsPerCluster
+ *
+ * Keep a queue of clusters (rClusterSplitQueue) to split
+ *
+ * At each split values/memory of the just split cluster will be
+ * reused and exactly one new only allocated.
+ *
+ * Example:
+ * Given the following cluster assignments
+ * 0 0 0 0 1 1 2 2 2 3 3 3 3 3 3 3 4 4
+ * and a K-Means split in cluster 3 at |:
+ * 0 0 0 0 1 1 2 2 2 3 3 3 | 3 3 3 3 4 4
+ * The result should be this:
+ * 0 0 0 0 1 1 2 2 2 3 3 3 | 5 5 5 5 4 4
+ *
+ *
+ */
+ INT_QUEUE_INIT(&rClusterSplitQueue);
+ if (iNObjs>iMaxAllowedObjsPerCluster) {
+ /* pish fake first cluster index */
+ INT_QUEUE_PUSH(&rClusterSplitQueue, 0);
+ }
+ iNRounds = 0;
+ while (! INT_QUEUE_EMPTY(&rClusterSplitQueue)) {
+ /* assignments for each seq to the K newly created clusters */
+ int *piKClusterAssignments;
+ /* number of objects in cluster that is to be split */
+ int iNObjsInClusterToSplit;
+ /* coordinates of points in cluster that is to be split
+ * array of size n*d where [d*i + j] gives coordinate j of point i
+ */
+ double *pdVectorsInClusterToSplit;
+ /* copy of object indices in split cluster */
+ int *piObjIndicesOfSplitCluster;
+ /* best cost of kmeans rounds */
+ double dCost = -1.0;
+
+ /* indices for the two created clusters
+ */
+ /* index of cluster to split */
+ int iClusterToSplot;
+ /* index of newly created cluster at each round. data for the
+ other created cluster goes to the one just split,
+ i.e. (*piClusterToSplot) */
+ int iNewClusterIdx;
+
+#if TIMING
+ StopwatchZero(stopwatch);
+ StopwatchStart(stopwatch);
+#endif
+
+ INT_QUEUE_POP(&rClusterSplitQueue, &iClusterToSplot);
+
+ iNObjsInClusterToSplit = (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot];
+ piKClusterAssignments = (int *)
+ CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
+
+ pdVectorsInClusterToSplit = (double *)
+ CKMALLOC(iNObjsInClusterToSplit * iDim * sizeof(double));
+ for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
+ for (iD=0; iD<iDim; ++iD) {
+ int iThisObjIdx =
+ (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN];
+ pdVectorsInClusterToSplit[iDim*iN + iD] = ppdVectors[iThisObjIdx][iD];
+ }
+ }
+
+#if TRACE
+ Log(&rLog, LOG_FOCRED_DEBUG, "Round %d: Will split cluster %d which has %d objects",
+ iNRounds, iClusterToSplot, iNObjsInClusterToSplit);
+ fprintf(stderr, "DEBUG(%s|%s():%d): Object indices in cluster to split are:",
+ __FILE__, __FUNCTION__, __LINE__);
+ for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
+ fprintf(stderr, " %u",
+ (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
+ }
+ fprintf(stderr, "\n");
+ (void) fflush(stderr);
+#endif
+
+#if TRACE
+ for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
+ fprintf(stderr,
+ "DEBUG(%s|%s():%d): Coordinate of object %u (real index %u) in cluster to split:",
+ __FILE__, __FUNCTION__, __LINE__, iN,
+ (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
+ for (iD=0; iD<iDim; iD++) {
+ fprintf(stderr, " %f", pdVectorsInClusterToSplit[iDim*iN + iD]);
+ }
+ fprintf(stderr, "\n");
+ (void) fflush(stderr);
+ }
+#endif
+
+ /* KMeans(1 "The number of points in the data set",
+ * 2 "The number of clusters to look for",
+ * 3 "The number of dimensions that the data set lives in",
+ * 4 "points: An array of size n*d where points[d*i + j] gives coordinate j of point i",
+ * 5 "attempts: The number of times to independently run k-means",
+ * 6 "use_lloyds_method: uses kmpp if false, otherwise lloyds method",
+ * 7 "centers: This can either be null or an array of size k*d.
+ * In the latter case, centers[d*i + j] will give coordinate j of center i.
+ * If the cluster is unused, it will contain NaN instead.",
+ * 8 "assignments: This can either be null or an array of size n.
+ * In the latter case, it will be filled with the cluster that each point is assigned to
+ * (an integer between 0 and k-1 inclusive).");
+ */
+ dCost = KMeans(iNObjsInClusterToSplit, 2, iDim,
+ pdVectorsInClusterToSplit,
+ RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
+ pdKClusterCenters, piKClusterAssignments);
+
+#if TRACE
+ Log(&rLog, LOG_FORCED_DEBUG, "%s", "Raw K-Means output:");
+ for (iN=0; iN<2; iN++) {
+ fprintf(stderr, "DEBUG(%s|%s():%d): Cluster Center %u =",
+ __FILE__, __FUNCTION__, __LINE__, iN);
+ for (iD=0; iD<iDim; iD++) {
+ fprintf(stderr, " %f", pdKClusterCenters[iN*iDim+iD]);
+ }
+ fprintf(stderr, "\n");
+ (void) fflush(stderr);
+ }
+ for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
+ Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u to cluster %d (of #%u)",
+ iN, piKClusterAssignments[iN], 2);
+ }
+#endif
+
+ /* real index of one of the newly created clusters. the other
+ * one is iClusterToSplot */
+ iNewClusterIdx = (*prKMeansResult_p)->iNClusters;
+
+
+ /* We don't want Clusters which are too small. Check here if a
+ * split created a small cluster and if yes, discard the
+ * solution. Because the cluster has already been removed from
+ * the queue, we can just continue.
+ */
+ bSmallClusterDetected = FALSE;
+ if (iMinRequiredObjsPerCluster>1) {
+ int iNObjsCluster[2];
+ iNObjsCluster[0] = 0; /* first cluster */
+ iNObjsCluster[1] = 0; /* second cluster */
+ for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
+ iNObjsCluster[piKClusterAssignments[iN]]+=1;
+ }
+
+ if (iNObjsCluster[0]<iMinRequiredObjsPerCluster
+ ||
+ iNObjsCluster[1]<iMinRequiredObjsPerCluster) {
+ bSmallClusterDetected = TRUE;
+ Log(&rLog, LOG_FORCED_DEBUG, "Skipping this split because objs in 1st/2nd cluster = %d/%d < %d",
+ iNObjsCluster[0], iNObjsCluster[1], iMinRequiredObjsPerCluster);
+ }
+ }
+
+ /* Same logic as for small clusters applies if KMeans couldn't
+ * split the cluster. In this case one of its center
+ * coordinates will be NaN, which we check for in the
+ * following.
+ */
+ if (! bSmallClusterDetected) {
+ for (iN=0; iN<2; iN++) {
+ bNaNDetected = FALSE;
+ for (iD=0; iD<iDim; iD++) {
+ if (0 != isnan(pdKClusterCenters[iN*iDim+iN])) {
+ /* Got NaN as coordinate after splitting */
+ Log(&rLog, LOG_WARN, "%s(): Can't split cluster no. %d which has %d objects any further. %s",
+ __FUNCTION__,
+ iClusterToSplot, iNObjsInClusterToSplit,
+ "Hope it's not too big and doesn't slow things down."
+ );
+ bNaNDetected = TRUE;
+ break;
+ }
+ }
+ if (bNaNDetected) {
+ break;
+ }
+ }
+ }
+
+ /* Discarding split of this cluster. It has been removed from the
+ * queue already. so just continue
+ */
+ if (bNaNDetected || bSmallClusterDetected) {
+ CKFREE(piKClusterAssignments);
+ CKFREE(pdVectorsInClusterToSplit);
+ continue;
+ }
+
+
+ /* update cluster centers: pdClusterCenters
+ *
+ */
+ /* reuse memory of existing/old/split cluster
+ */
+ for (iN=0; iN<iDim; iN++) {
+ double dCoord = pdKClusterCenters[0*iDim+iN];
+ (*prKMeansResult_p)->ppdClusterCenters[iClusterToSplot][iN] = dCoord;
+ }
+ /* realloc and set new one
+ */
+ (*prKMeansResult_p)->ppdClusterCenters = (double **)
+ CKREALLOC((*prKMeansResult_p)->ppdClusterCenters,
+ ((*prKMeansResult_p)->iNClusters+1) * sizeof(double *));
+ (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx] = (double *)
+ CKMALLOC(iDim * sizeof(double));
+ for (iD=0; iD<iDim; iD++) {
+ double dCoord = pdKClusterCenters[1*iDim+iD];
+ (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx][iD] = dCoord;
+ }
+#if TRACE
+ Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of cluster centers done. Cluster centers so far:");
+ for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
+ fprintf(stderr, "DEBUG(%s|%s():%d): Center %u =",
+ __FILE__, __FUNCTION__, __LINE__, iN);
+ for (iD=0; iD<iDim; iD++) {
+ fprintf(stderr, " %f", (*prKMeansResult_p)->ppdClusterCenters[iN][iD]);
+ }
+ fprintf(stderr, "\n");
+ (void) fflush(stderr);
+ }
+#endif
+
+
+ /* update #seqs per cluster: piNObjsPerCluster
+ *
+ */
+ (*prKMeansResult_p)->piNObjsPerCluster = (int *)
+ CKREALLOC((*prKMeansResult_p)->piNObjsPerCluster,
+ ((*prKMeansResult_p)->iNClusters+1) * sizeof(int));
+ /* init new and old one to zero */
+ (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] = 0;
+ (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] = 0;
+ /* now update values */
+ for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
+ if (0 == piKClusterAssignments[iN]) {
+ (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] += 1;
+ } else if (1 == piKClusterAssignments[iN]) {
+ (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] += 1;
+ } else {
+ /* there used to be code for iK>=2 in r101 */
+ Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
+ piKClusterAssignments[iN]);
+ }
+ }
+#if TRACE
+ Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of NObjs per cluster done:");
+ for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
+ Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d contains %d sequences",
+ iN, (*prKMeansResult_p)->piNObjsPerCluster[iN]);
+ }
+#endif
+ /* queue clusters if still they are still too big
+ */
+ if ((*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] > iMaxAllowedObjsPerCluster) {
+ INT_QUEUE_PUSH(&rClusterSplitQueue, iClusterToSplot);
+ }
+ if ((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] > iMaxAllowedObjsPerCluster) {
+ INT_QUEUE_PUSH(&rClusterSplitQueue, iNewClusterIdx);
+ }
+
+
+
+ /* update which objects belong to which cluster:
+ *
+ * note: piNObjsPerCluster needs to be already updated
+ *
+ */
+ /* create a copy of the object indices in the split cluster
+ * (original will be overwritten)
+ */
+ piObjIndicesOfSplitCluster = (int *) CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
+ memcpy(piObjIndicesOfSplitCluster,
+ (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
+ iNObjsInClusterToSplit * sizeof(int));
+
+ (*prKMeansResult_p)->ppiObjIndicesPerCluster = (int **)
+ CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster,
+ ((*prKMeansResult_p)->iNClusters+1) * sizeof(int *));
+
+
+ (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot] =
+ (int *) CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
+ (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] * sizeof(int));
+
+ (*prKMeansResult_p)->ppiObjIndicesPerCluster[iNewClusterIdx] =
+ (int *) CKMALLOC((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] * sizeof(int));
+
+
+ /* now reassign the object indices to the assigned cluster
+ */
+ piCurObjToUpdate[0] = 0;
+ piCurObjToUpdate[1] = 0;
+ for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
+ int iThisObjIdx = piObjIndicesOfSplitCluster[iN];
+ int iThisClusterAssignment = piKClusterAssignments[iN];
+ int iThisOffset = piCurObjToUpdate[iThisClusterAssignment];
+ int iThisClusterIndex = 0;
+
+ if (0 == iThisClusterAssignment) {
+ iThisClusterIndex = iClusterToSplot;
+ } else if (1 == iThisClusterAssignment) {
+ iThisClusterIndex = iNewClusterIdx;
+ } else {
+ /* there used to be code for iK>=2 in r101 */
+ Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
+ piKClusterAssignments[iN]);
+ }
+#if 0
+ Log(&rLog, LOG_FORCED_DEBUG, "Setting (*prKMeansResult_p)->ppiObjIndicesPerCluster[%d][%d] = %d",
+ iThisClusterIndex, iThisOffset, iThisObjIdx);
+#endif
+ (*prKMeansResult_p)->ppiObjIndicesPerCluster[iThisClusterIndex][iThisOffset] = iThisObjIdx;
+ piCurObjToUpdate[iThisClusterAssignment]+=1;
+ }
+ CKFREE(piObjIndicesOfSplitCluster);
+#if TRACE
+ for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
+ int iObj;
+ fprintf(stderr, "DEBUG(%s|%s():%d): Objects in cluster %u: ",
+ __FILE__, __FUNCTION__, __LINE__, iN);
+ for (iObj=0; iObj<(*prKMeansResult_p)->piNObjsPerCluster[iN]; iObj++) {
+ fprintf(stderr, " %u", (*prKMeansResult_p)->ppiObjIndicesPerCluster[iN][iObj]);
+ }
+ fprintf(stderr, "\n");
+ (void) fflush(stderr);
+ }
+#endif
+
+
+#if TIMING
+ StopwatchStop(stopwatch);
+ StopwatchDisplay(stdout, "Total time after next round in Bisecting-KMeans: ", stopwatch);
+#endif
+
+
+ /* finally: increase number of clusters
+ */
+ (*prKMeansResult_p)->iNClusters += 1;
+ iNRounds += 1;
+ CKFREE(piKClusterAssignments);
+ CKFREE(pdVectorsInClusterToSplit);
+
+ } /* while */
+ INT_QUEUE_DESTROY(&rClusterSplitQueue);
+
+
+ Log(&rLog, LOG_DEBUG,
+ "Bisecting K-means finished after %d rounds (no more clusters to split)",
+ iNRounds);
+
+#if TIMING
+ StopwatchFree(stopwatch);
+#endif
+
+ /* @note could use progress/timer */
+
+ CKFREE(pdKClusterCenters);
+ CKFREE(piCurObjToUpdate);
+
+ return;
+}
+/*** end: BisectingKmeans() ***/
+
+
+
+/**
+ *
+ * @brief From scratch reimplementation of mBed: Blackshields et al.
+ * (2010); PMID 20470396.
+ *
+ * Idea is a follows:
+ * - convert sequences into vectors of distances
+ * - cluster the vectors using k-means
+ * - cluster each of the k clusters using upgma (used cached distances
+ * from above?)
+ * - join the sub-clusters to create on tree (use UPGMA on k-means
+ * medoids)
+ *
+ *
+ * @param[out] prMbedTree_p
+ * Created upgma tree. will be allocated here. use FreeMuscleTree()
+ * to free
+ * @param[in] prMSeq
+ * Multiple sequences
+ * @param[in] iPairDistType
+ * Distance measure for pairwise alignments
+ * @param[in] pcGuidetreeOut
+ * Passed down to GuideTreeUpgma()
+ *
+ * @return Zero on success, non-zero on error
+ *
+ */
+int
+Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
+ const char *pcGuidetreeOut)
+{
+ /* number of seeds */
+ int iNumSeeds;
+ /* seed indices matching prMSeq */
+ int *piSeeds;
+ /* seqs represented as distance vectors */
+ double **ppdSeqVec;
+ /* kmeans result */
+ bisecting_kmeans_result_t *prKMeansResult = NULL;
+ /* distance matrix of kmeans (pre-)cluster centers */
+ symmatrix_t *prPreClusterDistmat = NULL;
+ /* auxiliary for symmetric matrix output; tree routines etc */
+ char **ppcLabels;
+ int iNodeIndex;
+ /* mapping of cluster-center tree node indices to corresponding
+ cluster */
+ int *piClusterToTreeNode;
+ int iClusterIndex;
+ int iI, iJ;
+ FILE *pfOut;
+ progress_t *prSubClusterDistanceProgress;
+ bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
+
+#if FULL_WITHIN_CLUSTER_DISTANCES
+ Log(&rLog, LOG_DEBUG, "Computing real distances within subclusters for mBed.");
+#else
+ Log(&rLog, LOG_DEBUG, "Computing vector distances within subclusters for mBed.");
+#endif
+
+#if MBED_TIMING
+ Stopwatch_t *stopwatch = StopwatchCreate();
+#endif
+
+ assert(NULL != prMbedTree_p);
+ assert(NULL != prMSeq);
+
+
+ iNumSeeds = (int) NUMBER_OF_SEEDS(prMSeq->nseqs);
+ if (iNumSeeds >= prMSeq->nseqs) {
+ /* -1 is condition for RandomUniqueIntArray */
+ iNumSeeds = prMSeq->nseqs-1;
+ Log(&rLog, LOG_DEBUG,
+ "Automatically determined number of seeds is bigger (or equal) the number of sequences. Will set it to %d",
+ iNumSeeds);
+ }
+
+
+ /* Turn sequences into vectors of distances to the seeds
+ *
+ */
+ piSeeds = (int *) CKMALLOC(iNumSeeds * sizeof(int));
+ if (0 != SeedSelection(piSeeds, iNumSeeds, SEED_SELECTION, prMSeq)) {
+ Log(&rLog, LOG_ERROR, "Something went wrong during seed selection for mbed");
+ return -1;
+ }
+ ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
+ for (iI=0; iI<prMSeq->nseqs; iI++) {
+ ppdSeqVec[iI] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
+ }
+ if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, iPairDistType)) {
+ Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
+ return -1;
+ }
+ CKFREE(piSeeds);
+
+
+ /* Calculate (pre-)clusters of sequence vectors by applying
+ * bisecting kmeans
+ *
+ */
+#if MBED_TIMING
+ /* start clock only here, to make sure we don't include pairwise
+ * distance computation */
+ StopwatchZero(stopwatch);
+ StopwatchStart(stopwatch);
+#endif
+
+ BisectingKmeans(&prKMeansResult, prMSeq->nseqs, iNumSeeds, ppdSeqVec,
+ MIN_REQUIRED_SEQ_PER_PRECLUSTER,
+ MAX_ALLOWED_SEQ_PER_PRECLUSTER);
+ Log(&rLog, LOG_INFO,
+ "mBed created %u cluster/s (with a minimum of %d and a soft maximum of %d sequences each)",
+ prKMeansResult->iNClusters,
+ MIN_REQUIRED_SEQ_PER_PRECLUSTER,
+ MAX_ALLOWED_SEQ_PER_PRECLUSTER);
+
+
+#if PRINT_CLUSTER_DISTRIBUTION
+ Log(&rLog, LOG_FORCED_DEBUG, "Bisecting Kmeans returned %d clusters", prKMeansResult->iNClusters);
+ for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
+#if TRACE
+ int iD;
+ Log(&rLog, LOG_FORCED_DEBUG, "Diagnostic output for cluster %d follows:", iI);
+ fprintf(stderr, "DEBUG(%s|%s():%d): center coordinates =",
+ __FILE__, __FUNCTION__, __LINE__);
+ for (iD=0; iD<iNumSeeds; iD++) {
+ fprintf(stderr, " %f", prKMeansResult->ppdClusterCenters[iI][iD]);
+ }
+ fprintf(stderr, "\n");
+ fflush(stderr);
+#endif
+ Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d has %d objects assigned",
+ iI, prKMeansResult->piNObjsPerCluster[iI]);
+#if TRACE
+ for (iJ=0; iJ<prKMeansResult->piNObjsPerCluster[iI]; iJ++) {
+ int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][iJ];
+
+ Log(&rLog, LOG_FORCED_DEBUG, "Cluster %u: object %u has index %u (= seq %s)",
+ iI, iJ, iRealIndex, prMSeq->sqinfo[iRealIndex].name);
+ }
+#endif
+ }
+#endif
+
+
+ /* Cluster pre-clusters produced by k-means.
+ *
+ * Do this by calculating the vector distances of the cluster
+ * centers and applying UPGMA.
+ *
+ * @note could try to force-balance the tree here
+ *
+ */
+ if (0 != NewSymMatrix(&prPreClusterDistmat,
+ prKMeansResult->iNClusters, prKMeansResult->iNClusters)) {
+ Log(&rLog, LOG_FATAL, "%s", "Memory allocation for pre-cluster distance-matrix failed");
+ }
+ for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
+ for (iJ=iI+1; iJ<prKMeansResult->iNClusters; iJ++) {
+ double dDist;
+ dDist = EuclDist(prKMeansResult->ppdClusterCenters[iI],
+ prKMeansResult->ppdClusterCenters[iJ],
+ iNumSeeds);
+ SymMatrixSetValue(prPreClusterDistmat, iI, iJ, dDist);
+ /* Log(&rLog, LOG_FORCED_DEBUG, "Euclidean distance between clusters %d and %d = %f",
+ iI, iJ, dDist); */
+ }
+ }
+
+
+ /* labels needed for the guide tree building routine only */
+ ppcLabels = (char **) CKMALLOC(prKMeansResult->iNClusters * sizeof(char*));
+ for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
+ ppcLabels[iI] = (char *) CKMALLOC(32 * sizeof(char));
+ (void) snprintf(ppcLabels[iI], 32, "Subcluster-%u", iI);
+ }
+#if TRACE
+ Log(&rLog, LOG_FORCED_DEBUG, "%s", "Distance matrix for pre-cluster centers:");
+ SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL);
+#endif
+
+ GuideTreeUpgma(prMbedTree_p,
+ ppcLabels, prPreClusterDistmat, NULL);
+
+ for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
+ CKFREE(ppcLabels[iI]);
+ }
+ CKFREE(ppcLabels);
+#if TRACE
+ Log(&rLog, LOG_FORCED_DEBUG, "%s", "Cluster-center guide-tree:");
+ LogTree(*prMbedTree_p);
+#endif
+
+
+
+ /* Now replace each leaf in the pre-cluster-center tree
+ * appropriately, i.e. with the corresponding sub-cluster.
+ *
+ * For each leaf/sub-cluster, create a distance matrix for the
+ * corresponding sequences. Use distances between vectors as
+ * approximated distances between sequences. Then create a
+ * guide-tree by applying UPGMA.
+ *
+ */
+
+ /* Get a mapping of (pre)cluster number and leaf-node indices in
+ * the cluster-center tree. We can add trees to prMbedTree_p
+ * because AppendTrees() guarantees that no other than the node to
+ * append to changes.
+ */
+ piClusterToTreeNode = (int*)
+ CKMALLOC(prKMeansResult->iNClusters * sizeof(int));
+ iNodeIndex = FirstDepthFirstNode(*prMbedTree_p);
+ do {
+ if (IsLeaf(iNodeIndex, *prMbedTree_p)) {
+ int iLeafId = GetLeafId(iNodeIndex, *prMbedTree_p);
+ piClusterToTreeNode[iLeafId] = iNodeIndex;
+ }
+ iNodeIndex = NextDepthFirstNode(iNodeIndex, *prMbedTree_p);
+ } while (NULL_NEIGHBOR != iNodeIndex);
+
+
+
+
+ /* Now step through all the leafs and replace them with the
+ * corresponding sub-trees
+ */
+ NewProgress(&prSubClusterDistanceProgress, LogGetFP(&rLog, LOG_INFO),
+ "Distance calculation within sub-clusters", bPrintCR);
+ /* for each cluster */
+ for (iClusterIndex=0;
+ iClusterIndex < prKMeansResult->iNClusters; iClusterIndex++) {
+ /* distance matrix for the sub-cluster */
+ symmatrix_t *prWithinClusterDistances = NULL;
+ int iNSeqInCluster;
+ tree_t *prSubClusterTree = NULL;
+
+ ProgressLog(prSubClusterDistanceProgress,
+ iClusterIndex, prKMeansResult->iNClusters, FALSE);
+
+#if FULL_WITHIN_CLUSTER_DISTANCES
+ mseq_t *prSubClusterMSeq;
+ int iPairDistType;
+ int iSeqIndex;
+ int iOldLogLevel;
+
+ Log(&rLog, LOG_DEBUG,
+ "%s\n", "Calling new Mbed use makes only sense if nseq>MAX_ALLOWED_SEQ_PER_PRECLUSTER");
+
+ if (TRUE == prMSeq->aligned) {
+ iPairDistType = PAIRDIST_SQUIDID_KIMURA;
+ } else {
+ iPairDistType = PAIRDIST_KTUPLE;
+ }
+#endif
+
+ iNSeqInCluster = prKMeansResult->piNObjsPerCluster[iClusterIndex];
+#if TRACE
+ Log(&rLog, LOG_FORCED_DEBUG, "#seqs in subcluster no %d = %d",
+ iClusterIndex, iNSeqInCluster);
+#endif
+
+#if FULL_WITHIN_CLUSTER_DISTANCES
+ /* create an mseq structure for sequences in this cluster
+ * don't need most of the members (e.g. orig_seq) but copy
+ * them anyway for the sake of completeness
+ */
+ NewMSeq(&prSubClusterMSeq);
+
+ prSubClusterMSeq->nseqs = iNSeqInCluster;
+ prSubClusterMSeq->seqtype = prMSeq->seqtype;
+ if (NULL!=prMSeq->filename) {
+ prSubClusterMSeq->filename = CkStrdup(prMSeq->filename);
+ }
+ prSubClusterMSeq->aligned = prMSeq->aligned; /* FS, r252 */
+ prSubClusterMSeq->seq = (char **)
+ CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
+ prSubClusterMSeq->orig_seq = (char **)
+ CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
+ prSubClusterMSeq->sqinfo = (SQINFO *)
+ CKMALLOC(prSubClusterMSeq->nseqs * sizeof(SQINFO));
+
+ for (iSeqIndex=0; iSeqIndex<iNSeqInCluster; iSeqIndex++) {
+ int iRealSeqIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iSeqIndex];
+ prSubClusterMSeq->seq[iSeqIndex] = CkStrdup(prMSeq->seq[iRealSeqIndex]);
+ prSubClusterMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeq->orig_seq[iRealSeqIndex]);
+ SeqinfoCopy(&prSubClusterMSeq->sqinfo[iSeqIndex], &prMSeq->sqinfo[iRealSeqIndex]);
+#if TRACE
+ Log(&rLog, LOG_DEBUG, "seq no %d in cluster %d is %s (real index = %d)",
+ iSeqIndex, iClusterIndex, prSubClusterMSeq->sqinfo[iSeqIndex].name,
+ iRealSeqIndex);
+#endif
+ }
+#endif
+
+
+ /* Create a distance matrix for this sub-cluster
+ * (prWithinClusterDistances) by using the vector distances or
+ * ktuple distances.
+ *
+ * Then apply UPGMA to get a subcluster tree
+ * (prSubClusterTree) and append created tree to the
+ * pre-cluster-tree (prMbedTree_p)
+ *
+ */
+#if FULL_WITHIN_CLUSTER_DISTANCES
+
+ iOldLogLevel = rLog.iLogLevelEnabled;
+ rLog.iLogLevelEnabled = LOG_WARN;
+ /* compute distances, but be quiet */
+ if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType,
+ 0, prSubClusterMSeq->nseqs, 0, prSubClusterMSeq->nseqs,
+ NULL, NULL)) {
+ Log(&rLog, LOG_ERROR, "Couldn't compute pair distances");
+ return -1;
+ }
+ rLog.iLogLevelEnabled = iOldLogLevel;
+
+#if COMPUTE_WITHIN_SUBCLUSTER_AVERAGE
+ {
+ double dSum = 0.0;
+ for (iI=0; iI<prSubClusterMSeq->nseqs; iI++) {
+ for (iJ=iI+1; iJ<prSubClusterMSeq->nseqs; iJ++) {
+ dSum += SymMatrixGetValue(prWithinClusterDistances, iI, iJ);
+ }
+ }
+ Log(&rLog, LOG_FORCED_DEBUG,
+ "mean pair-wise distance within subcluster %d of %d = %f",
+ iClusterIndex, prKMeansResult->iNClusters, dSum/prSubClusterMSeq->nseqs);
+ }
+#endif
+
+#else
+ if (NewSymMatrix(&prWithinClusterDistances, iNSeqInCluster, iNSeqInCluster)!=0) {
+ Log(&rLog, LOG_FATAL, "%s", "Memory allocation for disparity matrix failed");
+ }
+ for (iI=0; iI<iNSeqInCluster; iI++) {
+ int iRealIndexI = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
+
+ for (iJ=iI+1; iJ<iNSeqInCluster; iJ++) {
+ int iRealIndexJ = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iJ];
+ double dist;
+
+ /* Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d: compute distance between %d:%s and %d:%s",
+ iClusterIndex, i, prMSeq->sqinfo[iRealIndexI].name,
+ iJ, prMSeq->sqinfo[iRealIndexJ].name); */
+
+ if (1 == USE_EUCLIDEAN_DISTANCE) {
+ dist = EuclDist(ppdSeqVec[iRealIndexI],
+ ppdSeqVec[iRealIndexJ], iNumSeeds);
+ } else {
+ dist = CosDist(ppdSeqVec[iRealIndexI],
+ ppdSeqVec[iRealIndexJ], iNumSeeds);
+ }
+ SymMatrixSetValue(prWithinClusterDistances, iI, iJ, dist);
+ }
+ }
+#endif
+
+ /* labels needed for the guide tree building routine only */
+ ppcLabels = (char**) CKMALLOC(iNSeqInCluster * sizeof(char*));
+ for (iI=0; iI<iNSeqInCluster; iI++) {
+#if FULL_WITHIN_CLUSTER_DISTANCES
+ ppcLabels[iI] = prSubClusterMSeq->sqinfo[iI].name;
+#else
+ int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
+ ppcLabels[iI] = prMSeq->sqinfo[iRealIndex].name;
+#endif
+ }
+#if TRACE
+ Log(&rLog, LOG_FORCED_DEBUG, "Distance matrix for seqs within sub cluster %d/%d",
+ iClusterIndex, prKMeansResult->iNClusters);
+ SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL);
+#endif
+
+
+ GuideTreeUpgma(&prSubClusterTree, ppcLabels,
+ prWithinClusterDistances, NULL);
+
+ CKFREE(ppcLabels); /* don't free members, they just point */
+#if 0
+ Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d guide-tree:", iClusterIndex);
+ LogTree(prSubClusterTree);
+#endif
+
+
+ /* The guide tree id's (that point to the sequences) now start
+ * from 0, i.e. the association with the prMSeq numbering is
+ * broken and fixed in the following
+ */
+ for (iNodeIndex = 0; iNodeIndex < (int)GetNodeCount(prSubClusterTree); iNodeIndex++) {
+ if (IsLeaf(iNodeIndex, prSubClusterTree)) {
+ int iLeafId = GetLeafId(iNodeIndex, prSubClusterTree);
+ int iRealId = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iLeafId];
+#if 0
+ Log(&rLog, LOG_FORCED_DEBUG, "Correcting leaf node %d which has (wrong) id %d and name %s to id %d (prMSeq name %s)",
+ iNodeIndex, iLeafId,
+ GetLeafName(iNodeIndex, prSubClusterTree),
+ iRealId, prMSeq->sqinfo[iRealId].name);
+#endif
+ SetLeafId(prSubClusterTree, iNodeIndex, iRealId);
+ }
+ }
+
+
+ /* Append the newly created tree (prSubClusterTree) to the
+ * corresponding node index of prMbedTree_p.
+ */
+#if TRACE
+ Log(&rLog, LOG_FORCED_DEBUG, "Will join trees at leaf node %d = %s",
+ piClusterToTreeNode[iClusterIndex],
+ GetLeafName(piClusterToTreeNode[iClusterIndex], *prMbedTree_p));
+#endif
+
+ AppendTree(*prMbedTree_p,
+ piClusterToTreeNode[iClusterIndex],
+ prSubClusterTree);
+ /* Note: piClusterToTreeNode is still valid, because
+ * AppendTrees() guarantees that no other than the node to
+ * append to changes. */
+
+#if 0
+ Log(&rLog, LOG_FORCED_DEBUG, "%s", "prMbedTree_p after cluster %d has appended:", iClusterIndex);
+ LogTree(*prMbedTree_p);
+
+ if (0) {
+ char fname[] = "mbed-joined-tree.dnd";
+ FILE *pfOut;
+ if (NULL == (pfOut = fopen(fname, "w"))) {
+ Log(&rLog, LOG_FATAL, "Couldn't open %s for writing", fname);
+ }
+ MuscleTreeToFile(pfOut, *prMbedTree_p);
+ Log(&rLog, LOG_FORCED_DEBUG, "Joined tree written to %s", fname);
+ fclose(pfOut);
+ Log(&rLog, LOG_FATAL, "DEBUG EXIT");
+ }
+#endif
+
+ /* cleanup
+ */
+ FreeMuscleTree(prSubClusterTree);
+ FreeSymMatrix(&prWithinClusterDistances);
+#if FULL_WITHIN_CLUSTER_DISTANCES
+ FreeMSeq(&prSubClusterMSeq);
+#endif
+ } /* end for each cluster */
+ ProgressDone(prSubClusterDistanceProgress);
+ FreeProgress(&prSubClusterDistanceProgress);
+
+
+ if (NULL != pcGuidetreeOut) {
+ if (NULL == (pfOut = fopen(pcGuidetreeOut, "w"))) {
+ Log(&rLog, LOG_ERROR, "Couldn't open %s for writing", pcGuidetreeOut);
+ } else {
+ MuscleTreeToFile(pfOut, *prMbedTree_p);
+ Log(&rLog, LOG_INFO, "Guide tree written to %s", pcGuidetreeOut);
+ (void) fclose(pfOut);
+ }
+ }
+
+
+ /* cleanup
+ *
+ */
+#if MBED_TIMING
+ StopwatchStop(stopwatch);
+ StopwatchDisplay(stdout, "mBed time (without pairwise distance computation): ", stopwatch);
+ StopwatchFree(stopwatch);
+#endif
+
+ FreeKMeansResult(&prKMeansResult);
+ FreeSymMatrix(&prPreClusterDistmat);
+ for (iI=0; iI<prMSeq->nseqs; iI++) {
+ CKFREE(ppdSeqVec[iI]);
+ }
+ CKFREE(ppdSeqVec);
+ CKFREE(piClusterToTreeNode);
+
+#ifndef NDEBUG
+ TreeValidate(*prMbedTree_p);
+#endif
+
+ return 0;
+}
+/*** end: Mbed() ***/