binaries/src/clustalo/src/clustal/mbed.c

   1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4;  indent-tabs-mode: nil -*- */
   2
   3 /*********************************************************************
   4  * Clustal Omega - Multiple sequence alignment
   5  *
   6  * Copyright (C) 2010 University College Dublin
   7  *
   8  * Clustal-Omega is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public License as
  10  * published by the Free Software Foundation; either version 2 of the
  11  * License, or (at your option) any later version.
  12  *
  13  * This file is part of Clustal-Omega.
  14  *
  15  ********************************************************************/
  16
  17 /*
  18  *  RCS $Id: mbed.c 316 2016-12-16 16:14:39Z fabian $
  19  *
  20  *
  21  * Reimplementation from scratch of mBed (Blackshields et al., 2010;
  22  * PMID 20470396) and addition of bisecting K-Means which allows
  23  * control over subcluster size
  24  *
  25  */
  26
  27 #ifdef HAVE_CONFIG_H
  28 #include "config.h"
  29 #endif
  30
  31 #include <assert.h>
  32 #include <float.h>
  33
  34 #include "mbed.h"
  35 #include "pair_dist.h"
  36 #include "symmatrix.h"
  37 #include "ktuple_pair.h"
  38 #include "tree.h"
  39 #include "util.h"
  40 #include "progress.h"
  41 #include "queue.h"
  42
  43 #include "log.h"
  44 #include "kmpp/KMeans.h"
  45 #include "mbed.h"
  46
  47 #define TIMING 0
  48 #if TIMING
  49 #include "squid/stopwatch.h"
  50 #endif
  51
  52
  53 /* If FULL_WITHIN_CLUSTER_DISTANCES is not 0, distances within each
  54  * bisecting kmeans subcluster are not estimated using the vectors,
  55  * but calculated normally (using ktuple or kimura). Surprisingly this
  56  * results in 3% loss on a Homfam p24-h2010-08-09 subset (100-5000
  57  * seqs in test, at least 5 ref seqs; MAX_SEQ 100 vs 10000; NUM_SEEDS
  58  * log2 instead of log2^2). And of course it slows things down.
  59  */
  60 #define FULL_WITHIN_CLUSTER_DISTANCES 1
  61
  62 #define COMPUTE_WITHIN_SUBCLUSTER_AVERAGE 0
  63
  64 /* Cluster size limits. Maximum is a soft limit, which might be
  65  * exceeded if a K-Means split was unsuccesful, where unsuccesful
  66  * might also mean that the minimum required number seqs. was not
  67  * reached */
  68 #if FULL_WITHIN_CLUSTER_DISTANCES
  69 static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 100;
  70 static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 1;
  71 #else
  72 static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 10000;
  73 static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 100;
  74 #endif
  75
  76 /* How many restarts per bisecting kmeans split. 10 give 0.5% increase
  77  * in quality on original HOMFAM over just 1. It also increases kmeans
  78  * time by a factor of ~3, but overall time is insignificant
  79  * compared to pairdist/progressive alignment part.
  80  */
  81 static const int RESTARTS_PER_SPLIT = 10;
  82
  83
  84 /* Use standard kmeans (lloyds) or kmeans++. Both are almost
  85  * indistinguishable here, although kmeans++ is slightly ahead the
  86  * fewer seeds you pick (and it should be better in theory)
  87  */
  88 #define USE_KMEANS_LLOYDS 0
  89
  90
  91 //#ifndef HAVE_LOG2
  92 #ifndef CLUSTAL_OMEGA_HAVE_LOG2
  93 //#define log2(x)  (log(x) / 0.69314718055994530942)
  94 #define log2(x)  ( x<=0 ? (float)(-100000) : 1.442695041*log(x) )
  95 #endif
  96 #define NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2)
  97
  98
  99 /* Seed selection method: SAMPLE_SEEDS_BY_LENGTH is the original mBed
 100  * approach: Sample iSeeds sequence with constant stride from length-sorted X.
 101  * It might be better to pick the seeds randomly, because length sorting might
 102  * be more prone to including outliers (e.g. very long and very short seqs).
 103  * However, we could never observer such a thing. So just stick to the
 104  * original version as this also removes the random element.
 105  */
 106 enum SEED_SELECTION_TYPE {
 107     SELECT_SEEDS_RANDOMLY,
 108     SELECT_SEEDS_BY_LENGTH
 109 };
 110 #define SEED_SELECTION SELECT_SEEDS_BY_LENGTH
 111
 112
 113 /* Tests on BAliBase (RV11,12,20,30,40,50; 10 runs each) show there is
 114  * no difference between mbed-trees created from cosine or euclidean
 115  * distances (simple version, just using disparities).
 116  */
 117 #define USE_EUCLIDEAN_DISTANCE 1
 118
 119
 120 /* print some mbed pre-cluster usage to screen */
 121 #define PRINT_CLUSTER_DISTRIBUTION 0
 122
 123
 124 #define TRACE 0
 125
 126
 127 typedef struct {
 128     /* Number of final clusters
 129      */
 130     int iNClusters;
 131
 132     /* Coordinates (columns) for each cluster (rows)
 133      * valid indices: [0...iNClusters][0...dim-1]
 134      */
 135     double **ppdClusterCenters;
 136
 137     /* Dimensionality of input data and cluster center coordinates
 138      */
 139     int iDim;
 140
 141     /* Number of objects per cluster
 142      */
 143     int *piNObjsPerCluster;
 144
 145     /* Objects (indices) for each cluster. [i][j] will point to (index
 146      * of) object j in cluster i
 147      */
 148     int **ppiObjIndicesPerCluster;
 149 } bisecting_kmeans_result_t;
 150
 151
 152
 153 /**
 154  * @brief Free KMeans result structure.
 155  *
 156  * @param[out]  prResult_p
 157  * K-Means result to free
 158  *
 159  * @see NewKMeansResult()
 160  */
 161 void
 162 FreeKMeansResult(bisecting_kmeans_result_t **prResult_p)
 163 {
 164     int iAux;
 165     CKFREE((*prResult_p)->piNObjsPerCluster);
 166     for (iAux=0; iAux<(*prResult_p)->iNClusters; iAux++) {
 167         CKFREE((*prResult_p)->ppiObjIndicesPerCluster[iAux]);
 168         CKFREE((*prResult_p)->ppdClusterCenters[iAux]);
 169     }
 170     CKFREE((*prResult_p)->ppiObjIndicesPerCluster);
 171     CKFREE((*prResult_p)->ppdClusterCenters);
 172     (*prResult_p)->iNClusters = 0;
 173     (*prResult_p)->iDim = 0;
 174     CKFREE(*prResult_p);
 175 }
 176 /***   end: FreeKMeansResult()   ***/
 177
 178
 179
 180 /**
 181  * @brief Allocate new KMeans result
 182  *
 183  * @param[out] prKMeansResult_p
 184  * K-Means result to free
 185  *
 186  * @see NewKMeansResult()
 187  */
 188 void
 189 NewKMeansResult(bisecting_kmeans_result_t **prKMeansResult_p)
 190 {
 191     (*prKMeansResult_p) = (bisecting_kmeans_result_t *)
 192         CKCALLOC(1, sizeof(bisecting_kmeans_result_t));
 193     (*prKMeansResult_p)->iNClusters = 0;
 194     (*prKMeansResult_p)->iDim = 0;
 195     (*prKMeansResult_p)->ppdClusterCenters = NULL;
 196     (*prKMeansResult_p)->piNObjsPerCluster = NULL;
 197     (*prKMeansResult_p)->ppiObjIndicesPerCluster = NULL;
 198
 199 }
 200 /***   end: NewKMeansResult()   ***/
 201
 202
 203
 204 /**
 205  * @brief Calculate the euclidean distance between two vectors
 206  *
 207  * @param[in] v1
 208  * First vector with dim dimensions
 209  * @param[in] v2
 210  * Second vector with dim dimensions
 211  * @param[in] dim
 212  * Dimensionality of v1 and v2
 213  *
 214  * @return euclidean distance as double
 215  *
 216  * @note Could probably be inlined. Also perfect for SSE
 217  */
 218 double
 219 EuclDist(const double *v1, const double *v2, const int dim)
 220 {
 221     int i; /* aux */
 222     double dist; /* returned distance */
 223
 224     assert(v1!=NULL);
 225     assert(v2!=NULL);
 226
 227     dist = 0.0;
 228     for (i=0; i<dim; i++) {
 229         dist += pow(v1[i]-v2[i], 2.0);
 230     }
 231
 232     return sqrt(dist);
 233 }
 234 /***   end: EuclDist()   ***/
 235
 236
 237
 238 /**
 239  * @brief Calculate the cosine distance between two vectors
 240  *
 241  * @param[in] v1
 242  * First vector with dim dimensions
 243  * @param[in] v2
 244  * Second vector with dim dimensions
 245  * @param[in] dim
 246  * Dimensionality of v1 and v2
 247  *
 248  * @return cosine distance as double
 249  *
 250  * @note Could probably be inlined. Also perfect for SSE
 251  */
 252 double
 253 CosDist(const double *v1, const double *v2, const int dim)
 254 {
 255     int i; /* aux */
 256     double dist; /* returned distance */
 257     double s, sq1, sq2;
 258
 259     assert(v1!=NULL);
 260     assert(v2!=NULL);
 261
 262     s = 0.0;
 263     sq1 = 0.0;
 264     sq2 = 0.0;
 265     for (i=0; i<dim; i++) {
 266         s += (v1[i]*v2[i]);
 267         sq1 += pow(v1[i], 2.0);
 268         sq2 += pow(v2[i], 2.0);
 269     }
 270     sq1 = sqrt(sq1);
 271     sq2 = sqrt(sq2);
 272
 273     if ((sq1 * sq2) < DBL_EPSILON) {
 274         dist = 1 - s / (sq1 * sq2);
 275     } else {
 276         /* if the denominator gets small, the fraction gets big, hence dist
 277            gets small: */
 278         dist = 0.0;
 279     }
 280     return dist;
 281 }
 282 /***   end: CosDist()   ***/
 283
 284
 285
 286 /**
 287  * @brief convert sequences into mbed-like (distance) vector
 288  * representation. Seeds (prMSeq sequence indices) have to be picked before
 289  *
 290  * @param[out] ppdSeqVec
 291  * Pointer to preallocated matrix of size nseqs x iSeeds
 292  * @param[in] prMSeq
 293  * Sequences which are to be converted
 294  * @param[in] piSeeds
 295  * Array of sequences in prMSeq which are to be used as seeds.
 296  * @param[in] iNumSeeds
 297  * Number of seeds/elements in piSeeds
 298  * @param[in] iPairDistType
 299  * Type of pairwise distance comparison
 300  *
 301  */
 302 int
 303 SeqToVec(double **ppdSeqVec, mseq_t *prMSeq,
 304          int *piSeeds, const int iNumSeeds,
 305          const int iPairDistType)
 306 {
 307     symmatrix_t *prDistmat = NULL;
 308     int iSeqIndex;
 309     int iSeedIndex;
 310    /* indices for restoring order */
 311     int *restore = NULL;
 312     /* sorted copy of piSeeds */
 313     int *piSortedSeeds = NULL;
 314
 315 #if TIMING
 316     Stopwatch_t *stopwatch = StopwatchCreate();
 317     StopwatchZero(stopwatch);
 318     StopwatchStart(stopwatch);
 319 #endif
 320
 321     assert(NULL != ppdSeqVec);
 322     assert(iNumSeeds>0 && iNumSeeds<=prMSeq->nseqs);
 323
 324     piSortedSeeds = CKMALLOC(iNumSeeds * sizeof(int));
 325     memcpy(piSortedSeeds, piSeeds, iNumSeeds*sizeof(int));
 326
 327     /* need them sorted, otherwise the swapping approach below might
 328      * break
 329      */
 330     qsort(piSortedSeeds, iNumSeeds, sizeof(int), IntCmp);
 331
 332
 333     /* rearrange mseq order so that we can use ktuple_pairdist code as
 334      * is. That is, swap seeds and non-seeds such that the seeds end
 335      * up in front of mseq. This way we can use the KTuplePairDist()
 336      * code, without making a copy of mseq. Also, keep an array of
 337      * indices which serves to restore the original sequence order
 338      * after putting the seeds in front
 339      *
 340      */
 341     restore = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
 342     for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 343         restore[iSeqIndex] = iSeqIndex;
 344     }
 345     for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
 346 #if TRACE
 347         Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %u", piSortedSeeds[iSeedIndex], iSeedIndex);
 348 #endif
 349         if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
 350             int swap;
 351             SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
 352
 353             swap = restore[iSeedIndex];
 354             restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
 355             restore[piSortedSeeds[iSeedIndex]] = swap;
 356         }
 357     }
 358 #if TRACE
 359     printf("DEBUG(%s|%s():%d): restore indices =",
 360            __FILE__, __FUNCTION__, __LINE__);
 361     for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 362         printf(" %u:%u", iSeqIndex, restore[iSeqIndex]);
 363     }
 364     printf("\n");
 365
 366     for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 367         Log(&rLog, LOG_FORCED_DEBUG, "swapped seq no %d now:  seq = %s",
 368                   iSeqIndex, prMSeq->sqinfo[iSeqIndex].name);
 369     }
 370 #endif
 371
 372
 373     /* convert sequences into vectors of distances by calling pairwise
 374      * distance function for each seq/seed pair
 375      *
 376      */
 377     /* 4th argument (FALSE) is bPercID, in mBed mode never use percent-identities */
 378     if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType, FALSE,
 379                            0, iNumSeeds, 0, prMSeq->nseqs,
 380                            NULL, NULL)) {
 381         Log(&rLog, LOG_ERROR, "Could not compute pairwise distances for mbed.");
 382         FreeSymMatrix(&prDistmat);
 383         CKFREE(piSortedSeeds);
 384         CKFREE(restore);
 385         return -1;
 386     }
 387 #if TRACE
 388     {
 389         char **labels;
 390         labels = (char **) CKMALLOC(prMSeq->nseqs * sizeof(char *));
 391         for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 392             labels[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].name;
 393         }
 394         SymMatrixPrint(prDistmat, labels, NULL, FALSE);
 395         CKFREE(labels);
 396     }
 397 #endif
 398
 399
 400     /* update ppdSeqVec according to prDistmat. keep in mind that we
 401      * changed mseq order
 402      *
 403      */
 404     for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 405         for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
 406             ppdSeqVec[restore[iSeqIndex]][iSeedIndex] =
 407                 SymMatrixGetValue(prDistmat, iSeqIndex, iSeedIndex);
 408         }
 409     }
 410
 411
 412     /* restore mseq order by reverse swapping
 413      *
 414      * need reverse order now, so start from top index
 415      */
 416     iSeedIndex=iNumSeeds-1;
 417     do {
 418 #if TRACE
 419         Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %d", piSortedSeeds[iSeedIndex], iSeedIndex);
 420 #endif
 421         if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
 422             int swap;
 423
 424             SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
 425
 426             swap = restore[iSeedIndex];
 427             restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
 428             restore[piSortedSeeds[iSeedIndex]] = swap;
 429         }
 430     } while (0 != iSeedIndex--);
 431 #if TRACE
 432      for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 433          Log(&rLog, LOG_FORCED_DEBUG, "restored seq no %d:  seq = %s %s",
 434                    iSeqIndex, prMSeq->sqinfo[iSeqIndex].name, prMSeq->seq[iSeqIndex]);
 435      }
 436 #endif
 437
 438 #if TRACE
 439     for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 440         printf("DEBUG: seq %-4u as distance vector =", iSeqIndex);
 441         for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
 442             printf(" %f", ppdSeqVec[iSeqIndex][iSeedIndex]);
 443         }
 444         printf("\n");
 445     }
 446 #endif
 447
 448
 449     FreeSymMatrix(&prDistmat);
 450     CKFREE(restore);
 451     CKFREE(piSortedSeeds);
 452 #if TIMING
 453     StopwatchStop(stopwatch);
 454     StopwatchDisplay(stdout, "Total time for SeqToVec(): ", stopwatch);
 455     StopwatchFree(stopwatch);
 456 #endif
 457
 458
 459     return 0;
 460 }
 461 /***   end: SeqToVec()   ***/
 462
 463
 464
 465 /**
 466  * @brief Select seeds to be used from an prMSeq
 467  *
 468  * @param[out] piSeeds
 469  * Will store the indices of prMSeqs seqs used to be as seeds here. Must be preallocated.
 470  * @param[in] iNumSeeds
 471  * Number of seeds to be picked
 472  * @param[in] iSelectionMethod
 473  * Seed selection method to be used
 474  * @param[in] prMSeq
 475  * The prMSeq structure to pick sequences from
 476  *
 477  * @return: Non-zero on failure
 478  *
 479  */
 480 int
 481 SeedSelection(int *piSeeds, int iNumSeeds, int iSelectionMethod,  mseq_t *prMSeq)
 482 {
 483     /* seed iterator */
 484     int iSeedIndex;
 485     /* sequence iterator */
 486     int iSeqIndex;
 487
 488     if (SELECT_SEEDS_RANDOMLY == iSelectionMethod) {
 489         int *piPermArray;
 490
 491         Log(&rLog, LOG_INFO,
 492              "Using %d seeds (randomly chosen) for mBed (from a total of %d sequences)",
 493              iNumSeeds, prMSeq->nseqs);
 494
 495         PermutationArray(&piPermArray, iNumSeeds);
 496         /* copy to piSeeds */
 497         for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
 498             piSeeds[iSeedIndex] = piPermArray[iSeedIndex];
 499         }
 500         CKFREE(piPermArray);
 501
 502     }  else if (SELECT_SEEDS_BY_LENGTH == iSelectionMethod) {
 503
 504         /* step size for picking with constant stride */
 505         int iStep;
 506         int *piSeqLen =  CKMALLOC(prMSeq->nseqs * sizeof(int));
 507         int *piOrder = CKMALLOC(prMSeq->nseqs * sizeof(int));
 508
 509         Log(&rLog, LOG_INFO,
 510              "Using %d seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of %d sequences)",
 511              iNumSeeds, prMSeq->nseqs);
 512
 513         iStep = prMSeq->nseqs / iNumSeeds; /* iStep will never get too big
 514                                             * due to rounding */
 515         /* first get an array of seq indices order according to
 516            corresponding sequence length: piOrder */
 517         for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 518             piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len;
 519         }
 520         QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, 'd', FALSE);
 521 #if 0
 522         for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 523             Log(&rLog, LOG_FORCED_DEBUG, "Descending order (no %d): seq %d has len %d)",
 524                       iSeqIndex, piOrder[iSeqIndex], piSeqLen[piOrder[iSeqIndex]]);
 525         }
 526 #endif
 527         CKFREE(piSeqLen);
 528         iSeedIndex = 0;
 529         for (iSeqIndex=0; iSeedIndex<iNumSeeds; iSeqIndex+=iStep) {
 530             piSeeds[iSeedIndex++] = piOrder[iSeqIndex];
 531         }
 532         CKFREE(piOrder);
 533
 534     } else {
 535
 536         Log(&rLog, LOG_ERROR, "Internal error: unknown seed selection type");
 537         return -1;
 538     }
 539
 540
 541 #ifdef PICK_SEEDS_FROM_KMEANS_CLUSTERS
 542     /* do initial mbedding and kmeans. then pick seeds from each cluster and
 543        do full mbed with these seeds. idea is that those seeds represent the
 544        sequence space better than random seeds. however, this reduces the
 545        quality slightly. random is almost always better.
 546     */
 547
 548     if (1) {
 549         /* seqs represented as distance vectors */
 550         double **ppdSeqVec;
 551         double dCost = -1.0;
 552         /* assignments for each seq to the K newly created clusters */
 553         int *piKMeansClusterAssignments = CKMALLOC(prMSeq->nseqs * sizeof(int));
 554         double *pdKMeansClusterCenters = CKCALLOC(iNumSeeds * iNumSeeds, sizeof(double));
 555         /* create a copy of ppdSeqVec suitable for KMeans */
 556         double *pdKMeansVectors = CKMALLOC(prMSeq->nseqs * iNumSeeds * sizeof(double));
 557         bool *pbClusterUsed = CKCALLOC(iNumSeeds, sizeof(bool));
 558
 559         Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Experimental feature: K-means on first round embedding to get better seeds");
 560         Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Reuse seeds from clusters");
 561         Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Could try log(n) instead of log(n)^2 in first round");
 562         Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME hardcoded iPairDistType PAIRDIST_KTUPLE");
 563         Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Show that this is fast *and* good");
 564
 565         /*
 566           Log(&rLog, LOG_FORCED_DEBUG, "%s", "Overriding iNumSeeds");
 567           iNumSeeds = (int) pow(log2((double)prMSeq->nseqs), 2);
 568         */
 569
 570         ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
 571         for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 572             ppdSeqVec[iSeqIndex] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
 573         }
 574         if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, PAIRDIST_KTUPLE)) {
 575             Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
 576             return -1;
 577         }
 578
 579         for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 580             int iDim;
 581             for (iDim=0; iDim<iNumSeeds; ++iDim) {
 582                 pdKMeansVectors[iDim*iSeqIndex + iDim] = ppdSeqVec[iSeqIndex][iDim];
 583             }
 584         }
 585
 586
 587         Log(&rLog, LOG_FORCED_DEBUG, "%s\n", "FIXME hardcoded RESTARTS_PER_SPLIT");
 588         dCost = KMeans(prMSeq->nseqs, iNumSeeds, iNumSeeds,
 589                        pdKMeansVectors,
 590                        RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
 591                        pdKMeansClusterCenters, piKMeansClusterAssignments);
 592         Log(&rLog, LOG_FORCED_DEBUG, "Best split cost = %f", dCost);
 593
 594         Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Check for Nan in cluster centers");
 595
 596 #if TRACE
 597         Log(&rLog, LOG_FORCED_DEBUG, "%s", "K-Means output:");
 598         for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 599             Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u (%s) to cluster %d",
 600                       iSeqIndex,  prMSeq->sqinfo[iSeqIndex].name,
 601                       piKMeansClusterAssignments[iSeqIndex]);
 602         }
 603 #endif
 604
 605         Log(&rLog, LOG_FORCED_DEBUG, "FIXME %s", "proof of concept implementation: Pick first sequences from each clusters instead of reusing seeds");
 606         iSeedIndex = 0;
 607         for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 608             int iAssignedCluster = piKMeansClusterAssignments[iSeqIndex];
 609             if (pbClusterUsed[iAssignedCluster]) {
 610                 continue;
 611             } else {
 612                 /*LOG_DEBUG("Picked seed %d from cluster %d", iSeqIndex,iAssignedCluster);*/
 613                 piSeeds[iSeedIndex++] = iSeqIndex;
 614                 pbClusterUsed[iAssignedCluster] = TRUE;
 615             }
 616         }
 617         CKFREE(pbClusterUsed);
 618
 619         CKFREE(pdKMeansVectors);
 620         CKFREE(pdKMeansClusterCenters);
 621         CKFREE(piKMeansClusterAssignments);
 622
 623     }
 624     /* if (1) */
 625 #endif
 626
 627
 628     if (LOG_DEBUG <= rLog.iLogLevelEnabled) {
 629         for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
 630             Log(&rLog, LOG_DEBUG, "Picked sequence %d (%s) as seed no %d",
 631                  piSeeds[iSeedIndex], prMSeq->sqinfo[piSeeds[iSeedIndex]].name, iSeedIndex);
 632         }
 633     }
 634
 635     return 0;
 636 }
 637 /* end of SeedSelection() */
 638
 639
 640
 641
 642 /**
 643  * @brief Bisecting K-Means clustering. Repeatedly calls K-Means with
 644  * a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster.
 645  *
 646  * @param[out] prKMeansResult_p
 647  * Result of Bisecting KMeans. Will be allocated here.
 648  * Caller has to free. See @see FreeKMeansResult()
 649  * @param[in] iNObjs
 650  * Number of objects/sequences to cluster
 651  * @param[in] iDim
 652  * Dimensionality of input data
 653  * @param[in] ppdVectors
 654  * each row holds iDim points for this object's coordinates
 655  * @param[in] iMinRequiredObjsPerCluster
 656  * Minimum number of objects per Cluster (inclusive)/
 657  * @param[in] iMaxAllowedObjsPerCluster
 658  * Maximum number of objects per Cluster (inclusive). Function returns once no
 659  * cluster contains more then this number of objects. Soft limit!
 660  *
 661  * @note Convoluted code. Could use some restructuring. My apologies.
 662  * AW
 663  *
 664  */
 665 void
 666 BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
 667                 const int iNObjs, const int iDim, double **ppdVectors,
 668                 const int iMinRequiredObjsPerCluster,
 669                 const int iMaxAllowedObjsPerCluster, char ***ppcClusterSplits_p)
 670 {
 671     int iN, iD;
 672     /* cluster centers for each cluster created at each split */
 673     double *pdKClusterCenters;
 674     /* keep track of updated object indices per newly created
 675      * cluster */
 676     int *piCurObjToUpdate;
 677     /* number of times we called 2-means */
 678     int iNRounds;
 679     /* flag for unsuccessful cluster split */
 680     bool bNaNDetected = FALSE;
 681     /* flag for detected small cluster after split  */
 682     bool bSmallClusterDetected;
 683     /* queue of clusters which are to be split */
 684     int_queue_t rClusterSplitQueue;
 685     /* */
 686     int iLog2N2 = 0;
 687
 688 #if TIMING
 689     Stopwatch_t *stopwatch = StopwatchCreate();
 690 #endif
 691
 692
 693     piCurObjToUpdate = (int *) CKMALLOC(2 * sizeof(int));
 694
 695     NewKMeansResult(prKMeansResult_p);
 696
 697     /* new cluster centers created at each split/KMeans run
 698      */
 699     pdKClusterCenters = (double *) CKCALLOC(2 * iDim, sizeof(double));
 700
 701     /* init results by setting a first cluster that contains all objects
 702      */
 703     (*prKMeansResult_p)->iNClusters = 1;
 704     (*prKMeansResult_p)->iDim = iDim;
 705     /* fake center coordinates of first cluster */
 706     (*prKMeansResult_p)->ppdClusterCenters =
 707         (double **) CKMALLOC(1 * sizeof(double *));
 708     (*prKMeansResult_p)->ppdClusterCenters[0] =
 709         (double *) CKMALLOC(iDim * sizeof(double));
 710     /* objects per cluster */
 711     (*prKMeansResult_p)->piNObjsPerCluster =
 712         (int *) CKMALLOC(1 * sizeof(int));
 713     (*prKMeansResult_p)->piNObjsPerCluster[0] = iNObjs;
 714     /* object indices per cluster */
 715     (*prKMeansResult_p)->ppiObjIndicesPerCluster =
 716         (int **) CKMALLOC(1 * sizeof(int *));
 717     (*prKMeansResult_p)->ppiObjIndicesPerCluster[0] = (int *)
 718         CKMALLOC(iNObjs * sizeof(int));
 719     for (iN=0; iN<iNObjs; iN++) {
 720         (*prKMeansResult_p)->ppiObjIndicesPerCluster[0][iN] = iN;
 721     }
 722
 723
 724     /*
 725      * this is an array that encodes where a sequences fell at a bisecting k-means
 726      * ppcClusterSplits_p can consume quite a bit of memory,
 727      but it is only needed if clustering info is written to file.
 728      Use ppcClusterSplits_p as a flag, initialise in the calling function (Mbed)
 729      to NULL, if no info written out then leave ppcClusterSplits_p=NULL.
 730      If info is to be written out then malloc one char*, so that
 731      ppcClusterSplits_p!=NULL. In that case realloc proper amount.
 732      Only expect a certain number of splits, iLog2N2.
 733      Worst case would be iNObjs but then ppcClusterSplits_p
 734      would be quadratic, which is too big for many sequences.
 735      */
 736     if (NULL != *ppcClusterSplits_p){
 737         /* (square) of logarithm (base 2) of number of objects */
 738         double dLog2N = log10(iNObjs) / log10(2);
 739         iLog2N2 = (int)(dLog2N*dLog2N);
 740
 741         *ppcClusterSplits_p      = (char **)CKREALLOC(*ppcClusterSplits_p, iNObjs*sizeof(char *));
 742         (*ppcClusterSplits_p)[0] =  (char *)CKMALLOC(iNObjs*iLog2N2);
 743         (*ppcClusterSplits_p)[0][0] = '\0';
 744         for (iN = 1; iN < iNObjs; iN++){
 745             (*ppcClusterSplits_p)[iN] = (*ppcClusterSplits_p)[iN-1] + iLog2N2;
 746             (*ppcClusterSplits_p)[iN][0] = '\0';
 747         }
 748     }
 749
 750
 751     /* Starting with the first cluster that now contains all the
 752      * sequences keep splitting until no cluster contains more than
 753      * iMaxAllowedObjsPerCluster
 754      *
 755      * Keep a queue of clusters (rClusterSplitQueue) to split
 756      *
 757      * At each split values/memory of the just split cluster will be
 758      * reused and exactly one new only allocated.
 759      *
 760      * Example:
 761      * Given the following cluster assignments
 762      * 0 0 0 0 1 1 2 2 2 3 3 3 3 3 3 3 4 4
 763      * and a K-Means split in cluster 3 at |:
 764      * 0 0 0 0 1 1 2 2 2 3 3 3 | 3 3 3 3 4 4
 765      * The result should be this:
 766      * 0 0 0 0 1 1 2 2 2 3 3 3 | 5 5 5 5 4 4
 767      *
 768      *
 769      */
 770     INT_QUEUE_INIT(&rClusterSplitQueue);
 771     if (iNObjs>iMaxAllowedObjsPerCluster) {
 772         /* pish fake first cluster index */
 773         INT_QUEUE_PUSH(&rClusterSplitQueue, 0);
 774     }
 775     iNRounds = 0;
 776     while (! INT_QUEUE_EMPTY(&rClusterSplitQueue)) {
 777         /* assignments for each seq to the K newly created clusters */
 778         int *piKClusterAssignments;
 779         /* number of objects in cluster that is to be split */
 780         int iNObjsInClusterToSplit;
 781         /* coordinates of points in cluster that is to be split
 782          * array of size n*d where [d*i + j] gives coordinate j of point i
 783          */
 784         double *pdVectorsInClusterToSplit;
 785         /* copy of object indices in split cluster */
 786         int *piObjIndicesOfSplitCluster;
 787         /* best cost of kmeans rounds */
 788         double dCost = -1.0;
 789
 790         /*  indices for the two created clusters
 791          */
 792         /* index of cluster to split */
 793         int iClusterToSplot;
 794         /* index of newly created cluster at each round. data for the
 795            other created cluster goes to the one just split,
 796            i.e. (*piClusterToSplot) */
 797         int iNewClusterIdx;
 798
 799 #if TIMING
 800         StopwatchZero(stopwatch);
 801         StopwatchStart(stopwatch);
 802 #endif
 803
 804         INT_QUEUE_POP(&rClusterSplitQueue, &iClusterToSplot);
 805
 806         iNObjsInClusterToSplit = (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot];
 807         piKClusterAssignments = (int *)
 808             CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
 809
 810         pdVectorsInClusterToSplit = (double *)
 811             CKMALLOC(iNObjsInClusterToSplit * iDim * sizeof(double));
 812         for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
 813             for (iD=0; iD<iDim; ++iD) {
 814                 int iThisObjIdx =
 815                     (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN];
 816                 pdVectorsInClusterToSplit[iDim*iN + iD] = ppdVectors[iThisObjIdx][iD];
 817             }
 818         }
 819
 820 #if TRACE
 821         Log(&rLog, LOG_FORCED_DEBUG, "Round %d: Will split cluster %d which has %d objects",
 822             iNRounds, iClusterToSplot, iNObjsInClusterToSplit);
 823         fprintf(stderr, "DEBUG(%s|%s():%d): Object indices in cluster to split are:",
 824                 __FILE__, __FUNCTION__, __LINE__);
 825         for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
 826             fprintf(stderr, " %u",
 827                     (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
 828         }
 829         fprintf(stderr, "\n");
 830         (void) fflush(stderr);
 831 #endif
 832
 833 #if TRACE
 834         for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
 835             fprintf(stderr,
 836                     "DEBUG(%s|%s():%d): Coordinate of object %u (real index %u) in cluster to split:",
 837                     __FILE__, __FUNCTION__, __LINE__, iN,
 838                     (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
 839             for (iD=0; iD<iDim; iD++) {
 840                 fprintf(stderr, " %f", pdVectorsInClusterToSplit[iDim*iN + iD]);
 841             }
 842             fprintf(stderr, "\n");
 843             (void) fflush(stderr);
 844         }
 845 #endif
 846
 847        /* KMeans(1 "The number of points in the data set",
 848         *        2 "The number of clusters to look for",
 849         *        3 "The number of dimensions that the data set lives in",
 850         *        4 "points: An array of size n*d where points[d*i + j] gives coordinate j of point i",
 851         *        5 "attempts: The number of times to independently run k-means",
 852         *        6 "use_lloyds_method: uses kmpp if false, otherwise lloyds method",
 853         *        7 "centers: This can either be null or an array of size k*d.
 854         *           In the latter case, centers[d*i + j] will give coordinate j of center i.
 855         *           If the cluster is unused, it will contain NaN instead.",
 856         *        8 "assignments: This can either be null or an array of size n.
 857         *           In the latter case, it will be filled with the cluster that each point is assigned to
 858         *           (an integer between 0 and k-1 inclusive).");
 859         */
 860         dCost = KMeans(iNObjsInClusterToSplit, 2, iDim,
 861                        pdVectorsInClusterToSplit,
 862                        RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
 863                        pdKClusterCenters, piKClusterAssignments);
 864
 865 #if TRACE
 866         Log(&rLog, LOG_FORCED_DEBUG, "%s", "Raw K-Means output:");
 867         for (iN=0; iN<2; iN++) {
 868             fprintf(stderr, "DEBUG(%s|%s():%d): Cluster Center %u =",
 869                     __FILE__, __FUNCTION__, __LINE__, iN);
 870             for (iD=0; iD<iDim; iD++) {
 871                 fprintf(stderr, " %f", pdKClusterCenters[iN*iDim+iD]);
 872             }
 873             fprintf(stderr, "\n");
 874             (void) fflush(stderr);
 875         }
 876         for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
 877             Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u to cluster %d (of #%u)",
 878                       iN,  piKClusterAssignments[iN], 2);
 879         }
 880 #endif
 881
 882         /* real index of one of the newly created clusters. the other
 883          * one is iClusterToSplot */
 884         iNewClusterIdx = (*prKMeansResult_p)->iNClusters;
 885
 886
 887         /* We don't want Clusters which are too small. Check here if a
 888          * split created a small cluster and if yes, discard the
 889          * solution. Because the cluster has already been removed from
 890          * the queue, we can just continue.
 891          */
 892         bSmallClusterDetected = FALSE;
 893         if (iMinRequiredObjsPerCluster>1) {
 894             int iNObjsCluster[2];
 895             iNObjsCluster[0] = 0; /* first cluster */
 896             iNObjsCluster[1] = 0; /* second cluster */
 897             for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
 898                 iNObjsCluster[piKClusterAssignments[iN]]+=1;
 899             }
 900
 901             if (iNObjsCluster[0]<iMinRequiredObjsPerCluster
 902                 ||
 903                 iNObjsCluster[1]<iMinRequiredObjsPerCluster) {
 904                 bSmallClusterDetected = TRUE;
 905                 Log(&rLog, LOG_FORCED_DEBUG, "Skipping this split because objs in 1st/2nd cluster = %d/%d < %d",
 906                           iNObjsCluster[0], iNObjsCluster[1], iMinRequiredObjsPerCluster);
 907             }
 908         }
 909
 910         /* Same logic as for small clusters applies if KMeans couldn't
 911          * split the cluster. In this case one of its center
 912          * coordinates will be NaN, which we check for in the
 913          * following.
 914          */
 915         if (! bSmallClusterDetected) {
 916             for (iN=0; iN<2; iN++) {
 917                 bNaNDetected = FALSE;
 918                 for (iD=0; iD<iDim; iD++) {
 919                     if (0 != isnan(pdKClusterCenters[iN*iDim+iN])) {
 920                         /* Got NaN as coordinate after splitting */
 921                         Log(&rLog, LOG_WARN, "%s(): Can't split cluster no. %d which has %d objects any further. %s",
 922                              __FUNCTION__,
 923                              iClusterToSplot, iNObjsInClusterToSplit,
 924                             "Hope it's not too big and doesn't slow things down."
 925                             );
 926                         bNaNDetected = TRUE;
 927                         break;
 928                     }
 929                 }
 930                 if (bNaNDetected) {
 931                     break;
 932                 }
 933             }
 934         }
 935
 936         /* Discarding split of this cluster. It has been removed from the
 937          * queue already. so just continue
 938          */
 939         if (bNaNDetected || bSmallClusterDetected) {
 940             CKFREE(piKClusterAssignments);
 941             CKFREE(pdVectorsInClusterToSplit);
 942             continue;
 943         }
 944
 945
 946         /* update cluster centers: pdClusterCenters
 947          *
 948          */
 949         /* reuse memory of existing/old/split cluster
 950          */
 951         for (iN=0; iN<iDim; iN++) {
 952             double dCoord = pdKClusterCenters[0*iDim+iN];
 953             (*prKMeansResult_p)->ppdClusterCenters[iClusterToSplot][iN] = dCoord;
 954         }
 955         /* realloc and set new one
 956          */
 957         (*prKMeansResult_p)->ppdClusterCenters = (double **)
 958             CKREALLOC((*prKMeansResult_p)->ppdClusterCenters,
 959                       ((*prKMeansResult_p)->iNClusters+1)  * sizeof(double *));
 960         (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx] = (double *)
 961             CKMALLOC(iDim * sizeof(double));
 962         for (iD=0; iD<iDim; iD++) {
 963             double dCoord = pdKClusterCenters[1*iDim+iD];
 964             (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx][iD] = dCoord;
 965         }
 966 #if TRACE
 967         Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of cluster centers done. Cluster centers so far:");
 968         for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
 969             fprintf(stderr, "DEBUG(%s|%s():%d): Center %u =",
 970                     __FILE__, __FUNCTION__, __LINE__, iN);
 971             for (iD=0; iD<iDim; iD++) {
 972                 fprintf(stderr, " %f", (*prKMeansResult_p)->ppdClusterCenters[iN][iD]);
 973             }
 974             fprintf(stderr, "\n");
 975             (void) fflush(stderr);
 976         }
 977 #endif
 978
 979
 980         /* update #seqs per cluster: piNObjsPerCluster
 981          *
 982          */
 983         (*prKMeansResult_p)->piNObjsPerCluster = (int *)
 984             CKREALLOC((*prKMeansResult_p)->piNObjsPerCluster,
 985                       ((*prKMeansResult_p)->iNClusters+1) * sizeof(int));
 986         /* init new and old one to zero */
 987         (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] = 0;
 988         (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] = 0;
 989         /* now update values */
 990         for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
 991             if (0 == piKClusterAssignments[iN]) {
 992                 (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] += 1;
 993             } else if (1 == piKClusterAssignments[iN]) {
 994                 (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] += 1;
 995             } else {
 996                 /* there used to be code for iK>=2 in r101 */
 997                 Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
 998                       piKClusterAssignments[iN]);
 999             }
1000         }
1001 #if TRACE
1002         Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of NObjs per cluster done:");
1003         for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
1004             Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d contains %d sequences",
1005                       iN, (*prKMeansResult_p)->piNObjsPerCluster[iN]);
1006         }
1007 #endif
1008         /* queue clusters if still they are still too big
1009          */
1010         if ((*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] > iMaxAllowedObjsPerCluster) {
1011             INT_QUEUE_PUSH(&rClusterSplitQueue, iClusterToSplot);
1012         }
1013         if ((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] > iMaxAllowedObjsPerCluster) {
1014             INT_QUEUE_PUSH(&rClusterSplitQueue, iNewClusterIdx);
1015         }
1016
1017
1018
1019         /* update which objects belong to which cluster:
1020          *
1021          * note:  piNObjsPerCluster needs to be already updated
1022          *
1023          */
1024         /* create a copy of the object indices in the split cluster
1025          * (original will be overwritten)
1026          */
1027         piObjIndicesOfSplitCluster = (int *) CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
1028         memcpy(piObjIndicesOfSplitCluster,
1029                (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
1030                iNObjsInClusterToSplit * sizeof(int));
1031
1032         (*prKMeansResult_p)->ppiObjIndicesPerCluster = (int **)
1033             CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster,
1034                       ((*prKMeansResult_p)->iNClusters+1) * sizeof(int *));
1035
1036
1037         (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot] =
1038             (int *) CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
1039                               (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] * sizeof(int));
1040
1041         (*prKMeansResult_p)->ppiObjIndicesPerCluster[iNewClusterIdx] =
1042             (int *) CKMALLOC((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] * sizeof(int));
1043
1044
1045         /* now reassign the object indices to the assigned cluster
1046          */
1047         piCurObjToUpdate[0] = 0;
1048         piCurObjToUpdate[1] = 0;
1049         for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
1050             int iThisObjIdx = piObjIndicesOfSplitCluster[iN];
1051             int iThisClusterAssignment =  piKClusterAssignments[iN];
1052             int iThisOffset = piCurObjToUpdate[iThisClusterAssignment];
1053             int iThisClusterIndex = 0;
1054
1055             if (0 == iThisClusterAssignment) {
1056                 iThisClusterIndex = iClusterToSplot;
1057                 if (*ppcClusterSplits_p && strlen((*ppcClusterSplits_p)[iThisObjIdx])<iLog2N2){
1058                     strcat((*ppcClusterSplits_p)[iThisObjIdx], "0");
1059                 }
1060             } else if (1 == iThisClusterAssignment) {
1061                 iThisClusterIndex = iNewClusterIdx;
1062                 if (*ppcClusterSplits_p && strlen((*ppcClusterSplits_p)[iThisObjIdx])<iLog2N2){
1063                     strcat((*ppcClusterSplits_p)[iThisObjIdx], "1");
1064                 }
1065             } else {
1066                 /* there used to be code for iK>=2 in r101 */
1067                 Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
1068                       piKClusterAssignments[iN]);
1069             }
1070 #if 0
1071             Log(&rLog, LOG_FORCED_DEBUG, "Setting (*prKMeansResult_p)->ppiObjIndicesPerCluster[%d][%d] = %d",
1072                       iThisClusterIndex, iThisOffset, iThisObjIdx);
1073 #endif
1074             (*prKMeansResult_p)->ppiObjIndicesPerCluster[iThisClusterIndex][iThisOffset] = iThisObjIdx;
1075             piCurObjToUpdate[iThisClusterAssignment]+=1;
1076         }
1077         CKFREE(piObjIndicesOfSplitCluster);
1078 #if TRACE
1079         for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
1080             int iObj;
1081             fprintf(stderr, "DEBUG(%s|%s():%d): Objects in cluster %u: ",
1082                     __FILE__, __FUNCTION__, __LINE__, iN);
1083             for (iObj=0; iObj<(*prKMeansResult_p)->piNObjsPerCluster[iN]; iObj++) {
1084                 fprintf(stderr, " %u", (*prKMeansResult_p)->ppiObjIndicesPerCluster[iN][iObj]);
1085             }
1086             fprintf(stderr, "\n");
1087             (void) fflush(stderr);
1088         }
1089 #endif
1090
1091
1092 #if TIMING
1093         StopwatchStop(stopwatch);
1094         StopwatchDisplay(stdout, "Total time after next round in Bisecting-KMeans: ", stopwatch);
1095 #endif
1096
1097
1098         /* finally: increase number of clusters
1099          */
1100         (*prKMeansResult_p)->iNClusters += 1;
1101         iNRounds += 1;
1102         CKFREE(piKClusterAssignments);
1103         CKFREE(pdVectorsInClusterToSplit);
1104
1105     } /* while */
1106     INT_QUEUE_DESTROY(&rClusterSplitQueue);
1107
1108
1109     Log(&rLog, LOG_DEBUG,
1110          "Bisecting K-means finished after %d rounds (no more clusters to split)",
1111          iNRounds);
1112
1113 #if TIMING
1114     StopwatchFree(stopwatch);
1115 #endif
1116
1117     /* @note could use progress/timer */
1118
1119     CKFREE(pdKClusterCenters);
1120     CKFREE(piCurObjToUpdate);
1121
1122     return;
1123 }
1124 /***   end: BisectingKmeans()   ***/
1125
1126
1127
1128 /**
1129  *
1130  * @brief From scratch reimplementation of mBed: Blackshields et al.
1131  * (2010); PMID 20470396.
1132  *
1133  * Idea is a follows:
1134  * - convert sequences into vectors of distances
1135  * - cluster the vectors using k-means
1136  * - cluster each of the k clusters using upgma (used cached distances
1137  *    from above?)
1138  * - join the sub-clusters to create on tree (use UPGMA on k-means
1139  *   medoids)
1140  *
1141  *
1142  * @param[out] prMbedTree_p
1143  * Created upgma tree. will be allocated here. use FreeMuscleTree()
1144  * to free
1145  * @param[in] prMSeq
1146  * Multiple sequences
1147  * @param[in] iPairDistType
1148  * Distance measure for pairwise alignments
1149  * @param[in] pcGuidetreeOut
1150  * Passed down to GuideTreeUpgma()
1151  *
1152  * @note: if the number of sequences is smaller than
1153  * MAX_ALLOWED_SEQ_PER_PRECLUSTER then there's no need to do the subclustering
1154  * first. In fact it costs some extra time. However, it's insignificant and
1155  * for simplicities sake we don't do any special checks
1156
1157  *
1158  * @return Zero on success, non-zero on error
1159  *
1160  */
1161 int
1162 Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
1163      const char *pcGuidetreeOut, int iClustersizes, const char *pcClusterFile)
1164 {
1165     /* number of seeds */
1166     int iNumSeeds = 0;
1167     /* seed indices matching prMSeq */
1168     int *piSeeds = NULL;
1169     /* seqs represented as distance vectors */
1170     double **ppdSeqVec = NULL;
1171     /* kmeans result */
1172     bisecting_kmeans_result_t *prKMeansResult = NULL;
1173     /* distance matrix of kmeans (pre-)cluster centers */
1174     symmatrix_t *prPreClusterDistmat = NULL;
1175     /* auxiliary for symmetric matrix output; tree routines etc */
1176     char **ppcLabels = NULL;
1177     int iNodeIndex = 0;
1178     /* mapping of cluster-center tree node indices to corresponding
1179        cluster */
1180     int *piClusterToTreeNode = NULL;
1181     int iClusterIndex = 0;
1182     int iI, iJ;
1183     FILE *pfOut = NULL;
1184     progress_t *prSubClusterDistanceProgress = NULL;
1185     bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
1186     char **ppcClusterSplits = NULL;
1187
1188 #if FULL_WITHIN_CLUSTER_DISTANCES
1189     Log(&rLog, LOG_DEBUG, "Computing real distances within subclusters for mBed.");
1190 #else
1191     Log(&rLog, LOG_DEBUG, "Computing vector distances within subclusters for mBed.");
1192 #endif
1193
1194 #if MBED_TIMING
1195     Stopwatch_t *stopwatch = StopwatchCreate();
1196 #endif
1197
1198     assert(NULL != prMbedTree_p);
1199     assert(NULL != prMSeq);
1200
1201
1202     iNumSeeds = (int) NUMBER_OF_SEEDS(prMSeq->nseqs);
1203     if (iNumSeeds >= prMSeq->nseqs) {
1204         /* -1 is condition for RandomUniqueIntArray */
1205         iNumSeeds = prMSeq->nseqs-1;
1206         Log(&rLog, LOG_DEBUG,
1207              "Automatically determined number of seeds is bigger (or equal) the number of sequences. Will set it to %d",
1208              iNumSeeds);
1209     }
1210
1211
1212     /* Turn sequences into vectors of distances to the seeds
1213      *
1214      */
1215     piSeeds = (int *) CKMALLOC(iNumSeeds * sizeof(int));
1216     if (0 != SeedSelection(piSeeds, iNumSeeds, SEED_SELECTION, prMSeq)) {
1217         Log(&rLog, LOG_ERROR, "Something went wrong during seed selection for mbed");
1218         return -1;
1219     }
1220     ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
1221     for (iI=0; iI<prMSeq->nseqs; iI++) {
1222         ppdSeqVec[iI] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
1223     }
1224     if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, iPairDistType)) {
1225         Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
1226         return -1;
1227     }
1228     CKFREE(piSeeds);
1229
1230
1231     /* Calculate (pre-)clusters of sequence vectors by applying
1232      * bisecting kmeans
1233      *
1234      */
1235 #if MBED_TIMING
1236     /* start clock only here, to make sure we don't include pairwise
1237      * distance computation */
1238     StopwatchZero(stopwatch);
1239     StopwatchStart(stopwatch);
1240 #endif
1241
1242     /* ppcClusterSplits can consume quite a bit of memory,
1243        however, it is only needed if cluster information is
1244        to be written to file. If no pcClusterFile is specified,
1245        then ppcClusterSplits does not have to be allocated.
1246        Use ppcClusterSplits as a flag, initialise to NULL,
1247        if NULL is passed into BisectingKmeans then do NOT malloc,
1248        if !NULL is passed in then realloc the appropriate memory */
1249     if (NULL != pcClusterFile){
1250         ppcClusterSplits = (char **)malloc(sizeof(char*));
1251     }
1252     else {
1253         ppcClusterSplits = NULL;
1254     }
1255
1256     BisectingKmeans(&prKMeansResult, prMSeq->nseqs, iNumSeeds, ppdSeqVec,
1257                     MIN_REQUIRED_SEQ_PER_PRECLUSTER,
1258                     iClustersizes/*MAX_ALLOWED_SEQ_PER_PRECLUSTER*/, &ppcClusterSplits);
1259     Log(&rLog, LOG_INFO,
1260          "mBed created %u cluster/s (with a minimum of %d and a soft maximum of %d sequences each)",
1261          prKMeansResult->iNClusters,
1262          MIN_REQUIRED_SEQ_PER_PRECLUSTER,
1263         iClustersizes/*MAX_ALLOWED_SEQ_PER_PRECLUSTER*/);
1264
1265     if (NULL != pcClusterFile){ /* print clustering: FS, r275 -> */
1266         FILE *pfClust = NULL;
1267         if (NULL == (pfClust = fopen(pcClusterFile, "w"))){
1268             Log(&rLog, LOG_FATAL, "Could not open file %s for writing", pcClusterFile);
1269         }
1270         for (iI = 0; iI < prKMeansResult->iNClusters; iI++) {
1271             for (iJ=0; iJ<prKMeansResult->piNObjsPerCluster[iI]; iJ++) {
1272                 int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][iJ];
1273                 fprintf(pfClust, "Cluster %u: object %u has index %u (=seq %s %d~len)\t %s\n",
1274                         iI, iJ,  iRealIndex, prMSeq->sqinfo[iRealIndex].name, prMSeq->sqinfo[iRealIndex].len, ppcClusterSplits[iRealIndex]);
1275             }
1276         }
1277         fclose(pfClust); pfClust = NULL;
1278         /* if there is no pcClusterFile then no memory will have been allocated */
1279         CKFREE(ppcClusterSplits[0]);
1280         CKFREE(ppcClusterSplits);
1281     } /* there was a request to write out clustering */
1282
1283 #if PRINT_CLUSTER_DISTRIBUTION
1284     Log(&rLog, LOG_FORCED_DEBUG, "Bisecting Kmeans returned %d clusters", prKMeansResult->iNClusters);
1285     for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1286 #if TRACE
1287         int iD;
1288         Log(&rLog, LOG_FORCED_DEBUG, "Diagnostic output for cluster %d follows:", iI);
1289         fprintf(stderr, "DEBUG(%s|%s():%d): center coordinates =",
1290                 __FILE__, __FUNCTION__, __LINE__);
1291         for (iD=0; iD<iNumSeeds; iD++) {
1292             fprintf(stderr, " %f", prKMeansResult->ppdClusterCenters[iI][iD]);
1293         }
1294         fprintf(stderr, "\n");
1295         fflush(stderr);
1296 #endif
1297         Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d has %d objects assigned",
1298                   iI, prKMeansResult->piNObjsPerCluster[iI]);
1299 #if TRACE
1300         for (iJ=0; iJ<prKMeansResult->piNObjsPerCluster[iI]; iJ++) {
1301             int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][iJ];
1302
1303             Log(&rLog, LOG_FORCED_DEBUG, "Cluster %u: object %u has index %u (= seq %s)",
1304                       iI, iJ,  iRealIndex, prMSeq->sqinfo[iRealIndex].name);
1305         }
1306 #endif
1307     }
1308 #endif
1309
1310
1311     /* Cluster pre-clusters produced by k-means.
1312      *
1313      * Do this by calculating the vector distances of the cluster
1314      * centers and applying UPGMA.
1315      *
1316      * @note could try to force-balance the tree here
1317      *
1318      */
1319     if (0 != NewSymMatrix(&prPreClusterDistmat,
1320                      prKMeansResult->iNClusters, prKMeansResult->iNClusters)) {
1321         Log(&rLog, LOG_FATAL, "%s", "Memory allocation for pre-cluster distance-matrix failed");
1322     }
1323     for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1324         for (iJ=iI+1; iJ<prKMeansResult->iNClusters; iJ++) {
1325             double dDist;
1326             dDist = EuclDist(prKMeansResult->ppdClusterCenters[iI],
1327                              prKMeansResult->ppdClusterCenters[iJ],
1328                              iNumSeeds);
1329             SymMatrixSetValue(prPreClusterDistmat, iI, iJ, dDist);
1330             /* Log(&rLog, LOG_FORCED_DEBUG, "Euclidean distance between clusters %d and %d = %f",
1331                iI, iJ, dDist); */
1332         }
1333     }
1334
1335
1336     /* labels needed for the guide tree building routine only */
1337     ppcLabels = (char **) CKMALLOC(prKMeansResult->iNClusters * sizeof(char*));
1338     for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1339         ppcLabels[iI] = (char *) CKMALLOC(32 * sizeof(char));
1340         (void) snprintf(ppcLabels[iI], 32, "Subcluster-%u", iI);
1341     }
1342 #if TRACE
1343     Log(&rLog, LOG_FORCED_DEBUG, "%s", "Distance matrix for pre-cluster centers:");
1344     SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL, FALSE);
1345 #endif
1346
1347     GuideTreeUpgma(prMbedTree_p,
1348                    ppcLabels, prPreClusterDistmat, NULL);
1349
1350     for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1351         CKFREE(ppcLabels[iI]);
1352     }
1353     CKFREE(ppcLabels);
1354 #if TRACE
1355     Log(&rLog, LOG_FORCED_DEBUG, "%s", "Cluster-center guide-tree:");
1356     LogTree(*prMbedTree_p, stderr);
1357 #endif
1358
1359
1360
1361     /* Now replace each leaf in the pre-cluster-center tree
1362      * appropriately, i.e. with the corresponding sub-cluster.
1363      *
1364      * For each leaf/sub-cluster, create a distance matrix for the
1365      * corresponding sequences. Use distances between vectors as
1366      * approximated distances between sequences. Then create a
1367      * guide-tree by applying UPGMA.
1368      *
1369      */
1370
1371     /* Get a mapping of (pre)cluster number and leaf-node indices in
1372      * the cluster-center tree. We can add trees to prMbedTree_p
1373      * because AppendTrees() guarantees that no other than the node to
1374      * append to changes.
1375      */
1376     piClusterToTreeNode = (int*)
1377         CKMALLOC(prKMeansResult->iNClusters * sizeof(int));
1378     iNodeIndex = FirstDepthFirstNode(*prMbedTree_p);
1379     do {
1380         if (IsLeaf(iNodeIndex, *prMbedTree_p)) {
1381             int iLeafId = GetLeafId(iNodeIndex, *prMbedTree_p);
1382             piClusterToTreeNode[iLeafId] = iNodeIndex;
1383         }
1384         iNodeIndex =  NextDepthFirstNode(iNodeIndex, *prMbedTree_p);
1385     } while (NULL_NEIGHBOR != iNodeIndex);
1386
1387
1388
1389
1390     /* Now step through all the leafs and replace them with the
1391      * corresponding sub-trees
1392      */
1393     NewProgress(&prSubClusterDistanceProgress, LogGetFP(&rLog, LOG_INFO),
1394                 "Distance calculation within sub-clusters", bPrintCR);
1395     /* for each cluster */
1396     for (iClusterIndex=0;
1397          iClusterIndex < prKMeansResult->iNClusters; iClusterIndex++) {
1398         /* distance matrix for the sub-cluster */
1399         symmatrix_t *prWithinClusterDistances = NULL;
1400         int iNSeqInCluster;
1401         tree_t *prSubClusterTree = NULL;
1402
1403         ProgressLog(prSubClusterDistanceProgress,
1404                     iClusterIndex, prKMeansResult->iNClusters, FALSE);
1405
1406 #if FULL_WITHIN_CLUSTER_DISTANCES
1407         mseq_t *prSubClusterMSeq;
1408         int iPairDistType;
1409         int iSeqIndex;
1410         int iOldLogLevel;
1411
1412         Log(&rLog, LOG_DEBUG,
1413             "%s\n", "Calling new Mbed use makes only sense if nseq>MAX_ALLOWED_SEQ_PER_PRECLUSTER");
1414
1415         if (TRUE == prMSeq->aligned)  {
1416             if (SEQTYPE_PROTEIN == prMSeq->seqtype){
1417                 iPairDistType = PAIRDIST_SQUIDID_KIMURA;
1418             }
1419             else {
1420                 iPairDistType = PAIRDIST_SQUIDID;
1421             }
1422         } else {
1423             iPairDistType = PAIRDIST_KTUPLE;
1424         }
1425 #endif
1426
1427         iNSeqInCluster = prKMeansResult->piNObjsPerCluster[iClusterIndex];
1428 #if TRACE
1429         Log(&rLog, LOG_FORCED_DEBUG, "#seqs in subcluster no %d = %d",
1430                   iClusterIndex, iNSeqInCluster);
1431 #endif
1432
1433 #if FULL_WITHIN_CLUSTER_DISTANCES
1434         /* create an mseq structure for sequences in this cluster
1435          * don't need most of the members (e.g. orig_seq) but copy
1436          * them anyway for the sake of completeness
1437          */
1438         NewMSeq(&prSubClusterMSeq);
1439
1440         prSubClusterMSeq->nseqs = iNSeqInCluster;
1441         prSubClusterMSeq->seqtype = prMSeq->seqtype;
1442         if (NULL!=prMSeq->filename) {
1443             prSubClusterMSeq->filename = CkStrdup(prMSeq->filename);
1444         }
1445         prSubClusterMSeq->aligned = prMSeq->aligned; /* FS, r252 */
1446         prSubClusterMSeq->seq =  (char **)
1447             CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
1448         prSubClusterMSeq->orig_seq =  (char **)
1449             CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
1450         prSubClusterMSeq->sqinfo =  (SQINFO *)
1451             CKMALLOC(prSubClusterMSeq->nseqs * sizeof(SQINFO));
1452
1453         for (iSeqIndex=0; iSeqIndex<iNSeqInCluster; iSeqIndex++) {
1454             int iRealSeqIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iSeqIndex];
1455             prSubClusterMSeq->seq[iSeqIndex] = CkStrdup(prMSeq->seq[iRealSeqIndex]);
1456             prSubClusterMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeq->orig_seq[iRealSeqIndex]);
1457             SeqinfoCopy(&prSubClusterMSeq->sqinfo[iSeqIndex], &prMSeq->sqinfo[iRealSeqIndex]);
1458 #if TRACE
1459             Log(&rLog, LOG_DEBUG, "seq no %d in cluster %d is %s (real index = %d)",
1460                 iSeqIndex, iClusterIndex, prSubClusterMSeq->sqinfo[iSeqIndex].name,
1461                 iRealSeqIndex);
1462 #endif
1463         }
1464 #endif
1465
1466
1467         /* Create a distance matrix for this sub-cluster
1468          * (prWithinClusterDistances) by using the vector distances or
1469          * ktuple distances.
1470          *
1471          * Then apply UPGMA to get a subcluster tree
1472          * (prSubClusterTree) and append created tree to the
1473          * pre-cluster-tree (prMbedTree_p)
1474          *
1475          */
1476 #if FULL_WITHIN_CLUSTER_DISTANCES
1477
1478         iOldLogLevel = rLog.iLogLevelEnabled;
1479         rLog.iLogLevelEnabled = LOG_WARN;
1480         /* compute distances, but be quiet */
1481         /* 4th argument (FALSE) is bPercID, in mBed mode never use percent-identities */
1482         if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType, FALSE,
1483                           0, prSubClusterMSeq->nseqs, 0, prSubClusterMSeq->nseqs,
1484                           NULL, NULL)) {
1485             Log(&rLog, LOG_ERROR, "Couldn't compute pair distances");
1486             return -1;
1487         }
1488         rLog.iLogLevelEnabled = iOldLogLevel;
1489
1490 #if COMPUTE_WITHIN_SUBCLUSTER_AVERAGE
1491         {
1492             double dSum = 0.0;
1493             for (iI=0; iI<prSubClusterMSeq->nseqs; iI++) {
1494                 for (iJ=iI+1; iJ<prSubClusterMSeq->nseqs; iJ++) {
1495                     dSum += SymMatrixGetValue(prWithinClusterDistances, iI, iJ);
1496                 }
1497             }
1498             Log(&rLog, LOG_FORCED_DEBUG,
1499                 "mean pair-wise distance within subcluster %d of %d = %f",
1500                 iClusterIndex, prKMeansResult->iNClusters, dSum/prSubClusterMSeq->nseqs);
1501         }
1502 #endif
1503
1504 #else
1505         if (NewSymMatrix(&prWithinClusterDistances, iNSeqInCluster, iNSeqInCluster)!=0) {
1506             Log(&rLog, LOG_FATAL, "%s", "Memory allocation for disparity matrix failed");
1507         }
1508         for (iI=0; iI<iNSeqInCluster; iI++) {
1509             int iRealIndexI = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
1510
1511             for (iJ=iI+1; iJ<iNSeqInCluster; iJ++) {
1512                 int iRealIndexJ = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iJ];
1513                 double dist;
1514
1515                 /* Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d: compute distance between %d:%s and %d:%s",
1516                    iClusterIndex, i, prMSeq->sqinfo[iRealIndexI].name,
1517                    iJ, prMSeq->sqinfo[iRealIndexJ].name); */
1518
1519                 if (1 == USE_EUCLIDEAN_DISTANCE) {
1520                     dist = EuclDist(ppdSeqVec[iRealIndexI],
1521                                     ppdSeqVec[iRealIndexJ], iNumSeeds);
1522                 } else {
1523                     dist = CosDist(ppdSeqVec[iRealIndexI],
1524                                    ppdSeqVec[iRealIndexJ], iNumSeeds);
1525                 }
1526                 SymMatrixSetValue(prWithinClusterDistances, iI, iJ, dist);
1527             }
1528         }
1529 #endif
1530
1531         /* labels needed for the guide tree building routine only */
1532         ppcLabels = (char**) CKMALLOC(iNSeqInCluster * sizeof(char*));
1533         for (iI=0; iI<iNSeqInCluster; iI++) {
1534 #if FULL_WITHIN_CLUSTER_DISTANCES
1535             ppcLabels[iI] = prSubClusterMSeq->sqinfo[iI].name;
1536 #else
1537             int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
1538             ppcLabels[iI] = prMSeq->sqinfo[iRealIndex].name;
1539 #endif
1540         }
1541 #if TRACE
1542         Log(&rLog, LOG_FORCED_DEBUG, "Distance matrix for seqs within sub cluster %d/%d",
1543                   iClusterIndex, prKMeansResult->iNClusters);
1544         SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL, FALSE);
1545 #endif
1546
1547
1548         GuideTreeUpgma(&prSubClusterTree, ppcLabels,
1549                        prWithinClusterDistances, NULL);
1550
1551         CKFREE(ppcLabels); /* don't free members, they just point */
1552 #if 0
1553         Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d guide-tree:", iClusterIndex);
1554         LogTree(prSubClusterTree);
1555 #endif
1556
1557
1558         /* The guide tree id's (that point to the sequences) now start
1559          * from 0, i.e. the association with the prMSeq numbering is
1560          * broken and fixed in the following
1561          */
1562         for (iNodeIndex = 0; iNodeIndex < (int)GetNodeCount(prSubClusterTree); iNodeIndex++) {
1563             if (IsLeaf(iNodeIndex, prSubClusterTree)) {
1564                 int iLeafId = GetLeafId(iNodeIndex, prSubClusterTree);
1565                 int iRealId = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iLeafId];
1566 #if 0
1567                 Log(&rLog, LOG_FORCED_DEBUG, "Correcting leaf node %d which has (wrong) id %d and name %s to id %d (prMSeq name %s)",
1568                           iNodeIndex, iLeafId,
1569                           GetLeafName(iNodeIndex, prSubClusterTree),
1570                           iRealId, prMSeq->sqinfo[iRealId].name);
1571 #endif
1572                 SetLeafId(prSubClusterTree, iNodeIndex, iRealId);
1573             }
1574         }
1575
1576
1577         /* Append the newly created tree (prSubClusterTree) to the
1578          * corresponding node index of prMbedTree_p.
1579          */
1580 #if TRACE
1581         Log(&rLog, LOG_FORCED_DEBUG, "Will join trees at leaf node %d = %s",
1582                   piClusterToTreeNode[iClusterIndex],
1583                   GetLeafName(piClusterToTreeNode[iClusterIndex], *prMbedTree_p));
1584 #endif
1585
1586         AppendTree(*prMbedTree_p,
1587                    piClusterToTreeNode[iClusterIndex],
1588                    prSubClusterTree);
1589         /* Note: piClusterToTreeNode is still valid, because
1590          * AppendTrees() guarantees that no other than the node to
1591          * append to changes. */
1592
1593 #if 0
1594         Log(&rLog, LOG_FORCED_DEBUG, "%s", "prMbedTree_p after cluster %d has appended:", iClusterIndex);
1595         LogTree(*prMbedTree_p);
1596
1597         if (0) {
1598             char fname[] = "mbed-joined-tree.dnd";
1599             FILE *pfOut;
1600             if (NULL == (pfOut = fopen(fname, "w"))) {
1601                 Log(&rLog, LOG_FATAL, "Couldn't open %s for writing", fname);
1602             }
1603             MuscleTreeToFile(pfOut, *prMbedTree_p);
1604             Log(&rLog, LOG_FORCED_DEBUG, "Joined tree written to %s", fname);
1605             fclose(pfOut);
1606             Log(&rLog, LOG_FATAL, "DEBUG EXIT");
1607         }
1608 #endif
1609
1610         /* cleanup
1611          */
1612         FreeMuscleTree(prSubClusterTree);
1613         FreeSymMatrix(&prWithinClusterDistances);
1614 #if FULL_WITHIN_CLUSTER_DISTANCES
1615         FreeMSeq(&prSubClusterMSeq);
1616 #endif
1617     } /* end for each cluster */
1618     ProgressDone(prSubClusterDistanceProgress);
1619     FreeProgress(&prSubClusterDistanceProgress);
1620
1621
1622     if (NULL != pcGuidetreeOut) {
1623         if (NULL == (pfOut = fopen(pcGuidetreeOut, "w"))) {
1624             Log(&rLog, LOG_ERROR, "Couldn't open %s for writing", pcGuidetreeOut);
1625         } else {
1626             MuscleTreeToFile(pfOut, *prMbedTree_p);
1627             Log(&rLog, LOG_INFO, "Guide tree written to %s", pcGuidetreeOut);
1628             (void) fclose(pfOut);
1629         }
1630     }
1631
1632
1633     /* cleanup
1634      *
1635      */
1636 #if MBED_TIMING
1637     StopwatchStop(stopwatch);
1638     StopwatchDisplay(stdout, "mBed time (without pairwise distance computation): ", stopwatch);
1639     StopwatchFree(stopwatch);
1640 #endif
1641
1642     FreeKMeansResult(&prKMeansResult);
1643     FreeSymMatrix(&prPreClusterDistmat);
1644     for (iI=0; iI<prMSeq->nseqs; iI++) {
1645         CKFREE(ppdSeqVec[iI]);
1646     }
1647     CKFREE(ppdSeqVec);
1648     CKFREE(piClusterToTreeNode);
1649
1650 #ifndef NDEBUG
1651     TreeValidate(*prMbedTree_p);
1652 #endif
1653
1654     return 0;
1655 }
1656 /***   end: Mbed()   ***/