binaries/src/clustalo/src/clustal/mbed.c

   1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4;  indent-tabs-mode: nil -*- */
   2
   3 /*********************************************************************
   4  * Clustal Omega - Multiple sequence alignment
   5  *
   6  * Copyright (C) 2010 University College Dublin
   7  *
   8  * Clustal-Omega is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU General Public License as
  10  * published by the Free Software Foundation; either version 2 of the
  11  * License, or (at your option) any later version.
  12  *
  13  * This file is part of Clustal-Omega.
  14  *
  15  ********************************************************************/
  16
  17 /*
  18  *  RCS $Id: mbed.c 254 2011-06-21 13:07:50Z andreas $
  19  *
  20  *
  21  * Reimplementation from scratch of mBed:
  22  * Blackshields et al. (2010); PMID 20470396
  23  *
  24  *
  25  */
  26
  27 #ifdef HAVE_CONFIG_H
  28 #include "config.h"
  29 #endif
  30
  31 #include <assert.h>
  32 #include <float.h>
  33
  34 #include "mbed.h"
  35 #include "pair_dist.h"
  36 #include "symmatrix.h"
  37 #include "ktuple_pair.h"
  38 #include "tree.h"
  39 #include "util.h"
  40 #include "progress.h"
  41 #include "queue.h"
  42
  43 #include "log.h"
  44 #include "kmpp/KMeans.h"
  45 #include "mbed.h"
  46
  47 #define TIMING 0
  48 #if TIMING
  49 #include "squid/stopwatch.h"
  50 #endif
  51
  52
  53 /* If FULL_WITHIN_CLUSTER_DISTANCES is not 0, distances within each
  54  * bisecting kmeans subcluster are not estimated using the vectors,
  55  * but calculated normally (using ktuple or kimura). Surprisingly this
  56  * results in 3% loss on a Homfam p24-h2010-08-09 subset (100-5000
  57  * seqs in test, at least 5 ref seqs; MAX_SEQ 100 vs 10000; NUM_SEEDS
  58  * log2 instead of log2^2). And of course it slows things down.
  59  */
  60 #define FULL_WITHIN_CLUSTER_DISTANCES 1
  61
  62 #define COMPUTE_WITHIN_SUBCLUSTER_AVERAGE 0
  63
  64 /* Cluster size limits. Maximum is a soft limit, which might be
  65  * exceeded if a K-Means split was unsuccesful, where unsuccesful
  66  * might also mean that the minimum required number seqs. was not
  67  * reached */
  68 #if FULL_WITHIN_CLUSTER_DISTANCES
  69 static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 100;
  70 static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 1;
  71 #else
  72 static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 10000;
  73 static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 100;
  74 #endif
  75
  76 /* How many restarts per bisecting kmeans split. 10 give 0.5% increase
  77  * in quality on original HOMFAM over just 1. It also increases kmeans
  78  * time by a factor of ~3, but overall time is insignificant
  79  * compared to pairdist/progressive alignment part.
  80  */
  81 static const int RESTARTS_PER_SPLIT = 10;
  82
  83
  84 /* Use standard kmeans (lloyds) or kmeans++. Both are almost
  85  * indistinguishable here, although kmeans++ is slightly ahead the
  86  * fewer seeds you pick (and it should be better in theory)
  87  */
  88 #define USE_KMEANS_LLOYDS 0
  89
  90
  91 #ifndef HAVE_LOG2
  92 #define log2(x)  (log(x) / 0.69314718055994530942)
  93 #endif
  94 #define NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2)
  95
  96
  97 /* Seed selection method: SAMPLE_SEEDS_BY_LENGTH is the original mBed
  98  * approach: Sample iSeeds sequence with constant stride from length-sorted X.
  99  * It might be better to pick the seeds randomly, because length sorting might
 100  * be more prone to including outliers (e.g. very long and very short seqs).
 101  * However, we could never observer such a thing. So just stick to the
 102  * original version as this also removes the random element.
 103  */
 104 enum SEED_SELECTION_TYPE {
 105     SELECT_SEEDS_RANDOMLY,
 106     SELECT_SEEDS_BY_LENGTH
 107 };
 108 #define SEED_SELECTION SELECT_SEEDS_BY_LENGTH
 109
 110
 111 /* Tests on BAliBase (RV11,12,20,30,40,50; 10 runs each) show there is
 112  * no difference between mbed-trees created from cosine or euclidean
 113  * distances (simple version, just using disparities).
 114  */
 115 #define USE_EUCLIDEAN_DISTANCE 1
 116
 117
 118 /* print some mbed pre-cluster usage to screen */
 119 #define PRINT_CLUSTER_DISTRIBUTION 0
 120
 121
 122 #define TRACE 0
 123
 124
 125 typedef struct {
 126     /* Number of final clusters
 127      */
 128     int iNClusters;
 129
 130     /* Coordinates (columns) for each cluster (rows)
 131      * valid indices: [0...iNClusters][0...dim-1]
 132      */
 133     double **ppdClusterCenters;
 134
 135     /* Dimensionality of input data and cluster center coordinates
 136      */
 137     int iDim;
 138
 139     /* Number of objects per cluster
 140      */
 141     int *piNObjsPerCluster;
 142
 143     /* Objects (indices) for each cluster. [i][j] will point to (index
 144      * of) object j in cluster i
 145      */
 146     int **ppiObjIndicesPerCluster;
 147 } bisecting_kmeans_result_t;
 148
 149
 150
 151 /**
 152  * @brief Free KMeans result structure.
 153  *
 154  * @param[out]  prResult_p
 155  * K-Means result to free
 156  *
 157  * @see NewKMeansResult()
 158  */
 159 void
 160 FreeKMeansResult(bisecting_kmeans_result_t **prResult_p)
 161 {
 162     int iAux;
 163     CKFREE((*prResult_p)->piNObjsPerCluster);
 164     for (iAux=0; iAux<(*prResult_p)->iNClusters; iAux++) {
 165         CKFREE((*prResult_p)->ppiObjIndicesPerCluster[iAux]);
 166         CKFREE((*prResult_p)->ppdClusterCenters[iAux]);
 167     }
 168     CKFREE((*prResult_p)->ppiObjIndicesPerCluster);
 169     CKFREE((*prResult_p)->ppdClusterCenters);
 170     (*prResult_p)->iNClusters = 0;
 171     (*prResult_p)->iDim = 0;
 172     CKFREE(*prResult_p);
 173 }
 174 /***   end: FreeKMeansResult()   ***/
 175
 176
 177
 178 /**
 179  * @brief Allocate new KMeans result
 180  *
 181  * @param[out] prKMeansResult_p
 182  * K-Means result to free
 183  *
 184  * @see NewKMeansResult()
 185  */
 186 void
 187 NewKMeansResult(bisecting_kmeans_result_t **prKMeansResult_p)
 188 {
 189     (*prKMeansResult_p) = (bisecting_kmeans_result_t *)
 190         CKCALLOC(1, sizeof(bisecting_kmeans_result_t));
 191     (*prKMeansResult_p)->iNClusters = 0;
 192     (*prKMeansResult_p)->iDim = 0;
 193     (*prKMeansResult_p)->ppdClusterCenters = NULL;
 194     (*prKMeansResult_p)->piNObjsPerCluster = NULL;
 195     (*prKMeansResult_p)->ppiObjIndicesPerCluster = NULL;
 196
 197 }
 198 /***   end: NewKMeansResult()   ***/
 199
 200
 201
 202 /**
 203  * @brief Calculate the euclidean distance between two vectors
 204  *
 205  * @param[in] v1
 206  * First vector with dim dimensions
 207  * @param[in] v2
 208  * Second vector with dim dimensions
 209  * @param[in] dim
 210  * Dimensionality of v1 and v2
 211  *
 212  * @return euclidean distance as double
 213  *
 214  * @note Could probably be inlined. Also perfect for SSE
 215  */
 216 double
 217 EuclDist(const double *v1, const double *v2, const int dim)
 218 {
 219     int i; /* aux */
 220     double dist; /* returned distance */
 221
 222     assert(v1!=NULL);
 223     assert(v2!=NULL);
 224
 225     dist = 0.0;
 226     for (i=0; i<dim; i++) {
 227         dist += pow(v1[i]-v2[i], 2.0);
 228     }
 229
 230     return sqrt(dist);
 231 }
 232 /***   end: EuclDist()   ***/
 233
 234
 235
 236 /**
 237  * @brief Calculate the cosine distance between two vectors
 238  *
 239  * @param[in] v1
 240  * First vector with dim dimensions
 241  * @param[in] v2
 242  * Second vector with dim dimensions
 243  * @param[in] dim
 244  * Dimensionality of v1 and v2
 245  *
 246  * @return cosine distance as double
 247  *
 248  * @note Could probably be inlined. Also perfect for SSE
 249  */
 250 double
 251 CosDist(const double *v1, const double *v2, const int dim)
 252 {
 253     int i; /* aux */
 254     double dist; /* returned distance */
 255     double s, sq1, sq2;
 256
 257     assert(v1!=NULL);
 258     assert(v2!=NULL);
 259
 260     s = 0.0;
 261     sq1 = 0.0;
 262     sq2 = 0.0;
 263     for (i=0; i<dim; i++) {
 264         s += (v1[i]*v2[i]);
 265         sq1 += pow(v1[i], 2.0);
 266         sq2 += pow(v2[i], 2.0);
 267     }
 268     sq1 = sqrt(sq1);
 269     sq2 = sqrt(sq2);
 270
 271     if ((sq1 * sq2) < DBL_EPSILON) {
 272         dist = 1 - s / (sq1 * sq2);
 273     } else {
 274         /* if the denominator gets small, the fraction gets big, hence dist
 275            gets small: */
 276         dist = 0.0;
 277     }
 278     return dist;
 279 }
 280 /***   end: CosDist()   ***/
 281
 282
 283
 284 /**
 285  * @brief convert sequences into mbed-like (distance) vector
 286  * representation. Seeds (prMSeq sequence indices) have to be picked before
 287  *
 288  * @param[out] ppdSeqVec
 289  * Pointer to preallocated matrix of size nseqs x iSeeds
 290  * @param[in] prMSeq
 291  * Sequences which are to be converted
 292  * @param[in] piSeeds
 293  * Array of sequences in prMSeq which are to be used as seeds.
 294  * @param[in] iNumSeeds
 295  * Number of seeds/elements in piSeeds
 296  * @param[in] iPairDistType
 297  * Type of pairwise distance comparison
 298  *
 299  */
 300 int
 301 SeqToVec(double **ppdSeqVec, mseq_t *prMSeq,
 302          int *piSeeds, const int iNumSeeds,
 303          const int iPairDistType)
 304 {
 305     symmatrix_t *prDistmat = NULL;
 306     int iSeqIndex;
 307     int iSeedIndex;
 308    /* indices for restoring order */
 309     int *restore;
 310     /* sorted copy of piSeeds */
 311     int *piSortedSeeds;
 312
 313 #if TIMING
 314     Stopwatch_t *stopwatch = StopwatchCreate();
 315     StopwatchZero(stopwatch);
 316     StopwatchStart(stopwatch);
 317 #endif
 318
 319     assert(NULL != ppdSeqVec);
 320     assert(iNumSeeds>0 && iNumSeeds<=prMSeq->nseqs);
 321
 322     piSortedSeeds = CKMALLOC(iNumSeeds * sizeof(int));
 323     memcpy(piSortedSeeds, piSeeds, iNumSeeds*sizeof(int));
 324
 325     /* need them sorted, otherwise the swapping approach below might
 326      * break
 327      */
 328     qsort(piSortedSeeds, iNumSeeds, sizeof(int), IntCmp);
 329
 330
 331     /* rearrange mseq order so that we can use ktuple_pairdist code as
 332      * is. That is, swap seeds and non-seeds such that the seeds end
 333      * up in front of mseq. This way we can use the KTuplePairDist()
 334      * code, without making a copy of mseq. Also, keep an array of
 335      * indices which serves to restore the original sequence order
 336      * after putting the seeds in front
 337      *
 338      */
 339     restore = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
 340     for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 341         restore[iSeqIndex] = iSeqIndex;
 342     }
 343     for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
 344 #if TRACE
 345         Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %u", piSortedSeeds[iSeedIndex], iSeedIndex);
 346 #endif
 347         if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
 348             int swap;
 349             SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
 350
 351             swap = restore[iSeedIndex];
 352             restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
 353             restore[piSortedSeeds[iSeedIndex]] = swap;
 354         }
 355     }
 356 #if TRACE
 357     printf("DEBUG(%s|%s():%d): restore indices =",
 358            __FILE__, __FUNCTION__, __LINE__);
 359     for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 360         printf(" %u:%u", iSeqIndex, restore[iSeqIndex]);
 361     }
 362     printf("\n");
 363
 364     for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 365         Log(&rLog, LOG_FORCED_DEBUG, "swapped seq no %d now:  seq = %s",
 366                   iSeqIndex, prMSeq->sqinfo[iSeqIndex].name);
 367     }
 368 #endif
 369
 370
 371     /* convert sequences into vectors of distances by calling pairwise
 372      * distance function for each seq/seed pair
 373      *
 374      */
 375     if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType,
 376                            0, iNumSeeds, 0, prMSeq->nseqs,
 377                            NULL, NULL)) {
 378         Log(&rLog, LOG_ERROR, "Could not compute pairwise distances for mbed.");
 379         FreeSymMatrix(&prDistmat);
 380         CKFREE(piSortedSeeds);
 381         CKFREE(restore);
 382         return -1;
 383     }
 384 #if TRACE
 385     {
 386         char **labels;
 387         labels = (char **) CKMALLOC(prMSeq->nseqs * sizeof(char *));
 388         for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 389             labels[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].name;
 390         }
 391         SymMatrixPrint(prDistmat, labels, NULL);
 392         CKFREE(labels);
 393     }
 394 #endif
 395
 396
 397     /* update ppdSeqVec according to prDistmat. keep in mind that we
 398      * changed mseq order
 399      *
 400      */
 401     for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 402         for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
 403             ppdSeqVec[restore[iSeqIndex]][iSeedIndex] =
 404                 SymMatrixGetValue(prDistmat, iSeqIndex, iSeedIndex);
 405         }
 406     }
 407
 408
 409     /* restore mseq order by reverse swapping
 410      *
 411      * need reverse order now, so start from top index
 412      */
 413     iSeedIndex=iNumSeeds-1;
 414     do {
 415 #if TRACE
 416         Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %d", piSortedSeeds[iSeedIndex], iSeedIndex);
 417 #endif
 418         if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
 419             int swap;
 420
 421             SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
 422
 423             swap = restore[iSeedIndex];
 424             restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
 425             restore[piSortedSeeds[iSeedIndex]] = swap;
 426         }
 427     } while (0 != iSeedIndex--);
 428 #if TRACE
 429      for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 430          Log(&rLog, LOG_FORCED_DEBUG, "restored seq no %d:  seq = %s %s",
 431                    iSeqIndex, prMSeq->sqinfo[iSeqIndex].name, prMSeq->seq[iSeqIndex]);
 432      }
 433 #endif
 434
 435 #if TRACE
 436     for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 437         printf("DEBUG: seq %-4u as distance vector =", iSeqIndex);
 438         for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
 439             printf(" %f", ppdSeqVec[iSeqIndex][iSeedIndex]);
 440         }
 441         printf("\n");
 442     }
 443 #endif
 444
 445
 446     FreeSymMatrix(&prDistmat);
 447     CKFREE(restore);
 448 #if TIMING
 449     StopwatchStop(stopwatch);
 450     StopwatchDisplay(stdout, "Total time for SeqToVec(): ", stopwatch);
 451     StopwatchFree(stopwatch);
 452 #endif
 453
 454
 455     return 0;
 456 }
 457 /***   end: SeqToVec()   ***/
 458
 459
 460
 461 /**
 462  * @brief Select seeds to be used from an prMSeq
 463  *
 464  * @param[out] piSeeds
 465  * Will store the indices of prMSeqs seqs used to be as seeds here. Must be preallocated.
 466  * @param[in] iNumSeeds
 467  * Number of seeds to be picked
 468  * @param[in] iSelectionMethod
 469  * Seed selection method to be used
 470  * @param[in] prMSeq
 471  * The prMSeq structure to pick sequences from
 472  *
 473  * @return: Non-zero on failure
 474  *
 475  */
 476 int
 477 SeedSelection(int *piSeeds, int iNumSeeds, int iSelectionMethod,  mseq_t *prMSeq)
 478 {
 479     /* seed iterator */
 480     int iSeedIndex;
 481     /* sequence iterator */
 482     int iSeqIndex;
 483
 484     if (SELECT_SEEDS_RANDOMLY == iSelectionMethod) {
 485         int *piPermArray;
 486
 487         Log(&rLog, LOG_INFO,
 488              "Using %d seeds (randomly chosen) for mBed (from a total of %d sequences)",
 489              iNumSeeds, prMSeq->nseqs);
 490
 491         PermutationArray(&piPermArray, iNumSeeds);
 492         /* copy to piSeeds */
 493         for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
 494             piSeeds[iSeedIndex] = piPermArray[iSeedIndex];
 495         }
 496         CKFREE(piPermArray);
 497
 498     }  else if (SELECT_SEEDS_BY_LENGTH == iSelectionMethod) {
 499
 500         /* step size for picking with constant stride */
 501         int iStep;
 502         int *piSeqLen =  CKMALLOC(prMSeq->nseqs * sizeof(int));
 503         int *piOrder = CKMALLOC(prMSeq->nseqs * sizeof(int));
 504
 505         Log(&rLog, LOG_INFO,
 506              "Using %d seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of %d sequences)",
 507              iNumSeeds, prMSeq->nseqs);
 508
 509         iStep = prMSeq->nseqs / iNumSeeds; /* iStep will never get too big
 510                                             * due to rounding */
 511         /* first get an array of seq indices order according to
 512            corresponding sequence length: piOrder */
 513         for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 514             piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len;
 515         }
 516         QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, 'd', FALSE);
 517 #if 0
 518         for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 519             Log(&rLog, LOG_FORCED_DEBUG, "Descending order (no %d): seq %d has len %d)",
 520                       iSeqIndex, piOrder[iSeqIndex], piSeqLen[piOrder[iSeqIndex]]);
 521         }
 522 #endif
 523         CKFREE(piSeqLen);
 524         iSeedIndex = 0;
 525         for (iSeqIndex=0; iSeedIndex<iNumSeeds; iSeqIndex+=iStep) {
 526             piSeeds[iSeedIndex++] = piOrder[iSeqIndex];
 527         }
 528         CKFREE(piOrder);
 529
 530     } else {
 531
 532         Log(&rLog, LOG_ERROR, "Internal error: unknown seed selection type");
 533         return -1;
 534     }
 535
 536
 537 #ifdef PICK_SEEDS_FROM_KMEANS_CLUSTERS
 538     /* do initial mbedding and kmeans. then pick seeds from each cluster and
 539        do full mbed with these seeds. idea is that those seeds represent the
 540        sequence space better than random seeds. however, this reduces the
 541        quality slightly. random is almost always better.
 542     */
 543
 544     if (1) {
 545         /* seqs represented as distance vectors */
 546         double **ppdSeqVec;
 547         double dCost = -1.0;
 548         /* assignments for each seq to the K newly created clusters */
 549         int *piKMeansClusterAssignments = CKMALLOC(prMSeq->nseqs * sizeof(int));
 550         double *pdKMeansClusterCenters = CKCALLOC(iNumSeeds * iNumSeeds, sizeof(double));
 551         /* create a copy of ppdSeqVec suitable for KMeans */
 552         double *pdKMeansVectors = CKMALLOC(prMSeq->nseqs * iNumSeeds * sizeof(double));
 553         bool *pbClusterUsed = CKCALLOC(iNumSeeds, sizeof(bool));
 554
 555         Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Experimental feature: K-means on first round embedding to get better seeds");
 556         Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Reuse seeds from clusters");
 557         Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Could try log(n) instead of log(n)^2 in first round");
 558         Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME hardcoded iPairDistType PAIRDIST_KTUPLE");
 559         Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Show that this is fast *and* good");
 560
 561         /*
 562           Log(&rLog, LOG_FORCED_DEBUG, "%s", "Overriding iNumSeeds");
 563           iNumSeeds = (int) pow(log2((double)prMSeq->nseqs), 2);
 564         */
 565
 566         ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
 567         for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 568             ppdSeqVec[iSeqIndex] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
 569         }
 570         if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, PAIRDIST_KTUPLE)) {
 571             Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
 572             return -1;
 573         }
 574
 575         for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 576             int iDim;
 577             for (iDim=0; iDim<iNumSeeds; ++iDim) {
 578                 pdKMeansVectors[iDim*iSeqIndex + iDim] = ppdSeqVec[iSeqIndex][iDim];
 579             }
 580         }
 581
 582
 583         Log(&rLog, LOG_FORCED_DEBUG, "%s\n", "FIXME hardcoded RESTARTS_PER_SPLIT");
 584         dCost = KMeans(prMSeq->nseqs, iNumSeeds, iNumSeeds,
 585                        pdKMeansVectors,
 586                        RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
 587                        pdKMeansClusterCenters, piKMeansClusterAssignments);
 588         Log(&rLog, LOG_FORCED_DEBUG, "Best split cost = %f", dCost);
 589
 590         Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Check for Nan in cluster centers");
 591
 592 #if TRACE
 593         Log(&rLog, LOG_FORCED_DEBUG, "%s", "K-Means output:");
 594         for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 595             Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u (%s) to cluster %d",
 596                       iSeqIndex,  prMSeq->sqinfo[iSeqIndex].name,
 597                       piKMeansClusterAssignments[iSeqIndex]);
 598         }
 599 #endif
 600
 601         Log(&rLog, LOG_FORCED_DEBUG, "FIXME %s", "proof of concept implementation: Pick first sequences from each clusters instead of reusing seeds");
 602         iSeedIndex = 0;
 603         for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
 604             int iAssignedCluster = piKMeansClusterAssignments[iSeqIndex];
 605             if (pbClusterUsed[iAssignedCluster]) {
 606                 continue;
 607             } else {
 608                 /*LOG_DEBUG("Picked seed %d from cluster %d", iSeqIndex,iAssignedCluster);*/
 609                 piSeeds[iSeedIndex++] = iSeqIndex;
 610                 pbClusterUsed[iAssignedCluster] = TRUE;
 611             }
 612         }
 613         CKFREE(pbClusterUsed);
 614
 615         CKFREE(pdKMeansVectors);
 616         CKFREE(pdKMeansClusterCenters);
 617         CKFREE(piKMeansClusterAssignments);
 618
 619     }
 620     /* if (1) */
 621 #endif
 622
 623
 624     if (LOG_DEBUG <= rLog.iLogLevelEnabled) {
 625         for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
 626             Log(&rLog, LOG_DEBUG, "Picked sequence %d (%s) as seed no %d",
 627                  piSeeds[iSeedIndex], prMSeq->sqinfo[piSeeds[iSeedIndex]].name, iSeedIndex);
 628         }
 629     }
 630
 631     return 0;
 632 }
 633 /* end of SeedSelection() */
 634
 635
 636
 637
 638 /**
 639  * @brief Bisecting K-Means clustering. Repeatedly calls K-Means with
 640  * a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster.
 641  *
 642  * @param[out] prKMeansResult_p
 643  * Result of Bisecting KMeans. Will be allocated here.
 644  * Caller has to free. See @see FreeKMeansResult()
 645  * @param[in] iNObjs
 646  * Number of objects/sequences to cluster
 647  * @param[in] iDim
 648  * Dimensionality of input data
 649  * @param[in] ppdVectors
 650  * each row holds iDim points for this object's coordinates
 651  * @param[in] iMinRequiredObjsPerCluster
 652  * Minimum number of objects per Cluster (inclusive)/
 653  * @param[in] iMaxAllowedObjsPerCluster
 654  * Maximum number of objects per Cluster (inclusive). Function returns once no
 655  * cluster contains more then this number of objects. Soft limit!
 656  *
 657  * @note Convoluted code. Could use some restructuring. My apologies.
 658  * AW
 659  *
 660  */
 661 void
 662 BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
 663                 const int iNObjs, const int iDim, double **ppdVectors,
 664                 const int iMinRequiredObjsPerCluster,
 665                 const int iMaxAllowedObjsPerCluster)
 666 {
 667     int iN, iD;
 668     /* cluster centers for each cluster created at each split */
 669     double *pdKClusterCenters;
 670     /* keep track of updated object indices per newly created
 671      * cluster */
 672     int *piCurObjToUpdate;
 673     /* number of times we called 2-means */
 674     int iNRounds;
 675     /* flag for unsuccessful cluster split */
 676     bool bNaNDetected = FALSE;
 677     /* flag for detected small cluster after split  */
 678     bool bSmallClusterDetected;
 679     /* queue of clusters which are to be split */
 680     int_queue_t rClusterSplitQueue;
 681
 682 #if TIMING
 683     Stopwatch_t *stopwatch = StopwatchCreate();
 684 #endif
 685
 686
 687     piCurObjToUpdate = (int *) CKMALLOC(2 * sizeof(int));
 688
 689     NewKMeansResult(prKMeansResult_p);
 690
 691     /* new cluster centers created at each split/KMeans run
 692      */
 693     pdKClusterCenters = (double *) CKCALLOC(2 * iDim, sizeof(double));
 694
 695     /* init results by setting a first cluster that contains all objects
 696      */
 697     (*prKMeansResult_p)->iNClusters = 1;
 698     (*prKMeansResult_p)->iDim = iDim;
 699     /* fake center coordinates of first cluster */
 700     (*prKMeansResult_p)->ppdClusterCenters =
 701         (double **) CKMALLOC(1 * sizeof(double *));
 702     (*prKMeansResult_p)->ppdClusterCenters[0] =
 703         (double *) CKMALLOC(iDim * sizeof(double));
 704     /* objects per cluster */
 705     (*prKMeansResult_p)->piNObjsPerCluster =
 706         (int *) CKMALLOC(1 * sizeof(int));
 707     (*prKMeansResult_p)->piNObjsPerCluster[0] = iNObjs;
 708     /* object indices per cluster */
 709     (*prKMeansResult_p)->ppiObjIndicesPerCluster =
 710         (int **) CKMALLOC(1 * sizeof(int *));
 711     (*prKMeansResult_p)->ppiObjIndicesPerCluster[0] = (int *)
 712         CKMALLOC(iNObjs * sizeof(int));
 713     for (iN=0; iN<iNObjs; iN++) {
 714         (*prKMeansResult_p)->ppiObjIndicesPerCluster[0][iN] = iN;
 715     }
 716
 717
 718
 719     /* Starting with the first cluster that now contains all the
 720      * sequences keep splitting until no cluster contains more than
 721      * iMaxAllowedObjsPerCluster
 722      *
 723      * Keep a queue of clusters (rClusterSplitQueue) to split
 724      *
 725      * At each split values/memory of the just split cluster will be
 726      * reused and exactly one new only allocated.
 727      *
 728      * Example:
 729      * Given the following cluster assignments
 730      * 0 0 0 0 1 1 2 2 2 3 3 3 3 3 3 3 4 4
 731      * and a K-Means split in cluster 3 at |:
 732      * 0 0 0 0 1 1 2 2 2 3 3 3 | 3 3 3 3 4 4
 733      * The result should be this:
 734      * 0 0 0 0 1 1 2 2 2 3 3 3 | 5 5 5 5 4 4
 735      *
 736      *
 737      */
 738     INT_QUEUE_INIT(&rClusterSplitQueue);
 739     if (iNObjs>iMaxAllowedObjsPerCluster) {
 740         /* pish fake first cluster index */
 741         INT_QUEUE_PUSH(&rClusterSplitQueue, 0);
 742     }
 743     iNRounds = 0;
 744     while (! INT_QUEUE_EMPTY(&rClusterSplitQueue)) {
 745         /* assignments for each seq to the K newly created clusters */
 746         int *piKClusterAssignments;
 747         /* number of objects in cluster that is to be split */
 748         int iNObjsInClusterToSplit;
 749         /* coordinates of points in cluster that is to be split
 750          * array of size n*d where [d*i + j] gives coordinate j of point i
 751          */
 752         double *pdVectorsInClusterToSplit;
 753         /* copy of object indices in split cluster */
 754         int *piObjIndicesOfSplitCluster;
 755         /* best cost of kmeans rounds */
 756         double dCost = -1.0;
 757
 758         /*  indices for the two created clusters
 759          */
 760         /* index of cluster to split */
 761         int iClusterToSplot;
 762         /* index of newly created cluster at each round. data for the
 763            other created cluster goes to the one just split,
 764            i.e. (*piClusterToSplot) */
 765         int iNewClusterIdx;
 766
 767 #if TIMING
 768         StopwatchZero(stopwatch);
 769         StopwatchStart(stopwatch);
 770 #endif
 771
 772         INT_QUEUE_POP(&rClusterSplitQueue, &iClusterToSplot);
 773
 774         iNObjsInClusterToSplit = (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot];
 775         piKClusterAssignments = (int *)
 776             CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
 777
 778         pdVectorsInClusterToSplit = (double *)
 779             CKMALLOC(iNObjsInClusterToSplit * iDim * sizeof(double));
 780         for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
 781             for (iD=0; iD<iDim; ++iD) {
 782                 int iThisObjIdx =
 783                     (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN];
 784                 pdVectorsInClusterToSplit[iDim*iN + iD] = ppdVectors[iThisObjIdx][iD];
 785             }
 786         }
 787
 788 #if TRACE
 789         Log(&rLog, LOG_FOCRED_DEBUG, "Round %d: Will split cluster %d which has %d objects",
 790                   iNRounds, iClusterToSplot, iNObjsInClusterToSplit);
 791         fprintf(stderr, "DEBUG(%s|%s():%d): Object indices in cluster to split are:",
 792                 __FILE__, __FUNCTION__, __LINE__);
 793         for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
 794             fprintf(stderr, " %u",
 795                     (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
 796         }
 797         fprintf(stderr, "\n");
 798         (void) fflush(stderr);
 799 #endif
 800
 801 #if TRACE
 802         for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
 803             fprintf(stderr,
 804                     "DEBUG(%s|%s():%d): Coordinate of object %u (real index %u) in cluster to split:",
 805                     __FILE__, __FUNCTION__, __LINE__, iN,
 806                     (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
 807             for (iD=0; iD<iDim; iD++) {
 808                 fprintf(stderr, " %f", pdVectorsInClusterToSplit[iDim*iN + iD]);
 809             }
 810             fprintf(stderr, "\n");
 811             (void) fflush(stderr);
 812         }
 813 #endif
 814
 815        /* KMeans(1 "The number of points in the data set",
 816         *        2 "The number of clusters to look for",
 817         *        3 "The number of dimensions that the data set lives in",
 818         *        4 "points: An array of size n*d where points[d*i + j] gives coordinate j of point i",
 819         *        5 "attempts: The number of times to independently run k-means",
 820         *        6 "use_lloyds_method: uses kmpp if false, otherwise lloyds method",
 821         *        7 "centers: This can either be null or an array of size k*d.
 822         *           In the latter case, centers[d*i + j] will give coordinate j of center i.
 823         *           If the cluster is unused, it will contain NaN instead.",
 824         *        8 "assignments: This can either be null or an array of size n.
 825         *           In the latter case, it will be filled with the cluster that each point is assigned to
 826         *           (an integer between 0 and k-1 inclusive).");
 827         */
 828         dCost = KMeans(iNObjsInClusterToSplit, 2, iDim,
 829                        pdVectorsInClusterToSplit,
 830                        RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
 831                        pdKClusterCenters, piKClusterAssignments);
 832
 833 #if TRACE
 834         Log(&rLog, LOG_FORCED_DEBUG, "%s", "Raw K-Means output:");
 835         for (iN=0; iN<2; iN++) {
 836             fprintf(stderr, "DEBUG(%s|%s():%d): Cluster Center %u =",
 837                     __FILE__, __FUNCTION__, __LINE__, iN);
 838             for (iD=0; iD<iDim; iD++) {
 839                 fprintf(stderr, " %f", pdKClusterCenters[iN*iDim+iD]);
 840             }
 841             fprintf(stderr, "\n");
 842             (void) fflush(stderr);
 843         }
 844         for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
 845             Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u to cluster %d (of #%u)",
 846                       iN,  piKClusterAssignments[iN], 2);
 847         }
 848 #endif
 849
 850         /* real index of one of the newly created clusters. the other
 851          * one is iClusterToSplot */
 852         iNewClusterIdx = (*prKMeansResult_p)->iNClusters;
 853
 854
 855         /* We don't want Clusters which are too small. Check here if a
 856          * split created a small cluster and if yes, discard the
 857          * solution. Because the cluster has already been removed from
 858          * the queue, we can just continue.
 859          */
 860         bSmallClusterDetected = FALSE;
 861         if (iMinRequiredObjsPerCluster>1) {
 862             int iNObjsCluster[2];
 863             iNObjsCluster[0] = 0; /* first cluster */
 864             iNObjsCluster[1] = 0; /* second cluster */
 865             for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
 866                 iNObjsCluster[piKClusterAssignments[iN]]+=1;
 867             }
 868
 869             if (iNObjsCluster[0]<iMinRequiredObjsPerCluster
 870                 ||
 871                 iNObjsCluster[1]<iMinRequiredObjsPerCluster) {
 872                 bSmallClusterDetected = TRUE;
 873                 Log(&rLog, LOG_FORCED_DEBUG, "Skipping this split because objs in 1st/2nd cluster = %d/%d < %d",
 874                           iNObjsCluster[0], iNObjsCluster[1], iMinRequiredObjsPerCluster);
 875             }
 876         }
 877
 878         /* Same logic as for small clusters applies if KMeans couldn't
 879          * split the cluster. In this case one of its center
 880          * coordinates will be NaN, which we check for in the
 881          * following.
 882          */
 883         if (! bSmallClusterDetected) {
 884             for (iN=0; iN<2; iN++) {
 885                 bNaNDetected = FALSE;
 886                 for (iD=0; iD<iDim; iD++) {
 887                     if (0 != isnan(pdKClusterCenters[iN*iDim+iN])) {
 888                         /* Got NaN as coordinate after splitting */
 889                         Log(&rLog, LOG_WARN, "%s(): Can't split cluster no. %d which has %d objects any further. %s",
 890                              __FUNCTION__,
 891                              iClusterToSplot, iNObjsInClusterToSplit,
 892                          "Hope it's not too big and doesn't slow things down."
 893                             );
 894                         bNaNDetected = TRUE;
 895                         break;
 896                     }
 897                 }
 898                 if (bNaNDetected) {
 899                     break;
 900                 }
 901             }
 902         }
 903
 904         /* Discarding split of this cluster. It has been removed from the
 905          * queue already. so just continue
 906          */
 907         if (bNaNDetected || bSmallClusterDetected) {
 908             CKFREE(piKClusterAssignments);
 909             CKFREE(pdVectorsInClusterToSplit);
 910             continue;
 911         }
 912
 913
 914         /* update cluster centers: pdClusterCenters
 915          *
 916          */
 917         /* reuse memory of existing/old/split cluster
 918          */
 919         for (iN=0; iN<iDim; iN++) {
 920             double dCoord = pdKClusterCenters[0*iDim+iN];
 921             (*prKMeansResult_p)->ppdClusterCenters[iClusterToSplot][iN] = dCoord;
 922         }
 923         /* realloc and set new one
 924          */
 925         (*prKMeansResult_p)->ppdClusterCenters = (double **)
 926             CKREALLOC((*prKMeansResult_p)->ppdClusterCenters,
 927                       ((*prKMeansResult_p)->iNClusters+1)  * sizeof(double *));
 928         (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx] = (double *)
 929             CKMALLOC(iDim * sizeof(double));
 930         for (iD=0; iD<iDim; iD++) {
 931             double dCoord = pdKClusterCenters[1*iDim+iD];
 932             (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx][iD] = dCoord;
 933         }
 934 #if TRACE
 935         Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of cluster centers done. Cluster centers so far:");
 936         for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
 937             fprintf(stderr, "DEBUG(%s|%s():%d): Center %u =",
 938                     __FILE__, __FUNCTION__, __LINE__, iN);
 939             for (iD=0; iD<iDim; iD++) {
 940                 fprintf(stderr, " %f", (*prKMeansResult_p)->ppdClusterCenters[iN][iD]);
 941             }
 942             fprintf(stderr, "\n");
 943             (void) fflush(stderr);
 944         }
 945 #endif
 946
 947
 948         /* update #seqs per cluster: piNObjsPerCluster
 949          *
 950          */
 951         (*prKMeansResult_p)->piNObjsPerCluster = (int *)
 952             CKREALLOC((*prKMeansResult_p)->piNObjsPerCluster,
 953                       ((*prKMeansResult_p)->iNClusters+1) * sizeof(int));
 954         /* init new and old one to zero */
 955         (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] = 0;
 956         (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] = 0;
 957         /* now update values */
 958         for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
 959             if (0 == piKClusterAssignments[iN]) {
 960                 (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] += 1;
 961             } else if (1 == piKClusterAssignments[iN]) {
 962                 (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] += 1;
 963             } else {
 964                 /* there used to be code for iK>=2 in r101 */
 965                 Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
 966                       piKClusterAssignments[iN]);
 967             }
 968         }
 969 #if TRACE
 970         Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of NObjs per cluster done:");
 971         for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
 972             Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d contains %d sequences",
 973                       iN, (*prKMeansResult_p)->piNObjsPerCluster[iN]);
 974         }
 975 #endif
 976         /* queue clusters if still they are still too big
 977          */
 978         if ((*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] > iMaxAllowedObjsPerCluster) {
 979             INT_QUEUE_PUSH(&rClusterSplitQueue, iClusterToSplot);
 980         }
 981         if ((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] > iMaxAllowedObjsPerCluster) {
 982             INT_QUEUE_PUSH(&rClusterSplitQueue, iNewClusterIdx);
 983         }
 984
 985
 986
 987         /* update which objects belong to which cluster:
 988          *
 989          * note:  piNObjsPerCluster needs to be already updated
 990          *
 991          */
 992         /* create a copy of the object indices in the split cluster
 993          * (original will be overwritten)
 994          */
 995         piObjIndicesOfSplitCluster = (int *) CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
 996         memcpy(piObjIndicesOfSplitCluster,
 997                (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
 998                iNObjsInClusterToSplit * sizeof(int));
 999
1000         (*prKMeansResult_p)->ppiObjIndicesPerCluster = (int **)
1001             CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster,
1002                       ((*prKMeansResult_p)->iNClusters+1) * sizeof(int *));
1003
1004
1005         (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot] =
1006             (int *) CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
1007                               (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] * sizeof(int));
1008
1009         (*prKMeansResult_p)->ppiObjIndicesPerCluster[iNewClusterIdx] =
1010             (int *) CKMALLOC((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] * sizeof(int));
1011
1012
1013         /* now reassign the object indices to the assigned cluster
1014          */
1015         piCurObjToUpdate[0] = 0;
1016         piCurObjToUpdate[1] = 0;
1017         for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
1018             int iThisObjIdx = piObjIndicesOfSplitCluster[iN];
1019             int iThisClusterAssignment =  piKClusterAssignments[iN];
1020             int iThisOffset = piCurObjToUpdate[iThisClusterAssignment];
1021             int iThisClusterIndex = 0;
1022
1023             if (0 == iThisClusterAssignment) {
1024                 iThisClusterIndex = iClusterToSplot;
1025             } else if (1 == iThisClusterAssignment) {
1026                 iThisClusterIndex = iNewClusterIdx;
1027             } else {
1028                 /* there used to be code for iK>=2 in r101 */
1029                 Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
1030                       piKClusterAssignments[iN]);
1031             }
1032 #if 0
1033             Log(&rLog, LOG_FORCED_DEBUG, "Setting (*prKMeansResult_p)->ppiObjIndicesPerCluster[%d][%d] = %d",
1034                       iThisClusterIndex, iThisOffset, iThisObjIdx);
1035 #endif
1036             (*prKMeansResult_p)->ppiObjIndicesPerCluster[iThisClusterIndex][iThisOffset] = iThisObjIdx;
1037             piCurObjToUpdate[iThisClusterAssignment]+=1;
1038         }
1039         CKFREE(piObjIndicesOfSplitCluster);
1040 #if TRACE
1041         for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
1042             int iObj;
1043             fprintf(stderr, "DEBUG(%s|%s():%d): Objects in cluster %u: ",
1044                     __FILE__, __FUNCTION__, __LINE__, iN);
1045             for (iObj=0; iObj<(*prKMeansResult_p)->piNObjsPerCluster[iN]; iObj++) {
1046                 fprintf(stderr, " %u", (*prKMeansResult_p)->ppiObjIndicesPerCluster[iN][iObj]);
1047             }
1048             fprintf(stderr, "\n");
1049             (void) fflush(stderr);
1050         }
1051 #endif
1052
1053
1054 #if TIMING
1055         StopwatchStop(stopwatch);
1056         StopwatchDisplay(stdout, "Total time after next round in Bisecting-KMeans: ", stopwatch);
1057 #endif
1058
1059
1060         /* finally: increase number of clusters
1061          */
1062         (*prKMeansResult_p)->iNClusters += 1;
1063         iNRounds += 1;
1064         CKFREE(piKClusterAssignments);
1065         CKFREE(pdVectorsInClusterToSplit);
1066
1067     } /* while */
1068     INT_QUEUE_DESTROY(&rClusterSplitQueue);
1069
1070
1071     Log(&rLog, LOG_DEBUG,
1072          "Bisecting K-means finished after %d rounds (no more clusters to split)",
1073          iNRounds);
1074
1075 #if TIMING
1076     StopwatchFree(stopwatch);
1077 #endif
1078
1079     /* @note could use progress/timer */
1080
1081     CKFREE(pdKClusterCenters);
1082     CKFREE(piCurObjToUpdate);
1083
1084     return;
1085 }
1086 /***   end: BisectingKmeans()   ***/
1087
1088
1089
1090 /**
1091  *
1092  * @brief From scratch reimplementation of mBed: Blackshields et al.
1093  * (2010); PMID 20470396.
1094  *
1095  * Idea is a follows:
1096  * - convert sequences into vectors of distances
1097  * - cluster the vectors using k-means
1098  * - cluster each of the k clusters using upgma (used cached distances
1099  *    from above?)
1100  * - join the sub-clusters to create on tree (use UPGMA on k-means
1101  *   medoids)
1102  *
1103  *
1104  * @param[out] prMbedTree_p
1105  * Created upgma tree. will be allocated here. use FreeMuscleTree()
1106  * to free
1107  * @param[in] prMSeq
1108  * Multiple sequences
1109  * @param[in] iPairDistType
1110  * Distance measure for pairwise alignments
1111  * @param[in] pcGuidetreeOut
1112  * Passed down to GuideTreeUpgma()
1113  *
1114  * @return Zero on success, non-zero on error
1115  *
1116  */
1117 int
1118 Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
1119      const char *pcGuidetreeOut)
1120 {
1121     /* number of seeds */
1122     int iNumSeeds;
1123     /* seed indices matching prMSeq */
1124     int *piSeeds;
1125     /* seqs represented as distance vectors */
1126     double **ppdSeqVec;
1127     /* kmeans result */
1128     bisecting_kmeans_result_t *prKMeansResult = NULL;
1129     /* distance matrix of kmeans (pre-)cluster centers */
1130     symmatrix_t *prPreClusterDistmat = NULL;
1131     /* auxiliary for symmetric matrix output; tree routines etc */
1132     char **ppcLabels;
1133     int iNodeIndex;
1134     /* mapping of cluster-center tree node indices to corresponding
1135        cluster */
1136     int *piClusterToTreeNode;
1137     int iClusterIndex;
1138     int iI, iJ;
1139     FILE *pfOut;
1140     progress_t *prSubClusterDistanceProgress;
1141     bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
1142
1143 #if FULL_WITHIN_CLUSTER_DISTANCES
1144     Log(&rLog, LOG_DEBUG, "Computing real distances within subclusters for mBed.");
1145 #else
1146     Log(&rLog, LOG_DEBUG, "Computing vector distances within subclusters for mBed.");
1147 #endif
1148
1149 #if MBED_TIMING
1150     Stopwatch_t *stopwatch = StopwatchCreate();
1151 #endif
1152
1153     assert(NULL != prMbedTree_p);
1154     assert(NULL != prMSeq);
1155
1156
1157     iNumSeeds = (int) NUMBER_OF_SEEDS(prMSeq->nseqs);
1158     if (iNumSeeds >= prMSeq->nseqs) {
1159         /* -1 is condition for RandomUniqueIntArray */
1160         iNumSeeds = prMSeq->nseqs-1;
1161         Log(&rLog, LOG_DEBUG,
1162              "Automatically determined number of seeds is bigger (or equal) the number of sequences. Will set it to %d",
1163              iNumSeeds);
1164     }
1165
1166
1167     /* Turn sequences into vectors of distances to the seeds
1168      *
1169      */
1170     piSeeds = (int *) CKMALLOC(iNumSeeds * sizeof(int));
1171     if (0 != SeedSelection(piSeeds, iNumSeeds, SEED_SELECTION, prMSeq)) {
1172         Log(&rLog, LOG_ERROR, "Something went wrong during seed selection for mbed");
1173         return -1;
1174     }
1175     ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
1176     for (iI=0; iI<prMSeq->nseqs; iI++) {
1177         ppdSeqVec[iI] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
1178     }
1179     if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, iPairDistType)) {
1180         Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
1181         return -1;
1182     }
1183     CKFREE(piSeeds);
1184
1185
1186     /* Calculate (pre-)clusters of sequence vectors by applying
1187      * bisecting kmeans
1188      *
1189      */
1190 #if MBED_TIMING
1191     /* start clock only here, to make sure we don't include pairwise
1192      * distance computation */
1193     StopwatchZero(stopwatch);
1194     StopwatchStart(stopwatch);
1195 #endif
1196
1197     BisectingKmeans(&prKMeansResult, prMSeq->nseqs, iNumSeeds, ppdSeqVec,
1198                     MIN_REQUIRED_SEQ_PER_PRECLUSTER,
1199                     MAX_ALLOWED_SEQ_PER_PRECLUSTER);
1200     Log(&rLog, LOG_INFO,
1201          "mBed created %u cluster/s (with a minimum of %d and a soft maximum of %d sequences each)",
1202          prKMeansResult->iNClusters,
1203          MIN_REQUIRED_SEQ_PER_PRECLUSTER,
1204          MAX_ALLOWED_SEQ_PER_PRECLUSTER);
1205
1206
1207 #if PRINT_CLUSTER_DISTRIBUTION
1208     Log(&rLog, LOG_FORCED_DEBUG, "Bisecting Kmeans returned %d clusters", prKMeansResult->iNClusters);
1209     for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1210 #if TRACE
1211         int iD;
1212         Log(&rLog, LOG_FORCED_DEBUG, "Diagnostic output for cluster %d follows:", iI);
1213         fprintf(stderr, "DEBUG(%s|%s():%d): center coordinates =",
1214                 __FILE__, __FUNCTION__, __LINE__);
1215         for (iD=0; iD<iNumSeeds; iD++) {
1216             fprintf(stderr, " %f", prKMeansResult->ppdClusterCenters[iI][iD]);
1217         }
1218         fprintf(stderr, "\n");
1219         fflush(stderr);
1220 #endif
1221         Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d has %d objects assigned",
1222                   iI, prKMeansResult->piNObjsPerCluster[iI]);
1223 #if TRACE
1224         for (iJ=0; iJ<prKMeansResult->piNObjsPerCluster[iI]; iJ++) {
1225             int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][iJ];
1226
1227             Log(&rLog, LOG_FORCED_DEBUG, "Cluster %u: object %u has index %u (= seq %s)",
1228                       iI, iJ,  iRealIndex, prMSeq->sqinfo[iRealIndex].name);
1229         }
1230 #endif
1231     }
1232 #endif
1233
1234
1235     /* Cluster pre-clusters produced by k-means.
1236      *
1237      * Do this by calculating the vector distances of the cluster
1238      * centers and applying UPGMA.
1239      *
1240      * @note could try to force-balance the tree here
1241      *
1242      */
1243     if (0 != NewSymMatrix(&prPreClusterDistmat,
1244                      prKMeansResult->iNClusters, prKMeansResult->iNClusters)) {
1245         Log(&rLog, LOG_FATAL, "%s", "Memory allocation for pre-cluster distance-matrix failed");
1246     }
1247     for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1248         for (iJ=iI+1; iJ<prKMeansResult->iNClusters; iJ++) {
1249             double dDist;
1250             dDist = EuclDist(prKMeansResult->ppdClusterCenters[iI],
1251                              prKMeansResult->ppdClusterCenters[iJ],
1252                              iNumSeeds);
1253             SymMatrixSetValue(prPreClusterDistmat, iI, iJ, dDist);
1254             /* Log(&rLog, LOG_FORCED_DEBUG, "Euclidean distance between clusters %d and %d = %f",
1255                iI, iJ, dDist); */
1256         }
1257     }
1258
1259
1260     /* labels needed for the guide tree building routine only */
1261     ppcLabels = (char **) CKMALLOC(prKMeansResult->iNClusters * sizeof(char*));
1262     for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1263         ppcLabels[iI] = (char *) CKMALLOC(32 * sizeof(char));
1264         (void) snprintf(ppcLabels[iI], 32, "Subcluster-%u", iI);
1265     }
1266 #if TRACE
1267     Log(&rLog, LOG_FORCED_DEBUG, "%s", "Distance matrix for pre-cluster centers:");
1268     SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL);
1269 #endif
1270
1271     GuideTreeUpgma(prMbedTree_p,
1272                    ppcLabels, prPreClusterDistmat, NULL);
1273
1274     for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1275         CKFREE(ppcLabels[iI]);
1276     }
1277     CKFREE(ppcLabels);
1278 #if TRACE
1279     Log(&rLog, LOG_FORCED_DEBUG, "%s", "Cluster-center guide-tree:");
1280     LogTree(*prMbedTree_p);
1281 #endif
1282
1283
1284
1285     /* Now replace each leaf in the pre-cluster-center tree
1286      * appropriately, i.e. with the corresponding sub-cluster.
1287      *
1288      * For each leaf/sub-cluster, create a distance matrix for the
1289      * corresponding sequences. Use distances between vectors as
1290      * approximated distances between sequences. Then create a
1291      * guide-tree by applying UPGMA.
1292      *
1293      */
1294
1295     /* Get a mapping of (pre)cluster number and leaf-node indices in
1296      * the cluster-center tree. We can add trees to prMbedTree_p
1297      * because AppendTrees() guarantees that no other than the node to
1298      * append to changes.
1299      */
1300     piClusterToTreeNode = (int*)
1301         CKMALLOC(prKMeansResult->iNClusters * sizeof(int));
1302     iNodeIndex = FirstDepthFirstNode(*prMbedTree_p);
1303     do {
1304         if (IsLeaf(iNodeIndex, *prMbedTree_p)) {
1305             int iLeafId = GetLeafId(iNodeIndex, *prMbedTree_p);
1306             piClusterToTreeNode[iLeafId] = iNodeIndex;
1307         }
1308         iNodeIndex =  NextDepthFirstNode(iNodeIndex, *prMbedTree_p);
1309     } while (NULL_NEIGHBOR != iNodeIndex);
1310
1311
1312
1313
1314     /* Now step through all the leafs and replace them with the
1315      * corresponding sub-trees
1316      */
1317     NewProgress(&prSubClusterDistanceProgress, LogGetFP(&rLog, LOG_INFO),
1318                 "Distance calculation within sub-clusters", bPrintCR);
1319     /* for each cluster */
1320     for (iClusterIndex=0;
1321          iClusterIndex < prKMeansResult->iNClusters; iClusterIndex++) {
1322         /* distance matrix for the sub-cluster */
1323         symmatrix_t *prWithinClusterDistances = NULL;
1324         int iNSeqInCluster;
1325         tree_t *prSubClusterTree = NULL;
1326
1327         ProgressLog(prSubClusterDistanceProgress,
1328                     iClusterIndex, prKMeansResult->iNClusters, FALSE);
1329
1330 #if FULL_WITHIN_CLUSTER_DISTANCES
1331         mseq_t *prSubClusterMSeq;
1332         int iPairDistType;
1333         int iSeqIndex;
1334         int iOldLogLevel;
1335
1336         Log(&rLog, LOG_DEBUG,
1337             "%s\n", "Calling new Mbed use makes only sense if nseq>MAX_ALLOWED_SEQ_PER_PRECLUSTER");
1338
1339         if (TRUE == prMSeq->aligned)  {
1340             iPairDistType = PAIRDIST_SQUIDID_KIMURA;
1341         } else {
1342             iPairDistType = PAIRDIST_KTUPLE;
1343         }
1344 #endif
1345
1346         iNSeqInCluster = prKMeansResult->piNObjsPerCluster[iClusterIndex];
1347 #if TRACE
1348         Log(&rLog, LOG_FORCED_DEBUG, "#seqs in subcluster no %d = %d",
1349                   iClusterIndex, iNSeqInCluster);
1350 #endif
1351
1352 #if FULL_WITHIN_CLUSTER_DISTANCES
1353         /* create an mseq structure for sequences in this cluster
1354          * don't need most of the members (e.g. orig_seq) but copy
1355          * them anyway for the sake of completeness
1356          */
1357         NewMSeq(&prSubClusterMSeq);
1358
1359         prSubClusterMSeq->nseqs = iNSeqInCluster;
1360         prSubClusterMSeq->seqtype = prMSeq->seqtype;
1361         if (NULL!=prMSeq->filename) {
1362             prSubClusterMSeq->filename = CkStrdup(prMSeq->filename);
1363         }
1364         prSubClusterMSeq->aligned = prMSeq->aligned; /* FS, r252 */
1365         prSubClusterMSeq->seq =  (char **)
1366             CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
1367         prSubClusterMSeq->orig_seq =  (char **)
1368             CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
1369         prSubClusterMSeq->sqinfo =  (SQINFO *)
1370             CKMALLOC(prSubClusterMSeq->nseqs * sizeof(SQINFO));
1371
1372         for (iSeqIndex=0; iSeqIndex<iNSeqInCluster; iSeqIndex++) {
1373             int iRealSeqIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iSeqIndex];
1374             prSubClusterMSeq->seq[iSeqIndex] = CkStrdup(prMSeq->seq[iRealSeqIndex]);
1375             prSubClusterMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeq->orig_seq[iRealSeqIndex]);
1376             SeqinfoCopy(&prSubClusterMSeq->sqinfo[iSeqIndex], &prMSeq->sqinfo[iRealSeqIndex]);
1377 #if TRACE
1378             Log(&rLog, LOG_DEBUG, "seq no %d in cluster %d is %s (real index = %d)",
1379                 iSeqIndex, iClusterIndex, prSubClusterMSeq->sqinfo[iSeqIndex].name,
1380                 iRealSeqIndex);
1381 #endif
1382         }
1383 #endif
1384
1385
1386         /* Create a distance matrix for this sub-cluster
1387          * (prWithinClusterDistances) by using the vector distances or
1388          * ktuple distances.
1389          *
1390          * Then apply UPGMA to get a subcluster tree
1391          * (prSubClusterTree) and append created tree to the
1392          * pre-cluster-tree (prMbedTree_p)
1393          *
1394          */
1395 #if FULL_WITHIN_CLUSTER_DISTANCES
1396
1397         iOldLogLevel = rLog.iLogLevelEnabled;
1398         rLog.iLogLevelEnabled = LOG_WARN;
1399         /* compute distances, but be quiet */
1400         if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType,
1401                           0, prSubClusterMSeq->nseqs, 0, prSubClusterMSeq->nseqs,
1402                           NULL, NULL)) {
1403             Log(&rLog, LOG_ERROR, "Couldn't compute pair distances");
1404             return -1;
1405         }
1406         rLog.iLogLevelEnabled = iOldLogLevel;
1407
1408 #if COMPUTE_WITHIN_SUBCLUSTER_AVERAGE
1409         {
1410             double dSum = 0.0;
1411             for (iI=0; iI<prSubClusterMSeq->nseqs; iI++) {
1412                 for (iJ=iI+1; iJ<prSubClusterMSeq->nseqs; iJ++) {
1413                     dSum += SymMatrixGetValue(prWithinClusterDistances, iI, iJ);
1414                 }
1415             }
1416             Log(&rLog, LOG_FORCED_DEBUG,
1417                 "mean pair-wise distance within subcluster %d of %d = %f",
1418                 iClusterIndex, prKMeansResult->iNClusters, dSum/prSubClusterMSeq->nseqs);
1419         }
1420 #endif
1421
1422 #else
1423         if (NewSymMatrix(&prWithinClusterDistances, iNSeqInCluster, iNSeqInCluster)!=0) {
1424             Log(&rLog, LOG_FATAL, "%s", "Memory allocation for disparity matrix failed");
1425         }
1426         for (iI=0; iI<iNSeqInCluster; iI++) {
1427             int iRealIndexI = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
1428
1429             for (iJ=iI+1; iJ<iNSeqInCluster; iJ++) {
1430                 int iRealIndexJ = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iJ];
1431                 double dist;
1432
1433                 /* Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d: compute distance between %d:%s and %d:%s",
1434                    iClusterIndex, i, prMSeq->sqinfo[iRealIndexI].name,
1435                    iJ, prMSeq->sqinfo[iRealIndexJ].name); */
1436
1437                 if (1 == USE_EUCLIDEAN_DISTANCE) {
1438                     dist = EuclDist(ppdSeqVec[iRealIndexI],
1439                                     ppdSeqVec[iRealIndexJ], iNumSeeds);
1440                 } else {
1441                     dist = CosDist(ppdSeqVec[iRealIndexI],
1442                                    ppdSeqVec[iRealIndexJ], iNumSeeds);
1443                 }
1444                 SymMatrixSetValue(prWithinClusterDistances, iI, iJ, dist);
1445             }
1446         }
1447 #endif
1448
1449         /* labels needed for the guide tree building routine only */
1450         ppcLabels = (char**) CKMALLOC(iNSeqInCluster * sizeof(char*));
1451         for (iI=0; iI<iNSeqInCluster; iI++) {
1452 #if FULL_WITHIN_CLUSTER_DISTANCES
1453             ppcLabels[iI] = prSubClusterMSeq->sqinfo[iI].name;
1454 #else
1455             int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
1456             ppcLabels[iI] = prMSeq->sqinfo[iRealIndex].name;
1457 #endif
1458         }
1459 #if TRACE
1460         Log(&rLog, LOG_FORCED_DEBUG, "Distance matrix for seqs within sub cluster %d/%d",
1461                   iClusterIndex, prKMeansResult->iNClusters);
1462         SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL);
1463 #endif
1464
1465
1466         GuideTreeUpgma(&prSubClusterTree, ppcLabels,
1467                        prWithinClusterDistances, NULL);
1468
1469         CKFREE(ppcLabels); /* don't free members, they just point */
1470 #if 0
1471         Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d guide-tree:", iClusterIndex);
1472         LogTree(prSubClusterTree);
1473 #endif
1474
1475
1476         /* The guide tree id's (that point to the sequences) now start
1477          * from 0, i.e. the association with the prMSeq numbering is
1478          * broken and fixed in the following
1479          */
1480         for (iNodeIndex = 0; iNodeIndex < (int)GetNodeCount(prSubClusterTree); iNodeIndex++) {
1481             if (IsLeaf(iNodeIndex, prSubClusterTree)) {
1482                 int iLeafId = GetLeafId(iNodeIndex, prSubClusterTree);
1483                 int iRealId = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iLeafId];
1484 #if 0
1485                 Log(&rLog, LOG_FORCED_DEBUG, "Correcting leaf node %d which has (wrong) id %d and name %s to id %d (prMSeq name %s)",
1486                           iNodeIndex, iLeafId,
1487                           GetLeafName(iNodeIndex, prSubClusterTree),
1488                           iRealId, prMSeq->sqinfo[iRealId].name);
1489 #endif
1490                 SetLeafId(prSubClusterTree, iNodeIndex, iRealId);
1491             }
1492         }
1493
1494
1495         /* Append the newly created tree (prSubClusterTree) to the
1496          * corresponding node index of prMbedTree_p.
1497          */
1498 #if TRACE
1499         Log(&rLog, LOG_FORCED_DEBUG, "Will join trees at leaf node %d = %s",
1500                   piClusterToTreeNode[iClusterIndex],
1501                   GetLeafName(piClusterToTreeNode[iClusterIndex], *prMbedTree_p));
1502 #endif
1503
1504         AppendTree(*prMbedTree_p,
1505                    piClusterToTreeNode[iClusterIndex],
1506                    prSubClusterTree);
1507         /* Note: piClusterToTreeNode is still valid, because
1508          * AppendTrees() guarantees that no other than the node to
1509          * append to changes. */
1510
1511 #if 0
1512         Log(&rLog, LOG_FORCED_DEBUG, "%s", "prMbedTree_p after cluster %d has appended:", iClusterIndex);
1513         LogTree(*prMbedTree_p);
1514
1515         if (0) {
1516             char fname[] = "mbed-joined-tree.dnd";
1517             FILE *pfOut;
1518             if (NULL == (pfOut = fopen(fname, "w"))) {
1519                 Log(&rLog, LOG_FATAL, "Couldn't open %s for writing", fname);
1520             }
1521             MuscleTreeToFile(pfOut, *prMbedTree_p);
1522             Log(&rLog, LOG_FORCED_DEBUG, "Joined tree written to %s", fname);
1523             fclose(pfOut);
1524             Log(&rLog, LOG_FATAL, "DEBUG EXIT");
1525         }
1526 #endif
1527
1528         /* cleanup
1529          */
1530         FreeMuscleTree(prSubClusterTree);
1531         FreeSymMatrix(&prWithinClusterDistances);
1532 #if FULL_WITHIN_CLUSTER_DISTANCES
1533         FreeMSeq(&prSubClusterMSeq);
1534 #endif
1535     } /* end for each cluster */
1536     ProgressDone(prSubClusterDistanceProgress);
1537     FreeProgress(&prSubClusterDistanceProgress);
1538
1539
1540     if (NULL != pcGuidetreeOut) {
1541         if (NULL == (pfOut = fopen(pcGuidetreeOut, "w"))) {
1542             Log(&rLog, LOG_ERROR, "Couldn't open %s for writing", pcGuidetreeOut);
1543         } else {
1544             MuscleTreeToFile(pfOut, *prMbedTree_p);
1545             Log(&rLog, LOG_INFO, "Guide tree written to %s", pcGuidetreeOut);
1546             (void) fclose(pfOut);
1547         }
1548     }
1549
1550
1551     /* cleanup
1552      *
1553      */
1554 #if MBED_TIMING
1555     StopwatchStop(stopwatch);
1556     StopwatchDisplay(stdout, "mBed time (without pairwise distance computation): ", stopwatch);
1557     StopwatchFree(stopwatch);
1558 #endif
1559
1560     FreeKMeansResult(&prKMeansResult);
1561     FreeSymMatrix(&prPreClusterDistmat);
1562     for (iI=0; iI<prMSeq->nseqs; iI++) {
1563         CKFREE(ppdSeqVec[iI]);
1564     }
1565     CKFREE(ppdSeqVec);
1566     CKFREE(piClusterToTreeNode);
1567
1568 #ifndef NDEBUG
1569     TreeValidate(*prMbedTree_p);
1570 #endif
1571
1572     return 0;
1573 }
1574 /***   end: Mbed()   ***/