1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: mbed.c 316 2016-12-16 16:14:39Z fabian $
21 * Reimplementation from scratch of mBed (Blackshields et al., 2010;
22 * PMID 20470396) and addition of bisecting K-Means which allows
23 * control over subcluster size
35 #include "pair_dist.h"
36 #include "symmatrix.h"
37 #include "ktuple_pair.h"
44 #include "kmpp/KMeans.h"
49 #include "squid/stopwatch.h"
53 /* If FULL_WITHIN_CLUSTER_DISTANCES is not 0, distances within each
54 * bisecting kmeans subcluster are not estimated using the vectors,
55 * but calculated normally (using ktuple or kimura). Surprisingly this
56 * results in 3% loss on a Homfam p24-h2010-08-09 subset (100-5000
57 * seqs in test, at least 5 ref seqs; MAX_SEQ 100 vs 10000; NUM_SEEDS
58 * log2 instead of log2^2). And of course it slows things down.
60 #define FULL_WITHIN_CLUSTER_DISTANCES 1
62 #define COMPUTE_WITHIN_SUBCLUSTER_AVERAGE 0
64 /* Cluster size limits. Maximum is a soft limit, which might be
65 * exceeded if a K-Means split was unsuccesful, where unsuccesful
66 * might also mean that the minimum required number seqs. was not
68 #if FULL_WITHIN_CLUSTER_DISTANCES
69 static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 100;
70 static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 1;
72 static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 10000;
73 static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 100;
76 /* How many restarts per bisecting kmeans split. 10 give 0.5% increase
77 * in quality on original HOMFAM over just 1. It also increases kmeans
78 * time by a factor of ~3, but overall time is insignificant
79 * compared to pairdist/progressive alignment part.
81 static const int RESTARTS_PER_SPLIT = 10;
84 /* Use standard kmeans (lloyds) or kmeans++. Both are almost
85 * indistinguishable here, although kmeans++ is slightly ahead the
86 * fewer seeds you pick (and it should be better in theory)
88 #define USE_KMEANS_LLOYDS 0
92 #ifndef CLUSTAL_OMEGA_HAVE_LOG2
93 //#define log2(x) (log(x) / 0.69314718055994530942)
94 #define log2(x) ( x<=0 ? (float)(-100000) : 1.442695041*log(x) )
96 #define NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2)
99 /* Seed selection method: SAMPLE_SEEDS_BY_LENGTH is the original mBed
100 * approach: Sample iSeeds sequence with constant stride from length-sorted X.
101 * It might be better to pick the seeds randomly, because length sorting might
102 * be more prone to including outliers (e.g. very long and very short seqs).
103 * However, we could never observer such a thing. So just stick to the
104 * original version as this also removes the random element.
106 enum SEED_SELECTION_TYPE {
107 SELECT_SEEDS_RANDOMLY,
108 SELECT_SEEDS_BY_LENGTH
110 #define SEED_SELECTION SELECT_SEEDS_BY_LENGTH
113 /* Tests on BAliBase (RV11,12,20,30,40,50; 10 runs each) show there is
114 * no difference between mbed-trees created from cosine or euclidean
115 * distances (simple version, just using disparities).
117 #define USE_EUCLIDEAN_DISTANCE 1
120 /* print some mbed pre-cluster usage to screen */
121 #define PRINT_CLUSTER_DISTRIBUTION 0
128 /* Number of final clusters
132 /* Coordinates (columns) for each cluster (rows)
133 * valid indices: [0...iNClusters][0...dim-1]
135 double **ppdClusterCenters;
137 /* Dimensionality of input data and cluster center coordinates
141 /* Number of objects per cluster
143 int *piNObjsPerCluster;
145 /* Objects (indices) for each cluster. [i][j] will point to (index
146 * of) object j in cluster i
148 int **ppiObjIndicesPerCluster;
149 } bisecting_kmeans_result_t;
154 * @brief Free KMeans result structure.
156 * @param[out] prResult_p
157 * K-Means result to free
159 * @see NewKMeansResult()
162 FreeKMeansResult(bisecting_kmeans_result_t **prResult_p)
165 CKFREE((*prResult_p)->piNObjsPerCluster);
166 for (iAux=0; iAux<(*prResult_p)->iNClusters; iAux++) {
167 CKFREE((*prResult_p)->ppiObjIndicesPerCluster[iAux]);
168 CKFREE((*prResult_p)->ppdClusterCenters[iAux]);
170 CKFREE((*prResult_p)->ppiObjIndicesPerCluster);
171 CKFREE((*prResult_p)->ppdClusterCenters);
172 (*prResult_p)->iNClusters = 0;
173 (*prResult_p)->iDim = 0;
176 /*** end: FreeKMeansResult() ***/
181 * @brief Allocate new KMeans result
183 * @param[out] prKMeansResult_p
184 * K-Means result to free
186 * @see NewKMeansResult()
189 NewKMeansResult(bisecting_kmeans_result_t **prKMeansResult_p)
191 (*prKMeansResult_p) = (bisecting_kmeans_result_t *)
192 CKCALLOC(1, sizeof(bisecting_kmeans_result_t));
193 (*prKMeansResult_p)->iNClusters = 0;
194 (*prKMeansResult_p)->iDim = 0;
195 (*prKMeansResult_p)->ppdClusterCenters = NULL;
196 (*prKMeansResult_p)->piNObjsPerCluster = NULL;
197 (*prKMeansResult_p)->ppiObjIndicesPerCluster = NULL;
200 /*** end: NewKMeansResult() ***/
205 * @brief Calculate the euclidean distance between two vectors
208 * First vector with dim dimensions
210 * Second vector with dim dimensions
212 * Dimensionality of v1 and v2
214 * @return euclidean distance as double
216 * @note Could probably be inlined. Also perfect for SSE
219 EuclDist(const double *v1, const double *v2, const int dim)
222 double dist; /* returned distance */
228 for (i=0; i<dim; i++) {
229 dist += pow(v1[i]-v2[i], 2.0);
234 /*** end: EuclDist() ***/
239 * @brief Calculate the cosine distance between two vectors
242 * First vector with dim dimensions
244 * Second vector with dim dimensions
246 * Dimensionality of v1 and v2
248 * @return cosine distance as double
250 * @note Could probably be inlined. Also perfect for SSE
253 CosDist(const double *v1, const double *v2, const int dim)
256 double dist; /* returned distance */
265 for (i=0; i<dim; i++) {
267 sq1 += pow(v1[i], 2.0);
268 sq2 += pow(v2[i], 2.0);
273 if ((sq1 * sq2) < DBL_EPSILON) {
274 dist = 1 - s / (sq1 * sq2);
276 /* if the denominator gets small, the fraction gets big, hence dist
282 /*** end: CosDist() ***/
287 * @brief convert sequences into mbed-like (distance) vector
288 * representation. Seeds (prMSeq sequence indices) have to be picked before
290 * @param[out] ppdSeqVec
291 * Pointer to preallocated matrix of size nseqs x iSeeds
293 * Sequences which are to be converted
295 * Array of sequences in prMSeq which are to be used as seeds.
296 * @param[in] iNumSeeds
297 * Number of seeds/elements in piSeeds
298 * @param[in] iPairDistType
299 * Type of pairwise distance comparison
303 SeqToVec(double **ppdSeqVec, mseq_t *prMSeq,
304 int *piSeeds, const int iNumSeeds,
305 const int iPairDistType)
307 symmatrix_t *prDistmat = NULL;
310 /* indices for restoring order */
312 /* sorted copy of piSeeds */
313 int *piSortedSeeds = NULL;
316 Stopwatch_t *stopwatch = StopwatchCreate();
317 StopwatchZero(stopwatch);
318 StopwatchStart(stopwatch);
321 assert(NULL != ppdSeqVec);
322 assert(iNumSeeds>0 && iNumSeeds<=prMSeq->nseqs);
324 piSortedSeeds = CKMALLOC(iNumSeeds * sizeof(int));
325 memcpy(piSortedSeeds, piSeeds, iNumSeeds*sizeof(int));
327 /* need them sorted, otherwise the swapping approach below might
330 qsort(piSortedSeeds, iNumSeeds, sizeof(int), IntCmp);
333 /* rearrange mseq order so that we can use ktuple_pairdist code as
334 * is. That is, swap seeds and non-seeds such that the seeds end
335 * up in front of mseq. This way we can use the KTuplePairDist()
336 * code, without making a copy of mseq. Also, keep an array of
337 * indices which serves to restore the original sequence order
338 * after putting the seeds in front
341 restore = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
342 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
343 restore[iSeqIndex] = iSeqIndex;
345 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
347 Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %u", piSortedSeeds[iSeedIndex], iSeedIndex);
349 if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
351 SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
353 swap = restore[iSeedIndex];
354 restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
355 restore[piSortedSeeds[iSeedIndex]] = swap;
359 printf("DEBUG(%s|%s():%d): restore indices =",
360 __FILE__, __FUNCTION__, __LINE__);
361 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
362 printf(" %u:%u", iSeqIndex, restore[iSeqIndex]);
366 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
367 Log(&rLog, LOG_FORCED_DEBUG, "swapped seq no %d now: seq = %s",
368 iSeqIndex, prMSeq->sqinfo[iSeqIndex].name);
373 /* convert sequences into vectors of distances by calling pairwise
374 * distance function for each seq/seed pair
377 /* 4th argument (FALSE) is bPercID, in mBed mode never use percent-identities */
378 if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType, FALSE,
379 0, iNumSeeds, 0, prMSeq->nseqs,
381 Log(&rLog, LOG_ERROR, "Could not compute pairwise distances for mbed.");
382 FreeSymMatrix(&prDistmat);
383 CKFREE(piSortedSeeds);
390 labels = (char **) CKMALLOC(prMSeq->nseqs * sizeof(char *));
391 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
392 labels[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].name;
394 SymMatrixPrint(prDistmat, labels, NULL, FALSE);
400 /* update ppdSeqVec according to prDistmat. keep in mind that we
404 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
405 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
406 ppdSeqVec[restore[iSeqIndex]][iSeedIndex] =
407 SymMatrixGetValue(prDistmat, iSeqIndex, iSeedIndex);
412 /* restore mseq order by reverse swapping
414 * need reverse order now, so start from top index
416 iSeedIndex=iNumSeeds-1;
419 Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %d", piSortedSeeds[iSeedIndex], iSeedIndex);
421 if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
424 SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
426 swap = restore[iSeedIndex];
427 restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
428 restore[piSortedSeeds[iSeedIndex]] = swap;
430 } while (0 != iSeedIndex--);
432 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
433 Log(&rLog, LOG_FORCED_DEBUG, "restored seq no %d: seq = %s %s",
434 iSeqIndex, prMSeq->sqinfo[iSeqIndex].name, prMSeq->seq[iSeqIndex]);
439 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
440 printf("DEBUG: seq %-4u as distance vector =", iSeqIndex);
441 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
442 printf(" %f", ppdSeqVec[iSeqIndex][iSeedIndex]);
449 FreeSymMatrix(&prDistmat);
451 CKFREE(piSortedSeeds);
453 StopwatchStop(stopwatch);
454 StopwatchDisplay(stdout, "Total time for SeqToVec(): ", stopwatch);
455 StopwatchFree(stopwatch);
461 /*** end: SeqToVec() ***/
466 * @brief Select seeds to be used from an prMSeq
468 * @param[out] piSeeds
469 * Will store the indices of prMSeqs seqs used to be as seeds here. Must be preallocated.
470 * @param[in] iNumSeeds
471 * Number of seeds to be picked
472 * @param[in] iSelectionMethod
473 * Seed selection method to be used
475 * The prMSeq structure to pick sequences from
477 * @return: Non-zero on failure
481 SeedSelection(int *piSeeds, int iNumSeeds, int iSelectionMethod, mseq_t *prMSeq)
485 /* sequence iterator */
488 if (SELECT_SEEDS_RANDOMLY == iSelectionMethod) {
492 "Using %d seeds (randomly chosen) for mBed (from a total of %d sequences)",
493 iNumSeeds, prMSeq->nseqs);
495 PermutationArray(&piPermArray, iNumSeeds);
496 /* copy to piSeeds */
497 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
498 piSeeds[iSeedIndex] = piPermArray[iSeedIndex];
502 } else if (SELECT_SEEDS_BY_LENGTH == iSelectionMethod) {
504 /* step size for picking with constant stride */
506 int *piSeqLen = CKMALLOC(prMSeq->nseqs * sizeof(int));
507 int *piOrder = CKMALLOC(prMSeq->nseqs * sizeof(int));
510 "Using %d seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of %d sequences)",
511 iNumSeeds, prMSeq->nseqs);
513 iStep = prMSeq->nseqs / iNumSeeds; /* iStep will never get too big
515 /* first get an array of seq indices order according to
516 corresponding sequence length: piOrder */
517 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
518 piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len;
520 QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, 'd', FALSE);
522 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
523 Log(&rLog, LOG_FORCED_DEBUG, "Descending order (no %d): seq %d has len %d)",
524 iSeqIndex, piOrder[iSeqIndex], piSeqLen[piOrder[iSeqIndex]]);
529 for (iSeqIndex=0; iSeedIndex<iNumSeeds; iSeqIndex+=iStep) {
530 piSeeds[iSeedIndex++] = piOrder[iSeqIndex];
536 Log(&rLog, LOG_ERROR, "Internal error: unknown seed selection type");
541 #ifdef PICK_SEEDS_FROM_KMEANS_CLUSTERS
542 /* do initial mbedding and kmeans. then pick seeds from each cluster and
543 do full mbed with these seeds. idea is that those seeds represent the
544 sequence space better than random seeds. however, this reduces the
545 quality slightly. random is almost always better.
549 /* seqs represented as distance vectors */
552 /* assignments for each seq to the K newly created clusters */
553 int *piKMeansClusterAssignments = CKMALLOC(prMSeq->nseqs * sizeof(int));
554 double *pdKMeansClusterCenters = CKCALLOC(iNumSeeds * iNumSeeds, sizeof(double));
555 /* create a copy of ppdSeqVec suitable for KMeans */
556 double *pdKMeansVectors = CKMALLOC(prMSeq->nseqs * iNumSeeds * sizeof(double));
557 bool *pbClusterUsed = CKCALLOC(iNumSeeds, sizeof(bool));
559 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Experimental feature: K-means on first round embedding to get better seeds");
560 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Reuse seeds from clusters");
561 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Could try log(n) instead of log(n)^2 in first round");
562 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME hardcoded iPairDistType PAIRDIST_KTUPLE");
563 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Show that this is fast *and* good");
566 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Overriding iNumSeeds");
567 iNumSeeds = (int) pow(log2((double)prMSeq->nseqs), 2);
570 ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
571 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
572 ppdSeqVec[iSeqIndex] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
574 if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, PAIRDIST_KTUPLE)) {
575 Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
579 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
581 for (iDim=0; iDim<iNumSeeds; ++iDim) {
582 pdKMeansVectors[iDim*iSeqIndex + iDim] = ppdSeqVec[iSeqIndex][iDim];
587 Log(&rLog, LOG_FORCED_DEBUG, "%s\n", "FIXME hardcoded RESTARTS_PER_SPLIT");
588 dCost = KMeans(prMSeq->nseqs, iNumSeeds, iNumSeeds,
590 RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
591 pdKMeansClusterCenters, piKMeansClusterAssignments);
592 Log(&rLog, LOG_FORCED_DEBUG, "Best split cost = %f", dCost);
594 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Check for Nan in cluster centers");
597 Log(&rLog, LOG_FORCED_DEBUG, "%s", "K-Means output:");
598 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
599 Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u (%s) to cluster %d",
600 iSeqIndex, prMSeq->sqinfo[iSeqIndex].name,
601 piKMeansClusterAssignments[iSeqIndex]);
605 Log(&rLog, LOG_FORCED_DEBUG, "FIXME %s", "proof of concept implementation: Pick first sequences from each clusters instead of reusing seeds");
607 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
608 int iAssignedCluster = piKMeansClusterAssignments[iSeqIndex];
609 if (pbClusterUsed[iAssignedCluster]) {
612 /*LOG_DEBUG("Picked seed %d from cluster %d", iSeqIndex,iAssignedCluster);*/
613 piSeeds[iSeedIndex++] = iSeqIndex;
614 pbClusterUsed[iAssignedCluster] = TRUE;
617 CKFREE(pbClusterUsed);
619 CKFREE(pdKMeansVectors);
620 CKFREE(pdKMeansClusterCenters);
621 CKFREE(piKMeansClusterAssignments);
628 if (LOG_DEBUG <= rLog.iLogLevelEnabled) {
629 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
630 Log(&rLog, LOG_DEBUG, "Picked sequence %d (%s) as seed no %d",
631 piSeeds[iSeedIndex], prMSeq->sqinfo[piSeeds[iSeedIndex]].name, iSeedIndex);
637 /* end of SeedSelection() */
643 * @brief Bisecting K-Means clustering. Repeatedly calls K-Means with
644 * a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster.
646 * @param[out] prKMeansResult_p
647 * Result of Bisecting KMeans. Will be allocated here.
648 * Caller has to free. See @see FreeKMeansResult()
650 * Number of objects/sequences to cluster
652 * Dimensionality of input data
653 * @param[in] ppdVectors
654 * each row holds iDim points for this object's coordinates
655 * @param[in] iMinRequiredObjsPerCluster
656 * Minimum number of objects per Cluster (inclusive)/
657 * @param[in] iMaxAllowedObjsPerCluster
658 * Maximum number of objects per Cluster (inclusive). Function returns once no
659 * cluster contains more then this number of objects. Soft limit!
661 * @note Convoluted code. Could use some restructuring. My apologies.
666 BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
667 const int iNObjs, const int iDim, double **ppdVectors,
668 const int iMinRequiredObjsPerCluster,
669 const int iMaxAllowedObjsPerCluster, char ***ppcClusterSplits_p)
672 /* cluster centers for each cluster created at each split */
673 double *pdKClusterCenters;
674 /* keep track of updated object indices per newly created
676 int *piCurObjToUpdate;
677 /* number of times we called 2-means */
679 /* flag for unsuccessful cluster split */
680 bool bNaNDetected = FALSE;
681 /* flag for detected small cluster after split */
682 bool bSmallClusterDetected;
683 /* queue of clusters which are to be split */
684 int_queue_t rClusterSplitQueue;
689 Stopwatch_t *stopwatch = StopwatchCreate();
693 piCurObjToUpdate = (int *) CKMALLOC(2 * sizeof(int));
695 NewKMeansResult(prKMeansResult_p);
697 /* new cluster centers created at each split/KMeans run
699 pdKClusterCenters = (double *) CKCALLOC(2 * iDim, sizeof(double));
701 /* init results by setting a first cluster that contains all objects
703 (*prKMeansResult_p)->iNClusters = 1;
704 (*prKMeansResult_p)->iDim = iDim;
705 /* fake center coordinates of first cluster */
706 (*prKMeansResult_p)->ppdClusterCenters =
707 (double **) CKMALLOC(1 * sizeof(double *));
708 (*prKMeansResult_p)->ppdClusterCenters[0] =
709 (double *) CKMALLOC(iDim * sizeof(double));
710 /* objects per cluster */
711 (*prKMeansResult_p)->piNObjsPerCluster =
712 (int *) CKMALLOC(1 * sizeof(int));
713 (*prKMeansResult_p)->piNObjsPerCluster[0] = iNObjs;
714 /* object indices per cluster */
715 (*prKMeansResult_p)->ppiObjIndicesPerCluster =
716 (int **) CKMALLOC(1 * sizeof(int *));
717 (*prKMeansResult_p)->ppiObjIndicesPerCluster[0] = (int *)
718 CKMALLOC(iNObjs * sizeof(int));
719 for (iN=0; iN<iNObjs; iN++) {
720 (*prKMeansResult_p)->ppiObjIndicesPerCluster[0][iN] = iN;
725 * this is an array that encodes where a sequences fell at a bisecting k-means
726 * ppcClusterSplits_p can consume quite a bit of memory,
727 but it is only needed if clustering info is written to file.
728 Use ppcClusterSplits_p as a flag, initialise in the calling function (Mbed)
729 to NULL, if no info written out then leave ppcClusterSplits_p=NULL.
730 If info is to be written out then malloc one char*, so that
731 ppcClusterSplits_p!=NULL. In that case realloc proper amount.
732 Only expect a certain number of splits, iLog2N2.
733 Worst case would be iNObjs but then ppcClusterSplits_p
734 would be quadratic, which is too big for many sequences.
736 if (NULL != *ppcClusterSplits_p){
737 /* (square) of logarithm (base 2) of number of objects */
738 double dLog2N = log10(iNObjs) / log10(2);
739 iLog2N2 = (int)(dLog2N*dLog2N);
741 *ppcClusterSplits_p = (char **)CKREALLOC(*ppcClusterSplits_p, iNObjs*sizeof(char *));
742 (*ppcClusterSplits_p)[0] = (char *)CKMALLOC(iNObjs*iLog2N2);
743 (*ppcClusterSplits_p)[0][0] = '\0';
744 for (iN = 1; iN < iNObjs; iN++){
745 (*ppcClusterSplits_p)[iN] = (*ppcClusterSplits_p)[iN-1] + iLog2N2;
746 (*ppcClusterSplits_p)[iN][0] = '\0';
751 /* Starting with the first cluster that now contains all the
752 * sequences keep splitting until no cluster contains more than
753 * iMaxAllowedObjsPerCluster
755 * Keep a queue of clusters (rClusterSplitQueue) to split
757 * At each split values/memory of the just split cluster will be
758 * reused and exactly one new only allocated.
761 * Given the following cluster assignments
762 * 0 0 0 0 1 1 2 2 2 3 3 3 3 3 3 3 4 4
763 * and a K-Means split in cluster 3 at |:
764 * 0 0 0 0 1 1 2 2 2 3 3 3 | 3 3 3 3 4 4
765 * The result should be this:
766 * 0 0 0 0 1 1 2 2 2 3 3 3 | 5 5 5 5 4 4
770 INT_QUEUE_INIT(&rClusterSplitQueue);
771 if (iNObjs>iMaxAllowedObjsPerCluster) {
772 /* pish fake first cluster index */
773 INT_QUEUE_PUSH(&rClusterSplitQueue, 0);
776 while (! INT_QUEUE_EMPTY(&rClusterSplitQueue)) {
777 /* assignments for each seq to the K newly created clusters */
778 int *piKClusterAssignments;
779 /* number of objects in cluster that is to be split */
780 int iNObjsInClusterToSplit;
781 /* coordinates of points in cluster that is to be split
782 * array of size n*d where [d*i + j] gives coordinate j of point i
784 double *pdVectorsInClusterToSplit;
785 /* copy of object indices in split cluster */
786 int *piObjIndicesOfSplitCluster;
787 /* best cost of kmeans rounds */
790 /* indices for the two created clusters
792 /* index of cluster to split */
794 /* index of newly created cluster at each round. data for the
795 other created cluster goes to the one just split,
796 i.e. (*piClusterToSplot) */
800 StopwatchZero(stopwatch);
801 StopwatchStart(stopwatch);
804 INT_QUEUE_POP(&rClusterSplitQueue, &iClusterToSplot);
806 iNObjsInClusterToSplit = (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot];
807 piKClusterAssignments = (int *)
808 CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
810 pdVectorsInClusterToSplit = (double *)
811 CKMALLOC(iNObjsInClusterToSplit * iDim * sizeof(double));
812 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
813 for (iD=0; iD<iDim; ++iD) {
815 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN];
816 pdVectorsInClusterToSplit[iDim*iN + iD] = ppdVectors[iThisObjIdx][iD];
821 Log(&rLog, LOG_FORCED_DEBUG, "Round %d: Will split cluster %d which has %d objects",
822 iNRounds, iClusterToSplot, iNObjsInClusterToSplit);
823 fprintf(stderr, "DEBUG(%s|%s():%d): Object indices in cluster to split are:",
824 __FILE__, __FUNCTION__, __LINE__);
825 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
826 fprintf(stderr, " %u",
827 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
829 fprintf(stderr, "\n");
830 (void) fflush(stderr);
834 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
836 "DEBUG(%s|%s():%d): Coordinate of object %u (real index %u) in cluster to split:",
837 __FILE__, __FUNCTION__, __LINE__, iN,
838 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
839 for (iD=0; iD<iDim; iD++) {
840 fprintf(stderr, " %f", pdVectorsInClusterToSplit[iDim*iN + iD]);
842 fprintf(stderr, "\n");
843 (void) fflush(stderr);
847 /* KMeans(1 "The number of points in the data set",
848 * 2 "The number of clusters to look for",
849 * 3 "The number of dimensions that the data set lives in",
850 * 4 "points: An array of size n*d where points[d*i + j] gives coordinate j of point i",
851 * 5 "attempts: The number of times to independently run k-means",
852 * 6 "use_lloyds_method: uses kmpp if false, otherwise lloyds method",
853 * 7 "centers: This can either be null or an array of size k*d.
854 * In the latter case, centers[d*i + j] will give coordinate j of center i.
855 * If the cluster is unused, it will contain NaN instead.",
856 * 8 "assignments: This can either be null or an array of size n.
857 * In the latter case, it will be filled with the cluster that each point is assigned to
858 * (an integer between 0 and k-1 inclusive).");
860 dCost = KMeans(iNObjsInClusterToSplit, 2, iDim,
861 pdVectorsInClusterToSplit,
862 RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
863 pdKClusterCenters, piKClusterAssignments);
866 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Raw K-Means output:");
867 for (iN=0; iN<2; iN++) {
868 fprintf(stderr, "DEBUG(%s|%s():%d): Cluster Center %u =",
869 __FILE__, __FUNCTION__, __LINE__, iN);
870 for (iD=0; iD<iDim; iD++) {
871 fprintf(stderr, " %f", pdKClusterCenters[iN*iDim+iD]);
873 fprintf(stderr, "\n");
874 (void) fflush(stderr);
876 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
877 Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u to cluster %d (of #%u)",
878 iN, piKClusterAssignments[iN], 2);
882 /* real index of one of the newly created clusters. the other
883 * one is iClusterToSplot */
884 iNewClusterIdx = (*prKMeansResult_p)->iNClusters;
887 /* We don't want Clusters which are too small. Check here if a
888 * split created a small cluster and if yes, discard the
889 * solution. Because the cluster has already been removed from
890 * the queue, we can just continue.
892 bSmallClusterDetected = FALSE;
893 if (iMinRequiredObjsPerCluster>1) {
894 int iNObjsCluster[2];
895 iNObjsCluster[0] = 0; /* first cluster */
896 iNObjsCluster[1] = 0; /* second cluster */
897 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
898 iNObjsCluster[piKClusterAssignments[iN]]+=1;
901 if (iNObjsCluster[0]<iMinRequiredObjsPerCluster
903 iNObjsCluster[1]<iMinRequiredObjsPerCluster) {
904 bSmallClusterDetected = TRUE;
905 Log(&rLog, LOG_FORCED_DEBUG, "Skipping this split because objs in 1st/2nd cluster = %d/%d < %d",
906 iNObjsCluster[0], iNObjsCluster[1], iMinRequiredObjsPerCluster);
910 /* Same logic as for small clusters applies if KMeans couldn't
911 * split the cluster. In this case one of its center
912 * coordinates will be NaN, which we check for in the
915 if (! bSmallClusterDetected) {
916 for (iN=0; iN<2; iN++) {
917 bNaNDetected = FALSE;
918 for (iD=0; iD<iDim; iD++) {
919 if (0 != isnan(pdKClusterCenters[iN*iDim+iN])) {
920 /* Got NaN as coordinate after splitting */
921 Log(&rLog, LOG_WARN, "%s(): Can't split cluster no. %d which has %d objects any further. %s",
923 iClusterToSplot, iNObjsInClusterToSplit,
924 "Hope it's not too big and doesn't slow things down."
936 /* Discarding split of this cluster. It has been removed from the
937 * queue already. so just continue
939 if (bNaNDetected || bSmallClusterDetected) {
940 CKFREE(piKClusterAssignments);
941 CKFREE(pdVectorsInClusterToSplit);
946 /* update cluster centers: pdClusterCenters
949 /* reuse memory of existing/old/split cluster
951 for (iN=0; iN<iDim; iN++) {
952 double dCoord = pdKClusterCenters[0*iDim+iN];
953 (*prKMeansResult_p)->ppdClusterCenters[iClusterToSplot][iN] = dCoord;
955 /* realloc and set new one
957 (*prKMeansResult_p)->ppdClusterCenters = (double **)
958 CKREALLOC((*prKMeansResult_p)->ppdClusterCenters,
959 ((*prKMeansResult_p)->iNClusters+1) * sizeof(double *));
960 (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx] = (double *)
961 CKMALLOC(iDim * sizeof(double));
962 for (iD=0; iD<iDim; iD++) {
963 double dCoord = pdKClusterCenters[1*iDim+iD];
964 (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx][iD] = dCoord;
967 Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of cluster centers done. Cluster centers so far:");
968 for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
969 fprintf(stderr, "DEBUG(%s|%s():%d): Center %u =",
970 __FILE__, __FUNCTION__, __LINE__, iN);
971 for (iD=0; iD<iDim; iD++) {
972 fprintf(stderr, " %f", (*prKMeansResult_p)->ppdClusterCenters[iN][iD]);
974 fprintf(stderr, "\n");
975 (void) fflush(stderr);
980 /* update #seqs per cluster: piNObjsPerCluster
983 (*prKMeansResult_p)->piNObjsPerCluster = (int *)
984 CKREALLOC((*prKMeansResult_p)->piNObjsPerCluster,
985 ((*prKMeansResult_p)->iNClusters+1) * sizeof(int));
986 /* init new and old one to zero */
987 (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] = 0;
988 (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] = 0;
989 /* now update values */
990 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
991 if (0 == piKClusterAssignments[iN]) {
992 (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] += 1;
993 } else if (1 == piKClusterAssignments[iN]) {
994 (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] += 1;
996 /* there used to be code for iK>=2 in r101 */
997 Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
998 piKClusterAssignments[iN]);
1002 Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of NObjs per cluster done:");
1003 for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
1004 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d contains %d sequences",
1005 iN, (*prKMeansResult_p)->piNObjsPerCluster[iN]);
1008 /* queue clusters if still they are still too big
1010 if ((*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] > iMaxAllowedObjsPerCluster) {
1011 INT_QUEUE_PUSH(&rClusterSplitQueue, iClusterToSplot);
1013 if ((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] > iMaxAllowedObjsPerCluster) {
1014 INT_QUEUE_PUSH(&rClusterSplitQueue, iNewClusterIdx);
1019 /* update which objects belong to which cluster:
1021 * note: piNObjsPerCluster needs to be already updated
1024 /* create a copy of the object indices in the split cluster
1025 * (original will be overwritten)
1027 piObjIndicesOfSplitCluster = (int *) CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
1028 memcpy(piObjIndicesOfSplitCluster,
1029 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
1030 iNObjsInClusterToSplit * sizeof(int));
1032 (*prKMeansResult_p)->ppiObjIndicesPerCluster = (int **)
1033 CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster,
1034 ((*prKMeansResult_p)->iNClusters+1) * sizeof(int *));
1037 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot] =
1038 (int *) CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
1039 (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] * sizeof(int));
1041 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iNewClusterIdx] =
1042 (int *) CKMALLOC((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] * sizeof(int));
1045 /* now reassign the object indices to the assigned cluster
1047 piCurObjToUpdate[0] = 0;
1048 piCurObjToUpdate[1] = 0;
1049 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
1050 int iThisObjIdx = piObjIndicesOfSplitCluster[iN];
1051 int iThisClusterAssignment = piKClusterAssignments[iN];
1052 int iThisOffset = piCurObjToUpdate[iThisClusterAssignment];
1053 int iThisClusterIndex = 0;
1055 if (0 == iThisClusterAssignment) {
1056 iThisClusterIndex = iClusterToSplot;
1057 if (*ppcClusterSplits_p && strlen((*ppcClusterSplits_p)[iThisObjIdx])<iLog2N2){
1058 strcat((*ppcClusterSplits_p)[iThisObjIdx], "0");
1060 } else if (1 == iThisClusterAssignment) {
1061 iThisClusterIndex = iNewClusterIdx;
1062 if (*ppcClusterSplits_p && strlen((*ppcClusterSplits_p)[iThisObjIdx])<iLog2N2){
1063 strcat((*ppcClusterSplits_p)[iThisObjIdx], "1");
1066 /* there used to be code for iK>=2 in r101 */
1067 Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
1068 piKClusterAssignments[iN]);
1071 Log(&rLog, LOG_FORCED_DEBUG, "Setting (*prKMeansResult_p)->ppiObjIndicesPerCluster[%d][%d] = %d",
1072 iThisClusterIndex, iThisOffset, iThisObjIdx);
1074 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iThisClusterIndex][iThisOffset] = iThisObjIdx;
1075 piCurObjToUpdate[iThisClusterAssignment]+=1;
1077 CKFREE(piObjIndicesOfSplitCluster);
1079 for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
1081 fprintf(stderr, "DEBUG(%s|%s():%d): Objects in cluster %u: ",
1082 __FILE__, __FUNCTION__, __LINE__, iN);
1083 for (iObj=0; iObj<(*prKMeansResult_p)->piNObjsPerCluster[iN]; iObj++) {
1084 fprintf(stderr, " %u", (*prKMeansResult_p)->ppiObjIndicesPerCluster[iN][iObj]);
1086 fprintf(stderr, "\n");
1087 (void) fflush(stderr);
1093 StopwatchStop(stopwatch);
1094 StopwatchDisplay(stdout, "Total time after next round in Bisecting-KMeans: ", stopwatch);
1098 /* finally: increase number of clusters
1100 (*prKMeansResult_p)->iNClusters += 1;
1102 CKFREE(piKClusterAssignments);
1103 CKFREE(pdVectorsInClusterToSplit);
1106 INT_QUEUE_DESTROY(&rClusterSplitQueue);
1109 Log(&rLog, LOG_DEBUG,
1110 "Bisecting K-means finished after %d rounds (no more clusters to split)",
1114 StopwatchFree(stopwatch);
1117 /* @note could use progress/timer */
1119 CKFREE(pdKClusterCenters);
1120 CKFREE(piCurObjToUpdate);
1124 /*** end: BisectingKmeans() ***/
1130 * @brief From scratch reimplementation of mBed: Blackshields et al.
1131 * (2010); PMID 20470396.
1133 * Idea is a follows:
1134 * - convert sequences into vectors of distances
1135 * - cluster the vectors using k-means
1136 * - cluster each of the k clusters using upgma (used cached distances
1138 * - join the sub-clusters to create on tree (use UPGMA on k-means
1142 * @param[out] prMbedTree_p
1143 * Created upgma tree. will be allocated here. use FreeMuscleTree()
1146 * Multiple sequences
1147 * @param[in] iPairDistType
1148 * Distance measure for pairwise alignments
1149 * @param[in] pcGuidetreeOut
1150 * Passed down to GuideTreeUpgma()
1152 * @note: if the number of sequences is smaller than
1153 * MAX_ALLOWED_SEQ_PER_PRECLUSTER then there's no need to do the subclustering
1154 * first. In fact it costs some extra time. However, it's insignificant and
1155 * for simplicities sake we don't do any special checks
1158 * @return Zero on success, non-zero on error
1162 Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
1163 const char *pcGuidetreeOut, int iClustersizes, const char *pcClusterFile)
1165 /* number of seeds */
1167 /* seed indices matching prMSeq */
1168 int *piSeeds = NULL;
1169 /* seqs represented as distance vectors */
1170 double **ppdSeqVec = NULL;
1172 bisecting_kmeans_result_t *prKMeansResult = NULL;
1173 /* distance matrix of kmeans (pre-)cluster centers */
1174 symmatrix_t *prPreClusterDistmat = NULL;
1175 /* auxiliary for symmetric matrix output; tree routines etc */
1176 char **ppcLabels = NULL;
1178 /* mapping of cluster-center tree node indices to corresponding
1180 int *piClusterToTreeNode = NULL;
1181 int iClusterIndex = 0;
1184 progress_t *prSubClusterDistanceProgress = NULL;
1185 bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
1186 char **ppcClusterSplits = NULL;
1188 #if FULL_WITHIN_CLUSTER_DISTANCES
1189 Log(&rLog, LOG_DEBUG, "Computing real distances within subclusters for mBed.");
1191 Log(&rLog, LOG_DEBUG, "Computing vector distances within subclusters for mBed.");
1195 Stopwatch_t *stopwatch = StopwatchCreate();
1198 assert(NULL != prMbedTree_p);
1199 assert(NULL != prMSeq);
1202 iNumSeeds = (int) NUMBER_OF_SEEDS(prMSeq->nseqs);
1203 if (iNumSeeds >= prMSeq->nseqs) {
1204 /* -1 is condition for RandomUniqueIntArray */
1205 iNumSeeds = prMSeq->nseqs-1;
1206 Log(&rLog, LOG_DEBUG,
1207 "Automatically determined number of seeds is bigger (or equal) the number of sequences. Will set it to %d",
1212 /* Turn sequences into vectors of distances to the seeds
1215 piSeeds = (int *) CKMALLOC(iNumSeeds * sizeof(int));
1216 if (0 != SeedSelection(piSeeds, iNumSeeds, SEED_SELECTION, prMSeq)) {
1217 Log(&rLog, LOG_ERROR, "Something went wrong during seed selection for mbed");
1220 ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
1221 for (iI=0; iI<prMSeq->nseqs; iI++) {
1222 ppdSeqVec[iI] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
1224 if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, iPairDistType)) {
1225 Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
1231 /* Calculate (pre-)clusters of sequence vectors by applying
1236 /* start clock only here, to make sure we don't include pairwise
1237 * distance computation */
1238 StopwatchZero(stopwatch);
1239 StopwatchStart(stopwatch);
1242 /* ppcClusterSplits can consume quite a bit of memory,
1243 however, it is only needed if cluster information is
1244 to be written to file. If no pcClusterFile is specified,
1245 then ppcClusterSplits does not have to be allocated.
1246 Use ppcClusterSplits as a flag, initialise to NULL,
1247 if NULL is passed into BisectingKmeans then do NOT malloc,
1248 if !NULL is passed in then realloc the appropriate memory */
1249 if (NULL != pcClusterFile){
1250 ppcClusterSplits = (char **)malloc(sizeof(char*));
1253 ppcClusterSplits = NULL;
1256 BisectingKmeans(&prKMeansResult, prMSeq->nseqs, iNumSeeds, ppdSeqVec,
1257 MIN_REQUIRED_SEQ_PER_PRECLUSTER,
1258 iClustersizes/*MAX_ALLOWED_SEQ_PER_PRECLUSTER*/, &ppcClusterSplits);
1259 Log(&rLog, LOG_INFO,
1260 "mBed created %u cluster/s (with a minimum of %d and a soft maximum of %d sequences each)",
1261 prKMeansResult->iNClusters,
1262 MIN_REQUIRED_SEQ_PER_PRECLUSTER,
1263 iClustersizes/*MAX_ALLOWED_SEQ_PER_PRECLUSTER*/);
1265 if (NULL != pcClusterFile){ /* print clustering: FS, r275 -> */
1266 FILE *pfClust = NULL;
1267 if (NULL == (pfClust = fopen(pcClusterFile, "w"))){
1268 Log(&rLog, LOG_FATAL, "Could not open file %s for writing", pcClusterFile);
1270 for (iI = 0; iI < prKMeansResult->iNClusters; iI++) {
1271 for (iJ=0; iJ<prKMeansResult->piNObjsPerCluster[iI]; iJ++) {
1272 int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][iJ];
1273 fprintf(pfClust, "Cluster %u: object %u has index %u (=seq %s %d~len)\t %s\n",
1274 iI, iJ, iRealIndex, prMSeq->sqinfo[iRealIndex].name, prMSeq->sqinfo[iRealIndex].len, ppcClusterSplits[iRealIndex]);
1277 fclose(pfClust); pfClust = NULL;
1278 /* if there is no pcClusterFile then no memory will have been allocated */
1279 CKFREE(ppcClusterSplits[0]);
1280 CKFREE(ppcClusterSplits);
1281 } /* there was a request to write out clustering */
1283 #if PRINT_CLUSTER_DISTRIBUTION
1284 Log(&rLog, LOG_FORCED_DEBUG, "Bisecting Kmeans returned %d clusters", prKMeansResult->iNClusters);
1285 for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1288 Log(&rLog, LOG_FORCED_DEBUG, "Diagnostic output for cluster %d follows:", iI);
1289 fprintf(stderr, "DEBUG(%s|%s():%d): center coordinates =",
1290 __FILE__, __FUNCTION__, __LINE__);
1291 for (iD=0; iD<iNumSeeds; iD++) {
1292 fprintf(stderr, " %f", prKMeansResult->ppdClusterCenters[iI][iD]);
1294 fprintf(stderr, "\n");
1297 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d has %d objects assigned",
1298 iI, prKMeansResult->piNObjsPerCluster[iI]);
1300 for (iJ=0; iJ<prKMeansResult->piNObjsPerCluster[iI]; iJ++) {
1301 int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][iJ];
1303 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %u: object %u has index %u (= seq %s)",
1304 iI, iJ, iRealIndex, prMSeq->sqinfo[iRealIndex].name);
1311 /* Cluster pre-clusters produced by k-means.
1313 * Do this by calculating the vector distances of the cluster
1314 * centers and applying UPGMA.
1316 * @note could try to force-balance the tree here
1319 if (0 != NewSymMatrix(&prPreClusterDistmat,
1320 prKMeansResult->iNClusters, prKMeansResult->iNClusters)) {
1321 Log(&rLog, LOG_FATAL, "%s", "Memory allocation for pre-cluster distance-matrix failed");
1323 for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1324 for (iJ=iI+1; iJ<prKMeansResult->iNClusters; iJ++) {
1326 dDist = EuclDist(prKMeansResult->ppdClusterCenters[iI],
1327 prKMeansResult->ppdClusterCenters[iJ],
1329 SymMatrixSetValue(prPreClusterDistmat, iI, iJ, dDist);
1330 /* Log(&rLog, LOG_FORCED_DEBUG, "Euclidean distance between clusters %d and %d = %f",
1336 /* labels needed for the guide tree building routine only */
1337 ppcLabels = (char **) CKMALLOC(prKMeansResult->iNClusters * sizeof(char*));
1338 for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1339 ppcLabels[iI] = (char *) CKMALLOC(32 * sizeof(char));
1340 (void) snprintf(ppcLabels[iI], 32, "Subcluster-%u", iI);
1343 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Distance matrix for pre-cluster centers:");
1344 SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL, FALSE);
1347 GuideTreeUpgma(prMbedTree_p,
1348 ppcLabels, prPreClusterDistmat, NULL);
1350 for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1351 CKFREE(ppcLabels[iI]);
1355 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Cluster-center guide-tree:");
1356 LogTree(*prMbedTree_p, stderr);
1361 /* Now replace each leaf in the pre-cluster-center tree
1362 * appropriately, i.e. with the corresponding sub-cluster.
1364 * For each leaf/sub-cluster, create a distance matrix for the
1365 * corresponding sequences. Use distances between vectors as
1366 * approximated distances between sequences. Then create a
1367 * guide-tree by applying UPGMA.
1371 /* Get a mapping of (pre)cluster number and leaf-node indices in
1372 * the cluster-center tree. We can add trees to prMbedTree_p
1373 * because AppendTrees() guarantees that no other than the node to
1374 * append to changes.
1376 piClusterToTreeNode = (int*)
1377 CKMALLOC(prKMeansResult->iNClusters * sizeof(int));
1378 iNodeIndex = FirstDepthFirstNode(*prMbedTree_p);
1380 if (IsLeaf(iNodeIndex, *prMbedTree_p)) {
1381 int iLeafId = GetLeafId(iNodeIndex, *prMbedTree_p);
1382 piClusterToTreeNode[iLeafId] = iNodeIndex;
1384 iNodeIndex = NextDepthFirstNode(iNodeIndex, *prMbedTree_p);
1385 } while (NULL_NEIGHBOR != iNodeIndex);
1390 /* Now step through all the leafs and replace them with the
1391 * corresponding sub-trees
1393 NewProgress(&prSubClusterDistanceProgress, LogGetFP(&rLog, LOG_INFO),
1394 "Distance calculation within sub-clusters", bPrintCR);
1395 /* for each cluster */
1396 for (iClusterIndex=0;
1397 iClusterIndex < prKMeansResult->iNClusters; iClusterIndex++) {
1398 /* distance matrix for the sub-cluster */
1399 symmatrix_t *prWithinClusterDistances = NULL;
1401 tree_t *prSubClusterTree = NULL;
1403 ProgressLog(prSubClusterDistanceProgress,
1404 iClusterIndex, prKMeansResult->iNClusters, FALSE);
1406 #if FULL_WITHIN_CLUSTER_DISTANCES
1407 mseq_t *prSubClusterMSeq;
1412 Log(&rLog, LOG_DEBUG,
1413 "%s\n", "Calling new Mbed use makes only sense if nseq>MAX_ALLOWED_SEQ_PER_PRECLUSTER");
1415 if (TRUE == prMSeq->aligned) {
1416 if (SEQTYPE_PROTEIN == prMSeq->seqtype){
1417 iPairDistType = PAIRDIST_SQUIDID_KIMURA;
1420 iPairDistType = PAIRDIST_SQUIDID;
1423 iPairDistType = PAIRDIST_KTUPLE;
1427 iNSeqInCluster = prKMeansResult->piNObjsPerCluster[iClusterIndex];
1429 Log(&rLog, LOG_FORCED_DEBUG, "#seqs in subcluster no %d = %d",
1430 iClusterIndex, iNSeqInCluster);
1433 #if FULL_WITHIN_CLUSTER_DISTANCES
1434 /* create an mseq structure for sequences in this cluster
1435 * don't need most of the members (e.g. orig_seq) but copy
1436 * them anyway for the sake of completeness
1438 NewMSeq(&prSubClusterMSeq);
1440 prSubClusterMSeq->nseqs = iNSeqInCluster;
1441 prSubClusterMSeq->seqtype = prMSeq->seqtype;
1442 if (NULL!=prMSeq->filename) {
1443 prSubClusterMSeq->filename = CkStrdup(prMSeq->filename);
1445 prSubClusterMSeq->aligned = prMSeq->aligned; /* FS, r252 */
1446 prSubClusterMSeq->seq = (char **)
1447 CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
1448 prSubClusterMSeq->orig_seq = (char **)
1449 CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
1450 prSubClusterMSeq->sqinfo = (SQINFO *)
1451 CKMALLOC(prSubClusterMSeq->nseqs * sizeof(SQINFO));
1453 for (iSeqIndex=0; iSeqIndex<iNSeqInCluster; iSeqIndex++) {
1454 int iRealSeqIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iSeqIndex];
1455 prSubClusterMSeq->seq[iSeqIndex] = CkStrdup(prMSeq->seq[iRealSeqIndex]);
1456 prSubClusterMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeq->orig_seq[iRealSeqIndex]);
1457 SeqinfoCopy(&prSubClusterMSeq->sqinfo[iSeqIndex], &prMSeq->sqinfo[iRealSeqIndex]);
1459 Log(&rLog, LOG_DEBUG, "seq no %d in cluster %d is %s (real index = %d)",
1460 iSeqIndex, iClusterIndex, prSubClusterMSeq->sqinfo[iSeqIndex].name,
1467 /* Create a distance matrix for this sub-cluster
1468 * (prWithinClusterDistances) by using the vector distances or
1471 * Then apply UPGMA to get a subcluster tree
1472 * (prSubClusterTree) and append created tree to the
1473 * pre-cluster-tree (prMbedTree_p)
1476 #if FULL_WITHIN_CLUSTER_DISTANCES
1478 iOldLogLevel = rLog.iLogLevelEnabled;
1479 rLog.iLogLevelEnabled = LOG_WARN;
1480 /* compute distances, but be quiet */
1481 /* 4th argument (FALSE) is bPercID, in mBed mode never use percent-identities */
1482 if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType, FALSE,
1483 0, prSubClusterMSeq->nseqs, 0, prSubClusterMSeq->nseqs,
1485 Log(&rLog, LOG_ERROR, "Couldn't compute pair distances");
1488 rLog.iLogLevelEnabled = iOldLogLevel;
1490 #if COMPUTE_WITHIN_SUBCLUSTER_AVERAGE
1493 for (iI=0; iI<prSubClusterMSeq->nseqs; iI++) {
1494 for (iJ=iI+1; iJ<prSubClusterMSeq->nseqs; iJ++) {
1495 dSum += SymMatrixGetValue(prWithinClusterDistances, iI, iJ);
1498 Log(&rLog, LOG_FORCED_DEBUG,
1499 "mean pair-wise distance within subcluster %d of %d = %f",
1500 iClusterIndex, prKMeansResult->iNClusters, dSum/prSubClusterMSeq->nseqs);
1505 if (NewSymMatrix(&prWithinClusterDistances, iNSeqInCluster, iNSeqInCluster)!=0) {
1506 Log(&rLog, LOG_FATAL, "%s", "Memory allocation for disparity matrix failed");
1508 for (iI=0; iI<iNSeqInCluster; iI++) {
1509 int iRealIndexI = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
1511 for (iJ=iI+1; iJ<iNSeqInCluster; iJ++) {
1512 int iRealIndexJ = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iJ];
1515 /* Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d: compute distance between %d:%s and %d:%s",
1516 iClusterIndex, i, prMSeq->sqinfo[iRealIndexI].name,
1517 iJ, prMSeq->sqinfo[iRealIndexJ].name); */
1519 if (1 == USE_EUCLIDEAN_DISTANCE) {
1520 dist = EuclDist(ppdSeqVec[iRealIndexI],
1521 ppdSeqVec[iRealIndexJ], iNumSeeds);
1523 dist = CosDist(ppdSeqVec[iRealIndexI],
1524 ppdSeqVec[iRealIndexJ], iNumSeeds);
1526 SymMatrixSetValue(prWithinClusterDistances, iI, iJ, dist);
1531 /* labels needed for the guide tree building routine only */
1532 ppcLabels = (char**) CKMALLOC(iNSeqInCluster * sizeof(char*));
1533 for (iI=0; iI<iNSeqInCluster; iI++) {
1534 #if FULL_WITHIN_CLUSTER_DISTANCES
1535 ppcLabels[iI] = prSubClusterMSeq->sqinfo[iI].name;
1537 int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
1538 ppcLabels[iI] = prMSeq->sqinfo[iRealIndex].name;
1542 Log(&rLog, LOG_FORCED_DEBUG, "Distance matrix for seqs within sub cluster %d/%d",
1543 iClusterIndex, prKMeansResult->iNClusters);
1544 SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL, FALSE);
1548 GuideTreeUpgma(&prSubClusterTree, ppcLabels,
1549 prWithinClusterDistances, NULL);
1551 CKFREE(ppcLabels); /* don't free members, they just point */
1553 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d guide-tree:", iClusterIndex);
1554 LogTree(prSubClusterTree);
1558 /* The guide tree id's (that point to the sequences) now start
1559 * from 0, i.e. the association with the prMSeq numbering is
1560 * broken and fixed in the following
1562 for (iNodeIndex = 0; iNodeIndex < (int)GetNodeCount(prSubClusterTree); iNodeIndex++) {
1563 if (IsLeaf(iNodeIndex, prSubClusterTree)) {
1564 int iLeafId = GetLeafId(iNodeIndex, prSubClusterTree);
1565 int iRealId = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iLeafId];
1567 Log(&rLog, LOG_FORCED_DEBUG, "Correcting leaf node %d which has (wrong) id %d and name %s to id %d (prMSeq name %s)",
1568 iNodeIndex, iLeafId,
1569 GetLeafName(iNodeIndex, prSubClusterTree),
1570 iRealId, prMSeq->sqinfo[iRealId].name);
1572 SetLeafId(prSubClusterTree, iNodeIndex, iRealId);
1577 /* Append the newly created tree (prSubClusterTree) to the
1578 * corresponding node index of prMbedTree_p.
1581 Log(&rLog, LOG_FORCED_DEBUG, "Will join trees at leaf node %d = %s",
1582 piClusterToTreeNode[iClusterIndex],
1583 GetLeafName(piClusterToTreeNode[iClusterIndex], *prMbedTree_p));
1586 AppendTree(*prMbedTree_p,
1587 piClusterToTreeNode[iClusterIndex],
1589 /* Note: piClusterToTreeNode is still valid, because
1590 * AppendTrees() guarantees that no other than the node to
1591 * append to changes. */
1594 Log(&rLog, LOG_FORCED_DEBUG, "%s", "prMbedTree_p after cluster %d has appended:", iClusterIndex);
1595 LogTree(*prMbedTree_p);
1598 char fname[] = "mbed-joined-tree.dnd";
1600 if (NULL == (pfOut = fopen(fname, "w"))) {
1601 Log(&rLog, LOG_FATAL, "Couldn't open %s for writing", fname);
1603 MuscleTreeToFile(pfOut, *prMbedTree_p);
1604 Log(&rLog, LOG_FORCED_DEBUG, "Joined tree written to %s", fname);
1606 Log(&rLog, LOG_FATAL, "DEBUG EXIT");
1612 FreeMuscleTree(prSubClusterTree);
1613 FreeSymMatrix(&prWithinClusterDistances);
1614 #if FULL_WITHIN_CLUSTER_DISTANCES
1615 FreeMSeq(&prSubClusterMSeq);
1617 } /* end for each cluster */
1618 ProgressDone(prSubClusterDistanceProgress);
1619 FreeProgress(&prSubClusterDistanceProgress);
1622 if (NULL != pcGuidetreeOut) {
1623 if (NULL == (pfOut = fopen(pcGuidetreeOut, "w"))) {
1624 Log(&rLog, LOG_ERROR, "Couldn't open %s for writing", pcGuidetreeOut);
1626 MuscleTreeToFile(pfOut, *prMbedTree_p);
1627 Log(&rLog, LOG_INFO, "Guide tree written to %s", pcGuidetreeOut);
1628 (void) fclose(pfOut);
1637 StopwatchStop(stopwatch);
1638 StopwatchDisplay(stdout, "mBed time (without pairwise distance computation): ", stopwatch);
1639 StopwatchFree(stopwatch);
1642 FreeKMeansResult(&prKMeansResult);
1643 FreeSymMatrix(&prPreClusterDistmat);
1644 for (iI=0; iI<prMSeq->nseqs; iI++) {
1645 CKFREE(ppdSeqVec[iI]);
1648 CKFREE(piClusterToTreeNode);
1651 TreeValidate(*prMbedTree_p);
1656 /*** end: Mbed() ***/