1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
6 * Copyright (C) 2010 University College Dublin
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This file is part of Clustal-Omega.
15 ********************************************************************/
18 * RCS $Id: mbed.c 254 2011-06-21 13:07:50Z andreas $
21 * Reimplementation from scratch of mBed:
22 * Blackshields et al. (2010); PMID 20470396
35 #include "pair_dist.h"
36 #include "symmatrix.h"
37 #include "ktuple_pair.h"
44 #include "kmpp/KMeans.h"
49 #include "squid/stopwatch.h"
53 /* If FULL_WITHIN_CLUSTER_DISTANCES is not 0, distances within each
54 * bisecting kmeans subcluster are not estimated using the vectors,
55 * but calculated normally (using ktuple or kimura). Surprisingly this
56 * results in 3% loss on a Homfam p24-h2010-08-09 subset (100-5000
57 * seqs in test, at least 5 ref seqs; MAX_SEQ 100 vs 10000; NUM_SEEDS
58 * log2 instead of log2^2). And of course it slows things down.
60 #define FULL_WITHIN_CLUSTER_DISTANCES 1
62 #define COMPUTE_WITHIN_SUBCLUSTER_AVERAGE 0
64 /* Cluster size limits. Maximum is a soft limit, which might be
65 * exceeded if a K-Means split was unsuccesful, where unsuccesful
66 * might also mean that the minimum required number seqs. was not
68 #if FULL_WITHIN_CLUSTER_DISTANCES
69 static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 100;
70 static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 1;
72 static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 10000;
73 static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 100;
76 /* How many restarts per bisecting kmeans split. 10 give 0.5% increase
77 * in quality on original HOMFAM over just 1. It also increases kmeans
78 * time by a factor of ~3, but overall time is insignificant
79 * compared to pairdist/progressive alignment part.
81 static const int RESTARTS_PER_SPLIT = 10;
84 /* Use standard kmeans (lloyds) or kmeans++. Both are almost
85 * indistinguishable here, although kmeans++ is slightly ahead the
86 * fewer seeds you pick (and it should be better in theory)
88 #define USE_KMEANS_LLOYDS 0
92 #define log2(x) (log(x) / 0.69314718055994530942)
94 #define NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2)
97 /* Seed selection method: SAMPLE_SEEDS_BY_LENGTH is the original mBed
98 * approach: Sample iSeeds sequence with constant stride from length-sorted X.
99 * It might be better to pick the seeds randomly, because length sorting might
100 * be more prone to including outliers (e.g. very long and very short seqs).
101 * However, we could never observer such a thing. So just stick to the
102 * original version as this also removes the random element.
104 enum SEED_SELECTION_TYPE {
105 SELECT_SEEDS_RANDOMLY,
106 SELECT_SEEDS_BY_LENGTH
108 #define SEED_SELECTION SELECT_SEEDS_BY_LENGTH
111 /* Tests on BAliBase (RV11,12,20,30,40,50; 10 runs each) show there is
112 * no difference between mbed-trees created from cosine or euclidean
113 * distances (simple version, just using disparities).
115 #define USE_EUCLIDEAN_DISTANCE 1
118 /* print some mbed pre-cluster usage to screen */
119 #define PRINT_CLUSTER_DISTRIBUTION 0
126 /* Number of final clusters
130 /* Coordinates (columns) for each cluster (rows)
131 * valid indices: [0...iNClusters][0...dim-1]
133 double **ppdClusterCenters;
135 /* Dimensionality of input data and cluster center coordinates
139 /* Number of objects per cluster
141 int *piNObjsPerCluster;
143 /* Objects (indices) for each cluster. [i][j] will point to (index
144 * of) object j in cluster i
146 int **ppiObjIndicesPerCluster;
147 } bisecting_kmeans_result_t;
152 * @brief Free KMeans result structure.
154 * @param[out] prResult_p
155 * K-Means result to free
157 * @see NewKMeansResult()
160 FreeKMeansResult(bisecting_kmeans_result_t **prResult_p)
163 CKFREE((*prResult_p)->piNObjsPerCluster);
164 for (iAux=0; iAux<(*prResult_p)->iNClusters; iAux++) {
165 CKFREE((*prResult_p)->ppiObjIndicesPerCluster[iAux]);
166 CKFREE((*prResult_p)->ppdClusterCenters[iAux]);
168 CKFREE((*prResult_p)->ppiObjIndicesPerCluster);
169 CKFREE((*prResult_p)->ppdClusterCenters);
170 (*prResult_p)->iNClusters = 0;
171 (*prResult_p)->iDim = 0;
174 /*** end: FreeKMeansResult() ***/
179 * @brief Allocate new KMeans result
181 * @param[out] prKMeansResult_p
182 * K-Means result to free
184 * @see NewKMeansResult()
187 NewKMeansResult(bisecting_kmeans_result_t **prKMeansResult_p)
189 (*prKMeansResult_p) = (bisecting_kmeans_result_t *)
190 CKCALLOC(1, sizeof(bisecting_kmeans_result_t));
191 (*prKMeansResult_p)->iNClusters = 0;
192 (*prKMeansResult_p)->iDim = 0;
193 (*prKMeansResult_p)->ppdClusterCenters = NULL;
194 (*prKMeansResult_p)->piNObjsPerCluster = NULL;
195 (*prKMeansResult_p)->ppiObjIndicesPerCluster = NULL;
198 /*** end: NewKMeansResult() ***/
203 * @brief Calculate the euclidean distance between two vectors
206 * First vector with dim dimensions
208 * Second vector with dim dimensions
210 * Dimensionality of v1 and v2
212 * @return euclidean distance as double
214 * @note Could probably be inlined. Also perfect for SSE
217 EuclDist(const double *v1, const double *v2, const int dim)
220 double dist; /* returned distance */
226 for (i=0; i<dim; i++) {
227 dist += pow(v1[i]-v2[i], 2.0);
232 /*** end: EuclDist() ***/
237 * @brief Calculate the cosine distance between two vectors
240 * First vector with dim dimensions
242 * Second vector with dim dimensions
244 * Dimensionality of v1 and v2
246 * @return cosine distance as double
248 * @note Could probably be inlined. Also perfect for SSE
251 CosDist(const double *v1, const double *v2, const int dim)
254 double dist; /* returned distance */
263 for (i=0; i<dim; i++) {
265 sq1 += pow(v1[i], 2.0);
266 sq2 += pow(v2[i], 2.0);
271 if ((sq1 * sq2) < DBL_EPSILON) {
272 dist = 1 - s / (sq1 * sq2);
274 /* if the denominator gets small, the fraction gets big, hence dist
280 /*** end: CosDist() ***/
285 * @brief convert sequences into mbed-like (distance) vector
286 * representation. Seeds (prMSeq sequence indices) have to be picked before
288 * @param[out] ppdSeqVec
289 * Pointer to preallocated matrix of size nseqs x iSeeds
291 * Sequences which are to be converted
293 * Array of sequences in prMSeq which are to be used as seeds.
294 * @param[in] iNumSeeds
295 * Number of seeds/elements in piSeeds
296 * @param[in] iPairDistType
297 * Type of pairwise distance comparison
301 SeqToVec(double **ppdSeqVec, mseq_t *prMSeq,
302 int *piSeeds, const int iNumSeeds,
303 const int iPairDistType)
305 symmatrix_t *prDistmat = NULL;
308 /* indices for restoring order */
310 /* sorted copy of piSeeds */
314 Stopwatch_t *stopwatch = StopwatchCreate();
315 StopwatchZero(stopwatch);
316 StopwatchStart(stopwatch);
319 assert(NULL != ppdSeqVec);
320 assert(iNumSeeds>0 && iNumSeeds<=prMSeq->nseqs);
322 piSortedSeeds = CKMALLOC(iNumSeeds * sizeof(int));
323 memcpy(piSortedSeeds, piSeeds, iNumSeeds*sizeof(int));
325 /* need them sorted, otherwise the swapping approach below might
328 qsort(piSortedSeeds, iNumSeeds, sizeof(int), IntCmp);
331 /* rearrange mseq order so that we can use ktuple_pairdist code as
332 * is. That is, swap seeds and non-seeds such that the seeds end
333 * up in front of mseq. This way we can use the KTuplePairDist()
334 * code, without making a copy of mseq. Also, keep an array of
335 * indices which serves to restore the original sequence order
336 * after putting the seeds in front
339 restore = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int));
340 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
341 restore[iSeqIndex] = iSeqIndex;
343 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
345 Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %u", piSortedSeeds[iSeedIndex], iSeedIndex);
347 if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
349 SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
351 swap = restore[iSeedIndex];
352 restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
353 restore[piSortedSeeds[iSeedIndex]] = swap;
357 printf("DEBUG(%s|%s():%d): restore indices =",
358 __FILE__, __FUNCTION__, __LINE__);
359 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
360 printf(" %u:%u", iSeqIndex, restore[iSeqIndex]);
364 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
365 Log(&rLog, LOG_FORCED_DEBUG, "swapped seq no %d now: seq = %s",
366 iSeqIndex, prMSeq->sqinfo[iSeqIndex].name);
371 /* convert sequences into vectors of distances by calling pairwise
372 * distance function for each seq/seed pair
375 if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType,
376 0, iNumSeeds, 0, prMSeq->nseqs,
378 Log(&rLog, LOG_ERROR, "Could not compute pairwise distances for mbed.");
379 FreeSymMatrix(&prDistmat);
380 CKFREE(piSortedSeeds);
387 labels = (char **) CKMALLOC(prMSeq->nseqs * sizeof(char *));
388 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
389 labels[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].name;
391 SymMatrixPrint(prDistmat, labels, NULL);
397 /* update ppdSeqVec according to prDistmat. keep in mind that we
401 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
402 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
403 ppdSeqVec[restore[iSeqIndex]][iSeedIndex] =
404 SymMatrixGetValue(prDistmat, iSeqIndex, iSeedIndex);
409 /* restore mseq order by reverse swapping
411 * need reverse order now, so start from top index
413 iSeedIndex=iNumSeeds-1;
416 Log(&rLog, LOG_FORCED_DEBUG, "Swapping seqs %d and %d", piSortedSeeds[iSeedIndex], iSeedIndex);
418 if (piSortedSeeds[iSeedIndex]!=iSeedIndex) {
421 SeqSwap(prMSeq, piSortedSeeds[iSeedIndex], iSeedIndex);
423 swap = restore[iSeedIndex];
424 restore[iSeedIndex] = restore[piSortedSeeds[iSeedIndex]];
425 restore[piSortedSeeds[iSeedIndex]] = swap;
427 } while (0 != iSeedIndex--);
429 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
430 Log(&rLog, LOG_FORCED_DEBUG, "restored seq no %d: seq = %s %s",
431 iSeqIndex, prMSeq->sqinfo[iSeqIndex].name, prMSeq->seq[iSeqIndex]);
436 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
437 printf("DEBUG: seq %-4u as distance vector =", iSeqIndex);
438 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
439 printf(" %f", ppdSeqVec[iSeqIndex][iSeedIndex]);
446 FreeSymMatrix(&prDistmat);
449 StopwatchStop(stopwatch);
450 StopwatchDisplay(stdout, "Total time for SeqToVec(): ", stopwatch);
451 StopwatchFree(stopwatch);
457 /*** end: SeqToVec() ***/
462 * @brief Select seeds to be used from an prMSeq
464 * @param[out] piSeeds
465 * Will store the indices of prMSeqs seqs used to be as seeds here. Must be preallocated.
466 * @param[in] iNumSeeds
467 * Number of seeds to be picked
468 * @param[in] iSelectionMethod
469 * Seed selection method to be used
471 * The prMSeq structure to pick sequences from
473 * @return: Non-zero on failure
477 SeedSelection(int *piSeeds, int iNumSeeds, int iSelectionMethod, mseq_t *prMSeq)
481 /* sequence iterator */
484 if (SELECT_SEEDS_RANDOMLY == iSelectionMethod) {
488 "Using %d seeds (randomly chosen) for mBed (from a total of %d sequences)",
489 iNumSeeds, prMSeq->nseqs);
491 PermutationArray(&piPermArray, iNumSeeds);
492 /* copy to piSeeds */
493 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
494 piSeeds[iSeedIndex] = piPermArray[iSeedIndex];
498 } else if (SELECT_SEEDS_BY_LENGTH == iSelectionMethod) {
500 /* step size for picking with constant stride */
502 int *piSeqLen = CKMALLOC(prMSeq->nseqs * sizeof(int));
503 int *piOrder = CKMALLOC(prMSeq->nseqs * sizeof(int));
506 "Using %d seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of %d sequences)",
507 iNumSeeds, prMSeq->nseqs);
509 iStep = prMSeq->nseqs / iNumSeeds; /* iStep will never get too big
511 /* first get an array of seq indices order according to
512 corresponding sequence length: piOrder */
513 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
514 piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len;
516 QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, 'd', FALSE);
518 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
519 Log(&rLog, LOG_FORCED_DEBUG, "Descending order (no %d): seq %d has len %d)",
520 iSeqIndex, piOrder[iSeqIndex], piSeqLen[piOrder[iSeqIndex]]);
525 for (iSeqIndex=0; iSeedIndex<iNumSeeds; iSeqIndex+=iStep) {
526 piSeeds[iSeedIndex++] = piOrder[iSeqIndex];
532 Log(&rLog, LOG_ERROR, "Internal error: unknown seed selection type");
537 #ifdef PICK_SEEDS_FROM_KMEANS_CLUSTERS
538 /* do initial mbedding and kmeans. then pick seeds from each cluster and
539 do full mbed with these seeds. idea is that those seeds represent the
540 sequence space better than random seeds. however, this reduces the
541 quality slightly. random is almost always better.
545 /* seqs represented as distance vectors */
548 /* assignments for each seq to the K newly created clusters */
549 int *piKMeansClusterAssignments = CKMALLOC(prMSeq->nseqs * sizeof(int));
550 double *pdKMeansClusterCenters = CKCALLOC(iNumSeeds * iNumSeeds, sizeof(double));
551 /* create a copy of ppdSeqVec suitable for KMeans */
552 double *pdKMeansVectors = CKMALLOC(prMSeq->nseqs * iNumSeeds * sizeof(double));
553 bool *pbClusterUsed = CKCALLOC(iNumSeeds, sizeof(bool));
555 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Experimental feature: K-means on first round embedding to get better seeds");
556 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Reuse seeds from clusters");
557 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Could try log(n) instead of log(n)^2 in first round");
558 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME hardcoded iPairDistType PAIRDIST_KTUPLE");
559 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Show that this is fast *and* good");
562 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Overriding iNumSeeds");
563 iNumSeeds = (int) pow(log2((double)prMSeq->nseqs), 2);
566 ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
567 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
568 ppdSeqVec[iSeqIndex] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
570 if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, PAIRDIST_KTUPLE)) {
571 Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
575 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
577 for (iDim=0; iDim<iNumSeeds; ++iDim) {
578 pdKMeansVectors[iDim*iSeqIndex + iDim] = ppdSeqVec[iSeqIndex][iDim];
583 Log(&rLog, LOG_FORCED_DEBUG, "%s\n", "FIXME hardcoded RESTARTS_PER_SPLIT");
584 dCost = KMeans(prMSeq->nseqs, iNumSeeds, iNumSeeds,
586 RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
587 pdKMeansClusterCenters, piKMeansClusterAssignments);
588 Log(&rLog, LOG_FORCED_DEBUG, "Best split cost = %f", dCost);
590 Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Check for Nan in cluster centers");
593 Log(&rLog, LOG_FORCED_DEBUG, "%s", "K-Means output:");
594 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
595 Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u (%s) to cluster %d",
596 iSeqIndex, prMSeq->sqinfo[iSeqIndex].name,
597 piKMeansClusterAssignments[iSeqIndex]);
601 Log(&rLog, LOG_FORCED_DEBUG, "FIXME %s", "proof of concept implementation: Pick first sequences from each clusters instead of reusing seeds");
603 for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
604 int iAssignedCluster = piKMeansClusterAssignments[iSeqIndex];
605 if (pbClusterUsed[iAssignedCluster]) {
608 /*LOG_DEBUG("Picked seed %d from cluster %d", iSeqIndex,iAssignedCluster);*/
609 piSeeds[iSeedIndex++] = iSeqIndex;
610 pbClusterUsed[iAssignedCluster] = TRUE;
613 CKFREE(pbClusterUsed);
615 CKFREE(pdKMeansVectors);
616 CKFREE(pdKMeansClusterCenters);
617 CKFREE(piKMeansClusterAssignments);
624 if (LOG_DEBUG <= rLog.iLogLevelEnabled) {
625 for (iSeedIndex=0; iSeedIndex<iNumSeeds; iSeedIndex++) {
626 Log(&rLog, LOG_DEBUG, "Picked sequence %d (%s) as seed no %d",
627 piSeeds[iSeedIndex], prMSeq->sqinfo[piSeeds[iSeedIndex]].name, iSeedIndex);
633 /* end of SeedSelection() */
639 * @brief Bisecting K-Means clustering. Repeatedly calls K-Means with
640 * a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster.
642 * @param[out] prKMeansResult_p
643 * Result of Bisecting KMeans. Will be allocated here.
644 * Caller has to free. See @see FreeKMeansResult()
646 * Number of objects/sequences to cluster
648 * Dimensionality of input data
649 * @param[in] ppdVectors
650 * each row holds iDim points for this object's coordinates
651 * @param[in] iMinRequiredObjsPerCluster
652 * Minimum number of objects per Cluster (inclusive)/
653 * @param[in] iMaxAllowedObjsPerCluster
654 * Maximum number of objects per Cluster (inclusive). Function returns once no
655 * cluster contains more then this number of objects. Soft limit!
657 * @note Convoluted code. Could use some restructuring. My apologies.
662 BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
663 const int iNObjs, const int iDim, double **ppdVectors,
664 const int iMinRequiredObjsPerCluster,
665 const int iMaxAllowedObjsPerCluster)
668 /* cluster centers for each cluster created at each split */
669 double *pdKClusterCenters;
670 /* keep track of updated object indices per newly created
672 int *piCurObjToUpdate;
673 /* number of times we called 2-means */
675 /* flag for unsuccessful cluster split */
676 bool bNaNDetected = FALSE;
677 /* flag for detected small cluster after split */
678 bool bSmallClusterDetected;
679 /* queue of clusters which are to be split */
680 int_queue_t rClusterSplitQueue;
683 Stopwatch_t *stopwatch = StopwatchCreate();
687 piCurObjToUpdate = (int *) CKMALLOC(2 * sizeof(int));
689 NewKMeansResult(prKMeansResult_p);
691 /* new cluster centers created at each split/KMeans run
693 pdKClusterCenters = (double *) CKCALLOC(2 * iDim, sizeof(double));
695 /* init results by setting a first cluster that contains all objects
697 (*prKMeansResult_p)->iNClusters = 1;
698 (*prKMeansResult_p)->iDim = iDim;
699 /* fake center coordinates of first cluster */
700 (*prKMeansResult_p)->ppdClusterCenters =
701 (double **) CKMALLOC(1 * sizeof(double *));
702 (*prKMeansResult_p)->ppdClusterCenters[0] =
703 (double *) CKMALLOC(iDim * sizeof(double));
704 /* objects per cluster */
705 (*prKMeansResult_p)->piNObjsPerCluster =
706 (int *) CKMALLOC(1 * sizeof(int));
707 (*prKMeansResult_p)->piNObjsPerCluster[0] = iNObjs;
708 /* object indices per cluster */
709 (*prKMeansResult_p)->ppiObjIndicesPerCluster =
710 (int **) CKMALLOC(1 * sizeof(int *));
711 (*prKMeansResult_p)->ppiObjIndicesPerCluster[0] = (int *)
712 CKMALLOC(iNObjs * sizeof(int));
713 for (iN=0; iN<iNObjs; iN++) {
714 (*prKMeansResult_p)->ppiObjIndicesPerCluster[0][iN] = iN;
719 /* Starting with the first cluster that now contains all the
720 * sequences keep splitting until no cluster contains more than
721 * iMaxAllowedObjsPerCluster
723 * Keep a queue of clusters (rClusterSplitQueue) to split
725 * At each split values/memory of the just split cluster will be
726 * reused and exactly one new only allocated.
729 * Given the following cluster assignments
730 * 0 0 0 0 1 1 2 2 2 3 3 3 3 3 3 3 4 4
731 * and a K-Means split in cluster 3 at |:
732 * 0 0 0 0 1 1 2 2 2 3 3 3 | 3 3 3 3 4 4
733 * The result should be this:
734 * 0 0 0 0 1 1 2 2 2 3 3 3 | 5 5 5 5 4 4
738 INT_QUEUE_INIT(&rClusterSplitQueue);
739 if (iNObjs>iMaxAllowedObjsPerCluster) {
740 /* pish fake first cluster index */
741 INT_QUEUE_PUSH(&rClusterSplitQueue, 0);
744 while (! INT_QUEUE_EMPTY(&rClusterSplitQueue)) {
745 /* assignments for each seq to the K newly created clusters */
746 int *piKClusterAssignments;
747 /* number of objects in cluster that is to be split */
748 int iNObjsInClusterToSplit;
749 /* coordinates of points in cluster that is to be split
750 * array of size n*d where [d*i + j] gives coordinate j of point i
752 double *pdVectorsInClusterToSplit;
753 /* copy of object indices in split cluster */
754 int *piObjIndicesOfSplitCluster;
755 /* best cost of kmeans rounds */
758 /* indices for the two created clusters
760 /* index of cluster to split */
762 /* index of newly created cluster at each round. data for the
763 other created cluster goes to the one just split,
764 i.e. (*piClusterToSplot) */
768 StopwatchZero(stopwatch);
769 StopwatchStart(stopwatch);
772 INT_QUEUE_POP(&rClusterSplitQueue, &iClusterToSplot);
774 iNObjsInClusterToSplit = (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot];
775 piKClusterAssignments = (int *)
776 CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
778 pdVectorsInClusterToSplit = (double *)
779 CKMALLOC(iNObjsInClusterToSplit * iDim * sizeof(double));
780 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
781 for (iD=0; iD<iDim; ++iD) {
783 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN];
784 pdVectorsInClusterToSplit[iDim*iN + iD] = ppdVectors[iThisObjIdx][iD];
789 Log(&rLog, LOG_FOCRED_DEBUG, "Round %d: Will split cluster %d which has %d objects",
790 iNRounds, iClusterToSplot, iNObjsInClusterToSplit);
791 fprintf(stderr, "DEBUG(%s|%s():%d): Object indices in cluster to split are:",
792 __FILE__, __FUNCTION__, __LINE__);
793 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
794 fprintf(stderr, " %u",
795 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
797 fprintf(stderr, "\n");
798 (void) fflush(stderr);
802 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
804 "DEBUG(%s|%s():%d): Coordinate of object %u (real index %u) in cluster to split:",
805 __FILE__, __FUNCTION__, __LINE__, iN,
806 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot][iN]);
807 for (iD=0; iD<iDim; iD++) {
808 fprintf(stderr, " %f", pdVectorsInClusterToSplit[iDim*iN + iD]);
810 fprintf(stderr, "\n");
811 (void) fflush(stderr);
815 /* KMeans(1 "The number of points in the data set",
816 * 2 "The number of clusters to look for",
817 * 3 "The number of dimensions that the data set lives in",
818 * 4 "points: An array of size n*d where points[d*i + j] gives coordinate j of point i",
819 * 5 "attempts: The number of times to independently run k-means",
820 * 6 "use_lloyds_method: uses kmpp if false, otherwise lloyds method",
821 * 7 "centers: This can either be null or an array of size k*d.
822 * In the latter case, centers[d*i + j] will give coordinate j of center i.
823 * If the cluster is unused, it will contain NaN instead.",
824 * 8 "assignments: This can either be null or an array of size n.
825 * In the latter case, it will be filled with the cluster that each point is assigned to
826 * (an integer between 0 and k-1 inclusive).");
828 dCost = KMeans(iNObjsInClusterToSplit, 2, iDim,
829 pdVectorsInClusterToSplit,
830 RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS,
831 pdKClusterCenters, piKClusterAssignments);
834 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Raw K-Means output:");
835 for (iN=0; iN<2; iN++) {
836 fprintf(stderr, "DEBUG(%s|%s():%d): Cluster Center %u =",
837 __FILE__, __FUNCTION__, __LINE__, iN);
838 for (iD=0; iD<iDim; iD++) {
839 fprintf(stderr, " %f", pdKClusterCenters[iN*iDim+iD]);
841 fprintf(stderr, "\n");
842 (void) fflush(stderr);
844 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
845 Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u to cluster %d (of #%u)",
846 iN, piKClusterAssignments[iN], 2);
850 /* real index of one of the newly created clusters. the other
851 * one is iClusterToSplot */
852 iNewClusterIdx = (*prKMeansResult_p)->iNClusters;
855 /* We don't want Clusters which are too small. Check here if a
856 * split created a small cluster and if yes, discard the
857 * solution. Because the cluster has already been removed from
858 * the queue, we can just continue.
860 bSmallClusterDetected = FALSE;
861 if (iMinRequiredObjsPerCluster>1) {
862 int iNObjsCluster[2];
863 iNObjsCluster[0] = 0; /* first cluster */
864 iNObjsCluster[1] = 0; /* second cluster */
865 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
866 iNObjsCluster[piKClusterAssignments[iN]]+=1;
869 if (iNObjsCluster[0]<iMinRequiredObjsPerCluster
871 iNObjsCluster[1]<iMinRequiredObjsPerCluster) {
872 bSmallClusterDetected = TRUE;
873 Log(&rLog, LOG_FORCED_DEBUG, "Skipping this split because objs in 1st/2nd cluster = %d/%d < %d",
874 iNObjsCluster[0], iNObjsCluster[1], iMinRequiredObjsPerCluster);
878 /* Same logic as for small clusters applies if KMeans couldn't
879 * split the cluster. In this case one of its center
880 * coordinates will be NaN, which we check for in the
883 if (! bSmallClusterDetected) {
884 for (iN=0; iN<2; iN++) {
885 bNaNDetected = FALSE;
886 for (iD=0; iD<iDim; iD++) {
887 if (0 != isnan(pdKClusterCenters[iN*iDim+iN])) {
888 /* Got NaN as coordinate after splitting */
889 Log(&rLog, LOG_WARN, "%s(): Can't split cluster no. %d which has %d objects any further. %s",
891 iClusterToSplot, iNObjsInClusterToSplit,
892 "Hope it's not too big and doesn't slow things down."
904 /* Discarding split of this cluster. It has been removed from the
905 * queue already. so just continue
907 if (bNaNDetected || bSmallClusterDetected) {
908 CKFREE(piKClusterAssignments);
909 CKFREE(pdVectorsInClusterToSplit);
914 /* update cluster centers: pdClusterCenters
917 /* reuse memory of existing/old/split cluster
919 for (iN=0; iN<iDim; iN++) {
920 double dCoord = pdKClusterCenters[0*iDim+iN];
921 (*prKMeansResult_p)->ppdClusterCenters[iClusterToSplot][iN] = dCoord;
923 /* realloc and set new one
925 (*prKMeansResult_p)->ppdClusterCenters = (double **)
926 CKREALLOC((*prKMeansResult_p)->ppdClusterCenters,
927 ((*prKMeansResult_p)->iNClusters+1) * sizeof(double *));
928 (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx] = (double *)
929 CKMALLOC(iDim * sizeof(double));
930 for (iD=0; iD<iDim; iD++) {
931 double dCoord = pdKClusterCenters[1*iDim+iD];
932 (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx][iD] = dCoord;
935 Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of cluster centers done. Cluster centers so far:");
936 for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
937 fprintf(stderr, "DEBUG(%s|%s():%d): Center %u =",
938 __FILE__, __FUNCTION__, __LINE__, iN);
939 for (iD=0; iD<iDim; iD++) {
940 fprintf(stderr, " %f", (*prKMeansResult_p)->ppdClusterCenters[iN][iD]);
942 fprintf(stderr, "\n");
943 (void) fflush(stderr);
948 /* update #seqs per cluster: piNObjsPerCluster
951 (*prKMeansResult_p)->piNObjsPerCluster = (int *)
952 CKREALLOC((*prKMeansResult_p)->piNObjsPerCluster,
953 ((*prKMeansResult_p)->iNClusters+1) * sizeof(int));
954 /* init new and old one to zero */
955 (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] = 0;
956 (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] = 0;
957 /* now update values */
958 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
959 if (0 == piKClusterAssignments[iN]) {
960 (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] += 1;
961 } else if (1 == piKClusterAssignments[iN]) {
962 (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] += 1;
964 /* there used to be code for iK>=2 in r101 */
965 Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
966 piKClusterAssignments[iN]);
970 Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of NObjs per cluster done:");
971 for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
972 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d contains %d sequences",
973 iN, (*prKMeansResult_p)->piNObjsPerCluster[iN]);
976 /* queue clusters if still they are still too big
978 if ((*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] > iMaxAllowedObjsPerCluster) {
979 INT_QUEUE_PUSH(&rClusterSplitQueue, iClusterToSplot);
981 if ((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] > iMaxAllowedObjsPerCluster) {
982 INT_QUEUE_PUSH(&rClusterSplitQueue, iNewClusterIdx);
987 /* update which objects belong to which cluster:
989 * note: piNObjsPerCluster needs to be already updated
992 /* create a copy of the object indices in the split cluster
993 * (original will be overwritten)
995 piObjIndicesOfSplitCluster = (int *) CKMALLOC(iNObjsInClusterToSplit * sizeof(int));
996 memcpy(piObjIndicesOfSplitCluster,
997 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
998 iNObjsInClusterToSplit * sizeof(int));
1000 (*prKMeansResult_p)->ppiObjIndicesPerCluster = (int **)
1001 CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster,
1002 ((*prKMeansResult_p)->iNClusters+1) * sizeof(int *));
1005 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot] =
1006 (int *) CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot],
1007 (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] * sizeof(int));
1009 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iNewClusterIdx] =
1010 (int *) CKMALLOC((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] * sizeof(int));
1013 /* now reassign the object indices to the assigned cluster
1015 piCurObjToUpdate[0] = 0;
1016 piCurObjToUpdate[1] = 0;
1017 for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
1018 int iThisObjIdx = piObjIndicesOfSplitCluster[iN];
1019 int iThisClusterAssignment = piKClusterAssignments[iN];
1020 int iThisOffset = piCurObjToUpdate[iThisClusterAssignment];
1021 int iThisClusterIndex = 0;
1023 if (0 == iThisClusterAssignment) {
1024 iThisClusterIndex = iClusterToSplot;
1025 } else if (1 == iThisClusterAssignment) {
1026 iThisClusterIndex = iNewClusterIdx;
1028 /* there used to be code for iK>=2 in r101 */
1029 Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
1030 piKClusterAssignments[iN]);
1033 Log(&rLog, LOG_FORCED_DEBUG, "Setting (*prKMeansResult_p)->ppiObjIndicesPerCluster[%d][%d] = %d",
1034 iThisClusterIndex, iThisOffset, iThisObjIdx);
1036 (*prKMeansResult_p)->ppiObjIndicesPerCluster[iThisClusterIndex][iThisOffset] = iThisObjIdx;
1037 piCurObjToUpdate[iThisClusterAssignment]+=1;
1039 CKFREE(piObjIndicesOfSplitCluster);
1041 for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) {
1043 fprintf(stderr, "DEBUG(%s|%s():%d): Objects in cluster %u: ",
1044 __FILE__, __FUNCTION__, __LINE__, iN);
1045 for (iObj=0; iObj<(*prKMeansResult_p)->piNObjsPerCluster[iN]; iObj++) {
1046 fprintf(stderr, " %u", (*prKMeansResult_p)->ppiObjIndicesPerCluster[iN][iObj]);
1048 fprintf(stderr, "\n");
1049 (void) fflush(stderr);
1055 StopwatchStop(stopwatch);
1056 StopwatchDisplay(stdout, "Total time after next round in Bisecting-KMeans: ", stopwatch);
1060 /* finally: increase number of clusters
1062 (*prKMeansResult_p)->iNClusters += 1;
1064 CKFREE(piKClusterAssignments);
1065 CKFREE(pdVectorsInClusterToSplit);
1068 INT_QUEUE_DESTROY(&rClusterSplitQueue);
1071 Log(&rLog, LOG_DEBUG,
1072 "Bisecting K-means finished after %d rounds (no more clusters to split)",
1076 StopwatchFree(stopwatch);
1079 /* @note could use progress/timer */
1081 CKFREE(pdKClusterCenters);
1082 CKFREE(piCurObjToUpdate);
1086 /*** end: BisectingKmeans() ***/
1092 * @brief From scratch reimplementation of mBed: Blackshields et al.
1093 * (2010); PMID 20470396.
1095 * Idea is a follows:
1096 * - convert sequences into vectors of distances
1097 * - cluster the vectors using k-means
1098 * - cluster each of the k clusters using upgma (used cached distances
1100 * - join the sub-clusters to create on tree (use UPGMA on k-means
1104 * @param[out] prMbedTree_p
1105 * Created upgma tree. will be allocated here. use FreeMuscleTree()
1108 * Multiple sequences
1109 * @param[in] iPairDistType
1110 * Distance measure for pairwise alignments
1111 * @param[in] pcGuidetreeOut
1112 * Passed down to GuideTreeUpgma()
1114 * @return Zero on success, non-zero on error
1118 Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
1119 const char *pcGuidetreeOut)
1121 /* number of seeds */
1123 /* seed indices matching prMSeq */
1125 /* seqs represented as distance vectors */
1128 bisecting_kmeans_result_t *prKMeansResult = NULL;
1129 /* distance matrix of kmeans (pre-)cluster centers */
1130 symmatrix_t *prPreClusterDistmat = NULL;
1131 /* auxiliary for symmetric matrix output; tree routines etc */
1134 /* mapping of cluster-center tree node indices to corresponding
1136 int *piClusterToTreeNode;
1140 progress_t *prSubClusterDistanceProgress;
1141 bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
1143 #if FULL_WITHIN_CLUSTER_DISTANCES
1144 Log(&rLog, LOG_DEBUG, "Computing real distances within subclusters for mBed.");
1146 Log(&rLog, LOG_DEBUG, "Computing vector distances within subclusters for mBed.");
1150 Stopwatch_t *stopwatch = StopwatchCreate();
1153 assert(NULL != prMbedTree_p);
1154 assert(NULL != prMSeq);
1157 iNumSeeds = (int) NUMBER_OF_SEEDS(prMSeq->nseqs);
1158 if (iNumSeeds >= prMSeq->nseqs) {
1159 /* -1 is condition for RandomUniqueIntArray */
1160 iNumSeeds = prMSeq->nseqs-1;
1161 Log(&rLog, LOG_DEBUG,
1162 "Automatically determined number of seeds is bigger (or equal) the number of sequences. Will set it to %d",
1167 /* Turn sequences into vectors of distances to the seeds
1170 piSeeds = (int *) CKMALLOC(iNumSeeds * sizeof(int));
1171 if (0 != SeedSelection(piSeeds, iNumSeeds, SEED_SELECTION, prMSeq)) {
1172 Log(&rLog, LOG_ERROR, "Something went wrong during seed selection for mbed");
1175 ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *));
1176 for (iI=0; iI<prMSeq->nseqs; iI++) {
1177 ppdSeqVec[iI] = (double *) CKMALLOC(iNumSeeds * sizeof(double));
1179 if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, iPairDistType)) {
1180 Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed");
1186 /* Calculate (pre-)clusters of sequence vectors by applying
1191 /* start clock only here, to make sure we don't include pairwise
1192 * distance computation */
1193 StopwatchZero(stopwatch);
1194 StopwatchStart(stopwatch);
1197 BisectingKmeans(&prKMeansResult, prMSeq->nseqs, iNumSeeds, ppdSeqVec,
1198 MIN_REQUIRED_SEQ_PER_PRECLUSTER,
1199 MAX_ALLOWED_SEQ_PER_PRECLUSTER);
1200 Log(&rLog, LOG_INFO,
1201 "mBed created %u cluster/s (with a minimum of %d and a soft maximum of %d sequences each)",
1202 prKMeansResult->iNClusters,
1203 MIN_REQUIRED_SEQ_PER_PRECLUSTER,
1204 MAX_ALLOWED_SEQ_PER_PRECLUSTER);
1207 #if PRINT_CLUSTER_DISTRIBUTION
1208 Log(&rLog, LOG_FORCED_DEBUG, "Bisecting Kmeans returned %d clusters", prKMeansResult->iNClusters);
1209 for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1212 Log(&rLog, LOG_FORCED_DEBUG, "Diagnostic output for cluster %d follows:", iI);
1213 fprintf(stderr, "DEBUG(%s|%s():%d): center coordinates =",
1214 __FILE__, __FUNCTION__, __LINE__);
1215 for (iD=0; iD<iNumSeeds; iD++) {
1216 fprintf(stderr, " %f", prKMeansResult->ppdClusterCenters[iI][iD]);
1218 fprintf(stderr, "\n");
1221 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d has %d objects assigned",
1222 iI, prKMeansResult->piNObjsPerCluster[iI]);
1224 for (iJ=0; iJ<prKMeansResult->piNObjsPerCluster[iI]; iJ++) {
1225 int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][iJ];
1227 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %u: object %u has index %u (= seq %s)",
1228 iI, iJ, iRealIndex, prMSeq->sqinfo[iRealIndex].name);
1235 /* Cluster pre-clusters produced by k-means.
1237 * Do this by calculating the vector distances of the cluster
1238 * centers and applying UPGMA.
1240 * @note could try to force-balance the tree here
1243 if (0 != NewSymMatrix(&prPreClusterDistmat,
1244 prKMeansResult->iNClusters, prKMeansResult->iNClusters)) {
1245 Log(&rLog, LOG_FATAL, "%s", "Memory allocation for pre-cluster distance-matrix failed");
1247 for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1248 for (iJ=iI+1; iJ<prKMeansResult->iNClusters; iJ++) {
1250 dDist = EuclDist(prKMeansResult->ppdClusterCenters[iI],
1251 prKMeansResult->ppdClusterCenters[iJ],
1253 SymMatrixSetValue(prPreClusterDistmat, iI, iJ, dDist);
1254 /* Log(&rLog, LOG_FORCED_DEBUG, "Euclidean distance between clusters %d and %d = %f",
1260 /* labels needed for the guide tree building routine only */
1261 ppcLabels = (char **) CKMALLOC(prKMeansResult->iNClusters * sizeof(char*));
1262 for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1263 ppcLabels[iI] = (char *) CKMALLOC(32 * sizeof(char));
1264 (void) snprintf(ppcLabels[iI], 32, "Subcluster-%u", iI);
1267 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Distance matrix for pre-cluster centers:");
1268 SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL);
1271 GuideTreeUpgma(prMbedTree_p,
1272 ppcLabels, prPreClusterDistmat, NULL);
1274 for (iI=0; iI<prKMeansResult->iNClusters; iI++) {
1275 CKFREE(ppcLabels[iI]);
1279 Log(&rLog, LOG_FORCED_DEBUG, "%s", "Cluster-center guide-tree:");
1280 LogTree(*prMbedTree_p);
1285 /* Now replace each leaf in the pre-cluster-center tree
1286 * appropriately, i.e. with the corresponding sub-cluster.
1288 * For each leaf/sub-cluster, create a distance matrix for the
1289 * corresponding sequences. Use distances between vectors as
1290 * approximated distances between sequences. Then create a
1291 * guide-tree by applying UPGMA.
1295 /* Get a mapping of (pre)cluster number and leaf-node indices in
1296 * the cluster-center tree. We can add trees to prMbedTree_p
1297 * because AppendTrees() guarantees that no other than the node to
1298 * append to changes.
1300 piClusterToTreeNode = (int*)
1301 CKMALLOC(prKMeansResult->iNClusters * sizeof(int));
1302 iNodeIndex = FirstDepthFirstNode(*prMbedTree_p);
1304 if (IsLeaf(iNodeIndex, *prMbedTree_p)) {
1305 int iLeafId = GetLeafId(iNodeIndex, *prMbedTree_p);
1306 piClusterToTreeNode[iLeafId] = iNodeIndex;
1308 iNodeIndex = NextDepthFirstNode(iNodeIndex, *prMbedTree_p);
1309 } while (NULL_NEIGHBOR != iNodeIndex);
1314 /* Now step through all the leafs and replace them with the
1315 * corresponding sub-trees
1317 NewProgress(&prSubClusterDistanceProgress, LogGetFP(&rLog, LOG_INFO),
1318 "Distance calculation within sub-clusters", bPrintCR);
1319 /* for each cluster */
1320 for (iClusterIndex=0;
1321 iClusterIndex < prKMeansResult->iNClusters; iClusterIndex++) {
1322 /* distance matrix for the sub-cluster */
1323 symmatrix_t *prWithinClusterDistances = NULL;
1325 tree_t *prSubClusterTree = NULL;
1327 ProgressLog(prSubClusterDistanceProgress,
1328 iClusterIndex, prKMeansResult->iNClusters, FALSE);
1330 #if FULL_WITHIN_CLUSTER_DISTANCES
1331 mseq_t *prSubClusterMSeq;
1336 Log(&rLog, LOG_DEBUG,
1337 "%s\n", "Calling new Mbed use makes only sense if nseq>MAX_ALLOWED_SEQ_PER_PRECLUSTER");
1339 if (TRUE == prMSeq->aligned) {
1340 iPairDistType = PAIRDIST_SQUIDID_KIMURA;
1342 iPairDistType = PAIRDIST_KTUPLE;
1346 iNSeqInCluster = prKMeansResult->piNObjsPerCluster[iClusterIndex];
1348 Log(&rLog, LOG_FORCED_DEBUG, "#seqs in subcluster no %d = %d",
1349 iClusterIndex, iNSeqInCluster);
1352 #if FULL_WITHIN_CLUSTER_DISTANCES
1353 /* create an mseq structure for sequences in this cluster
1354 * don't need most of the members (e.g. orig_seq) but copy
1355 * them anyway for the sake of completeness
1357 NewMSeq(&prSubClusterMSeq);
1359 prSubClusterMSeq->nseqs = iNSeqInCluster;
1360 prSubClusterMSeq->seqtype = prMSeq->seqtype;
1361 if (NULL!=prMSeq->filename) {
1362 prSubClusterMSeq->filename = CkStrdup(prMSeq->filename);
1364 prSubClusterMSeq->aligned = prMSeq->aligned; /* FS, r252 */
1365 prSubClusterMSeq->seq = (char **)
1366 CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
1367 prSubClusterMSeq->orig_seq = (char **)
1368 CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *));
1369 prSubClusterMSeq->sqinfo = (SQINFO *)
1370 CKMALLOC(prSubClusterMSeq->nseqs * sizeof(SQINFO));
1372 for (iSeqIndex=0; iSeqIndex<iNSeqInCluster; iSeqIndex++) {
1373 int iRealSeqIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iSeqIndex];
1374 prSubClusterMSeq->seq[iSeqIndex] = CkStrdup(prMSeq->seq[iRealSeqIndex]);
1375 prSubClusterMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeq->orig_seq[iRealSeqIndex]);
1376 SeqinfoCopy(&prSubClusterMSeq->sqinfo[iSeqIndex], &prMSeq->sqinfo[iRealSeqIndex]);
1378 Log(&rLog, LOG_DEBUG, "seq no %d in cluster %d is %s (real index = %d)",
1379 iSeqIndex, iClusterIndex, prSubClusterMSeq->sqinfo[iSeqIndex].name,
1386 /* Create a distance matrix for this sub-cluster
1387 * (prWithinClusterDistances) by using the vector distances or
1390 * Then apply UPGMA to get a subcluster tree
1391 * (prSubClusterTree) and append created tree to the
1392 * pre-cluster-tree (prMbedTree_p)
1395 #if FULL_WITHIN_CLUSTER_DISTANCES
1397 iOldLogLevel = rLog.iLogLevelEnabled;
1398 rLog.iLogLevelEnabled = LOG_WARN;
1399 /* compute distances, but be quiet */
1400 if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType,
1401 0, prSubClusterMSeq->nseqs, 0, prSubClusterMSeq->nseqs,
1403 Log(&rLog, LOG_ERROR, "Couldn't compute pair distances");
1406 rLog.iLogLevelEnabled = iOldLogLevel;
1408 #if COMPUTE_WITHIN_SUBCLUSTER_AVERAGE
1411 for (iI=0; iI<prSubClusterMSeq->nseqs; iI++) {
1412 for (iJ=iI+1; iJ<prSubClusterMSeq->nseqs; iJ++) {
1413 dSum += SymMatrixGetValue(prWithinClusterDistances, iI, iJ);
1416 Log(&rLog, LOG_FORCED_DEBUG,
1417 "mean pair-wise distance within subcluster %d of %d = %f",
1418 iClusterIndex, prKMeansResult->iNClusters, dSum/prSubClusterMSeq->nseqs);
1423 if (NewSymMatrix(&prWithinClusterDistances, iNSeqInCluster, iNSeqInCluster)!=0) {
1424 Log(&rLog, LOG_FATAL, "%s", "Memory allocation for disparity matrix failed");
1426 for (iI=0; iI<iNSeqInCluster; iI++) {
1427 int iRealIndexI = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
1429 for (iJ=iI+1; iJ<iNSeqInCluster; iJ++) {
1430 int iRealIndexJ = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iJ];
1433 /* Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d: compute distance between %d:%s and %d:%s",
1434 iClusterIndex, i, prMSeq->sqinfo[iRealIndexI].name,
1435 iJ, prMSeq->sqinfo[iRealIndexJ].name); */
1437 if (1 == USE_EUCLIDEAN_DISTANCE) {
1438 dist = EuclDist(ppdSeqVec[iRealIndexI],
1439 ppdSeqVec[iRealIndexJ], iNumSeeds);
1441 dist = CosDist(ppdSeqVec[iRealIndexI],
1442 ppdSeqVec[iRealIndexJ], iNumSeeds);
1444 SymMatrixSetValue(prWithinClusterDistances, iI, iJ, dist);
1449 /* labels needed for the guide tree building routine only */
1450 ppcLabels = (char**) CKMALLOC(iNSeqInCluster * sizeof(char*));
1451 for (iI=0; iI<iNSeqInCluster; iI++) {
1452 #if FULL_WITHIN_CLUSTER_DISTANCES
1453 ppcLabels[iI] = prSubClusterMSeq->sqinfo[iI].name;
1455 int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI];
1456 ppcLabels[iI] = prMSeq->sqinfo[iRealIndex].name;
1460 Log(&rLog, LOG_FORCED_DEBUG, "Distance matrix for seqs within sub cluster %d/%d",
1461 iClusterIndex, prKMeansResult->iNClusters);
1462 SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL);
1466 GuideTreeUpgma(&prSubClusterTree, ppcLabels,
1467 prWithinClusterDistances, NULL);
1469 CKFREE(ppcLabels); /* don't free members, they just point */
1471 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d guide-tree:", iClusterIndex);
1472 LogTree(prSubClusterTree);
1476 /* The guide tree id's (that point to the sequences) now start
1477 * from 0, i.e. the association with the prMSeq numbering is
1478 * broken and fixed in the following
1480 for (iNodeIndex = 0; iNodeIndex < (int)GetNodeCount(prSubClusterTree); iNodeIndex++) {
1481 if (IsLeaf(iNodeIndex, prSubClusterTree)) {
1482 int iLeafId = GetLeafId(iNodeIndex, prSubClusterTree);
1483 int iRealId = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iLeafId];
1485 Log(&rLog, LOG_FORCED_DEBUG, "Correcting leaf node %d which has (wrong) id %d and name %s to id %d (prMSeq name %s)",
1486 iNodeIndex, iLeafId,
1487 GetLeafName(iNodeIndex, prSubClusterTree),
1488 iRealId, prMSeq->sqinfo[iRealId].name);
1490 SetLeafId(prSubClusterTree, iNodeIndex, iRealId);
1495 /* Append the newly created tree (prSubClusterTree) to the
1496 * corresponding node index of prMbedTree_p.
1499 Log(&rLog, LOG_FORCED_DEBUG, "Will join trees at leaf node %d = %s",
1500 piClusterToTreeNode[iClusterIndex],
1501 GetLeafName(piClusterToTreeNode[iClusterIndex], *prMbedTree_p));
1504 AppendTree(*prMbedTree_p,
1505 piClusterToTreeNode[iClusterIndex],
1507 /* Note: piClusterToTreeNode is still valid, because
1508 * AppendTrees() guarantees that no other than the node to
1509 * append to changes. */
1512 Log(&rLog, LOG_FORCED_DEBUG, "%s", "prMbedTree_p after cluster %d has appended:", iClusterIndex);
1513 LogTree(*prMbedTree_p);
1516 char fname[] = "mbed-joined-tree.dnd";
1518 if (NULL == (pfOut = fopen(fname, "w"))) {
1519 Log(&rLog, LOG_FATAL, "Couldn't open %s for writing", fname);
1521 MuscleTreeToFile(pfOut, *prMbedTree_p);
1522 Log(&rLog, LOG_FORCED_DEBUG, "Joined tree written to %s", fname);
1524 Log(&rLog, LOG_FATAL, "DEBUG EXIT");
1530 FreeMuscleTree(prSubClusterTree);
1531 FreeSymMatrix(&prWithinClusterDistances);
1532 #if FULL_WITHIN_CLUSTER_DISTANCES
1533 FreeMSeq(&prSubClusterMSeq);
1535 } /* end for each cluster */
1536 ProgressDone(prSubClusterDistanceProgress);
1537 FreeProgress(&prSubClusterDistanceProgress);
1540 if (NULL != pcGuidetreeOut) {
1541 if (NULL == (pfOut = fopen(pcGuidetreeOut, "w"))) {
1542 Log(&rLog, LOG_ERROR, "Couldn't open %s for writing", pcGuidetreeOut);
1544 MuscleTreeToFile(pfOut, *prMbedTree_p);
1545 Log(&rLog, LOG_INFO, "Guide tree written to %s", pcGuidetreeOut);
1546 (void) fclose(pfOut);
1555 StopwatchStop(stopwatch);
1556 StopwatchDisplay(stdout, "mBed time (without pairwise distance computation): ", stopwatch);
1557 StopwatchFree(stopwatch);
1560 FreeKMeansResult(&prKMeansResult);
1561 FreeSymMatrix(&prPreClusterDistmat);
1562 for (iI=0; iI<prMSeq->nseqs; iI++) {
1563 CKFREE(ppdSeqVec[iI]);
1566 CKFREE(piClusterToTreeNode);
1569 TreeValidate(*prMbedTree_p);
1574 /*** end: Mbed() ***/