/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ /********************************************************************* * Clustal Omega - Multiple sequence alignment * * Copyright (C) 2010 University College Dublin * * Clustal-Omega is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This file is part of Clustal-Omega. * ********************************************************************/ /* * RCS $Id: mbed.c 254 2011-06-21 13:07:50Z andreas $ * * * Reimplementation from scratch of mBed: * Blackshields et al. (2010); PMID 20470396 * * */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include "mbed.h" #include "pair_dist.h" #include "symmatrix.h" #include "ktuple_pair.h" #include "tree.h" #include "util.h" #include "progress.h" #include "queue.h" #include "log.h" #include "kmpp/KMeans.h" #include "mbed.h" #define TIMING 0 #if TIMING #include "squid/stopwatch.h" #endif /* If FULL_WITHIN_CLUSTER_DISTANCES is not 0, distances within each * bisecting kmeans subcluster are not estimated using the vectors, * but calculated normally (using ktuple or kimura). Surprisingly this * results in 3% loss on a Homfam p24-h2010-08-09 subset (100-5000 * seqs in test, at least 5 ref seqs; MAX_SEQ 100 vs 10000; NUM_SEEDS * log2 instead of log2^2). And of course it slows things down. */ #define FULL_WITHIN_CLUSTER_DISTANCES 1 #define COMPUTE_WITHIN_SUBCLUSTER_AVERAGE 0 /* Cluster size limits. Maximum is a soft limit, which might be * exceeded if a K-Means split was unsuccesful, where unsuccesful * might also mean that the minimum required number seqs. was not * reached */ #if FULL_WITHIN_CLUSTER_DISTANCES static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 100; static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 1; #else static const int MAX_ALLOWED_SEQ_PER_PRECLUSTER = 10000; static const int MIN_REQUIRED_SEQ_PER_PRECLUSTER = 100; #endif /* How many restarts per bisecting kmeans split. 10 give 0.5% increase * in quality on original HOMFAM over just 1. It also increases kmeans * time by a factor of ~3, but overall time is insignificant * compared to pairdist/progressive alignment part. */ static const int RESTARTS_PER_SPLIT = 10; /* Use standard kmeans (lloyds) or kmeans++. Both are almost * indistinguishable here, although kmeans++ is slightly ahead the * fewer seeds you pick (and it should be better in theory) */ #define USE_KMEANS_LLOYDS 0 #ifndef HAVE_LOG2 #define log2(x) (log(x) / 0.69314718055994530942) #endif #define NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2) /* Seed selection method: SAMPLE_SEEDS_BY_LENGTH is the original mBed * approach: Sample iSeeds sequence with constant stride from length-sorted X. * It might be better to pick the seeds randomly, because length sorting might * be more prone to including outliers (e.g. very long and very short seqs). * However, we could never observer such a thing. So just stick to the * original version as this also removes the random element. */ enum SEED_SELECTION_TYPE { SELECT_SEEDS_RANDOMLY, SELECT_SEEDS_BY_LENGTH }; #define SEED_SELECTION SELECT_SEEDS_BY_LENGTH /* Tests on BAliBase (RV11,12,20,30,40,50; 10 runs each) show there is * no difference between mbed-trees created from cosine or euclidean * distances (simple version, just using disparities). */ #define USE_EUCLIDEAN_DISTANCE 1 /* print some mbed pre-cluster usage to screen */ #define PRINT_CLUSTER_DISTRIBUTION 0 #define TRACE 0 typedef struct { /* Number of final clusters */ int iNClusters; /* Coordinates (columns) for each cluster (rows) * valid indices: [0...iNClusters][0...dim-1] */ double **ppdClusterCenters; /* Dimensionality of input data and cluster center coordinates */ int iDim; /* Number of objects per cluster */ int *piNObjsPerCluster; /* Objects (indices) for each cluster. [i][j] will point to (index * of) object j in cluster i */ int **ppiObjIndicesPerCluster; } bisecting_kmeans_result_t; /** * @brief Free KMeans result structure. * * @param[out] prResult_p * K-Means result to free * * @see NewKMeansResult() */ void FreeKMeansResult(bisecting_kmeans_result_t **prResult_p) { int iAux; CKFREE((*prResult_p)->piNObjsPerCluster); for (iAux=0; iAux<(*prResult_p)->iNClusters; iAux++) { CKFREE((*prResult_p)->ppiObjIndicesPerCluster[iAux]); CKFREE((*prResult_p)->ppdClusterCenters[iAux]); } CKFREE((*prResult_p)->ppiObjIndicesPerCluster); CKFREE((*prResult_p)->ppdClusterCenters); (*prResult_p)->iNClusters = 0; (*prResult_p)->iDim = 0; CKFREE(*prResult_p); } /*** end: FreeKMeansResult() ***/ /** * @brief Allocate new KMeans result * * @param[out] prKMeansResult_p * K-Means result to free * * @see NewKMeansResult() */ void NewKMeansResult(bisecting_kmeans_result_t **prKMeansResult_p) { (*prKMeansResult_p) = (bisecting_kmeans_result_t *) CKCALLOC(1, sizeof(bisecting_kmeans_result_t)); (*prKMeansResult_p)->iNClusters = 0; (*prKMeansResult_p)->iDim = 0; (*prKMeansResult_p)->ppdClusterCenters = NULL; (*prKMeansResult_p)->piNObjsPerCluster = NULL; (*prKMeansResult_p)->ppiObjIndicesPerCluster = NULL; } /*** end: NewKMeansResult() ***/ /** * @brief Calculate the euclidean distance between two vectors * * @param[in] v1 * First vector with dim dimensions * @param[in] v2 * Second vector with dim dimensions * @param[in] dim * Dimensionality of v1 and v2 * * @return euclidean distance as double * * @note Could probably be inlined. Also perfect for SSE */ double EuclDist(const double *v1, const double *v2, const int dim) { int i; /* aux */ double dist; /* returned distance */ assert(v1!=NULL); assert(v2!=NULL); dist = 0.0; for (i=0; i0 && iNumSeeds<=prMSeq->nseqs); piSortedSeeds = CKMALLOC(iNumSeeds * sizeof(int)); memcpy(piSortedSeeds, piSeeds, iNumSeeds*sizeof(int)); /* need them sorted, otherwise the swapping approach below might * break */ qsort(piSortedSeeds, iNumSeeds, sizeof(int), IntCmp); /* rearrange mseq order so that we can use ktuple_pairdist code as * is. That is, swap seeds and non-seeds such that the seeds end * up in front of mseq. This way we can use the KTuplePairDist() * code, without making a copy of mseq. Also, keep an array of * indices which serves to restore the original sequence order * after putting the seeds in front * */ restore = (int *) CKMALLOC(prMSeq->nseqs * sizeof(int)); for (iSeqIndex=0; iSeqIndexnseqs; iSeqIndex++) { restore[iSeqIndex] = iSeqIndex; } for (iSeedIndex=0; iSeedIndexnseqs; iSeqIndex++) { Log(&rLog, LOG_FORCED_DEBUG, "swapped seq no %d now: seq = %s", iSeqIndex, prMSeq->sqinfo[iSeqIndex].name); } #endif /* convert sequences into vectors of distances by calling pairwise * distance function for each seq/seed pair * */ if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType, 0, iNumSeeds, 0, prMSeq->nseqs, NULL, NULL)) { Log(&rLog, LOG_ERROR, "Could not compute pairwise distances for mbed."); FreeSymMatrix(&prDistmat); CKFREE(piSortedSeeds); CKFREE(restore); return -1; } #if TRACE { char **labels; labels = (char **) CKMALLOC(prMSeq->nseqs * sizeof(char *)); for (iSeqIndex=0; iSeqIndexnseqs; iSeqIndex++) { labels[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].name; } SymMatrixPrint(prDistmat, labels, NULL); CKFREE(labels); } #endif /* update ppdSeqVec according to prDistmat. keep in mind that we * changed mseq order * */ for (iSeqIndex=0; iSeqIndexnseqs; iSeqIndex++) { for (iSeedIndex=0; iSeedIndexnseqs; iSeqIndex++) { Log(&rLog, LOG_FORCED_DEBUG, "restored seq no %d: seq = %s %s", iSeqIndex, prMSeq->sqinfo[iSeqIndex].name, prMSeq->seq[iSeqIndex]); } #endif #if TRACE for (iSeqIndex=0; iSeqIndexnseqs; iSeqIndex++) { printf("DEBUG: seq %-4u as distance vector =", iSeqIndex); for (iSeedIndex=0; iSeedIndexnseqs); PermutationArray(&piPermArray, iNumSeeds); /* copy to piSeeds */ for (iSeedIndex=0; iSeedIndexnseqs * sizeof(int)); int *piOrder = CKMALLOC(prMSeq->nseqs * sizeof(int)); Log(&rLog, LOG_INFO, "Using %d seeds (chosen with constant stride from length sorted seqs) for mBed (from a total of %d sequences)", iNumSeeds, prMSeq->nseqs); iStep = prMSeq->nseqs / iNumSeeds; /* iStep will never get too big * due to rounding */ /* first get an array of seq indices order according to corresponding sequence length: piOrder */ for (iSeqIndex=0; iSeqIndexnseqs; iSeqIndex++) { piSeqLen[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].len; } QSortAndTrackIndex(piOrder, piSeqLen, prMSeq->nseqs, 'd', FALSE); #if 0 for (iSeqIndex=0; iSeqIndexnseqs; iSeqIndex++) { Log(&rLog, LOG_FORCED_DEBUG, "Descending order (no %d): seq %d has len %d)", iSeqIndex, piOrder[iSeqIndex], piSeqLen[piOrder[iSeqIndex]]); } #endif CKFREE(piSeqLen); iSeedIndex = 0; for (iSeqIndex=0; iSeedIndexnseqs * sizeof(int)); double *pdKMeansClusterCenters = CKCALLOC(iNumSeeds * iNumSeeds, sizeof(double)); /* create a copy of ppdSeqVec suitable for KMeans */ double *pdKMeansVectors = CKMALLOC(prMSeq->nseqs * iNumSeeds * sizeof(double)); bool *pbClusterUsed = CKCALLOC(iNumSeeds, sizeof(bool)); Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Experimental feature: K-means on first round embedding to get better seeds"); Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Reuse seeds from clusters"); Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Could try log(n) instead of log(n)^2 in first round"); Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME hardcoded iPairDistType PAIRDIST_KTUPLE"); Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Show that this is fast *and* good"); /* Log(&rLog, LOG_FORCED_DEBUG, "%s", "Overriding iNumSeeds"); iNumSeeds = (int) pow(log2((double)prMSeq->nseqs), 2); */ ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *)); for (iSeqIndex=0; iSeqIndexnseqs; iSeqIndex++) { ppdSeqVec[iSeqIndex] = (double *) CKMALLOC(iNumSeeds * sizeof(double)); } if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, PAIRDIST_KTUPLE)) { Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed"); return -1; } for (iSeqIndex=0; iSeqIndexnseqs; iSeqIndex++) { int iDim; for (iDim=0; iDimnseqs, iNumSeeds, iNumSeeds, pdKMeansVectors, RESTARTS_PER_SPLIT, USE_KMEANS_LLOYDS, pdKMeansClusterCenters, piKMeansClusterAssignments); Log(&rLog, LOG_FORCED_DEBUG, "Best split cost = %f", dCost); Log(&rLog, LOG_FORCED_DEBUG, "%s", "FIXME Check for Nan in cluster centers"); #if TRACE Log(&rLog, LOG_FORCED_DEBUG, "%s", "K-Means output:"); for (iSeqIndex=0; iSeqIndexnseqs; iSeqIndex++) { Log(&rLog, LOG_FORCED_DEBUG, " Raw assignment: Seq %u (%s) to cluster %d", iSeqIndex, prMSeq->sqinfo[iSeqIndex].name, piKMeansClusterAssignments[iSeqIndex]); } #endif Log(&rLog, LOG_FORCED_DEBUG, "FIXME %s", "proof of concept implementation: Pick first sequences from each clusters instead of reusing seeds"); iSeedIndex = 0; for (iSeqIndex=0; iSeqIndexnseqs; iSeqIndex++) { int iAssignedCluster = piKMeansClusterAssignments[iSeqIndex]; if (pbClusterUsed[iAssignedCluster]) { continue; } else { /*LOG_DEBUG("Picked seed %d from cluster %d", iSeqIndex,iAssignedCluster);*/ piSeeds[iSeedIndex++] = iSeqIndex; pbClusterUsed[iAssignedCluster] = TRUE; } } CKFREE(pbClusterUsed); CKFREE(pdKMeansVectors); CKFREE(pdKMeansClusterCenters); CKFREE(piKMeansClusterAssignments); } /* if (1) */ #endif if (LOG_DEBUG <= rLog.iLogLevelEnabled) { for (iSeedIndex=0; iSeedIndexsqinfo[piSeeds[iSeedIndex]].name, iSeedIndex); } } return 0; } /* end of SeedSelection() */ /** * @brief Bisecting K-Means clustering. Repeatedly calls K-Means with * a K of 2 until no cluster has more than iMaxAllowedObjsPerCluster. * * @param[out] prKMeansResult_p * Result of Bisecting KMeans. Will be allocated here. * Caller has to free. See @see FreeKMeansResult() * @param[in] iNObjs * Number of objects/sequences to cluster * @param[in] iDim * Dimensionality of input data * @param[in] ppdVectors * each row holds iDim points for this object's coordinates * @param[in] iMinRequiredObjsPerCluster * Minimum number of objects per Cluster (inclusive)/ * @param[in] iMaxAllowedObjsPerCluster * Maximum number of objects per Cluster (inclusive). Function returns once no * cluster contains more then this number of objects. Soft limit! * * @note Convoluted code. Could use some restructuring. My apologies. * AW * */ void BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p, const int iNObjs, const int iDim, double **ppdVectors, const int iMinRequiredObjsPerCluster, const int iMaxAllowedObjsPerCluster) { int iN, iD; /* cluster centers for each cluster created at each split */ double *pdKClusterCenters; /* keep track of updated object indices per newly created * cluster */ int *piCurObjToUpdate; /* number of times we called 2-means */ int iNRounds; /* flag for unsuccessful cluster split */ bool bNaNDetected = FALSE; /* flag for detected small cluster after split */ bool bSmallClusterDetected; /* queue of clusters which are to be split */ int_queue_t rClusterSplitQueue; #if TIMING Stopwatch_t *stopwatch = StopwatchCreate(); #endif piCurObjToUpdate = (int *) CKMALLOC(2 * sizeof(int)); NewKMeansResult(prKMeansResult_p); /* new cluster centers created at each split/KMeans run */ pdKClusterCenters = (double *) CKCALLOC(2 * iDim, sizeof(double)); /* init results by setting a first cluster that contains all objects */ (*prKMeansResult_p)->iNClusters = 1; (*prKMeansResult_p)->iDim = iDim; /* fake center coordinates of first cluster */ (*prKMeansResult_p)->ppdClusterCenters = (double **) CKMALLOC(1 * sizeof(double *)); (*prKMeansResult_p)->ppdClusterCenters[0] = (double *) CKMALLOC(iDim * sizeof(double)); /* objects per cluster */ (*prKMeansResult_p)->piNObjsPerCluster = (int *) CKMALLOC(1 * sizeof(int)); (*prKMeansResult_p)->piNObjsPerCluster[0] = iNObjs; /* object indices per cluster */ (*prKMeansResult_p)->ppiObjIndicesPerCluster = (int **) CKMALLOC(1 * sizeof(int *)); (*prKMeansResult_p)->ppiObjIndicesPerCluster[0] = (int *) CKMALLOC(iNObjs * sizeof(int)); for (iN=0; iNppiObjIndicesPerCluster[0][iN] = iN; } /* Starting with the first cluster that now contains all the * sequences keep splitting until no cluster contains more than * iMaxAllowedObjsPerCluster * * Keep a queue of clusters (rClusterSplitQueue) to split * * At each split values/memory of the just split cluster will be * reused and exactly one new only allocated. * * Example: * Given the following cluster assignments * 0 0 0 0 1 1 2 2 2 3 3 3 3 3 3 3 4 4 * and a K-Means split in cluster 3 at |: * 0 0 0 0 1 1 2 2 2 3 3 3 | 3 3 3 3 4 4 * The result should be this: * 0 0 0 0 1 1 2 2 2 3 3 3 | 5 5 5 5 4 4 * * */ INT_QUEUE_INIT(&rClusterSplitQueue); if (iNObjs>iMaxAllowedObjsPerCluster) { /* pish fake first cluster index */ INT_QUEUE_PUSH(&rClusterSplitQueue, 0); } iNRounds = 0; while (! INT_QUEUE_EMPTY(&rClusterSplitQueue)) { /* assignments for each seq to the K newly created clusters */ int *piKClusterAssignments; /* number of objects in cluster that is to be split */ int iNObjsInClusterToSplit; /* coordinates of points in cluster that is to be split * array of size n*d where [d*i + j] gives coordinate j of point i */ double *pdVectorsInClusterToSplit; /* copy of object indices in split cluster */ int *piObjIndicesOfSplitCluster; /* best cost of kmeans rounds */ double dCost = -1.0; /* indices for the two created clusters */ /* index of cluster to split */ int iClusterToSplot; /* index of newly created cluster at each round. data for the other created cluster goes to the one just split, i.e. (*piClusterToSplot) */ int iNewClusterIdx; #if TIMING StopwatchZero(stopwatch); StopwatchStart(stopwatch); #endif INT_QUEUE_POP(&rClusterSplitQueue, &iClusterToSplot); iNObjsInClusterToSplit = (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot]; piKClusterAssignments = (int *) CKMALLOC(iNObjsInClusterToSplit * sizeof(int)); pdVectorsInClusterToSplit = (double *) CKMALLOC(iNObjsInClusterToSplit * iDim * sizeof(double)); for (iN=0; iNppiObjIndicesPerCluster[iClusterToSplot][iN]; pdVectorsInClusterToSplit[iDim*iN + iD] = ppdVectors[iThisObjIdx][iD]; } } #if TRACE Log(&rLog, LOG_FOCRED_DEBUG, "Round %d: Will split cluster %d which has %d objects", iNRounds, iClusterToSplot, iNObjsInClusterToSplit); fprintf(stderr, "DEBUG(%s|%s():%d): Object indices in cluster to split are:", __FILE__, __FUNCTION__, __LINE__); for (iN=0; iNppiObjIndicesPerCluster[iClusterToSplot][iN]); } fprintf(stderr, "\n"); (void) fflush(stderr); #endif #if TRACE for (iN=0; iNppiObjIndicesPerCluster[iClusterToSplot][iN]); for (iD=0; iDiNClusters; /* We don't want Clusters which are too small. Check here if a * split created a small cluster and if yes, discard the * solution. Because the cluster has already been removed from * the queue, we can just continue. */ bSmallClusterDetected = FALSE; if (iMinRequiredObjsPerCluster>1) { int iNObjsCluster[2]; iNObjsCluster[0] = 0; /* first cluster */ iNObjsCluster[1] = 0; /* second cluster */ for (iN=0; iNppdClusterCenters[iClusterToSplot][iN] = dCoord; } /* realloc and set new one */ (*prKMeansResult_p)->ppdClusterCenters = (double **) CKREALLOC((*prKMeansResult_p)->ppdClusterCenters, ((*prKMeansResult_p)->iNClusters+1) * sizeof(double *)); (*prKMeansResult_p)->ppdClusterCenters[iNewClusterIdx] = (double *) CKMALLOC(iDim * sizeof(double)); for (iD=0; iDppdClusterCenters[iNewClusterIdx][iD] = dCoord; } #if TRACE Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of cluster centers done. Cluster centers so far:"); for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) { fprintf(stderr, "DEBUG(%s|%s():%d): Center %u =", __FILE__, __FUNCTION__, __LINE__, iN); for (iD=0; iDppdClusterCenters[iN][iD]); } fprintf(stderr, "\n"); (void) fflush(stderr); } #endif /* update #seqs per cluster: piNObjsPerCluster * */ (*prKMeansResult_p)->piNObjsPerCluster = (int *) CKREALLOC((*prKMeansResult_p)->piNObjsPerCluster, ((*prKMeansResult_p)->iNClusters+1) * sizeof(int)); /* init new and old one to zero */ (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] = 0; (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] = 0; /* now update values */ for (iN=0; iNpiNObjsPerCluster[iClusterToSplot] += 1; } else if (1 == piKClusterAssignments[iN]) { (*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] += 1; } else { /* there used to be code for iK>=2 in r101 */ Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)", piKClusterAssignments[iN]); } } #if TRACE Log(&rLog, LOG_FORCED_DEBUG, "%s", "* Update of NObjs per cluster done:"); for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) { Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d contains %d sequences", iN, (*prKMeansResult_p)->piNObjsPerCluster[iN]); } #endif /* queue clusters if still they are still too big */ if ((*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] > iMaxAllowedObjsPerCluster) { INT_QUEUE_PUSH(&rClusterSplitQueue, iClusterToSplot); } if ((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] > iMaxAllowedObjsPerCluster) { INT_QUEUE_PUSH(&rClusterSplitQueue, iNewClusterIdx); } /* update which objects belong to which cluster: * * note: piNObjsPerCluster needs to be already updated * */ /* create a copy of the object indices in the split cluster * (original will be overwritten) */ piObjIndicesOfSplitCluster = (int *) CKMALLOC(iNObjsInClusterToSplit * sizeof(int)); memcpy(piObjIndicesOfSplitCluster, (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot], iNObjsInClusterToSplit * sizeof(int)); (*prKMeansResult_p)->ppiObjIndicesPerCluster = (int **) CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster, ((*prKMeansResult_p)->iNClusters+1) * sizeof(int *)); (*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot] = (int *) CKREALLOC((*prKMeansResult_p)->ppiObjIndicesPerCluster[iClusterToSplot], (*prKMeansResult_p)->piNObjsPerCluster[iClusterToSplot] * sizeof(int)); (*prKMeansResult_p)->ppiObjIndicesPerCluster[iNewClusterIdx] = (int *) CKMALLOC((*prKMeansResult_p)->piNObjsPerCluster[iNewClusterIdx] * sizeof(int)); /* now reassign the object indices to the assigned cluster */ piCurObjToUpdate[0] = 0; piCurObjToUpdate[1] = 0; for (iN=0; iN=2 in r101 */ Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)", piKClusterAssignments[iN]); } #if 0 Log(&rLog, LOG_FORCED_DEBUG, "Setting (*prKMeansResult_p)->ppiObjIndicesPerCluster[%d][%d] = %d", iThisClusterIndex, iThisOffset, iThisObjIdx); #endif (*prKMeansResult_p)->ppiObjIndicesPerCluster[iThisClusterIndex][iThisOffset] = iThisObjIdx; piCurObjToUpdate[iThisClusterAssignment]+=1; } CKFREE(piObjIndicesOfSplitCluster); #if TRACE for (iN=0; iN<(*prKMeansResult_p)->iNClusters+1; iN++) { int iObj; fprintf(stderr, "DEBUG(%s|%s():%d): Objects in cluster %u: ", __FILE__, __FUNCTION__, __LINE__, iN); for (iObj=0; iObj<(*prKMeansResult_p)->piNObjsPerCluster[iN]; iObj++) { fprintf(stderr, " %u", (*prKMeansResult_p)->ppiObjIndicesPerCluster[iN][iObj]); } fprintf(stderr, "\n"); (void) fflush(stderr); } #endif #if TIMING StopwatchStop(stopwatch); StopwatchDisplay(stdout, "Total time after next round in Bisecting-KMeans: ", stopwatch); #endif /* finally: increase number of clusters */ (*prKMeansResult_p)->iNClusters += 1; iNRounds += 1; CKFREE(piKClusterAssignments); CKFREE(pdVectorsInClusterToSplit); } /* while */ INT_QUEUE_DESTROY(&rClusterSplitQueue); Log(&rLog, LOG_DEBUG, "Bisecting K-means finished after %d rounds (no more clusters to split)", iNRounds); #if TIMING StopwatchFree(stopwatch); #endif /* @note could use progress/timer */ CKFREE(pdKClusterCenters); CKFREE(piCurObjToUpdate); return; } /*** end: BisectingKmeans() ***/ /** * * @brief From scratch reimplementation of mBed: Blackshields et al. * (2010); PMID 20470396. * * Idea is a follows: * - convert sequences into vectors of distances * - cluster the vectors using k-means * - cluster each of the k clusters using upgma (used cached distances * from above?) * - join the sub-clusters to create on tree (use UPGMA on k-means * medoids) * * * @param[out] prMbedTree_p * Created upgma tree. will be allocated here. use FreeMuscleTree() * to free * @param[in] prMSeq * Multiple sequences * @param[in] iPairDistType * Distance measure for pairwise alignments * @param[in] pcGuidetreeOut * Passed down to GuideTreeUpgma() * * @return Zero on success, non-zero on error * */ int Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType, const char *pcGuidetreeOut) { /* number of seeds */ int iNumSeeds; /* seed indices matching prMSeq */ int *piSeeds; /* seqs represented as distance vectors */ double **ppdSeqVec; /* kmeans result */ bisecting_kmeans_result_t *prKMeansResult = NULL; /* distance matrix of kmeans (pre-)cluster centers */ symmatrix_t *prPreClusterDistmat = NULL; /* auxiliary for symmetric matrix output; tree routines etc */ char **ppcLabels; int iNodeIndex; /* mapping of cluster-center tree node indices to corresponding cluster */ int *piClusterToTreeNode; int iClusterIndex; int iI, iJ; FILE *pfOut; progress_t *prSubClusterDistanceProgress; bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE; #if FULL_WITHIN_CLUSTER_DISTANCES Log(&rLog, LOG_DEBUG, "Computing real distances within subclusters for mBed."); #else Log(&rLog, LOG_DEBUG, "Computing vector distances within subclusters for mBed."); #endif #if MBED_TIMING Stopwatch_t *stopwatch = StopwatchCreate(); #endif assert(NULL != prMbedTree_p); assert(NULL != prMSeq); iNumSeeds = (int) NUMBER_OF_SEEDS(prMSeq->nseqs); if (iNumSeeds >= prMSeq->nseqs) { /* -1 is condition for RandomUniqueIntArray */ iNumSeeds = prMSeq->nseqs-1; Log(&rLog, LOG_DEBUG, "Automatically determined number of seeds is bigger (or equal) the number of sequences. Will set it to %d", iNumSeeds); } /* Turn sequences into vectors of distances to the seeds * */ piSeeds = (int *) CKMALLOC(iNumSeeds * sizeof(int)); if (0 != SeedSelection(piSeeds, iNumSeeds, SEED_SELECTION, prMSeq)) { Log(&rLog, LOG_ERROR, "Something went wrong during seed selection for mbed"); return -1; } ppdSeqVec = (double **) CKMALLOC(prMSeq->nseqs * sizeof(double *)); for (iI=0; iInseqs; iI++) { ppdSeqVec[iI] = (double *) CKMALLOC(iNumSeeds * sizeof(double)); } if (0 != SeqToVec(ppdSeqVec, prMSeq, piSeeds, iNumSeeds, iPairDistType)) { Log(&rLog, LOG_ERROR, "Could not convert sequences into vectors for mbed"); return -1; } CKFREE(piSeeds); /* Calculate (pre-)clusters of sequence vectors by applying * bisecting kmeans * */ #if MBED_TIMING /* start clock only here, to make sure we don't include pairwise * distance computation */ StopwatchZero(stopwatch); StopwatchStart(stopwatch); #endif BisectingKmeans(&prKMeansResult, prMSeq->nseqs, iNumSeeds, ppdSeqVec, MIN_REQUIRED_SEQ_PER_PRECLUSTER, MAX_ALLOWED_SEQ_PER_PRECLUSTER); Log(&rLog, LOG_INFO, "mBed created %u cluster/s (with a minimum of %d and a soft maximum of %d sequences each)", prKMeansResult->iNClusters, MIN_REQUIRED_SEQ_PER_PRECLUSTER, MAX_ALLOWED_SEQ_PER_PRECLUSTER); #if PRINT_CLUSTER_DISTRIBUTION Log(&rLog, LOG_FORCED_DEBUG, "Bisecting Kmeans returned %d clusters", prKMeansResult->iNClusters); for (iI=0; iIiNClusters; iI++) { #if TRACE int iD; Log(&rLog, LOG_FORCED_DEBUG, "Diagnostic output for cluster %d follows:", iI); fprintf(stderr, "DEBUG(%s|%s():%d): center coordinates =", __FILE__, __FUNCTION__, __LINE__); for (iD=0; iDppdClusterCenters[iI][iD]); } fprintf(stderr, "\n"); fflush(stderr); #endif Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d has %d objects assigned", iI, prKMeansResult->piNObjsPerCluster[iI]); #if TRACE for (iJ=0; iJpiNObjsPerCluster[iI]; iJ++) { int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][iJ]; Log(&rLog, LOG_FORCED_DEBUG, "Cluster %u: object %u has index %u (= seq %s)", iI, iJ, iRealIndex, prMSeq->sqinfo[iRealIndex].name); } #endif } #endif /* Cluster pre-clusters produced by k-means. * * Do this by calculating the vector distances of the cluster * centers and applying UPGMA. * * @note could try to force-balance the tree here * */ if (0 != NewSymMatrix(&prPreClusterDistmat, prKMeansResult->iNClusters, prKMeansResult->iNClusters)) { Log(&rLog, LOG_FATAL, "%s", "Memory allocation for pre-cluster distance-matrix failed"); } for (iI=0; iIiNClusters; iI++) { for (iJ=iI+1; iJiNClusters; iJ++) { double dDist; dDist = EuclDist(prKMeansResult->ppdClusterCenters[iI], prKMeansResult->ppdClusterCenters[iJ], iNumSeeds); SymMatrixSetValue(prPreClusterDistmat, iI, iJ, dDist); /* Log(&rLog, LOG_FORCED_DEBUG, "Euclidean distance between clusters %d and %d = %f", iI, iJ, dDist); */ } } /* labels needed for the guide tree building routine only */ ppcLabels = (char **) CKMALLOC(prKMeansResult->iNClusters * sizeof(char*)); for (iI=0; iIiNClusters; iI++) { ppcLabels[iI] = (char *) CKMALLOC(32 * sizeof(char)); (void) snprintf(ppcLabels[iI], 32, "Subcluster-%u", iI); } #if TRACE Log(&rLog, LOG_FORCED_DEBUG, "%s", "Distance matrix for pre-cluster centers:"); SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL); #endif GuideTreeUpgma(prMbedTree_p, ppcLabels, prPreClusterDistmat, NULL); for (iI=0; iIiNClusters; iI++) { CKFREE(ppcLabels[iI]); } CKFREE(ppcLabels); #if TRACE Log(&rLog, LOG_FORCED_DEBUG, "%s", "Cluster-center guide-tree:"); LogTree(*prMbedTree_p); #endif /* Now replace each leaf in the pre-cluster-center tree * appropriately, i.e. with the corresponding sub-cluster. * * For each leaf/sub-cluster, create a distance matrix for the * corresponding sequences. Use distances between vectors as * approximated distances between sequences. Then create a * guide-tree by applying UPGMA. * */ /* Get a mapping of (pre)cluster number and leaf-node indices in * the cluster-center tree. We can add trees to prMbedTree_p * because AppendTrees() guarantees that no other than the node to * append to changes. */ piClusterToTreeNode = (int*) CKMALLOC(prKMeansResult->iNClusters * sizeof(int)); iNodeIndex = FirstDepthFirstNode(*prMbedTree_p); do { if (IsLeaf(iNodeIndex, *prMbedTree_p)) { int iLeafId = GetLeafId(iNodeIndex, *prMbedTree_p); piClusterToTreeNode[iLeafId] = iNodeIndex; } iNodeIndex = NextDepthFirstNode(iNodeIndex, *prMbedTree_p); } while (NULL_NEIGHBOR != iNodeIndex); /* Now step through all the leafs and replace them with the * corresponding sub-trees */ NewProgress(&prSubClusterDistanceProgress, LogGetFP(&rLog, LOG_INFO), "Distance calculation within sub-clusters", bPrintCR); /* for each cluster */ for (iClusterIndex=0; iClusterIndex < prKMeansResult->iNClusters; iClusterIndex++) { /* distance matrix for the sub-cluster */ symmatrix_t *prWithinClusterDistances = NULL; int iNSeqInCluster; tree_t *prSubClusterTree = NULL; ProgressLog(prSubClusterDistanceProgress, iClusterIndex, prKMeansResult->iNClusters, FALSE); #if FULL_WITHIN_CLUSTER_DISTANCES mseq_t *prSubClusterMSeq; int iPairDistType; int iSeqIndex; int iOldLogLevel; Log(&rLog, LOG_DEBUG, "%s\n", "Calling new Mbed use makes only sense if nseq>MAX_ALLOWED_SEQ_PER_PRECLUSTER"); if (TRUE == prMSeq->aligned) { iPairDistType = PAIRDIST_SQUIDID_KIMURA; } else { iPairDistType = PAIRDIST_KTUPLE; } #endif iNSeqInCluster = prKMeansResult->piNObjsPerCluster[iClusterIndex]; #if TRACE Log(&rLog, LOG_FORCED_DEBUG, "#seqs in subcluster no %d = %d", iClusterIndex, iNSeqInCluster); #endif #if FULL_WITHIN_CLUSTER_DISTANCES /* create an mseq structure for sequences in this cluster * don't need most of the members (e.g. orig_seq) but copy * them anyway for the sake of completeness */ NewMSeq(&prSubClusterMSeq); prSubClusterMSeq->nseqs = iNSeqInCluster; prSubClusterMSeq->seqtype = prMSeq->seqtype; if (NULL!=prMSeq->filename) { prSubClusterMSeq->filename = CkStrdup(prMSeq->filename); } prSubClusterMSeq->aligned = prMSeq->aligned; /* FS, r252 */ prSubClusterMSeq->seq = (char **) CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *)); prSubClusterMSeq->orig_seq = (char **) CKMALLOC(prSubClusterMSeq->nseqs * sizeof(char *)); prSubClusterMSeq->sqinfo = (SQINFO *) CKMALLOC(prSubClusterMSeq->nseqs * sizeof(SQINFO)); for (iSeqIndex=0; iSeqIndexppiObjIndicesPerCluster[iClusterIndex][iSeqIndex]; prSubClusterMSeq->seq[iSeqIndex] = CkStrdup(prMSeq->seq[iRealSeqIndex]); prSubClusterMSeq->orig_seq[iSeqIndex] = CkStrdup(prMSeq->orig_seq[iRealSeqIndex]); SeqinfoCopy(&prSubClusterMSeq->sqinfo[iSeqIndex], &prMSeq->sqinfo[iRealSeqIndex]); #if TRACE Log(&rLog, LOG_DEBUG, "seq no %d in cluster %d is %s (real index = %d)", iSeqIndex, iClusterIndex, prSubClusterMSeq->sqinfo[iSeqIndex].name, iRealSeqIndex); #endif } #endif /* Create a distance matrix for this sub-cluster * (prWithinClusterDistances) by using the vector distances or * ktuple distances. * * Then apply UPGMA to get a subcluster tree * (prSubClusterTree) and append created tree to the * pre-cluster-tree (prMbedTree_p) * */ #if FULL_WITHIN_CLUSTER_DISTANCES iOldLogLevel = rLog.iLogLevelEnabled; rLog.iLogLevelEnabled = LOG_WARN; /* compute distances, but be quiet */ if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType, 0, prSubClusterMSeq->nseqs, 0, prSubClusterMSeq->nseqs, NULL, NULL)) { Log(&rLog, LOG_ERROR, "Couldn't compute pair distances"); return -1; } rLog.iLogLevelEnabled = iOldLogLevel; #if COMPUTE_WITHIN_SUBCLUSTER_AVERAGE { double dSum = 0.0; for (iI=0; iInseqs; iI++) { for (iJ=iI+1; iJnseqs; iJ++) { dSum += SymMatrixGetValue(prWithinClusterDistances, iI, iJ); } } Log(&rLog, LOG_FORCED_DEBUG, "mean pair-wise distance within subcluster %d of %d = %f", iClusterIndex, prKMeansResult->iNClusters, dSum/prSubClusterMSeq->nseqs); } #endif #else if (NewSymMatrix(&prWithinClusterDistances, iNSeqInCluster, iNSeqInCluster)!=0) { Log(&rLog, LOG_FATAL, "%s", "Memory allocation for disparity matrix failed"); } for (iI=0; iIppiObjIndicesPerCluster[iClusterIndex][iI]; for (iJ=iI+1; iJppiObjIndicesPerCluster[iClusterIndex][iJ]; double dist; /* Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d: compute distance between %d:%s and %d:%s", iClusterIndex, i, prMSeq->sqinfo[iRealIndexI].name, iJ, prMSeq->sqinfo[iRealIndexJ].name); */ if (1 == USE_EUCLIDEAN_DISTANCE) { dist = EuclDist(ppdSeqVec[iRealIndexI], ppdSeqVec[iRealIndexJ], iNumSeeds); } else { dist = CosDist(ppdSeqVec[iRealIndexI], ppdSeqVec[iRealIndexJ], iNumSeeds); } SymMatrixSetValue(prWithinClusterDistances, iI, iJ, dist); } } #endif /* labels needed for the guide tree building routine only */ ppcLabels = (char**) CKMALLOC(iNSeqInCluster * sizeof(char*)); for (iI=0; iIsqinfo[iI].name; #else int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iI]; ppcLabels[iI] = prMSeq->sqinfo[iRealIndex].name; #endif } #if TRACE Log(&rLog, LOG_FORCED_DEBUG, "Distance matrix for seqs within sub cluster %d/%d", iClusterIndex, prKMeansResult->iNClusters); SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL); #endif GuideTreeUpgma(&prSubClusterTree, ppcLabels, prWithinClusterDistances, NULL); CKFREE(ppcLabels); /* don't free members, they just point */ #if 0 Log(&rLog, LOG_FORCED_DEBUG, "Cluster %d guide-tree:", iClusterIndex); LogTree(prSubClusterTree); #endif /* The guide tree id's (that point to the sequences) now start * from 0, i.e. the association with the prMSeq numbering is * broken and fixed in the following */ for (iNodeIndex = 0; iNodeIndex < (int)GetNodeCount(prSubClusterTree); iNodeIndex++) { if (IsLeaf(iNodeIndex, prSubClusterTree)) { int iLeafId = GetLeafId(iNodeIndex, prSubClusterTree); int iRealId = prKMeansResult->ppiObjIndicesPerCluster[iClusterIndex][iLeafId]; #if 0 Log(&rLog, LOG_FORCED_DEBUG, "Correcting leaf node %d which has (wrong) id %d and name %s to id %d (prMSeq name %s)", iNodeIndex, iLeafId, GetLeafName(iNodeIndex, prSubClusterTree), iRealId, prMSeq->sqinfo[iRealId].name); #endif SetLeafId(prSubClusterTree, iNodeIndex, iRealId); } } /* Append the newly created tree (prSubClusterTree) to the * corresponding node index of prMbedTree_p. */ #if TRACE Log(&rLog, LOG_FORCED_DEBUG, "Will join trees at leaf node %d = %s", piClusterToTreeNode[iClusterIndex], GetLeafName(piClusterToTreeNode[iClusterIndex], *prMbedTree_p)); #endif AppendTree(*prMbedTree_p, piClusterToTreeNode[iClusterIndex], prSubClusterTree); /* Note: piClusterToTreeNode is still valid, because * AppendTrees() guarantees that no other than the node to * append to changes. */ #if 0 Log(&rLog, LOG_FORCED_DEBUG, "%s", "prMbedTree_p after cluster %d has appended:", iClusterIndex); LogTree(*prMbedTree_p); if (0) { char fname[] = "mbed-joined-tree.dnd"; FILE *pfOut; if (NULL == (pfOut = fopen(fname, "w"))) { Log(&rLog, LOG_FATAL, "Couldn't open %s for writing", fname); } MuscleTreeToFile(pfOut, *prMbedTree_p); Log(&rLog, LOG_FORCED_DEBUG, "Joined tree written to %s", fname); fclose(pfOut); Log(&rLog, LOG_FATAL, "DEBUG EXIT"); } #endif /* cleanup */ FreeMuscleTree(prSubClusterTree); FreeSymMatrix(&prWithinClusterDistances); #if FULL_WITHIN_CLUSTER_DISTANCES FreeMSeq(&prSubClusterMSeq); #endif } /* end for each cluster */ ProgressDone(prSubClusterDistanceProgress); FreeProgress(&prSubClusterDistanceProgress); if (NULL != pcGuidetreeOut) { if (NULL == (pfOut = fopen(pcGuidetreeOut, "w"))) { Log(&rLog, LOG_ERROR, "Couldn't open %s for writing", pcGuidetreeOut); } else { MuscleTreeToFile(pfOut, *prMbedTree_p); Log(&rLog, LOG_INFO, "Guide tree written to %s", pcGuidetreeOut); (void) fclose(pfOut); } } /* cleanup * */ #if MBED_TIMING StopwatchStop(stopwatch); StopwatchDisplay(stdout, "mBed time (without pairwise distance computation): ", stopwatch); StopwatchFree(stopwatch); #endif FreeKMeansResult(&prKMeansResult); FreeSymMatrix(&prPreClusterDistmat); for (iI=0; iInseqs; iI++) { CKFREE(ppdSeqVec[iI]); } CKFREE(ppdSeqVec); CKFREE(piClusterToTreeNode); #ifndef NDEBUG TreeValidate(*prMbedTree_p); #endif return 0; } /*** end: Mbed() ***/