********************************************************************/
/*
- * RCS $Id: mbed.c 254 2011-06-21 13:07:50Z andreas $
+ * RCS $Id: mbed.c 316 2016-12-16 16:14:39Z fabian $
*
*
- * Reimplementation from scratch of mBed:
- * Blackshields et al. (2010); PMID 20470396
- *
+ * Reimplementation from scratch of mBed (Blackshields et al., 2010;
+ * PMID 20470396) and addition of bisecting K-Means which allows
+ * control over subcluster size
*
*/
#define USE_KMEANS_LLOYDS 0
-#ifndef HAVE_LOG2
-#define log2(x) (log(x) / 0.69314718055994530942)
+//#ifndef HAVE_LOG2
+#ifndef CLUSTAL_OMEGA_HAVE_LOG2
+//#define log2(x) (log(x) / 0.69314718055994530942)
+#define log2(x) ( x<=0 ? (float)(-100000) : 1.442695041*log(x) )
#endif
#define NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2)
int iSeqIndex;
int iSeedIndex;
/* indices for restoring order */
- int *restore;
+ int *restore = NULL;
/* sorted copy of piSeeds */
- int *piSortedSeeds;
+ int *piSortedSeeds = NULL;
#if TIMING
Stopwatch_t *stopwatch = StopwatchCreate();
* distance function for each seq/seed pair
*
*/
- if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType,
+ /* 4th argument (FALSE) is bPercID, in mBed mode never use percent-identities */
+ if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType, FALSE,
0, iNumSeeds, 0, prMSeq->nseqs,
NULL, NULL)) {
Log(&rLog, LOG_ERROR, "Could not compute pairwise distances for mbed.");
for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
labels[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].name;
}
- SymMatrixPrint(prDistmat, labels, NULL);
+ SymMatrixPrint(prDistmat, labels, NULL, FALSE);
CKFREE(labels);
}
#endif
FreeSymMatrix(&prDistmat);
CKFREE(restore);
+ CKFREE(piSortedSeeds);
#if TIMING
StopwatchStop(stopwatch);
StopwatchDisplay(stdout, "Total time for SeqToVec(): ", stopwatch);
BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
const int iNObjs, const int iDim, double **ppdVectors,
const int iMinRequiredObjsPerCluster,
- const int iMaxAllowedObjsPerCluster)
+ const int iMaxAllowedObjsPerCluster, char ***ppcClusterSplits_p)
{
int iN, iD;
/* cluster centers for each cluster created at each split */
bool bSmallClusterDetected;
/* queue of clusters which are to be split */
int_queue_t rClusterSplitQueue;
+ /* */
+ int iLog2N2 = 0;
#if TIMING
Stopwatch_t *stopwatch = StopwatchCreate();
}
+ /*
+ * this is an array that encodes where a sequences fell at a bisecting k-means
+ * ppcClusterSplits_p can consume quite a bit of memory,
+ but it is only needed if clustering info is written to file.
+ Use ppcClusterSplits_p as a flag, initialise in the calling function (Mbed)
+ to NULL, if no info written out then leave ppcClusterSplits_p=NULL.
+ If info is to be written out then malloc one char*, so that
+ ppcClusterSplits_p!=NULL. In that case realloc proper amount.
+ Only expect a certain number of splits, iLog2N2.
+ Worst case would be iNObjs but then ppcClusterSplits_p
+ would be quadratic, which is too big for many sequences.
+ */
+ if (NULL != *ppcClusterSplits_p){
+ /* (square) of logarithm (base 2) of number of objects */
+ double dLog2N = log10(iNObjs) / log10(2);
+ iLog2N2 = (int)(dLog2N*dLog2N);
+
+ *ppcClusterSplits_p = (char **)CKREALLOC(*ppcClusterSplits_p, iNObjs*sizeof(char *));
+ (*ppcClusterSplits_p)[0] = (char *)CKMALLOC(iNObjs*iLog2N2);
+ (*ppcClusterSplits_p)[0][0] = '\0';
+ for (iN = 1; iN < iNObjs; iN++){
+ (*ppcClusterSplits_p)[iN] = (*ppcClusterSplits_p)[iN-1] + iLog2N2;
+ (*ppcClusterSplits_p)[iN][0] = '\0';
+ }
+ }
+
/* Starting with the first cluster that now contains all the
* sequences keep splitting until no cluster contains more than
}
#if TRACE
- Log(&rLog, LOG_FOCRED_DEBUG, "Round %d: Will split cluster %d which has %d objects",
- iNRounds, iClusterToSplot, iNObjsInClusterToSplit);
+ Log(&rLog, LOG_FORCED_DEBUG, "Round %d: Will split cluster %d which has %d objects",
+ iNRounds, iClusterToSplot, iNObjsInClusterToSplit);
fprintf(stderr, "DEBUG(%s|%s():%d): Object indices in cluster to split are:",
__FILE__, __FUNCTION__, __LINE__);
for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
Log(&rLog, LOG_WARN, "%s(): Can't split cluster no. %d which has %d objects any further. %s",
__FUNCTION__,
iClusterToSplot, iNObjsInClusterToSplit,
- "Hope it's not too big and doesn't slow things down."
+ "Hope it's not too big and doesn't slow things down."
);
bNaNDetected = TRUE;
break;
if (0 == iThisClusterAssignment) {
iThisClusterIndex = iClusterToSplot;
+ if (*ppcClusterSplits_p && strlen((*ppcClusterSplits_p)[iThisObjIdx])<iLog2N2){
+ strcat((*ppcClusterSplits_p)[iThisObjIdx], "0");
+ }
} else if (1 == iThisClusterAssignment) {
iThisClusterIndex = iNewClusterIdx;
+ if (*ppcClusterSplits_p && strlen((*ppcClusterSplits_p)[iThisObjIdx])<iLog2N2){
+ strcat((*ppcClusterSplits_p)[iThisObjIdx], "1");
+ }
} else {
/* there used to be code for iK>=2 in r101 */
Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
* @param[in] pcGuidetreeOut
* Passed down to GuideTreeUpgma()
*
+ * @note: if the number of sequences is smaller than
+ * MAX_ALLOWED_SEQ_PER_PRECLUSTER then there's no need to do the subclustering
+ * first. In fact it costs some extra time. However, it's insignificant and
+ * for simplicities sake we don't do any special checks
+
+ *
* @return Zero on success, non-zero on error
*
*/
int
Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
- const char *pcGuidetreeOut)
+ const char *pcGuidetreeOut, int iClustersizes, const char *pcClusterFile)
{
/* number of seeds */
- int iNumSeeds;
+ int iNumSeeds = 0;
/* seed indices matching prMSeq */
- int *piSeeds;
+ int *piSeeds = NULL;
/* seqs represented as distance vectors */
- double **ppdSeqVec;
+ double **ppdSeqVec = NULL;
/* kmeans result */
bisecting_kmeans_result_t *prKMeansResult = NULL;
/* distance matrix of kmeans (pre-)cluster centers */
symmatrix_t *prPreClusterDistmat = NULL;
/* auxiliary for symmetric matrix output; tree routines etc */
- char **ppcLabels;
- int iNodeIndex;
+ char **ppcLabels = NULL;
+ int iNodeIndex = 0;
/* mapping of cluster-center tree node indices to corresponding
cluster */
- int *piClusterToTreeNode;
- int iClusterIndex;
+ int *piClusterToTreeNode = NULL;
+ int iClusterIndex = 0;
int iI, iJ;
- FILE *pfOut;
- progress_t *prSubClusterDistanceProgress;
+ FILE *pfOut = NULL;
+ progress_t *prSubClusterDistanceProgress = NULL;
bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
+ char **ppcClusterSplits = NULL;
#if FULL_WITHIN_CLUSTER_DISTANCES
Log(&rLog, LOG_DEBUG, "Computing real distances within subclusters for mBed.");
StopwatchStart(stopwatch);
#endif
+ /* ppcClusterSplits can consume quite a bit of memory,
+ however, it is only needed if cluster information is
+ to be written to file. If no pcClusterFile is specified,
+ then ppcClusterSplits does not have to be allocated.
+ Use ppcClusterSplits as a flag, initialise to NULL,
+ if NULL is passed into BisectingKmeans then do NOT malloc,
+ if !NULL is passed in then realloc the appropriate memory */
+ if (NULL != pcClusterFile){
+ ppcClusterSplits = (char **)malloc(sizeof(char*));
+ }
+ else {
+ ppcClusterSplits = NULL;
+ }
+
BisectingKmeans(&prKMeansResult, prMSeq->nseqs, iNumSeeds, ppdSeqVec,
MIN_REQUIRED_SEQ_PER_PRECLUSTER,
- MAX_ALLOWED_SEQ_PER_PRECLUSTER);
+ iClustersizes/*MAX_ALLOWED_SEQ_PER_PRECLUSTER*/, &ppcClusterSplits);
Log(&rLog, LOG_INFO,
"mBed created %u cluster/s (with a minimum of %d and a soft maximum of %d sequences each)",
prKMeansResult->iNClusters,
MIN_REQUIRED_SEQ_PER_PRECLUSTER,
- MAX_ALLOWED_SEQ_PER_PRECLUSTER);
+ iClustersizes/*MAX_ALLOWED_SEQ_PER_PRECLUSTER*/);
+ if (NULL != pcClusterFile){ /* print clustering: FS, r275 -> */
+ FILE *pfClust = NULL;
+ if (NULL == (pfClust = fopen(pcClusterFile, "w"))){
+ Log(&rLog, LOG_FATAL, "Could not open file %s for writing", pcClusterFile);
+ }
+ for (iI = 0; iI < prKMeansResult->iNClusters; iI++) {
+ for (iJ=0; iJ<prKMeansResult->piNObjsPerCluster[iI]; iJ++) {
+ int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][iJ];
+ fprintf(pfClust, "Cluster %u: object %u has index %u (=seq %s %d~len)\t %s\n",
+ iI, iJ, iRealIndex, prMSeq->sqinfo[iRealIndex].name, prMSeq->sqinfo[iRealIndex].len, ppcClusterSplits[iRealIndex]);
+ }
+ }
+ fclose(pfClust); pfClust = NULL;
+ /* if there is no pcClusterFile then no memory will have been allocated */
+ CKFREE(ppcClusterSplits[0]);
+ CKFREE(ppcClusterSplits);
+ } /* there was a request to write out clustering */
#if PRINT_CLUSTER_DISTRIBUTION
Log(&rLog, LOG_FORCED_DEBUG, "Bisecting Kmeans returned %d clusters", prKMeansResult->iNClusters);
}
#if TRACE
Log(&rLog, LOG_FORCED_DEBUG, "%s", "Distance matrix for pre-cluster centers:");
- SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL);
+ SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL, FALSE);
#endif
GuideTreeUpgma(prMbedTree_p,
CKFREE(ppcLabels);
#if TRACE
Log(&rLog, LOG_FORCED_DEBUG, "%s", "Cluster-center guide-tree:");
- LogTree(*prMbedTree_p);
+ LogTree(*prMbedTree_p, stderr);
#endif
"%s\n", "Calling new Mbed use makes only sense if nseq>MAX_ALLOWED_SEQ_PER_PRECLUSTER");
if (TRUE == prMSeq->aligned) {
- iPairDistType = PAIRDIST_SQUIDID_KIMURA;
+ if (SEQTYPE_PROTEIN == prMSeq->seqtype){
+ iPairDistType = PAIRDIST_SQUIDID_KIMURA;
+ }
+ else {
+ iPairDistType = PAIRDIST_SQUIDID;
+ }
} else {
iPairDistType = PAIRDIST_KTUPLE;
}
iOldLogLevel = rLog.iLogLevelEnabled;
rLog.iLogLevelEnabled = LOG_WARN;
/* compute distances, but be quiet */
- if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType,
+ /* 4th argument (FALSE) is bPercID, in mBed mode never use percent-identities */
+ if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType, FALSE,
0, prSubClusterMSeq->nseqs, 0, prSubClusterMSeq->nseqs,
NULL, NULL)) {
Log(&rLog, LOG_ERROR, "Couldn't compute pair distances");
#if TRACE
Log(&rLog, LOG_FORCED_DEBUG, "Distance matrix for seqs within sub cluster %d/%d",
iClusterIndex, prKMeansResult->iNClusters);
- SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL);
+ SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL, FALSE);
#endif