X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=binaries%2Fsrc%2Fclustalo%2Fsrc%2Fclustal%2Fmbed.c;h=2282aa6bca7d92aa46d4f3fbd1c76f11da3c8678;hb=1eab6cadc31107fabe7ef060de1ae91eeb4a025b;hp=f143fe0a96c634bd216f4d76445ed4606e61c9f9;hpb=b222b987978b6b2d4b6fb1b4a3a2a0ea87a19c3d;p=jabaws.git diff --git a/binaries/src/clustalo/src/clustal/mbed.c b/binaries/src/clustalo/src/clustal/mbed.c index f143fe0..2282aa6 100644 --- a/binaries/src/clustalo/src/clustal/mbed.c +++ b/binaries/src/clustalo/src/clustal/mbed.c @@ -15,12 +15,12 @@ ********************************************************************/ /* - * RCS $Id: mbed.c 254 2011-06-21 13:07:50Z andreas $ + * RCS $Id: mbed.c 316 2016-12-16 16:14:39Z fabian $ * * - * Reimplementation from scratch of mBed: - * Blackshields et al. (2010); PMID 20470396 - * + * Reimplementation from scratch of mBed (Blackshields et al., 2010; + * PMID 20470396) and addition of bisecting K-Means which allows + * control over subcluster size * */ @@ -88,8 +88,10 @@ static const int RESTARTS_PER_SPLIT = 10; #define USE_KMEANS_LLOYDS 0 -#ifndef HAVE_LOG2 -#define log2(x) (log(x) / 0.69314718055994530942) +//#ifndef HAVE_LOG2 +#ifndef CLUSTAL_OMEGA_HAVE_LOG2 +//#define log2(x) (log(x) / 0.69314718055994530942) +#define log2(x) ( x<=0 ? (float)(-100000) : 1.442695041*log(x) ) #endif #define NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2) @@ -306,9 +308,9 @@ SeqToVec(double **ppdSeqVec, mseq_t *prMSeq, int iSeqIndex; int iSeedIndex; /* indices for restoring order */ - int *restore; + int *restore = NULL; /* sorted copy of piSeeds */ - int *piSortedSeeds; + int *piSortedSeeds = NULL; #if TIMING Stopwatch_t *stopwatch = StopwatchCreate(); @@ -372,7 +374,8 @@ SeqToVec(double **ppdSeqVec, mseq_t *prMSeq, * distance function for each seq/seed pair * */ - if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType, + /* 4th argument (FALSE) is bPercID, in mBed mode never use percent-identities */ + if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType, FALSE, 0, iNumSeeds, 0, prMSeq->nseqs, NULL, NULL)) { Log(&rLog, LOG_ERROR, "Could not compute pairwise distances for mbed."); @@ -388,7 +391,7 @@ SeqToVec(double **ppdSeqVec, mseq_t *prMSeq, for (iSeqIndex=0; iSeqIndexnseqs; iSeqIndex++) { labels[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].name; } - SymMatrixPrint(prDistmat, labels, NULL); + SymMatrixPrint(prDistmat, labels, NULL, FALSE); CKFREE(labels); } #endif @@ -445,6 +448,7 @@ SeqToVec(double **ppdSeqVec, mseq_t *prMSeq, FreeSymMatrix(&prDistmat); CKFREE(restore); + CKFREE(piSortedSeeds); #if TIMING StopwatchStop(stopwatch); StopwatchDisplay(stdout, "Total time for SeqToVec(): ", stopwatch); @@ -662,7 +666,7 @@ void BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p, const int iNObjs, const int iDim, double **ppdVectors, const int iMinRequiredObjsPerCluster, - const int iMaxAllowedObjsPerCluster) + const int iMaxAllowedObjsPerCluster, char ***ppcClusterSplits_p) { int iN, iD; /* cluster centers for each cluster created at each split */ @@ -678,6 +682,8 @@ BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p, bool bSmallClusterDetected; /* queue of clusters which are to be split */ int_queue_t rClusterSplitQueue; + /* */ + int iLog2N2 = 0; #if TIMING Stopwatch_t *stopwatch = StopwatchCreate(); @@ -715,6 +721,32 @@ BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p, } + /* + * this is an array that encodes where a sequences fell at a bisecting k-means + * ppcClusterSplits_p can consume quite a bit of memory, + but it is only needed if clustering info is written to file. + Use ppcClusterSplits_p as a flag, initialise in the calling function (Mbed) + to NULL, if no info written out then leave ppcClusterSplits_p=NULL. + If info is to be written out then malloc one char*, so that + ppcClusterSplits_p!=NULL. In that case realloc proper amount. + Only expect a certain number of splits, iLog2N2. + Worst case would be iNObjs but then ppcClusterSplits_p + would be quadratic, which is too big for many sequences. + */ + if (NULL != *ppcClusterSplits_p){ + /* (square) of logarithm (base 2) of number of objects */ + double dLog2N = log10(iNObjs) / log10(2); + iLog2N2 = (int)(dLog2N*dLog2N); + + *ppcClusterSplits_p = (char **)CKREALLOC(*ppcClusterSplits_p, iNObjs*sizeof(char *)); + (*ppcClusterSplits_p)[0] = (char *)CKMALLOC(iNObjs*iLog2N2); + (*ppcClusterSplits_p)[0][0] = '\0'; + for (iN = 1; iN < iNObjs; iN++){ + (*ppcClusterSplits_p)[iN] = (*ppcClusterSplits_p)[iN-1] + iLog2N2; + (*ppcClusterSplits_p)[iN][0] = '\0'; + } + } + /* Starting with the first cluster that now contains all the * sequences keep splitting until no cluster contains more than @@ -786,8 +818,8 @@ BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p, } #if TRACE - Log(&rLog, LOG_FOCRED_DEBUG, "Round %d: Will split cluster %d which has %d objects", - iNRounds, iClusterToSplot, iNObjsInClusterToSplit); + Log(&rLog, LOG_FORCED_DEBUG, "Round %d: Will split cluster %d which has %d objects", + iNRounds, iClusterToSplot, iNObjsInClusterToSplit); fprintf(stderr, "DEBUG(%s|%s():%d): Object indices in cluster to split are:", __FILE__, __FUNCTION__, __LINE__); for (iN=0; iN=2 in r101 */ Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)", @@ -1111,34 +1149,41 @@ BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p, * @param[in] pcGuidetreeOut * Passed down to GuideTreeUpgma() * + * @note: if the number of sequences is smaller than + * MAX_ALLOWED_SEQ_PER_PRECLUSTER then there's no need to do the subclustering + * first. In fact it costs some extra time. However, it's insignificant and + * for simplicities sake we don't do any special checks + + * * @return Zero on success, non-zero on error * */ int Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType, - const char *pcGuidetreeOut) + const char *pcGuidetreeOut, int iClustersizes, const char *pcClusterFile) { /* number of seeds */ - int iNumSeeds; + int iNumSeeds = 0; /* seed indices matching prMSeq */ - int *piSeeds; + int *piSeeds = NULL; /* seqs represented as distance vectors */ - double **ppdSeqVec; + double **ppdSeqVec = NULL; /* kmeans result */ bisecting_kmeans_result_t *prKMeansResult = NULL; /* distance matrix of kmeans (pre-)cluster centers */ symmatrix_t *prPreClusterDistmat = NULL; /* auxiliary for symmetric matrix output; tree routines etc */ - char **ppcLabels; - int iNodeIndex; + char **ppcLabels = NULL; + int iNodeIndex = 0; /* mapping of cluster-center tree node indices to corresponding cluster */ - int *piClusterToTreeNode; - int iClusterIndex; + int *piClusterToTreeNode = NULL; + int iClusterIndex = 0; int iI, iJ; - FILE *pfOut; - progress_t *prSubClusterDistanceProgress; + FILE *pfOut = NULL; + progress_t *prSubClusterDistanceProgress = NULL; bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE; + char **ppcClusterSplits = NULL; #if FULL_WITHIN_CLUSTER_DISTANCES Log(&rLog, LOG_DEBUG, "Computing real distances within subclusters for mBed."); @@ -1194,15 +1239,46 @@ Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType, StopwatchStart(stopwatch); #endif + /* ppcClusterSplits can consume quite a bit of memory, + however, it is only needed if cluster information is + to be written to file. If no pcClusterFile is specified, + then ppcClusterSplits does not have to be allocated. + Use ppcClusterSplits as a flag, initialise to NULL, + if NULL is passed into BisectingKmeans then do NOT malloc, + if !NULL is passed in then realloc the appropriate memory */ + if (NULL != pcClusterFile){ + ppcClusterSplits = (char **)malloc(sizeof(char*)); + } + else { + ppcClusterSplits = NULL; + } + BisectingKmeans(&prKMeansResult, prMSeq->nseqs, iNumSeeds, ppdSeqVec, MIN_REQUIRED_SEQ_PER_PRECLUSTER, - MAX_ALLOWED_SEQ_PER_PRECLUSTER); + iClustersizes/*MAX_ALLOWED_SEQ_PER_PRECLUSTER*/, &ppcClusterSplits); Log(&rLog, LOG_INFO, "mBed created %u cluster/s (with a minimum of %d and a soft maximum of %d sequences each)", prKMeansResult->iNClusters, MIN_REQUIRED_SEQ_PER_PRECLUSTER, - MAX_ALLOWED_SEQ_PER_PRECLUSTER); + iClustersizes/*MAX_ALLOWED_SEQ_PER_PRECLUSTER*/); + if (NULL != pcClusterFile){ /* print clustering: FS, r275 -> */ + FILE *pfClust = NULL; + if (NULL == (pfClust = fopen(pcClusterFile, "w"))){ + Log(&rLog, LOG_FATAL, "Could not open file %s for writing", pcClusterFile); + } + for (iI = 0; iI < prKMeansResult->iNClusters; iI++) { + for (iJ=0; iJpiNObjsPerCluster[iI]; iJ++) { + int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][iJ]; + fprintf(pfClust, "Cluster %u: object %u has index %u (=seq %s %d~len)\t %s\n", + iI, iJ, iRealIndex, prMSeq->sqinfo[iRealIndex].name, prMSeq->sqinfo[iRealIndex].len, ppcClusterSplits[iRealIndex]); + } + } + fclose(pfClust); pfClust = NULL; + /* if there is no pcClusterFile then no memory will have been allocated */ + CKFREE(ppcClusterSplits[0]); + CKFREE(ppcClusterSplits); + } /* there was a request to write out clustering */ #if PRINT_CLUSTER_DISTRIBUTION Log(&rLog, LOG_FORCED_DEBUG, "Bisecting Kmeans returned %d clusters", prKMeansResult->iNClusters); @@ -1265,7 +1341,7 @@ Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType, } #if TRACE Log(&rLog, LOG_FORCED_DEBUG, "%s", "Distance matrix for pre-cluster centers:"); - SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL); + SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL, FALSE); #endif GuideTreeUpgma(prMbedTree_p, @@ -1277,7 +1353,7 @@ Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType, CKFREE(ppcLabels); #if TRACE Log(&rLog, LOG_FORCED_DEBUG, "%s", "Cluster-center guide-tree:"); - LogTree(*prMbedTree_p); + LogTree(*prMbedTree_p, stderr); #endif @@ -1337,7 +1413,12 @@ Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType, "%s\n", "Calling new Mbed use makes only sense if nseq>MAX_ALLOWED_SEQ_PER_PRECLUSTER"); if (TRUE == prMSeq->aligned) { - iPairDistType = PAIRDIST_SQUIDID_KIMURA; + if (SEQTYPE_PROTEIN == prMSeq->seqtype){ + iPairDistType = PAIRDIST_SQUIDID_KIMURA; + } + else { + iPairDistType = PAIRDIST_SQUIDID; + } } else { iPairDistType = PAIRDIST_KTUPLE; } @@ -1397,7 +1478,8 @@ Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType, iOldLogLevel = rLog.iLogLevelEnabled; rLog.iLogLevelEnabled = LOG_WARN; /* compute distances, but be quiet */ - if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType, + /* 4th argument (FALSE) is bPercID, in mBed mode never use percent-identities */ + if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType, FALSE, 0, prSubClusterMSeq->nseqs, 0, prSubClusterMSeq->nseqs, NULL, NULL)) { Log(&rLog, LOG_ERROR, "Couldn't compute pair distances"); @@ -1459,7 +1541,7 @@ Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType, #if TRACE Log(&rLog, LOG_FORCED_DEBUG, "Distance matrix for seqs within sub cluster %d/%d", iClusterIndex, prKMeansResult->iNClusters); - SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL); + SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL, FALSE); #endif