JWS-112 Bumping version of ClustalO (src, binaries and windows) to version 1.2.4.

[jabaws.git] / binaries / src / clustalo / src / clustal / mbed.c
diff --git a/binaries/src/clustalo/src/clustal/mbed.c b/binaries/src/clustalo/src/clustal/mbed.c

index f143fe0..2282aa6 100644 (file)
--- a/binaries/src/clustalo/src/clustal/mbed.c
+++ b/binaries/src/clustalo/src/clustal/mbed.c
@@ -15,12 +15,12 @@
   ********************************************************************/
  
  /*
- *  RCS $Id: mbed.c 254 2011-06-21 13:07:50Z andreas $
+ *  RCS $Id: mbed.c 316 2016-12-16 16:14:39Z fabian $
   *
   *
- * Reimplementation from scratch of mBed:
- * Blackshields et al. (2010); PMID 20470396
- *
+ * Reimplementation from scratch of mBed (Blackshields et al., 2010;
+ * PMID 20470396) and addition of bisecting K-Means which allows
+ * control over subcluster size
   *
   */
  
@@ -88,8 +88,10 @@ static const int RESTARTS_PER_SPLIT = 10;
  #define USE_KMEANS_LLOYDS 0
  
  
-#ifndef HAVE_LOG2
-#define log2(x)  (log(x) / 0.69314718055994530942)
+//#ifndef HAVE_LOG2
+#ifndef CLUSTAL_OMEGA_HAVE_LOG2
+//#define log2(x)  (log(x) / 0.69314718055994530942)
+#define log2(x)  ( x<=0 ? (float)(-100000) : 1.442695041*log(x) )
  #endif
  #define NUMBER_OF_SEEDS(n) pow(log2(((double)n)), 2)
  
@@ -306,9 +308,9 @@ SeqToVec(double **ppdSeqVec, mseq_t *prMSeq,
      int iSeqIndex;
      int iSeedIndex;
     /* indices for restoring order */
-    int *restore; 
+    int *restore = NULL; 
      /* sorted copy of piSeeds */
-    int *piSortedSeeds; 
+    int *piSortedSeeds = NULL; 
  
  #if TIMING
      Stopwatch_t *stopwatch = StopwatchCreate();
@@ -372,7 +374,8 @@ SeqToVec(double **ppdSeqVec, mseq_t *prMSeq,
       * distance function for each seq/seed pair
       *
       */
-    if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType,
+    /* 4th argument (FALSE) is bPercID, in mBed mode never use percent-identities */
+    if (0 != PairDistances(&prDistmat, prMSeq, iPairDistType, FALSE, 
                             0, iNumSeeds, 0, prMSeq->nseqs,
                             NULL, NULL)) {
          Log(&rLog, LOG_ERROR, "Could not compute pairwise distances for mbed.");
@@ -388,7 +391,7 @@ SeqToVec(double **ppdSeqVec, mseq_t *prMSeq,
          for (iSeqIndex=0; iSeqIndex<prMSeq->nseqs; iSeqIndex++) {
              labels[iSeqIndex] = prMSeq->sqinfo[iSeqIndex].name;
          }
-        SymMatrixPrint(prDistmat, labels, NULL);
+        SymMatrixPrint(prDistmat, labels, NULL, FALSE);
          CKFREE(labels);
      }
  #endif
@@ -445,6 +448,7 @@ SeqToVec(double **ppdSeqVec, mseq_t *prMSeq,
      
      FreeSymMatrix(&prDistmat);
      CKFREE(restore);
+    CKFREE(piSortedSeeds);
  #if TIMING
      StopwatchStop(stopwatch);
      StopwatchDisplay(stdout, "Total time for SeqToVec(): ", stopwatch);
@@ -662,7 +666,7 @@ void
  BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
                  const int iNObjs, const int iDim, double **ppdVectors,
                  const int iMinRequiredObjsPerCluster,
-                const int iMaxAllowedObjsPerCluster)
+                const int iMaxAllowedObjsPerCluster, char ***ppcClusterSplits_p)
  {
      int iN, iD;
      /* cluster centers for each cluster created at each split */
@@ -678,6 +682,8 @@ BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
      bool bSmallClusterDetected;
      /* queue of clusters which are to be split */
      int_queue_t rClusterSplitQueue;
+    /* */
+    int iLog2N2 = 0;
      
  #if TIMING
      Stopwatch_t *stopwatch = StopwatchCreate();
@@ -715,6 +721,32 @@ BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
      }
  
  
+    /*
+     * this is an array that encodes where a sequences fell at a bisecting k-means
+     * ppcClusterSplits_p can consume quite a bit of memory, 
+     but it is only needed if clustering info is written to file.
+     Use ppcClusterSplits_p as a flag, initialise in the calling function (Mbed) 
+     to NULL, if no info written out then leave ppcClusterSplits_p=NULL.
+     If info is to be written out then malloc one char*, so that 
+     ppcClusterSplits_p!=NULL. In that case realloc proper amount.
+     Only expect a certain number of splits, iLog2N2. 
+     Worst case would be iNObjs but then ppcClusterSplits_p 
+     would be quadratic, which is too big for many sequences.
+     */
+    if (NULL != *ppcClusterSplits_p){
+        /* (square) of logarithm (base 2) of number of objects */
+        double dLog2N = log10(iNObjs) / log10(2);
+        iLog2N2 = (int)(dLog2N*dLog2N);
+
+        *ppcClusterSplits_p      = (char **)CKREALLOC(*ppcClusterSplits_p, iNObjs*sizeof(char *));
+        (*ppcClusterSplits_p)[0] =  (char *)CKMALLOC(iNObjs*iLog2N2);
+        (*ppcClusterSplits_p)[0][0] = '\0';
+        for (iN = 1; iN < iNObjs; iN++){
+            (*ppcClusterSplits_p)[iN] = (*ppcClusterSplits_p)[iN-1] + iLog2N2;
+            (*ppcClusterSplits_p)[iN][0] = '\0';
+        }
+    }
+
  
      /* Starting with the first cluster that now contains all the
       * sequences keep splitting until no cluster contains more than
@@ -786,8 +818,8 @@ BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
          }
  
  #if TRACE
-        Log(&rLog, LOG_FOCRED_DEBUG, "Round %d: Will split cluster %d which has %d objects",
-                  iNRounds, iClusterToSplot, iNObjsInClusterToSplit);
+        Log(&rLog, LOG_FORCED_DEBUG, "Round %d: Will split cluster %d which has %d objects",
+            iNRounds, iClusterToSplot, iNObjsInClusterToSplit);
          fprintf(stderr, "DEBUG(%s|%s():%d): Object indices in cluster to split are:",
                  __FILE__, __FUNCTION__, __LINE__);
          for (iN=0; iN<iNObjsInClusterToSplit; iN++) {
@@ -889,7 +921,7 @@ BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
                          Log(&rLog, LOG_WARN, "%s(): Can't split cluster no. %d which has %d objects any further. %s",
                               __FUNCTION__,
                               iClusterToSplot, iNObjsInClusterToSplit,
-                         "Hope it's not too big and doesn't slow things down."
+                            "Hope it's not too big and doesn't slow things down."
                              );
                          bNaNDetected = TRUE;
                          break;
@@ -1022,8 +1054,14 @@ BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
  
              if (0 == iThisClusterAssignment) {
                  iThisClusterIndex = iClusterToSplot;
+                if (*ppcClusterSplits_p && strlen((*ppcClusterSplits_p)[iThisObjIdx])<iLog2N2){
+                    strcat((*ppcClusterSplits_p)[iThisObjIdx], "0");
+                }
              } else if (1 == iThisClusterAssignment) {
                  iThisClusterIndex = iNewClusterIdx;
+                if (*ppcClusterSplits_p && strlen((*ppcClusterSplits_p)[iThisObjIdx])<iLog2N2){
+                    strcat((*ppcClusterSplits_p)[iThisObjIdx], "1");
+                }
              } else {
                  /* there used to be code for iK>=2 in r101 */
                  Log(&rLog, LOG_FATAL, "Internal error: split into more than two clusters (got assignment %d)",
@@ -1111,34 +1149,41 @@ BisectingKmeans(bisecting_kmeans_result_t **prKMeansResult_p,
   * @param[in] pcGuidetreeOut
   * Passed down to GuideTreeUpgma()
   *
+ * @note: if the number of sequences is smaller than
+ * MAX_ALLOWED_SEQ_PER_PRECLUSTER then there's no need to do the subclustering
+ * first. In fact it costs some extra time. However, it's insignificant and
+ * for simplicities sake we don't do any special checks
+
+ * 
   * @return Zero on success, non-zero on error
   *
   */
  int
  Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
-     const char *pcGuidetreeOut)
+     const char *pcGuidetreeOut, int iClustersizes, const char *pcClusterFile)
  {
      /* number of seeds */
-    int iNumSeeds;
+    int iNumSeeds = 0;
      /* seed indices matching prMSeq */
-    int *piSeeds; 
+    int *piSeeds = NULL; 
      /* seqs represented as distance vectors */
-    double **ppdSeqVec;
+    double **ppdSeqVec = NULL;
      /* kmeans result */
      bisecting_kmeans_result_t *prKMeansResult = NULL;
      /* distance matrix of kmeans (pre-)cluster centers */
      symmatrix_t *prPreClusterDistmat = NULL;
      /* auxiliary for symmetric matrix output; tree routines etc */
-    char **ppcLabels;
-    int iNodeIndex;
+    char **ppcLabels = NULL;
+    int iNodeIndex = 0;
      /* mapping of cluster-center tree node indices to corresponding
         cluster */
-    int *piClusterToTreeNode;
-    int iClusterIndex;
+    int *piClusterToTreeNode = NULL;
+    int iClusterIndex = 0;
      int iI, iJ;
-    FILE *pfOut;
-    progress_t *prSubClusterDistanceProgress;
+    FILE *pfOut = NULL;
+    progress_t *prSubClusterDistanceProgress = NULL;
      bool bPrintCR = (rLog.iLogLevelEnabled<=LOG_VERBOSE) ? FALSE : TRUE;
+    char **ppcClusterSplits = NULL;
  
  #if FULL_WITHIN_CLUSTER_DISTANCES
      Log(&rLog, LOG_DEBUG, "Computing real distances within subclusters for mBed.");
@@ -1194,15 +1239,46 @@ Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
      StopwatchStart(stopwatch);
  #endif
  
+    /* ppcClusterSplits can consume quite a bit of memory,
+       however, it is only needed if cluster information is 
+       to be written to file. If no pcClusterFile is specified, 
+       then ppcClusterSplits does not have to be allocated.
+       Use ppcClusterSplits as a flag, initialise to NULL, 
+       if NULL is passed into BisectingKmeans then do NOT malloc, 
+       if !NULL is passed in then realloc the appropriate memory */
+    if (NULL != pcClusterFile){
+        ppcClusterSplits = (char **)malloc(sizeof(char*));
+    }
+    else {
+        ppcClusterSplits = NULL;
+    }
+
      BisectingKmeans(&prKMeansResult, prMSeq->nseqs, iNumSeeds, ppdSeqVec,
                      MIN_REQUIRED_SEQ_PER_PRECLUSTER,
-                    MAX_ALLOWED_SEQ_PER_PRECLUSTER);
+                    iClustersizes/*MAX_ALLOWED_SEQ_PER_PRECLUSTER*/, &ppcClusterSplits);
      Log(&rLog, LOG_INFO,
           "mBed created %u cluster/s (with a minimum of %d and a soft maximum of %d sequences each)",
           prKMeansResult->iNClusters,
           MIN_REQUIRED_SEQ_PER_PRECLUSTER,
-         MAX_ALLOWED_SEQ_PER_PRECLUSTER);
+        iClustersizes/*MAX_ALLOWED_SEQ_PER_PRECLUSTER*/);
  
+    if (NULL != pcClusterFile){ /* print clustering: FS, r275 -> */
+        FILE *pfClust = NULL;
+        if (NULL == (pfClust = fopen(pcClusterFile, "w"))){
+            Log(&rLog, LOG_FATAL, "Could not open file %s for writing", pcClusterFile);
+        }
+        for (iI = 0; iI < prKMeansResult->iNClusters; iI++) {
+            for (iJ=0; iJ<prKMeansResult->piNObjsPerCluster[iI]; iJ++) {
+                int iRealIndex = prKMeansResult->ppiObjIndicesPerCluster[iI][iJ];
+                fprintf(pfClust, "Cluster %u: object %u has index %u (=seq %s %d~len)\t %s\n",
+                        iI, iJ,  iRealIndex, prMSeq->sqinfo[iRealIndex].name, prMSeq->sqinfo[iRealIndex].len, ppcClusterSplits[iRealIndex]);
+            }
+        }
+        fclose(pfClust); pfClust = NULL;
+        /* if there is no pcClusterFile then no memory will have been allocated */
+        CKFREE(ppcClusterSplits[0]);
+        CKFREE(ppcClusterSplits);
+    } /* there was a request to write out clustering */
  
  #if PRINT_CLUSTER_DISTRIBUTION
      Log(&rLog, LOG_FORCED_DEBUG, "Bisecting Kmeans returned %d clusters", prKMeansResult->iNClusters);
@@ -1265,7 +1341,7 @@ Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
      }
  #if TRACE
      Log(&rLog, LOG_FORCED_DEBUG, "%s", "Distance matrix for pre-cluster centers:");
-    SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL);
+    SymMatrixPrint(prPreClusterDistmat, ppcLabels, NULL, FALSE);
  #endif
  
      GuideTreeUpgma(prMbedTree_p,
@@ -1277,7 +1353,7 @@ Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
      CKFREE(ppcLabels);
  #if TRACE
      Log(&rLog, LOG_FORCED_DEBUG, "%s", "Cluster-center guide-tree:");
-    LogTree(*prMbedTree_p);
+    LogTree(*prMbedTree_p, stderr);
  #endif
  
  
@@ -1337,7 +1413,12 @@ Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
              "%s\n", "Calling new Mbed use makes only sense if nseq>MAX_ALLOWED_SEQ_PER_PRECLUSTER");
  
          if (TRUE == prMSeq->aligned)  {
-            iPairDistType = PAIRDIST_SQUIDID_KIMURA;
+            if (SEQTYPE_PROTEIN == prMSeq->seqtype){
+                iPairDistType = PAIRDIST_SQUIDID_KIMURA;
+            }
+            else {
+                iPairDistType = PAIRDIST_SQUIDID;
+            }
          } else {
              iPairDistType = PAIRDIST_KTUPLE;
          }
@@ -1397,7 +1478,8 @@ Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
          iOldLogLevel = rLog.iLogLevelEnabled;
          rLog.iLogLevelEnabled = LOG_WARN;
          /* compute distances, but be quiet */
-        if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType,
+        /* 4th argument (FALSE) is bPercID, in mBed mode never use percent-identities */
+        if (PairDistances(&prWithinClusterDistances, prSubClusterMSeq, iPairDistType, FALSE, 
                            0, prSubClusterMSeq->nseqs, 0, prSubClusterMSeq->nseqs,
                            NULL, NULL)) {
              Log(&rLog, LOG_ERROR, "Couldn't compute pair distances");
@@ -1459,7 +1541,7 @@ Mbed(tree_t **prMbedTree_p, mseq_t *prMSeq, const int iPairDistType,
  #if TRACE
          Log(&rLog, LOG_FORCED_DEBUG, "Distance matrix for seqs within sub cluster %d/%d",
                    iClusterIndex, prKMeansResult->iNClusters);
-        SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL);
+        SymMatrixPrint(prWithinClusterDistances, ppcLabels, NULL, FALSE);
  #endif