+++ /dev/null
-// See KMeans.h\r
-//\r
-// Author: David Arthur (darthur@gmail.com), 2009\r
-\r
-// Includes\r
-\r
-#ifdef CLUSTALO\r
-/* previously in KMeans.h */\r
-#include "KmUtils.h"\r
-#include <iostream>\r
-#include <stdio.h>\r
-\r
-// Sets preferences for how much logging is done and where it is outputted, when k-means is run.\r
-void ClearKMeansLogging();\r
-void AddKMeansLogging(std::ostream *out, bool verbose);\r
-\r
-/*\r
-// Runs k-means++ on the given set of points. Set RunKMeans for info on the parameters.\r
-*/\r
-Scalar\r
-RunKMeansPlusPlus(int n, int k, int d, Scalar *points, int attempts,\r
- Scalar *centers, int *assignments);\r
-\r
-Scalar\r
-RunKMeans(int n, int k, int d, Scalar *points, int attempts,\r
- Scalar *centers, int *assignments);\r
-#else\r
-#include "KMeans.h"\r
-#endif\r
-#include "KmTree.h"\r
-#include <sstream>\r
-#include <time.h>\r
-#include <vector>\r
-\r
-#ifdef CLUSTALO\r
-extern "C" double\r
-KMeans(int n, int k, int d, Scalar *points, int attempts, int use_lloyds_method,\r
- double *centers, int *assignments)\r
-{\r
- if (use_lloyds_method) {\r
- /*fprintf(stderr, "FIXME using LLoyd's method\n");*/\r
- return RunKMeans(n, k, d, points, attempts,\r
- centers, assignments);\r
- } else {\r
- /*fprintf(stderr, "FIXME using KMeansPP method\n");*/\r
- return RunKMeansPlusPlus(n, k, d, points, attempts,\r
- centers, assignments);\r
- }\r
-}\r
-#endif\r
-\r
-using namespace std;\r
-\r
-// Logging\r
-static vector<ostream*> gLogOutputs;\r
-static vector<ostream*> gVerboseLogOutputs;\r
-#define LOG(verbose, text) { \\r
- vector<ostream*> &outputs = (verbose? gVerboseLogOutputs : gLogOutputs); \\r
- if (outputs.size() > 0) { \\r
- ostringstream string_stream; \\r
- string_stream << text; \\r
- for (int i = 0; i < (int)outputs.size(); i++) \\r
- *(outputs[i]) << string_stream.str(); \\r
- } \\r
-}\r
-void AddKMeansLogging(std::ostream *out, bool verbose) {\r
- if (verbose)\r
- gVerboseLogOutputs.push_back(out);\r
- gLogOutputs.push_back(out);\r
-}\r
-void ClearKMeansLogging() {\r
- gLogOutputs.clear();\r
- gVerboseLogOutputs.clear();\r
-}\r
-\r
-// Returns the number of seconds since the program began execution.\r
-static double GetSeconds() {\r
- return double(clock()) / CLOCKS_PER_SEC;\r
-}\r
-\r
-// See KMeans.h\r
-// Performs one full execution of k-means, logging any relevant information, and tracking meta\r
-// statistics for the run. If min or max values are negative, they are treated as unset.\r
-// best_centers and best_assignment can be 0, in which case they are not set.\r
-static void RunKMeansOnce(const KmTree &tree, int n, int k, int d, Scalar *points, Scalar *centers,\r
- Scalar *min_cost, Scalar *max_cost, Scalar *total_cost,\r
- double start_time, double *min_time, double *max_time,\r
- double *total_time, Scalar *best_centers, int *best_assignment) {\r
- const Scalar kEpsilon = Scalar(1e-8); // Used to determine when to terminate k-means\r
-\r
- // Do iterations of k-means until the cost stabilizes\r
- Scalar old_cost = 0;\r
- bool is_done = false;\r
- for (int iteration = 0; !is_done; iteration++) {\r
- Scalar new_cost = tree.DoKMeansStep(k, centers, 0);\r
- is_done = (iteration > 0 && new_cost >= (1 - kEpsilon) * old_cost);\r
- old_cost = new_cost;\r
- LOG(true, "Completed iteration #" << (iteration+1) << ", cost=" << new_cost << "..." << endl);\r
- }\r
- double this_time = GetSeconds() - start_time;\r
-\r
- // Log the clustering we found\r
- LOG(false, "Completed run: cost=" << old_cost << " (" << this_time << " seconds)" << endl);\r
-\r
- // Handle a new min cost, updating best_centers and best_assignment as appropriate\r
- if (*min_cost < 0 || old_cost < *min_cost) {\r
- *min_cost = old_cost;\r
- if (best_assignment != 0)\r
- tree.DoKMeansStep(k, centers, best_assignment);\r
- if (best_centers != 0)\r
- memcpy(best_centers, centers, sizeof(Scalar)*k*d);\r
- }\r
-\r
- // Update all other aggregate stats\r
- if (*max_cost < old_cost) *max_cost = old_cost;\r
- *total_cost += old_cost;\r
- if (*min_time < 0 || *min_time > this_time)\r
- *min_time = this_time;\r
- if (*max_time < this_time) *max_time = this_time;\r
- *total_time += this_time;\r
-}\r
-\r
-// Outputs all meta-stats for a set of k-means or k-means++ runs.\r
-void LogMetaStats(Scalar min_cost, Scalar max_cost, Scalar total_cost,\r
- double min_time, double max_time, double total_time, int num_attempts) {\r
- LOG(false, "Aggregate info over " << num_attempts << " runs:" << endl);\r
- LOG(false, " Cost: min=" << min_cost << " average=" << (total_cost / num_attempts)\r
- << " max=" << max_cost << endl);\r
- LOG(false, " Time: min=" << min_time << " average=" << (total_time / num_attempts)\r
- << " max=" << max_time << endl << endl);\r
-}\r
-\r
-// See KMeans.h\r
-Scalar RunKMeans(int n, int k, int d, Scalar *points, int attempts,\r
- Scalar *ret_centers, int *ret_assignment) {\r
- KM_ASSERT(k >= 1);\r
- \r
- // Create the tree and log\r
- LOG(false, "Running k-means..." << endl);\r
- KmTree tree(n, d, points);\r
- LOG(false, "Done preprocessing..." << endl);\r
-\r
- // Initialization\r
- Scalar *centers = (Scalar*)malloc(sizeof(Scalar)*k*d);\r
- int *unused_centers = (int*)malloc(sizeof(int)*n);\r
- KM_ASSERT(centers != 0 && unused_centers != 0);\r
- Scalar min_cost = -1, max_cost = -1, total_cost = 0;\r
- double min_time = -1, max_time = -1, total_time = 0;\r
- \r
- // Handle k > n\r
- if (k > n) {\r
- memset(centers + n*d, -1, (k-d)*sizeof(Scalar));\r
- k = n;\r
- }\r
-\r
- // Run all the attempts\r
- for (int attempt = 0; attempt < attempts; attempt++) {\r
- double start_time = GetSeconds();\r
-\r
- // Choose centers uniformly at random\r
- for (int i = 0; i < n; i++)\r
- unused_centers[i] = i;\r
- int num_unused_centers = n;\r
- for (int i = 0; i < k; i++) {\r
- int j = GetRandom(num_unused_centers--);\r
- memcpy(centers + i*d, points + unused_centers[j]*d, d*sizeof(Scalar));\r
- unused_centers[j] = unused_centers[num_unused_centers];\r
- }\r
- \r
- // Run k-means\r
- RunKMeansOnce(tree, n, k, d, points, centers, &min_cost, &max_cost, &total_cost, start_time,\r
- &min_time, &max_time, &total_time, ret_centers, ret_assignment);\r
- }\r
- LogMetaStats(min_cost, max_cost, total_cost, min_time, max_time, total_time, attempts);\r
-\r
- // Clean up and return\r
- free(unused_centers);\r
- free(centers);\r
- return min_cost;\r
-}\r
-\r
-// See KMeans.h\r
-Scalar RunKMeansPlusPlus(int n, int k, int d, Scalar *points, int attempts,\r
- Scalar *ret_centers, int *ret_assignment) {\r
- KM_ASSERT(k >= 1);\r
-\r
- // Create the tree and log\r
- LOG(false, "Running k-means++..." << endl);\r
- KmTree tree(n, d, points);\r
- LOG(false, "Done preprocessing..." << endl);\r
-\r
- // Initialization\r
- Scalar *centers = (Scalar*)malloc(sizeof(Scalar)*k*d);\r
- KM_ASSERT(centers != 0);\r
- Scalar min_cost = -1, max_cost = -1, total_cost = 0;\r
- double min_time = -1, max_time = -1, total_time = 0;\r
-\r
- // Run all the attempts\r
- for (int attempt = 0; attempt < attempts; attempt++) {\r
- double start_time = GetSeconds();\r
-\r
- // Choose centers using k-means++ seeding\r
- tree.SeedKMeansPlusPlus(k, centers);\r
- \r
- // Run k-means\r
- RunKMeansOnce(tree, n, k, d, points, centers, &min_cost, &max_cost, &total_cost, start_time,\r
- &min_time, &max_time, &total_time, ret_centers, ret_assignment);\r
- }\r
- LogMetaStats(min_cost, max_cost, total_cost, min_time, max_time, total_time, attempts);\r
-\r
- // Clean up and return\r
- free(centers);\r
- return min_cost;\r
-}\r