--- /dev/null
+// See KMeans.h\r
+//\r
+// Author: David Arthur (darthur@gmail.com), 2009\r
+\r
+// Includes\r
+\r
+#ifdef CLUSTALO\r
+/* previously in KMeans.h */\r
+#include "KmUtils.h"\r
+#include <iostream>\r
+#include <stdio.h>\r
+\r
+// Sets preferences for how much logging is done and where it is outputted, when k-means is run.\r
+void ClearKMeansLogging();\r
+void AddKMeansLogging(std::ostream *out, bool verbose);\r
+\r
+/*\r
+// Runs k-means++ on the given set of points. Set RunKMeans for info on the parameters.\r
+*/\r
+Scalar\r
+RunKMeansPlusPlus(int n, int k, int d, Scalar *points, int attempts,\r
+ Scalar *centers, int *assignments);\r
+\r
+Scalar\r
+RunKMeans(int n, int k, int d, Scalar *points, int attempts,\r
+ Scalar *centers, int *assignments);\r
+#else\r
+#include "KMeans.h"\r
+#endif\r
+#include "KmTree.h"\r
+#include <sstream>\r
+#include <time.h>\r
+#include <vector>\r
+\r
+#ifdef CLUSTALO\r
+extern "C" double\r
+KMeans(int n, int k, int d, Scalar *points, int attempts, int use_lloyds_method,\r
+ double *centers, int *assignments)\r
+{\r
+ if (use_lloyds_method) {\r
+ /*fprintf(stderr, "FIXME using LLoyd's method\n");*/\r
+ return RunKMeans(n, k, d, points, attempts,\r
+ centers, assignments);\r
+ } else {\r
+ /*fprintf(stderr, "FIXME using KMeansPP method\n");*/\r
+ return RunKMeansPlusPlus(n, k, d, points, attempts,\r
+ centers, assignments);\r
+ }\r
+}\r
+#endif\r
+\r
+using namespace std;\r
+\r
+// Logging\r
+static vector<ostream*> gLogOutputs;\r
+static vector<ostream*> gVerboseLogOutputs;\r
+#define LOG(verbose, text) { \\r
+ vector<ostream*> &outputs = (verbose? gVerboseLogOutputs : gLogOutputs); \\r
+ if (outputs.size() > 0) { \\r
+ ostringstream string_stream; \\r
+ string_stream << text; \\r
+ for (int i = 0; i < (int)outputs.size(); i++) \\r
+ *(outputs[i]) << string_stream.str(); \\r
+ } \\r
+}\r
+void AddKMeansLogging(std::ostream *out, bool verbose) {\r
+ if (verbose)\r
+ gVerboseLogOutputs.push_back(out);\r
+ gLogOutputs.push_back(out);\r
+}\r
+void ClearKMeansLogging() {\r
+ gLogOutputs.clear();\r
+ gVerboseLogOutputs.clear();\r
+}\r
+\r
+// Returns the number of seconds since the program began execution.\r
+static double GetSeconds() {\r
+ return double(clock()) / CLOCKS_PER_SEC;\r
+}\r
+\r
+// See KMeans.h\r
+// Performs one full execution of k-means, logging any relevant information, and tracking meta\r
+// statistics for the run. If min or max values are negative, they are treated as unset.\r
+// best_centers and best_assignment can be 0, in which case they are not set.\r
+static void RunKMeansOnce(const KmTree &tree, int n, int k, int d, Scalar *points, Scalar *centers,\r
+ Scalar *min_cost, Scalar *max_cost, Scalar *total_cost,\r
+ double start_time, double *min_time, double *max_time,\r
+ double *total_time, Scalar *best_centers, int *best_assignment) {\r
+ const Scalar kEpsilon = Scalar(1e-8); // Used to determine when to terminate k-means\r
+\r
+ // Do iterations of k-means until the cost stabilizes\r
+ Scalar old_cost = 0;\r
+ bool is_done = false;\r
+ for (int iteration = 0; !is_done; iteration++) {\r
+ Scalar new_cost = tree.DoKMeansStep(k, centers, 0);\r
+ is_done = (iteration > 0 && new_cost >= (1 - kEpsilon) * old_cost);\r
+ old_cost = new_cost;\r
+ LOG(true, "Completed iteration #" << (iteration+1) << ", cost=" << new_cost << "..." << endl);\r
+ }\r
+ double this_time = GetSeconds() - start_time;\r
+\r
+ // Log the clustering we found\r
+ LOG(false, "Completed run: cost=" << old_cost << " (" << this_time << " seconds)" << endl);\r
+\r
+ // Handle a new min cost, updating best_centers and best_assignment as appropriate\r
+ if (*min_cost < 0 || old_cost < *min_cost) {\r
+ *min_cost = old_cost;\r
+ if (best_assignment != 0)\r
+ tree.DoKMeansStep(k, centers, best_assignment);\r
+ if (best_centers != 0)\r
+ memcpy(best_centers, centers, sizeof(Scalar)*k*d);\r
+ }\r
+\r
+ // Update all other aggregate stats\r
+ if (*max_cost < old_cost) *max_cost = old_cost;\r
+ *total_cost += old_cost;\r
+ if (*min_time < 0 || *min_time > this_time)\r
+ *min_time = this_time;\r
+ if (*max_time < this_time) *max_time = this_time;\r
+ *total_time += this_time;\r
+}\r
+\r
+// Outputs all meta-stats for a set of k-means or k-means++ runs.\r
+void LogMetaStats(Scalar min_cost, Scalar max_cost, Scalar total_cost,\r
+ double min_time, double max_time, double total_time, int num_attempts) {\r
+ LOG(false, "Aggregate info over " << num_attempts << " runs:" << endl);\r
+ LOG(false, " Cost: min=" << min_cost << " average=" << (total_cost / num_attempts)\r
+ << " max=" << max_cost << endl);\r
+ LOG(false, " Time: min=" << min_time << " average=" << (total_time / num_attempts)\r
+ << " max=" << max_time << endl << endl);\r
+}\r
+\r
+// See KMeans.h\r
+Scalar RunKMeans(int n, int k, int d, Scalar *points, int attempts,\r
+ Scalar *ret_centers, int *ret_assignment) {\r
+ KM_ASSERT(k >= 1);\r
+ \r
+ // Create the tree and log\r
+ LOG(false, "Running k-means..." << endl);\r
+ KmTree tree(n, d, points);\r
+ LOG(false, "Done preprocessing..." << endl);\r
+\r
+ // Initialization\r
+ Scalar *centers = (Scalar*)malloc(sizeof(Scalar)*k*d);\r
+ int *unused_centers = (int*)malloc(sizeof(int)*n);\r
+ KM_ASSERT(centers != 0 && unused_centers != 0);\r
+ Scalar min_cost = -1, max_cost = -1, total_cost = 0;\r
+ double min_time = -1, max_time = -1, total_time = 0;\r
+ \r
+ // Handle k > n\r
+ if (k > n) {\r
+ memset(centers + n*d, -1, (k-d)*sizeof(Scalar));\r
+ k = n;\r
+ }\r
+\r
+ // Run all the attempts\r
+ for (int attempt = 0; attempt < attempts; attempt++) {\r
+ double start_time = GetSeconds();\r
+\r
+ // Choose centers uniformly at random\r
+ for (int i = 0; i < n; i++)\r
+ unused_centers[i] = i;\r
+ int num_unused_centers = n;\r
+ for (int i = 0; i < k; i++) {\r
+ int j = GetRandom(num_unused_centers--);\r
+ memcpy(centers + i*d, points + unused_centers[j]*d, d*sizeof(Scalar));\r
+ unused_centers[j] = unused_centers[num_unused_centers];\r
+ }\r
+ \r
+ // Run k-means\r
+ RunKMeansOnce(tree, n, k, d, points, centers, &min_cost, &max_cost, &total_cost, start_time,\r
+ &min_time, &max_time, &total_time, ret_centers, ret_assignment);\r
+ }\r
+ LogMetaStats(min_cost, max_cost, total_cost, min_time, max_time, total_time, attempts);\r
+\r
+ // Clean up and return\r
+ free(unused_centers);\r
+ free(centers);\r
+ return min_cost;\r
+}\r
+\r
+// See KMeans.h\r
+Scalar RunKMeansPlusPlus(int n, int k, int d, Scalar *points, int attempts,\r
+ Scalar *ret_centers, int *ret_assignment) {\r
+ KM_ASSERT(k >= 1);\r
+\r
+ // Create the tree and log\r
+ LOG(false, "Running k-means++..." << endl);\r
+ KmTree tree(n, d, points);\r
+ LOG(false, "Done preprocessing..." << endl);\r
+\r
+ // Initialization\r
+ Scalar *centers = (Scalar*)malloc(sizeof(Scalar)*k*d);\r
+ KM_ASSERT(centers != 0);\r
+ Scalar min_cost = -1, max_cost = -1, total_cost = 0;\r
+ double min_time = -1, max_time = -1, total_time = 0;\r
+\r
+ // Run all the attempts\r
+ for (int attempt = 0; attempt < attempts; attempt++) {\r
+ double start_time = GetSeconds();\r
+\r
+ // Choose centers using k-means++ seeding\r
+ tree.SeedKMeansPlusPlus(k, centers);\r
+ \r
+ // Run k-means\r
+ RunKMeansOnce(tree, n, k, d, points, centers, &min_cost, &max_cost, &total_cost, start_time,\r
+ &min_time, &max_time, &total_time, ret_centers, ret_assignment);\r
+ }\r
+ LogMetaStats(min_cost, max_cost, total_cost, min_time, max_time, total_time, attempts);\r
+\r
+ // Clean up and return\r
+ free(centers);\r
+ return min_cost;\r
+}\r