3 // Author: David Arthur (darthur@gmail.com), 2009
\r
8 /* previously in KMeans.h */
\r
13 // Sets preferences for how much logging is done and where it is outputted, when k-means is run.
\r
14 void ClearKMeansLogging();
\r
15 void AddKMeansLogging(std::ostream *out, bool verbose);
\r
18 // Runs k-means++ on the given set of points. Set RunKMeans for info on the parameters.
\r
21 RunKMeansPlusPlus(int n, int k, int d, Scalar *points, int attempts,
\r
22 Scalar *centers, int *assignments);
\r
25 RunKMeans(int n, int k, int d, Scalar *points, int attempts,
\r
26 Scalar *centers, int *assignments);
\r
37 KMeans(int n, int k, int d, Scalar *points, int attempts, int use_lloyds_method,
\r
38 double *centers, int *assignments)
\r
40 if (use_lloyds_method) {
\r
41 /*fprintf(stderr, "FIXME using LLoyd's method\n");*/
\r
42 return RunKMeans(n, k, d, points, attempts,
\r
43 centers, assignments);
\r
45 /*fprintf(stderr, "FIXME using KMeansPP method\n");*/
\r
46 return RunKMeansPlusPlus(n, k, d, points, attempts,
\r
47 centers, assignments);
\r
52 using namespace std;
\r
55 static vector<ostream*> gLogOutputs;
\r
56 static vector<ostream*> gVerboseLogOutputs;
\r
57 #define LOG(verbose, text) { \
\r
58 vector<ostream*> &outputs = (verbose? gVerboseLogOutputs : gLogOutputs); \
\r
59 if (outputs.size() > 0) { \
\r
60 ostringstream string_stream; \
\r
61 string_stream << text; \
\r
62 for (int i = 0; i < (int)outputs.size(); i++) \
\r
63 *(outputs[i]) << string_stream.str(); \
\r
66 void AddKMeansLogging(std::ostream *out, bool verbose) {
\r
68 gVerboseLogOutputs.push_back(out);
\r
69 gLogOutputs.push_back(out);
\r
71 void ClearKMeansLogging() {
\r
72 gLogOutputs.clear();
\r
73 gVerboseLogOutputs.clear();
\r
76 // Returns the number of seconds since the program began execution.
\r
77 static double GetSeconds() {
\r
78 return double(clock()) / CLOCKS_PER_SEC;
\r
82 // Performs one full execution of k-means, logging any relevant information, and tracking meta
\r
83 // statistics for the run. If min or max values are negative, they are treated as unset.
\r
84 // best_centers and best_assignment can be 0, in which case they are not set.
\r
85 static void RunKMeansOnce(const KmTree &tree, int n, int k, int d, Scalar *points, Scalar *centers,
\r
86 Scalar *min_cost, Scalar *max_cost, Scalar *total_cost,
\r
87 double start_time, double *min_time, double *max_time,
\r
88 double *total_time, Scalar *best_centers, int *best_assignment) {
\r
89 const Scalar kEpsilon = Scalar(1e-8); // Used to determine when to terminate k-means
\r
91 // Do iterations of k-means until the cost stabilizes
\r
92 Scalar old_cost = 0;
\r
93 bool is_done = false;
\r
94 for (int iteration = 0; !is_done; iteration++) {
\r
95 Scalar new_cost = tree.DoKMeansStep(k, centers, 0);
\r
96 is_done = (iteration > 0 && new_cost >= (1 - kEpsilon) * old_cost);
\r
97 old_cost = new_cost;
\r
98 LOG(true, "Completed iteration #" << (iteration+1) << ", cost=" << new_cost << "..." << endl);
\r
100 double this_time = GetSeconds() - start_time;
\r
102 // Log the clustering we found
\r
103 LOG(false, "Completed run: cost=" << old_cost << " (" << this_time << " seconds)" << endl);
\r
105 // Handle a new min cost, updating best_centers and best_assignment as appropriate
\r
106 if (*min_cost < 0 || old_cost < *min_cost) {
\r
107 *min_cost = old_cost;
\r
108 if (best_assignment != 0)
\r
109 tree.DoKMeansStep(k, centers, best_assignment);
\r
110 if (best_centers != 0)
\r
111 memcpy(best_centers, centers, sizeof(Scalar)*k*d);
\r
114 // Update all other aggregate stats
\r
115 if (*max_cost < old_cost) *max_cost = old_cost;
\r
116 *total_cost += old_cost;
\r
117 if (*min_time < 0 || *min_time > this_time)
\r
118 *min_time = this_time;
\r
119 if (*max_time < this_time) *max_time = this_time;
\r
120 *total_time += this_time;
\r
123 // Outputs all meta-stats for a set of k-means or k-means++ runs.
\r
124 void LogMetaStats(Scalar min_cost, Scalar max_cost, Scalar total_cost,
\r
125 double min_time, double max_time, double total_time, int num_attempts) {
\r
126 LOG(false, "Aggregate info over " << num_attempts << " runs:" << endl);
\r
127 LOG(false, " Cost: min=" << min_cost << " average=" << (total_cost / num_attempts)
\r
128 << " max=" << max_cost << endl);
\r
129 LOG(false, " Time: min=" << min_time << " average=" << (total_time / num_attempts)
\r
130 << " max=" << max_time << endl << endl);
\r
134 Scalar RunKMeans(int n, int k, int d, Scalar *points, int attempts,
\r
135 Scalar *ret_centers, int *ret_assignment) {
\r
138 // Create the tree and log
\r
139 LOG(false, "Running k-means..." << endl);
\r
140 KmTree tree(n, d, points);
\r
141 LOG(false, "Done preprocessing..." << endl);
\r
144 Scalar *centers = (Scalar*)malloc(sizeof(Scalar)*k*d);
\r
145 int *unused_centers = (int*)malloc(sizeof(int)*n);
\r
146 KM_ASSERT(centers != 0 && unused_centers != 0);
\r
147 Scalar min_cost = -1, max_cost = -1, total_cost = 0;
\r
148 double min_time = -1, max_time = -1, total_time = 0;
\r
152 memset(centers + n*d, -1, (k-d)*sizeof(Scalar));
\r
156 // Run all the attempts
\r
157 for (int attempt = 0; attempt < attempts; attempt++) {
\r
158 double start_time = GetSeconds();
\r
160 // Choose centers uniformly at random
\r
161 for (int i = 0; i < n; i++)
\r
162 unused_centers[i] = i;
\r
163 int num_unused_centers = n;
\r
164 for (int i = 0; i < k; i++) {
\r
165 int j = GetRandom(num_unused_centers--);
\r
166 memcpy(centers + i*d, points + unused_centers[j]*d, d*sizeof(Scalar));
\r
167 unused_centers[j] = unused_centers[num_unused_centers];
\r
171 RunKMeansOnce(tree, n, k, d, points, centers, &min_cost, &max_cost, &total_cost, start_time,
\r
172 &min_time, &max_time, &total_time, ret_centers, ret_assignment);
\r
174 LogMetaStats(min_cost, max_cost, total_cost, min_time, max_time, total_time, attempts);
\r
176 // Clean up and return
\r
177 free(unused_centers);
\r
183 Scalar RunKMeansPlusPlus(int n, int k, int d, Scalar *points, int attempts,
\r
184 Scalar *ret_centers, int *ret_assignment) {
\r
187 // Create the tree and log
\r
188 LOG(false, "Running k-means++..." << endl);
\r
189 KmTree tree(n, d, points);
\r
190 LOG(false, "Done preprocessing..." << endl);
\r
193 Scalar *centers = (Scalar*)malloc(sizeof(Scalar)*k*d);
\r
194 KM_ASSERT(centers != 0);
\r
195 Scalar min_cost = -1, max_cost = -1, total_cost = 0;
\r
196 double min_time = -1, max_time = -1, total_time = 0;
\r
198 // Run all the attempts
\r
199 for (int attempt = 0; attempt < attempts; attempt++) {
\r
200 double start_time = GetSeconds();
\r
202 // Choose centers using k-means++ seeding
\r
203 tree.SeedKMeansPlusPlus(k, centers);
\r
206 RunKMeansOnce(tree, n, k, d, points, centers, &min_cost, &max_cost, &total_cost, start_time,
\r
207 &min_time, &max_time, &total_time, ret_centers, ret_assignment);
\r
209 LogMetaStats(min_cost, max_cost, total_cost, min_time, max_time, total_time, attempts);
\r
211 // Clean up and return
\r