--- /dev/null
+#!/usr/bin/env python
+
+"""
+This module provides code for doing k-nearest-neighbors classification.
+
+k Nearest Neighbors is a supervised learning algorithm that classifies
+a new observation based the classes in its surrounding neighborhood.
+
+Glossary:
+distance The distance between two points in the feature space.
+weight The importance given to each point for classification.
+
+
+Classes:
+kNN Holds information for a nearest neighbors classifier.
+
+
+Functions:
+train Train a new kNN classifier.
+calculate Calculate the probabilities of each class, given an observation.
+classify Classify an observation into a class.
+
+ Weighting Functions:
+equal_weight Every example is given a weight of 1.
+
+"""
+
+#TODO - Remove this work around once we drop python 2.3 support
+try:
+ set = set
+except NameError:
+ from sets import Set as set
+
+import numpy
+
+class kNN:
+ """Holds information necessary to do nearest neighbors classification.
+
+ Members:
+ classes Set of the possible classes.
+ xs List of the neighbors.
+ ys List of the classes that the neighbors belong to.
+ k Number of neighbors to look at.
+
+ """
+ def __init__(self):
+ """kNN()"""
+ self.classes = set()
+ self.xs = []
+ self.ys = []
+ self.k = None
+
+def equal_weight(x, y):
+ """equal_weight(x, y) -> 1"""
+ # everything gets 1 vote
+ return 1
+
+def train(xs, ys, k, typecode=None):
+ """train(xs, ys, k) -> kNN
+
+ Train a k nearest neighbors classifier on a training set. xs is a
+ list of observations and ys is a list of the class assignments.
+ Thus, xs and ys should contain the same number of elements. k is
+ the number of neighbors that should be examined when doing the
+ classification.
+
+ """
+ knn = kNN()
+ knn.classes = set(ys)
+ knn.xs = numpy.asarray(xs, typecode)
+ knn.ys = ys
+ knn.k = k
+ return knn
+
+def calculate(knn, x, weight_fn=equal_weight, distance_fn=None):
+ """calculate(knn, x[, weight_fn][, distance_fn]) -> weight dict
+
+ Calculate the probability for each class. knn is a kNN object. x
+ is the observed data. weight_fn is an optional function that
+ takes x and a training example, and returns a weight. distance_fn
+ is an optional function that takes two points and returns the
+ distance between them. If distance_fn is None (the default), the
+ Euclidean distance is used. Returns a dictionary of the class to
+ the weight given to the class.
+
+ """
+ x = numpy.asarray(x)
+
+ order = [] # list of (distance, index)
+ if distance_fn:
+ for i in range(len(knn.xs)):
+ dist = distance_fn(x, knn.xs[i])
+ order.append((dist, i))
+ else:
+ # Default: Use a fast implementation of the Euclidean distance
+ temp = numpy.zeros(len(x))
+ # Predefining temp allows reuse of this array, making this
+ # function about twice as fast.
+ for i in range(len(knn.xs)):
+ temp[:] = x - knn.xs[i]
+ dist = numpy.sqrt(numpy.dot(temp,temp))
+ order.append((dist, i))
+ order.sort()
+
+ # first 'k' are the ones I want.
+ weights = {} # class -> number of votes
+ for k in knn.classes:
+ weights[k] = 0.0
+ for dist, i in order[:knn.k]:
+ klass = knn.ys[i]
+ weights[klass] = weights[klass] + weight_fn(x, knn.xs[i])
+
+ return weights
+
+def classify(knn, x, weight_fn=equal_weight, distance_fn=None):
+ """classify(knn, x[, weight_fn][, distance_fn]) -> class
+
+ Classify an observation into a class. If not specified, weight_fn will
+ give all neighbors equal weight. distance_fn is an optional function
+ that takes two points and returns the distance between them. If
+ distance_fn is None (the default), the Euclidean distance is used.
+ """
+ weights = calculate(
+ knn, x, weight_fn=weight_fn, distance_fn=distance_fn)
+
+ most_class = None
+ most_weight = None
+ for klass, weight in weights.items():
+ if most_class is None or weight > most_weight:
+ most_class = klass
+ most_weight = weight
+ return most_class