Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / disembl / biopython-1.50 / Bio / kNN.py
diff --git a/website/archive/binaries/mac/src/disembl/biopython-1.50/Bio/kNN.py b/website/archive/binaries/mac/src/disembl/biopython-1.50/Bio/kNN.py
new file mode 100644 (file)
index 0000000..4bd2d8d
--- /dev/null
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+
+"""
+This module provides code for doing k-nearest-neighbors classification.
+
+k Nearest Neighbors is a supervised learning algorithm that classifies
+a new observation based the classes in its surrounding neighborhood.
+
+Glossary:
+distance   The distance between two points in the feature space.
+weight     The importance given to each point for classification. 
+
+
+Classes:
+kNN           Holds information for a nearest neighbors classifier.
+
+
+Functions:
+train        Train a new kNN classifier.
+calculate    Calculate the probabilities of each class, given an observation.
+classify     Classify an observation into a class.
+
+    Weighting Functions:
+equal_weight    Every example is given a weight of 1.
+
+"""
+
+#TODO - Remove this work around once we drop python 2.3 support
+try:
+    set = set
+except NameError:
+    from sets import Set as set
+
+import numpy
+
+class kNN:
+    """Holds information necessary to do nearest neighbors classification.
+
+    Members:
+    classes  Set of the possible classes.
+    xs       List of the neighbors.
+    ys       List of the classes that the neighbors belong to.
+    k        Number of neighbors to look at.
+
+    """
+    def __init__(self):
+        """kNN()"""
+        self.classes = set()
+        self.xs = []
+        self.ys = []
+        self.k = None
+
+def equal_weight(x, y):
+    """equal_weight(x, y) -> 1"""
+    # everything gets 1 vote
+    return 1
+
+def train(xs, ys, k, typecode=None):
+    """train(xs, ys, k) -> kNN
+    
+    Train a k nearest neighbors classifier on a training set.  xs is a
+    list of observations and ys is a list of the class assignments.
+    Thus, xs and ys should contain the same number of elements.  k is
+    the number of neighbors that should be examined when doing the
+    classification.
+    
+    """
+    knn = kNN()
+    knn.classes = set(ys)
+    knn.xs = numpy.asarray(xs, typecode)
+    knn.ys = ys
+    knn.k = k
+    return knn
+
+def calculate(knn, x, weight_fn=equal_weight, distance_fn=None):
+    """calculate(knn, x[, weight_fn][, distance_fn]) -> weight dict
+
+    Calculate the probability for each class.  knn is a kNN object.  x
+    is the observed data.  weight_fn is an optional function that
+    takes x and a training example, and returns a weight.  distance_fn
+    is an optional function that takes two points and returns the
+    distance between them.  If distance_fn is None (the default), the
+    Euclidean distance is used.  Returns a dictionary of the class to
+    the weight given to the class.
+    
+    """
+    x = numpy.asarray(x)
+
+    order = []  # list of (distance, index)
+    if distance_fn:
+        for i in range(len(knn.xs)):
+            dist = distance_fn(x, knn.xs[i])
+            order.append((dist, i))
+    else:
+        # Default: Use a fast implementation of the Euclidean distance
+        temp = numpy.zeros(len(x))
+        # Predefining temp allows reuse of this array, making this
+        # function about twice as fast.
+        for i in range(len(knn.xs)):
+            temp[:] = x - knn.xs[i]
+            dist = numpy.sqrt(numpy.dot(temp,temp))
+            order.append((dist, i))
+    order.sort()
+
+    # first 'k' are the ones I want.
+    weights = {}  # class -> number of votes
+    for k in knn.classes:
+        weights[k] = 0.0
+    for dist, i in order[:knn.k]:
+        klass = knn.ys[i]
+        weights[klass] = weights[klass] + weight_fn(x, knn.xs[i])
+
+    return weights
+
+def classify(knn, x, weight_fn=equal_weight, distance_fn=None):
+    """classify(knn, x[, weight_fn][, distance_fn]) -> class
+
+    Classify an observation into a class.  If not specified, weight_fn will
+    give all neighbors equal weight.  distance_fn is an optional function
+    that takes two points and returns the distance between them.  If
+    distance_fn is None (the default), the Euclidean distance is used.
+    """
+    weights = calculate(
+        knn, x, weight_fn=weight_fn, distance_fn=distance_fn)
+
+    most_class = None
+    most_weight = None
+    for klass, weight in weights.items():
+        if most_class is None or weight > most_weight:
+            most_class = klass
+            most_weight = weight
+    return most_class