--- /dev/null
+#!/usr/bin/env python
+
+"""
+This module provides code for doing logistic regressions.
+
+
+Classes:
+LogisticRegression Holds information for a LogisticRegression classifier.
+
+
+Functions:
+train Train a new classifier.
+calculate Calculate the probabilities of each class, given an observation.
+classify Classify an observation into a class.
+"""
+
#TODO - Remove this work around once we drop python 2.3 support
# Python 2.3 has no built-in ``set`` type; fall back to the old ``sets``
# module so the name ``set`` works uniformly throughout this module.
try:
    set = set
except NameError:
    from sets import Set as set

#from numpy import *
#from numpy.linalg import *
import numpy
import numpy.linalg
+
class LogisticRegression:
    """Model parameters for a two-class logistic regression classifier.

    Members:
    beta List of the weights, one per input dimension plus the
    constant (intercept) term.

    """
    def __init__(self):
        """LogisticRegression()"""
        # Untrained model: no weights until train() fills them in.
        self.beta = []
+
def train(xs, ys, update_fn=None, typecode=None):
    """train(xs, ys[, update_fn]) -> LogisticRegression

    Train a logistic regression classifier on a training set.  xs is a
    list of observations and ys is a list of the class assignments,
    which should be 0 or 1.  xs and ys should contain the same number
    of elements.  update_fn is an optional callback function that
    takes as parameters the iteration number and log likelihood.

    Raises ValueError on mismatched/invalid input and RuntimeError if
    the Newton-Raphson iteration fails to converge.

    """
    if len(xs) != len(ys):
        raise ValueError("xs and ys should be the same length.")
    classes = set(ys)
    if classes != set([0, 1]):
        raise ValueError("Classes should be 0's and 1's")
    if typecode is None:
        typecode = 'd'

    # Dimensionality of the data is the dimensionality of the
    # observations plus a constant dimension.
    N, ndims = len(xs), len(xs[0]) + 1
    if N == 0 or ndims == 1:
        raise ValueError("No observations or observation of 0 dimension.")

    # Make an X array, with a constant first dimension (intercept).
    X = numpy.ones((N, ndims), typecode)
    X[:, 1:] = xs
    Xt = numpy.transpose(X)
    y = numpy.asarray(ys, typecode)

    # Initialize the beta parameter to 0.
    beta = numpy.zeros(ndims, typecode)

    MAX_ITERATIONS = 500
    CONVERGE_THRESHOLD = 0.01
    stepsize = 1.0
    # Now iterate using Newton-Raphson until the log-likelihoods
    # converge.  (Renamed loop counter from "iter", which shadowed the
    # builtin.)
    i = 0
    old_beta = old_llik = None
    while i < MAX_ITERATIONS:
        # Calculate the probabilities. p = e^(beta X) / (1+e^(beta X))
        ebetaX = numpy.exp(numpy.dot(beta, Xt))
        p = ebetaX / (1 + ebetaX)

        # Find the log likelihood score and see if I've converged.
        logp = y * numpy.log(p) + (1 - y) * numpy.log(1 - p)
        llik = sum(logp)
        if update_fn is not None:
            update_fn(i, llik)
        # Guard the comparisons: on the first iteration old_llik is
        # None, and comparing a float against None raises TypeError on
        # Python 3.
        if old_llik is not None:
            # Check to see if the likelihood decreased.  If it did, then
            # restore the old beta parameters and half the step size.
            if llik < old_llik:
                stepsize = stepsize / 2.0
                beta = old_beta
            # If I've converged, then stop.
            if numpy.fabs(llik - old_llik) <= CONVERGE_THRESHOLD:
                break
        old_llik, old_beta = llik, beta
        i += 1

        W = numpy.identity(N) * p
        Xtyp = numpy.dot(Xt, y - p)  # Calculate the first derivative.
        XtWX = numpy.dot(numpy.dot(Xt, W), X)  # Calculate the second derivative.
        delta = numpy.linalg.solve(XtWX, Xtyp)
        if numpy.fabs(stepsize - 1.0) > 0.001:
            delta = delta * stepsize
        beta = beta + delta  # Update beta.
    else:
        raise RuntimeError("Didn't converge.")

    lr = LogisticRegression()
    # Materialize a plain list of Python floats; on Python 3, map()
    # returns a lazy iterator, which would break later dot products.
    lr.beta = [float(b) for b in beta]
    return lr
+
def calculate(lr, x):
    """calculate(lr, x) -> list of probabilities

    Calculate the probability for each class.  lr is a
    LogisticRegression object.  x is the observed data.  Returns a
    list of the probability that it fits each class.

    """
    # Prepend the constant (intercept) term to the observation.
    obs = numpy.asarray([1.0] + x)
    # Logistic function of the linear score: p = e^s / (1 + e^s).
    score = numpy.dot(lr.beta, obs)
    e_score = numpy.exp(score)
    prob_one = e_score / (1 + e_score)
    return [1 - prob_one, prob_one]
+
def classify(lr, x):
    """classify(lr, x) -> 1 or 0

    Classify an observation into a class.

    """
    # Pick whichever class has the higher probability; ties go to
    # class 1, matching the original behavior.
    prob_zero, prob_one = calculate(lr, x)
    if prob_one >= prob_zero:
        return 1
    return 0