4 This module provides code for doing logistic regressions.
8 LogisticRegression Holds information for a LogisticRegression classifier.
12 train Train a new classifier.
13 calculate Calculate the probabilities of each class, given an observation.
14 classify Classify an observation into a class.
17 #TODO - Remove this work around once we drop python 2.3 support
21 from sets import Set as set
24 #from numpy.linalg import *
28 class LogisticRegression:
29 """Holds information necessary to do logistic regression
33 beta List of the weights for each dimension.
37 """LogisticRegression()"""
40 def train(xs, ys, update_fn=None, typecode=None):
41 """train(xs, ys[, update_fn]) -> LogisticRegression
43 Train a logistic regression classifier on a training set. xs is a
44 list of observations and ys is a list of the class assignments,
45 which should be 0 or 1. xs and ys should contain the same number
46 of elements. update_fn is an optional callback function that
47 takes as parameters that iteration number and log likelihood.
50 if len(xs) != len(ys):
51 raise ValueError("xs and ys should be the same length.")
53 if classes != set([0, 1]):
54 raise ValueError("Classes should be 0's and 1's")
58 # Dimensionality of the data is the dimensionality of the
59 # observations plus a constant dimension.
60 N, ndims = len(xs), len(xs[0]) + 1
62 raise ValueError("No observations or observation of 0 dimension.")
64 # Make an X array, with a constant first dimension.
65 X = numpy.ones((N, ndims), typecode)
67 Xt = numpy.transpose(X)
68 y = numpy.asarray(ys, typecode)
70 # Initialize the beta parameter to 0.
71 beta = numpy.zeros(ndims, typecode)
74 CONVERGE_THRESHOLD = 0.01
76 # Now iterate using Newton-Raphson until the log-likelihoods
79 old_beta = old_llik = None
80 while iter < MAX_ITERATIONS:
81 # Calculate the probabilities. p = e^(beta X) / (1+e^(beta X))
82 ebetaX = numpy.exp(numpy.dot(beta, Xt))
83 p = ebetaX / (1+ebetaX)
85 # Find the log likelihood score and see if I've converged.
86 logp = y*numpy.log(p) + (1-y)*numpy.log(1-p)
88 if update_fn is not None:
90 # Check to see if the likelihood decreased. If it did, then
91 # restore the old beta parameters and half the step size.
93 stepsize = stepsize / 2.0
95 # If I've converged, then stop.
96 if old_llik is not None and numpy.fabs(llik-old_llik) <= CONVERGE_THRESHOLD:
98 old_llik, old_beta = llik, beta
101 W = numpy.identity(N) * p
102 Xtyp = numpy.dot(Xt, y-p) # Calculate the first derivative.
103 XtWX = numpy.dot(numpy.dot(Xt, W), X) # Calculate the second derivative.
104 #u, s, vt = singular_value_decomposition(XtWX)
107 delta = numpy.linalg.solve(XtWX, Xtyp)
108 if numpy.fabs(stepsize-1.0) > 0.001:
109 delta = delta * stepsize
110 beta = beta + delta # Update beta.
112 raise RuntimeError("Didn't converge.")
114 lr = LogisticRegression()
115 lr.beta = map(float, beta) # Convert back to regular array.
118 def calculate(lr, x):
119 """calculate(lr, x) -> list of probabilities
121 Calculate the probability for each class. lr is a
122 LogisticRegression object. x is the observed data. Returns a
123 list of the probability that it fits each class.
126 # Insert a constant term for x.
127 x = numpy.asarray([1.0] + x)
128 # Calculate the probability. p = e^(beta X) / (1+e^(beta X))
129 ebetaX = numpy.exp(numpy.dot(lr.beta, x))
130 p = ebetaX / (1+ebetaX)
134 """classify(lr, x) -> 1 or 0
136 Classify an observation into a class.
139 probs = calculate(lr, x)
140 if probs[0] > probs[1]: