binaries/src/disembl/biopython-1.50/Bio/LogisticRegression.py

   1 #!/usr/bin/env python
   2
   3 """
   4 This module provides code for doing logistic regressions.
   5
   6
   7 Classes:
   8 LogisticRegression    Holds information for a LogisticRegression classifier.
   9
  10
  11 Functions:
  12 train        Train a new classifier.
  13 calculate    Calculate the probabilities of each class, given an observation.
  14 classify     Classify an observation into a class.
  15 """
  16
  17 #TODO - Remove this work around once we drop python 2.3 support
  18 try:
  19     set = set
  20 except NameError:
  21     from sets import Set as set
  22
  23 #from numpy import *
  24 #from numpy.linalg import *
  25 import numpy
  26 import numpy.linalg
  27
  28 class LogisticRegression:
  29     """Holds information necessary to do logistic regression
  30     classification.
  31
  32     Members:
  33     beta    List of the weights for each dimension.
  34
  35     """
  36     def __init__(self):
  37         """LogisticRegression()"""
  38         self.beta = []
  39
  40 def train(xs, ys, update_fn=None, typecode=None):
  41     """train(xs, ys[, update_fn]) -> LogisticRegression
  42
  43     Train a logistic regression classifier on a training set.  xs is a
  44     list of observations and ys is a list of the class assignments,
  45     which should be 0 or 1.  xs and ys should contain the same number
  46     of elements.  update_fn is an optional callback function that
  47     takes as parameters that iteration number and log likelihood.
  48
  49     """
  50     if len(xs) != len(ys):
  51         raise ValueError("xs and ys should be the same length.")
  52     classes = set(ys)
  53     if classes != set([0, 1]):
  54         raise ValueError("Classes should be 0's and 1's")
  55     if typecode is None:
  56         typecode = 'd'
  57
  58     # Dimensionality of the data is the dimensionality of the
  59     # observations plus a constant dimension.
  60     N, ndims = len(xs), len(xs[0]) + 1
  61     if N==0 or ndims==1:
  62         raise ValueError("No observations or observation of 0 dimension.")
  63
  64     # Make an X array, with a constant first dimension.
  65     X = numpy.ones((N, ndims), typecode)
  66     X[:, 1:] = xs
  67     Xt = numpy.transpose(X)
  68     y = numpy.asarray(ys, typecode)
  69
  70     # Initialize the beta parameter to 0.
  71     beta = numpy.zeros(ndims, typecode)
  72
  73     MAX_ITERATIONS = 500
  74     CONVERGE_THRESHOLD = 0.01
  75     stepsize = 1.0
  76     # Now iterate using Newton-Raphson until the log-likelihoods
  77     # converge.
  78     iter = 0
  79     old_beta = old_llik = None
  80     while iter < MAX_ITERATIONS:
  81         # Calculate the probabilities.  p = e^(beta X) / (1+e^(beta X))
  82         ebetaX = numpy.exp(numpy.dot(beta, Xt))
  83         p = ebetaX / (1+ebetaX)
  84
  85         # Find the log likelihood score and see if I've converged.
  86         logp = y*numpy.log(p) + (1-y)*numpy.log(1-p)
  87         llik = sum(logp)
  88         if update_fn is not None:
  89             update_fn(iter, llik)
  90         # Check to see if the likelihood decreased.  If it did, then
  91         # restore the old beta parameters and half the step size.
  92         if llik < old_llik:
  93             stepsize = stepsize / 2.0
  94             beta = old_beta
  95         # If I've converged, then stop.
  96         if old_llik is not None and numpy.fabs(llik-old_llik) <= CONVERGE_THRESHOLD:
  97             break
  98         old_llik, old_beta = llik, beta
  99         iter += 1
 100
 101         W = numpy.identity(N) * p
 102         Xtyp = numpy.dot(Xt, y-p)         # Calculate the first derivative.
 103         XtWX = numpy.dot(numpy.dot(Xt, W), X)   # Calculate the second derivative.
 104         #u, s, vt = singular_value_decomposition(XtWX)
 105         #print "U", u
 106         #print "S", s
 107         delta = numpy.linalg.solve(XtWX, Xtyp)
 108         if numpy.fabs(stepsize-1.0) > 0.001:
 109             delta = delta * stepsize
 110         beta = beta + delta                 # Update beta.
 111     else:
 112         raise RuntimeError("Didn't converge.")
 113
 114     lr = LogisticRegression()
 115     lr.beta = map(float, beta)   # Convert back to regular array.
 116     return lr
 117
 118 def calculate(lr, x):
 119     """calculate(lr, x) -> list of probabilities
 120
 121     Calculate the probability for each class.  lr is a
 122     LogisticRegression object.  x is the observed data.  Returns a
 123     list of the probability that it fits each class.
 124
 125     """
 126     # Insert a constant term for x.
 127     x = numpy.asarray([1.0] + x)
 128     # Calculate the probability.  p = e^(beta X) / (1+e^(beta X))
 129     ebetaX = numpy.exp(numpy.dot(lr.beta, x))
 130     p = ebetaX / (1+ebetaX)
 131     return [1-p, p]
 132
 133 def classify(lr, x):
 134     """classify(lr, x) -> 1 or 0
 135
 136     Classify an observation into a class.
 137
 138     """
 139     probs = calculate(lr, x)
 140     if probs[0] > probs[1]:
 141         return 0
 142     return 1