forester/archive/RIO/others/hmmer/src/histogram.c

   1 /************************************************************
   2  * HMMER - Biological sequence analysis with profile HMMs
   3  * Copyright (C) 1992-1999 Washington University School of Medicine
   4  * All Rights Reserved
   5  *
   6  *     This source code is distributed under the terms of the
   7  *     GNU General Public License. See the files COPYING and LICENSE
   8  *     for details.
   9  ************************************************************/
  10
  11 /* histogram.c
  12  * SRE, Sat Jan 20 16:16:17 1996
  13  *
  14  * Accumulation, printing, and fitting of score histograms
  15  * from database searches.
  16  *
  17  * RCS $Id: histogram.c,v 1.1.1.1 2005/03/22 08:34:00 cmzmasek Exp $
  18  ************************************************************
  19  * Basic API:
  20  *
  21  * struct histogram_s *h;
  22  *
  23  * h = AllocHistogram(min_hint, max_hint, lumpsize);
  24  *
  25  * while (getting scores x) AddToHistogram(h, x);
  26  *
  27  * ExtremeValueFitHistogram(h, high_hint);
  28  * PrintASCIIHistogram(fp, h);
  29  * FreeHistogram(h);
  30  */
  31
  32 #include <stdio.h>
  33 #include <string.h>
  34 #include <limits.h>
  35 #include <float.h>
  36 #include <math.h>
  37
  38 #include "squid.h"
  39 #include "config.h"
  40 #include "structs.h"
  41 #include "funcs.h"
  42
  43 #ifdef MEMDEBUG
  44 #include "dbmalloc.h"
  45 #endif
  46
  47 /* Function: AllocHistogram()
  48  *
  49  * Purpose:  Allocate and return a histogram structure.
  50  *           min and max are your best guess. They need
  51  *           not be absolutely correct; the histogram
  52  *           will expand dynamically to accomodate scores
  53  *           that exceed these suggested bounds. The amount
  54  *           that the histogram grows by is set by "lumpsize".
  55  *
  56  * Args:     min:      minimum score (integer)
  57  *           max:      maximum score (integer)
  58  *           lumpsize: when reallocating histogram, pad the reallocation
  59  *                     by this much (saves excessive reallocation)
  60  */
  61 struct histogram_s *
  62 AllocHistogram(int min, int max, int lumpsize)
  63 {
  64   struct histogram_s *h;
  65   int            newsize;
  66   int            i;
  67
  68   newsize = max - min + 1;
  69
  70   h = (struct histogram_s *) MallocOrDie(sizeof(struct histogram_s));
  71   h->min       = min;
  72   h->max       = max;
  73   h->total     = 0;
  74   h->lowscore  = INT_MAX;
  75   h->highscore = INT_MIN;
  76   h->lumpsize  = lumpsize;
  77   h->histogram = (int *) MallocOrDie (sizeof(int) * newsize);
  78   for (i = 0; i < newsize; i++) h->histogram[i] = 0;
  79
  80   h->expect    = NULL;
  81   h->fit_type  = HISTFIT_NONE;
  82
  83   return h;
  84 }
  85
  86
  87 /* Function: FreeHistogram()
  88  *
  89  * Purpose:  free a histogram structure.
  90  */
  91 void
  92 FreeHistogram(struct histogram_s *h)
  93 {
  94   free(h->histogram);
  95   if (h->expect != NULL) free(h->expect);
  96   free(h);
  97 }
  98
  99 /* Function: UnfitHistogram()
 100  *
 101  * Purpose:  Free only the theoretical fit part of a histogram.
 102  */
 103 void
 104 UnfitHistogram(struct histogram_s *h)
 105 {
 106   if (h->expect != NULL) free(h->expect);
 107   h->expect   = NULL;
 108   h->fit_type = HISTFIT_NONE;
 109 }
 110
 111
 112 /* Function: AddToHistogram()
 113  *
 114  * Purpose:  Bump the appropriate counter in a histogram
 115  *           structure, given a score. The score is
 116  *           rounded off from float precision to the
 117  *           next lower integer.
 118  */
 119 void
 120 AddToHistogram(struct histogram_s *h, float sc)
 121 {
 122   int score;
 123   int moveby;
 124   int prevsize;
 125   int newsize;
 126   int i;
 127
 128   /* Adding to a histogram conflicts with existing fit:
 129    * prohibit this.
 130    */
 131   if (h->fit_type != HISTFIT_NONE)
 132     Die("AddToHistogram(): Can't add to a fitted histogram\n");
 133
 134
 135   /* histogram bins are defined as:  score >= bin value, < bin+1
 136    * -1.9 -> -2    -0.4 -> -1    1.9 -> 1
 137    * -2.1 -> -3     0.4 -> 0     2.1 -> 2
 138    */
 139   score = (int) floor(sc);
 140
 141   /* Check to see if we must reallocate the histogram.
 142    */
 143   if (score < h->min)
 144     {
 145       prevsize = h->max - h->min + 1;
 146       moveby   = (h->min - score) + h->lumpsize;
 147       newsize  = prevsize + moveby;
 148       h->min  -= moveby;
 149
 150       h->histogram = (int *) ReallocOrDie(h->histogram, sizeof(int) * newsize);
 151       memmove(h->histogram+moveby, h->histogram, sizeof(int) * prevsize);
 152       for (i = 0; i < moveby; i++)
 153         h->histogram[i] = 0;
 154     }
 155   else if (score > h->max)
 156     {
 157       prevsize = h->max - h->min + 1;
 158       h->max   = h->lumpsize + score;
 159       newsize  = h->max - h->min + 1;
 160
 161       h->histogram = (int *) ReallocOrDie(h->histogram, sizeof(int) * newsize);
 162       for (i = prevsize; i < newsize; i++)
 163         h->histogram[i] = 0;
 164     }
 165
 166   /* Bump the correct bin.
 167    * The bin number is score - h->min
 168    */
 169   h->histogram[score - h->min]++;
 170   h->total++;
 171   if (score < h->lowscore) h->lowscore   = score;
 172   if (score > h->highscore) h->highscore = score;
 173
 174   SQD_DPRINTF3(("AddToHistogram(): added %.1f; rounded to %d; in bin %d (%d-%d)\n",
 175           sc, score, score-h->min, h->min, h->max));
 176   return;
 177 }
 178
 179
 180
 181 /* Function: PrintASCIIHistogram()
 182  *
 183  * Purpose:  Print a "prettified" histogram to a file pointer.
 184  *           Deliberately a look-and-feel clone of Bill Pearson's
 185  *           excellent FASTA output.
 186  *
 187  * Args:     fp     - open file to print to (stdout works)
 188  *           h      - histogram to print
 189  */
 190 void
 191 PrintASCIIHistogram(FILE *fp, struct histogram_s *h)
 192 {
 193   int units;
 194   int maxbar;
 195   int num;
 196   int i, idx;
 197   char buffer[81];              /* output line buffer */
 198   int  pos;                     /* position in output line buffer */
 199   int  lowbound, lowcount;      /* cutoffs on the low side  */
 200   int  highbound, highcount;    /* cutoffs on the high side */
 201   int  emptybins = 3;
 202
 203   /* Find out how we'll scale the histogram.
 204    * We have 59 characters to play with on a
 205    * standard 80-column terminal display:
 206    * leading "%5d %6d %6d|" occupies 20 chars.
 207    * Save the peak position, we'll use it later.
 208    */
 209   maxbar = 0;
 210   for (i = h->lowscore - h->min; i <= h->highscore - h->min; i++)
 211     if (h->histogram[i] > maxbar)
 212       {
 213         maxbar   = h->histogram[i];     /* max height    */
 214         lowbound = i + h->min;          /* peak position */
 215       }
 216
 217   /* Truncate histogram display on both sides, ad hoc fashion.
 218    * Start from the peak; then move out until we see <emptybins> empty bins,
 219    * and stop.
 220    */
 221   highbound = lowbound;         /* start at peak position */
 222   for (num = 0; lowbound > h->lowscore; lowbound--)
 223     {
 224       i = lowbound - h->min;
 225       if (h->histogram[i] > 0) { num = 0;               continue; } /* reset */
 226       if (++num == emptybins)  { lowbound += emptybins; break;    } /* stop  */
 227     }
 228   for (num = 0; highbound < h->highscore; highbound++)
 229     {
 230       i = highbound - h->min;
 231       if (h->histogram[i] > 0) { num = 0;                continue; } /* reset */
 232       if (++num == emptybins)  { highbound -= emptybins; break;    } /* stop  */
 233     }
 234                                 /* collect counts outside of bounds */
 235   for (lowcount = 0, i = h->lowscore - h->min; i <= lowbound - h->min; i++)
 236     lowcount += h->histogram[i];
 237   for (highcount = 0, i = h->highscore - h->min; i >= highbound - h->min; i--)
 238     highcount += h->histogram[i];
 239
 240                                 /* maxbar might need raised now; then set our units  */
 241   if (lowcount  > maxbar) maxbar = lowcount;
 242   if (highcount > maxbar) maxbar = highcount;
 243   units = ((maxbar-1)/ 59) + 1;
 244
 245
 246   /* Print the histogram
 247    */
 248   fprintf(fp, "%5s %6s %6s  (one = represents %d sequences)\n",
 249           "score", "obs", "exp", units);
 250   fprintf(fp, "%5s %6s %6s\n", "-----", "---", "---");
 251   buffer[80] = '\0';
 252   buffer[79] = '\n';
 253   for (i = h->lowscore; i <= h->highscore; i++)
 254     {
 255       memset(buffer, ' ', 79 * sizeof(char));
 256       idx = i - h->min;
 257
 258       /* Deal with special cases at edges
 259        */
 260       if      (i < lowbound)  continue;
 261       else if (i > highbound) continue;
 262       else if (i == lowbound && i != h->lowscore)
 263         {
 264           sprintf(buffer, "<%4d %6d %6s|", i+1, lowcount, "-");
 265           if (lowcount > 0) {
 266             num = 1+(lowcount-1) / units;
 267             if (num > 60) Die("oops");
 268             for (pos = 20; num > 0; num--)  buffer[pos++] = '=';
 269           }
 270           fputs(buffer, fp);
 271           continue;
 272         }
 273       else if (i == highbound && i != h->highscore)
 274         {
 275           sprintf(buffer, ">%4d %6d %6s|", i, highcount, "-");
 276           if (highcount > 0) {
 277             num = 1+(highcount-1) / units;
 278             for (pos = 20; num > 0; num--)  buffer[pos++] = '=';
 279           }
 280           fputs(buffer, fp);
 281           continue;
 282         }
 283
 284       /* Deal with most cases
 285        */
 286       if (h->fit_type != HISTFIT_NONE)
 287         sprintf(buffer, "%5d %6d %6d|",
 288                 i, h->histogram[idx], (int) h->expect[idx]);
 289       else
 290         sprintf(buffer, "%5d %6d %6s|", i, h->histogram[idx], "-");
 291       buffer[20] = ' ';         /* sprintf writes a null char */
 292
 293       /* Mark the histogram bar for observed hits
 294        */
 295       if (h->histogram[idx] > 0) {
 296         num = 1 + (h->histogram[idx]-1) / units;
 297         for (pos = 20; num > 0; num--)  buffer[pos++] = '=';
 298       }
 299
 300       /* Mark the theoretically expected value
 301        */
 302       if (h->fit_type != HISTFIT_NONE && (int) h->expect[idx] > 0)
 303         {
 304           pos = 20 + (int)(h->expect[idx]-1) / units;
 305           if (pos >= 78) pos = 78; /* be careful of buffer bounds */
 306           buffer[pos] = '*';
 307         }
 308
 309       /* Print the line
 310        */
 311       fputs(buffer, fp);
 312     }
 313
 314   /* Print details about the statistics
 315    */
 316   switch (h->fit_type) {
 317   case HISTFIT_NONE:
 318     fprintf(fp, "\n\n%% No statistical fit available\n");
 319     break;
 320
 321   case HISTFIT_EVD:
 322     fprintf(fp, "\n\n%% Statistical details of theoretical EVD fit:\n");
 323     fprintf(fp, "              mu = %10.4f\n", h->param[EVD_MU]);
 324     fprintf(fp, "          lambda = %10.4f\n", h->param[EVD_LAMBDA]);
 325     fprintf(fp, "chi-sq statistic = %10.4f\n", h->chisq);
 326     fprintf(fp, "  P(chi-square)  = %10.4g\n", h->chip);
 327     break;
 328
 329   case HISTFIT_GAUSSIAN:
 330     fprintf(fp, "\n\n%% Statistical details of theoretical Gaussian fit:\n");
 331     fprintf(fp, "            mean = %10.4f\n", h->param[GAUSS_MEAN]);
 332     fprintf(fp, "              sd = %10.4f\n", h->param[GAUSS_SD]);
 333     fprintf(fp, "chi-sq statistic = %10.4f\n", h->chisq);
 334     fprintf(fp, "  P(chi-square)  = %10.4g\n", h->chip);
 335     break;
 336   }
 337   return;
 338 }
 339
 340
 341
 342 /* Function: PrintXMGRHistogram()
 343  * Date:     SRE, Wed Nov 12 11:02:00 1997 [St. Louis]
 344  *
 345  * Purpose:  Print an XMGR data file that contains two data sets:
 346  *               - xy data for the observed histogram
 347  *               - xy data for the theoretical histogram
 348  */
 349 void
 350 PrintXMGRHistogram(FILE *fp, struct histogram_s *h)
 351 {
 352   int sc;                       /* integer score in histogram structure */
 353   double val;
 354
 355   /* First data set is the observed histogram
 356    */
 357   for (sc = h->lowscore; sc <= h->highscore; sc++)
 358     if (h->histogram[sc - h->min] > 0)
 359       fprintf(fp, "%-6d %f\n", sc,
 360               (float) h->histogram[sc - h->min]/ (float) h->total);
 361   fprintf(fp, "&\n");
 362
 363   /* Second data set is the theoretical histogram
 364    */
 365   if (h->fit_type != HISTFIT_NONE)
 366     {
 367         for (sc = h->lowscore; sc <= h->highscore; sc++)
 368           {
 369             val =
 370               (1. - ExtremeValueP((float)sc+1, h->param[EVD_MU], h->param[EVD_LAMBDA]))-
 371               (1. - ExtremeValueP((float)sc, h->param[EVD_MU], h->param[EVD_LAMBDA]));
 372             fprintf(fp, "%-6d %f\n", sc, val);
 373           }
 374         fprintf(fp, "&\n");
 375     }
 376 }
 377
 378 /* Function: PrintXMGRDistribution()
 379  * Date:     SRE, Wed Nov 12 11:02:09 1997 [St. Louis]
 380  *
 381  * Purpose:  Print an XMGR data file that contains two data sets:
 382  *               - xy data for the observed distribution P(S<x)
 383  *               - xy data for the theoretical distribution P(S<x)
 384  */
 385 void
 386 PrintXMGRDistribution(FILE *fp, struct histogram_s *h)
 387 {
 388   int sc;                       /* integer score in histogram structure */
 389   int cum;                      /* cumulative count */
 390   double val;
 391
 392   /* First data set is the observed distribution;
 393    * histogram bin x contains # of scores between x and x+1,
 394    * hence the sc+1 offset.
 395    */
 396   for (cum = 0, sc = h->lowscore; sc <= h->highscore; sc++)
 397     {
 398       cum += h->histogram[sc - h->min];
 399       fprintf(fp, "%-6d %f\n", sc + 1, (float) cum / (float) h->total);
 400     }
 401   fprintf(fp, "&\n");
 402
 403   /* Second data set is the theoretical histogram
 404    */
 405   if (h->fit_type != HISTFIT_NONE)
 406     {
 407       for (sc = h->lowscore; sc <= h->highscore; sc++)
 408         {
 409           val = (1. - ExtremeValueP((float) sc, h->param[EVD_MU],
 410                                     h->param[EVD_LAMBDA]));
 411           fprintf(fp, "%-6d %f\n", sc, val);
 412         }
 413       fprintf(fp, "&\n");
 414     }
 415 }
 416
 417 /* Function: PrintXMGRRegressionLine()
 418  * Date:     SRE, Wed Nov 12 11:02:19 1997 [St. Louis]
 419  *
 420  * Purpose:  Print an XMGR data file that contains two data sets:
 421  *               - xy data for log log transform of observed distribution P(S<x)
 422  *               - xy data for log log transform of theoretical distribution P(S<x)
 423  */
 424 void
 425 PrintXMGRRegressionLine(FILE *fp, struct histogram_s *h)
 426 {
 427   int sc;                       /* integer score in histogram structure */
 428   int cum;
 429   double val;                   /* log log transform */
 430
 431   /* First data set is the observed distribution;
 432    * histogram bin x contains # of scores between x and x+1,
 433    * hence the sc+1 offset.
 434    */
 435   for (cum = 0, sc = h->lowscore; sc <= h->highscore; sc++)
 436     {
 437       cum += h->histogram[sc - h->min];
 438       val = log (-1. * log((double) cum /  (double) h->total));
 439       if (cum < h->total)
 440         fprintf(fp, "%-6d %f\n", sc + 1, val);
 441     }
 442   fprintf(fp, "&\n");
 443
 444   /* Second data set is the theoretical histogram
 445    */
 446   if (h->fit_type != HISTFIT_NONE)
 447     {
 448       for (sc = h->lowscore; sc <= h->highscore; sc++)
 449         {
 450           val = log(-1. * log(1. - ExtremeValueP((float) sc, h->param[EVD_MU],
 451                                                        h->param[EVD_LAMBDA])));
 452           fprintf(fp, "%-6d %f\n", sc, val);
 453         }
 454       fprintf(fp, "&\n");
 455     }
 456 }
 457
 458 /* Function: EVDBasicFit()
 459  * Date:     SRE, Wed Nov 12 11:02:27 1997 [St. Louis]
 460  *
 461  * Purpose:  Fit a score histogram to the extreme value
 462  *           distribution. Set the parameters lambda
 463  *           and mu in the histogram structure. Fill in the
 464  *           expected values in the histogram. Calculate
 465  *           a chi-square test as a measure of goodness of fit.
 466  *
 467  *           This is the basic version of ExtremeValueFitHistogram(),
 468  *           in a nonrobust form: simple linear regression with no
 469  *           outlier pruning.
 470  *
 471  * Methods:  Uses a linear regression fitting method [Collins88,Lawless82]
 472  *
 473  * Args:     h         - histogram to fit
 474  *
 475  * Return:   (void)
 476  */
 477 void
 478 EVDBasicFit(struct histogram_s *h)
 479 {
 480   float *d;            /* distribution P(S < x)          */
 481   float *x;            /* x-axis of P(S<x) for Linefit() */
 482   int    hsize;
 483   int    sum;
 484   int    sc, idx;               /* loop indices for score or score-h->min   */
 485   float  slope, intercept;      /* m,b fit from Linefit()                   */
 486   float  corr;                  /* correlation coeff of line fit, not used  */
 487   float  lambda, mu;            /* slope, intercept converted to EVD params */
 488
 489   /* Allocations for x, y axes
 490    * distribution d runs from min..max with indices 0..max-min
 491    *     i.e. score - min = index into d, x, histogram, and expect
 492    */
 493   hsize = h->highscore - h->lowscore + 1;
 494   d         = (float *) MallocOrDie(sizeof(float) * hsize);
 495   x         = (float *) MallocOrDie(sizeof(float) * hsize);
 496   for (idx = 0; idx < hsize; idx++)
 497     d[idx] = x[idx] = 0.;
 498
 499   /* Calculate P(S < x) distribution from histogram.
 500    * note off-by-one of sc, because histogram bin contains scores between
 501    * x and x+1.
 502    */
 503   sum = 0;
 504   for (sc = h->lowscore; sc <= h->highscore; sc++)
 505     {
 506       sum += h->histogram[sc - h->min];
 507       d[sc - h->lowscore] = (float) sum / (float) h->total;
 508       x[sc - h->lowscore] = (float) (sc + 1);
 509     }
 510
 511   /* Do a linear regression fit to the log[-log(P(S<x))] "line".
 512    * we have log[-log(1-P(S>x))]  = -lambda * x + lambda * mu
 513    * so lambda = -m  and mu = b/lambda
 514    */
 515                                 /* convert y axis to log[-log(P(S<x))]  */
 516   for (sc = h->lowscore; sc < h->highscore; sc++)
 517     d[sc - h->lowscore] = log(-1. * log(d[sc - h->lowscore]));
 518
 519                                 /* do the linear regression */
 520   Linefit(x, d, hsize-1, &intercept, &slope, &corr);
 521                                 /* calc mu, lambda */
 522   lambda = -1. * slope;
 523   mu     = intercept / lambda;
 524
 525   /* Set the EVD parameters in the histogram;
 526    * pass 2 for additional lost degrees of freedom because we fit mu, lambda.
 527    */
 528   ExtremeValueSetHistogram(h, mu, lambda, h->lowscore, h->highscore, 2);
 529
 530   free(x);
 531   free(d);
 532   return;
 533 }
 534
 535
 536 /* Function: ExtremeValueFitHistogram()
 537  * Date:     SRE, Sat Nov 15 17:16:15 1997 [St. Louis]
 538  *
 539  * Purpose:  Fit a score histogram to the extreme value
 540  *           distribution. Set the parameters lambda
 541  *           and mu in the histogram structure. Calculate
 542  *           a chi-square test as a measure of goodness of fit.
 543  *
 544  * Methods:  Uses a maximum likelihood method [Lawless82].
 545  *           Lower outliers are removed by censoring the data below the peak.
 546  *           Upper outliers are removed iteratively using method
 547  *           described by [Mott92].
 548  *
 549  * Args:     h         - histogram to fit
 550  *           censor    - TRUE to censor data left of the peak
 551  *           high_hint - score cutoff; above this are `real' hits that aren't fit
 552  *
 553  * Return:   1 if fit is judged to be valid.
 554  *           else 0 if fit is invalid (too few seqs.)
 555  */
 556 int
 557 ExtremeValueFitHistogram(struct histogram_s *h, int censor, float high_hint)
 558 {
 559   float *x;                     /* array of EVD samples to fit */
 560   int   *y;                     /* histogram counts            */
 561   int    n;                     /* number of observed samples  */
 562   int    z;                     /* number of censored samples  */
 563   int    hsize;                 /* size of histogram           */
 564   float  lambda, mu;            /* new estimates of lambda, mu */
 565   int    sc;                    /* loop index for score        */
 566   int    lowbound;              /* lower bound of fitted region*/
 567   int    highbound;             /* upper bound of fitted region*/
 568   int    new_highbound;
 569   int    iteration;
 570
 571   /* Determine lower bound on fitted region;
 572    * if we're censoring the data, choose the peak of the histogram.
 573    * if we're not, then we take the whole histogram.
 574    */
 575   lowbound = h->lowscore;
 576   if (censor)
 577     {
 578       int max = -1;
 579       for (sc = h->lowscore; sc <= h->highscore; sc++)
 580         if (h->histogram[sc - h->min] > max)
 581           {
 582             max      = h->histogram[sc - h->min];
 583             lowbound = sc;
 584           }
 585     }
 586
 587   /* Determine initial upper bound on fitted region.
 588    */
 589   highbound = MIN(high_hint, h->highscore);
 590
 591   /* Now, iteratively converge on our lambda, mu:
 592    */
 593   for (iteration = 0; iteration < 100; iteration++)
 594     {
 595       /* Construct x, y vectors.
 596        */
 597       x = NULL;
 598       y = NULL;
 599       hsize = highbound - lowbound + 1;
 600       if (hsize < 5) goto FITFAILED; /* require at least 5 bins or we don't fit */
 601
 602       x = MallocOrDie(sizeof(float) * hsize);
 603       y = MallocOrDie(sizeof(int)   * hsize);
 604       n = 0;
 605       for (sc = lowbound; sc <= highbound; sc++)
 606         {
 607           x[sc-lowbound] = (float) sc + 0.5; /* crude, but tests OK */
 608           y[sc-lowbound] = h->histogram[sc - h->min];
 609           n             += h->histogram[sc - h->min];
 610         }
 611
 612       if (n < 100) goto FITFAILED;  /* require fitting to at least 100 points */
 613
 614       /* If we're censoring, estimate z, the number of censored guys
 615        * left of the bound. Our initial estimate is crudely that we're
 616        * missing e^-1 of the total distribution (which would be exact
 617        * if we censored exactly at mu; but we censored at the observed peak).
 618        * Subsequent estimates are more exact based on our current estimate of mu.
 619        */
 620       if (censor)
 621         {
 622           if (iteration == 0)
 623             z = MIN(h->total-n, (int) (0.58198 * (float) n));
 624           else
 625             {
 626               double psx;
 627               psx = EVDDistribution((float) lowbound, mu, lambda);
 628               z = MIN(h->total-n, (int) ((double) n * psx / (1. - psx)));
 629             }
 630         }
 631
 632       /* Do an ML fit
 633        */
 634       if (censor) {
 635         if (! EVDCensoredFit(x, y, hsize, z, (float) lowbound, &mu, &lambda))
 636           goto FITFAILED;
 637       } else
 638         if (! EVDMaxLikelyFit(x, y, hsize, &mu, &lambda))
 639           goto FITFAILED;
 640
 641       /* Find the Eval = 1 point as a new highbound;
 642        * the total number of samples estimated to "belong" to the EVD is n+z
 643        */
 644       new_highbound = (int)
 645         (mu - (log (-1. * log((double) (n+z-1) / (double)(n+z))) / lambda));
 646
 647       free(x);
 648       free(y);
 649       if (new_highbound >= highbound) break;
 650       highbound = new_highbound;
 651     }
 652
 653   /* Set the histogram parameters;
 654    * - we fit from lowbound to highbound; thus we lose 2 degrees of freedom
 655    *   for fitting mu, lambda, but we get 1 back because we're unnormalized
 656    *   in this interval, hence we pass 2-1 = 1 as ndegrees.
 657    */
 658   ExtremeValueSetHistogram(h, mu, lambda, lowbound, highbound, 1);
 659   return 1;
 660
 661 FITFAILED:
 662   UnfitHistogram(h);
 663   if (x != NULL) free(x);
 664   if (y != NULL) free(y);
 665   return 0;
 666 }
 667
 668
 669 /* Function: ExtremeValueSetHistogram()
 670  *
 671  * Purpose:  Instead of fitting the histogram to an EVD,
 672  *           simply set the EVD parameters from an external source.
 673  *
 674  * Args:     h        - the histogram to set
 675  *           mu       - mu location parameter
 676  *           lambda   - lambda scale parameter
 677  *           lowbound - low bound of the histogram that was fit
 678  *           highbound- high bound of histogram that was fit
 679  *           ndegrees - extra degrees of freedom to subtract in X^2 test:
 680  *                        typically 0 if mu, lambda are parametric,
 681  *                        else 2 if mu, lambda are estimated from data
 682  */
 683 void
 684 ExtremeValueSetHistogram(struct histogram_s *h, float mu, float lambda,
 685                          float lowbound, float highbound, int ndegrees)
 686 {
 687   int   sc;
 688   int   hsize, idx;
 689   int   nbins;
 690   float delta;
 691
 692   UnfitHistogram(h);
 693   h->fit_type          = HISTFIT_EVD;
 694   h->param[EVD_LAMBDA] = lambda;
 695   h->param[EVD_MU]     = mu;
 696
 697   hsize     = h->max - h->min + 1;
 698   h->expect = (float *) MallocOrDie(sizeof(float) * hsize);
 699   for (idx = 0; idx < hsize; idx++)
 700     h->expect[idx] = 0.;
 701
 702   /* Calculate the expected values for the histogram.
 703    */
 704   for (sc = h->min; sc <= h->max; sc++)
 705     h->expect[sc - h->min] =
 706       ExtremeValueE((float)(sc), h->param[EVD_MU], h->param[EVD_LAMBDA],
 707                     h->total) -
 708       ExtremeValueE((float)(sc+1), h->param[EVD_MU], h->param[EVD_LAMBDA],
 709                     h->total);
 710
 711   /* Calculate the goodness-of-fit (within whole region)
 712    */
 713   h->chisq = 0.;
 714   nbins    = 0;
 715   for (sc = lowbound; sc <= highbound; sc++)
 716     if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5)
 717       {
 718         delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min];
 719         h->chisq += delta * delta / h->expect[sc-h->min];
 720         nbins++;
 721       }
 722
 723   /* Since we fit the whole histogram, there is at least
 724    * one constraint on chi-square: the normalization to h->total.
 725    */
 726   if (nbins > 1 + ndegrees)
 727     h->chip = (float) IncompleteGamma((double)(nbins-1-ndegrees)/2.,
 728                                       (double) h->chisq/2.);
 729   else
 730     h->chip = 0.;
 731 }
 732
 733
 734
 735 /* Function: GaussianFitHistogram()
 736  *
 737  * Purpose:  Fit a score histogram to a Gaussian distribution.
 738  *           Set the parameters mean and sd in the histogram
 739  *           structure, as well as a chi-squared test for
 740  *           goodness of fit.
 741  *
 742  * Args:     h         - histogram to fit
 743  *           high_hint - score cutoff; above this are `real' hits that aren't fit
 744  *
 745  * Return:   1 if fit is judged to be valid.
 746  *           else 0 if fit is invalid (too few seqs.)
 747  */
 748 int
 749 GaussianFitHistogram(struct histogram_s *h, float high_hint)
 750 {
 751   float sum;
 752   float sqsum;
 753   float delta;
 754   int   sc;
 755   int   nbins;
 756   int   hsize, idx;
 757
 758   /* Clear any previous fitting from the histogram.
 759    */
 760   UnfitHistogram(h);
 761
 762   /* Determine if we have enough hits to fit the histogram;
 763    * arbitrarily require 1000.
 764    */
 765   if (h->total < 1000) { h->fit_type = HISTFIT_NONE; return 0; }
 766
 767   /* Simplest algorithm for mean and sd;
 768    * no outlier detection yet (not even using high_hint)
 769    *
 770    * Magic 0.5 correction is because our histogram is for
 771    * scores between x and x+1; we estimate the expectation
 772    * (roughly) as x + 0.5.
 773    */
 774   sum = sqsum = 0.;
 775   for (sc = h->lowscore; sc <= h->highscore; sc++)
 776     {
 777       delta  = (float) sc + 0.5;
 778       sum   += (float) h->histogram[sc-h->min] * delta;
 779       sqsum += (float) h->histogram[sc-h->min] * delta * delta;
 780     }
 781   h->fit_type          = HISTFIT_GAUSSIAN;
 782   h->param[GAUSS_MEAN] = sum / (float) h->total;
 783   h->param[GAUSS_SD]   = sqrt((sqsum - (sum*sum/(float)h->total)) /
 784                               (float)(h->total-1));
 785
 786   /* Calculate the expected values for the histogram.
 787    * Note that the magic 0.5 correction appears again.
 788    * Calculating difference between distribution functions for Gaussian
 789    * would be correct but hard.
 790    */
 791   hsize     = h->max - h->min + 1;
 792   h->expect = (float *) MallocOrDie(sizeof(float) * hsize);
 793   for (idx = 0; idx < hsize; idx++)
 794     h->expect[idx] = 0.;
 795
 796   for (sc = h->min; sc <= h->max; sc++)
 797     {
 798       delta = (float) sc + 0.5 - h->param[GAUSS_MEAN];
 799       h->expect[sc - h->min] =
 800         (float) h->total * ((1. / (h->param[GAUSS_SD] * sqrt(2.*3.14159))) *
 801         (exp(-1.* delta*delta / (2. * h->param[GAUSS_SD] * h->param[GAUSS_SD]))));
 802     }
 803
 804   /* Calculate the goodness-of-fit (within region that was fitted)
 805    */
 806   h->chisq = 0.;
 807   nbins    = 0;
 808   for (sc = h->lowscore; sc <= h->highscore; sc++)
 809     if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5)
 810       {
 811         delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min];
 812         h->chisq += delta * delta / h->expect[sc-h->min];
 813         nbins++;
 814       }
 815         /* -1 d.f. for normalization; -2 d.f. for two free parameters */
 816   if (nbins > 3)
 817     h->chip = (float) IncompleteGamma((double)(nbins-3)/2.,
 818                                       (double) h->chisq/2.);
 819   else
 820     h->chip = 0.;
 821
 822   return 1;
 823 }
 824
 825
 826 /* Function: GaussianSetHistogram()
 827  *
 828  * Purpose:  Instead of fitting the histogram to a Gaussian,
 829  *           simply set the Gaussian parameters from an external source.
 830  */
 831 void
 832 GaussianSetHistogram(struct histogram_s *h, float mean, float sd)
 833 {
 834   int   sc;
 835   int   hsize, idx;
 836   int   nbins;
 837   float delta;
 838
 839   UnfitHistogram(h);
 840   h->fit_type          = HISTFIT_GAUSSIAN;
 841   h->param[GAUSS_MEAN] = mean;
 842   h->param[GAUSS_SD]   = sd;
 843
 844   /* Calculate the expected values for the histogram.
 845    */
 846   hsize     = h->max - h->min + 1;
 847   h->expect = (float *) MallocOrDie(sizeof(float) * hsize);
 848   for (idx = 0; idx < hsize; idx++)
 849     h->expect[idx] = 0.;
 850
 851   /* Note: ideally we'd use the Gaussian distribution function
 852    * to find the histogram occupancy in the window sc..sc+1.
 853    * However, the distribution function is hard to calculate.
 854    * Instead, estimate the histogram by taking the density at sc+0.5.
 855    */
 856   for (sc = h->min; sc <= h->max; sc++)
 857     {
 858       delta = ((float)sc + 0.5) - h->param[GAUSS_MEAN];
 859       h->expect[sc - h->min] =
 860         (float) h->total * ((1. / (h->param[GAUSS_SD] * sqrt(2.*3.14159))) *
 861             (exp(-1.*delta*delta / (2. * h->param[GAUSS_SD] * h->param[GAUSS_SD]))));
 862     }
 863
 864   /* Calculate the goodness-of-fit (within whole region)
 865    */
 866   h->chisq = 0.;
 867   nbins    = 0;
 868   for (sc = h->lowscore; sc <= h->highscore; sc++)
 869     if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5)
 870       {
 871         delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min];
 872         h->chisq += delta * delta / h->expect[sc-h->min];
 873         nbins++;
 874       }
 875         /* -1 d.f. for normalization */
 876   if (nbins > 1)
 877     h->chip = (float) IncompleteGamma((double)(nbins-1)/2.,
 878                                       (double) h->chisq/2.);
 879   else
 880     h->chip = 0.;
 881 }
 882
 883
 884
 885 /* Function: EVDDensity()
 886  * Date:     SRE, Sat Nov 15 19:37:52 1997 [St. Louis]
 887  *
 888  * Purpose:  Return the extreme value density P(S=x) at
 889  *           a given point x, for an EVD controlled by
 890  *           parameters mu and lambda.
 891  */
 892 double
 893 EVDDensity(float x, float mu, float lambda)
 894 {
 895   return (lambda * exp(-1. * lambda * (x - mu)
 896                        - exp(-1. * lambda * (x - mu))));
 897 }
 898
 899 /* Function: EVDDistribution()
 900  * Date:     SRE, Tue Nov 18 08:02:22 1997 [St. Louis]
 901  *
 902  * Purpose:  Returns the extreme value distribution P(S < x)
 903  *           evaluated at x, for an EVD controlled by parameters
 904  *           mu and lambda.
 905  */
 906 double
 907 EVDDistribution(float x, float mu, float lambda)
 908 {
 909   return (exp(-1. * exp(-1. * lambda * (x - mu))));
 910 }
 911
 912 /* Function: ExtremeValueP()
 913  *
 914  * Purpose:  Calculate P(S>x) according to an extreme
 915  *           value distribution, given x and the parameters
 916  *           of the distribution (characteristic
 917  *           value mu, decay constant lambda).
 918  *
 919  *           This function is exquisitely prone to
 920  *           floating point exceptions if it isn't coded
 921  *           carefully.
 922  *
 923  * Args:     x      = score
 924  *           mu     = characteristic value of extreme value distribution
 925  *           lambda = decay constant of extreme value distribution
 926  *
 927  * Return:   P(S>x)
 928  */
 929 double
 930 ExtremeValueP(float x, float mu, float lambda)
 931 {
 932   double y;
 933                         /* avoid exceptions near P=1.0 */
 934                         /* typical 32-bit sys: if () < -3.6, return 1.0 */
 935   if ((lambda * (x - mu)) <= -1. * log(-1. * log(DBL_EPSILON))) return 1.0;
 936                         /* avoid underflow fp exceptions near P=0.0*/
 937   if ((lambda * (x - mu)) >= 2.3 * (double) DBL_MAX_10_EXP)     return 0.0;
 938                         /* a roundoff issue arises; use 1 - e^-x --> x for small x */
 939   y = exp(-1. * lambda * (x - mu));
 940   if       (y < 1e-7) return y;
 941   else     return (1.0 - exp(-1. * y));
 942 }
 943
 944
 945 /* Function: ExtremeValueP2()
 946  *
 947  * Purpose:  Calculate P(S>x) in a database of size N,
 948  *           using P(S>x) for a single sequence, according
 949  *           to a Poisson distribution.
 950  *
 951  * Args:     x      = score
 952  *           mu     = characteristic value of extreme value distribution
 953  *           lambda = decay constant of extreme value distribution
 954  *           N      = number of trials (number of sequences)
 955  *
 956  * Return:   P(S>x) for database of size N
 957  */
 958 double
 959 ExtremeValueP2(float x, float mu, float lambda, int N)
 960 {
 961   double y;
 962   y = N * ExtremeValueP(x,mu,lambda);
 963   if (y < 1e-7) return y;
 964   else          return (1.0 - exp(-1. * y));
 965 }
 966
 967 /* Function: ExtremeValueE()
 968  *
 969  * Purpose:  Calculate E(S>x) in a database of size N,
 970  *           using P(S>x) for a single sequence: simply np.
 971  *
 972  * Args:     x      = score
 973  *           mu     = characteristic value of extreme value distribution
 974  *           lambda = decay constant of extreme value distribution
 975  *           N      = number of trials (number of sequences)
 976  *
 977  * Return:   E(S>x) for database of size N
 978  */
 979 double
 980 ExtremeValueE(float x, float mu, float lambda, int N)
 981 {
 982   return (double)N * ExtremeValueP(x,mu,lambda);
 983 }
 984
 985
 986 /* Function: EVDrandom()
 987  *
 988  * Purpose:  Randomly sample an x from an EVD.
 989  *           Trivially done by the transformation method, since
 990  *           the distribution is analytical:
 991  *              x = \mu - \frac{\log \left[ -\log P(S<x) \right]}{\lambda}
 992  *           where P(S<x) is sampled uniformly on 0 < P(S<x) < 1.
 993  */
 994 float
 995 EVDrandom(float mu, float lambda)
 996 {
 997   float p = 0.0;
 998
 999   /* Very unlikely, but possible,
1000    * that sre_random() would give us exactly 0 or 1
1001    */
1002   while (p == 0. || p == 1.) p = sre_random();
1003   return mu - log(-1. * log(p)) / lambda;
1004 }
1005
1006
1007 /* Function: Lawless416()
1008  * Date:     SRE, Thu Nov 13 11:48:50 1997 [St. Louis]
1009  *
1010  * Purpose:  Equation 4.1.6 from [Lawless82], pg. 143, and
1011  *           its first derivative with respect to lambda,
1012  *           for finding the ML fit to EVD lambda parameter.
1013  *           This equation gives a result of zero for the maximum
1014  *           likelihood lambda.
1015  *
1016  *           Can either deal with a histogram or an array.
1017  *
1018  *           Warning: beware overflow/underflow issues! not bulletproof.
1019  *
1020  * Args:     x      - array of sample values (or x-axis of a histogram)
1021  *           y      - NULL (or y-axis of a histogram)
1022  *           n      - number of samples (or number of histogram bins)
1023  *           lambda - a lambda to test
1024  *           ret_f  - RETURN: 4.1.6 evaluated at lambda
1025  *           ret_df - RETURN: first derivative of 4.1.6 evaluated at lambda
1026  *
1027  * Return:   (void)
1028  */
1029 void
1030 Lawless416(float *x, int *y, int n, float lambda, float *ret_f, float *ret_df)
1031 {
1032
1033   double esum;                  /* \sum e^(-lambda xi)      */
1034   double xesum;                 /* \sum xi e^(-lambda xi)   */
1035   double xxesum;                /* \sum xi^2 e^(-lambda xi) */
1036   double xsum;                  /* \sum xi                  */
1037   double mult;                  /* histogram count multiplier */
1038   double total;                 /* total samples            */
1039   int i;
1040
1041
1042   esum = xesum = xsum  = xxesum = total = 0.;
1043   for (i = 0; i < n; i++)
1044     {
1045       mult = (y == NULL) ? 1. : (double) y[i];
1046       xsum   += mult * x[i];
1047       xesum  += mult * x[i] * exp(-1. * lambda * x[i]);
1048       xxesum += mult * x[i] * x[i] * exp(-1. * lambda * x[i]);
1049       esum   += mult * exp(-1. * lambda * x[i]);
1050       total  += mult;
1051     }
1052   *ret_f  = 1./lambda - xsum / total + xesum / esum;
1053   *ret_df = ((xesum / esum) * (xesum / esum))
1054     - (xxesum / esum)
1055     - (1. / (lambda * lambda));
1056
1057   return;
1058 }
1059
1060
1061 /* Function: Lawless422()
1062  * Date:     SRE, Mon Nov 17 09:42:48 1997 [St. Louis]
1063  *
1064  * Purpose:  Equation 4.2.2 from [Lawless82], pg. 169, and
1065  *           its first derivative with respect to lambda,
1066  *           for finding the ML fit to EVD lambda parameter
1067  *           for Type I censored data.
1068  *           This equation gives a result of zero for the maximum
1069  *           likelihood lambda.
1070  *
1071  *           Can either deal with a histogram or an array.
1072  *
1073  *           Warning: beware overflow/underflow issues! not bulletproof.
1074  *
1075  * Args:     x      - array of sample values (or x-axis of a histogram)
1076  *           y      - NULL (or y-axis of a histogram)
1077  *           n      - number of observed samples (or number of histogram bins)
1078  *           z      - number of censored samples
1079  *           c      - censoring value; all observed x_i >= c
1080  *           lambda - a lambda to test
1081  *           ret_f  - RETURN: 4.2.2 evaluated at lambda
1082  *           ret_df - RETURN: first derivative of 4.2.2 evaluated at lambda
1083  *
1084  * Return:   (void)
1085  */
1086 void
1087 Lawless422(float *x, int *y, int n, int z, float c,
1088            float lambda, float *ret_f, float *ret_df)
1089 {
1090   double esum;                  /* \sum e^(-lambda xi)      + z term    */
1091   double xesum;                 /* \sum xi e^(-lambda xi)   + z term    */
1092   double xxesum;                /* \sum xi^2 e^(-lambda xi) + z term    */
1093   double xsum;                  /* \sum xi                  (no z term) */
1094   double mult;                  /* histogram count multiplier */
1095   double total;                 /* total samples            */
1096   int i;
1097
1098   esum = xesum = xsum  = xxesum = total = 0.;
1099   for (i = 0; i < n; i++)
1100     {
1101       mult = (y == NULL) ? 1. : (double) y[i];
1102       xsum   += mult * x[i];
1103       esum   += mult *               exp(-1. * lambda * x[i]);
1104       xesum  += mult * x[i] *        exp(-1. * lambda * x[i]);
1105       xxesum += mult * x[i] * x[i] * exp(-1. * lambda * x[i]);
1106       total  += mult;
1107     }
1108
1109   /* Add z terms for censored data
1110    */
1111   esum   += (double) z *         exp(-1. * lambda * c);
1112   xesum  += (double) z * c *     exp(-1. * lambda * c);
1113   xxesum += (double) z * c * c * exp(-1. * lambda * c);
1114
1115   *ret_f  = 1./lambda - xsum / total + xesum / esum;
1116   *ret_df = ((xesum / esum) * (xesum / esum))
1117     - (xxesum / esum)
1118     - (1. / (lambda * lambda));
1119
1120   return;
1121 }
1122
1123
1124
1125 /* Function: EVDMaxLikelyFit()
1126  * Date:     SRE, Fri Nov 14 07:56:29 1997 [St. Louis]
1127  *
1128  * Purpose:  Given a list or a histogram of EVD-distributed samples,
1129  *           find maximum likelihood parameters lambda and
1130  *           mu.
1131  *
1132  * Algorithm: Uses approach described in [Lawless82]. Solves
1133  *           for lambda using Newton/Raphson iterations;
1134  *           then substitutes lambda into Lawless' equation 4.1.5
1135  *           to get mu.
1136  *
1137  *           Newton/Raphson algorithm developed from description in
1138  *           Numerical Recipes in C [Press88].
1139  *
1140  * Args:     x          - list of EVD distributed samples or x-axis of histogram
1141  *           c          - NULL, or y-axis of histogram
1142  *           n          - number of samples, or number of histogram bins
1143  *           ret_mu     : RETURN: ML estimate of mu
1144  *           ret_lambda : RETURN: ML estimate of lambda
1145  *
1146  * Return:   1 on success; 0 on any failure
1147  */
1148 int
1149 EVDMaxLikelyFit(float *x, int *c, int n, float *ret_mu, float *ret_lambda)
1150 {
1151   float  lambda, mu;
1152   float  fx;                    /* f(x)  */
1153   float  dfx;                   /* f'(x) */
1154   double esum;                  /* \sum e^(-lambda xi) */
1155   double mult;
1156   double total;
1157   float  tol = 1e-5;
1158   int    i;
1159
1160   /* 1. Find an initial guess at lambda: linear regression here?
1161    */
1162   lambda = 0.2;
1163
1164   /* 2. Use Newton/Raphson to solve Lawless 4.1.6 and find ML lambda
1165    */
1166   for (i = 0; i < 100; i++)
1167     {
1168       Lawless416(x, c, n, lambda, &fx, &dfx);
1169       if (fabs(fx) < tol) break;             /* success */
1170       lambda = lambda - fx / dfx;            /* Newton/Raphson is simple */
1171       if (lambda <= 0.) lambda = 0.001;      /* but be a little careful  */
1172     }
1173
1174   /* 2.5: If we did 100 iterations but didn't converge, Newton/Raphson failed.
1175    *      Resort to a bisection search. Worse convergence speed
1176    *      but guaranteed to converge (unlike Newton/Raphson).
1177    *      We assume (!?) that fx is a monotonically decreasing function of x;
1178    *      i.e. fx > 0 if we are left of the root, fx < 0 if we
1179    *      are right of the root.
1180    */
1181   if (i == 100)
1182     {
1183       float left, right, mid;
1184       SQD_DPRINTF2(("EVDMaxLikelyFit(): Newton/Raphson failed; switchover to bisection"));
1185
1186                                 /* First we need to bracket the root */
1187       lambda = right = left = 0.2;
1188       Lawless416(x, c, n, lambda, &fx, &dfx);
1189       if (fx < 0.)
1190         {                       /* fix right; search left. */
1191           do {
1192             left -= 0.1;
1193             if (left < 0.) {
1194               SQD_DPRINTF2(("EVDMaxLikelyFit(): failed to bracket root"));
1195               return 0;
1196             }
1197             Lawless416(x, c, n, left, &fx, &dfx);
1198           } while (fx < 0.);
1199         }
1200       else
1201         {                       /* fix left; search right. */
1202           do {
1203             right += 0.1;
1204             Lawless416(x, c, n, right, &fx, &dfx);
1205             if (right > 100.) {
1206               SQD_DPRINTF2(("EVDMaxLikelyFit(): failed to bracket root"));
1207               return 0;
1208             }
1209           } while (fx > 0.);
1210         }
1211                         /* now we bisection search in left/right interval */
1212       for (i = 0; i < 100; i++)
1213         {
1214           mid = (left + right) / 2.;
1215           Lawless416(x, c, n, mid, &fx, &dfx);
1216           if (fabs(fx) < tol) break;             /* success */
1217           if (fx > 0.)  left = mid;
1218           else          right = mid;
1219         }
1220       if (i == 100) {
1221         SQD_DPRINTF2(("EVDMaxLikelyFit(): even the bisection search failed"));
1222         return 0;
1223       }
1224       lambda = mid;
1225     }
1226
1227   /* 3. Substitute into Lawless 4.1.5 to find mu
1228    */
1229   esum = 0.;
1230   total = 0.;
1231   for (i = 0; i < n; i++)
1232     {
1233       mult   = (c == NULL) ? 1. : (double) c[i];
1234       esum  += mult * exp(-1 * lambda * x[i]);
1235       total += mult;
1236     }
1237   mu = -1. * log(esum / total) / lambda;
1238
1239   *ret_lambda = lambda;
1240   *ret_mu     = mu;
1241   return 1;
1242 }
1243
1244
1245 /* Function: EVDCensoredFit()
1246  * Date:     SRE, Mon Nov 17 10:01:05 1997 [St. Louis]
1247  *
1248  * Purpose:  Given a /left-censored/ list or histogram of EVD-distributed
1249  *           samples, as well as the number of censored samples z and the
1250  *           censoring value c,
1251  *           find maximum likelihood parameters lambda and
1252  *           mu.
1253  *
1254  * Algorithm: Uses approach described in [Lawless82]. Solves
1255  *           for lambda using Newton/Raphson iterations;
1256  *           then substitutes lambda into Lawless' equation 4.2.3
1257  *           to get mu.
1258  *
1259  *           Newton/Raphson algorithm developed from description in
1260  *           Numerical Recipes in C [Press88].
1261  *
1262  * Args:     x          - list of EVD distributed samples or x-axis of histogram
1263  *           y          - NULL, or y-axis of histogram
1264  *           n          - number of observed samples,or number of histogram bins
1265  *           z          - number of censored samples
1266  *           c          - censoring value (all x_i >= c)
1267  *           ret_mu     : RETURN: ML estimate of mu
1268  *           ret_lambda : RETURN: ML estimate of lambda
1269  *
1270  * Return:   (void)
1271  */
1272 int
1273 EVDCensoredFit(float *x, int *y, int n, int z, float c,
1274                float *ret_mu, float *ret_lambda)
1275 {
1276   float  lambda, mu;
1277   float  fx;                    /* f(x)  */
1278   float  dfx;                   /* f'(x) */
1279   double esum;                  /* \sum e^(-lambda xi) */
1280   double mult;
1281   double total;
1282   float  tol = 1e-5;
1283   int    i;
1284
1285   /* 1. Find an initial guess at lambda: linear regression here?
1286    */
1287   lambda = 0.2;
1288
1289   /* 2. Use Newton/Raphson to solve Lawless 4.2.2 and find ML lambda
1290    */
1291   for (i = 0; i < 100; i++)
1292     {
1293       Lawless422(x, y, n, z, c, lambda, &fx, &dfx);
1294       if (fabs(fx) < tol) break;             /* success */
1295       lambda = lambda - fx / dfx;            /* Newton/Raphson is simple */
1296       if (lambda <= 0.) lambda = 0.001;      /* but be a little careful  */
1297     }
1298
1299  /* 2.5: If we did 100 iterations but didn't converge, Newton/Raphson failed.
1300    *      Resort to a bisection search. Worse convergence speed
1301    *      but guaranteed to converge (unlike Newton/Raphson).
1302    *      We assume (!?) that fx is a monotonically decreasing function of x;
1303    *      i.e. fx > 0 if we are left of the root, fx < 0 if we
1304    *      are right of the root.
1305    */
1306   if (i == 100)
1307     {
1308       float left, right, mid;
1309                                 /* First we need to bracket the root */
1310       SQD_DPRINTF2(("EVDCensoredFit(): Newton/Raphson failed; switched to bisection"));
1311       lambda = right = left = 0.2;
1312       Lawless422(x, y, n, z, c, lambda, &fx, &dfx);
1313       if (fx < 0.)
1314         {                       /* fix right; search left. */
1315           do {
1316             left -= 0.03;
1317             if (left < 0.) {
1318               SQD_DPRINTF2(("EVDCensoredFit(): failed to bracket root"));
1319               return 0;
1320             }
1321             Lawless422(x, y, n, z, c, left, &fx, &dfx);
1322           } while (fx < 0.);
1323         }
1324       else
1325         {                       /* fix left; search right. */
1326           do {
1327             right += 0.1;
1328             Lawless422(x, y, n, z, c, left, &fx, &dfx);
1329             if (right > 100.) {
1330               SQD_DPRINTF2(("EVDCensoredFit(): failed to bracket root"));
1331               return 0;
1332             }
1333           } while (fx > 0.);
1334         }
1335                         /* now we bisection search in left/right interval */
1336       for (i = 0; i < 100; i++)
1337         {
1338           mid = (left + right) / 2.;
1339           Lawless422(x, y, n, z, c, left, &fx, &dfx);
1340           if (fabs(fx) < tol) break;             /* success */
1341           if (fx > 0.)  left = mid;
1342           else          right = mid;
1343         }
1344       if (i == 100) {
1345         SQD_DPRINTF2(("EVDCensoredFit(): even the bisection search failed"));
1346         return 0;
1347       }
1348       lambda = mid;
1349     }
1350
1351   /* 3. Substitute into Lawless 4.2.3 to find mu
1352    */
1353   esum =  total = 0.;
1354   for (i = 0; i < n; i++)
1355     {
1356       mult   = (y == NULL) ? 1. : (double) y[i];
1357       esum  += mult * exp(-1. * lambda * x[i]);
1358       total += mult;
1359     }
1360   esum += (double) z * exp(-1. * lambda * c);    /* term from censored data */
1361   mu = -1. * log(esum / total) / lambda;
1362
1363   *ret_lambda = lambda;
1364   *ret_mu     = mu;
1365   return 1;
1366 }
1367
1368
1369