1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
12 * SRE, Sat Jan 20 16:16:17 1996
14 * Accumulation, printing, and fitting of score histograms
15 * from database searches.
17 * RCS $Id: histogram.c,v 1.1.1.1 2005/03/22 08:34:00 cmzmasek Exp $
18 ************************************************************
21 * struct histogram_s *h;
23 * h = AllocHistogram(min_hint, max_hint, lumpsize);
25 * while (getting scores x) AddToHistogram(h, x);
27 * ExtremeValueFitHistogram(h, high_hint);
28 * PrintASCIIHistogram(fp, h);
47 /* Function: AllocHistogram()
49 * Purpose: Allocate and return a histogram structure.
50 * min and max are your best guess. They need
51 * not be absolutely correct; the histogram
52 * will expand dynamically to accomodate scores
53 * that exceed these suggested bounds. The amount
54 * that the histogram grows by is set by "lumpsize".
56 * Args: min: minimum score (integer)
57 * max: maximum score (integer)
58 * lumpsize: when reallocating histogram, pad the reallocation
59 * by this much (saves excessive reallocation)
62 AllocHistogram(int min, int max, int lumpsize)
64 struct histogram_s *h;
68 newsize = max - min + 1;
70 h = (struct histogram_s *) MallocOrDie(sizeof(struct histogram_s));
74 h->lowscore = INT_MAX;
75 h->highscore = INT_MIN;
76 h->lumpsize = lumpsize;
77 h->histogram = (int *) MallocOrDie (sizeof(int) * newsize);
78 for (i = 0; i < newsize; i++) h->histogram[i] = 0;
81 h->fit_type = HISTFIT_NONE;
87 /* Function: FreeHistogram()
89 * Purpose: free a histogram structure.
92 FreeHistogram(struct histogram_s *h)
95 if (h->expect != NULL) free(h->expect);
99 /* Function: UnfitHistogram()
101 * Purpose: Free only the theoretical fit part of a histogram.
104 UnfitHistogram(struct histogram_s *h)
106 if (h->expect != NULL) free(h->expect);
108 h->fit_type = HISTFIT_NONE;
112 /* Function: AddToHistogram()
114 * Purpose: Bump the appropriate counter in a histogram
115 * structure, given a score. The score is
116 * rounded off from float precision to the
117 * next lower integer.
120 AddToHistogram(struct histogram_s *h, float sc)
128 /* Adding to a histogram conflicts with existing fit:
131 if (h->fit_type != HISTFIT_NONE)
132 Die("AddToHistogram(): Can't add to a fitted histogram\n");
135 /* histogram bins are defined as: score >= bin value, < bin+1
136 * -1.9 -> -2 -0.4 -> -1 1.9 -> 1
137 * -2.1 -> -3 0.4 -> 0 2.1 -> 2
139 score = (int) floor(sc);
141 /* Check to see if we must reallocate the histogram.
145 prevsize = h->max - h->min + 1;
146 moveby = (h->min - score) + h->lumpsize;
147 newsize = prevsize + moveby;
150 h->histogram = (int *) ReallocOrDie(h->histogram, sizeof(int) * newsize);
151 memmove(h->histogram+moveby, h->histogram, sizeof(int) * prevsize);
152 for (i = 0; i < moveby; i++)
155 else if (score > h->max)
157 prevsize = h->max - h->min + 1;
158 h->max = h->lumpsize + score;
159 newsize = h->max - h->min + 1;
161 h->histogram = (int *) ReallocOrDie(h->histogram, sizeof(int) * newsize);
162 for (i = prevsize; i < newsize; i++)
166 /* Bump the correct bin.
167 * The bin number is score - h->min
169 h->histogram[score - h->min]++;
171 if (score < h->lowscore) h->lowscore = score;
172 if (score > h->highscore) h->highscore = score;
174 SQD_DPRINTF3(("AddToHistogram(): added %.1f; rounded to %d; in bin %d (%d-%d)\n",
175 sc, score, score-h->min, h->min, h->max));
181 /* Function: PrintASCIIHistogram()
183 * Purpose: Print a "prettified" histogram to a file pointer.
184 * Deliberately a look-and-feel clone of Bill Pearson's
185 * excellent FASTA output.
187 * Args: fp - open file to print to (stdout works)
188 * h - histogram to print
191 PrintASCIIHistogram(FILE *fp, struct histogram_s *h)
197 char buffer[81]; /* output line buffer */
198 int pos; /* position in output line buffer */
199 int lowbound, lowcount; /* cutoffs on the low side */
200 int highbound, highcount; /* cutoffs on the high side */
203 /* Find out how we'll scale the histogram.
204 * We have 59 characters to play with on a
205 * standard 80-column terminal display:
206 * leading "%5d %6d %6d|" occupies 20 chars.
207 * Save the peak position, we'll use it later.
210 for (i = h->lowscore - h->min; i <= h->highscore - h->min; i++)
211 if (h->histogram[i] > maxbar)
213 maxbar = h->histogram[i]; /* max height */
214 lowbound = i + h->min; /* peak position */
217 /* Truncate histogram display on both sides, ad hoc fashion.
218 * Start from the peak; then move out until we see <emptybins> empty bins,
221 highbound = lowbound; /* start at peak position */
222 for (num = 0; lowbound > h->lowscore; lowbound--)
224 i = lowbound - h->min;
225 if (h->histogram[i] > 0) { num = 0; continue; } /* reset */
226 if (++num == emptybins) { lowbound += emptybins; break; } /* stop */
228 for (num = 0; highbound < h->highscore; highbound++)
230 i = highbound - h->min;
231 if (h->histogram[i] > 0) { num = 0; continue; } /* reset */
232 if (++num == emptybins) { highbound -= emptybins; break; } /* stop */
234 /* collect counts outside of bounds */
235 for (lowcount = 0, i = h->lowscore - h->min; i <= lowbound - h->min; i++)
236 lowcount += h->histogram[i];
237 for (highcount = 0, i = h->highscore - h->min; i >= highbound - h->min; i--)
238 highcount += h->histogram[i];
240 /* maxbar might need raised now; then set our units */
241 if (lowcount > maxbar) maxbar = lowcount;
242 if (highcount > maxbar) maxbar = highcount;
243 units = ((maxbar-1)/ 59) + 1;
246 /* Print the histogram
248 fprintf(fp, "%5s %6s %6s (one = represents %d sequences)\n",
249 "score", "obs", "exp", units);
250 fprintf(fp, "%5s %6s %6s\n", "-----", "---", "---");
253 for (i = h->lowscore; i <= h->highscore; i++)
255 memset(buffer, ' ', 79 * sizeof(char));
258 /* Deal with special cases at edges
260 if (i < lowbound) continue;
261 else if (i > highbound) continue;
262 else if (i == lowbound && i != h->lowscore)
264 sprintf(buffer, "<%4d %6d %6s|", i+1, lowcount, "-");
266 num = 1+(lowcount-1) / units;
267 if (num > 60) Die("oops");
268 for (pos = 20; num > 0; num--) buffer[pos++] = '=';
273 else if (i == highbound && i != h->highscore)
275 sprintf(buffer, ">%4d %6d %6s|", i, highcount, "-");
277 num = 1+(highcount-1) / units;
278 for (pos = 20; num > 0; num--) buffer[pos++] = '=';
284 /* Deal with most cases
286 if (h->fit_type != HISTFIT_NONE)
287 sprintf(buffer, "%5d %6d %6d|",
288 i, h->histogram[idx], (int) h->expect[idx]);
290 sprintf(buffer, "%5d %6d %6s|", i, h->histogram[idx], "-");
291 buffer[20] = ' '; /* sprintf writes a null char */
293 /* Mark the histogram bar for observed hits
295 if (h->histogram[idx] > 0) {
296 num = 1 + (h->histogram[idx]-1) / units;
297 for (pos = 20; num > 0; num--) buffer[pos++] = '=';
300 /* Mark the theoretically expected value
302 if (h->fit_type != HISTFIT_NONE && (int) h->expect[idx] > 0)
304 pos = 20 + (int)(h->expect[idx]-1) / units;
305 if (pos >= 78) pos = 78; /* be careful of buffer bounds */
314 /* Print details about the statistics
316 switch (h->fit_type) {
318 fprintf(fp, "\n\n%% No statistical fit available\n");
322 fprintf(fp, "\n\n%% Statistical details of theoretical EVD fit:\n");
323 fprintf(fp, " mu = %10.4f\n", h->param[EVD_MU]);
324 fprintf(fp, " lambda = %10.4f\n", h->param[EVD_LAMBDA]);
325 fprintf(fp, "chi-sq statistic = %10.4f\n", h->chisq);
326 fprintf(fp, " P(chi-square) = %10.4g\n", h->chip);
329 case HISTFIT_GAUSSIAN:
330 fprintf(fp, "\n\n%% Statistical details of theoretical Gaussian fit:\n");
331 fprintf(fp, " mean = %10.4f\n", h->param[GAUSS_MEAN]);
332 fprintf(fp, " sd = %10.4f\n", h->param[GAUSS_SD]);
333 fprintf(fp, "chi-sq statistic = %10.4f\n", h->chisq);
334 fprintf(fp, " P(chi-square) = %10.4g\n", h->chip);
342 /* Function: PrintXMGRHistogram()
343 * Date: SRE, Wed Nov 12 11:02:00 1997 [St. Louis]
345 * Purpose: Print an XMGR data file that contains two data sets:
346 * - xy data for the observed histogram
347 * - xy data for the theoretical histogram
350 PrintXMGRHistogram(FILE *fp, struct histogram_s *h)
352 int sc; /* integer score in histogram structure */
355 /* First data set is the observed histogram
357 for (sc = h->lowscore; sc <= h->highscore; sc++)
358 if (h->histogram[sc - h->min] > 0)
359 fprintf(fp, "%-6d %f\n", sc,
360 (float) h->histogram[sc - h->min]/ (float) h->total);
363 /* Second data set is the theoretical histogram
365 if (h->fit_type != HISTFIT_NONE)
367 for (sc = h->lowscore; sc <= h->highscore; sc++)
370 (1. - ExtremeValueP((float)sc+1, h->param[EVD_MU], h->param[EVD_LAMBDA]))-
371 (1. - ExtremeValueP((float)sc, h->param[EVD_MU], h->param[EVD_LAMBDA]));
372 fprintf(fp, "%-6d %f\n", sc, val);
378 /* Function: PrintXMGRDistribution()
379 * Date: SRE, Wed Nov 12 11:02:09 1997 [St. Louis]
381 * Purpose: Print an XMGR data file that contains two data sets:
382 * - xy data for the observed distribution P(S<x)
383 * - xy data for the theoretical distribution P(S<x)
386 PrintXMGRDistribution(FILE *fp, struct histogram_s *h)
388 int sc; /* integer score in histogram structure */
389 int cum; /* cumulative count */
392 /* First data set is the observed distribution;
393 * histogram bin x contains # of scores between x and x+1,
394 * hence the sc+1 offset.
396 for (cum = 0, sc = h->lowscore; sc <= h->highscore; sc++)
398 cum += h->histogram[sc - h->min];
399 fprintf(fp, "%-6d %f\n", sc + 1, (float) cum / (float) h->total);
403 /* Second data set is the theoretical histogram
405 if (h->fit_type != HISTFIT_NONE)
407 for (sc = h->lowscore; sc <= h->highscore; sc++)
409 val = (1. - ExtremeValueP((float) sc, h->param[EVD_MU],
410 h->param[EVD_LAMBDA]));
411 fprintf(fp, "%-6d %f\n", sc, val);
417 /* Function: PrintXMGRRegressionLine()
418 * Date: SRE, Wed Nov 12 11:02:19 1997 [St. Louis]
420 * Purpose: Print an XMGR data file that contains two data sets:
421 * - xy data for log log transform of observed distribution P(S<x)
422 * - xy data for log log transform of theoretical distribution P(S<x)
425 PrintXMGRRegressionLine(FILE *fp, struct histogram_s *h)
427 int sc; /* integer score in histogram structure */
429 double val; /* log log transform */
431 /* First data set is the observed distribution;
432 * histogram bin x contains # of scores between x and x+1,
433 * hence the sc+1 offset.
435 for (cum = 0, sc = h->lowscore; sc <= h->highscore; sc++)
437 cum += h->histogram[sc - h->min];
438 val = log (-1. * log((double) cum / (double) h->total));
440 fprintf(fp, "%-6d %f\n", sc + 1, val);
444 /* Second data set is the theoretical histogram
446 if (h->fit_type != HISTFIT_NONE)
448 for (sc = h->lowscore; sc <= h->highscore; sc++)
450 val = log(-1. * log(1. - ExtremeValueP((float) sc, h->param[EVD_MU],
451 h->param[EVD_LAMBDA])));
452 fprintf(fp, "%-6d %f\n", sc, val);
458 /* Function: EVDBasicFit()
459 * Date: SRE, Wed Nov 12 11:02:27 1997 [St. Louis]
461 * Purpose: Fit a score histogram to the extreme value
462 * distribution. Set the parameters lambda
463 * and mu in the histogram structure. Fill in the
464 * expected values in the histogram. Calculate
465 * a chi-square test as a measure of goodness of fit.
467 * This is the basic version of ExtremeValueFitHistogram(),
468 * in a nonrobust form: simple linear regression with no
471 * Methods: Uses a linear regression fitting method [Collins88,Lawless82]
473 * Args: h - histogram to fit
478 EVDBasicFit(struct histogram_s *h)
480 float *d; /* distribution P(S < x) */
481 float *x; /* x-axis of P(S<x) for Linefit() */
484 int sc, idx; /* loop indices for score or score-h->min */
485 float slope, intercept; /* m,b fit from Linefit() */
486 float corr; /* correlation coeff of line fit, not used */
487 float lambda, mu; /* slope, intercept converted to EVD params */
489 /* Allocations for x, y axes
490 * distribution d runs from min..max with indices 0..max-min
491 * i.e. score - min = index into d, x, histogram, and expect
493 hsize = h->highscore - h->lowscore + 1;
494 d = (float *) MallocOrDie(sizeof(float) * hsize);
495 x = (float *) MallocOrDie(sizeof(float) * hsize);
496 for (idx = 0; idx < hsize; idx++)
497 d[idx] = x[idx] = 0.;
499 /* Calculate P(S < x) distribution from histogram.
500 * note off-by-one of sc, because histogram bin contains scores between
504 for (sc = h->lowscore; sc <= h->highscore; sc++)
506 sum += h->histogram[sc - h->min];
507 d[sc - h->lowscore] = (float) sum / (float) h->total;
508 x[sc - h->lowscore] = (float) (sc + 1);
511 /* Do a linear regression fit to the log[-log(P(S<x))] "line".
512 * we have log[-log(1-P(S>x))] = -lambda * x + lambda * mu
513 * so lambda = -m and mu = b/lambda
515 /* convert y axis to log[-log(P(S<x))] */
516 for (sc = h->lowscore; sc < h->highscore; sc++)
517 d[sc - h->lowscore] = log(-1. * log(d[sc - h->lowscore]));
519 /* do the linear regression */
520 Linefit(x, d, hsize-1, &intercept, &slope, &corr);
521 /* calc mu, lambda */
522 lambda = -1. * slope;
523 mu = intercept / lambda;
525 /* Set the EVD parameters in the histogram;
526 * pass 2 for additional lost degrees of freedom because we fit mu, lambda.
528 ExtremeValueSetHistogram(h, mu, lambda, h->lowscore, h->highscore, 2);
536 /* Function: ExtremeValueFitHistogram()
537 * Date: SRE, Sat Nov 15 17:16:15 1997 [St. Louis]
539 * Purpose: Fit a score histogram to the extreme value
540 * distribution. Set the parameters lambda
541 * and mu in the histogram structure. Calculate
542 * a chi-square test as a measure of goodness of fit.
544 * Methods: Uses a maximum likelihood method [Lawless82].
545 * Lower outliers are removed by censoring the data below the peak.
546 * Upper outliers are removed iteratively using method
547 * described by [Mott92].
549 * Args: h - histogram to fit
550 * censor - TRUE to censor data left of the peak
551 * high_hint - score cutoff; above this are `real' hits that aren't fit
553 * Return: 1 if fit is judged to be valid.
554 * else 0 if fit is invalid (too few seqs.)
557 ExtremeValueFitHistogram(struct histogram_s *h, int censor, float high_hint)
559 float *x; /* array of EVD samples to fit */
560 int *y; /* histogram counts */
561 int n; /* number of observed samples */
562 int z; /* number of censored samples */
563 int hsize; /* size of histogram */
564 float lambda, mu; /* new estimates of lambda, mu */
565 int sc; /* loop index for score */
566 int lowbound; /* lower bound of fitted region*/
567 int highbound; /* upper bound of fitted region*/
571 /* Determine lower bound on fitted region;
572 * if we're censoring the data, choose the peak of the histogram.
573 * if we're not, then we take the whole histogram.
575 lowbound = h->lowscore;
579 for (sc = h->lowscore; sc <= h->highscore; sc++)
580 if (h->histogram[sc - h->min] > max)
582 max = h->histogram[sc - h->min];
587 /* Determine initial upper bound on fitted region.
589 highbound = MIN(high_hint, h->highscore);
591 /* Now, iteratively converge on our lambda, mu:
593 for (iteration = 0; iteration < 100; iteration++)
595 /* Construct x, y vectors.
599 hsize = highbound - lowbound + 1;
600 if (hsize < 5) goto FITFAILED; /* require at least 5 bins or we don't fit */
602 x = MallocOrDie(sizeof(float) * hsize);
603 y = MallocOrDie(sizeof(int) * hsize);
605 for (sc = lowbound; sc <= highbound; sc++)
607 x[sc-lowbound] = (float) sc + 0.5; /* crude, but tests OK */
608 y[sc-lowbound] = h->histogram[sc - h->min];
609 n += h->histogram[sc - h->min];
612 if (n < 100) goto FITFAILED; /* require fitting to at least 100 points */
614 /* If we're censoring, estimate z, the number of censored guys
615 * left of the bound. Our initial estimate is crudely that we're
616 * missing e^-1 of the total distribution (which would be exact
617 * if we censored exactly at mu; but we censored at the observed peak).
618 * Subsequent estimates are more exact based on our current estimate of mu.
623 z = MIN(h->total-n, (int) (0.58198 * (float) n));
627 psx = EVDDistribution((float) lowbound, mu, lambda);
628 z = MIN(h->total-n, (int) ((double) n * psx / (1. - psx)));
635 if (! EVDCensoredFit(x, y, hsize, z, (float) lowbound, &mu, &lambda))
638 if (! EVDMaxLikelyFit(x, y, hsize, &mu, &lambda))
641 /* Find the Eval = 1 point as a new highbound;
642 * the total number of samples estimated to "belong" to the EVD is n+z
644 new_highbound = (int)
645 (mu - (log (-1. * log((double) (n+z-1) / (double)(n+z))) / lambda));
649 if (new_highbound >= highbound) break;
650 highbound = new_highbound;
653 /* Set the histogram parameters;
654 * - we fit from lowbound to highbound; thus we lose 2 degrees of freedom
655 * for fitting mu, lambda, but we get 1 back because we're unnormalized
656 * in this interval, hence we pass 2-1 = 1 as ndegrees.
658 ExtremeValueSetHistogram(h, mu, lambda, lowbound, highbound, 1);
663 if (x != NULL) free(x);
664 if (y != NULL) free(y);
669 /* Function: ExtremeValueSetHistogram()
671 * Purpose: Instead of fitting the histogram to an EVD,
672 * simply set the EVD parameters from an external source.
674 * Args: h - the histogram to set
675 * mu - mu location parameter
676 * lambda - lambda scale parameter
677 * lowbound - low bound of the histogram that was fit
678 * highbound- high bound of histogram that was fit
679 * ndegrees - extra degrees of freedom to subtract in X^2 test:
680 * typically 0 if mu, lambda are parametric,
681 * else 2 if mu, lambda are estimated from data
684 ExtremeValueSetHistogram(struct histogram_s *h, float mu, float lambda,
685 float lowbound, float highbound, int ndegrees)
693 h->fit_type = HISTFIT_EVD;
694 h->param[EVD_LAMBDA] = lambda;
695 h->param[EVD_MU] = mu;
697 hsize = h->max - h->min + 1;
698 h->expect = (float *) MallocOrDie(sizeof(float) * hsize);
699 for (idx = 0; idx < hsize; idx++)
702 /* Calculate the expected values for the histogram.
704 for (sc = h->min; sc <= h->max; sc++)
705 h->expect[sc - h->min] =
706 ExtremeValueE((float)(sc), h->param[EVD_MU], h->param[EVD_LAMBDA],
708 ExtremeValueE((float)(sc+1), h->param[EVD_MU], h->param[EVD_LAMBDA],
711 /* Calculate the goodness-of-fit (within whole region)
715 for (sc = lowbound; sc <= highbound; sc++)
716 if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5)
718 delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min];
719 h->chisq += delta * delta / h->expect[sc-h->min];
723 /* Since we fit the whole histogram, there is at least
724 * one constraint on chi-square: the normalization to h->total.
726 if (nbins > 1 + ndegrees)
727 h->chip = (float) IncompleteGamma((double)(nbins-1-ndegrees)/2.,
728 (double) h->chisq/2.);
735 /* Function: GaussianFitHistogram()
737 * Purpose: Fit a score histogram to a Gaussian distribution.
738 * Set the parameters mean and sd in the histogram
739 * structure, as well as a chi-squared test for
742 * Args: h - histogram to fit
743 * high_hint - score cutoff; above this are `real' hits that aren't fit
745 * Return: 1 if fit is judged to be valid.
746 * else 0 if fit is invalid (too few seqs.)
749 GaussianFitHistogram(struct histogram_s *h, float high_hint)
758 /* Clear any previous fitting from the histogram.
762 /* Determine if we have enough hits to fit the histogram;
763 * arbitrarily require 1000.
765 if (h->total < 1000) { h->fit_type = HISTFIT_NONE; return 0; }
767 /* Simplest algorithm for mean and sd;
768 * no outlier detection yet (not even using high_hint)
770 * Magic 0.5 correction is because our histogram is for
771 * scores between x and x+1; we estimate the expectation
772 * (roughly) as x + 0.5.
775 for (sc = h->lowscore; sc <= h->highscore; sc++)
777 delta = (float) sc + 0.5;
778 sum += (float) h->histogram[sc-h->min] * delta;
779 sqsum += (float) h->histogram[sc-h->min] * delta * delta;
781 h->fit_type = HISTFIT_GAUSSIAN;
782 h->param[GAUSS_MEAN] = sum / (float) h->total;
783 h->param[GAUSS_SD] = sqrt((sqsum - (sum*sum/(float)h->total)) /
784 (float)(h->total-1));
786 /* Calculate the expected values for the histogram.
787 * Note that the magic 0.5 correction appears again.
788 * Calculating difference between distribution functions for Gaussian
789 * would be correct but hard.
791 hsize = h->max - h->min + 1;
792 h->expect = (float *) MallocOrDie(sizeof(float) * hsize);
793 for (idx = 0; idx < hsize; idx++)
796 for (sc = h->min; sc <= h->max; sc++)
798 delta = (float) sc + 0.5 - h->param[GAUSS_MEAN];
799 h->expect[sc - h->min] =
800 (float) h->total * ((1. / (h->param[GAUSS_SD] * sqrt(2.*3.14159))) *
801 (exp(-1.* delta*delta / (2. * h->param[GAUSS_SD] * h->param[GAUSS_SD]))));
804 /* Calculate the goodness-of-fit (within region that was fitted)
808 for (sc = h->lowscore; sc <= h->highscore; sc++)
809 if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5)
811 delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min];
812 h->chisq += delta * delta / h->expect[sc-h->min];
815 /* -1 d.f. for normalization; -2 d.f. for two free parameters */
817 h->chip = (float) IncompleteGamma((double)(nbins-3)/2.,
818 (double) h->chisq/2.);
826 /* Function: GaussianSetHistogram()
828 * Purpose: Instead of fitting the histogram to a Gaussian,
829 * simply set the Gaussian parameters from an external source.
832 GaussianSetHistogram(struct histogram_s *h, float mean, float sd)
840 h->fit_type = HISTFIT_GAUSSIAN;
841 h->param[GAUSS_MEAN] = mean;
842 h->param[GAUSS_SD] = sd;
844 /* Calculate the expected values for the histogram.
846 hsize = h->max - h->min + 1;
847 h->expect = (float *) MallocOrDie(sizeof(float) * hsize);
848 for (idx = 0; idx < hsize; idx++)
851 /* Note: ideally we'd use the Gaussian distribution function
852 * to find the histogram occupancy in the window sc..sc+1.
853 * However, the distribution function is hard to calculate.
854 * Instead, estimate the histogram by taking the density at sc+0.5.
856 for (sc = h->min; sc <= h->max; sc++)
858 delta = ((float)sc + 0.5) - h->param[GAUSS_MEAN];
859 h->expect[sc - h->min] =
860 (float) h->total * ((1. / (h->param[GAUSS_SD] * sqrt(2.*3.14159))) *
861 (exp(-1.*delta*delta / (2. * h->param[GAUSS_SD] * h->param[GAUSS_SD]))));
864 /* Calculate the goodness-of-fit (within whole region)
868 for (sc = h->lowscore; sc <= h->highscore; sc++)
869 if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5)
871 delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min];
872 h->chisq += delta * delta / h->expect[sc-h->min];
875 /* -1 d.f. for normalization */
877 h->chip = (float) IncompleteGamma((double)(nbins-1)/2.,
878 (double) h->chisq/2.);
885 /* Function: EVDDensity()
886 * Date: SRE, Sat Nov 15 19:37:52 1997 [St. Louis]
888 * Purpose: Return the extreme value density P(S=x) at
889 * a given point x, for an EVD controlled by
890 * parameters mu and lambda.
893 EVDDensity(float x, float mu, float lambda)
895 return (lambda * exp(-1. * lambda * (x - mu)
896 - exp(-1. * lambda * (x - mu))));
899 /* Function: EVDDistribution()
900 * Date: SRE, Tue Nov 18 08:02:22 1997 [St. Louis]
902 * Purpose: Returns the extreme value distribution P(S < x)
903 * evaluated at x, for an EVD controlled by parameters
907 EVDDistribution(float x, float mu, float lambda)
909 return (exp(-1. * exp(-1. * lambda * (x - mu))));
912 /* Function: ExtremeValueP()
914 * Purpose: Calculate P(S>x) according to an extreme
915 * value distribution, given x and the parameters
916 * of the distribution (characteristic
917 * value mu, decay constant lambda).
919 * This function is exquisitely prone to
920 * floating point exceptions if it isn't coded
924 * mu = characteristic value of extreme value distribution
925 * lambda = decay constant of extreme value distribution
930 ExtremeValueP(float x, float mu, float lambda)
933 /* avoid exceptions near P=1.0 */
934 /* typical 32-bit sys: if () < -3.6, return 1.0 */
935 if ((lambda * (x - mu)) <= -1. * log(-1. * log(DBL_EPSILON))) return 1.0;
936 /* avoid underflow fp exceptions near P=0.0*/
937 if ((lambda * (x - mu)) >= 2.3 * (double) DBL_MAX_10_EXP) return 0.0;
938 /* a roundoff issue arises; use 1 - e^-x --> x for small x */
939 y = exp(-1. * lambda * (x - mu));
940 if (y < 1e-7) return y;
941 else return (1.0 - exp(-1. * y));
945 /* Function: ExtremeValueP2()
947 * Purpose: Calculate P(S>x) in a database of size N,
948 * using P(S>x) for a single sequence, according
949 * to a Poisson distribution.
952 * mu = characteristic value of extreme value distribution
953 * lambda = decay constant of extreme value distribution
954 * N = number of trials (number of sequences)
956 * Return: P(S>x) for database of size N
959 ExtremeValueP2(float x, float mu, float lambda, int N)
962 y = N * ExtremeValueP(x,mu,lambda);
963 if (y < 1e-7) return y;
964 else return (1.0 - exp(-1. * y));
967 /* Function: ExtremeValueE()
969 * Purpose: Calculate E(S>x) in a database of size N,
970 * using P(S>x) for a single sequence: simply np.
973 * mu = characteristic value of extreme value distribution
974 * lambda = decay constant of extreme value distribution
975 * N = number of trials (number of sequences)
977 * Return: E(S>x) for database of size N
980 ExtremeValueE(float x, float mu, float lambda, int N)
982 return (double)N * ExtremeValueP(x,mu,lambda);
986 /* Function: EVDrandom()
988 * Purpose: Randomly sample an x from an EVD.
989 * Trivially done by the transformation method, since
990 * the distribution is analytical:
991 * x = \mu - \frac{\log \left[ -\log P(S<x) \right]}{\lambda}
992 * where P(S<x) is sampled uniformly on 0 < P(S<x) < 1.
995 EVDrandom(float mu, float lambda)
999 /* Very unlikely, but possible,
1000 * that sre_random() would give us exactly 0 or 1
1002 while (p == 0. || p == 1.) p = sre_random();
1003 return mu - log(-1. * log(p)) / lambda;
1007 /* Function: Lawless416()
1008 * Date: SRE, Thu Nov 13 11:48:50 1997 [St. Louis]
1010 * Purpose: Equation 4.1.6 from [Lawless82], pg. 143, and
1011 * its first derivative with respect to lambda,
1012 * for finding the ML fit to EVD lambda parameter.
1013 * This equation gives a result of zero for the maximum
1014 * likelihood lambda.
1016 * Can either deal with a histogram or an array.
1018 * Warning: beware overflow/underflow issues! not bulletproof.
1020 * Args: x - array of sample values (or x-axis of a histogram)
1021 * y - NULL (or y-axis of a histogram)
1022 * n - number of samples (or number of histogram bins)
1023 * lambda - a lambda to test
1024 * ret_f - RETURN: 4.1.6 evaluated at lambda
1025 * ret_df - RETURN: first derivative of 4.1.6 evaluated at lambda
1030 Lawless416(float *x, int *y, int n, float lambda, float *ret_f, float *ret_df)
1033 double esum; /* \sum e^(-lambda xi) */
1034 double xesum; /* \sum xi e^(-lambda xi) */
1035 double xxesum; /* \sum xi^2 e^(-lambda xi) */
1036 double xsum; /* \sum xi */
1037 double mult; /* histogram count multiplier */
1038 double total; /* total samples */
1042 esum = xesum = xsum = xxesum = total = 0.;
1043 for (i = 0; i < n; i++)
1045 mult = (y == NULL) ? 1. : (double) y[i];
1046 xsum += mult * x[i];
1047 xesum += mult * x[i] * exp(-1. * lambda * x[i]);
1048 xxesum += mult * x[i] * x[i] * exp(-1. * lambda * x[i]);
1049 esum += mult * exp(-1. * lambda * x[i]);
1052 *ret_f = 1./lambda - xsum / total + xesum / esum;
1053 *ret_df = ((xesum / esum) * (xesum / esum))
1055 - (1. / (lambda * lambda));
1061 /* Function: Lawless422()
1062 * Date: SRE, Mon Nov 17 09:42:48 1997 [St. Louis]
1064 * Purpose: Equation 4.2.2 from [Lawless82], pg. 169, and
1065 * its first derivative with respect to lambda,
1066 * for finding the ML fit to EVD lambda parameter
1067 * for Type I censored data.
1068 * This equation gives a result of zero for the maximum
1069 * likelihood lambda.
1071 * Can either deal with a histogram or an array.
1073 * Warning: beware overflow/underflow issues! not bulletproof.
1075 * Args: x - array of sample values (or x-axis of a histogram)
1076 * y - NULL (or y-axis of a histogram)
1077 * n - number of observed samples (or number of histogram bins)
1078 * z - number of censored samples
1079 * c - censoring value; all observed x_i >= c
1080 * lambda - a lambda to test
1081 * ret_f - RETURN: 4.2.2 evaluated at lambda
1082 * ret_df - RETURN: first derivative of 4.2.2 evaluated at lambda
1087 Lawless422(float *x, int *y, int n, int z, float c,
1088 float lambda, float *ret_f, float *ret_df)
1090 double esum; /* \sum e^(-lambda xi) + z term */
1091 double xesum; /* \sum xi e^(-lambda xi) + z term */
1092 double xxesum; /* \sum xi^2 e^(-lambda xi) + z term */
1093 double xsum; /* \sum xi (no z term) */
1094 double mult; /* histogram count multiplier */
1095 double total; /* total samples */
1098 esum = xesum = xsum = xxesum = total = 0.;
1099 for (i = 0; i < n; i++)
1101 mult = (y == NULL) ? 1. : (double) y[i];
1102 xsum += mult * x[i];
1103 esum += mult * exp(-1. * lambda * x[i]);
1104 xesum += mult * x[i] * exp(-1. * lambda * x[i]);
1105 xxesum += mult * x[i] * x[i] * exp(-1. * lambda * x[i]);
1109 /* Add z terms for censored data
1111 esum += (double) z * exp(-1. * lambda * c);
1112 xesum += (double) z * c * exp(-1. * lambda * c);
1113 xxesum += (double) z * c * c * exp(-1. * lambda * c);
1115 *ret_f = 1./lambda - xsum / total + xesum / esum;
1116 *ret_df = ((xesum / esum) * (xesum / esum))
1118 - (1. / (lambda * lambda));
1125 /* Function: EVDMaxLikelyFit()
1126 * Date: SRE, Fri Nov 14 07:56:29 1997 [St. Louis]
1128 * Purpose: Given a list or a histogram of EVD-distributed samples,
1129 * find maximum likelihood parameters lambda and
1132 * Algorithm: Uses approach described in [Lawless82]. Solves
1133 * for lambda using Newton/Raphson iterations;
1134 * then substitutes lambda into Lawless' equation 4.1.5
1137 * Newton/Raphson algorithm developed from description in
1138 * Numerical Recipes in C [Press88].
1140 * Args: x - list of EVD distributed samples or x-axis of histogram
1141 * c - NULL, or y-axis of histogram
1142 * n - number of samples, or number of histogram bins
1143 * ret_mu : RETURN: ML estimate of mu
1144 * ret_lambda : RETURN: ML estimate of lambda
1146 * Return: 1 on success; 0 on any failure
1149 EVDMaxLikelyFit(float *x, int *c, int n, float *ret_mu, float *ret_lambda)
1152 float fx; /* f(x) */
1153 float dfx; /* f'(x) */
1154 double esum; /* \sum e^(-lambda xi) */
1160 /* 1. Find an initial guess at lambda: linear regression here?
1164 /* 2. Use Newton/Raphson to solve Lawless 4.1.6 and find ML lambda
1166 for (i = 0; i < 100; i++)
1168 Lawless416(x, c, n, lambda, &fx, &dfx);
1169 if (fabs(fx) < tol) break; /* success */
1170 lambda = lambda - fx / dfx; /* Newton/Raphson is simple */
1171 if (lambda <= 0.) lambda = 0.001; /* but be a little careful */
1174 /* 2.5: If we did 100 iterations but didn't converge, Newton/Raphson failed.
1175 * Resort to a bisection search. Worse convergence speed
1176 * but guaranteed to converge (unlike Newton/Raphson).
1177 * We assume (!?) that fx is a monotonically decreasing function of x;
1178 * i.e. fx > 0 if we are left of the root, fx < 0 if we
1179 * are right of the root.
1183 float left, right, mid;
1184 SQD_DPRINTF2(("EVDMaxLikelyFit(): Newton/Raphson failed; switchover to bisection"));
1186 /* First we need to bracket the root */
1187 lambda = right = left = 0.2;
1188 Lawless416(x, c, n, lambda, &fx, &dfx);
1190 { /* fix right; search left. */
1194 SQD_DPRINTF2(("EVDMaxLikelyFit(): failed to bracket root"));
1197 Lawless416(x, c, n, left, &fx, &dfx);
1201 { /* fix left; search right. */
1204 Lawless416(x, c, n, right, &fx, &dfx);
1206 SQD_DPRINTF2(("EVDMaxLikelyFit(): failed to bracket root"));
1211 /* now we bisection search in left/right interval */
1212 for (i = 0; i < 100; i++)
1214 mid = (left + right) / 2.;
1215 Lawless416(x, c, n, mid, &fx, &dfx);
1216 if (fabs(fx) < tol) break; /* success */
1217 if (fx > 0.) left = mid;
1221 SQD_DPRINTF2(("EVDMaxLikelyFit(): even the bisection search failed"));
1227 /* 3. Substitute into Lawless 4.1.5 to find mu
1231 for (i = 0; i < n; i++)
1233 mult = (c == NULL) ? 1. : (double) c[i];
1234 esum += mult * exp(-1 * lambda * x[i]);
1237 mu = -1. * log(esum / total) / lambda;
1239 *ret_lambda = lambda;
1245 /* Function: EVDCensoredFit()
1246 * Date: SRE, Mon Nov 17 10:01:05 1997 [St. Louis]
1248 * Purpose: Given a /left-censored/ list or histogram of EVD-distributed
1249 * samples, as well as the number of censored samples z and the
1250 * censoring value c,
1251 * find maximum likelihood parameters lambda and
1254 * Algorithm: Uses approach described in [Lawless82]. Solves
1255 * for lambda using Newton/Raphson iterations;
1256 * then substitutes lambda into Lawless' equation 4.2.3
1259 * Newton/Raphson algorithm developed from description in
1260 * Numerical Recipes in C [Press88].
1262 * Args: x - list of EVD distributed samples or x-axis of histogram
1263 * y - NULL, or y-axis of histogram
1264 * n - number of observed samples,or number of histogram bins
1265 * z - number of censored samples
1266 * c - censoring value (all x_i >= c)
1267 * ret_mu : RETURN: ML estimate of mu
1268 * ret_lambda : RETURN: ML estimate of lambda
1273 EVDCensoredFit(float *x, int *y, int n, int z, float c,
1274 float *ret_mu, float *ret_lambda)
1277 float fx; /* f(x) */
1278 float dfx; /* f'(x) */
1279 double esum; /* \sum e^(-lambda xi) */
1285 /* 1. Find an initial guess at lambda: linear regression here?
1289 /* 2. Use Newton/Raphson to solve Lawless 4.2.2 and find ML lambda
1291 for (i = 0; i < 100; i++)
1293 Lawless422(x, y, n, z, c, lambda, &fx, &dfx);
1294 if (fabs(fx) < tol) break; /* success */
1295 lambda = lambda - fx / dfx; /* Newton/Raphson is simple */
1296 if (lambda <= 0.) lambda = 0.001; /* but be a little careful */
1299 /* 2.5: If we did 100 iterations but didn't converge, Newton/Raphson failed.
1300 * Resort to a bisection search. Worse convergence speed
1301 * but guaranteed to converge (unlike Newton/Raphson).
1302 * We assume (!?) that fx is a monotonically decreasing function of x;
1303 * i.e. fx > 0 if we are left of the root, fx < 0 if we
1304 * are right of the root.
1308 float left, right, mid;
1309 /* First we need to bracket the root */
1310 SQD_DPRINTF2(("EVDCensoredFit(): Newton/Raphson failed; switched to bisection"));
1311 lambda = right = left = 0.2;
1312 Lawless422(x, y, n, z, c, lambda, &fx, &dfx);
1314 { /* fix right; search left. */
1318 SQD_DPRINTF2(("EVDCensoredFit(): failed to bracket root"));
1321 Lawless422(x, y, n, z, c, left, &fx, &dfx);
1325 { /* fix left; search right. */
1328 Lawless422(x, y, n, z, c, left, &fx, &dfx);
1330 SQD_DPRINTF2(("EVDCensoredFit(): failed to bracket root"));
1335 /* now we bisection search in left/right interval */
1336 for (i = 0; i < 100; i++)
1338 mid = (left + right) / 2.;
1339 Lawless422(x, y, n, z, c, left, &fx, &dfx);
1340 if (fabs(fx) < tol) break; /* success */
1341 if (fx > 0.) left = mid;
1345 SQD_DPRINTF2(("EVDCensoredFit(): even the bisection search failed"));
1351 /* 3. Substitute into Lawless 4.2.3 to find mu
1354 for (i = 0; i < n; i++)
1356 mult = (y == NULL) ? 1. : (double) y[i];
1357 esum += mult * exp(-1. * lambda * x[i]);
1360 esum += (double) z * exp(-1. * lambda * c); /* term from censored data */
1361 mu = -1. * log(esum / total) / lambda;
1363 *ret_lambda = lambda;