1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
13 * Portability for and extensions to C math library.
14 * RCS $Id: sre_math.c,v 1.1.1.1 2005/03/22 08:34:32 cmzmasek Exp $
26 static int sre_reseed = 0; /* TRUE to reinit sre_random() */
27 static int sre_randseed = 666; /* default seed for sre_random() */
29 /* Function: ExponentialRandom()
30 * Date: SRE, Mon Sep 6 21:24:29 1999 [St. Louis]
32 * Purpose: Pick an exponentially distributed random variable
40 ExponentialRandom(void)
44 do x = sre_random(); while (x == 0.0);
48 /* Function: Gaussrandom()
50 * Pick a Gaussian-distributed random variable
51 * with some mean and standard deviation, and
54 * Based on RANLIB.c public domain implementation.
55 * Thanks to the authors, Barry W. Brown and James Lovato,
56 * University of Texas, M.D. Anderson Cancer Center, Houston TX.
57 * Their implementation is from Ahrens and Dieter, "Extensions
58 * of Forsythe's method for random sampling from the normal
59 * distribution", Math. Comput. 27:927-937 (1973).
61 * Impenetrability of the code is to be blamed on its FORTRAN/f2c lineage.
65 Gaussrandom(float mean, float stddev)
67 static float a[32] = {
68 0.0,3.917609E-2,7.841241E-2,0.11777,0.1573107,0.1970991,0.2372021,0.2776904, 0.3186394,0.36013,0.4022501,0.4450965,0.4887764,0.5334097,0.5791322,
69 0.626099,0.6744898,0.7245144,0.7764218,0.8305109,0.8871466,0.9467818,
70 1.00999,1.077516,1.150349,1.229859,1.318011,1.417797,1.534121,1.67594,
73 static float d[31] = {
74 0.0,0.0,0.0,0.0,0.0,0.2636843,0.2425085,0.2255674,0.2116342,0.1999243,
75 0.1899108,0.1812252,0.1736014,0.1668419,0.1607967,0.1553497,0.1504094,
76 0.1459026,0.14177,0.1379632,0.1344418,0.1311722,0.128126,0.1252791,
77 0.1226109,0.1201036,0.1177417,0.1155119,0.1134023,0.1114027,0.1095039
79 static float t[31] = {
80 7.673828E-4,2.30687E-3,3.860618E-3,5.438454E-3,7.0507E-3,8.708396E-3,
81 1.042357E-2,1.220953E-2,1.408125E-2,1.605579E-2,1.81529E-2,2.039573E-2,
82 2.281177E-2,2.543407E-2,2.830296E-2,3.146822E-2,3.499233E-2,3.895483E-2,
83 4.345878E-2,4.864035E-2,5.468334E-2,6.184222E-2,7.047983E-2,8.113195E-2,
84 9.462444E-2,0.1123001,0.136498,0.1716886,0.2276241,0.330498,0.5847031
86 static float h[31] = {
87 3.920617E-2,3.932705E-2,3.951E-2,3.975703E-2,4.007093E-2,4.045533E-2,
88 4.091481E-2,4.145507E-2,4.208311E-2,4.280748E-2,4.363863E-2,4.458932E-2,
89 4.567523E-2,4.691571E-2,4.833487E-2,4.996298E-2,5.183859E-2,5.401138E-2,
90 5.654656E-2,5.95313E-2,6.308489E-2,6.737503E-2,7.264544E-2,7.926471E-2,
91 8.781922E-2,9.930398E-2,0.11556,0.1404344,0.1836142,0.2790016,0.7010474
94 static float snorm,u,s,ustar,aa,w,y,tt;
103 if(i == 0) goto S100;
110 if(ustar <= *(t+i-1)) goto S60;
111 w = (ustar-*(t+i-1))**(h+i-1);
118 if(s == 1.0) snorm = -y;
119 return (stddev*snorm + mean);
130 ustar = sre_random();
132 if(ustar > tt) goto S50;
134 if(ustar >= u) goto S70;
135 ustar = sre_random();
149 if(u < 1.0) goto S110;
158 ustar = sre_random();
159 if(ustar > tt) goto S50;
161 if(ustar >= u) goto S150;
167 /* Function: Linefit()
169 * Purpose: Given points x[0..N-1] and y[0..N-1], fit to
170 * a straight line y = a + bx.
171 * a, b, and the linear correlation coefficient r
172 * are filled in for return.
174 * Args: x - x values of data
175 * y - y values of data
176 * N - number of data points
177 * ret_a - RETURN: intercept
178 * ret_b - RETURN: slope
179 * ret_r - RETURN: correlation coefficient
181 * Return: 1 on success, 0 on failure.
184 Linefit(float *x, float *y, int N, float *ret_a, float *ret_b, float *ret_r)
190 /* Calculate averages, xavg and yavg
193 for (i = 0; i < N; i++)
201 sxx = syy = sxy = 0.0;
202 for (i = 0; i < N; i++)
204 sxx += (x[i] - xavg) * (x[i] - xavg);
205 syy += (y[i] - yavg) * (y[i] - xavg);
206 sxy += (x[i] - xavg) * (y[i] - yavg);
209 *ret_a = yavg - xavg*(*ret_b);
210 *ret_r = sxy / (sqrt(sxx) * sqrt(syy));
215 /* Function: WeightedLinefit()
217 * Purpose: Given points x[0..N-1] and y[0..N-1] with
218 * variances (measurement errors) var[0..N-1],
219 * fit to a straight line y = mx + b.
221 * Method: Algorithm from Numerical Recipes in C, [Press88].
224 * ret_m contains slope; ret_b contains intercept
227 WeightedLinefit(float *x, float *y, float *var, int N, float *ret_m, float *ret_b)
236 s = sx = sy = sxx = sxy = 0.;
237 for (i = 0; i < N; i++)
242 sxx += x[i] * x[i] / var[i];
243 sxy += x[i] * y[i] / var[i];
246 delta = s * sxx - (sx * sx);
247 b = (sxx * sy - sx * sxy) / delta;
248 m = (s * sxy - sx * sy) / delta;
255 /* Function: Gammln()
257 * Returns the natural log of the gamma function of x.
260 * Adapted from a public domain implementation in the
261 * NCBI core math library. Thanks to John Spouge and
262 * the NCBI. (According to the NCBI, that's Dr. John
263 * "Gammas Galore" Spouge to you, pal.)
271 static double cof[11] = {
272 4.694580336184385e+04,
273 -1.560605207784446e+05,
274 2.065049568014106e+05,
275 -1.388934775095388e+05,
276 5.031796415085709e+04,
277 -9.601592329182778e+03,
278 8.785855930895250e+02,
279 -3.155153906098611e+01,
280 2.908143421162229e-01,
281 -2.319827630494973e-04,
282 1.251639670050933e-10
285 /* Protect against x=0. We see this in Dirichlet code,
286 * for terms alpha = 0. This is a severe hack but it is effective
287 * and (we think?) safe. (due to GJM)
289 if (x <= 0.0) return 999999.;
292 tx = tmp = xx + 11.0;
294 for (i = 10; i >= 0; i--) /* sum least significant terms first */
296 value += cof[i] / tmp;
301 value += 0.918938533 + (xx+0.5)*log(tx) - tx;
306 /* Vector operations for doubles and floats.
307 * DNorm(), FNorm() -- normalize a probability vector of length n.
308 * return 0 if all values were zero.
309 * DScale(), FScale() -- multiply all items in vector by scale
310 * DSet(), FSet() -- set all items in vector to value.
311 * DAdd(), FAdd() -- add vec2 to vec1.
312 * DDot(), FDot() -- calculate dot product of two vectors.
313 * DCopy(), FCopy() -- set vec1 to be same as vec2.
314 * DMax(), FMax() -- return index of maximum element in vec
317 DNorm(double *vec, int n)
323 for (x = 0; x < n; x++) sum += vec[x];
325 for (x = 0; x < n; x++) vec[x] /= sum;
327 { squid_errno = SQERR_DIVZERO; return 0; }
331 FNorm(float *vec, int n)
337 for (x = 0; x < n; x++) sum += vec[x];
339 for (x = 0; x < n; x++) vec[x] /= sum;
341 { squid_errno = SQERR_DIVZERO; return 0; }
346 DScale(double *vec, int n, double scale)
349 for (x = 0; x < n; x++)
353 FScale(float *vec, int n, float scale)
356 for (x = 0; x < n; x++)
361 DSet(double *vec, int n, double value)
364 for (x = 0; x < n; x++)
368 FSet(float *vec, int n, float value)
371 for (x = 0; x < n; x++)
376 DSum(double *vec, int n)
380 for (x = 0; x < n; x++)
385 FSum(float *vec, int n)
389 for (x = 0; x < n; x++)
395 DAdd(double *vec1, double *vec2, int n)
398 for (x = 0; x < n; x++)
402 FAdd(float *vec1, float *vec2, int n)
405 for (x = 0; x < n; x++)
410 DCopy(double *vec1, double *vec2, int n)
413 for (x = 0; x < n; x++)
417 FCopy(float *vec1, float *vec2, int n)
420 for (x = 0; x < n; x++)
425 DDot(double *vec1, double *vec2, int n)
430 for (x = 0; x < n; x++)
431 result += vec1[x] * vec2[x];
435 FDot(float *vec1, float *vec2, int n)
440 for (x = 0; x < n; x++)
441 result += vec1[x] * vec2[x];
445 /* Functions: DMax(), FMax()
446 * Date: SRE, Fri Aug 29 11:14:08 1997 (Denver CO)
448 * Purpose: return index of maximum element in vec.
451 DMax(double *vec, int n)
456 for (i = 1; i < n; i++)
457 if (vec[i] > vec[best]) best = i;
461 FMax(float *vec, int n)
466 for (i = 1; i < n; i++)
467 if (vec[i] > vec[best]) best = i;
472 /* 2D matrix operations
475 FMX2Alloc(int rows, int cols)
480 mx = (float **) MallocOrDie(sizeof(float *) * rows);
481 mx[0] = (float *) MallocOrDie(sizeof(float) * rows * cols);
482 for (r = 1; r < rows; r++)
483 mx[r] = mx[0] + r*cols;
493 DMX2Alloc(int rows, int cols)
498 mx = (double **) MallocOrDie(sizeof(double *) * rows);
499 mx[0] = (double *) MallocOrDie(sizeof(double) * rows * cols);
500 for (r = 1; r < rows; r++)
501 mx[r] = mx[0] + r*cols;
505 DMX2Free(double **mx)
510 /* Function: FMX2Multiply()
512 * Purpose: Matrix multiplication.
513 * Multiply an m x p matrix A by a p x n matrix B,
514 * giving an m x n matrix C.
515 * Matrix C must be a preallocated matrix of the right
519 FMX2Multiply(float **A, float **B, float **C, int m, int p, int n)
523 for (i = 0; i < m; i++)
524 for (j = 0; j < n; j++)
527 for (k = 0; k < p; k++)
528 C[i][j] += A[i][p] * B[p][j];
532 /* Function: sre_random()
534 * Purpose: Return a uniform deviate from 0.0 to 1.0.
535 * sre_randseed is a static variable, set
536 * by sre_srandom(). sre_reseed is a static flag
537 * raised by sre_srandom(), saying that we need
541 * Uses a simple linear congruential generator with
542 * period 2^28. Based on discussion in Robert Sedgewick's
543 * _Algorithms in C_, Addison-Wesley, 1990.
545 * Requires that long int's have at least 32 bits.
547 * Reliable and portable, but slow. Benchmarks on wol,
548 * using IRIX cc and IRIX C library rand() and random():
549 * sre_random(): 0.8 usec/call
550 * random(): 0.3 usec/call
551 * rand(): 0.3 usec/call
553 #define RANGE 268435456 /* 2^28 */
554 #define DIV 16384 /* sqrt(RANGE) */
555 #define MULT 72530821 /* my/Cathy's birthdays, x21, x even (Knuth)*/
560 static int firsttime = 1;
564 if (sre_reseed || firsttime)
566 sre_reseed = firsttime = 0;
567 if (sre_randseed <= 0) sre_randseed = 666; /* seeds of zero break me */
568 high1 = sre_randseed / DIV; low1 = sre_randseed % DIV;
569 high2 = MULT / DIV; low2 = MULT % DIV;
570 rnd = (((high2*low1 + high1*low2) % DIV)*DIV + low1*low2) % RANGE;
572 high1 = rnd / DIV; low1 = rnd % DIV;
573 high2 = MULT / DIV; low2 = MULT % DIV;
574 rnd = (((high2*low1 + high1*low2) % DIV)*DIV + low1*low2) % RANGE;
576 return ((float) rnd / (float) RANGE);
583 /* Function: sre_srandom()
585 * Purpose: Initialize with a random seed. Seed can be
589 sre_srandom(int seed)
591 if (seed < 0) seed = -1 * seed;
597 /* Functions: DChoose(), FChoose()
599 * Purpose: Make a random choice from a normalized distribution.
600 * DChoose() is for double-precision vectors;
601 * FChoose() is for single-precision float vectors.
602 * Returns the number of the choice.
605 DChoose(double *p, int N)
607 double roll; /* random fraction */
608 double sum; /* integrated prob */
609 int i; /* counter over the probs */
613 for (i = 0; i < N; i++)
616 if (roll < sum) return i;
618 SQD_DASSERT2((fabs(1.0 - sum) < 1e-14)); /* a verification at level 2 */
619 return (int) (sre_random() * N); /* bulletproof */
622 FChoose(float *p, int N)
624 float roll; /* random fraction */
625 float sum; /* integrated prob */
626 int i; /* counter over the probs */
630 for (i = 0; i < N; i++)
633 if (roll < sum) return i;
635 SQD_DASSERT2((fabs(1.0f - sum) < 1e-6f)); /* a verification at level 2 */
636 return (int) (sre_random() * N); /* bulletproof */
639 /* Functions: DLogSum(), FLogSum()
641 * Calculate the sum of a log vector
642 * *in normal space*, and return the log of the sum.
645 DLogSum(double *logp, int n)
651 for (x = 1; x < n; x++)
652 if (logp[x] > max) max = logp[x];
654 for (x = 0; x < n; x++)
655 if (logp[x] > max - 50.)
656 sum += exp(logp[x] - max);
657 sum = log(sum) + max;
661 FLogSum(float *logp, int n)
667 for (x = 1; x < n; x++)
668 if (logp[x] > max) max = logp[x];
670 for (x = 0; x < n; x++)
671 if (logp[x] > max - 50.)
672 sum += exp(logp[x] - max);
673 sum = log(sum) + max;
678 /* Function: IncompleteGamma()
680 * Purpose: Returns 1 - P(a,x) where:
681 * P(a,x) = \frac{1}{\Gamma(a)} \int_{0}^{x} t^{a-1} e^{-t} dt
682 * = \frac{\gamma(a,x)}{\Gamma(a)}
683 * = 1 - \frac{\Gamma(a,x)}{\Gamma(a)}
685 * Used in a chi-squared test: for a X^2 statistic x
686 * with v degrees of freedom, call:
687 * p = IncompleteGamma(v/2., x/2.)
688 * to get the probability p that a chi-squared value
689 * greater than x could be obtained by chance even for
690 * a correct model. (i.e. p should be large, say
693 * Method: Based on ideas from Numerical Recipes in C, Press et al.,
694 * Cambridge University Press, 1988.
696 * Args: a - for instance, degrees of freedom / 2 [a > 0]
697 * x - for instance, chi-squared statistic / 2 [x >= 0]
699 * Return: 1 - P(a,x).
702 IncompleteGamma(double a, double x)
704 int iter; /* iteration counter */
706 if (a <= 0.) Die("IncompleteGamma(): a must be > 0");
707 if (x < 0.) Die("IncompleteGamma(): x must be >= 0");
709 /* For x > a + 1 the following gives rapid convergence;
710 * calculate 1 - P(a,x) = \frac{\Gamma(a,x)}{\Gamma(a)}:
711 * use a continued fraction development for \Gamma(a,x).
715 double oldp; /* previous value of p */
716 double nu0, nu1; /* numerators for continued fraction calc */
717 double de0, de1; /* denominators for continued fraction calc */
719 nu0 = 0.; /* A_0 = 0 */
720 de0 = 1.; /* B_0 = 1 */
721 nu1 = 1.; /* A_1 = 1 */
722 de1 = x; /* B_1 = x */
725 for (iter = 1; iter < 100; iter++)
727 /* Continued fraction development:
728 * set A_j = b_j A_j-1 + a_j A_j-2
729 * B_j = b_j B_j-1 + a_j B_j-2
730 * We start with A_2, B_2.
732 /* j = even: a_j = iter-a, b_j = 1 */
733 /* A,B_j-2 are in nu0, de0; A,B_j-1 are in nu1,de1 */
734 nu0 = nu1 + ((double)iter - a) * nu0;
735 de0 = de1 + ((double)iter - a) * de0;
737 /* j = odd: a_j = iter, b_j = x */
738 /* A,B_j-2 are in nu1, de1; A,B_j-1 in nu0,de0 */
739 nu1 = x * nu0 + (double) iter * nu1;
740 de1 = x * de0 + (double) iter * de1;
750 /* check for convergence */
751 if (fabs((nu1-oldp)/nu1) < 1.e-7)
752 return nu1 * exp(a * log(x) - x - Gammln(a));
756 Die("IncompleteGamma(): failed to converge using continued fraction approx");
760 double p; /* current sum */
761 double val; /* current value used in sum */
763 /* For x <= a+1 we use a convergent series instead:
764 * P(a,x) = \frac{\gamma(a,x)}{\Gamma(a)},
766 * \gamma(a,x) = e^{-x}x^a \sum_{n=0}{\infty} \frac{\Gamma{a}}{\Gamma{a+1+n}} x^n
767 * which looks appalling but the sum is in fact rearrangeable to
768 * a simple series without the \Gamma functions:
769 * = \frac{1}{a} + \frac{x}{a(a+1)} + \frac{x^2}{a(a+1)(a+2)} ...
770 * and it's obvious that this should converge nicely for x <= a+1.
774 for (iter = 1; iter < 10000; iter++)
776 val *= x / (a+(double)iter);
779 if (fabs(val/p) < 1.e-7)
780 return 1. - p * exp(a * log(x) - x - Gammln(a));
782 Die("IncompleteGamma(): failed to converge using series approx");