1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
13 * SRE, Mon Nov 11 15:07:33 1996
15 * Miscellaneous mathematical functions.
16 * General functions are in the SQUID library sre_math.c.
17 * These functions are too HMM-specific to warrant being in the
33 /* Function: Prob2Score()
35 * Purpose: Convert a probability to a scaled integer log_2 odds score.
36 * Round to nearest integer (i.e. note use of +0.5 and floor())
40 Prob2Score(float p, float null)
42 if (p == 0.0) return -INFTY;
43 else return (int) floor(0.5 + INTSCALE * sreLOG2(p/null));
46 /* Function: Score2Prob()
48 * Purpose: Convert an integer log_2 odds score back to a probability;
49 * needs the null model probability, if any, to do the conversion.
52 Score2Prob(int sc, float null)
54 if (sc == -INFTY) return 0.;
55 else return (null * sreEXP2((float) sc / INTSCALE));
59 /* Function: Scorify()
61 * Purpose: Convert a scaled integer log-odds score to a floating
62 * point score for output. (could be a macro but who cares.)
67 return ((float) sc / INTSCALE);
72 * Date: SRE, Mon Oct 27 12:21:02 1997 [Sanger Centre, UK]
74 * Purpose: Convert an HMM score to a P-value.
75 * We know P(S>x) is bounded by 1 / (1 + exp_2^x) for a bit score of x.
76 * We can also use EVD parameters for a tighter bound if we have
79 * Args: hmm - model structure, contains EVD parameters
82 * Returns: P value for score significance.
85 PValue(struct plan7_s *hmm, float sc)
89 /* the bound from Bayes */
90 if (sc >= sreLOG2(DBL_MAX)) pval = 0.0;
91 else pval = 1. / (1.+sreEXP2(sc));
93 /* try for a better estimate from EVD fit */
94 if (hmm != NULL && (hmm->flags & PLAN7_STATS))
96 pval2 = ExtremeValueP(sc, hmm->mu, hmm->lambda);
97 if (pval2 < pval) pval = pval2;
102 /* Function: LogSum()
104 * Purpose: Returns the log of the sum of two log probabilities.
105 * log(exp(p1)+exp(p2)) = p1 + log(1 + exp(p2-p1)) for p1 > p2
106 * Note that this is in natural log space, not log_2.
109 LogSum(float p1, float p2)
112 return (p1-p2 > 50.) ? p1 + log(1. + exp(p2-p1)) : p1;
114 return (p2-p1 > 50.) ? p2 + log(1. + exp(p1-p2)) : p2;
118 /* Function: ILogsum()
120 * Purpose: Return the scaled integer log probability of
121 * the sum of two probabilities p1 and p2, where
122 * p1 and p2 are also given as scaled log probabilities.
124 * log(exp(p1)+exp(p2)) = p1 + log(1 + exp(p2-p1)) for p1 > p2
126 * For speed, builds a lookup table the first time it's called.
127 * LOGSUM_TBL is set to 20000 by default, in config.h.
129 * Because of the one-time initialization, we have to
130 * be careful in a multithreaded implementation... hence
131 * the use of pthread_once(), which forces us to put
132 * the initialization routine and the lookup table outside
133 * ILogsum(). (Thanks to Henry Gabb at Intel for pointing
136 * Args: p1,p2 -- scaled integer log_2 probabilities to be summed
137 * in probability space.
139 * Return: scaled integer log_2 probability of the sum.
141 static int ilogsum_lookup[LOGSUM_TBL];
146 for (i = 0; i < LOGSUM_TBL; i++)
147 ilogsum_lookup[i] = (int) (INTSCALE * 1.44269504 *
148 (log(1.+exp(0.69314718 * (float) -i/INTSCALE))));
151 ILogsum(int p1, int p2)
155 static pthread_once_t firsttime = PTHREAD_ONCE_INIT;
156 pthread_once(&firsttime, init_ilogsum);
158 static int firsttime = 1;
159 if (firsttime) { init_ilogsum(); firsttime = 0; }
163 if (diff >= LOGSUM_TBL) return p1;
164 else if (diff <= -LOGSUM_TBL) return p2;
165 else if (diff > 0) return p1 + ilogsum_lookup[diff];
166 else return p2 + ilogsum_lookup[-diff];
169 /* Function: LogNorm()
171 * Purpose: Normalize a vector of log likelihoods, changing it
172 * to a probability vector. Be careful of overflowing exp().
173 * Implementation adapted from Graeme Mitchison.
175 * Args: vec - vector destined to become log probabilities
179 LogNorm(float *vec, int n)
185 for (x = 0; x < n; x++)
186 if (vec[x] > max) max = vec[x];
187 for (x = 0; x < n; x++)
188 if (vec[x] > max - 50.)
189 denom += exp(vec[x] - max);
190 for (x = 0; x < n; x++)
191 if (vec[x] > max - 50.)
192 vec[x] = exp(vec[x] - max) / denom;
198 /* Function: Logp_cvec()
200 * Purpose: Calculates ln P(cvec|dirichlet), the log probability of a
201 * count vector given a Dirichlet distribution. Adapted
202 * from an implementation by Graeme Mitchison.
204 * Args: cvec - count vector
206 * alpha - Dirichlet alpha terms
208 * Return: log P(cvec|dirichlet)
211 Logp_cvec(float *cvec, int n, float *alpha)
213 float lnp; /* log likelihood of P(cvec | Dirichlet) */
214 float sum1, sum2, sum3;
217 sum1 = sum2 = sum3 = lnp = 0.0;
218 for (x = 0; x < n; x++)
220 sum1 += cvec[x] + alpha[x];
223 lnp += Gammln(alpha[x] + cvec[x]);
224 lnp -= Gammln(cvec[x] + 1.);
225 lnp -= Gammln(alpha[x]);
229 lnp += Gammln(sum3 + 1.);
233 /* Function: SampleDirichlet()
235 * Purpose: Given a Dirichlet distribution defined by
236 * a vector of n alpha terms, sample of probability
237 * distribution of dimension n.
239 * This code was derived from source provided
240 * by Betty Lazareva, from Gary Churchill's group.
242 * Args: alpha - vector of Dirichlet alphas components
243 * n - number of components
244 * ret_p - RETURN: sampled probability vector.
247 * ret_p, an n-dimensional array alloced by the caller,
251 SampleDirichlet(float *alpha, int n, float *p)
255 for (x = 0; x < n; x++)
256 p[x] = SampleGamma(alpha[x]);
261 /* Function: SampleGamma()
263 * Purpose: Return a random deviate distributed as Gamma(alpha, 1.0).
264 * Uses two different accept/reject algorithms, one
265 * for 0<alpha<1, the other for 1<=alpha.
267 * Code modified from source provided by Betty Lazareva
268 * and Gary Churchill.
270 * Args: alpha - order of gamma function
272 * Return: the gamma-distributed deviate.
275 SampleGamma(float alpha)
277 float U,V,X,W,lambda;
281 /*CONSTCOND*/ while (1)
283 lambda = sqrt(2.0*alpha -1.0);
286 X = alpha * pow(V, 1/lambda);
287 W = .25*exp(-X+alpha)*pow(V,1.0+alpha/lambda)*pow(1.0+1.0/V, 2.0);
288 if (sre_random() <= W)
292 else if (alpha > 0.0)
294 /*CONSTCOND*/ while (1)
297 V = U*(1+ alpha/exp(1.0));
300 X = -log( (1-V+alpha/exp(1.0))/alpha);
301 if (sre_random() <= pow(X, alpha-1.0))
306 X = pow(V,1.0/alpha);
307 if (sre_random() <= exp(-X))
312 Die("Invalid argument alpha < 0.0 to SampleGamma()");
317 /* Function: SampleCountvector()
319 * Purpose: Given a probability vector p of dimensionality
320 * n, sample c counts and store them in cvec.
321 * cvec is n-dimensional and is alloced by the caller.
324 SampleCountvector(float *p, int n, int c, float *cvec)
329 for (i = 0; i < c; i++)
330 cvec[FChoose(p,n)] += 1.0;
335 /* Function: P_PvecGivenDirichlet()
337 * Purpose: Calculate the log probability of a probability
338 * vector given a single Dirichlet component, alpha.
339 * Follows Sjolander (1996) appendix, lemma 2.
341 * Return: log P(p | alpha)
344 P_PvecGivenDirichlet(float *p, int n, float *alpha)
346 float sum; /* for Gammln(|alpha|) in Z */
347 float logp; /* RETURN: log P(p|alpha) */
351 for (x = 0; x < n; x++)
352 if (p[x] > 0.0) /* any param that is == 0.0 doesn't exist */
354 logp += (alpha[x]-1.0) * log(p[x]);
355 logp -= Gammln(alpha[x]);