forester/archive/RIO/others/hmmer/src/mathsupport.c

   1 /************************************************************
   2  * HMMER - Biological sequence analysis with profile HMMs
   3  * Copyright (C) 1992-1999 Washington University School of Medicine
   4  * All Rights Reserved
   5  *
   6  *     This source code is distributed under the terms of the
   7  *     GNU General Public License. See the files COPYING and LICENSE
   8  *     for details.
   9  ************************************************************/
  10
  11
  12 /* mathsupport.c
  13  * SRE, Mon Nov 11 15:07:33 1996
  14  *
  15  * Miscellaneous mathematical functions.
  16  * General functions are in the SQUID library sre_math.c.
  17  * These functions are too HMM-specific to warrant being in the
  18  * SQUID library.
  19  *
  20  */
  21
  22
  23 #include <math.h>
  24 #include <float.h>
  25 #ifdef HMMER_THREADS
  26 #include <pthread.h>
  27 #endif
  28 #include "funcs.h"
  29 #include "config.h"
  30 #include "structs.h"
  31 #include "squid.h"
  32
  33 /* Function: Prob2Score()
  34  *
  35  * Purpose:  Convert a probability to a scaled integer log_2 odds score.
  36  *           Round to nearest integer (i.e. note use of +0.5 and floor())
  37  *           Return the score.
  38  */
  39 int
  40 Prob2Score(float p, float null)
  41 {
  42   if   (p == 0.0) return -INFTY;
  43   else            return (int) floor(0.5 + INTSCALE * sreLOG2(p/null));
  44 }
  45
  46 /* Function: Score2Prob()
  47  *
  48  * Purpose:  Convert an integer log_2 odds score back to a probability;
  49  *           needs the null model probability, if any, to do the conversion.
  50  */
  51 float
  52 Score2Prob(int sc, float null)
  53 {
  54   if (sc == -INFTY) return 0.;
  55   else              return (null * sreEXP2((float) sc / INTSCALE));
  56 }
  57
  58
  59 /* Function: Scorify()
  60  *
  61  * Purpose:  Convert a scaled integer log-odds score to a floating
  62  *           point score for output. (could be a macro but who cares.)
  63  */
  64 float
  65 Scorify(int sc)
  66 {
  67   return ((float) sc / INTSCALE);
  68 }
  69
  70
  71 /* Function: PValue()
  72  * Date:     SRE, Mon Oct 27 12:21:02 1997 [Sanger Centre, UK]
  73  *
  74  * Purpose:  Convert an HMM score to a P-value.
  75  *           We know P(S>x) is bounded by 1 / (1 + exp_2^x) for a bit score of x.
  76  *           We can also use EVD parameters for a tighter bound if we have
  77  *           them available.
  78  *
  79  * Args:     hmm - model structure, contains EVD parameters
  80  *           sc  - score in bits
  81  *
  82  * Returns:  P value for score significance.
  83  */
  84 double
  85 PValue(struct plan7_s *hmm, float sc)
  86 {
  87   double pval;
  88   double pval2;
  89                                 /* the bound from Bayes */
  90   if (sc >= sreLOG2(DBL_MAX)) pval = 0.0;
  91   else                        pval = 1. / (1.+sreEXP2(sc));
  92
  93                                 /* try for a better estimate from EVD fit */
  94   if (hmm != NULL && (hmm->flags & PLAN7_STATS))
  95     {
  96       pval2 = ExtremeValueP(sc, hmm->mu, hmm->lambda);
  97       if (pval2 < pval) pval = pval2;
  98     }
  99   return pval;
 100 }
 101
 102 /* Function: LogSum()
 103  *
 104  * Purpose:  Returns the log of the sum of two log probabilities.
 105  *           log(exp(p1)+exp(p2)) = p1 + log(1 + exp(p2-p1)) for p1 > p2
 106  *           Note that this is in natural log space, not log_2.
 107  */
 108 float
 109 LogSum(float p1, float p2)
 110 {
 111   if (p1 > p2)
 112     return (p1-p2 > 50.) ? p1 + log(1. + exp(p2-p1)) : p1;
 113   else
 114     return (p2-p1 > 50.) ? p2 + log(1. + exp(p1-p2)) : p2;
 115 }
 116
 117
 118 /* Function: ILogsum()
 119  *
 120  * Purpose:  Return the scaled integer log probability of
 121  *           the sum of two probabilities p1 and p2, where
 122  *           p1 and p2 are also given as scaled log probabilities.
 123  *
 124  *           log(exp(p1)+exp(p2)) = p1 + log(1 + exp(p2-p1)) for p1 > p2
 125  *
 126  *           For speed, builds a lookup table the first time it's called.
 127  *           LOGSUM_TBL is set to 20000 by default, in config.h.
 128  *
 129  *           Because of the one-time initialization, we have to
 130  *           be careful in a multithreaded implementation... hence
 131  *           the use of pthread_once(), which forces us to put
 132  *           the initialization routine and the lookup table outside
 133  *           ILogsum(). (Thanks to Henry Gabb at Intel for pointing
 134  *           out this problem.)
 135  *
 136  * Args:     p1,p2 -- scaled integer log_2 probabilities to be summed
 137  *                    in probability space.
 138  *
 139  * Return:   scaled integer log_2 probability of the sum.
 140  */
 141 static int ilogsum_lookup[LOGSUM_TBL];
 142 static void
 143 init_ilogsum(void)
 144 {
 145   int i;
 146   for (i = 0; i < LOGSUM_TBL; i++)
 147     ilogsum_lookup[i] = (int) (INTSCALE * 1.44269504 *
 148            (log(1.+exp(0.69314718 * (float) -i/INTSCALE))));
 149 }
 150 int
 151 ILogsum(int p1, int p2)
 152 {
 153   int    diff;
 154 #ifdef HMMER_THREADS
 155   static pthread_once_t firsttime = PTHREAD_ONCE_INIT;
 156   pthread_once(&firsttime, init_ilogsum);
 157 #else
 158   static int firsttime = 1;
 159   if (firsttime) { init_ilogsum(); firsttime = 0; }
 160 #endif
 161
 162   diff = p1-p2;
 163   if      (diff >=  LOGSUM_TBL) return p1;
 164   else if (diff <= -LOGSUM_TBL) return p2;
 165   else if (diff > 0)            return p1 + ilogsum_lookup[diff];
 166   else                          return p2 + ilogsum_lookup[-diff];
 167 }
 168
 169 /* Function: LogNorm()
 170  *
 171  * Purpose:  Normalize a vector of log likelihoods, changing it
 172  *           to a probability vector. Be careful of overflowing exp().
 173  *           Implementation adapted from Graeme Mitchison.
 174  *
 175  * Args:     vec - vector destined to become log probabilities
 176  *           n   - length of vec
 177  */
 178 void
 179 LogNorm(float *vec, int n)
 180 {
 181   int   x;
 182   float max   = -1.0e30;
 183   float denom = 0.;
 184
 185   for (x = 0; x < n; x++)
 186     if (vec[x] > max) max = vec[x];
 187   for (x = 0; x < n; x++)
 188     if (vec[x] > max - 50.)
 189       denom += exp(vec[x] - max);
 190   for (x = 0; x < n; x++)
 191     if (vec[x] > max - 50.)
 192       vec[x] = exp(vec[x] - max) / denom;
 193     else
 194       vec[x] = 0.0;
 195 }
 196
 197
 198 /* Function: Logp_cvec()
 199  *
 200  * Purpose:  Calculates ln P(cvec|dirichlet), the log probability of a
 201  *           count vector given a Dirichlet distribution. Adapted
 202  *           from an implementation by Graeme Mitchison.
 203  *
 204  * Args:     cvec  - count vector
 205  *           n     - length of cvec
 206  *           alpha - Dirichlet alpha terms
 207  *
 208  * Return:   log P(cvec|dirichlet)
 209  */
 210 float
 211 Logp_cvec(float *cvec, int n, float *alpha)
 212 {
 213   float lnp;                   /* log likelihood of P(cvec | Dirichlet) */
 214   float sum1, sum2, sum3;
 215   int   x;
 216
 217   sum1 = sum2 = sum3 = lnp = 0.0;
 218   for (x = 0; x < n; x++)
 219     {
 220       sum1 += cvec[x] + alpha[x];
 221       sum2 += alpha[x];
 222       sum3 += cvec[x];
 223       lnp  += Gammln(alpha[x] + cvec[x]);
 224       lnp  -= Gammln(cvec[x] + 1.);
 225       lnp  -= Gammln(alpha[x]);
 226     }
 227   lnp -= Gammln(sum1);
 228   lnp += Gammln(sum2);
 229   lnp += Gammln(sum3 + 1.);
 230   return lnp;
 231 }
 232
 233 /* Function: SampleDirichlet()
 234  *
 235  * Purpose:  Given a Dirichlet distribution defined by
 236  *           a vector of n alpha terms, sample of probability
 237  *           distribution of dimension n.
 238  *
 239  *           This code was derived from source provided
 240  *           by Betty Lazareva, from Gary Churchill's group.
 241  *
 242  * Args:     alpha - vector of Dirichlet alphas components
 243  *           n     - number of components
 244  *           ret_p - RETURN: sampled probability vector.
 245  *
 246  * Return:   (void)
 247  *           ret_p, an n-dimensional array alloced by the caller,
 248  *           is filled.
 249  */
 250 void
 251 SampleDirichlet(float *alpha, int n, float *p)
 252 {
 253   int    x;
 254
 255   for (x = 0; x < n; x++)
 256     p[x] = SampleGamma(alpha[x]);
 257   FNorm(p, n);
 258 }
 259
 260
 261 /* Function: SampleGamma()
 262  *
 263  * Purpose:  Return a random deviate distributed as Gamma(alpha, 1.0).
 264  *           Uses two different accept/reject algorithms, one
 265  *           for 0<alpha<1, the other for 1<=alpha.
 266  *
 267  *           Code modified from source provided by Betty Lazareva
 268  *           and Gary Churchill.
 269  *
 270  * Args:     alpha - order of gamma function
 271  *
 272  * Return:   the gamma-distributed deviate.
 273  */
 274 float
 275 SampleGamma(float alpha)
 276 {
 277   float U,V,X,W,lambda;
 278
 279   if (alpha >= 1.0)
 280     {
 281       /*CONSTCOND*/ while (1)
 282         {
 283           lambda = sqrt(2.0*alpha -1.0);
 284           U = sre_random();
 285           V = U/(1-U);
 286           X = alpha * pow(V, 1/lambda);
 287           W = .25*exp(-X+alpha)*pow(V,1.0+alpha/lambda)*pow(1.0+1.0/V, 2.0);
 288           if (sre_random() <= W)
 289             return X;
 290         }
 291     }
 292   else if (alpha > 0.0)
 293     {
 294       /*CONSTCOND*/ while (1)
 295         {
 296           U = sre_random();
 297           V = U*(1+ alpha/exp(1.0));
 298           if (V > 1.0)
 299             {
 300               X = -log( (1-V+alpha/exp(1.0))/alpha);
 301               if (sre_random() <= pow(X, alpha-1.0))
 302                 return X;
 303             }
 304           else
 305             {
 306               X = pow(V,1.0/alpha);
 307               if (sre_random() <= exp(-X))
 308                 return X;
 309             }
 310         }
 311     }
 312   Die("Invalid argument alpha < 0.0 to SampleGamma()");
 313   /*NOTREACHED*/
 314   return 0.0;
 315 }
 316
 317 /* Function: SampleCountvector()
 318  *
 319  * Purpose:  Given a probability vector p of dimensionality
 320  *           n, sample c counts and store them in cvec.
 321  *           cvec is n-dimensional and is alloced by the caller.
 322  */
 323 void
 324 SampleCountvector(float *p, int n, int c, float *cvec)
 325 {
 326   int i;
 327
 328   FSet(cvec, n, 0.0);
 329   for (i = 0; i < c; i++)
 330     cvec[FChoose(p,n)] += 1.0;
 331 }
 332
 333
 334
 335 /* Function: P_PvecGivenDirichlet()
 336  *
 337  * Purpose:  Calculate the log probability of a probability
 338  *           vector given a single Dirichlet component, alpha.
 339  *           Follows Sjolander (1996) appendix, lemma 2.
 340  *
 341  * Return:   log P(p | alpha)
 342  */
 343 float
 344 P_PvecGivenDirichlet(float *p, int n, float *alpha)
 345 {
 346   float sum;                    /* for Gammln(|alpha|) in Z     */
 347   float logp;                   /* RETURN: log P(p|alpha)       */
 348   int x;
 349
 350   sum = logp = 0.0;
 351   for (x = 0; x < n; x++)
 352     if (p[x] > 0.0)             /* any param that is == 0.0 doesn't exist */
 353       {
 354         logp += (alpha[x]-1.0) * log(p[x]);
 355         logp -= Gammln(alpha[x]);
 356         sum  += alpha[x];
 357       }
 358   logp += Gammln(sum);
 359   return logp;
 360 }
 361
 362