sources/multicoil/hmm.c

   1 /* Written by David Wilson */
   2 /******************************************************************************
   3   Operations for evaluating probabilities pertaining to a protein sequence.
   4   The sequence is specified as a character string char *seq and a length.
   5 ******************************************************************************/
   6 #include "scconst.h"
   7 #include <stdio.h>
   8 #include "sc2seq.h"
   9 #include "hmm.h"
  10
  11 #define MALLOC(NUMBER,TYPE) ((TYPE*)Malloc((NUMBER)*sizeof(TYPE)))
  12 #define CALLOC(NUMBER,TYPE) ((TYPE*)Calloc((NUMBER),sizeof(TYPE)))
  13 #define REALLOC(PTR,NUMBER,TYPE) ((TYPE*)Realloc((PTR),(NUMBER)*sizeof(TYPE)))
  14 #define FREE(PTR) free(PTR)
  15
  16
  17 /*********The following are defined in sc2seq_interface.c **************/
  18
  19 extern double many_pprobs[MAX_TABLE_NUMBER][NUM_RES_TYPE][POSNUM],
  20          many_pprobp[MAX_TABLE_NUMBER][NUM_RES_TYPE][NUM_RES_TYPE][POSNUM][POSNUM];
  21
  22
  23 extern double gprobs[NUM_RES_TYPE], gprobp[NUM_RES_TYPE][NUM_RES_TYPE][POSNUM];
  24
  25 extern double (*pprobs)[POSNUM];
  26 extern double (*pprobp)[NUM_RES_TYPE][POSNUM][POSNUM];
  27
  28
  29
  30 /************************************************************************/
  31
  32
  33
  34 void *Malloc(), *Calloc(), *Realloc(), Free();
  35 /******************************************************************************
  36   Given a sequence of normalized probabilities, alpha (beta) gives the
  37   normalized probability that one sees the prefix upto (suffix from) the
  38   Tth state, given this state is q.  ;; Works even if alpha (beta) == nprobs.
  39   The alpha and beta arrays are indexed as the nprobs array is.
  40 ******************************************************************************/
  41 Alphas (double *alpha, double *nprob, int length, HMM *hmm)
  42 {
  43   int i,t,n;
  44   double *alphat, *nprobt;
  45   n = hmm->nstates;
  46   bzero(alpha,length*n*sizeof(double));
  47   for (i=n-1; i>=0; --i)
  48     alpha[i] = hmm->initprob[i] * nprob[i];
  49   for (t=1; t<length; ++t) {
  50     alphat = &(alpha[n]);
  51     nprobt = &(nprob[n]);
  52     for (i=hmm->ntrans-1; i>=0; --i)
  53       alphat[hmm->transt[i]] += alpha[hmm->transf[i]] * hmm->transprob[i];
  54     for (i=n-1; i>=0; --i)
  55       alphat[i] *= nprobt[i];
  56     alpha = alphat;
  57     nprob = nprobt;
  58   }
  59 }
  60 Betas  (double  *beta, double *nprob, int length, HMM *hmm)
  61 {
  62   int i,t,n;
  63   double *betat, *nprobt;
  64   n = hmm->nstates;
  65   bzero(beta,length*n*sizeof(double));
  66   betat  = &(beta [n*(length-1)]);
  67   nprobt = &(nprob[n*(length-1)]);
  68   for (i=n-1; i>=0; --i)
  69     betat[i] = hmm->finprob[i] * nprobt[i];
  70   for (t=length-2; t>=0; --t) {
  71     beta  = &(betat [-n]);
  72     nprob = &(nprobt[-n]);
  73     for (i=hmm->ntrans-1; i>=0; --i)
  74       beta[hmm->transf[i]] += betat[hmm->transt[i]] * hmm->transprob[i];
  75     for (i=n-1; i>=0; --i)
  76       beta[i] *= nprob[i];
  77     betat  = beta;
  78     nprobt = nprob;
  79   }
  80 }
  81
  82
  83 /******************************************************************************
  84   Given a sequence, for each residue for each state output the probability
  85   that this residue was in that state.
  86 ******************************************************************************/
  87 /* Contains a static variable pointing to a dynamically allocated array of
  88    doubles, and a static variable giving the length of the previous array size
  89    (initially 0).  Memory is needed for 4*nstates*length.  This value is
  90    rounded up to the nearest power of 2 when determining how much memory to
  91    allocate.
  92    */
  93 double *Memory (amount)
  94 {
  95   static double *memory=(void*)0;
  96   static int memsize=0;
  97   if (!memsize) {
  98     memsize = amount;
  99     memory = MALLOC(memsize,double);
 100   }
 101   if (memsize<amount) {
 102     for (memsize=1; memsize<amount; memsize *= 2);
 103     memory = REALLOC(memory,memsize,double);
 104   }
 105   return memory;
 106 }
 107 #define AlphaBeta(i) (nprob[i] ? alpha[i]*beta[i]/nprob[i] : 0)
 108
 109
 110
 111 /* Score sequence using the hidden Markov model */
 112 /** Note that prob use is determined by extern setting of pprobs, pprobp */
 113 void HMMScore (char *seq, int length, HMM *hmm,
 114                 double scores[MAXSEQLEN][POSNUM+1], int table,
 115                int offset_to_use)
 116 {
 117   int n, i,t,j;
 118   double *memory, *alpha, *beta, *nprob, *gamma, sum;
 119
 120
 121   n = hmm->nstates;
 122   memory = Memory(4*n*length);
 123   gamma = &(memory[0]);
 124   alpha = &(memory[1*n*length]);
 125   beta  = &(memory[2*n*length]);
 126   nprob = &(memory[3*n*length]);
 127   hmm->outprobs(nprob,seq,length);
 128   Alphas(alpha,nprob,length,hmm);
 129   Betas(beta,nprob,length,hmm);
 130   for (t=0; t<length; ++t) {
 131     sum = 0;
 132     for (i=n-1; i>=0; --i)
 133       sum += gamma[i] = AlphaBeta(i);
 134     for (i=n-1; i>=0; --i)
 135       gamma[i] /= sum;
 136     gamma = &(gamma[n]);
 137     alpha = &(alpha[n]);
 138     beta  = &(beta[n]);
 139     nprob = &(nprob[n]);
 140   }
 141
 142   for (i=0; i<length; i++) {
 143     scores[i][POSNUM] =0;
 144     for (j=0; j<POSNUM; j++) {
 145       scores[i][j] = memory[8*i +j];
 146       if (scores[i][j] > scores[i][POSNUM])
 147         scores[i][POSNUM] = scores[i][j];
 148     }
 149     if ((offset_to_use<=6) && (offset_to_use >=0) )
 150       scores[i][POSNUM] = scores[i][(i+offset_to_use)%POSNUM];
 151   }
 152 }
 153
 154 CountTransitions (char *seq, int length, HMM *hmm, double *tbl, double *start, double *end, double  *steady)
 155 {
 156   int n, m, i,t;
 157   double *memory, *alpha, *beta, *at, *bt1, *nprob, sum=0, *contrib, econtrib;
 158   n = hmm->nstates;
 159   m = hmm->ntrans;
 160   memory = Memory(3*n*length+n);
 161   alpha =  &(memory[0*n*length]);
 162   beta  =  &(memory[1*n*length]);
 163   nprob =  &(memory[2*n*length]);
 164   contrib= &(memory[3*n*length]);
 165   hmm->outprobs(nprob,seq,length);
 166   Alphas(alpha,nprob,length,hmm);
 167   Betas(beta,nprob,length,hmm);
 168   for (sum=0,i=0; i<n; ++i)
 169     sum += beta[i] * hmm->initprob[i];
 170   for (i=0; i<m; ++i) {
 171     econtrib = 0;
 172     at = alpha; bt1 = &(beta[n]);
 173     for (t=0; t+1<length; ++t) {
 174       econtrib += at[hmm->transf[i]] * hmm->transprob[i] * bt1[hmm->transt[i]];
 175       at  = &(at[n]);
 176       bt1 = &(bt1[n]);
 177     }
 178     tbl[i] += econtrib / sum;
 179   }
 180   bzero(contrib,n*sizeof(double));
 181   for (i=n-1; i>=0; --i)
 182     start[i] += AlphaBeta(i)/sum;
 183   for (t=0; t<length; ++t) {
 184     for (i=n-1; i>=0; --i)
 185       contrib[i] += AlphaBeta(i);
 186     alpha = &(alpha[n]);
 187     beta  = &(beta[n]);
 188     nprob = &(nprob[n]);
 189   }
 190   for (i=n-1; i>=0; --i)
 191     steady[i] += contrib[i]/sum;
 192   alpha = &(alpha[-n]);
 193   beta  = &(beta[-n]);
 194   nprob = &(nprob[-n]);
 195   for (i=n-1; i>=0; --i)
 196     end[i] += AlphaBeta(i)/sum;
 197 }
 198
 199
 200
 201 #include "scconst.h"
 202 #include <stdio.h>
 203 #include "scio.h"
 204 #include <math.h>
 205
 206
 207 context (double *nprobs, char *seq, int length)
 208 {
 209   double lnprob;
 210   int i, j, dist;
 211   int x, y, xoffs, yoffs;
 212
 213   bzero(nprobs,length*8*sizeof(double));
 214
 215   /* Pairwise interactions */
 216   for (xoffs=0; xoffs<8; ++xoffs)
 217     for (dist=1; dist<=4; dist+=3) {
 218       yoffs = (xoffs<7) ? (xoffs+dist)%7 : xoffs;
 219       for (x = 0, y = x+dist; y < length; x++, y++) {
 220         lnprob = (xoffs<7) ? (pprobp[seq[x]][seq[y]][xoffs][yoffs] -
 221                               pprobs[seq[x]][xoffs] -
 222                               pprobs[seq[y]][yoffs])
 223           : (gprobp[seq[x]][seq[y]][dist-1] - gprobs[seq[x]] - gprobs[seq[y]]);
 224         nprobs[8*x+xoffs] += lnprob;
 225         nprobs[8*y+yoffs] += lnprob;
 226       }
 227     }
 228
 229   /* Singles component */
 230   for (xoffs=0; xoffs<8; ++xoffs)
 231     for (x = 0; x < length; x++) {
 232       nprobs[8*x+xoffs] /= 4;
 233       nprobs[8*x+xoffs] += (xoffs<7) ? pprobs[seq[x]][xoffs] : gprobs[seq[x]];
 234     }
 235   /* Now normalize */
 236   for (xoffs=0; xoffs<8; ++xoffs)
 237     for (x = 0; x < length; x++)
 238       nprobs[8*x+xoffs] -= nprobs[8*x+7];
 239   /* Make position 'f' like genbank, and don't use 'b' and 'c' as much */
 240 /*  for (x = 0; x < length; x++) {
 241     nprobs[8*x+5] /= 2;
 242     nprobs[8*x+1] /= 2;
 243     nprobs[8*x+2] /= 2;
 244   }*/
 245   /* Convert from log */
 246   for (xoffs=0; xoffs<8; ++xoffs)
 247     for (x = 0; x < length; x++)
 248       nprobs[8*x+xoffs] = exp(nprobs[8*x+xoffs]);
 249 /*
 250       if ((nprobs[8*x+xoffs] = exp(nprobs[8*x+xoffs]))>10)
 251         printf("Check out position %d, %c%c%c\n",x,
 252                numaa(seq[x-1]), numaa(seq[x]), numaa(seq[x+1]));
 253 */
 254 }
 255
 256
 257 /* The purpose of the next function to allow the new scoring method to be
 258    easily interfaced with the old code so that histograms and the like may
 259    be made.  The long term future of this function is not a concern.
 260    */
 261
 262 /* Create the CC HMM given global variable prob. functions. */
 263 HMM *CCHMM (FILE *fgin, FILE *fpin)
 264 {
 265   int i,j;
 266   double *initprob, *finprob, *transprob;
 267   int *transf, *transt;
 268   HMM *hmm;
 269   double FractionInCCs = 0.02;
 270   double AverageCCLength = 40.0;
 271   double CC, Gen, GenGen, GenCC, CCGen, CCCC, CCNewPhase;
 272   double CCShift[7];
 273
 274   CC = FractionInCCs;
 275   GenCC = FractionInCCs/AverageCCLength;
 276   CCNewPhase = GenCC;
 277   CCGen = 1/AverageCCLength - (POSNUM-1)*CCNewPhase;
 278   CCCC = 1-1/AverageCCLength;
 279   for (i=0; i<7; ++i)
 280     CCShift[i] = CCNewPhase;
 281   CCShift[1] = CCCC;
 282
 283   CC = GenCC;
 284
 285   Gen = 1-CC;
 286   GenGen = 1-GenCC;
 287   CC /= 7;
 288   GenCC /= 7;
 289
 290   initprob = MALLOC(8,double);
 291   finprob = MALLOC(8,double);
 292   transprob = MALLOC(64,double);
 293   transf = MALLOC(64,int);
 294   transt = MALLOC(64,int);
 295 /*  tables = MALLOC(1,ProbTbls); */
 296   for (i=0; i<7; ++i)
 297     initprob[i] = CC;
 298   initprob[7] = Gen;
 299   for (i=0; i<7; ++i)
 300     finprob[i] = CCGen;
 301   finprob[7] = GenGen;
 302   for (i=0; i<8; ++i)
 303     for (j=0; j<8; ++j) {
 304       int e = i*8+j;
 305       transf[e] = i;
 306       transt[e] = j;
 307       transprob[e] = CCShift[(j+7-i)%7];
 308       if (i==7)
 309         transprob[e] = GenCC;
 310       if (j==7)
 311         transprob[e] = CCGen;
 312       if (i==7 && j==7)
 313         transprob[e] = GenGen;
 314     }
 315
 316   hmm = MALLOC(1,HMM);
 317   hmm->nstates = 8;
 318   hmm->initprob = initprob;
 319   hmm->finprob = finprob;
 320   hmm->transf = transf;
 321   hmm->transt = transt;
 322   hmm->ntrans = 64;
 323   hmm->transprob = transprob;
 324   hmm->outprobs = context;
 325   return hmm;
 326 }
 327 /*---------------------------------------------------------------------------*/
 328 /*---------------------------------------------------------------------------*/
 329
 330 void *Calloc (n,elsize)
 331      int n,elsize;
 332 {
 333   void *ptr;
 334   ptr = (void*)calloc(n,elsize);
 335   if (ptr) return ptr;
 336   fprintf(stderr,"Memory shortage!!\n");
 337   exit(-1);
 338 }
 339
 340 void *Realloc (ptr,n)
 341      void *ptr;
 342      int n;
 343 {
 344   ptr = (void*)realloc(ptr,n);
 345   if (ptr) return ptr;
 346   fprintf(stderr,"Memory shortage!!\n");
 347   exit(-1);
 348 }
 349
 350
 351