forester/archive/RIO/others/hmmer/src/plan7.c

   1 /************************************************************
   2  * HMMER - Biological sequence analysis with profile HMMs
   3  * Copyright (C) 1992-1999 Washington University School of Medicine
   4  * All Rights Reserved
   5  *
   6  *     This source code is distributed under the terms of the
   7  *     GNU General Public License. See the files COPYING and LICENSE
   8  *     for details.
   9  ************************************************************/
  10
  11
  12 /* plan7.c
  13  * SRE, Sat Nov 16 14:19:56 1996
  14  *
  15  * Support for Plan 7 HMM data structure, plan7_s.
  16  */
  17
  18 #include <stdio.h>
  19 #include <string.h>
  20 #include <ctype.h>
  21 #include <time.h>
  22
  23 #include "funcs.h"
  24 #include "config.h"
  25 #include "structs.h"
  26 #include "squid.h"
  27
  28 /* Functions: AllocPlan7(), AllocPlan7Shell(), AllocPlan7Body(), FreePlan7()
  29  *
  30  * Purpose:   Allocate or free a Plan7 HMM structure.
  31  *            Can either allocate all at one (AllocPlan7()) or
  32  *            in two steps (AllocPlan7Shell(), AllocPlan7Body()).
  33  *            The two step method is used in hmmio.c where we start
  34  *            parsing the header of an HMM file but don't
  35  *            see the size of the model 'til partway thru the header.
  36  */
  37 struct plan7_s *
  38 AllocPlan7(int M)
  39 {
  40   struct plan7_s *hmm;
  41
  42   hmm = AllocPlan7Shell();
  43   AllocPlan7Body(hmm, M);
  44   return hmm;
  45 }
  46 struct plan7_s *
  47 AllocPlan7Shell(void)
  48 {
  49   struct plan7_s *hmm;
  50
  51   hmm    = (struct plan7_s *) MallocOrDie (sizeof(struct plan7_s));
  52   hmm->M = 0;
  53
  54   hmm->name     = NULL;
  55   hmm->acc      = NULL;
  56   hmm->desc     = NULL;
  57   hmm->rf       = NULL;
  58   hmm->cs       = NULL;
  59   hmm->ca       = NULL;
  60   hmm->comlog   = NULL;
  61   hmm->nseq     = 0;
  62   hmm->ctime    = NULL;
  63   hmm->map      = NULL;
  64   hmm->checksum = 0;
  65
  66   hmm->tpri = NULL;
  67   hmm->mpri = NULL;
  68   hmm->ipri = NULL;
  69
  70   hmm->ga1 = hmm->ga2 = 0.0;
  71   hmm->tc1 = hmm->tc2 = 0.0;
  72   hmm->nc1 = hmm->nc2 = 0.0;
  73
  74   hmm->t      = NULL;
  75   hmm->tsc    = NULL;
  76   hmm->mat    = NULL;
  77   hmm->ins    = NULL;
  78   hmm->msc    = NULL;
  79   hmm->isc    = NULL;
  80
  81   hmm->begin  = NULL;
  82   hmm->bsc    = NULL;
  83   hmm->end    = NULL;
  84   hmm->esc    = NULL;
  85                                 /* DNA translation is not enabled by default */
  86   hmm->dnam   = NULL;
  87   hmm->dnai   = NULL;
  88   hmm->dna2   = -INFTY;
  89   hmm->dna4   = -INFTY;
  90                         /* statistical parameters set to innocuous empty values */
  91   hmm->mu     = 0.;
  92   hmm->lambda = 0.;
  93
  94   hmm->flags = 0;
  95   return hmm;
  96 }
  97 void
  98 AllocPlan7Body(struct plan7_s *hmm, int M)
  99 {
 100   int k, x;
 101
 102   hmm->M = M;
 103
 104   hmm->rf     = MallocOrDie ((M+2) * sizeof(char));
 105   hmm->cs     = MallocOrDie ((M+2) * sizeof(char));
 106   hmm->ca     = MallocOrDie ((M+2) * sizeof(char));
 107   hmm->map    = MallocOrDie ((M+1) * sizeof(int));
 108
 109   hmm->t      = MallocOrDie (M     *           sizeof(float *));
 110   hmm->tsc    = MallocOrDie (M     *           sizeof(int *));
 111   hmm->mat    = MallocOrDie ((M+1) *           sizeof(float *));
 112   hmm->ins    = MallocOrDie (M     *           sizeof(float *));
 113   hmm->msc    = MallocOrDie (MAXCODE   *       sizeof(int *));
 114   hmm->isc    = MallocOrDie (MAXCODE   *       sizeof(int *));
 115   hmm->t[0]   = MallocOrDie ((7*M)     *       sizeof(float));
 116   hmm->tsc[0] = MallocOrDie ((7*M)     *       sizeof(int));
 117   hmm->mat[0] = MallocOrDie ((MAXABET*(M+1)) * sizeof(float));
 118   hmm->ins[0] = MallocOrDie ((MAXABET*M) *     sizeof(float));
 119   hmm->msc[0] = MallocOrDie ((MAXCODE*(M+1)) * sizeof(int));
 120   hmm->isc[0] = MallocOrDie ((MAXCODE*M) *     sizeof(int));
 121
 122   /* note allocation strategy for important 2D arrays -- trying
 123    * to keep locality as much as possible, cache efficiency etc.
 124    */
 125   for (k = 1; k <= M; k++) {
 126     hmm->mat[k] = hmm->mat[0] + k * MAXABET;
 127     if (k < M) {
 128       hmm->ins[k] = hmm->ins[0] + k * MAXABET;
 129       hmm->t[k]   = hmm->t[0]   + k * 7;
 130       hmm->tsc[k] = hmm->tsc[0] + k * 7;
 131     }
 132   }
 133   for (x = 1; x < MAXCODE; x++) {
 134     hmm->msc[x] = hmm->msc[0] + x * (M+1);
 135     hmm->isc[x] = hmm->isc[0] + x * M;
 136   }
 137   /* tsc[0] is used as a boundary condition sometimes [Viterbi()],
 138    * so set to -inf always.
 139    */
 140   for (x = 0; x < 7; x++)
 141     hmm->tsc[0][x] = -INFTY;
 142
 143   hmm->begin  = MallocOrDie  ((M+1) * sizeof(float));
 144   hmm->bsc    = MallocOrDie  ((M+1) * sizeof(int));
 145   hmm->end    = MallocOrDie  ((M+1) * sizeof(float));
 146   hmm->esc    = MallocOrDie  ((M+1) * sizeof(int));
 147
 148   return;
 149 }
 150
 151
 152 void
 153 FreePlan7(struct plan7_s *hmm)
 154 {
 155   if (hmm->name    != NULL) free(hmm->name);
 156   if (hmm->desc    != NULL) free(hmm->desc);
 157   if (hmm->rf      != NULL) free(hmm->rf);
 158   if (hmm->cs      != NULL) free(hmm->cs);
 159   if (hmm->ca      != NULL) free(hmm->ca);
 160   if (hmm->comlog  != NULL) free(hmm->comlog);
 161   if (hmm->ctime   != NULL) free(hmm->ctime);
 162   if (hmm->map     != NULL) free(hmm->map);
 163   if (hmm->tpri    != NULL) free(hmm->tpri);
 164   if (hmm->mpri    != NULL) free(hmm->mpri);
 165   if (hmm->ipri    != NULL) free(hmm->ipri);
 166   if (hmm->bsc     != NULL) free(hmm->bsc);
 167   if (hmm->begin   != NULL) free(hmm->begin);
 168   if (hmm->esc     != NULL) free(hmm->esc);
 169   if (hmm->end     != NULL) free(hmm->end);
 170   if (hmm->msc     != NULL) free(hmm->msc[0]);
 171   if (hmm->mat     != NULL) free(hmm->mat[0]);
 172   if (hmm->isc     != NULL) free(hmm->isc[0]);
 173   if (hmm->ins     != NULL) free(hmm->ins[0]);
 174   if (hmm->tsc     != NULL) free(hmm->tsc[0]);
 175   if (hmm->t       != NULL) free(hmm->t[0]);
 176   if (hmm->msc     != NULL) free(hmm->msc);
 177   if (hmm->mat     != NULL) free(hmm->mat);
 178   if (hmm->isc     != NULL) free(hmm->isc);
 179   if (hmm->ins     != NULL) free(hmm->ins);
 180   if (hmm->tsc     != NULL) free(hmm->tsc);
 181   if (hmm->t       != NULL) free(hmm->t);
 182   if (hmm->dnam    != NULL) free(hmm->dnam);
 183   if (hmm->dnai    != NULL) free(hmm->dnai);
 184   free(hmm);
 185 }
 186
 187 /* Function: ZeroPlan7()
 188  *
 189  * Purpose:  Zeros the counts/probabilities fields in a model.
 190  *           Leaves null model untouched.
 191  */
 192 void
 193 ZeroPlan7(struct plan7_s *hmm)
 194 {
 195   int k;
 196   for (k = 1; k < hmm->M; k++)
 197     {
 198       FSet(hmm->t[k], 7, 0.);
 199       FSet(hmm->mat[k], Alphabet_size, 0.);
 200       FSet(hmm->ins[k], Alphabet_size, 0.);
 201     }
 202   FSet(hmm->mat[hmm->M], Alphabet_size, 0.);
 203   hmm->tbd1 = 0.;
 204   FSet(hmm->begin+1, hmm->M, 0.);
 205   FSet(hmm->end+1, hmm->M, 0.);
 206   for (k = 0; k < 4; k++)
 207     FSet(hmm->xt[k], 2, 0.);
 208   hmm->flags &= ~PLAN7_HASBITS; /* invalidates scores */
 209   hmm->flags &= ~PLAN7_HASPROB; /* invalidates probabilities */
 210 }
 211
 212
 213 /* Function: Plan7SetName()
 214  *
 215  * Purpose:  Change the name of a Plan7 HMM. Convenience function.
 216  *
 217  * Note:     Trailing whitespace and \n's are chopped.
 218  */
 219 void
 220 Plan7SetName(struct plan7_s *hmm, char *name)
 221 {
 222   if (hmm->name != NULL) free(hmm->name);
 223   hmm->name = Strdup(name);
 224   StringChop(hmm->name);
 225 }
 226 /* Function: Plan7SetAccession()
 227  *
 228  * Purpose:  Change the accession number of a Plan7 HMM. Convenience function.
 229  *
 230  * Note:     Trailing whitespace and \n's are chopped.
 231  */
 232 void
 233 Plan7SetAccession(struct plan7_s *hmm, char *acc)
 234 {
 235   if (hmm->acc != NULL) free(hmm->acc);
 236   hmm->acc = Strdup(acc);
 237   StringChop(hmm->acc);
 238   hmm->flags |= PLAN7_ACC;
 239 }
 240
 241 /* Function: Plan7SetDescription()
 242  *
 243  * Purpose:  Change the description line of a Plan7 HMM. Convenience function.
 244  *
 245  * Note:     Trailing whitespace and \n's are chopped.
 246  */
 247 void
 248 Plan7SetDescription(struct plan7_s *hmm, char *desc)
 249 {
 250   if (hmm->desc != NULL) free(hmm->desc);
 251   hmm->desc = Strdup(desc);
 252   StringChop(hmm->desc);
 253   hmm->flags |= PLAN7_DESC;
 254 }
 255
 256 /* Function: Plan7ComlogAppend()
 257  * Date:     SRE, Wed Oct 29 09:57:30 1997 [TWA 721 over Greenland]
 258  *
 259  * Purpose:  Concatenate command line options and append to the
 260  *           command line log.
 261  */
 262 void
 263 Plan7ComlogAppend(struct plan7_s *hmm, int argc, char **argv)
 264 {
 265   int len;
 266   int i;
 267
 268   /* figure out length of command line, w/ spaces and \n */
 269   len = argc;
 270   for (i = 0; i < argc; i++)
 271     len += strlen(argv[i]);
 272
 273   /* allocate */
 274   if (hmm->comlog != NULL)
 275     {
 276       len += strlen(hmm->comlog);
 277       hmm->comlog = ReallocOrDie(hmm->comlog, sizeof(char)* (len+1));
 278     }
 279   else
 280     {
 281       hmm->comlog = MallocOrDie(sizeof(char)* (len+1));
 282       *(hmm->comlog) = '\0'; /* need this to make strcat work */
 283     }
 284
 285   /* append */
 286   strcat(hmm->comlog, "\n");
 287   for (i = 0; i < argc; i++)
 288     {
 289       strcat(hmm->comlog, argv[i]);
 290       if (i < argc-1) strcat(hmm->comlog, " ");
 291     }
 292 }
 293
 294 /* Function: Plan7SetCtime()
 295  * Date:     SRE, Wed Oct 29 11:53:19 1997 [TWA 721 over the Atlantic]
 296  *
 297  * Purpose:  Set the ctime field in a new HMM to the current time.
 298  */
 299 void
 300 Plan7SetCtime(struct plan7_s *hmm)
 301 {
 302   time_t date = time(NULL);
 303   if (hmm->ctime != NULL) free(hmm->ctime);
 304   hmm->ctime = Strdup(ctime(&date));
 305   StringChop(hmm->ctime);
 306 }
 307
 308
 309 /* Function: Plan7SetNullModel()
 310  *
 311  * Purpose:  Set the null model section of an HMM.
 312  *           Convenience function.
 313  */
 314 void
 315 Plan7SetNullModel(struct plan7_s *hmm, float null[MAXABET], float p1)
 316 {
 317   int x;
 318   for (x = 0; x < Alphabet_size; x++)
 319     hmm->null[x] = null[x];
 320   hmm->p1 = p1;
 321 }
 322
 323
 324 /* Function: P7Logoddsify()
 325  *
 326  * Purpose:  Take an HMM with valid probabilities, and
 327  *           fill in the integer log-odds score section of the model.
 328  *
 329  *    Notes on log-odds scores:
 330  *         type of parameter       probability        score
 331  *         -----------------       -----------        ------
 332  *         any emission             p_x           log_2 p_x/null_x
 333  *             N,J,C /assume/ p_x = null_x so /always/ score zero.
 334  *         transition to emitters   t_x           log_2 t_x/p1
 335  *            (M,I; N,C; J)
 336  *             NN and CC loops are often equal to p1, so usu. score zero.
 337  *         C->T transition          t_x            log_2 t_x/p2
 338  *            often zero, usu. C->T = p2.
 339  *         all other transitions    t_x           log_2 t_x
 340  *             (no null model counterpart, so null prob is 1)
 341  *
 342  *    Notes on entry/exit scores, B->M and M->E:
 343  *         The probability form model includes delete states 1 and M.
 344  *         these states are removed from a search form model to
 345  *         prevent B->D...D->E->J->B mute cycles, which would complicate
 346  *         dynamic programming algorithms. The data-independent
 347  *         S/W B->M and M->E transitions are folded together with
 348  *         data-dependent B->D...D->M and M->D...D->E paths.
 349  *
 350  *         This process is referred to in the code as "wing folding"
 351  *         or "wing retraction"... the analogy is to a swept-wing
 352  *         fighter in landing vs. high speed flight configuration.
 353  *
 354  *    Note on Viterbi vs. forward flag:
 355  *         Wing retraction must take forward vs. Viterbi
 356  *         into account. If forward, sum two paths; if Viterbi, take
 357  *         max. I tried to slide this by as a sum, without
 358  *         the flag, but Alex detected it as a bug, because you can
 359  *         then find cases where the Viterbi score doesn't match
 360  *         the P7TraceScore().
 361  *
 362  * Args:      hmm          - the hmm to calculate scores in.
 363  *            viterbi_mode - TRUE to fold wings in Viterbi configuration.
 364  *
 365  * Return:    (void)
 366  *            hmm scores are filled in.
 367  */
 368 void
 369 P7Logoddsify(struct plan7_s *hmm, int viterbi_mode)
 370 {
 371   int k;                        /* counter for model position */
 372   int x;                        /* counter for symbols        */
 373   float accum;
 374   float tbm, tme;
 375
 376   if (hmm->flags & PLAN7_HASBITS) return;
 377
 378   /* Symbol emission scores
 379    */
 380   for (k = 1; k <= hmm->M; k++)
 381     {
 382                                 /* match/insert emissions in main model */
 383       for (x = 0; x < Alphabet_size; x++)
 384         {
 385           hmm->msc[x][k] = Prob2Score(hmm->mat[k][x], hmm->null[x]);
 386           if (k < hmm->M)
 387             hmm->isc[x][k] =  Prob2Score(hmm->ins[k][x], hmm->null[x]);
 388         }
 389                                 /* degenerate match/insert emissions */
 390       for (x = Alphabet_size; x < Alphabet_iupac; x++)
 391         {
 392           hmm->msc[x][k] = DegenerateSymbolScore(hmm->mat[k], hmm->null, x);
 393           if (k < hmm->M)
 394             hmm->isc[x][k] = DegenerateSymbolScore(hmm->ins[k], hmm->null, x);
 395         }
 396     }
 397
 398   /* State transitions.
 399    *
 400    * A note on "folding" of D_1 and D_M.
 401    * These two delete states are folded out of search form models
 402    * in order to prevent null cycles in the dynamic programming
 403    * algorithms (see code below). However, we use their log transitions
 404    * when we save the model! So the following log transition probs
 405    * are used *only* in save files, *never* in search algorithms:
 406    *    log (tbd1), D1 -> M2, D1 -> D2
 407    *    Mm-1 -> Dm, Dm-1 -> Dm
 408    *
 409    * In a search algorithm, these have to be interpreted as -INFTY
 410    * because their contributions are folded into bsc[] and esc[]
 411    * entry/exit scores. They can't be set to -INFTY here because
 412    * we need them in save files.
 413    */
 414   for (k = 1; k < hmm->M; k++)
 415     {
 416       hmm->tsc[k][TMM] = Prob2Score(hmm->t[k][TMM], hmm->p1);
 417       hmm->tsc[k][TMI] = Prob2Score(hmm->t[k][TMI], hmm->p1);
 418       hmm->tsc[k][TMD] = Prob2Score(hmm->t[k][TMD], 1.0);
 419       hmm->tsc[k][TIM] = Prob2Score(hmm->t[k][TIM], hmm->p1);
 420       hmm->tsc[k][TII] = Prob2Score(hmm->t[k][TII], hmm->p1);
 421       hmm->tsc[k][TDM] = Prob2Score(hmm->t[k][TDM], hmm->p1);
 422       hmm->tsc[k][TDD] = Prob2Score(hmm->t[k][TDD], 1.0);
 423     }
 424
 425   /* B->M entry transitions. Note how D_1 is folded out.
 426    * M1 is just B->M1
 427    * M2 is sum (or max) of B->M2 and B->D1->M2
 428    * M_k is sum (or max) of B->M_k and B->D1...D_k-1->M_k
 429    * These have to be done in log space, else you'll get
 430    * underflow errors; and we also have to watch for log(0).
 431    * A little sloppier than it probably has to be; historically,
 432    * doing in this in log space was in response to a bug report.
 433    */
 434   accum = hmm->tbd1 > 0.0 ? log(hmm->tbd1) : -9999.;
 435   for (k = 1; k <= hmm->M; k++)
 436     {
 437       tbm = hmm->begin[k] > 0. ? log(hmm->begin[k]) : -9999.;   /* B->M_k part */
 438
 439       /* B->D1...D_k-1->M_k part we get from accum*/
 440       if (k > 1 && accum > -9999.)
 441         {
 442           if (hmm->t[k-1][TDM] > 0.0)
 443             {
 444               if (viterbi_mode) tbm =  MAX(tbm, accum + log(hmm->t[k-1][TDM]));
 445               else              tbm =  LogSum(tbm, accum + log(hmm->t[k-1][TDM]));
 446             }
 447
 448           accum = (hmm->t[k-1][TDD] > 0.0) ? accum + log(hmm->t[k-1][TDD]) : -9999.;
 449         }
 450                                 /* Convert from log_e to scaled integer log_2 odds. */
 451       if (tbm > -9999.)
 452         hmm->bsc[k] = (int) floor(0.5 + INTSCALE * 1.44269504 * (tbm - log(hmm->p1)));
 453       else
 454         hmm->bsc[k] = -INFTY;
 455     }
 456
 457   /* M->E exit transitions. Note how D_M is folded out.
 458    * M_M is 1 by definition
 459    * M_M-1 is sum of M_M-1->E and M_M-1->D_M->E, where D_M->E is 1 by definition
 460    * M_k is sum of M_k->E and M_k->D_k+1...D_M->E
 461    * Must be done in log space to avoid underflow errors.
 462    * A little sloppier than it probably has to be; historically,
 463    * doing in this in log space was in response to a bug report.
 464    */
 465   hmm->esc[hmm->M] = 0;
 466   accum = 0.;
 467   for (k = hmm->M-1; k >= 1; k--)
 468     {
 469       tme = hmm->end[k] > 0. ? log(hmm->end[k]) : -9999.;
 470       if (accum > -9999.)
 471         {
 472           if (hmm->t[k][TMD] > 0.0)
 473             {
 474               if (viterbi_mode) tme = MAX(tme, accum + log(hmm->t[k][TMD]));
 475               else              tme = LogSum(tme, accum + log(hmm->t[k][TMD]));
 476             }
 477           accum = (hmm->t[k][TDD] > 0.0) ? accum + log(hmm->t[k][TDD]) : -9999.;
 478         }
 479                                 /* convert from log_e to scaled integer log odds. */
 480       hmm->esc[k] = (tme > -9999.) ? (int) floor(0.5 + INTSCALE * 1.44269504 * tme) : -INFTY;
 481     }
 482
 483                                 /* special transitions */
 484   hmm->xsc[XTN][LOOP] = Prob2Score(hmm->xt[XTN][LOOP], hmm->p1);
 485   hmm->xsc[XTN][MOVE] = Prob2Score(hmm->xt[XTN][MOVE], 1.0);
 486   hmm->xsc[XTE][LOOP] = Prob2Score(hmm->xt[XTE][LOOP], 1.0);
 487   hmm->xsc[XTE][MOVE] = Prob2Score(hmm->xt[XTE][MOVE], 1.0);
 488   hmm->xsc[XTC][LOOP] = Prob2Score(hmm->xt[XTC][LOOP], hmm->p1);
 489   hmm->xsc[XTC][MOVE] = Prob2Score(hmm->xt[XTC][MOVE], 1.-hmm->p1);
 490   hmm->xsc[XTJ][LOOP] = Prob2Score(hmm->xt[XTJ][LOOP], hmm->p1);
 491   hmm->xsc[XTJ][MOVE] = Prob2Score(hmm->xt[XTJ][MOVE], 1.0);
 492
 493   hmm->flags |= PLAN7_HASBITS;  /* raise the log-odds ready flag */
 494 }
 495
 496
 497
 498 /* Function: Plan7Renormalize()
 499  *
 500  * Purpose:  Take an HMM in counts form, and renormalize
 501  *           all of its probability vectors. Also enforces
 502  *           Plan7 restrictions on nonexistent transitions.
 503  *
 504  * Args:     hmm - the model to renormalize.
 505  *
 506  * Return:   (void)
 507  *           hmm is changed.
 508  */
 509 void
 510 Plan7Renormalize(struct plan7_s *hmm)
 511 {
 512   int   k;                      /* counter for model position */
 513   int   st;                     /* counter for special states */
 514   float d;                      /* denominator */
 515
 516                                 /* match emissions */
 517   for (k = 1; k <= hmm->M; k++)
 518     FNorm(hmm->mat[k], Alphabet_size);
 519                                 /* insert emissions */
 520   for (k = 1; k < hmm->M; k++)
 521     FNorm(hmm->ins[k], Alphabet_size);
 522                                 /* begin transitions */
 523   d = FSum(hmm->begin+1, hmm->M) + hmm->tbd1;
 524   FScale(hmm->begin+1, hmm->M, 1./d);
 525   hmm->tbd1 /= d;
 526                                 /* main model transitions */
 527   for (k = 1; k < hmm->M; k++)
 528     {
 529       d = FSum(hmm->t[k], 3) + hmm->end[k];
 530       FScale(hmm->t[k], 3, 1./d);
 531       hmm->end[k] /= d;
 532
 533       FNorm(hmm->t[k]+3, 2);    /* insert */
 534       FNorm(hmm->t[k]+5, 2);    /* delete */
 535     }
 536                                 /* null model emissions */
 537   FNorm(hmm->null, Alphabet_size);
 538                                 /* special transitions  */
 539   for (st = 0; st < 4; st++)
 540     FNorm(hmm->xt[st], 2);
 541                                 /* enforce nonexistent transitions */
 542                                 /* (is this necessary?) */
 543   hmm->t[0][TDM] = hmm->t[0][TDD] = 0.0;
 544
 545   hmm->flags &= ~PLAN7_HASBITS; /* clear the log-odds ready flag */
 546   hmm->flags |= PLAN7_HASPROB;  /* set the probabilities OK flag */
 547 }
 548
 549
 550 /* Function: Plan7RenormalizeExits()
 551  * Date:     SRE, Fri Aug 14 11:22:19 1998 [St. Louis]
 552  *
 553  * Purpose:  Renormalize just the match state transitions;
 554  *           for instance, after a Config() function has
 555  *           modified the exit distribution.
 556  *
 557  * Args:     hmm - hmm to renormalize
 558  *
 559  * Returns:  void
 560  */
 561 void
 562 Plan7RenormalizeExits(struct plan7_s *hmm)
 563 {
 564   int   k;
 565   float d;
 566
 567   for (k = 1; k < hmm->M; k++)
 568     {
 569       d = FSum(hmm->t[k], 3);
 570       FScale(hmm->t[k], 3, 1./(d + d*hmm->end[k]));
 571     }
 572 }
 573
 574
 575 /*****************************************************************
 576  * Plan7 configuration functions
 577  * The following few functions are the Plan7 equivalent of choosing
 578  * different alignment styles (fully local, fully global, global/local,
 579  * multihit, etc.)
 580  *
 581  * There is (at least) one constraint worth noting.
 582  * If you want per-domain scores to sum up to per-sequence scores,
 583  * then one of the following two sets of conditions must be met:
 584  *
 585  *   1) t(E->J) = 0
 586  *      e.g. no multidomain hits
 587  *
 588  *   2) t(N->N) = t(C->C) = t(J->J) = hmm->p1
 589  *      e.g. unmatching sequence scores zero, and
 590  *      N->B first-model score is equal to J->B another-model score.
 591  *
 592  * These constraints are obeyed in the default Config() functions below,
 593  * but in the future (when HMM editing may be allowed) we'll have
 594  * to remember this. Non-equality of the summed domain scores and
 595  * the total sequence score is a really easy "red flag" for people to
 596  * notice and report as a bug, even if it may make probabilistic
 597  * sense not to meet either constraint for certain modeling problems.
 598  *****************************************************************
 599  */
 600
 601 /* Function: Plan7NakedConfig()
 602  *
 603  * Purpose:  Set the alignment-independent, algorithm-dependent parameters
 604  *           of a Plan7 model so that no special states (N,C,J) emit anything:
 605  *           one simple, full global pass through the model.
 606  *
 607  * Args:     hmm - the plan7 model
 608  *
 609  * Return:   (void)
 610  *           The HMM is modified; algorithm dependent parameters are set.
 611  *           Previous scores are invalidated if they existed.
 612  */
 613 void
 614 Plan7NakedConfig(struct plan7_s *hmm)
 615 {
 616   hmm->xt[XTN][MOVE] = 1.;            /* disallow N-terminal tail */
 617   hmm->xt[XTN][LOOP] = 0.;
 618   hmm->xt[XTE][MOVE] = 1.;            /* only 1 domain/sequence ("global" alignment) */
 619   hmm->xt[XTE][LOOP] = 0.;
 620   hmm->xt[XTC][MOVE] = 1.;            /* disallow C-terminal tail */
 621   hmm->xt[XTC][LOOP] = 0.;
 622   hmm->xt[XTJ][MOVE] = 0.;            /* J state unused */
 623   hmm->xt[XTJ][LOOP] = 1.;
 624   FSet(hmm->begin+2, hmm->M-1, 0.);   /* disallow internal entries. */
 625   hmm->begin[1]    = 1. - hmm->tbd1;
 626   FSet(hmm->end+1,   hmm->M-1, 0.);   /* disallow internal exits. */
 627   hmm->end[hmm->M] = 1.;
 628   Plan7RenormalizeExits(hmm);
 629   hmm->flags       &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */
 630 }
 631
 632 /* Function: Plan7GlobalConfig()
 633  *
 634  * Purpose:  Set the alignment-independent, algorithm-dependent parameters
 635  *           of a Plan7 model to global (Needleman/Wunsch) configuration.
 636  *
 637  *           Like a non-looping hmmls, since we actually allow flanking
 638  *           N and C terminal sequence.
 639  *
 640  * Args:     hmm - the plan7 model
 641  *
 642  * Return:   (void)
 643  *           The HMM is modified; algorithm dependent parameters are set.
 644  *           Previous scores are invalidated if they existed.
 645  */
 646 void
 647 Plan7GlobalConfig(struct plan7_s *hmm)
 648 {
 649   hmm->xt[XTN][MOVE] = 1. - hmm->p1;  /* allow N-terminal tail */
 650   hmm->xt[XTN][LOOP] = hmm->p1;
 651   hmm->xt[XTE][MOVE] = 1.;            /* only 1 domain/sequence ("global" alignment) */
 652   hmm->xt[XTE][LOOP] = 0.;
 653   hmm->xt[XTC][MOVE] = 1. - hmm->p1;  /* allow C-terminal tail */
 654   hmm->xt[XTC][LOOP] = hmm->p1;
 655   hmm->xt[XTJ][MOVE] = 0.;            /* J state unused */
 656   hmm->xt[XTJ][LOOP] = 1.;
 657   FSet(hmm->begin+2, hmm->M-1, 0.);   /* disallow internal entries. */
 658   hmm->begin[1]    = 1. - hmm->tbd1;
 659   FSet(hmm->end+1,   hmm->M-1, 0.);   /* disallow internal exits. */
 660   hmm->end[hmm->M] = 1.;
 661   Plan7RenormalizeExits(hmm);
 662   hmm->flags       &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */
 663 }
 664
 665 /* Function: Plan7LSConfig()
 666  *
 667  * Purpose:  Set the alignment independent parameters of a Plan7 model
 668  *           to hmmls (global in HMM, local in sequence) configuration.
 669  *
 670  * Args:     hmm  - the plan7 model
 671  *
 672  * Return:   (void);
 673  *           the HMM probabilities are modified.
 674  */
 675 void
 676 Plan7LSConfig(struct plan7_s *hmm)
 677 {
 678   hmm->xt[XTN][MOVE] = 1.-hmm->p1;    /* allow N-terminal tail */
 679   hmm->xt[XTN][LOOP] = hmm->p1;
 680   hmm->xt[XTE][MOVE] = 0.5;          /* expectation 2 domains/seq */
 681   hmm->xt[XTE][LOOP] = 0.5;
 682   hmm->xt[XTC][MOVE] = 1.-hmm->p1;    /* allow C-terminal tail */
 683   hmm->xt[XTC][LOOP] = hmm->p1;
 684   hmm->xt[XTJ][MOVE] = 1.-hmm->p1;   /* allow J junction state */
 685   hmm->xt[XTJ][LOOP] = hmm->p1;
 686   FSet(hmm->begin+2, hmm->M-1, 0.);  /* start at M1/D1 */
 687   hmm->begin[1]    = 1. - hmm->tbd1;
 688   FSet(hmm->end+1,   hmm->M-1, 0.);  /* end at M_m/D_m */
 689   hmm->end[hmm->M] = 1.;
 690   Plan7RenormalizeExits(hmm);
 691   hmm->flags       &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */
 692 }
 693
 694
 695 /* Function: Plan7SWConfig()
 696  *
 697  * Purpose:  Set the alignment independent parameters of
 698  *           a Plan7 model to hmmsw (Smith/Waterman) configuration.
 699  *
 700  * Notes:    entry/exit is asymmetric because of the left/right
 701  *           nature of the HMM/profile. Entry probability is distributed
 702  *           simply by assigning p_x = pentry / (M-1) to M-1
 703  *           internal match states. However, the same approach doesn't
 704  *           lead to a flat distribution over exit points. Exit p's
 705  *           must be corrected for the probability of a previous exit
 706  *           from the model. Requiring a flat distribution over exit
 707  *           points leads to an easily solved piece of algebra, giving:
 708  *                      p_1 = pexit / (M-1)
 709  *                      p_x = p_1 / (1 - (x-1) p_1)
 710  *
 711  * Args:     hmm    - the Plan7 model w/ data-dep prob's valid
 712  *           pentry - probability of an internal entry somewhere;
 713  *                    will be evenly distributed over M-1 match states
 714  *           pexit  - probability of an internal exit somewhere;
 715  *                    will be distributed over M-1 match states.
 716  *
 717  * Return:   (void)
 718  *           HMM probabilities are modified.
 719  */
 720 void
 721 Plan7SWConfig(struct plan7_s *hmm, float pentry, float pexit)
 722 {
 723   float basep;                  /* p1 for exits: the base p */
 724   int   k;                      /* counter over states      */
 725
 726   /* Configure special states.
 727    */
 728   hmm->xt[XTN][MOVE] = 1-hmm->p1;    /* allow N-terminal tail */
 729   hmm->xt[XTN][LOOP] = hmm->p1;
 730   hmm->xt[XTE][MOVE] = 1.;           /* disallow jump state   */
 731   hmm->xt[XTE][LOOP] = 0.;
 732   hmm->xt[XTC][MOVE] = 1-hmm->p1;    /* allow C-terminal tail */
 733   hmm->xt[XTC][LOOP] = hmm->p1;
 734   hmm->xt[XTJ][MOVE] = 1.;           /* J is unused */
 735   hmm->xt[XTJ][LOOP] = 0.;
 736
 737   /* Configure entry.
 738    */
 739   hmm->begin[1] = (1. - pentry) * (1. - hmm->tbd1);
 740   FSet(hmm->begin+2, hmm->M-1, (pentry * (1.- hmm->tbd1)) / (float)(hmm->M-1));
 741
 742   /* Configure exit.
 743    */
 744   hmm->end[hmm->M] = 1.0;
 745   basep = pexit / (float) (hmm->M-1);
 746   for (k = 1; k < hmm->M; k++)
 747     hmm->end[k] = basep / (1. - basep * (float) (k-1));
 748   Plan7RenormalizeExits(hmm);
 749   hmm->flags       &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */
 750 }
 751
 752 /* Function: Plan7FSConfig()
 753  * Date:     SRE, Fri Jan  2 15:34:40 1998 [StL]
 754  *
 755  * Purpose:  Set the alignment independent parameters of
 756  *           a Plan7 model to hmmfs (multihit Smith/Waterman) configuration.
 757  *
 758  *           See comments on Plan7SWConfig() for explanation of
 759  *           how pentry and pexit are used.
 760  *
 761  * Args:     hmm    - the Plan7 model w/ data-dep prob's valid
 762  *           pentry - probability of an internal entry somewhere;
 763  *                    will be evenly distributed over M-1 match states
 764  *           pexit  - probability of an internal exit somewhere;
 765  *                    will be distributed over M-1 match states.
 766  *
 767  * Return:   (void)
 768  *           HMM probabilities are modified.
 769  */
 770 void
 771 Plan7FSConfig(struct plan7_s *hmm, float pentry, float pexit)
 772 {
 773   float basep;                  /* p1 for exits: the base p */
 774   int   k;                      /* counter over states      */
 775
 776   /* Configure special states.
 777    */
 778   hmm->xt[XTN][MOVE] = 1-hmm->p1;    /* allow N-terminal tail     */
 779   hmm->xt[XTN][LOOP] = hmm->p1;
 780   hmm->xt[XTE][MOVE] = 0.5;          /* allow loops / multihits   */
 781   hmm->xt[XTE][LOOP] = 0.5;
 782   hmm->xt[XTC][MOVE] = 1-hmm->p1;    /* allow C-terminal tail     */
 783   hmm->xt[XTC][LOOP] = hmm->p1;
 784   hmm->xt[XTJ][MOVE] = 1.-hmm->p1;   /* allow J junction between domains */
 785   hmm->xt[XTJ][LOOP] = hmm->p1;
 786
 787   /* Configure entry.
 788    */
 789   hmm->begin[1] = (1. - pentry) * (1. - hmm->tbd1);
 790   FSet(hmm->begin+2, hmm->M-1, (pentry * (1.-hmm->tbd1)) / (float)(hmm->M-1));
 791
 792   /* Configure exit.
 793    */
 794   hmm->end[hmm->M] = 1.0;
 795   basep = pexit / (float) (hmm->M-1);
 796   for (k = 1; k < hmm->M; k++)
 797     hmm->end[k] = basep / (1. - basep * (float) (k-1));
 798   Plan7RenormalizeExits(hmm);
 799   hmm->flags       &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */
 800 }
 801
 802
 803
 804
 805 /* Function: Plan7ESTConfig()
 806  *
 807  * Purpose:  Configure a Plan7 model for EST Smith/Waterman
 808  *           analysis.
 809  *
 810  *           OUTDATED; DO NOT USE WITHOUT RECHECKING
 811  *
 812  * Args:     hmm        - hmm to configure.
 813  *           aacode     - 0..63 vector mapping genetic code to amino acids
 814  *           estmodel   - 20x64 translation matrix, w/ codon bias and substitution error
 815  *           dna2       - probability of a -1 frameshift in a triplet
 816  *           dna4       - probability of a +1 frameshift in a triplet
 817  */
 818 void
 819 Plan7ESTConfig(struct plan7_s *hmm, int *aacode, float **estmodel,
 820                float dna2, float dna4)
 821 {
 822   int k;
 823   int x;
 824   float p;
 825   float *tripnull;              /* UNFINISHED!!! */
 826
 827                                 /* configure specials */
 828   hmm->xt[XTN][MOVE] = 1./351.;
 829   hmm->xt[XTN][LOOP] = 350./351.;
 830   hmm->xt[XTE][MOVE] = 1.;
 831   hmm->xt[XTE][LOOP] = 0.;
 832   hmm->xt[XTC][MOVE] = 1./351.;
 833   hmm->xt[XTC][LOOP] = 350./351.;
 834   hmm->xt[XTJ][MOVE] = 1.;
 835   hmm->xt[XTJ][LOOP] = 0.;
 836                                 /* configure entry/exit */
 837   hmm->begin[1] = 0.5;
 838   FSet(hmm->begin+2, hmm->M-1, 0.5 / ((float)hmm->M - 1.));
 839   hmm->end[hmm->M] = 1.;
 840   FSet(hmm->end, hmm->M-1, 0.5 / ((float)hmm->M - 1.));
 841
 842                                 /* configure dna triplet/frameshift emissions */
 843   for (k = 1; k <= hmm->M; k++)
 844     {
 845                                 /* translate aa to triplet probabilities */
 846       for (x = 0; x < 64; x++) {
 847         p =  hmm->mat[k][aacode[x]] * estmodel[aacode[x]][x] * (1.-dna2-dna4);
 848         hmm->dnam[x][k] = Prob2Score(p, tripnull[x]);
 849
 850         p = hmm->ins[k][aacode[x]] * estmodel[aacode[x]][x] * (1.-dna2-dna4);
 851         hmm->dnai[x][k] = Prob2Score(p, tripnull[x]);
 852       }
 853       hmm->dnam[64][k] = 0;     /* ambiguous codons score 0 (danger?) */
 854       hmm->dna2 = Prob2Score(dna2, 1.);
 855       hmm->dna4 = Prob2Score(dna4, 1.);
 856     }
 857 }
 858
 859 /* Function: PrintPlan7Stats()
 860  *
 861  * Purpose:  Given a newly constructed HMM and the tracebacks
 862  *           of the sequences it was trained on, print out all
 863  *           the interesting information at the end of hmmb
 864  *           and hmmt runs that convinces the user we actually
 865  *           did something.
 866  *
 867  * Args:     fp   - where to send the output (stdout, usually)
 868  *           hmm  - the new HMM, probability form
 869  *           dsq  - digitized training seqs
 870  *           nseq - number of dsq's
 871  *           tr   - array of tracebacks for dsq
 872  *
 873  * Return:   (void)
 874  */
 875 void
 876 PrintPlan7Stats(FILE *fp, struct plan7_s *hmm, char **dsq, int nseq,
 877                 struct p7trace_s **tr)
 878 {
 879   int   idx;                    /* counter for sequences                */
 880   float score;                  /* an individual trace score            */
 881   float total, best, worst;     /* for the avg. and range of the scores */
 882   float sqsum, stddev;          /* for the std. deviation of the scores */
 883
 884   P7Logoddsify(hmm, TRUE);      /* make sure model scores are ready */
 885
 886                                 /* find individual trace scores */
 887   score = P7TraceScore(hmm, dsq[0], tr[0]);
 888   total = best = worst = score;
 889   sqsum = score * score;
 890   for (idx = 1; idx < nseq; idx++) {
 891     /* P7PrintTrace(stdout, tr[idx], hmm, dsq[idx]); */
 892     score  = P7TraceScore(hmm, dsq[idx], tr[idx]);
 893     total += score;
 894     sqsum += score * score;
 895     if (score > best)  best = score;
 896     if (score < worst) worst = score;
 897   }
 898   if (nseq > 1) {
 899     stddev = (sqsum - (total * total / (float) nseq)) / ((float) nseq - 1.);
 900     stddev = (stddev > 0) ? sqrt(stddev) : 0.0;
 901   } else stddev = 0.0;
 902                                 /* print out stuff. */
 903   fprintf(fp, "Average score:  %10.2f bits\n", total / (float) nseq);
 904   fprintf(fp, "Minimum score:  %10.2f bits\n", worst);
 905   fprintf(fp, "Maximum score:  %10.2f bits\n", best);
 906   fprintf(fp, "Std. deviation: %10.2f bits\n", stddev);
 907 }
 908
 909 /* Function: DegenerateSymbolScore()
 910  *
 911  * Purpose:  Given a sequence character x and an hmm emission probability
 912  *           vector, calculate the log-odds (base 2) score of
 913  *           the symbol.
 914  *
 915  *           Easy if x is in the emission alphabet, but not so easy
 916  *           is x is a degenerate symbol. The "correct" Bayesian
 917  *           philosophy is to calculate score(X) by summing over
 918  *           p(x) for all x in the degenerate symbol X to get P(X),
 919  *           doing the same sum over the prior to get F(X), and
 920  *           doing log_2 (P(X)/F(X)). This gives an X a zero score,
 921  *           for instance.
 922  *
 923  *           Though this is correct in a formal Bayesian sense --
 924  *           we have no information on the sequence, so we can't
 925  *           say if it's random or model, so it scores zero --
 926  *           it sucks, big time, for scoring biological sequences.
 927  *           Sequences with lots of X's score near zero, while
 928  *           real sequences have average scores that are negative --
 929  *           so the X-laden sequences appear to be lifted out
 930  *           of the noise of a full histogram of a database search.
 931  *           Correct or not, this is highly undesirable.
 932  *
 933  *           So therefore we calculated the expected score of
 934  *           the degenerate symbol by summing over all x in X:
 935  *                 e_x log_2 (p(x)/f(x))
 936  *           where the expectation of x, e_x, is calculated from
 937  *           the random model.
 938  *
 939  *           Empirically, this works; it also has a wooly hand-waving
 940  *           probabilistic justification that I'm happy enough about.
 941  *
 942  * Args:     p      - probabilities of normal symbols
 943  *           null   - null emission model
 944  *           ambig  - index of the degenerate character in Alphabet[]
 945  *
 946  * Return:   the integer log odds score of x given the emission
 947  *           vector and the null model, scaled up by INTSCALE.
 948  */
 949 int
 950 DegenerateSymbolScore(float *p, float *null, int ambig)
 951 {
 952   int x;
 953   float numer = 0.;
 954   float denom = 0.;
 955
 956   for (x = 0; x < Alphabet_size; x++) {
 957     if (Degenerate[ambig][x]) {
 958       numer += null[x] * sreLOG2(p[x] / null[x]);
 959       denom += null[x];
 960     }
 961   }
 962   return (int) (INTSCALE * numer / denom);
 963 }
 964
 965 /*****************************************************************
 966  *
 967  * Plan9/Plan7 interface
 968  *
 969  * Very important code during the evolutionary takeover by Plan7 --
 970  * convert between Krogh/Haussler and Plan7 models.
 971  *****************************************************************/
 972
 973 /* Function: Plan9toPlan7()
 974  *
 975  * Purpose:  Convert an old HMM into Plan7. Configures it in
 976  *           ls mode.
 977  *
 978  * Args:     hmm       - old ugly plan9 style HMM
 979  *           ret_plan7 - new wonderful Plan7 HMM
 980  *
 981  * Return:   (void)
 982  *           Plan7 HMM is allocated here. Free w/ FreePlan7().
 983  */
 984 void
 985 Plan9toPlan7(struct plan9_s *hmm, struct plan7_s **ret_plan7)
 986 {
 987   struct plan7_s *plan7;
 988   int k, x;
 989
 990   plan7 = AllocPlan7(hmm->M);
 991
 992   for (k = 1; k < hmm->M; k++)
 993     {
 994       plan7->t[k][TMM] = hmm->mat[k].t[MATCH];
 995       plan7->t[k][TMD] = hmm->mat[k].t[DELETE];
 996       plan7->t[k][TMI] = hmm->mat[k].t[INSERT];
 997       plan7->t[k][TDM] = hmm->del[k].t[MATCH];
 998       plan7->t[k][TDD] = hmm->del[k].t[DELETE];
 999       plan7->t[k][TIM] = hmm->ins[k].t[MATCH];
1000       plan7->t[k][TII] = hmm->ins[k].t[INSERT];
1001     }
1002
1003   for (k = 1; k <= hmm->M; k++)
1004     for (x = 0; x < Alphabet_size; x++)
1005       plan7->mat[k][x] = hmm->mat[k].p[x];
1006
1007   for (k = 1; k < hmm->M; k++)
1008     for (x = 0; x < Alphabet_size; x++)
1009       plan7->ins[k][x] = hmm->ins[k].p[x];
1010
1011   plan7->tbd1 = hmm->mat[0].t[DELETE] / (hmm->mat[0].t[DELETE] + hmm->mat[0].t[MATCH]);
1012
1013                 /* We have to make up the null transition p1; use default */
1014   P7DefaultNullModel(plan7->null, &(plan7->p1));
1015   for (x = 0; x < Alphabet_size; x++)
1016     plan7->null[x] = hmm->null[x];
1017
1018   if (hmm->name != NULL)
1019     Plan7SetName(plan7, hmm->name);
1020   if (hmm->flags & HMM_REF) {
1021     strcpy(plan7->rf, hmm->ref);
1022     plan7->flags |= PLAN7_RF;
1023   }
1024   if (hmm->flags & HMM_CS) {
1025     strcpy(plan7->cs, hmm->cs);
1026     plan7->flags |= PLAN7_CS;
1027   }
1028
1029   Plan7LSConfig(plan7);         /* configure specials for ls-style alignment */
1030   Plan7Renormalize(plan7);      /* mainly to correct for missing ID and DI */
1031   plan7->flags |= PLAN7_HASPROB;        /* probabilities are valid */
1032   plan7->flags &= ~PLAN7_HASBITS;       /* scores are not valid    */
1033   *ret_plan7 = plan7;
1034 }
1035
1036