1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
12 * SRE, Mon Nov 11 15:58:52 1996
13 * CVS $Id: core_algorithms.c,v 1.1.1.1 2005/03/22 08:34:11 cmzmasek Exp $
15 * Simple and robust "research" implementations of Forward, Backward,
16 * and Viterbi for Plan7.
27 static float get_wee_midpt(struct plan7_s *hmm, char *dsq, int L,
28 int k1, char t1, int s1,
29 int k3, char t3, int s3,
30 int *ret_k2, char *ret_t2, int *ret_s2);
33 /* Function: AllocPlan7Matrix()
35 * Purpose: Allocate a dynamic programming matrix for standard Forward,
36 * Backward, or Viterbi, with scores kept as scaled log-odds
37 * integers. Keeps 2D arrays compact in RAM in an attempt
38 * to maximize cache hits. Sets up individual ptrs to the
39 * four matrix components as a convenience.
41 * Args: rows - number of rows to allocate; typically L+1
44 * - RETURN: ptrs to four mx components as a convenience
47 * mx is allocated here. Caller frees with FreeDPMatrix(mx).
51 AllocPlan7Matrix(int rows, int M, int ***xmx, int ***mmx, int ***imx, int ***dmx)
53 struct dpmatrix_s *mx;
56 mx = (struct dpmatrix_s *) MallocOrDie (sizeof(struct dpmatrix_s));
57 mx->xmx = (int **) MallocOrDie (sizeof(int *) * rows);
58 mx->mmx = (int **) MallocOrDie (sizeof(int *) * rows);
59 mx->imx = (int **) MallocOrDie (sizeof(int *) * rows);
60 mx->dmx = (int **) MallocOrDie (sizeof(int *) * rows);
61 mx->xmx[0] = (int *) MallocOrDie (sizeof(int) * (rows*5));
62 mx->mmx[0] = (int *) MallocOrDie (sizeof(int) * (rows*(M+2)));
63 mx->imx[0] = (int *) MallocOrDie (sizeof(int) * (rows*(M+2)));
64 mx->dmx[0] = (int *) MallocOrDie (sizeof(int) * (rows*(M+2)));
65 for (i = 1; i < rows; i++)
67 mx->xmx[i] = mx->xmx[0] + (i*5);
68 mx->mmx[i] = mx->mmx[0] + (i*(M+2));
69 mx->imx[i] = mx->imx[0] + (i*(M+2));
70 mx->dmx[i] = mx->dmx[0] + (i*(M+2));
73 if (xmx != NULL) *xmx = mx->xmx;
74 if (mmx != NULL) *mmx = mx->mmx;
75 if (imx != NULL) *imx = mx->imx;
76 if (dmx != NULL) *dmx = mx->dmx;
80 /* Function: FreePlan7Matrix()
82 * Purpose: Free a dynamic programming matrix allocated by AllocPlan7Matrix().
87 FreePlan7Matrix(struct dpmatrix_s *mx)
100 /* Function: AllocShadowMatrix()
102 * Purpose: Allocate a dynamic programming traceback pointer matrix for
103 * a Viterbi algorithm.
105 * Args: rows - number of rows to allocate; typically L+1
108 * - RETURN: ptrs to four mx components as a convenience
111 * mx is allocated here. Caller frees with FreeDPMatrix(mx).
115 AllocShadowMatrix(int rows, int M, char ***xtb, char ***mtb, char ***itb, char ***dtb)
117 struct dpshadow_s *tb;
120 tb = (struct dpshadow_s *) MallocOrDie (sizeof(struct dpshadow_s));
121 tb->xtb = (char **) MallocOrDie (sizeof(char *) * rows);
122 tb->mtb = (char **) MallocOrDie (sizeof(char *) * rows);
123 tb->itb = (char **) MallocOrDie (sizeof(char *) * rows);
124 tb->dtb = (char **) MallocOrDie (sizeof(char *) * rows);
125 tb->esrc = (int *) MallocOrDie (sizeof(int) * rows);
126 tb->xtb[0] = (char *) MallocOrDie (sizeof(char) * (rows*5));
127 tb->mtb[0] = (char *) MallocOrDie (sizeof(char) * (rows*(M+2)));
128 tb->itb[0] = (char *) MallocOrDie (sizeof(char) * (rows*(M+2)));
129 tb->dtb[0] = (char *) MallocOrDie (sizeof(char) * (rows*(M+2)));
130 for (i = 1; i < rows; i++)
132 tb->xtb[i] = tb->xtb[0] + (i*5);
133 tb->mtb[i] = tb->mtb[0] + (i*(M+2));
134 tb->itb[i] = tb->itb[0] + (i*(M+2));
135 tb->dtb[i] = tb->dtb[0] + (i*(M+2));
138 if (xtb != NULL) *xtb = tb->xtb;
139 if (mtb != NULL) *mtb = tb->mtb;
140 if (itb != NULL) *itb = tb->itb;
141 if (dtb != NULL) *dtb = tb->dtb;
145 /* Function: FreeShadowMatrix()
147 * Purpose: Free a dynamic programming matrix allocated by AllocShadowMatrix().
152 FreeShadowMatrix(struct dpshadow_s *tb)
166 /* Function: P7ViterbiSize()
167 * Date: SRE, Fri Mar 6 15:13:20 1998 [St. Louis]
169 * Purpose: Returns the ballpark predicted memory requirement for a
170 * P7Viterbi() alignment, in MB.
172 * Currently L must fit in an int (< 2 GB), but we have
173 * to deal with LM > 2 GB - e.g. watch out for overflow, do
174 * the whole calculation in floating point. Bug here detected
175 * in 2.1.1 by David Harper, Sanger Centre.
177 * Args: L - length of sequence
183 P7ViterbiSize(int L, int M)
187 /* We're excessively precise here, but it doesn't cost
188 * us anything to be pedantic. The four terms are:
189 * 1. the matrix structure itself;
190 * 2. the O(NM) main matrix (this dominates!)
191 * 3. ptrs into the rows of the matrix
192 * 4. storage for 5 special states. (xmx)
194 Mbytes = (float) sizeof(struct dpmatrix_s);
195 Mbytes += 3. * (float) (L+1) * (float) (M+2) * (float) sizeof(int);
196 Mbytes += 4. * (float) (L+1) * (float) sizeof(int *);
197 Mbytes += 5. * (float) (L+1) * (float) sizeof(int);
202 /* Function: P7SmallViterbiSize()
203 * Date: SRE, Fri Mar 6 15:20:04 1998 [St. Louis]
205 * Purpose: Returns the ballpark predicted memory requirement for
206 * a P7SmallViterbi() alignment, in MB.
208 * P7SmallViterbi() is a wrapper, calling both P7ParsingViterbi()
209 * and P7WeeViterbi(). P7ParsingViterbi() typically dominates
210 * the memory requirement, so the value returned
211 * is the P7ParsingViterbi() number.
213 * We don't (yet) worry about overflow issues like we did with
214 * P7ViterbiSize(). We'll have many other 32-bit int issues in the
215 * code if we overflow here.
217 * Args: L - length of sequence
223 P7SmallViterbiSize(int L, int M)
225 return ((2 * sizeof(struct dpmatrix_s) +
226 12 * (M+2) * sizeof(int) + /* 2 matrices w/ 2 rows */
227 16 * sizeof(int *) + /* ptrs into rows of matrix */
228 20 * sizeof(int) + /* 5 special states */
229 2 * (L+1) * sizeof(int)) /* traceback indices */
234 /* Function: P7WeeViterbiSize()
235 * Date: SRE, Fri Mar 6 15:40:42 1998 [St. Louis]
237 * Purpose: Returns the ballpark predicted memory requirement for
238 * a P7WeeViterbi() alignment, in MB.
240 * Args: L - length of sequence
246 P7WeeViterbiSize(int L, int M)
248 return ((2 * sizeof(struct dpmatrix_s) +
249 12 * (M+2) * sizeof(int) + /* 2 matrices w/ 2 rows */
250 16 * sizeof(int *) + /* ptrs into rows of matrix */
251 20 * sizeof(int) + /* 5 special states */
252 2 * (L+2) * sizeof(int) + /* stacks for starts/ends (overkill) */
253 (L+2) * sizeof(int) + /* k assignments to seq positions */
254 (L+2) * sizeof(char)) /* state assignments to seq pos */
259 /* Function: P7Forward()
261 * Purpose: The Forward dynamic programming algorithm.
262 * The scaling issue is dealt with by working in log space
263 * and calling ILogsum(); this is a slow but robust approach.
265 * Args: dsq - sequence in digitized form
268 * ret_mx - RETURN: dp matrix; pass NULL if it's not wanted
270 * Return: log P(S|M)/P(S|R), as a bit score.
273 P7Forward(char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s **ret_mx)
275 struct dpmatrix_s *mx;
283 /* Allocate a DP matrix with 0..L rows, 0..M-1 columns.
285 mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx);
287 /* Initialization of the zero row.
288 * Note that xmx[i][stN] = 0 by definition for all i,
289 * and xmx[i][stT] = xmx[i][stC], so neither stN nor stT need
290 * to be calculated in DP matrices.
292 xmx[0][XMN] = 0; /* S->N, p=1 */
293 xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */
294 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */
295 for (k = 0; k <= hmm->M; k++)
296 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */
298 /* Recursion. Done as a pull.
299 * Note some slightly wasteful boundary conditions:
300 * tsc[0] = -INFTY for all eight transitions (no node 0)
301 * D_M and I_M are wastefully calculated (they don't exist)
303 for (i = 1; i <= L; i++)
305 mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY;
306 for (k = 1; k < hmm->M; k++)
308 mmx[i][k] = ILogsum(ILogsum(mmx[i-1][k-1] + hmm->tsc[k-1][TMM],
309 imx[i-1][k-1] + hmm->tsc[k-1][TIM]),
310 ILogsum(xmx[i-1][XMB] + hmm->bsc[k],
311 dmx[i-1][k-1] + hmm->tsc[k-1][TDM]));
312 mmx[i][k] += hmm->msc[(int) dsq[i]][k];
314 dmx[i][k] = ILogsum(mmx[i][k-1] + hmm->tsc[k-1][TMD],
315 dmx[i][k-1] + hmm->tsc[k-1][TDD]);
316 imx[i][k] = ILogsum(mmx[i-1][k] + hmm->tsc[k][TMI],
317 imx[i-1][k] + hmm->tsc[k][TII]);
318 imx[i][k] += hmm->isc[(int) dsq[i]][k];
320 mmx[i][hmm->M] = ILogsum(ILogsum(mmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TMM],
321 imx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TIM]),
322 ILogsum(xmx[i-1][XMB] + hmm->bsc[hmm->M-1],
323 dmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TDM]));
324 mmx[i][hmm->M] += hmm->msc[(int) dsq[i]][hmm->M];
326 /* Now the special states.
327 * remember, C and J emissions are zero score by definition
329 xmx[i][XMN] = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP];
331 xmx[i][XME] = -INFTY;
332 for (k = 1; k <= hmm->M; k++)
333 xmx[i][XME] = ILogsum(xmx[i][XME], mmx[i][k] + hmm->esc[k]);
335 xmx[i][XMJ] = ILogsum(xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP],
336 xmx[i][XME] + hmm->xsc[XTE][LOOP]);
338 xmx[i][XMB] = ILogsum(xmx[i][XMN] + hmm->xsc[XTN][MOVE],
339 xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]);
341 xmx[i][XMC] = ILogsum(xmx[i-1][XMC] + hmm->xsc[XTC][LOOP],
342 xmx[i][XME] + hmm->xsc[XTE][MOVE]);
345 sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];
347 if (ret_mx != NULL) *ret_mx = mx;
348 else FreePlan7Matrix(mx);
350 return Scorify(sc); /* the total Forward score. */
354 /* Function: P7Viterbi()
356 * Purpose: The Viterbi dynamic programming algorithm.
357 * Identical to Forward() except that max's
360 * Args: dsq - sequence in digitized form
363 * ret_tr - RETURN: traceback; pass NULL if it's not wanted
365 * Return: log P(S|M)/P(S|R), as a bit score
368 P7Viterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr)
370 struct dpmatrix_s *mx;
371 struct p7trace_s *tr;
379 /* Allocate a DP matrix with 0..L rows, 0..M-1 columns.
381 mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx);
383 /* Initialization of the zero row.
385 xmx[0][XMN] = 0; /* S->N, p=1 */
386 xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */
387 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */
388 for (k = 0; k <= hmm->M; k++)
389 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */
391 /* Recursion. Done as a pull.
392 * Note some slightly wasteful boundary conditions:
393 * tsc[0] = -INFTY for all eight transitions (no node 0)
394 * D_M and I_M are wastefully calculated (they don't exist)
396 for (i = 1; i <= L; i++) {
397 mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY;
399 for (k = 1; k <= hmm->M; k++) {
402 if ((sc = mmx[i-1][k-1] + hmm->tsc[k-1][TMM]) > mmx[i][k])
404 if ((sc = imx[i-1][k-1] + hmm->tsc[k-1][TIM]) > mmx[i][k])
406 if ((sc = xmx[i-1][XMB] + hmm->bsc[k]) > mmx[i][k])
408 if ((sc = dmx[i-1][k-1] + hmm->tsc[k-1][TDM]) > mmx[i][k])
410 if (hmm->msc[(int) dsq[i]][k] != -INFTY) mmx[i][k] += hmm->msc[(int) dsq[i]][k];
411 else mmx[i][k] = -INFTY;
415 if ((sc = mmx[i][k-1] + hmm->tsc[k-1][TMD]) > dmx[i][k])
417 if ((sc = dmx[i][k-1] + hmm->tsc[k-1][TDD]) > dmx[i][k])
423 if ((sc = mmx[i-1][k] + hmm->tsc[k][TMI]) > imx[i][k])
425 if ((sc = imx[i-1][k] + hmm->tsc[k][TII]) > imx[i][k])
427 if (hmm->isc[(int)dsq[i]][k] != -INFTY) imx[i][k] += hmm->isc[(int) dsq[i]][k];
428 else imx[i][k] = -INFTY;
432 /* Now the special states. Order is important here.
433 * remember, C and J emissions are zero score by definition,
436 xmx[i][XMN] = -INFTY;
437 if ((sc = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
441 xmx[i][XME] = -INFTY;
442 for (k = 1; k <= hmm->M; k++)
443 if ((sc = mmx[i][k] + hmm->esc[k]) > xmx[i][XME])
446 xmx[i][XMJ] = -INFTY;
447 if ((sc = xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)
449 if ((sc = xmx[i][XME] + hmm->xsc[XTE][LOOP]) > xmx[i][XMJ])
453 xmx[i][XMB] = -INFTY;
454 if ((sc = xmx[i][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
456 if ((sc = xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[i][XMB])
460 xmx[i][XMC] = -INFTY;
461 if ((sc = xmx[i-1][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
463 if ((sc = xmx[i][XME] + hmm->xsc[XTE][MOVE]) > xmx[i][XMC])
466 /* T state (not stored) */
467 sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];
469 if (ret_tr != NULL) {
470 P7ViterbiTrace(hmm, dsq, L, mx, &tr);
475 return Scorify(sc); /* the total Viterbi score. */
479 /* Function: P7ViterbiTrace()
480 * Date: SRE, Sat Aug 23 10:30:11 1997 (St. Louis Lambert Field)
482 * Purpose: Traceback of a Viterbi matrix: i.e. retrieval
483 * of optimum alignment.
485 * Args: hmm - hmm, log odds form, used to make mx
486 * dsq - sequence aligned to (digital form) 1..N
488 * mx - the matrix to trace back in, N x hmm->M
489 * ret_tr - RETURN: traceback.
492 * ret_tr is allocated here. Free using P7FreeTrace().
495 P7ViterbiTrace(struct plan7_s *hmm, char *dsq, int N,
496 struct dpmatrix_s *mx, struct p7trace_s **ret_tr)
498 struct p7trace_s *tr;
499 int curralloc; /* current allocated length of trace */
500 int tpos; /* position in trace */
501 int i; /* position in seq (1..N) */
502 int k; /* position in model (1..M) */
503 int **xmx, **mmx, **imx, **dmx;
504 int sc; /* temp var for pre-emission score */
506 /* Overallocate for the trace.
507 * S-N-B- ... - E-C-T : 6 states + N is minimum trace;
508 * add N more as buffer.
510 curralloc = N * 2 + 6;
511 P7AllocTrace(curralloc, &tr);
518 /* Initialization of trace
519 * We do it back to front; ReverseTrace() is called later.
521 tr->statetype[0] = STT;
524 tr->statetype[1] = STC;
528 i = N; /* current i (seq pos) we're trying to assign */
532 while (tr->statetype[tpos-1] != STS) {
533 switch (tr->statetype[tpos-1]) {
534 case STM: /* M connects from i-1,k-1, or B */
535 sc = mmx[i+1][k+1] - hmm->msc[(int) dsq[i+1]][k+1];
536 if (sc == xmx[i][XMB] + hmm->bsc[k+1])
538 /* Check for wing unfolding */
539 if (Prob2Score(hmm->begin[k+1], hmm->p1) + 1 * INTSCALE <= hmm->bsc[k+1])
542 tr->statetype[tpos] = STD;
543 tr->nodeidx[tpos] = k--;
546 if (tpos == curralloc)
547 { /* grow trace if necessary */
549 P7ReallocTrace(tr, curralloc);
553 tr->statetype[tpos] = STB;
554 tr->nodeidx[tpos] = 0;
557 else if (sc == mmx[i][k] + hmm->tsc[k][TMM])
559 tr->statetype[tpos] = STM;
560 tr->nodeidx[tpos] = k--;
563 else if (sc == imx[i][k] + hmm->tsc[k][TIM])
565 tr->statetype[tpos] = STI;
566 tr->nodeidx[tpos] = k;
569 else if (sc == dmx[i][k] + hmm->tsc[k][TDM])
571 tr->statetype[tpos] = STD;
572 tr->nodeidx[tpos] = k--;
575 else Die("traceback failed");
578 case STD: /* D connects from M,D */
579 if (dmx[i][k+1] == mmx[i][k] + hmm->tsc[k][TMD])
581 tr->statetype[tpos] = STM;
582 tr->nodeidx[tpos] = k--;
585 else if (dmx[i][k+1] == dmx[i][k] + hmm->tsc[k][TDD])
587 tr->statetype[tpos] = STD;
588 tr->nodeidx[tpos] = k--;
591 else Die("traceback failed");
594 case STI: /* I connects from M,I */
595 sc = imx[i+1][k] - hmm->isc[(int) dsq[i+1]][k];
596 if (sc == mmx[i][k] + hmm->tsc[k][TMI])
598 tr->statetype[tpos] = STM;
599 tr->nodeidx[tpos] = k--;
602 else if (sc == imx[i][k] + hmm->tsc[k][TII])
604 tr->statetype[tpos] = STI;
605 tr->nodeidx[tpos] = k;
608 else Die("traceback failed");
611 case STN: /* N connects from S, N */
612 if (i == 0 && xmx[i][XMN] == 0)
614 tr->statetype[tpos] = STS;
615 tr->nodeidx[tpos] = 0;
618 else if (i > 0 && xmx[i+1][XMN] == xmx[i][XMN] + hmm->xsc[XTN][LOOP])
620 tr->statetype[tpos] = STN;
621 tr->nodeidx[tpos] = 0;
622 tr->pos[tpos] = 0; /* note convention adherence: */
623 tr->pos[tpos-1] = i--; /* first N doesn't emit */
625 else Die("traceback failed");
628 case STB: /* B connects from N, J */
629 if (xmx[i][XMB] == xmx[i][XMN] + hmm->xsc[XTN][MOVE])
631 tr->statetype[tpos] = STN;
632 tr->nodeidx[tpos] = 0;
635 else if (xmx[i][XMB] == xmx[i][XMJ] + hmm->xsc[XTJ][MOVE])
637 tr->statetype[tpos] = STJ;
638 tr->nodeidx[tpos] = 0;
641 else Die("traceback failed");
644 case STE: /* E connects from any M state. k set here */
645 for (k = hmm->M; k >= 1; k--)
646 if (xmx[i][XME] == mmx[i][k] + hmm->esc[k])
648 /* check for wing unfolding */
649 if (Prob2Score(hmm->end[k], 1.) + 1*INTSCALE <= hmm->esc[k])
651 int dk; /* need a tmp k while moving thru delete wing */
652 for (dk = hmm->M; dk > k; dk--)
654 tr->statetype[tpos] = STD;
655 tr->nodeidx[tpos] = dk;
658 if (tpos == curralloc)
659 { /* grow trace if necessary */
661 P7ReallocTrace(tr, curralloc);
666 tr->statetype[tpos] = STM;
667 tr->nodeidx[tpos] = k--;
671 if (k < 0) Die("traceback failed");
674 case STC: /* C comes from C, E */
675 if (xmx[i][XMC] == xmx[i-1][XMC] + hmm->xsc[XTC][LOOP])
677 tr->statetype[tpos] = STC;
678 tr->nodeidx[tpos] = 0;
679 tr->pos[tpos] = 0; /* note convention adherence: */
680 tr->pos[tpos-1] = i--; /* first C doesn't emit */
682 else if (xmx[i][XMC] == xmx[i][XME] + hmm->xsc[XTE][MOVE])
684 tr->statetype[tpos] = STE;
685 tr->nodeidx[tpos] = 0;
686 tr->pos[tpos] = 0; /* E is a nonemitter */
688 else Die("Traceback failed.");
691 case STJ: /* J connects from E, J */
692 if (xmx[i][XMJ] == xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP])
694 tr->statetype[tpos] = STJ;
695 tr->nodeidx[tpos] = 0;
696 tr->pos[tpos] = 0; /* note convention adherence: */
697 tr->pos[tpos-1] = i--; /* first J doesn't emit */
699 else if (xmx[i][XMJ] == xmx[i][XME] + hmm->xsc[XTE][LOOP])
701 tr->statetype[tpos] = STE;
702 tr->nodeidx[tpos] = 0;
703 tr->pos[tpos] = 0; /* E is a nonemitter */
705 else Die("Traceback failed.");
709 Die("traceback failed");
711 } /* end switch over statetype[tpos-1] */
714 if (tpos == curralloc)
715 { /* grow trace if necessary */
717 P7ReallocTrace(tr, curralloc);
720 } /* end traceback, at S state; tpos == tlen now */
727 /* Function: P7SmallViterbi()
728 * Date: SRE, Fri Mar 6 15:29:41 1998 [St. Louis]
730 * Purpose: Wrapper function, for linear memory alignment
731 * with same arguments as P7Viterbi().
733 * Calls P7ParsingViterbi to break the sequence
734 * into fragments. Then, based on size of fragments,
735 * calls either P7Viterbi() or P7WeeViterbi() to
736 * get traces for them. Finally, assembles all these
737 * traces together to produce an overall optimal
738 * trace for the sequence.
740 * If the trace isn't needed for some reason,
741 * all we do is call P7ParsingViterbi.
743 * Args: dsq - sequence in digitized form
746 * ret_tr - RETURN: traceback; pass NULL if it's not wanted
748 * Returns: Score of optimal alignment in bits.
751 P7SmallViterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr)
753 struct p7trace_s *ctr; /* collapsed trace of optimal parse */
754 struct p7trace_s *tr; /* full trace of optimal alignment */
755 struct p7trace_s **tarr; /* trace array */
756 int ndom; /* number of subsequences */
757 int i; /* counter over domains */
758 int pos; /* position in sequence */
759 int tpos; /* position in trace */
760 int tlen; /* length of full trace */
761 int sqlen; /* length of a subsequence */
762 int totlen; /* length of L matched by model (as opposed to N/C/J) */
763 float sc; /* score of optimal alignment */
764 int t2; /* position in a subtrace */
766 /* Step 1. Call P7ParsingViterbi to calculate an optimal parse
767 * of the sequence into single-hit subsequences; this parse
768 * is returned in a "collapsed" trace
770 sc = P7ParsingViterbi(dsq, L, hmm, &ctr);
772 /* If we don't want full trace, we're done */
779 /* Step 2. Call either P7Viterbi or P7WeeViterbi on each subsequence
780 * to recover a full traceback of each, collecting them in
783 ndom = ctr->tlen/2 - 1;
784 tarr = MallocOrDie(sizeof(struct p7trace_s *) * ndom);
786 for (i = 0; i < ndom; i++)
788 sqlen = ctr->pos[i*2+2] - ctr->pos[i*2+1]; /* length of subseq */
790 if (P7ViterbiSize(sqlen, hmm->M) > RAMLIMIT)
791 P7WeeViterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, &(tarr[i]));
793 P7Viterbi(dsq + ctr->pos[i*2+1], sqlen, hmm, &(tarr[i]));
795 tlen += tarr[i]->tlen - 4; /* not counting S->N,...,C->T */
799 /* Step 3. Compose the subtraces into one big final trace.
800 * This is wasteful because we're going to TraceDecompose()
801 * it again in both hmmsearch and hmmpfam to look at
802 * individual domains; but we do it anyway so the P7SmallViterbi
803 * interface looks exactly like the P7Viterbi interface. Maybe
804 * long traces shouldn't include all the N/J/C states anyway,
805 * since they're unambiguously implied.
808 /* Calculate total trace len and alloc;
809 * nonemitting SNCT + nonemitting J's + emitting NJC
811 tlen += 4 + (ndom-1) + (L-totlen);
812 P7AllocTrace(tlen, &tr);
815 /* Add N-terminal trace framework
817 tr->statetype[0] = STS;
820 tr->statetype[1] = STN;
824 /* add implied N's */
825 for (pos = 1; pos <= ctr->pos[1]; pos++)
827 tr->statetype[tpos] = STN;
828 tr->nodeidx[tpos] = 0;
833 /* Add each subseq trace in, with its appropriate
834 * sequence offset derived from the collapsed trace
836 for (i = 0; i < ndom; i++)
837 { /* skip SN, CT framework at ends */
838 for (t2 = 2; t2 < tarr[i]->tlen-2; t2++)
840 tr->statetype[tpos] = tarr[i]->statetype[t2];
841 tr->nodeidx[tpos] = tarr[i]->nodeidx[t2];
842 if (tarr[i]->pos[t2] > 0)
843 tr->pos[tpos] = tarr[i]->pos[t2] + ctr->pos[i*2+1];
848 /* add nonemitting J or C */
849 tr->statetype[tpos] = (i == ndom-1) ? STC : STJ;
850 tr->nodeidx[tpos] = 0;
853 /* add implied emitting J's */
855 for (pos = ctr->pos[i*2+2]+1; pos <= ctr->pos[(i+1)*2+1]; pos++)
857 tr->statetype[tpos] = STJ;
858 tr->nodeidx[tpos] = 0;
864 /* add implied C's */
865 for (pos = ctr->pos[ndom*2]+1; pos <= L; pos++)
867 tr->statetype[tpos] = STC;
868 tr->nodeidx[tpos] = 0;
873 tr->statetype[tpos] = STT;
874 tr->nodeidx[tpos] = 0;
878 for (i = 0; i < ndom; i++) P7FreeTrace(tarr[i]);
889 /* Function: P7ParsingViterbi()
890 * Date: SRE, Wed Mar 4 14:07:31 1998 [St. Louis]
892 * Purpose: The "hmmfs" linear-memory algorithm for finding
893 * the optimal alignment of a very long sequence to
894 * a looping, multihit (e.g. Plan7) model, parsing it into
895 * a series of nonoverlapping subsequences that match
896 * the model once. Other algorithms (e.g. P7Viterbi()
897 * or P7WeeViterbi()) are applied subsequently to
898 * these subsequences to recover complete alignments.
900 * The hmmfs algorithm appears briefly in [Durbin98],
901 * but is otherwise unpublished.
903 * The traceback structure returned is special: a
904 * "collapsed" trace S->B->E->...->B->E->T, where
905 * stateidx is unused and pos is used to indicate the
906 * position of B and E in the sequence. The matched
907 * subsequence is B_pos+1...E_pos. The number of
908 * matches in the trace is (tlen/2)-1.
910 * Args: dsq - sequence in digitized form
912 * hmm - the model (log odds scores ready)
913 * ret_tr - RETURN: a collapsed traceback.
915 * Returns: Score of the optimal Viterbi alignment, in bits.
918 P7ParsingViterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr)
920 struct dpmatrix_s *mx; /* two rows of score matrix */
921 struct dpmatrix_s *tmx; /* two rows of misused score matrix: traceback ptrs */
922 struct p7trace_s *tr; /* RETURN: collapsed traceback */
923 int **xmx, **mmx, **dmx, **imx; /* convenience ptrs to score matrix */
924 int **xtr, **mtr, **dtr, **itr; /* convenience ptrs to traceback pointers */
925 int *btr, *etr; /* O(L) trace ptrs for B, E state pts in seq */
926 int sc; /* integer score of optimal alignment */
927 int i,k,tpos; /* index for seq, model, trace position */
928 int cur, prv; /* indices for rolling dp matrix */
929 int curralloc; /* size of allocation for tr */
932 /* Alloc a DP matrix and traceback pointers, two rows each, O(M).
933 * Alloc two O(L) arrays to trace back through the sequence thru B and E.
935 mx = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx);
936 tmx = AllocPlan7Matrix(2, hmm->M, &xtr, &mtr, &itr, &dtr);
937 btr = MallocOrDie(sizeof(int) * (L+1));
938 etr = MallocOrDie(sizeof(int) * (L+1));
940 /* Initialization of the zero row.
942 xmx[0][XMN] = 0; /* S->N, p=1 */
943 xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */
945 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */
947 for (k = 0; k <= hmm->M; k++)
948 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */
950 /* Recursion. Done as a pull. Rolling index trick. Trace ptr propagation trick.
951 * Note some slightly wasteful boundary conditions:
952 * tsc[0] = -INFTY for all eight transitions (no node 0)
953 * D_M and I_M are wastefully calculated (they don't exist)
955 * Notes on traceback pointer propagation.
956 * - In the path B->E, we propagate the i that B was aligned to in the optimal
957 * alignment, via mtr, dtr, and itr.
958 * - When we reach an E, we record the i of the B it started from in etr.
959 * - In a looping path E->J...->B or terminal path E->C...->T, we propagate
960 * the i that E was aligned to in the optimal alignment via xtr[][XMC]
962 * - When we enter B, we record the i of the best previous E, or 0 if there
965 for (i = 1; i <= L; i++) {
969 mmx[cur][0] = imx[cur][0] = dmx[cur][0] = -INFTY;
971 for (k = 1; k <= hmm->M; k++) {
973 mmx[cur][k] = -INFTY;
974 if ((sc = mmx[prv][k-1] + hmm->tsc[k-1][TMM]) > -INFTY)
975 { mmx[cur][k] = sc; mtr[cur][k] = mtr[prv][k-1]; }
976 if ((sc = imx[prv][k-1] + hmm->tsc[k-1][TIM]) > mmx[cur][k])
977 { mmx[cur][k] = sc; mtr[cur][k] = itr[prv][k-1]; }
978 if ((sc = xmx[prv][XMB] + hmm->bsc[k]) > mmx[cur][k])
979 { mmx[cur][k] = sc; mtr[cur][k] = i-1; }
980 if ((sc = dmx[prv][k-1] + hmm->tsc[k-1][TDM]) > mmx[cur][k])
981 { mmx[cur][k] = sc; mtr[cur][k] = dtr[prv][k-1]; }
982 if (hmm->msc[(int) dsq[i]][k] != -INFTY)
983 mmx[cur][k] += hmm->msc[(int) dsq[i]][k];
985 mmx[cur][k] = -INFTY;
988 dmx[cur][k] = -INFTY;
989 if ((sc = mmx[cur][k-1] + hmm->tsc[k-1][TMD]) > -INFTY)
990 { dmx[cur][k] = sc; dtr[cur][k] = mtr[cur][k-1]; }
991 if ((sc = dmx[cur][k-1] + hmm->tsc[k-1][TDD]) > dmx[cur][k])
992 { dmx[cur][k] = sc; dtr[cur][k] = dtr[cur][k-1]; }
996 imx[cur][k] = -INFTY;
997 if ((sc = mmx[prv][k] + hmm->tsc[k][TMI]) > -INFTY)
998 { imx[cur][k] = sc; itr[cur][k] = mtr[prv][k]; }
999 if ((sc = imx[prv][k] + hmm->tsc[k][TII]) > imx[cur][k])
1000 { imx[cur][k] = sc; itr[cur][k] = itr[prv][k]; }
1001 if (hmm->isc[(int) dsq[i]][k] != -INFTY)
1002 imx[cur][k] += hmm->isc[(int) dsq[i]][k];
1004 imx[cur][k] = -INFTY;
1008 /* Now the special states. Order is important here.
1009 * remember, C and J emissions are zero score by definition,
1012 xmx[cur][XMN] = -INFTY;
1013 if ((sc = xmx[prv][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
1016 xmx[cur][XME] = -INFTY;
1017 for (k = 1; k <= hmm->M; k++)
1018 if ((sc = mmx[cur][k] + hmm->esc[k]) > xmx[cur][XME])
1019 { xmx[cur][XME] = sc; etr[i] = mtr[cur][k]; }
1021 xmx[cur][XMJ] = -INFTY;
1022 if ((sc = xmx[prv][XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)
1023 { xmx[cur][XMJ] = sc; xtr[cur][XMJ] = xtr[prv][XMJ]; }
1024 if ((sc = xmx[cur][XME] + hmm->xsc[XTE][LOOP]) > xmx[cur][XMJ])
1025 { xmx[cur][XMJ] = sc; xtr[cur][XMJ] = i; }
1027 xmx[cur][XMB] = -INFTY;
1028 if ((sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
1029 { xmx[cur][XMB] = sc; btr[i] = 0; }
1030 if ((sc = xmx[cur][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[cur][XMB])
1031 { xmx[cur][XMB] = sc; btr[i] = xtr[cur][XMJ]; }
1033 xmx[cur][XMC] = -INFTY;
1034 if ((sc = xmx[prv][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
1035 { xmx[cur][XMC] = sc; xtr[cur][XMC] = xtr[prv][XMC]; }
1036 if ((sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > xmx[cur][XMC])
1037 { xmx[cur][XMC] = sc; xtr[cur][XMC] = i; }
1039 /* T state (not stored) */
1040 sc = xmx[cur][XMC] + hmm->xsc[XTC][MOVE];
1042 /*****************************************************************
1043 * Collapsed traceback stage.
1044 * xtr[L%2][XMC] contains the position j of the previous E
1045 * etr[j] contains the position i of the previous B
1046 * btr[i] contains the position j of the previous E, or 0
1047 * continue until btr[i] = 0.
1048 *****************************************************************/
1050 curralloc = 2; /* minimum: no hits */
1051 P7AllocTrace(curralloc, &tr);
1053 /* Init of collapsed trace. Back to front; we ReverseTrace() later.
1056 tr->statetype[tpos] = STT;
1062 P7ReallocTrace(tr, curralloc);
1065 tr->statetype[tpos] = STE;
1070 tr->statetype[tpos] = STB;
1076 tr->statetype[tpos] = STS;
1078 tr->tlen = tpos + 1;
1081 FreePlan7Matrix(mx);
1082 FreePlan7Matrix(tmx);
1090 /* Function: P7WeeViterbi()
1091 * Date: SRE, Wed Mar 4 08:24:04 1998 [St. Louis]
1093 * Purpose: Hirschberg/Myers/Miller linear memory alignment.
1094 * See [Hirschberg75,MyM-88a] for the idea of the algorithm.
1095 * Adapted to HMM implementation.
1097 * Requires that you /know/ that there's only
1098 * one hit to the model in the sequence: either
1099 * because you're forcing single-hit, or you've
1100 * previously called P7ParsingViterbi to parse
1101 * the sequence into single-hit segments. The reason
1102 * for this is that a cyclic model (a la Plan7)
1103 * defeats the nice divide and conquer trick.
1104 * (I think some trickery with propagated trace pointers
1105 * could get around this but haven't explored it.)
1106 * This is implemented by ignoring transitions
1109 * Args: dsq - sequence in digitized form
1112 * ret_tr - RETURN: traceback.
1114 * Returns: Score of the optimal Viterbi alignment.
1117 P7WeeViterbi(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr)
1119 struct p7trace_s *tr; /* RETURN: traceback */
1120 int *kassign; /* 0..L+1, alignment of seq positions to model nodes */
1121 char *tassign; /* 0..L+1, alignment of seq positions to state types */
1122 int *endlist; /* stack of end points on sequence to work on */
1123 int *startlist; /* stack of start points on sequence to work on */
1124 int lpos; /* position in endlist, startlist */
1125 int k1, k2, k3; /* start, mid, end in model */
1126 char t1, t2, t3; /* start, mid, end in state type */
1127 int s1, s2, s3; /* start, mid, end in sequence */
1128 float sc; /* score of segment optimal alignment */
1129 float ret_sc; /* optimal score over complete seq */
1130 int tlen; /* length needed for trace */
1131 int i, k, tpos; /* index in sequence, model, trace */
1136 kassign = MallocOrDie (sizeof(int) * (L+1));
1137 tassign = MallocOrDie (sizeof(char)* (L+1));
1138 endlist = MallocOrDie (sizeof(int) * (L+1));
1139 startlist = MallocOrDie (sizeof(int) * (L+1));
1142 startlist[lpos] = 1;
1145 kassign[L] = hmm->M;
1146 tassign[1] = STS; /* temporary boundary condition! will become N or M */
1147 tassign[L] = STT; /* temporary boundary condition! will become M or C */
1149 /* Recursive divide-and-conquer alignment.
1153 /* Pop a segment off the stack */
1154 s1 = startlist[lpos];
1161 /* find optimal midpoint of segment */
1162 sc = get_wee_midpt(hmm, dsq, L, k1, t1, s1, k3, t3, s3, &k2, &t2, &s2);
1165 /* score is valid on first pass */
1166 if (t1 == STS && t3 == STT) ret_sc = sc;
1168 /* push N-terminal segment on stack */
1169 if (t2 != STN && (s2 - s1 > 1 || (s2 - s1 == 1 && t1 == STS)))
1172 startlist[lpos] = s1;
1175 /* push C-terminal segment on stack */
1176 if (t2 != STC && (s3 - s2 > 1 || (s3 - s2 == 1 && t3 == STT)))
1179 startlist[lpos] = s2;
1184 { /* if we see STN midpoint, we know the whole N-term is STN */
1185 for (; s2 >= s1; s2--) {
1191 { /* if we see STC midpoint, we know whole C-term is STC */
1192 for (; s2 <= s3; s2++) {
1193 kassign[s2] = hmm->M;
1199 /*****************************************************************
1200 * Construct a traceback structure from kassign/tassign by interpolating
1202 * Trace allocation is as follows. We clearly need L emitting states.
1203 * We also need nonemitting states as follows:
1204 * STS,STN,STB,STE,STC,STT = 6
1205 * STD: count k2-k1-1 in kassign M->M's
1206 * Also, count N->M's and M->C's (potential wing unfoldings)...
1207 * ...and be careful to check wing unfoldings when there aren't
1208 * any emitting N or C flanks! (bugfix, 2.1.1b)
1209 *****************************************************************/
1212 for (i = 1; i < L; i++)
1214 if (tassign[i] == STM && tassign[i+1] == STM)
1215 tlen += kassign[i+1] - kassign[i] - 1;
1216 if (tassign[i] == STN && tassign[i+1] == STM)
1217 tlen += kassign[i+1] - 1;
1218 if (tassign[i] == STM && tassign[i+1] == STC)
1219 tlen += hmm->M - kassign[i];
1221 if (tassign[1] == STM) tlen += kassign[1] - 1;
1222 if (tassign[L] == STM) tlen += hmm->M - kassign[L];
1223 P7AllocTrace(tlen, &tr);
1225 tr->statetype[0] = STS;
1228 tr->statetype[1] = STN;
1233 for (i = 1; i <= L; i++)
1235 switch(tassign[i]) {
1237 /* check for first match state */
1238 if (tr->statetype[tpos-1] == STN) {
1239 tr->statetype[tpos] = STB;
1240 tr->nodeidx[tpos] = 0;
1243 /* check for wing unfolding */
1244 if (Prob2Score(hmm->begin[kassign[i]], hmm->p1) + INTSCALE <= hmm->bsc[kassign[i]])
1245 for (k = 1; k < kassign[i]; k++) {
1246 tr->statetype[tpos] = STD;
1247 tr->nodeidx[tpos] = k;
1252 /* do the match state itself */
1253 tr->statetype[tpos] = STM;
1254 tr->nodeidx[tpos] = kassign[i];
1257 /* do any deletes necessary 'til next match */
1258 if (i < L && tassign[i+1] == STM && kassign[i+1] - kassign[i] > 1)
1259 for (k = kassign[i] + 1; k < kassign[i+1]; k++)
1261 tr->statetype[tpos] = STD;
1262 tr->nodeidx[tpos] = k;
1266 /* check for last match state */
1267 if (i == L || tassign[i+1] == STC) {
1268 /* check for wing unfolding */
1269 if (Prob2Score(hmm->end[kassign[i-1]], 1.) + INTSCALE <= hmm->esc[kassign[i-1]])
1270 for (k = kassign[i]+1; k <= hmm->M; k++)
1272 tr->statetype[tpos] = STD;
1273 tr->nodeidx[tpos] = k;
1277 /* add on the end state */
1278 tr->statetype[tpos] = STE;
1279 tr->nodeidx[tpos] = 0;
1282 /* and a nonemitting C state */
1283 tr->statetype[tpos] = STC;
1284 tr->nodeidx[tpos] = 0;
1291 tr->statetype[tpos] = STI;
1292 tr->nodeidx[tpos] = kassign[i];
1298 tr->statetype[tpos] = STN;
1299 tr->nodeidx[tpos] = 0;
1305 tr->statetype[tpos] = STC;
1306 tr->nodeidx[tpos] = 0;
1311 default: Die("Bogus state %s", Statetype(tassign[i]));
1314 /* terminate the trace */
1315 tr->statetype[tpos] = STT;
1316 tr->nodeidx[tpos] = 0;
1330 /* Function: Plan7ESTViterbi()
1332 * Purpose: Frameshift-tolerant alignment of protein model to cDNA EST.
1337 Plan7ESTViterbi(char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s **ret_mx)
1339 struct dpmatrix_s *mx;
1348 /* Allocate a DP matrix with 0..L rows, 0..M+1 columns.
1350 mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx);
1352 /* Initialization of the zero row (DNA sequence of length 0)
1353 * Note that xmx[i][stN] = 0 by definition for all i,
1354 * and xmx[i][stT] = xmx[i][stC], so neither stN nor stT need
1355 * to be calculated in DP matrices.
1357 xmx[0][XMN] = 0; /* S->N, p=1 */
1358 xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */
1359 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */
1360 for (k = 0; k <= hmm->M; k++)
1361 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */
1363 /* Initialization of the first row (DNA sequence of length 1);
1364 * only N state can make this nucleotide.
1366 xmx[1][XMN] = xmx[0][XMN] + hmm->xsc[XTN][LOOP];
1367 xmx[1][XMB] = xmx[1][XMN] + hmm->xsc[XTN][MOVE];
1368 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need 2 nt to get here */
1369 for (k = 0; k <= hmm->M; k++)
1370 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need 2 nt to get into model */
1372 /* Recursion. Done as a pull.
1373 * Note some slightly wasteful boundary conditions:
1374 * tsc[0] = -INFTY for all eight transitions (no node 0)
1375 * D_M and I_M are wastefully calculated (they don't exist)
1377 for (i = 2; i <= L; i++) {
1378 mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY;
1380 /* crude calculation of lookup value for codon */
1382 if (dsq[i-2] < 4 && dsq[i-1] < 4 && dsq[i] < 4)
1383 codon = dsq[i-2] * 16 + dsq[i-1] * 4 + dsq[i];
1385 codon = 64; /* ambiguous codon; punt */
1388 for (k = 1; k <= hmm->M; k++) {
1391 mmx[i][k] = mmx[i-3][k-1] + hmm->tsc[k-1][TMM];
1392 if ((sc = imx[i-3][k-1] + hmm->tsc[k-1][TIM]) > mmx[i][k])
1394 if ((sc = xmx[i-3][XMB] + hmm->bsc[k]) > mmx[i][k])
1396 if ((sc = dmx[i-3][k-1] + hmm->tsc[k-1][TDM]) > mmx[i][k])
1398 mmx[i][k] += hmm->dnam[codon][k];
1400 /* -1 frameshifts into match state */
1401 if ((sc = mmx[i-2][k-1] + hmm->tsc[k-1][TMM] + hmm->dna2) > mmx[i][k])
1403 if ((sc = imx[i-2][k-1] + hmm->tsc[k-1][TIM] + hmm->dna2) > mmx[i][k])
1405 if ((sc = xmx[i-2][XMB] + hmm->bsc[k] + hmm->dna2) > mmx[i][k])
1407 if ((sc = dmx[i-2][k-1] + hmm->tsc[k-1][TDM] + hmm->dna2) > mmx[i][k])
1410 /* +1 frameshifts into match state */
1412 if ((sc = mmx[i-4][k-1] + hmm->tsc[k-1][TMM] + hmm->dna4) > mmx[i][k])
1414 if ((sc = imx[i-4][k-1] + hmm->tsc[k-1][TIM] + hmm->dna4) > mmx[i][k])
1416 if ((sc = xmx[i-4][XMB] + hmm->bsc[k] + hmm->dna4) > mmx[i][k])
1418 if ((sc = dmx[i-4][k-1] + hmm->tsc[k-1][TDM] + hmm->dna4) > mmx[i][k])
1422 dmx[i][k] = mmx[i][k-1] + hmm->tsc[k-1][TMD];
1423 if ((sc = dmx[i][k-1] + hmm->tsc[k-1][TDD]) > dmx[i][k])
1428 imx[i][k] = mmx[i-3][k] + hmm->tsc[k][TMI];
1429 if ((sc = imx[i-3][k] + hmm->tsc[k][TII]) > imx[i][k])
1431 imx[i][k] += hmm->dnai[codon][k];
1434 /* -1 frameshifts into insert state */
1435 if ((sc = mmx[i-2][k] + hmm->tsc[k][TMI] + hmm->dna2) > imx[i][k])
1437 if ((sc = imx[i-2][k] + hmm->tsc[k][TII] + hmm->dna2) > imx[i][k])
1440 /* +1 frameshifts into insert state */
1442 if ((sc = mmx[i-4][k] + hmm->tsc[k][TMI] + hmm->dna4) > imx[i][k])
1444 if ((sc = imx[i-4][k] + hmm->tsc[k][TII] + hmm->dna4) > imx[i][k])
1448 /* Now the special states. Order is important here.
1449 * remember, C and J emissions are zero score by definition,
1451 /* N state: +1 nucleotide */
1452 xmx[i][XMN] = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP];
1453 /* E state: collect from M's, and last D */
1454 xmx[i][XME] = dmx[i][hmm->M]; /* transition prob from last D = 1.0 */
1455 for (k = 1; k <= hmm->M; k++)
1456 if ((sc = mmx[i][k] + hmm->esc[k]) > xmx[i][XME])
1458 /* J state: +1 nucleotide */
1459 xmx[i][XMJ] = xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP];
1460 if ((sc = xmx[i][XME] + hmm->xsc[XTE][LOOP]) > xmx[i][XMJ])
1462 /* B state: collect from N,J */
1463 xmx[i][XMB] = xmx[i][XMN] + hmm->xsc[XTN][MOVE];
1464 if ((sc = xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[i][XMB])
1466 /* C state: +1 nucleotide */
1467 xmx[i][XMC] = xmx[i-1][XMC] + hmm->xsc[XTC][LOOP];
1468 if ((sc = xmx[i][XME] + hmm->xsc[XTE][MOVE]) > xmx[i][XMC])
1472 sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];
1474 if (ret_mx != NULL) *ret_mx = mx;
1475 else FreePlan7Matrix(mx);
1477 return Scorify(sc); /* the total Viterbi score. */
1482 /* Function: get_wee_midpt()
1483 * Date: SRE, Wed Mar 4 08:27:11 1998 [St. Louis]
1485 * Purpose: The heart of the divide and conquer algorithm
1486 * for P7WeeViterbi(). This function is called
1487 * recursively to find successive optimal midpoints
1488 * in the alignment matrix. See P7WeeViterbi() for
1489 * further comments on the assumptions of this algorithm.
1491 * Args: hmm - the model, set up for integer scores
1492 * dsq - the sequence, digitized
1493 * L - length of the sequence
1494 * k1 - model node to start with, 1..M
1495 * t1 - state type to start with, STM | STI | STN | STC; STS to start
1496 * s1 - sequence position to start with, 1..L; 1 to start
1497 * k3 - model node to end with, 1..M
1498 * t3 - state type to end with, STM | STI | STN | STC; STT to start
1499 * s3 - sequence position to end with, 1..L; L to start
1500 * ret_k2 - RETURN: optimal midpoint, node position in model
1501 * ret_t2 - RETURN: optimal midpoint, state type
1502 * ret_s2 - RETURN: optimal midpoint, sequence position
1504 * Returns: score of optimal alignment, in bits.
1507 get_wee_midpt(struct plan7_s *hmm, char *dsq, int L,
1508 int k1, char t1, int s1,
1509 int k3, char t3, int s3,
1510 int *ret_k2, char *ret_t2, int *ret_s2)
1512 struct dpmatrix_s *fwd;
1513 struct dpmatrix_s *bck;
1514 int **xmx; /* convenience ptr into special states */
1515 int **mmx; /* convenience ptr into match states */
1516 int **imx; /* convenience ptr into insert states */
1517 int **dmx; /* convenience ptr into delete states */
1521 int cur, prv, nxt; /* current, previous, next row index (0 or 1)*/
1522 int i,k; /* indices for seq, model */
1523 int sc; /* integer score */
1524 int max; /* maximum integer score */
1525 int start; /* s1 to start at (need, for STS special case) */
1528 /* Choose our midpoint.
1529 * Special cases: s1, s3 adjacent and t1 == STS: s2 = s1
1530 * s1, s3 adjacent and t3 == STT: s2 = s3
1531 * (where we must replace STS, STT eventually)
1533 s2 = s1 + (s3-s1) / 2;
1534 if (s3-s1 == 1 && t1 == STS) s2 = s1;
1535 if (s3-s1 == 1 && t3 == STT) s2 = s3;
1537 /* STS is a special case. STS aligns to row zero by convention,
1538 * but we'll be passed s1=1, t1=STS. We have to init on row
1539 * zero then start DP on row 1.
1541 start = (t1 == STS) ? 0 : s1;
1543 /* Allocate our forward two rows.
1544 * Initialize row zero.
1546 fwd = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx);
1548 xmx[cur][XMN] = xmx[cur][XMB] = -INFTY;
1549 xmx[cur][XME] = xmx[cur][XMC] = -INFTY;
1550 for (k = k1; k <= k3; k++)
1551 mmx[cur][k] = imx[cur][k] = dmx[cur][k] = -INFTY;
1553 /* Where to put our zero for our start point...
1554 * (only possible to start on an emitting state; J disallowed)
1557 case STM: mmx[cur][k1] = 0; break;
1558 case STI: imx[cur][k1] = 0; break;
1559 case STN: xmx[cur][XMN] = 0; break;
1560 case STC: xmx[cur][XMC] = 0; break;
1561 case STS: xmx[cur][XMN] = 0; break;
1562 default: Die("you can't init get_wee_midpt with a %s\n", Statetype(t1));
1565 /* Still initializing.
1566 * Deal with pulling horizontal matrix moves in initial row.
1567 * These are any transitions to nonemitters:
1571 * STC-> (T, but we never observe this in the forward pass of a d&c)
1573 * STS-> (N, already implied by setting xmx[cur][XMN] = 0)
1578 for (k = k1+1; k <= k3; k++)
1579 { /* transits into STD */
1580 dmx[cur][k] = -INFTY;
1581 if ((sc = mmx[cur][k-1] + hmm->tsc[k-1][TMD]) > -INFTY)
1583 if ((sc = dmx[cur][k-1] + hmm->tsc[k-1][TDD]) > dmx[cur][k])
1586 /* transit into STE */
1587 xmx[cur][XME] = -INFTY;
1588 if ((sc = mmx[cur][k1] + hmm->esc[k1]) > -INFTY)
1591 /* transit into STB from STN */
1592 xmx[cur][XMB] = -INFTY;
1593 if ((sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
1595 /* transit into STC from STE */
1596 xmx[cur][XMC] = -INFTY;
1597 if ((sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > -INFTY)
1600 /* Done initializing.
1601 * Start recursive DP; sweep forward to chosen s2 midpoint. Done as a pull.
1603 for (i = start+1; i <= s2; i++) {
1607 mmx[cur][k1] = imx[cur][k1] = dmx[cur][k1] = -INFTY;
1609 /* Insert state in column k1, and B->M transition in k1.
1612 imx[cur][k1] = -INFTY;
1613 if ((sc = mmx[prv][k1] + hmm->tsc[k1][TMI]) > -INFTY)
1615 if ((sc = imx[prv][k1] + hmm->tsc[k1][TII]) > imx[cur][k1])
1617 if (hmm->isc[(int) dsq[i]][k1] != -INFTY)
1618 imx[cur][k1] += hmm->isc[(int) dsq[i]][k1];
1620 imx[cur][k1] = -INFTY;
1622 if ((sc = xmx[prv][XMB] + hmm->bsc[k1]) > -INFTY)
1624 if (hmm->msc[(int) dsq[i]][k1] != -INFTY)
1625 mmx[cur][k1] += hmm->msc[(int) dsq[i]][k1];
1627 mmx[cur][k1] = -INFTY;
1629 /* Main chunk of recursion across model positions
1631 for (k = k1+1; k <= k3; k++) {
1633 mmx[cur][k] = -INFTY;
1634 if ((sc = mmx[prv][k-1] + hmm->tsc[k-1][TMM]) > -INFTY)
1636 if ((sc = imx[prv][k-1] + hmm->tsc[k-1][TIM]) > mmx[cur][k])
1638 if ((sc = xmx[prv][XMB] + hmm->bsc[k]) > mmx[cur][k])
1640 if ((sc = dmx[prv][k-1] + hmm->tsc[k-1][TDM]) > mmx[cur][k])
1642 if (hmm->msc[(int) dsq[i]][k] != -INFTY)
1643 mmx[cur][k] += hmm->msc[(int) dsq[i]][k];
1645 mmx[cur][k] = -INFTY;
1648 dmx[cur][k] = -INFTY;
1650 if ((sc = mmx[cur][k-1] + hmm->tsc[k-1][TMD]) > -INFTY)
1652 if ((sc = dmx[cur][k-1] + hmm->tsc[k-1][TDD]) > dmx[cur][k])
1657 imx[cur][k] = -INFTY;
1659 if ((sc = mmx[prv][k] + hmm->tsc[k][TMI]) > -INFTY)
1661 if ((sc = imx[prv][k] + hmm->tsc[k][TII]) > imx[cur][k])
1663 if (hmm->isc[(int) dsq[i]][k] != -INFTY)
1664 imx[cur][k] += hmm->isc[(int) dsq[i]][k];
1666 imx[cur][k] = -INFTY;
1670 xmx[cur][XMN] = -INFTY;
1671 if ((sc = xmx[prv][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
1674 xmx[cur][XME] = -INFTY;
1675 for (k = k1; k <= k3 && k <= hmm->M; k++)
1676 if ((sc = mmx[cur][k] + hmm->esc[k]) > xmx[cur][XME])
1679 xmx[cur][XMB] = -INFTY;
1680 if ((sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
1683 xmx[cur][XMC] = -INFTY;
1684 if ((sc = xmx[prv][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
1686 if ((sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > xmx[cur][XMC])
1690 /* Row s2%2 in fwd matrix now contains valid scores from s1 (start) to s2,
1691 * with J transitions disallowed (no cycles through model).
1694 /*****************************************************************
1696 *****************************************************************/
1698 /* Allocate our backwards two rows. Init last row.
1700 bck = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx);
1702 xmx[nxt][XMN] = xmx[nxt][XMB] = -INFTY;
1703 xmx[nxt][XME] = xmx[nxt][XMC] = -INFTY;
1704 for (k = k1; k <= k3 + 1; k++)
1705 mmx[nxt][k] = imx[nxt][k] = dmx[nxt][k] = -INFTY;
1707 mmx[cur][k3+1] = imx[cur][k3+1] = dmx[cur][k3+1] = -INFTY;
1709 /* Where to put the zero for our end point on last row.
1712 case STM: mmx[nxt][k3] = 0; break;
1713 case STI: imx[nxt][k3] = 0; break;
1714 case STN: xmx[nxt][XMN] = 0; break;
1715 case STC: xmx[nxt][XMC] = 0; break; /* must be an emitting C */
1716 case STT: xmx[nxt][XMC] = hmm->xsc[XTC][MOVE]; break; /* C->T implied */
1717 default: Die("you can't init get_wee_midpt with a %s\n", Statetype(t3));
1720 /* Still initializing.
1721 * In the case t3==STT, there are a few horizontal moves possible
1722 * on row s3, because STT isn't an emitter. All other states are
1723 * emitters, so their connections have to be to the previous row s3-1.
1727 xmx[nxt][XME] = xmx[nxt][XMC] + hmm->xsc[XTE][MOVE];
1729 for (k = k3; k >= k1; k--) {
1730 mmx[nxt][k] = xmx[nxt][XME] + hmm->esc[k];
1732 mmx[nxt][k] += hmm->msc[(int)dsq[s3]][k];
1736 /* Start recursive DP; sweep backwards to chosen s2 midpoint.
1737 * Done as a pull. M, I scores at current row do /not/ include
1738 * emission scores. Be careful of integer underflow.
1740 for (i = s3-1; i >= s2; i--) {
1741 /* note i < L, so i+1 is always a legal index */
1744 /* C pulls from C (T is special cased) */
1745 xmx[cur][XMC] = -INFTY;
1746 if ((sc = xmx[nxt][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
1748 /* B pulls from M's */
1749 xmx[cur][XMB] = -INFTY;
1750 for (k = k1; k <= k3; k++)
1751 if ((sc = mmx[nxt][k] + hmm->bsc[k]) > xmx[cur][XMB])
1753 /* E pulls from C (J disallowed) */
1754 xmx[cur][XME] = -INFTY;
1755 if ((sc = xmx[cur][XMC] + hmm->xsc[XTE][MOVE]) > -INFTY)
1757 /* N pulls from B, N */
1758 xmx[cur][XMN] = -INFTY;
1759 if ((sc = xmx[cur][XMB] + hmm->xsc[XTN][MOVE]) > -INFTY)
1761 if ((sc = xmx[nxt][XMN] + hmm->xsc[XTN][LOOP]) > xmx[cur][XMN])
1764 /* Main recursion across model
1766 for (k = k3; k >= k1; k--) {
1767 /* special case k == M */
1769 mmx[cur][k] = xmx[cur][XME]; /* p=1 transition to E by definition */
1770 dmx[cur][k] = -INFTY; /* doesn't exist */
1771 imx[cur][k] = -INFTY; /* doesn't exist */
1773 mmx[cur][k] += hmm->msc[(int)dsq[i]][k];
1775 } /* below this k < M, so k+1 is a legal index */
1777 /* pull into match state */
1778 mmx[cur][k] = -INFTY;
1779 if ((sc = xmx[cur][XME] + hmm->esc[k]) > -INFTY)
1781 if ((sc = mmx[nxt][k+1] + hmm->tsc[k][TMM]) > mmx[cur][k])
1783 if ((sc = imx[nxt][k] + hmm->tsc[k][TMI]) > mmx[cur][k])
1785 if ((sc = dmx[cur][k+1] + hmm->tsc[k][TMD]) > mmx[cur][k])
1788 mmx[cur][k] += hmm->msc[(int)dsq[i]][k];
1790 /* pull into delete state */
1791 dmx[cur][k] = -INFTY;
1792 if ((sc = mmx[nxt][k+1] + hmm->tsc[k][TDM]) > -INFTY)
1794 if ((sc = dmx[cur][k+1] + hmm->tsc[k][TDD]) > dmx[cur][k])
1796 /* pull into insert state */
1797 imx[cur][k] = -INFTY;
1798 if ((sc = mmx[nxt][k+1] + hmm->tsc[k][TIM]) > -INFTY)
1800 if ((sc = imx[nxt][k] + hmm->tsc[k][TII]) > imx[cur][k])
1803 imx[cur][k] += hmm->isc[(int)dsq[i]][k];
1808 /*****************************************************************
1809 * DP complete; we have both forward and backward passes. Now we
1810 * look across the s2 row and find the optimal emitting state.
1811 *****************************************************************/
1815 for (k = k1; k <= k3; k++)
1817 if ((sc = fwd->mmx[cur][k] + bck->mmx[cur][k]) > max)
1818 { k2 = k; t2 = STM; max = sc; }
1819 if ((sc = fwd->imx[cur][k] + bck->imx[cur][k]) > max)
1820 { k2 = k; t2 = STI; max = sc; }
1822 if ((sc = fwd->xmx[cur][XMN] + bck->xmx[cur][XMN]) > max)
1823 { k2 = 1; t2 = STN; max = sc; }
1824 if ((sc = fwd->xmx[cur][XMC] + bck->xmx[cur][XMC]) > max)
1825 { k2 = hmm->M; t2 = STC; max = sc; }
1827 /*****************************************************************
1828 * Garbage collection, return.
1829 *****************************************************************/
1831 FreePlan7Matrix(fwd);
1832 FreePlan7Matrix(bck);
1836 return Scorify(max);
1840 /* Function: P7ViterbiAlignAlignment()
1841 * Date: SRE, Sat Jul 4 13:39:00 1998 [St. Louis]
1843 * Purpose: Align a multiple alignment to an HMM without
1844 * changing the multiple alignment itself.
1845 * Adapted from P7Viterbi().
1847 * Heuristic; not a guaranteed optimal alignment.
1848 * Guaranteeing an optimal alignment appears difficult.
1849 * [cryptic note to myself:] In paths connecting to I* metastates,
1850 * recursion breaks down; if there is a gap in the
1851 * previous column for a given seq, we can't determine what state the
1852 * I* metastate corresponds to for this sequence, unless we
1853 * look back in the DP matrix. The lookback would either involve
1854 * recursing back to the previous M* metastate (giving a
1855 * O(MN^2) algorithm instead of O(MN)) or expanding the I*
1856 * metastate into 3^nseq separate I* metastates to keep track
1857 * of which of three states each seq is in. Since the second
1858 * option blows up exponentially w/ nseq, it is not attractive.
1859 * If the first option were used, the correct algorithm would be related to
1860 * modelmakers.c:Maxmodelmaker(), but somewhat more difficult.
1862 * The heuristic approach here is to calculate a "consensus"
1863 * sequence from the alignment, and align the consensus to the HMM.
1864 * Some hackery is employed, weighting transitions and emissions
1865 * to make things work (re: con and mocc arrays).
1867 * Args: aseq - aligned sequences
1868 * ainfo - info for aseqs (includes alen, nseq, wgt)
1869 * hmm - model to align to
1871 * Returns: Traceback. Caller must free with P7FreeTrace().
1872 * pos[] contains alignment columns, indexed 1..alen.
1873 * statetype[] contains metastates M*, etc. as STM, etc.
1876 P7ViterbiAlignAlignment(MSA *msa, struct plan7_s *hmm)
1878 struct dpmatrix_s *mx; /* Viterbi calculation lattice (two rows) */
1879 struct dpshadow_s *tb; /* shadow matrix of traceback pointers */
1880 struct p7trace_s *tr; /* RETURN: traceback */
1881 int **xmx, **mmx, **imx, **dmx;
1882 char **xtb, **mtb, **itb, **dtb;
1883 float **con; /* [1..alen][0..Alphabet_size-1], consensus counts */
1884 float *mocc; /* fractional occupancy of a column; used to weight transitions */
1885 int i; /* counter for columns */
1886 int k; /* counter for model positions */
1887 int idx; /* counter for seqs */
1888 int sym; /* counter for alphabet symbols */
1889 int sc; /* temp variable for holding score */
1890 float denom; /* total weight of seqs; used to "normalize" counts */
1893 /* The "consensus" is a counts matrix, [1..alen][0..Alphabet_size-1].
1894 * Gaps are not counted explicitly, but columns with lots of gaps get
1895 * less total weight because they have fewer counts.
1898 con = MallocOrDie(sizeof(float *) * (msa->alen+1));
1899 mocc = MallocOrDie(sizeof(float) * (msa->alen+1));
1900 for (i = 1; i <= msa->alen; i++) {
1901 con[i] = MallocOrDie(sizeof(float) * Alphabet_size);
1902 FSet(con[i], Alphabet_size, 0.0);
1905 /* initialization */
1906 /* note: aseq is off by one, 0..alen-1 */
1907 /* "normalized" to have a max total count of 1 per col */
1908 denom = FSum(msa->wgt, msa->nseq);
1909 for (i = 1; i <= msa->alen; i++)
1911 for (idx = 0; idx < msa->nseq; idx++)
1912 if (! isgap(msa->aseq[idx][i-1]))
1913 P7CountSymbol(con[i], SYMIDX(msa->aseq[idx][i-1]), msa->wgt[idx]);
1914 FScale(con[i], Alphabet_size, 1./denom);
1915 mocc[i] = FSum(con[i], Alphabet_size);
1918 /* Allocate a DP matrix with 2 rows, 0..M columns,
1919 * and a shadow matrix with 0,1..alen rows, 0..M columns.
1921 mx = AllocPlan7Matrix(2, hmm->M, &xmx, &mmx, &imx, &dmx);
1922 tb = AllocShadowMatrix(msa->alen+1, hmm->M, &xtb, &mtb, &itb, &dtb);
1924 /* Initialization of the zero row.
1926 xmx[0][XMN] = 0; /* S->N, p=1 */
1928 xmx[0][XMB] = hmm->xsc[XTN][MOVE]; /* S->N->B, no N-tail */
1930 xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY; /* need seq to get here */
1932 xtb[0][XMC] = xtb[0][XMJ] = STBOGUS;
1933 for (k = 0; k <= hmm->M; k++) {
1934 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY; /* need seq to get here */
1935 mtb[0][k] = itb[0][k] = dtb[0][k] = STBOGUS;
1938 /* Recursion. Done as a pull.
1939 * Note some slightly wasteful boundary conditions:
1940 * tsc[0] = -INFTY for all eight transitions (no node 0)
1941 * D_M and I_M are wastefully calculated (they don't exist)
1943 for (i = 1; i <= msa->alen; i++) {
1947 mmx[cur][0] = imx[cur][0] = dmx[cur][0] = -INFTY;
1948 mtb[i][0] = itb[i][0] = dtb[i][0] = STBOGUS;
1950 for (k = 1; k <= hmm->M; k++) {
1952 mmx[cur][k] = -INFTY;
1953 mtb[i][k] = STBOGUS;
1954 if (mmx[prv][k-1] > -INFTY && hmm->tsc[k-1][TMM] > -INFTY &&
1955 (sc = mmx[prv][k-1] + hmm->tsc[k-1][TMM]) > mmx[cur][k])
1956 { mmx[cur][k] = sc; mtb[i][k] = STM; }
1957 if (imx[prv][k-1] > -INFTY && hmm->tsc[k-1][TIM] > -INFTY &&
1958 (sc = imx[prv][k-1] + hmm->tsc[k-1][TIM] * mocc[i-1]) > mmx[cur][k])
1959 { mmx[cur][k] = sc; mtb[i][k] = STI; }
1960 if ((sc = xmx[prv][XMB] + hmm->bsc[k]) > mmx[cur][k])
1961 { mmx[cur][k] = sc; mtb[i][k] = STB; }
1962 if (dmx[prv][k-1] > -INFTY && hmm->tsc[k-1][TDM] > -INFTY &&
1963 (sc = dmx[prv][k-1] + hmm->tsc[k-1][TDM]) > mmx[cur][k])
1964 { mmx[cur][k] = sc; mtb[i][k] = STD; }
1965 /* average over "consensus" sequence */
1966 for (sym = 0; sym < Alphabet_size; sym++)
1968 if (con[i][sym] > 0 && hmm->msc[sym][k] == -INFTY) { mmx[cur][k] = -INFTY; break; }
1969 mmx[cur][k] += hmm->msc[sym][k] * con[i][sym];
1973 dmx[cur][k] = -INFTY;
1974 dtb[i][k] = STBOGUS;
1975 if (mmx[cur][k-1] > -INFTY && hmm->tsc[k-1][TMD] > -INFTY &&
1976 (sc = mmx[cur][k-1] + hmm->tsc[k-1][TMD]) > dmx[cur][k])
1977 { dmx[cur][k] = sc; dtb[i][k] = STM; }
1978 if (dmx[cur][k-1] > -INFTY && hmm->tsc[k-1][TDD] > -INFTY &&
1979 (sc = dmx[cur][k-1] + hmm->tsc[k-1][TDD]) > dmx[cur][k])
1980 { dmx[cur][k] = sc; dtb[i][k] = STD; }
1984 imx[cur][k] = -INFTY;
1985 itb[i][k] = STBOGUS;
1986 if (mmx[prv][k] > -INFTY && hmm->tsc[k][TMI] > -INFTY &&
1987 (sc = mmx[prv][k] + hmm->tsc[k][TMI] * mocc[i]) > imx[cur][k])
1988 { imx[cur][k] = sc; itb[i][k] = STM; }
1989 if (imx[prv][k] > -INFTY && hmm->tsc[k][TII] > -INFTY &&
1990 (sc = imx[prv][k] + hmm->tsc[k][TII] * mocc[i-1] * mocc[i]) > imx[cur][k])
1991 { imx[cur][k] = sc; itb[i][k] = STI; }
1992 /* average over "consensus" sequence */
1993 for (sym = 0; sym < Alphabet_size; sym++)
1995 if (con[i][sym] > 0 && hmm->isc[sym][k] == -INFTY) { imx[cur][k] = -INFTY; break; }
1996 imx[cur][k] += hmm->isc[sym][k] * con[i][sym];
2001 /* Now the special states. Order is important here.
2002 * remember, N, C, and J emissions are zero score by definition.
2005 xmx[cur][XMN] = -INFTY;
2006 xtb[i][XMN] = STBOGUS;
2007 if (xmx[prv][XMN] > -INFTY && hmm->xsc[XTN][LOOP] > -INFTY &&
2008 (sc = xmx[prv][XMN] + hmm->xsc[XTN][LOOP] * mocc[i]) > -INFTY)
2009 { xmx[cur][XMN] = sc; xtb[i][XMN] = STN; }
2011 xmx[cur][XME] = -INFTY;
2012 xtb[i][XME] = STBOGUS;
2013 for (k = 1; k <= hmm->M; k++)
2014 if (mmx[cur][k] > -INFTY && hmm->esc[k] > -INFTY &&
2015 (sc = mmx[cur][k] + hmm->esc[k]) > xmx[cur][XME])
2016 { xmx[cur][XME] = sc; tb->esrc[i] = k; }
2018 /* we don't check J state */
2019 /* B state; don't connect from J */
2020 xmx[cur][XMB] = -INFTY;
2021 xtb[i][XMB] = STBOGUS;
2022 if (xmx[cur][XMN] > -INFTY && hmm->xsc[XTN][MOVE] > -INFTY &&
2023 (sc = xmx[cur][XMN] + hmm->xsc[XTN][MOVE]) > xmx[cur][XMB])
2024 { xmx[cur][XMB] = sc; xtb[i][XMB] = STN; }
2027 xmx[cur][XMC] = -INFTY;
2028 xtb[i][XMC] = STBOGUS;
2029 if (xmx[prv][XMC] > -INFTY && hmm->xsc[XTC][LOOP] > -INFTY &&
2030 (sc = xmx[prv][XMC] + hmm->xsc[XTC][LOOP] * mocc[i]) > -INFTY)
2031 { xmx[cur][XMC] = sc; xtb[i][XMC] = STC; }
2032 if (xmx[cur][XME] > -INFTY && hmm->xsc[XTE][MOVE] > -INFTY &&
2033 (sc = xmx[cur][XME] + hmm->xsc[XTE][MOVE]) > xmx[cur][XMC])
2034 { xmx[cur][XMC] = sc; xtb[i][XMC] = STE; }
2036 /* T state (not stored in mx) */
2037 sc = xmx[msa->alen%2][XMC] + hmm->xsc[XTC][MOVE];
2039 /* do the traceback */
2040 tr = ShadowTrace(tb, hmm, msa->alen);
2041 /* cleanup and return */
2042 FreePlan7Matrix(mx);
2043 FreeShadowMatrix(tb);
2044 for (i = 1; i <= msa->alen; i++)
2054 /* Function: ShadowTrace()
2055 * Date: SRE, Sun Jul 5 11:38:24 1998 [St. Louis]
2057 * Purpose: Given a shadow matrix, trace it back, and return
2060 * Args: tb - shadow matrix of traceback pointers
2061 * hmm - the model (needed for figuring out wing unfolding)
2062 * L - sequence length
2064 * Returns: traceback. Caller must free w/ P7FreeTrace().
2067 ShadowTrace(struct dpshadow_s *tb, struct plan7_s *hmm, int L)
2069 struct p7trace_s *tr;
2070 int curralloc; /* current allocated length of trace */
2071 int tpos; /* position in trace */
2072 int i; /* position in seq (1..N) */
2073 int k; /* position in model (1..M) */
2074 char nxtstate; /* next state to assign in traceback */
2076 /* Overallocate for the trace.
2077 * S-N-B- ... - E-C-T : 6 states + L is minimum trace;
2078 * add L more as buffer.
2080 curralloc = L * 2 + 6;
2081 P7AllocTrace(curralloc, &tr);
2083 /* Initialization of trace
2084 * We do it back to front; ReverseTrace() is called later.
2086 tr->statetype[0] = STT;
2090 i = L; /* current i (seq pos) we're trying to assign */
2091 k = 0; /* current k (model pos) we're trying to assign */
2092 nxtstate = STC; /* assign the C state first, for C->T */
2096 while (nxtstate != STS) {
2099 tr->statetype[tpos] = STM;
2100 nxtstate = tb->mtb[i][k];
2101 tr->nodeidx[tpos] = k--;
2102 tr->pos[tpos] = i--;
2107 tr->statetype[tpos] = STI;
2108 nxtstate = tb->itb[i][k];
2109 tr->nodeidx[tpos] = k;
2110 tr->pos[tpos] = i--;
2115 tr->statetype[tpos] = STD;
2116 nxtstate = tb->dtb[i][k];
2117 tr->nodeidx[tpos] = k--;
2123 tr->statetype[tpos] = STN;
2124 nxtstate = tb->xtb[i][XMN];
2125 tr->nodeidx[tpos] = 0;
2126 tr->pos[tpos] = (nxtstate == STN) ? i-- : 0; /* N->N; 2nd one emits. */
2131 /* Check for wing unfolding */
2132 if (Prob2Score(hmm->begin[k+1], hmm->p1) + 1 * INTSCALE <= hmm->bsc[k+1])
2135 tr->statetype[tpos] = STD;
2136 tr->nodeidx[tpos] = k--;
2139 if (tpos == curralloc)
2140 { /* grow trace if necessary */
2142 P7ReallocTrace(tr, curralloc);
2146 tr->statetype[tpos] = STB;
2147 nxtstate = tb->xtb[i][XMB];
2148 tr->nodeidx[tpos] = 0;
2154 tr->statetype[tpos] = STJ;
2155 nxtstate = tb->xtb[i][XMJ];
2156 tr->nodeidx[tpos] = 0;
2157 tr->pos[tpos] = (nxtstate == STJ) ? i-- : 0; /* J->J; 2nd one emits. */
2162 tr->statetype[tpos] = STE;
2163 tr->nodeidx[tpos] = 0;
2168 /* check for wing unfolding */
2169 if (Prob2Score(hmm->end[k], 1.) + 1*INTSCALE <= hmm->esc[k])
2171 int dk; /* need a tmp k while moving thru delete wing */
2172 for (dk = hmm->M; dk > k; dk--)
2174 tr->statetype[tpos] = STD;
2175 tr->nodeidx[tpos] = dk;
2178 if (tpos == curralloc)
2179 { /* grow trace if necessary */
2181 P7ReallocTrace(tr, curralloc);
2188 tr->statetype[tpos] = STC;
2189 nxtstate = tb->xtb[i][XMC];
2190 tr->nodeidx[tpos] = 0;
2191 tr->pos[tpos] = (nxtstate == STC) ? i-- : 0; /* C->C; 2nd one emits. */
2196 Die("HMMER: Bad state (%s) in ShadowTrace()\n", Statetype(nxtstate));
2198 } /* end switch over nxtstate */
2200 if (tpos == curralloc)
2201 { /* grow trace if necessary */
2203 P7ReallocTrace(tr, curralloc);
2206 } /* end traceback, just before assigning S state */
2208 tr->statetype[tpos] = STS;
2209 tr->nodeidx[tpos] = 0;
2211 tr->tlen = tpos + 1;
2219 /* Function: PostprocessSignificantHit()
2220 * Date: SRE, Wed Dec 20 12:11:01 2000 [StL]
2222 * Purpose: Add a significant hit to per-seq and per-domain hit
2223 * lists, after postprocessing the scores appropriately,
2224 * and making sure per-domain scores add up to the per-seq
2227 * [doesn't really belong in core_algorithms.c, because
2228 * it's more of a hack than an algorithm, but on the other
2229 * hand it's now part of the core of how HMMER scores
2230 * things. Maybe there should be a core_hacks.c.]
2232 * Given: active hit lists for per-seq and per-domain
2233 * scores (e.g. hmmpfam and hmmsearch, collating their
2234 * results), and a new hit that's significant enough
2235 * that it may need to be reported in final output.
2237 * Breaks the traceback into individual domain traces;
2238 * scores each one of them, then applies null2 correction
2239 * for biased composition. Recalculates the per-seq score
2240 * as the sum of the per-domain scores. Stores the hits
2241 * in the lists, for eventual sorting and output by the
2244 * Notes: In principle we've got the score, and a pvalue, and a traceback
2245 * by doing the Viterbi algorithm, right? What else is left
2246 * to do? Well, in practice, life is more complicated, because
2247 * of the trace-dependent null2 score correction.
2249 * After a null2 score correction is carried out on
2250 * each domain (the default) the number of detected domains
2251 * with scores > 0 may have decreased. We want the
2252 * global (per-seq) hit list to have the recalculated number of
2253 * domains, not necessarily what Viterbi gave us.
2255 * Also, since we want the global score to be the sum of
2256 * the individual domains, but the null2 correction is
2257 * applied to each domain individually, we have to calculate
2258 * an adjusted global score. (To do otherwise invites
2259 * subtle inconsistencies; xref bug 2.)
2261 * We don't have final evalues, so we may put a few
2262 * more hits into the hit lists than we end up reporting.
2263 * The main output routine is responsible for final
2264 * enforcement of the thresholds.
2266 * This routine is NOT THREADSAFE. When multithreaded,
2267 * with using shared ghit/dhit output buffers, calls to
2268 * PostprocessSignificantHit() need to be protected.
2270 * Args: ghit - an active list of per-seq (global) hits
2271 * dhit - an active list of per-domain hits
2272 * tr - the significant HMM/seq traceback we'll report on
2273 * hmm - ptr to the HMM
2274 * dsq - digitized sequence (1..L)
2276 * seqname - name of sequence (same as targname, in hmmsearch)
2277 * seqacc - seq's accession (or NULL)
2278 * seqdesc - seq's description (or NULL)
2279 * do_forward - TRUE if we've already calculated final per-seq score
2280 * sc_override - per-seq score to use if do_forward is TRUE
2281 * do_null2 - TRUE to apply the null2 scoring correction
2282 * thresh - contains the threshold/cutoff information.
2283 * hmmpfam_mode - TRUE if called by hmmpfam, else assumes hmmsearch;
2284 * affects how the lists' sort keys are set.
2289 PostprocessSignificantHit(struct tophit_s *ghit,
2290 struct tophit_s *dhit,
2291 struct p7trace_s *tr,
2292 struct plan7_s *hmm,
2301 struct threshold_s *thresh,
2304 struct p7trace_s **tarr; /* array of per-domain traces */
2305 struct fancyali_s *ali; /* alignment of a domain */
2306 int ntr; /* number of domain traces from Viterbi */
2307 int tidx; /* index for traces (0..ntr-1) */
2308 int ndom; /* # of domains accepted in sequence */
2309 int didx; /* index for domains (1..ndom) */
2310 int k1, k2; /* start, stop coord in model */
2311 int i1, i2; /* start, stop in sequence */
2312 float whole_sc; /* whole sequence score = \sum domain scores */
2313 float *score; /* array of raw scores for each domain */
2314 int *usedomain; /* TRUE if this domain is accepted */
2319 /* Break the trace into one or more individual domains.
2321 TraceDecompose(tr, &tarr, &ntr);
2322 if (ntr == 0) Die("TraceDecompose() screwup"); /* "can't happen" (!) */
2324 /* Rescore each domain, apply null2 correction if asked.
2325 * Mark positive-scoring ones (we'll definitely report those),
2326 * and include their score in the whole sequence score.
2328 score = MallocOrDie(sizeof(float) * ntr);
2329 usedomain = MallocOrDie(sizeof(int) * ntr);
2332 for (tidx = 0; tidx < ntr; tidx++)
2334 score[tidx] = P7TraceScore(hmm, dsq, tarr[tidx]);
2335 if (do_null2) score[tidx] -= TraceScoreCorrection(hmm, tarr[tidx], dsq);
2336 if (score[tidx] > 0.0) {
2337 usedomain[tidx] = TRUE;
2339 whole_sc += score[tidx];
2341 usedomain[tidx] = FALSE;
2344 /* Make sure at least one positive scoring domain is in
2345 * the trace. If not, invoke "weak single domain" rules:
2346 * we will always report at least one domain per sequence, even
2347 * if it has a negative score. (HMMER's Plan7 architecture can report
2348 * one negative scoring domain but not more.)
2351 tidx = FMax(score, ntr);
2352 usedomain[tidx] = TRUE;
2353 whole_sc = score[tidx];
2357 /* Implement --do_forward: override the trace-dependent sum-of-domain
2358 * whole score, use the P7Forward() score that the called passed
2359 * us instead. This is a hack; null2 is trace-dependent and
2360 * thus undefined for P7Forward() scoring; see commentary in hmmpfam.c.
2362 if (do_forward) whole_sc = sc_override;
2364 /* Go through and put all the accepted domains into the hit list.
2366 whole_pval = PValue(hmm, whole_sc);
2367 for (tidx = 0, didx = 1; tidx < ntr; tidx++) {
2368 if (! usedomain[tidx]) continue;
2370 TraceSimpleBounds(tarr[tidx], &i1, &i2, &k1, &k2);
2371 pvalue = PValue(hmm, score[tidx]);
2373 if (pvalue <= thresh->domE && score[tidx] >= thresh->domT) {
2374 ali = CreateFancyAli(tarr[tidx], hmm, dsq, seqname);
2377 sortkey = -1.*(double)i1; /* hmmpfam: sort on position in seq */
2379 sortkey = score[tidx]; /* hmmsearch: sort on E (monotonic w/ sc) */
2381 RegisterHit(dhit, sortkey,
2382 pvalue, score[tidx],
2383 whole_pval, whole_sc,
2384 hmmpfam_mode ? hmm->name : seqname,
2385 hmmpfam_mode ? hmm->acc : seqacc,
2386 hmmpfam_mode ? hmm->desc : seqdesc,
2394 /* Now register the global hit, with the domain-derived score.
2398 * hmmpfam has to worry that score and E-value are not monotonic
2399 * when multiple HMMs (with different EVD parameters) are potential
2400 * targets. Therefore in hmmpfam_mode we apply a weird hack
2401 * to sort primarily on E-value, but on score
2402 * for really good hits with E=0.0... works because we can
2403 * assume 100000. > -log(DBL_MIN).
2404 * hmmsearch simply sorts on score (which for a single HMM, we
2405 * know is monotonic with E-value).
2408 sortkey = (whole_pval > 0.0) ? -1.*log(whole_pval) : 100000. + whole_sc;
2412 /* Note: we've recalculated whole_sc and it may have decreased
2413 * after the null2 correction was applied. For Pfam GA, TC,
2414 * or NC cutoffs, we have to be sure that everything on the
2415 * hitlist is correct (the hmmpfam output routine assumes it,
2416 * otherwise it would have to reload each HMM to get its
2417 * cutoffs). In all other cases, though, we don't care if
2418 * the hit list has a bit too many things on it, because the
2419 * output routine in hmmsearch or hmmpfam will check against
2420 * the cutoffs. Hence we only need to check against globT
2421 * (it may be set by GA, TC, or NC) but not globE.
2422 * - SRE, CSHL genome mtg May 2001
2424 if (whole_sc >= thresh->globT) {
2425 RegisterHit(ghit, sortkey,
2426 whole_pval, whole_sc,
2427 0., 0., /* no mother seq */
2428 hmmpfam_mode ? hmm->name : seqname,
2429 hmmpfam_mode ? hmm->acc : seqacc,
2430 hmmpfam_mode ? hmm->desc : seqdesc,
2431 0,0,0, /* seq positions */
2432 0,0,0, /* HMM positions */
2433 0, ndom, /* # domains info */
2434 NULL); /* alignment info */
2437 /* Clean up and return.
2439 for (tidx = 0; tidx < ntr; tidx++)
2440 P7FreeTrace(tarr[tidx]);