1 /************************************************************
2 * Copyright (C) 1998 Ian Holmes (ihh@sanger.ac.uk)
3 * HMMER - Biological sequence analysis with profile HMMs
4 * Copyright (C) 1992-1999 Washington University School of Medicine
7 * This source code is distributed under the terms of the
8 * GNU General Public License. See the files COPYING and LICENSE
10 ************************************************************/
13 * Author: Ian Holmes (ihh@sanger.ac.uk, Jun 5 1998)
14 * Derived from core_algorithms.c (SRE, Nov 11 1996)
15 * Incorporated SRE, Sat Nov 6 09:07:12 1999 [Cold Spring Harbor]
17 * RCS $Id: postprob.c,v 1.1.1.1 2005/03/22 08:34:15 cmzmasek Exp $
18 *****************************************************************
21 * Functions for working with posterior probabilities,
22 * including unfussed "backwards" and "optimal accuracy"
24 *****************************************************************
28 * struct p7trace_s *tr;
29 * struct dpmatrix_s *fwd;
30 * struct dpmatrix_s *bck;
31 * struct dpmatrix_s *posterior;
34 * (get a traceback from somewhere: P7Viterbi() or a modelmaker)
35 * (get an HMM from somewhere: read file or construct it)
36 * P7Forward (dsq, len, hmm, &fwd);
37 * P7Backward(dsq, len, hmm, &bck);
38 * posterior = bck; -- can alloc posterior, but also can re-use bck --
39 * P7EmitterPosterior(len, hmm, fwd, bck, posterior);
40 * postcode = PostalCode(len, posterior, tr);
42 * MSAAppendGR(msa, "POST", seqidx, postcode); -- or a similar annotation call --
45 * FreePlan7Matrix(fwd);
46 * FreePlan7Matrix(bck);
48 * P7OptimalAccuracy() - the Durbin/Holmes optimal accuracy
49 * alignment algorithm. Takes a sequence
50 * and an HMM, returns an alignment as
53 * P7Backward() - The Backward() algorithm, counterpart
54 * of P7Forward() in core_algorithms.c.
56 * P7EmitterPosterior()- The heart of postprob.c: given a Forward
57 * and a Backward matrix, calculate a new matrix
58 * that contains the posterior probabilities
59 * for each symbol i being emitted by
60 * state k (so, \sum_k p(k | x_i) = 1.0).
62 * P7FillOptimalAccuracy() - The core DP algorithm called by
63 * P7OptimalAccuracy().
65 * P7OptimalAccuracyTrace() - the traceback algorithm called by
66 * P7FillOptimalAccuracy().
68 * PostalCode() - Create a character string for annotating
71 * No small memory variants of these algorithms are available
81 /* Function: P7OptimalAccuracy()
83 * Purpose: The optimal accuracy dynamic programming algorithm.
84 * Identical to Viterbi() except that posterior residue
85 * label probabilities are used as scores.
87 * Args: dsq - sequence in digitized form
90 * ret_tr - RETURN: traceback; pass NULL if it's not wanted
92 * Return: log ( sum_{residues} P(label|M,D) ), as a bit score
93 * (i.e. log of expected accuracy)
96 P7OptimalAccuracy(char *dsq, int L, struct plan7_s *hmm, struct p7trace_s **ret_tr)
99 struct dpmatrix_s *forward;
100 struct dpmatrix_s *backward;
102 (void) P7Forward(dsq, L, hmm, &forward);
103 (void) P7Backward(dsq, L, hmm, &backward);
105 P7EmitterPosterior(L, hmm, forward, backward, backward); /* Re-use backward matrix for posterior scores */
107 sc = P7FillOptimalAccuracy(L, hmm->M, backward, forward, ret_tr); /* Re-use forward matrix for optimal accuracy scores */
109 FreePlan7Matrix(forward);
110 FreePlan7Matrix(backward);
117 /* Function: P7Backward()
119 * Purpose: The Backward dynamic programming algorithm.
120 * The scaling issue is dealt with by working in log space
121 * and calling ILogsum(); this is a slow but robust approach.
123 * Args: dsq - sequence in digitized form
126 * ret_mx - RETURN: dp matrix; pass NULL if it's not wanted
128 * Return: log P(S|M)/P(S|R), as a bit score.
131 P7Backward(char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s **ret_mx)
133 struct dpmatrix_s *mx;
141 /* Allocate a DP matrix with 0..L rows, 0..M-1 columns.
143 mx = AllocPlan7Matrix(L+1, hmm->M, &xmx, &mmx, &imx, &dmx);
145 /* Initialization of the L row.
146 * Note that xmx[i][stS] = xmx[i][stN] by definition for all i,
147 * so stS need not be calculated in backward DP matrices.
149 xmx[L][XMC] = hmm->xsc[XTC][MOVE]; /* C<-T */
150 xmx[L][XME] = xmx[L][XMC] + hmm->xsc[XTE][MOVE]; /* E<-C, no C-tail */
151 xmx[L][XMJ] = xmx[L][XMB] = xmx[L][XMN] = -INFTY; /* need seq to get out from here */
152 for (k = hmm->M; k >= 1; k--) {
153 mmx[L][k] = xmx[L][XME] + hmm->esc[k]; /* M<-E ... */
154 mmx[L][k] += hmm->msc[(int) dsq[L]][k]; /* ... + emitted match symbol */
155 imx[L][k] = dmx[L][k] = -INFTY; /* need seq to get out from here */
158 /* Recursion. Done as a pull.
159 * Note slightly wasteful boundary conditions:
160 * M_M precalculated, D_M set to -INFTY,
161 * D_1 wastefully calculated.
162 * Scores for transitions to D_M also have to be hacked to -INFTY,
163 * as Plan7Logoddsify does not do this for us (I think? - ihh).
165 hmm->tsc[hmm->M-1][TDD] = hmm->tsc[hmm->M-1][TMD] = -INFTY; /* no D_M state -- HACK -- should be in Plan7Logoddsify */
166 for (i = L-1; i >= 0; i--)
168 /* Do the special states first.
169 * remember, C, N and J emissions are zero score by definition
171 xmx[i][XMC] = xmx[i+1][XMC] + hmm->xsc[XTC][LOOP];
173 xmx[i][XMB] = -INFTY;
174 /* The following section has been hacked to fit a bug in core_algorithms.c
175 * The "correct" code is:
176 * for (k = hmm->M; k >= 1; k--)
177 * xmx[i][XMB] = ILogsum(xmx[i][XMB], mmx[i+1][k] + hmm->bsc[k];
179 * The following code gives the same results as core_algorithms.c:
181 xmx[i][XMB] = ILogsum(xmx[i][XMB], mmx[i+1][hmm->M] + hmm->bsc[hmm->M-1]);
182 for (k = hmm->M-1; k >= 1; k--)
183 xmx[i][XMB] = ILogsum(xmx[i][XMB], mmx[i+1][k] + hmm->bsc[k]);
185 xmx[i][XMJ] = ILogsum(xmx[i][XMB] + hmm->xsc[XTJ][MOVE],
186 xmx[i+1][XMJ] + hmm->xsc[XTJ][LOOP]);
188 xmx[i][XME] = ILogsum(xmx[i][XMC] + hmm->xsc[XTE][MOVE],
189 xmx[i][XMJ] + hmm->xsc[XTE][LOOP]);
191 xmx[i][XMN] = ILogsum(xmx[i][XMB] + hmm->xsc[XTN][MOVE],
192 xmx[i+1][XMN] + hmm->xsc[XTN][LOOP]);
194 /* Now the main states. Note the boundary conditions at M.
198 mmx[i][hmm->M] = xmx[i][XME] + hmm->esc[hmm->M] + hmm->msc[(int) dsq[i]][hmm->M];
199 dmx[i][hmm->M] = -INFTY;
200 for (k = hmm->M-1; k >= 1; k--)
202 mmx[i][k] = ILogsum(ILogsum(xmx[i][XME] + hmm->esc[k],
203 mmx[i+1][k+1] + hmm->tsc[k][TMM]),
204 ILogsum(imx[i+1][k] + hmm->tsc[k][TMI],
205 dmx[i][k+1] + hmm->tsc[k][TMD]));
206 mmx[i][k] += hmm->msc[(int) dsq[i]][k];
208 imx[i][k] = ILogsum(imx[i+1][k] + hmm->tsc[k][TII],
209 mmx[i+1][k+1] + hmm->tsc[k][TIM]);
210 imx[i][k] += hmm->isc[(int) dsq[i]][k];
212 dmx[i][k] = ILogsum(dmx[i][k+1] + hmm->tsc[k][TDD],
213 mmx[i+1][k+1] + hmm->tsc[k][TDM]);
222 if (ret_mx != NULL) *ret_mx = mx;
223 else FreePlan7Matrix(mx);
225 return Scorify(sc); /* the total Backward score. */
229 /* Function: P7EmitterPosterior()
231 * Purpose: Combines Forward and Backward matrices into a posterior
232 * probability matrix.
233 * The entries in row i of this matrix are the logs of the
234 * posterior probabilities of each state emitting symbol i of
235 * the sequence, i.e. all entries for non-emitting states are -INFTY.
236 * The caller must allocate space for the matrix, although the
237 * backward matrix can be used instead (overwriting it will not
238 * compromise the algorithm).
240 * Args: L - length of sequence
242 * forward - pre-calculated forward matrix
243 * backward - pre-calculated backward matrix
244 * mx - pre-allocated dynamic programming matrix
249 P7EmitterPosterior(int L,
251 struct dpmatrix_s *forward,
252 struct dpmatrix_s *backward,
253 struct dpmatrix_s *mx)
259 sc = backward->xmx[0][XMN];
261 for (i = L; i >= 1; i--)
263 mx->xmx[i][XMC] = forward->xmx[i-1][XMC] + hmm->xsc[XTC][LOOP] + backward->xmx[i][XMC] - sc;
265 mx->xmx[i][XMJ] = forward->xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP] + backward->xmx[i][XMJ] - sc;
267 mx->xmx[i][XMN] = forward->xmx[i-1][XMN] + hmm->xsc[XTN][LOOP] + backward->xmx[i][XMN] - sc;
269 mx->xmx[i][XMB] = mx->xmx[i][XME] = -INFTY;
271 for (k = 1; k < hmm->M; k++) {
272 mx->mmx[i][k] = backward->mmx[i][k];
273 mx->mmx[i][k] += ILogsum(ILogsum(forward->mmx[i-1][k-1] + hmm->tsc[k-1][TMM],
274 forward->imx[i-1][k-1] + hmm->tsc[k-1][TIM]),
275 ILogsum(forward->xmx[i-1][XMB] + hmm->bsc[k],
276 forward->dmx[i-1][k-1] + hmm->tsc[k-1][TDM]));
279 mx->imx[i][k] = backward->imx[i][k];
280 mx->imx[i][k] += ILogsum(forward->mmx[i-1][k] + hmm->tsc[k][TMI],
281 forward->imx[i-1][k] + hmm->tsc[k][TII]);
284 mx->dmx[i][k] = -INFTY;
286 mx->mmx[i][hmm->M] = backward->mmx[i][hmm->M];
287 mx->mmx[i][hmm->M] += ILogsum(ILogsum(forward->mmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TMM],
288 forward->imx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TIM]),
289 ILogsum(forward->xmx[i-1][XMB] + hmm->bsc[hmm->M],
290 forward->dmx[i-1][hmm->M-1] + hmm->tsc[hmm->M-1][TDM]));
291 mx->mmx[i][hmm->M] -= sc;
293 mx->imx[i][hmm->M] = mx->dmx[i][hmm->M] = mx->dmx[i][0] = -INFTY;
299 /* Function: P7FillOptimalAccuracy()
301 * Purpose: The core of the optimal accuracy dynamic programming algorithm.
302 * Identical to Viterbi() except that scores are given by a
303 * posterior matrix (that the caller must pre-calculate).
304 * Also, the caller must pre-allocate the optimal accuracy matrix
305 * (this allows the forward matrix to be re-used).
306 * P7OptimalAccuracy() does all this for you and cleans up.
309 * Args: L - length of sequence
310 * M - length of model
311 * posterior - pre-calculated emitter posterior matrix
312 * mx - pre-allocated dynamic programming matrix
313 * ret_tr - RETURN: traceback; pass NULL if it's not wanted
315 * Return: log ( sum_{residues} P(label|M,D) ), as a bit score
316 * (i.e. log of expected accuracy)
318 float P7FillOptimalAccuracy(int L,
320 struct dpmatrix_s *posterior,
321 struct dpmatrix_s *mx,
322 struct p7trace_s **ret_tr)
324 struct p7trace_s *tr;
337 /* Initialization of the zero row.
338 * Each cell in the optimal accuracy matrix holds the log of the expected
339 * of correctly assigned symbols up to that point.
340 * To begin with, everything is log(0) = -INFTY.
342 xmx[0][XMN] = xmx[0][XMB] = xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY;
343 for (k = 0; k <= M; k++)
344 mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY;
346 /* Recursion. Done as a pull.
347 * Note some slightly wasteful boundary conditions:
348 * D_M and I_M are wastefully calculated (they don't exist)
350 for (i = 1; i <= L; i++)
352 mmx[i][0] = imx[i][0] = dmx[i][0] = -INFTY;
354 for (k = 1; k <= M; k++)
358 if ((sc = mmx[i-1][k-1]) > mmx[i][k])
360 if ((sc = imx[i-1][k-1]) > mmx[i][k])
362 if ((sc = dmx[i-1][k-1]) > mmx[i][k])
364 if ((sc = xmx[i-1][XMB]) > mmx[i][k])
366 mmx[i][k] = ILogsum(mmx[i][k], posterior->mmx[i][k]);
370 if ((sc = mmx[i][k-1]) > dmx[i][k])
372 if ((sc = dmx[i][k-1]) > dmx[i][k])
377 if ((sc = mmx[i-1][k]) > imx[i][k])
379 if ((sc = imx[i-1][k]) > imx[i][k])
381 imx[i][k] = ILogsum(imx[i][k], posterior->imx[i][k]);
384 /* Now the special states. Order is important here.
385 * remember, C and J emissions are zero score by definition,
389 xmx[i][XMN] = -INFTY;
390 if ((sc = ILogsum(xmx[i-1][XMN], posterior->xmx[i][XMN])) > -INFTY)
394 xmx[i][XME] = -INFTY;
395 for (k = 1; k <= M; k++)
396 if ((sc = mmx[i][k]) > xmx[i][XME])
400 xmx[i][XMJ] = -INFTY;
401 if ((sc = ILogsum(xmx[i-1][XMJ], posterior->xmx[i][XMJ])) > -INFTY)
403 if ((sc = xmx[i][XME]) > xmx[i][XMJ]) /* no E->J emission */
407 xmx[i][XMB] = -INFTY;
408 if ((sc = xmx[i][XMN]) > -INFTY)
410 if ((sc = xmx[i][XMJ]) > xmx[i][XMB])
414 xmx[i][XMC] = -INFTY;
415 if ((sc = ILogsum(xmx[i-1][XMC], posterior->xmx[i][XMC])) > -INFTY)
417 if ((sc = xmx[i][XME]) > xmx[i][XMC]) /* no E->C emission */
421 /* T state (not stored) */
424 if (ret_tr != NULL) {
425 P7OptimalAccuracyTrace(L, M, posterior, mx, &tr);
429 return Score2Prob(sc,1); /* the log of the expected accuracy. */
433 /* Function: P7OptimalAccuracyTrace()
435 * Purpose: Traceback of an optimal accuracy matrix: i.e. retrieval
436 * of optimum alignment.
438 * Args: L - length of sequence
440 * posterior - the posterior matrix
441 * mx - the matrix to trace back in, (L+1) x M
442 * ret_tr - RETURN: traceback.
445 * ret_tr is allocated here. Free using P7FreeTrace().
448 P7OptimalAccuracyTrace(int L,
450 struct dpmatrix_s *posterior,
451 struct dpmatrix_s *mx,
452 struct p7trace_s **ret_tr)
454 struct p7trace_s *tr;
455 int curralloc; /* current allocated length of trace */
456 int tpos; /* position in trace */
457 int i; /* position in seq (1..L) */
458 int k; /* position in model (1..M) */
459 int **xmx, **mmx, **imx, **dmx;
460 int sc; /* temp var for pre-emission score */
462 /* Overallocate for the trace.
463 * S-N-B- ... - E-C-T : 6 states + L is minimum trace;
464 * add L more as buffer.
466 curralloc = L * 2 + 6;
467 P7AllocTrace(curralloc, &tr);
474 /* Initialization of trace
475 * We do it back to front; ReverseTrace() is called later.
477 tr->statetype[0] = STT;
480 tr->statetype[1] = STC;
484 i = L; /* current i (seq pos) we're trying to assign */
488 while (tr->statetype[tpos-1] != STS) {
489 switch (tr->statetype[tpos-1]) {
490 case STM: /* M connects from i-1,k-1, or B */
492 if (sc == ILogsum(mmx[i][k], posterior->mmx[i+1][k+1]) && i > 0 && k > 0)
494 tr->statetype[tpos] = STM;
495 tr->nodeidx[tpos] = k--;
498 else if (sc == ILogsum(imx[i][k], posterior->mmx[i+1][k+1]) && i > 0 && k > 0)
500 tr->statetype[tpos] = STI;
501 tr->nodeidx[tpos] = k;
504 else if (sc == ILogsum(dmx[i][k], posterior->mmx[i+1][k+1]) && i > 0 && k > 1)
506 tr->statetype[tpos] = STD;
507 tr->nodeidx[tpos] = k--;
510 else if (sc == ILogsum(xmx[i][XMB], posterior->mmx[i+1][k+1]))
512 tr->statetype[tpos] = STB;
513 tr->nodeidx[tpos] = 0;
516 else Die("traceback failed");
519 case STD: /* D connects from M,D */
520 if (dmx[i][k+1] == mmx[i][k] && i > 0 && k > 0)
522 tr->statetype[tpos] = STM;
523 tr->nodeidx[tpos] = k--;
526 else if (dmx[i][k+1] == dmx[i][k] && k > 1)
528 tr->statetype[tpos] = STD;
529 tr->nodeidx[tpos] = k--;
532 else Die("traceback failed");
535 case STI: /* I connects from M,I */
537 if (sc == ILogsum(mmx[i][k], posterior->imx[i+1][k]) && i > 0 && k > 0)
539 tr->statetype[tpos] = STM;
540 tr->nodeidx[tpos] = k--;
543 else if (sc == ILogsum(imx[i][k], posterior->imx[i+1][k]) && i > 0 && k > 0)
545 tr->statetype[tpos] = STI;
546 tr->nodeidx[tpos] = k;
549 else Die("traceback failed");
552 case STN: /* N connects from S, N */
553 if (i == 0 && xmx[i][XMN] == -INFTY)
555 tr->statetype[tpos] = STS;
556 tr->nodeidx[tpos] = 0;
559 else if (i > 0 && xmx[i+1][XMN] == ILogsum(xmx[i][XMN], posterior->xmx[i+1][XMN]) && i > 0)
561 tr->statetype[tpos] = STN;
562 tr->nodeidx[tpos] = 0;
563 tr->pos[tpos] = 0; /* note convention adherence: */
564 tr->pos[tpos-1] = i--; /* first N doesn't emit */
566 else Die("traceback failed");
569 case STB: /* B connects from N, J */
570 if (xmx[i][XMB] == xmx[i][XMN])
572 tr->statetype[tpos] = STN;
573 tr->nodeidx[tpos] = 0;
576 else if (xmx[i][XMB] == xmx[i][XMJ])
578 tr->statetype[tpos] = STJ;
579 tr->nodeidx[tpos] = 0;
582 else Die("traceback failed");
585 case STE: /* E connects from any M state. k set here */
586 for (k = M; k >= 1; k--)
587 if (xmx[i][XME] == mmx[i][k] && i > 0)
589 tr->statetype[tpos] = STM;
590 tr->nodeidx[tpos] = k--;
594 if (k <= 0) Die("traceback failed");
597 case STC: /* C comes from C, E */
598 if (xmx[i][XMC] == ILogsum(xmx[i-1][XMC], posterior->xmx[i][XMC]) && i > 0)
600 tr->statetype[tpos] = STC;
601 tr->nodeidx[tpos] = 0;
602 tr->pos[tpos] = 0; /* note convention adherence: */
603 tr->pos[tpos-1] = i--; /* first C doesn't emit */
605 else if (xmx[i][XMC] == xmx[i][XME])
607 tr->statetype[tpos] = STE;
608 tr->nodeidx[tpos] = 0;
609 tr->pos[tpos] = 0; /* E is a nonemitter */
611 else Die("Traceback failed.");
614 case STJ: /* J connects from E, J */
615 if (xmx[i][XMJ] == ILogsum(xmx[i-1][XMJ], posterior->xmx[i][XMJ]) && i > 0)
617 tr->statetype[tpos] = STJ;
618 tr->nodeidx[tpos] = 0;
619 tr->pos[tpos] = 0; /* note convention adherence: */
620 tr->pos[tpos-1] = i--; /* first J doesn't emit */
622 else if (xmx[i][XMJ] == xmx[i][XME])
624 tr->statetype[tpos] = STE;
625 tr->nodeidx[tpos] = 0;
626 tr->pos[tpos] = 0; /* E is a nonemitter */
628 else Die("Traceback failed.");
632 Die("traceback failed");
634 } /* end switch over statetype[tpos-1] */
637 if (tpos == curralloc)
638 { /* grow trace if necessary */
640 P7ReallocTrace(tr, curralloc);
643 } /* end traceback, at S state; tpos == tlen now */
651 /* Function: PostalCode()
652 * Date: SRE, Sun Nov 7 15:31:35 1999 [Cold Spring Harbor]
654 * Purpose: Given a traceback and one of Ian's posterior
655 * probability matrices, calculate a string that
656 * represents the confidence values on each
657 * residue in the sequence.
659 * The code string is 0..L-1 (L = len of target seq),
660 * so it's in the coordinate system of the sequence string;
661 * off by one from dsq; and convertible to the coordinate
662 * system of aseq using MakeAlignedString().
665 * for example, 9 means with >=90% posterior probabiility,
666 * residue i is aligned to the state k that it
667 * is assigned to in the given trace.
669 * Args: L - length of seq
670 * mx - posterior prob matrix: see P7EmitterPosterior()
671 * tr - a traceback to get a Postal code string for.
673 * Returns: char * array of codes, 0..L-1
674 * Caller is responsible for free'ing it.
677 score2postcode(int sc)
680 i = (char) (Score2Prob(sc, 1.) * 10.);
681 return ((i > 9) ? '*' : '0'+i);
684 PostalCode(int L, struct dpmatrix_s *mx, struct p7trace_s *tr)
691 postcode = MallocOrDie((L+1) * sizeof(char));
692 for (tpos = 0; tpos < tr->tlen; tpos++)
695 k = tr->nodeidx[tpos];
696 if (i == 0) continue;
698 switch (tr->statetype[tpos]) {
699 case STM: postcode[i-1] = score2postcode(mx->mmx[i][k]); break;
700 case STI: postcode[i-1] = score2postcode(mx->imx[i][k]); break;
701 case STN: postcode[i-1] = score2postcode(mx->xmx[i][XMN]); break;
702 case STC: postcode[i-1] = score2postcode(mx->xmx[i][XMC]); break;
703 case STJ: postcode[i-1] = score2postcode(mx->xmx[i][XMJ]); break;