1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
12 * SRE, Sun Mar 8 12:26:58 1998
13 * RCS $Id: emit.c,v 1.1.1.1 2005/03/22 08:34:04 cmzmasek Exp $
15 * Generation of sequences/traces from an HMM.
29 /* Function: EmitSequence()
30 * Date: SRE, Sun Mar 8 12:28:03 1998 [St. Louis]
32 * Purpose: Given a model, sample a sequence and/or traceback.
34 * Args: hmm - the model
35 * ret_dsq - RETURN: generated digitized sequence (pass NULL if unwanted)
36 * ret_L - RETURN: length of generated sequence
37 * ret_tr - RETURN: generated trace (pass NULL if unwanted)
42 EmitSequence(struct plan7_s *hmm, char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr)
45 char type; /* current state type */
46 int k; /* current node index */
47 char *dsq; /* generated sequence, digitized */
48 int L; /* length of sequence */
49 int alloc_tlen; /* allocated space for traceback */
50 int alloc_L; /* allocated space for sequence */
51 int tpos; /* position in traceback */
52 int sym; /* a generated symbol index */
53 float t[4]; /* little array for choosing M transition from */
55 /* Initialize; allocations
57 P7AllocTrace(64, &tr);
59 dsq = MallocOrDie(sizeof(char) * 64);
62 TraceSet(tr, 0, STS, 0, 0);
63 TraceSet(tr, 1, STN, 0, 0);
64 dsq[0] = (char) Alphabet_iupac;
72 /* Deal with state transition
76 hmm->begin[0] = hmm->tbd1; /* begin[0] hack (documented in structs.h) */
77 k = FChoose(hmm->begin, hmm->M+1);
78 if (k == 0) { type = STD; k = 1; } else {type = STM; }
81 case STI: type = (FChoose(hmm->t[k]+TIM, 2) == 0) ? STM : STI; if (type == STM) k++; break;
82 case STN: type = (FChoose(hmm->xt[XTN], 2) == LOOP) ? STN : STB; k = 0; break;
83 case STE: type = (FChoose(hmm->xt[XTE], 2) == LOOP) ? STJ : STC; k = 0; break;
84 case STC: type = (FChoose(hmm->xt[XTC], 2) == LOOP) ? STC : STT; k = 0; break;
85 case STJ: type = (FChoose(hmm->xt[XTJ], 2) == LOOP) ? STJ : STB; k = 0; break;
89 type = (FChoose(hmm->t[k]+TDM, 2) == 0) ? STM : STD;
99 FCopy(t, hmm->t[k], 3);
101 switch (FChoose(t,4)) {
102 case 0: k++; type = STM; break;
103 case 1: type = STI; break;
104 case 2: k++; type = STD; break;
105 case 3: k=0; type = STE; break;
106 default: Die("never happens");
117 Die("can't happen.");
120 /* Choose a symbol emission, if necessary
123 if (type == STM) sym = FChoose(hmm->mat[k], Alphabet_size);
124 else if (type == STI) sym = FChoose(hmm->ins[k], Alphabet_size);
125 else if ((type == STN && tr->statetype[tpos-1] == STN) ||
126 (type == STC && tr->statetype[tpos-1] == STC) ||
127 (type == STJ && tr->statetype[tpos-1] == STJ))
128 sym = FChoose(hmm->null, Alphabet_size);
130 /* Add to the traceback; deal with realloc if necessary
132 TraceSet(tr, tpos, type, k, (sym != -1) ? L : 0);
134 if (tpos == alloc_tlen) {
136 P7ReallocTrace(tr, alloc_tlen);
139 /* Add to the digitized seq; deal with realloc, if necessary
144 if (L+1 == alloc_L) { /* L+1 leaves room for sentinel byte + \0 */
146 dsq = ReallocOrDie(dsq, sizeof(char) * alloc_L);
151 /* Finish off the trace
155 /* Finish off the dsq with sentinel byte and null terminator.
156 * Emitted Sequence length is L-1.
158 dsq[L] = (char) Alphabet_iupac;
164 if (ret_dsq != NULL) *ret_dsq = dsq; else free(dsq);
165 if (ret_L != NULL) *ret_L = L;
166 if (ret_tr != NULL) *ret_tr = tr; else P7FreeTrace(tr);
171 /* Function: EmitBestSequence()
172 * Date: SRE, Tue Nov 10 16:21:59 1998 [St. Louis]
174 * Purpose: Given a model, emit the maximum probability sequence
175 * from it: argmax_{seq} P(seq | model).
176 * This is a sensible HMM equivalent to a "consensus"
178 * The model should be Plan7NakedConfig()'ed;
179 * in particular, if we allowed B->M and M->E,
180 * the highest probability sequence would be
181 * artifactually short. (We could do the highest
182 * scoring sequence instead, to get around this problem,
183 * but the highest scoring sequence is prone to
184 * other artifacts -- any looping state N,C,J, or I
185 * with a positively scoring residue leads to
186 * an infinitely long "best scoring" sequence.)
188 * Args: hmm - the model
189 * ret_seq - RETURN: best sequence
190 * ret_L - RETURN: length of sequence
191 * ret_tr - RETURN: traceback of the model/seq alignment; or NULL.
196 EmitBestSequence(struct plan7_s *hmm, char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr)
198 char *seq; /* RETURN: best seq */
199 struct p7trace_s *tr; /* RETURN: traceback */
200 float *mmx, *imx, *dmx; /* log P forward scores for M,D,I */
201 char *mtb, *itb, *dtb; /* traceback ptrs for M,D,I */
202 int x; /* counter for symbols */
203 int k; /* counter for nodes */
204 float sc; /* tmp var for a log P */
206 int rpos; /* position in a sequence */
207 int tpos; /* position in a trace */
208 int tlen; /* length of the traceback */
210 /* Initial allocations. We only need a 1D matrix and its shadow;
211 * it's overkill to use the Plan7Matrix structures, so don't.
213 mmx = MallocOrDie(sizeof(float) * (hmm->M+1));
214 imx = MallocOrDie(sizeof(float) * (hmm->M));
215 dmx = MallocOrDie(sizeof(float) * (hmm->M));
216 mtb = MallocOrDie(sizeof(char) * (hmm->M+1));
217 itb = MallocOrDie(sizeof(char) * (hmm->M));
218 dtb = MallocOrDie(sizeof(char) * (hmm->M));
221 * We can safely assume a max probability path of S->N->B->(M1 or D1),
222 * so just init M1 and D1.
224 mmx[1] = log(hmm->xt[XTN][MOVE]) + log(1.F - hmm->tbd1);
228 /* Main recursion, done as a push.
229 * The model is used in probability form; no wing folding needed.
231 for (k = 1; k < hmm->M; k++)
233 /* Transits out of match state (init with these)
235 mmx[k+1] = mmx[k] + log(hmm->t[k][TMM]); mtb[k+1] = STM;
236 dmx[k+1] = mmx[k] + log(hmm->t[k][TMD]); dtb[k+1] = STM;
238 imx[k] = mmx[k] + log(hmm->t[k][TMI]); itb[k] = STM;
240 /* Transits out of delete state
242 if ((sc = dmx[k] + log(hmm->t[k][TDM])) > mmx[k+1])
243 { mmx[k+1] = sc; mtb[k+1] = STD; }
244 if ((sc = dmx[k] + log(hmm->t[k][TDD])) > dmx[k+1])
245 { dmx[k+1] = sc; dtb[k+1] = STD; }
247 /* Transits out of insert state (self-loops are never good)
249 if ((sc = imx[k] + log(hmm->t[k][TIM])) > mmx[k+1])
250 { mmx[k+1] = sc; mtb[k+1] = STI; }
254 x = FMax(hmm->mat[k+1], Alphabet_size);
255 mmx[k+1] += log(hmm->mat[k+1][x]);
258 x = FMax(hmm->ins[k+1], Alphabet_size);
259 imx[k+1] += log(hmm->ins[k+1][x]);
263 #endif /* SRE_REMOVED */
266 /* Function: EmitConsensusSequence()
267 * Date: SRE, Wed Nov 11 11:08:59 1998 [St. Louis]
269 * Purpose: Generate a "consensus sequence". For the purposes
270 * of a profile HMM, this is defined as:
272 * - if StateOccupancy() says that M is used
273 * with probability >= 0.5, this M is "consensus".
274 * Then, choose maximally likely residue.
275 * if P>0.5 (protein) or P>0.9 (DNA), make
276 * it upper case; else make it lower case.
277 * - if StateOccupancy() says that I
278 * is used with P >= 0.5, this I is "consensus";
279 * use it 1/(1-TII) times (its expectation value).
280 * Generate an "x" from each I.
282 * The function expects that the model is config'ed
283 * by Plan7NakedConfig(): that is, for a single global pass
284 * with no N,C,J involvement.
287 * Args: hmm - the model
288 * ret_seq - RETURN: consensus sequence (pass NULL if unwanted)
289 * ret_dsq - RETURN: digitized consensus sequence (pass NULL if unwanted)
290 * ret_L - RETURN: length of generated sequence
291 * ret_tr - RETURN: generated trace (pass NULL if unwanted)
296 EmitConsensusSequence(struct plan7_s *hmm, char **ret_seq, char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr)
298 struct p7trace_s *tr; /* RETURN: traceback */
299 char *dsq, *seq; /* sequence in digitized and undigitized form */
300 float *mp, *ip, *dp; /* state occupancies from StateOccupancy() */
301 int nmat, ndel, nins; /* number of matches, deletes, inserts used */
302 int k; /* counter for nodes */
303 int tpos; /* position in trace */
304 int i; /* position in seq (equiv pos in dsq is i+1 */
305 int x; /* symbol choice (M) or # symbols (I) */
306 float mthresh; /* >= this, show symbol as upper case */
308 if (Alphabet_type == hmmAMINO) mthresh = 0.5;
311 StateOccupancy(hmm, &mp, &ip, &dp);
313 /* First pass: how many states do we need in the trace?
314 * how long will the sequence be?
316 nmat = ndel = nins = 0;
317 for (k = 1; k <= hmm->M; k++)
319 if (mp[k] >= 0.5) nmat++; else ndel++;
320 if (k < hmm->M && ip[k] >= 0.5)
321 nins += (int) (1.f / (1.f - hmm->t[k][TII]));
326 P7AllocTrace(6 + nmat + ndel + nins, &tr);
327 dsq = MallocOrDie(sizeof(char) * (nmat+nins+3));
328 seq = MallocOrDie(sizeof(char) * (nmat+nins+1));
331 * Construct consensus trace, seq, and dsq.
333 TraceSet(tr, 0, STS, 0, 0);
334 TraceSet(tr, 1, STN, 0, 0);
335 TraceSet(tr, 2, STB, 0, 0);
336 dsq[0] = Alphabet_iupac; /* guard byte */
339 for (k = 1; k <= hmm->M; k++)
343 x = FMax(hmm->mat[k], Alphabet_size);
344 TraceSet(tr, tpos, STM, k, i+1);
345 seq[i] = Alphabet[x];
347 if (hmm->mat[k][x] < mthresh)
348 seq[i] = tolower((int) seq[i]);
354 TraceSet(tr, tpos, STD, k, 0);
358 if (k < hmm->M && ip[k] >= 0.5)
360 x = (int) (1.f / (1.f - hmm->t[k][TII]));
363 TraceSet(tr, tpos, STI, k, i+1);
365 dsq[i+1] = Alphabet_iupac - 1;
371 TraceSet(tr, tpos, STE, 0, 0); tpos++;
372 TraceSet(tr, tpos, STC, 0, 0); tpos++;
373 TraceSet(tr, tpos, STT, 0, 0); tpos++;
374 dsq[i+1] = Alphabet_iupac;
379 if (ret_seq != NULL) *ret_seq = seq; else free(seq);
380 if (ret_dsq != NULL) *ret_dsq = dsq; else free(dsq);
381 if (ret_L != NULL) *ret_L = i;
382 if (ret_tr != NULL) *ret_tr = tr; else P7FreeTrace(tr);
387 /* Function: StateOccupancy()
388 * Date: SRE, Wed Nov 11 09:46:15 1998 [St. Louis]
390 * Purpose: Calculate the expected state occupancy for
391 * a given HMM in generated traces.
393 * Note that expected prob of getting into
394 * any special state in a trace is trivial:
396 * J = E->J transition prob
398 * Args: hmm - the model
399 * ret_mp - RETURN: [1..M] prob's of occupying M
400 * ret_ip - RETURN: [1..M-1] prob's of occupying I
401 * ret_dp - RETURN: [1..M] prob's of occupying D
404 * mp, ip, dp are malloc'ed here. Caller must free().
407 StateOccupancy(struct plan7_s *hmm, float **ret_mp, float **ret_ip, float **ret_dp)
409 float *fmp, *fip, *fdp; /* forward probabilities */
410 int k; /* counter for nodes */
412 /* Initial allocations
414 fmp = MallocOrDie (sizeof(float) * (hmm->M+1));
415 fip = MallocOrDie (sizeof(float) * (hmm->M));
416 fdp = MallocOrDie (sizeof(float) * (hmm->M+1));
421 fmp[1] = hmm->begin[1];
422 fip[1] = fmp[1] * hmm->t[1][TMI];
423 for (k = 2; k <= hmm->M; k++)
425 /* M: from M,D,I at k-1, or B; count t_II as 1.0 */
426 fmp[k] = fmp[k-1] * hmm->t[k-1][TMM] +
428 fdp[k-1] * hmm->t[k-1][TDM] +
430 /* D: from M,D at k-1 */
431 fdp[k] = fmp[k-1] * hmm->t[k-1][TMD] +
432 fdp[k-1] * hmm->t[k-1][TDD];
433 /* I: from M at k; don't count II */
435 fip[k] = fmp[k] * hmm->t[k][TMI];
438 SQD_DASSERT2((fabs(1.0f - fmp[k] - fdp[k]) < 1e-6f));
439 fmp[k] /= fmp[k]+fdp[k]; /* prevent propagating fp errors */
440 fdp[k] /= fmp[k]+fdp[k];
442 /* We don't need a backward pass; all backwards P's are 1.0
443 * by definition (you can always get out of a state with P=1).
444 * The only situation where this might not be true is for
445 * a TII of 1.0, when TIM = 0 -- but in that case, if there's
446 * a finite chance of getting into that insert state, the model
447 * generates infinitely long sequences, so we can consider this
448 * situation "perverse" and disallow it elsewhere in building