1 /************************************************************
2 * HMMER - Biological sequence analysis with profile-HMMs
3 * Copyright (C) 1992-1997 Sean R. Eddy
5 * This source code is distributed under the terms of the
6 * GNU General Public License. See the files COPYING and
7 * GNULICENSE for details.
9 ************************************************************/
13 * alloc, free, and initialization of state structures
31 AllocHMM(int M) /* length of model to make */
33 struct hmm_struc *hmm; /* RETURN: blank HMM */
35 hmm = (struct hmm_struc *) MallocOrDie (sizeof(struct hmm_struc));
36 hmm->ins = (struct basic_state *) MallocOrDie (sizeof(struct basic_state) * (M+2));
37 hmm->del = (struct basic_state *) MallocOrDie (sizeof(struct basic_state) * (M+2));
38 hmm->mat = (struct basic_state *) MallocOrDie (sizeof(struct basic_state) * (M+2));
39 hmm->ref = (char *) MallocOrDie ((M+2) * sizeof(char));
40 hmm->cs = (char *) MallocOrDie ((M+2) * sizeof(char));
41 hmm->xray = (float *) MallocOrDie ((M+2) * sizeof(float) * NINPUTS);
43 hmm->name = Strdup("unnamed"); /* name is not optional. */
50 /* Function: ZeroHMM()
52 * Purpose: Zero emission and transition counts in an HMM.
55 ZeroHMM(struct hmm_struc *hmm)
59 for (k = 0; k <= hmm->M+1; k++)
61 for (ts = 0; ts < 3; ts++)
63 hmm->mat[k].t[ts] = 0.0;
64 hmm->ins[k].t[ts] = 0.0;
65 hmm->del[k].t[ts] = 0.0;
67 for (idx = 0; idx < Alphabet_size; idx++)
69 hmm->mat[k].p[idx] = 0.0;
70 hmm->ins[k].p[idx] = 0.0;
71 hmm->del[k].p[idx] = 0.0;
77 /* Function: LogifyHMM()
79 * Purpose: Convert a probability-form HMM to log probabilities.
80 * Best to do this on a modifiable copy of an HMM.
83 LogifyHMM(struct hmm_struc *hmm)
87 for (k = 0; k <= hmm->M+1; k++)
89 for (ts = 0; ts < 3; ts++)
91 hmm->mat[k].t[ts] = sreLOG2(hmm->mat[k].t[ts]);
92 hmm->ins[k].t[ts] = sreLOG2(hmm->ins[k].t[ts]);
93 hmm->del[k].t[ts] = sreLOG2(hmm->del[k].t[ts]);
95 for (idx = 0; idx < Alphabet_size; idx++)
97 hmm->mat[k].p[idx] = sreLOG2(hmm->mat[k].p[idx]);
98 hmm->ins[k].p[idx] = sreLOG2(hmm->ins[k].p[idx]);
103 /* Function: LogoddsifyHMM()
105 * Convert a probability form HMM to log odds scores.
106 * Best to do this on a modifiable copy of an HMM.
109 LogoddsifyHMM(struct hmm_struc *hmm)
113 for (k = 0; k <= hmm->M+1; k++)
115 for (ts = 0; ts < 3; ts++)
117 hmm->mat[k].t[ts] = sreLOG2(hmm->mat[k].t[ts]);
118 hmm->ins[k].t[ts] = sreLOG2(hmm->ins[k].t[ts]);
119 hmm->del[k].t[ts] = sreLOG2(hmm->del[k].t[ts]);
121 for (x = 0; x < Alphabet_size; x++)
123 hmm->mat[k].p[x] = sreLOG2(hmm->mat[k].p[x]) - sreLOG2(hmm->null[x]);
124 hmm->ins[k].p[x] = sreLOG2(hmm->ins[k].p[x]) - sreLOG2(hmm->null[x]);
130 /* Function: WriteFlatPriorHMM()
132 * Purpose: Fill an HMM with expected probabilities according
133 * to a given prior. Used to construct "flat" initial
137 WriteFlatPriorHMM(struct hmm_struc *hmm, struct prior_s *prior)
139 int k; /* counter across model */
140 int q; /* counter over mixtures */
141 int x; /* counter over symbols or transitions */
142 float malpha; /* alpha for mixture */
143 float ialpha; /* alpha for insert mixture */
144 float dalpha; /* alpha for delete mixture */
146 for (k = 0; k <= hmm->M; k++)
148 /* xray info for structure prior */
149 if (prior->strategy == PRI_STRUCT)
151 hmm->xray[k*NINPUTS + XRAY_bias] = 1.0;
152 hmm->xray[k*NINPUTS + XRAY_E] = 0.0;
153 hmm->xray[k*NINPUTS + XRAY_H] = 0.0;
154 hmm->xray[k*NINPUTS + XRAY_SA] = 0.0;
156 /* match symbol emissions */
157 for (x = 0; x < Alphabet_size; x++)
158 hmm->mat[k].p[x] = 0.0;
160 for (q = 0; q < prior->mnum; q++)
162 if (prior->strategy == PRI_STRUCT)
163 prior->mq[q] = 1.0 / prior->mnum;
165 for (x = 0; x < Alphabet_size; x++)
166 malpha += prior->mat[q][x];
167 for (x = 0; x < Alphabet_size; x++)
168 hmm->mat[k].p[x] += prior->mq[q] * prior->mat[q][x] / malpha;
170 /* insert emissions */
171 for (x = 0; x < Alphabet_size; x++)
172 hmm->ins[k].p[x] = 0.0;
173 for (q = 0; q < prior->inum; q++)
175 if (prior->strategy == PRI_STRUCT)
176 prior->iq[q] = 1.0 / prior->inum;
178 for (x = 0; x < Alphabet_size; x++)
179 ialpha += prior->ins[q][x];
180 for (x = 0; x < Alphabet_size; x++)
181 hmm->ins[k].p[x] += prior->iq[q] * prior->ins[q][x] / ialpha;
184 /* state transitions */
185 for (x = 0; x < 3; x++)
186 hmm->mat[k].t[x] = hmm->ins[k].t[x] = hmm->del[k].t[x] = 0.0;
187 for (q = 0; q < prior->tnum; q++)
189 if (prior->strategy == PRI_STRUCT)
190 prior->tq[q] = 1.0 / prior->tnum;
191 malpha = ialpha = dalpha = 0.0;
192 for (x = 0; x < 3; x++)
194 malpha += prior->tm[q][x];
195 ialpha += prior->ti[q][x];
196 dalpha += prior->td[q][x];
198 for (x = 0; x < 3; x++)
200 hmm->mat[k].t[x] += prior->tq[q] * prior->tm[q][x] / malpha;
201 hmm->ins[k].t[x] += prior->tq[q] * prior->ti[q][x] / ialpha;
202 if (k > 0) hmm->del[k].t[x] += prior->tq[q] * prior->td[q][x] / dalpha;
206 /* the final state never transits to d+1 */
207 hmm->mat[hmm->M].t[DELETE] = 0.0;
208 hmm->ins[hmm->M].t[DELETE] = 0.0;
209 hmm->del[hmm->M].t[DELETE] = 0.0;
215 /* Function: HMMDup()
217 * Purpose: Create a duplicate copy of an HMM.
219 * Return: Pointer to the duplicate.
220 * Caller is responsible for free'ing the duplicate.
223 HMMDup(struct hmm_struc *hmm)
225 struct hmm_struc *newhmm;
227 if ((newhmm = AllocHMM(hmm->M)) == NULL)
228 Die("AllocHMM() failed");
229 HMMCopy(newhmm, hmm);
234 /* Function: HMMCopy()
236 * Purpose: Make a copy of hmm2 in hmm1.
239 * Caller promises that hmm1 and hmm2 have identical architectures.
242 HMMCopy(struct hmm_struc *hmm1, struct hmm_struc *hmm2)
246 hmm1->flags = hmm2->flags;
247 if (hmm1->name != NULL) free(hmm1->name);
248 hmm1->name = Strdup(hmm2->name);
250 if (hmm2->flags & HMM_REF) strcpy(hmm1->ref, hmm2->ref);
251 if (hmm2->flags & HMM_CS) strcpy(hmm1->cs, hmm2->cs);
252 if (hmm2->flags & HMM_XRAY)
253 memcpy(hmm1->xray, hmm2->xray, NINPUTS * (hmm2->M+2) * sizeof(float));
254 memcpy(hmm1->null, hmm2->null, sizeof(float) * Alphabet_size);
256 for (k = 0; k <= hmm2->M+1; k++)
258 /* copy transition T's */
259 for (ts = 0; ts < 3; ts++)
261 hmm1->mat[k].t[ts] = hmm2->mat[k].t[ts];
262 hmm1->ins[k].t[ts] = hmm2->ins[k].t[ts];
263 hmm1->del[k].t[ts] = hmm2->del[k].t[ts];
265 /* copy symbol P tables */
266 for (x = 0; x < Alphabet_size; x++)
268 hmm1->mat[k].p[x] = hmm2->mat[k].p[x];
269 hmm1->ins[k].p[x] = hmm2->ins[k].p[x];
277 FreeHMM(struct hmm_struc *hmm)
279 if (hmm == NULL) return 0;
284 if (hmm->mat != NULL) free (hmm->mat);
285 if (hmm->ins != NULL) free (hmm->ins);
286 if (hmm->del != NULL) free (hmm->del);
293 AllocSearchHMM(int M)
298 if ((shmm = (struct shmm_s *) malloc (sizeof(struct shmm_s))) == NULL)
299 Die("malloc failed");
300 for (x = 0; x < 26; x++)
301 if ((shmm->m_emit[x] = (int *) calloc (M+1, sizeof(int))) == NULL ||
302 (shmm->i_emit[x] = (int *) calloc (M+1, sizeof(int))) == NULL)
303 Die("malloc failed");
304 if ((shmm->t = (int *) malloc (sizeof(int) * (9*(M+1)))) == NULL ||
305 (shmm->ref = (char *) malloc (sizeof(char) * (M+2))) == NULL ||
306 (shmm->cs = (char *) malloc (sizeof(char) * (M+2))) == NULL)
307 Die("malloc failed");
309 shmm->name = Strdup("nameless");
315 FreeSearchHMM(struct shmm_s *shmm)
319 for (x = 0; x < 26; x++)
321 free(shmm->m_emit[x]);
322 free(shmm->i_emit[x]);
332 /* Function: CountSymbol()
334 * Purpose: Given an observed symbol, and a number of counts to
335 * distribute (typically just 1.0), bump the appropriate counter(s).
337 * This is completely trivial only so long as the symbols
338 * always come from the expected alphabet; since we also
339 * have to deal with degenerate symbols for both nucleic
340 * acid and protein languages, we make a function to deal
343 * Args: sym - observed symbol, e.g. `A' or `X'
344 * wt - number of counts to distribute (e.g. 1.0)
345 * counters - array of 4 or 20 counters to increment
347 * Return: Returns 1 on success and bumps the necessary counters.
348 * Returns 0 on failure and bumps each counter evenly, as
349 * if it saw a completely ambiguous symbol; this lets
350 * the caller silently accept garbage symbols, if it cares to.
353 CountSymbol(char sym, float wt, float *counters)
355 char *sptr; /* pointer into symbol in hmm->alphabet */
356 int status; /* RETURN: status; did we recognize the symbol? */
357 char symidx; /* index of symbol in Alphabet_iupac */
359 if ((sptr = strchr(Alphabet,sym)) != NULL)
361 symidx = (char) (sptr - Alphabet);
366 symidx = (char) (Alphabet_iupac - 1);
367 Warn("unrecognized character %c in CountSymbol()\n", sym);
370 P7CountSymbol(counters, symidx, wt);
375 /* Function: HMMDistance()
377 * Purpose: Test two models for how different they are, using
378 * a simple squared difference measure on all homologous
379 * parameters. They must have the same architecture:
380 * i.e. check that newhmm->M == oldhmm->M before calling.
382 * Args: newhmm - new HMM, probability form
383 * oldhmm - old HMM, probability form
388 HMMDistance(struct hmm_struc *newhmm, struct hmm_struc *oldhmm)
391 float distance = 0.0;
393 for (k = 0; k <= newhmm->M; k++)
395 /* state transition distances */
398 for (ts = 0; ts < 3; ts++)
399 distance += SQR( 100. * (newhmm->del[k].t[ts] - oldhmm->del[k].t[ts]));
401 for (ts = 0; ts < 3; ts++)
402 distance += SQR( 100. * (newhmm->mat[k].t[ts] - oldhmm->mat[k].t[ts]));
403 for (ts = 0; ts < 3; ts++)
404 distance += SQR( 100. * (newhmm->ins[k].t[ts] - oldhmm->ins[k].t[ts]));
406 /* symbol emission distances */
408 for (x = 0; x < Alphabet_size; x++)
409 distance += SQR( 100. * (newhmm->mat[k].p[x] - oldhmm->mat[k].p[x]));
410 for (x = 0; x < Alphabet_size; x++)
411 distance += SQR( 100. * (newhmm->ins[k].p[x] - oldhmm->ins[k].p[x]));
413 distance = sqrt(distance) / newhmm->M;
420 /* Function: Renormalize()
422 * Normalize all P distributions so they sum to 1.
423 * P distributions that are all 0, or contain negative
424 * probabilities, are left untouched.
426 * Returns 1 on success, or 0 on failure.
429 Renormalize(struct hmm_struc *hmm)
431 int k; /* counter for states */
433 for (k = 0; k <= hmm->M ; k++)
435 /* match state transition frequencies */
436 FNorm(hmm->mat[k].t, 3);
437 FNorm(hmm->ins[k].t, 3);
438 if (k > 0) FNorm(hmm->del[k].t, 3);
440 if (k > 0) FNorm(hmm->mat[k].p, Alphabet_size);
441 FNorm(hmm->ins[k].p, Alphabet_size);