1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
13 * SRE, Sat Nov 16 14:19:56 1996
15 * Support for Plan 7 HMM data structure, plan7_s.
28 /* Functions: AllocPlan7(), AllocPlan7Shell(), AllocPlan7Body(), FreePlan7()
30 * Purpose: Allocate or free a Plan7 HMM structure.
31 * Can either allocate all at one (AllocPlan7()) or
32 * in two steps (AllocPlan7Shell(), AllocPlan7Body()).
33 * The two step method is used in hmmio.c where we start
34 * parsing the header of an HMM file but don't
35 * see the size of the model 'til partway thru the header.
42 hmm = AllocPlan7Shell();
43 AllocPlan7Body(hmm, M);
51 hmm = (struct plan7_s *) MallocOrDie (sizeof(struct plan7_s));
70 hmm->ga1 = hmm->ga2 = 0.0;
71 hmm->tc1 = hmm->tc2 = 0.0;
72 hmm->nc1 = hmm->nc2 = 0.0;
85 /* DNA translation is not enabled by default */
90 /* statistical parameters set to innocuous empty values */
98 AllocPlan7Body(struct plan7_s *hmm, int M)
104 hmm->rf = MallocOrDie ((M+2) * sizeof(char));
105 hmm->cs = MallocOrDie ((M+2) * sizeof(char));
106 hmm->ca = MallocOrDie ((M+2) * sizeof(char));
107 hmm->map = MallocOrDie ((M+1) * sizeof(int));
109 hmm->t = MallocOrDie (M * sizeof(float *));
110 hmm->tsc = MallocOrDie (M * sizeof(int *));
111 hmm->mat = MallocOrDie ((M+1) * sizeof(float *));
112 hmm->ins = MallocOrDie (M * sizeof(float *));
113 hmm->msc = MallocOrDie (MAXCODE * sizeof(int *));
114 hmm->isc = MallocOrDie (MAXCODE * sizeof(int *));
115 hmm->t[0] = MallocOrDie ((7*M) * sizeof(float));
116 hmm->tsc[0] = MallocOrDie ((7*M) * sizeof(int));
117 hmm->mat[0] = MallocOrDie ((MAXABET*(M+1)) * sizeof(float));
118 hmm->ins[0] = MallocOrDie ((MAXABET*M) * sizeof(float));
119 hmm->msc[0] = MallocOrDie ((MAXCODE*(M+1)) * sizeof(int));
120 hmm->isc[0] = MallocOrDie ((MAXCODE*M) * sizeof(int));
122 /* note allocation strategy for important 2D arrays -- trying
123 * to keep locality as much as possible, cache efficiency etc.
125 for (k = 1; k <= M; k++) {
126 hmm->mat[k] = hmm->mat[0] + k * MAXABET;
128 hmm->ins[k] = hmm->ins[0] + k * MAXABET;
129 hmm->t[k] = hmm->t[0] + k * 7;
130 hmm->tsc[k] = hmm->tsc[0] + k * 7;
133 for (x = 1; x < MAXCODE; x++) {
134 hmm->msc[x] = hmm->msc[0] + x * (M+1);
135 hmm->isc[x] = hmm->isc[0] + x * M;
137 /* tsc[0] is used as a boundary condition sometimes [Viterbi()],
138 * so set to -inf always.
140 for (x = 0; x < 7; x++)
141 hmm->tsc[0][x] = -INFTY;
143 hmm->begin = MallocOrDie ((M+1) * sizeof(float));
144 hmm->bsc = MallocOrDie ((M+1) * sizeof(int));
145 hmm->end = MallocOrDie ((M+1) * sizeof(float));
146 hmm->esc = MallocOrDie ((M+1) * sizeof(int));
153 FreePlan7(struct plan7_s *hmm)
155 if (hmm->name != NULL) free(hmm->name);
156 if (hmm->desc != NULL) free(hmm->desc);
157 if (hmm->rf != NULL) free(hmm->rf);
158 if (hmm->cs != NULL) free(hmm->cs);
159 if (hmm->ca != NULL) free(hmm->ca);
160 if (hmm->comlog != NULL) free(hmm->comlog);
161 if (hmm->ctime != NULL) free(hmm->ctime);
162 if (hmm->map != NULL) free(hmm->map);
163 if (hmm->tpri != NULL) free(hmm->tpri);
164 if (hmm->mpri != NULL) free(hmm->mpri);
165 if (hmm->ipri != NULL) free(hmm->ipri);
166 if (hmm->bsc != NULL) free(hmm->bsc);
167 if (hmm->begin != NULL) free(hmm->begin);
168 if (hmm->esc != NULL) free(hmm->esc);
169 if (hmm->end != NULL) free(hmm->end);
170 if (hmm->msc != NULL) free(hmm->msc[0]);
171 if (hmm->mat != NULL) free(hmm->mat[0]);
172 if (hmm->isc != NULL) free(hmm->isc[0]);
173 if (hmm->ins != NULL) free(hmm->ins[0]);
174 if (hmm->tsc != NULL) free(hmm->tsc[0]);
175 if (hmm->t != NULL) free(hmm->t[0]);
176 if (hmm->msc != NULL) free(hmm->msc);
177 if (hmm->mat != NULL) free(hmm->mat);
178 if (hmm->isc != NULL) free(hmm->isc);
179 if (hmm->ins != NULL) free(hmm->ins);
180 if (hmm->tsc != NULL) free(hmm->tsc);
181 if (hmm->t != NULL) free(hmm->t);
182 if (hmm->dnam != NULL) free(hmm->dnam);
183 if (hmm->dnai != NULL) free(hmm->dnai);
187 /* Function: ZeroPlan7()
189 * Purpose: Zeros the counts/probabilities fields in a model.
190 * Leaves null model untouched.
193 ZeroPlan7(struct plan7_s *hmm)
196 for (k = 1; k < hmm->M; k++)
198 FSet(hmm->t[k], 7, 0.);
199 FSet(hmm->mat[k], Alphabet_size, 0.);
200 FSet(hmm->ins[k], Alphabet_size, 0.);
202 FSet(hmm->mat[hmm->M], Alphabet_size, 0.);
204 FSet(hmm->begin+1, hmm->M, 0.);
205 FSet(hmm->end+1, hmm->M, 0.);
206 for (k = 0; k < 4; k++)
207 FSet(hmm->xt[k], 2, 0.);
208 hmm->flags &= ~PLAN7_HASBITS; /* invalidates scores */
209 hmm->flags &= ~PLAN7_HASPROB; /* invalidates probabilities */
213 /* Function: Plan7SetName()
215 * Purpose: Change the name of a Plan7 HMM. Convenience function.
217 * Note: Trailing whitespace and \n's are chopped.
220 Plan7SetName(struct plan7_s *hmm, char *name)
222 if (hmm->name != NULL) free(hmm->name);
223 hmm->name = Strdup(name);
224 StringChop(hmm->name);
226 /* Function: Plan7SetAccession()
228 * Purpose: Change the accession number of a Plan7 HMM. Convenience function.
230 * Note: Trailing whitespace and \n's are chopped.
233 Plan7SetAccession(struct plan7_s *hmm, char *acc)
235 if (hmm->acc != NULL) free(hmm->acc);
236 hmm->acc = Strdup(acc);
237 StringChop(hmm->acc);
238 hmm->flags |= PLAN7_ACC;
241 /* Function: Plan7SetDescription()
243 * Purpose: Change the description line of a Plan7 HMM. Convenience function.
245 * Note: Trailing whitespace and \n's are chopped.
248 Plan7SetDescription(struct plan7_s *hmm, char *desc)
250 if (hmm->desc != NULL) free(hmm->desc);
251 hmm->desc = Strdup(desc);
252 StringChop(hmm->desc);
253 hmm->flags |= PLAN7_DESC;
256 /* Function: Plan7ComlogAppend()
257 * Date: SRE, Wed Oct 29 09:57:30 1997 [TWA 721 over Greenland]
259 * Purpose: Concatenate command line options and append to the
263 Plan7ComlogAppend(struct plan7_s *hmm, int argc, char **argv)
268 /* figure out length of command line, w/ spaces and \n */
270 for (i = 0; i < argc; i++)
271 len += strlen(argv[i]);
274 if (hmm->comlog != NULL)
276 len += strlen(hmm->comlog);
277 hmm->comlog = ReallocOrDie(hmm->comlog, sizeof(char)* (len+1));
281 hmm->comlog = MallocOrDie(sizeof(char)* (len+1));
282 *(hmm->comlog) = '\0'; /* need this to make strcat work */
286 strcat(hmm->comlog, "\n");
287 for (i = 0; i < argc; i++)
289 strcat(hmm->comlog, argv[i]);
290 if (i < argc-1) strcat(hmm->comlog, " ");
294 /* Function: Plan7SetCtime()
295 * Date: SRE, Wed Oct 29 11:53:19 1997 [TWA 721 over the Atlantic]
297 * Purpose: Set the ctime field in a new HMM to the current time.
300 Plan7SetCtime(struct plan7_s *hmm)
302 time_t date = time(NULL);
303 if (hmm->ctime != NULL) free(hmm->ctime);
304 hmm->ctime = Strdup(ctime(&date));
305 StringChop(hmm->ctime);
309 /* Function: Plan7SetNullModel()
311 * Purpose: Set the null model section of an HMM.
312 * Convenience function.
315 Plan7SetNullModel(struct plan7_s *hmm, float null[MAXABET], float p1)
318 for (x = 0; x < Alphabet_size; x++)
319 hmm->null[x] = null[x];
324 /* Function: P7Logoddsify()
326 * Purpose: Take an HMM with valid probabilities, and
327 * fill in the integer log-odds score section of the model.
329 * Notes on log-odds scores:
330 * type of parameter probability score
331 * ----------------- ----------- ------
332 * any emission p_x log_2 p_x/null_x
333 * N,J,C /assume/ p_x = null_x so /always/ score zero.
334 * transition to emitters t_x log_2 t_x/p1
336 * NN and CC loops are often equal to p1, so usu. score zero.
337 * C->T transition t_x log_2 t_x/p2
338 * often zero, usu. C->T = p2.
339 * all other transitions t_x log_2 t_x
340 * (no null model counterpart, so null prob is 1)
342 * Notes on entry/exit scores, B->M and M->E:
343 * The probability form model includes delete states 1 and M.
344 * these states are removed from a search form model to
345 * prevent B->D...D->E->J->B mute cycles, which would complicate
346 * dynamic programming algorithms. The data-independent
347 * S/W B->M and M->E transitions are folded together with
348 * data-dependent B->D...D->M and M->D...D->E paths.
350 * This process is referred to in the code as "wing folding"
351 * or "wing retraction"... the analogy is to a swept-wing
352 * fighter in landing vs. high speed flight configuration.
354 * Note on Viterbi vs. forward flag:
355 * Wing retraction must take forward vs. Viterbi
356 * into account. If forward, sum two paths; if Viterbi, take
357 * max. I tried to slide this by as a sum, without
358 * the flag, but Alex detected it as a bug, because you can
359 * then find cases where the Viterbi score doesn't match
360 * the P7TraceScore().
362 * Args: hmm - the hmm to calculate scores in.
363 * viterbi_mode - TRUE to fold wings in Viterbi configuration.
366 * hmm scores are filled in.
369 P7Logoddsify(struct plan7_s *hmm, int viterbi_mode)
371 int k; /* counter for model position */
372 int x; /* counter for symbols */
376 if (hmm->flags & PLAN7_HASBITS) return;
378 /* Symbol emission scores
380 for (k = 1; k <= hmm->M; k++)
382 /* match/insert emissions in main model */
383 for (x = 0; x < Alphabet_size; x++)
385 hmm->msc[x][k] = Prob2Score(hmm->mat[k][x], hmm->null[x]);
387 hmm->isc[x][k] = Prob2Score(hmm->ins[k][x], hmm->null[x]);
389 /* degenerate match/insert emissions */
390 for (x = Alphabet_size; x < Alphabet_iupac; x++)
392 hmm->msc[x][k] = DegenerateSymbolScore(hmm->mat[k], hmm->null, x);
394 hmm->isc[x][k] = DegenerateSymbolScore(hmm->ins[k], hmm->null, x);
398 /* State transitions.
400 * A note on "folding" of D_1 and D_M.
401 * These two delete states are folded out of search form models
402 * in order to prevent null cycles in the dynamic programming
403 * algorithms (see code below). However, we use their log transitions
404 * when we save the model! So the following log transition probs
405 * are used *only* in save files, *never* in search algorithms:
406 * log (tbd1), D1 -> M2, D1 -> D2
407 * Mm-1 -> Dm, Dm-1 -> Dm
409 * In a search algorithm, these have to be interpreted as -INFTY
410 * because their contributions are folded into bsc[] and esc[]
411 * entry/exit scores. They can't be set to -INFTY here because
412 * we need them in save files.
414 for (k = 1; k < hmm->M; k++)
416 hmm->tsc[k][TMM] = Prob2Score(hmm->t[k][TMM], hmm->p1);
417 hmm->tsc[k][TMI] = Prob2Score(hmm->t[k][TMI], hmm->p1);
418 hmm->tsc[k][TMD] = Prob2Score(hmm->t[k][TMD], 1.0);
419 hmm->tsc[k][TIM] = Prob2Score(hmm->t[k][TIM], hmm->p1);
420 hmm->tsc[k][TII] = Prob2Score(hmm->t[k][TII], hmm->p1);
421 hmm->tsc[k][TDM] = Prob2Score(hmm->t[k][TDM], hmm->p1);
422 hmm->tsc[k][TDD] = Prob2Score(hmm->t[k][TDD], 1.0);
425 /* B->M entry transitions. Note how D_1 is folded out.
427 * M2 is sum (or max) of B->M2 and B->D1->M2
428 * M_k is sum (or max) of B->M_k and B->D1...D_k-1->M_k
429 * These have to be done in log space, else you'll get
430 * underflow errors; and we also have to watch for log(0).
431 * A little sloppier than it probably has to be; historically,
432 * doing in this in log space was in response to a bug report.
434 accum = hmm->tbd1 > 0.0 ? log(hmm->tbd1) : -9999.;
435 for (k = 1; k <= hmm->M; k++)
437 tbm = hmm->begin[k] > 0. ? log(hmm->begin[k]) : -9999.; /* B->M_k part */
439 /* B->D1...D_k-1->M_k part we get from accum*/
440 if (k > 1 && accum > -9999.)
442 if (hmm->t[k-1][TDM] > 0.0)
444 if (viterbi_mode) tbm = MAX(tbm, accum + log(hmm->t[k-1][TDM]));
445 else tbm = LogSum(tbm, accum + log(hmm->t[k-1][TDM]));
448 accum = (hmm->t[k-1][TDD] > 0.0) ? accum + log(hmm->t[k-1][TDD]) : -9999.;
450 /* Convert from log_e to scaled integer log_2 odds. */
452 hmm->bsc[k] = (int) floor(0.5 + INTSCALE * 1.44269504 * (tbm - log(hmm->p1)));
454 hmm->bsc[k] = -INFTY;
457 /* M->E exit transitions. Note how D_M is folded out.
458 * M_M is 1 by definition
459 * M_M-1 is sum of M_M-1->E and M_M-1->D_M->E, where D_M->E is 1 by definition
460 * M_k is sum of M_k->E and M_k->D_k+1...D_M->E
461 * Must be done in log space to avoid underflow errors.
462 * A little sloppier than it probably has to be; historically,
463 * doing in this in log space was in response to a bug report.
465 hmm->esc[hmm->M] = 0;
467 for (k = hmm->M-1; k >= 1; k--)
469 tme = hmm->end[k] > 0. ? log(hmm->end[k]) : -9999.;
472 if (hmm->t[k][TMD] > 0.0)
474 if (viterbi_mode) tme = MAX(tme, accum + log(hmm->t[k][TMD]));
475 else tme = LogSum(tme, accum + log(hmm->t[k][TMD]));
477 accum = (hmm->t[k][TDD] > 0.0) ? accum + log(hmm->t[k][TDD]) : -9999.;
479 /* convert from log_e to scaled integer log odds. */
480 hmm->esc[k] = (tme > -9999.) ? (int) floor(0.5 + INTSCALE * 1.44269504 * tme) : -INFTY;
483 /* special transitions */
484 hmm->xsc[XTN][LOOP] = Prob2Score(hmm->xt[XTN][LOOP], hmm->p1);
485 hmm->xsc[XTN][MOVE] = Prob2Score(hmm->xt[XTN][MOVE], 1.0);
486 hmm->xsc[XTE][LOOP] = Prob2Score(hmm->xt[XTE][LOOP], 1.0);
487 hmm->xsc[XTE][MOVE] = Prob2Score(hmm->xt[XTE][MOVE], 1.0);
488 hmm->xsc[XTC][LOOP] = Prob2Score(hmm->xt[XTC][LOOP], hmm->p1);
489 hmm->xsc[XTC][MOVE] = Prob2Score(hmm->xt[XTC][MOVE], 1.-hmm->p1);
490 hmm->xsc[XTJ][LOOP] = Prob2Score(hmm->xt[XTJ][LOOP], hmm->p1);
491 hmm->xsc[XTJ][MOVE] = Prob2Score(hmm->xt[XTJ][MOVE], 1.0);
493 hmm->flags |= PLAN7_HASBITS; /* raise the log-odds ready flag */
498 /* Function: Plan7Renormalize()
500 * Purpose: Take an HMM in counts form, and renormalize
501 * all of its probability vectors. Also enforces
502 * Plan7 restrictions on nonexistent transitions.
504 * Args: hmm - the model to renormalize.
510 Plan7Renormalize(struct plan7_s *hmm)
512 int k; /* counter for model position */
513 int st; /* counter for special states */
514 float d; /* denominator */
516 /* match emissions */
517 for (k = 1; k <= hmm->M; k++)
518 FNorm(hmm->mat[k], Alphabet_size);
519 /* insert emissions */
520 for (k = 1; k < hmm->M; k++)
521 FNorm(hmm->ins[k], Alphabet_size);
522 /* begin transitions */
523 d = FSum(hmm->begin+1, hmm->M) + hmm->tbd1;
524 FScale(hmm->begin+1, hmm->M, 1./d);
526 /* main model transitions */
527 for (k = 1; k < hmm->M; k++)
529 d = FSum(hmm->t[k], 3) + hmm->end[k];
530 FScale(hmm->t[k], 3, 1./d);
533 FNorm(hmm->t[k]+3, 2); /* insert */
534 FNorm(hmm->t[k]+5, 2); /* delete */
536 /* null model emissions */
537 FNorm(hmm->null, Alphabet_size);
538 /* special transitions */
539 for (st = 0; st < 4; st++)
540 FNorm(hmm->xt[st], 2);
541 /* enforce nonexistent transitions */
542 /* (is this necessary?) */
543 hmm->t[0][TDM] = hmm->t[0][TDD] = 0.0;
545 hmm->flags &= ~PLAN7_HASBITS; /* clear the log-odds ready flag */
546 hmm->flags |= PLAN7_HASPROB; /* set the probabilities OK flag */
550 /* Function: Plan7RenormalizeExits()
551 * Date: SRE, Fri Aug 14 11:22:19 1998 [St. Louis]
553 * Purpose: Renormalize just the match state transitions;
554 * for instance, after a Config() function has
555 * modified the exit distribution.
557 * Args: hmm - hmm to renormalize
562 Plan7RenormalizeExits(struct plan7_s *hmm)
567 for (k = 1; k < hmm->M; k++)
569 d = FSum(hmm->t[k], 3);
570 FScale(hmm->t[k], 3, 1./(d + d*hmm->end[k]));
575 /*****************************************************************
576 * Plan7 configuration functions
577 * The following few functions are the Plan7 equivalent of choosing
578 * different alignment styles (fully local, fully global, global/local,
581 * There is (at least) one constraint worth noting.
582 * If you want per-domain scores to sum up to per-sequence scores,
583 * then one of the following two sets of conditions must be met:
586 * e.g. no multidomain hits
588 * 2) t(N->N) = t(C->C) = t(J->J) = hmm->p1
589 * e.g. unmatching sequence scores zero, and
590 * N->B first-model score is equal to J->B another-model score.
592 * These constraints are obeyed in the default Config() functions below,
593 * but in the future (when HMM editing may be allowed) we'll have
594 * to remember this. Non-equality of the summed domain scores and
595 * the total sequence score is a really easy "red flag" for people to
596 * notice and report as a bug, even if it may make probabilistic
597 * sense not to meet either constraint for certain modeling problems.
598 *****************************************************************
601 /* Function: Plan7NakedConfig()
603 * Purpose: Set the alignment-independent, algorithm-dependent parameters
604 * of a Plan7 model so that no special states (N,C,J) emit anything:
605 * one simple, full global pass through the model.
607 * Args: hmm - the plan7 model
610 * The HMM is modified; algorithm dependent parameters are set.
611 * Previous scores are invalidated if they existed.
614 Plan7NakedConfig(struct plan7_s *hmm)
616 hmm->xt[XTN][MOVE] = 1.; /* disallow N-terminal tail */
617 hmm->xt[XTN][LOOP] = 0.;
618 hmm->xt[XTE][MOVE] = 1.; /* only 1 domain/sequence ("global" alignment) */
619 hmm->xt[XTE][LOOP] = 0.;
620 hmm->xt[XTC][MOVE] = 1.; /* disallow C-terminal tail */
621 hmm->xt[XTC][LOOP] = 0.;
622 hmm->xt[XTJ][MOVE] = 0.; /* J state unused */
623 hmm->xt[XTJ][LOOP] = 1.;
624 FSet(hmm->begin+2, hmm->M-1, 0.); /* disallow internal entries. */
625 hmm->begin[1] = 1. - hmm->tbd1;
626 FSet(hmm->end+1, hmm->M-1, 0.); /* disallow internal exits. */
627 hmm->end[hmm->M] = 1.;
628 Plan7RenormalizeExits(hmm);
629 hmm->flags &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */
632 /* Function: Plan7GlobalConfig()
634 * Purpose: Set the alignment-independent, algorithm-dependent parameters
635 * of a Plan7 model to global (Needleman/Wunsch) configuration.
637 * Like a non-looping hmmls, since we actually allow flanking
638 * N and C terminal sequence.
640 * Args: hmm - the plan7 model
643 * The HMM is modified; algorithm dependent parameters are set.
644 * Previous scores are invalidated if they existed.
647 Plan7GlobalConfig(struct plan7_s *hmm)
649 hmm->xt[XTN][MOVE] = 1. - hmm->p1; /* allow N-terminal tail */
650 hmm->xt[XTN][LOOP] = hmm->p1;
651 hmm->xt[XTE][MOVE] = 1.; /* only 1 domain/sequence ("global" alignment) */
652 hmm->xt[XTE][LOOP] = 0.;
653 hmm->xt[XTC][MOVE] = 1. - hmm->p1; /* allow C-terminal tail */
654 hmm->xt[XTC][LOOP] = hmm->p1;
655 hmm->xt[XTJ][MOVE] = 0.; /* J state unused */
656 hmm->xt[XTJ][LOOP] = 1.;
657 FSet(hmm->begin+2, hmm->M-1, 0.); /* disallow internal entries. */
658 hmm->begin[1] = 1. - hmm->tbd1;
659 FSet(hmm->end+1, hmm->M-1, 0.); /* disallow internal exits. */
660 hmm->end[hmm->M] = 1.;
661 Plan7RenormalizeExits(hmm);
662 hmm->flags &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */
665 /* Function: Plan7LSConfig()
667 * Purpose: Set the alignment independent parameters of a Plan7 model
668 * to hmmls (global in HMM, local in sequence) configuration.
670 * Args: hmm - the plan7 model
673 * the HMM probabilities are modified.
676 Plan7LSConfig(struct plan7_s *hmm)
678 hmm->xt[XTN][MOVE] = 1.-hmm->p1; /* allow N-terminal tail */
679 hmm->xt[XTN][LOOP] = hmm->p1;
680 hmm->xt[XTE][MOVE] = 0.5; /* expectation 2 domains/seq */
681 hmm->xt[XTE][LOOP] = 0.5;
682 hmm->xt[XTC][MOVE] = 1.-hmm->p1; /* allow C-terminal tail */
683 hmm->xt[XTC][LOOP] = hmm->p1;
684 hmm->xt[XTJ][MOVE] = 1.-hmm->p1; /* allow J junction state */
685 hmm->xt[XTJ][LOOP] = hmm->p1;
686 FSet(hmm->begin+2, hmm->M-1, 0.); /* start at M1/D1 */
687 hmm->begin[1] = 1. - hmm->tbd1;
688 FSet(hmm->end+1, hmm->M-1, 0.); /* end at M_m/D_m */
689 hmm->end[hmm->M] = 1.;
690 Plan7RenormalizeExits(hmm);
691 hmm->flags &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */
695 /* Function: Plan7SWConfig()
697 * Purpose: Set the alignment independent parameters of
698 * a Plan7 model to hmmsw (Smith/Waterman) configuration.
700 * Notes: entry/exit is asymmetric because of the left/right
701 * nature of the HMM/profile. Entry probability is distributed
702 * simply by assigning p_x = pentry / (M-1) to M-1
703 * internal match states. However, the same approach doesn't
704 * lead to a flat distribution over exit points. Exit p's
705 * must be corrected for the probability of a previous exit
706 * from the model. Requiring a flat distribution over exit
707 * points leads to an easily solved piece of algebra, giving:
708 * p_1 = pexit / (M-1)
709 * p_x = p_1 / (1 - (x-1) p_1)
711 * Args: hmm - the Plan7 model w/ data-dep prob's valid
712 * pentry - probability of an internal entry somewhere;
713 * will be evenly distributed over M-1 match states
714 * pexit - probability of an internal exit somewhere;
715 * will be distributed over M-1 match states.
718 * HMM probabilities are modified.
721 Plan7SWConfig(struct plan7_s *hmm, float pentry, float pexit)
723 float basep; /* p1 for exits: the base p */
724 int k; /* counter over states */
726 /* Configure special states.
728 hmm->xt[XTN][MOVE] = 1-hmm->p1; /* allow N-terminal tail */
729 hmm->xt[XTN][LOOP] = hmm->p1;
730 hmm->xt[XTE][MOVE] = 1.; /* disallow jump state */
731 hmm->xt[XTE][LOOP] = 0.;
732 hmm->xt[XTC][MOVE] = 1-hmm->p1; /* allow C-terminal tail */
733 hmm->xt[XTC][LOOP] = hmm->p1;
734 hmm->xt[XTJ][MOVE] = 1.; /* J is unused */
735 hmm->xt[XTJ][LOOP] = 0.;
739 hmm->begin[1] = (1. - pentry) * (1. - hmm->tbd1);
740 FSet(hmm->begin+2, hmm->M-1, (pentry * (1.- hmm->tbd1)) / (float)(hmm->M-1));
744 hmm->end[hmm->M] = 1.0;
745 basep = pexit / (float) (hmm->M-1);
746 for (k = 1; k < hmm->M; k++)
747 hmm->end[k] = basep / (1. - basep * (float) (k-1));
748 Plan7RenormalizeExits(hmm);
749 hmm->flags &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */
752 /* Function: Plan7FSConfig()
753 * Date: SRE, Fri Jan 2 15:34:40 1998 [StL]
755 * Purpose: Set the alignment independent parameters of
756 * a Plan7 model to hmmfs (multihit Smith/Waterman) configuration.
758 * See comments on Plan7SWConfig() for explanation of
759 * how pentry and pexit are used.
761 * Args: hmm - the Plan7 model w/ data-dep prob's valid
762 * pentry - probability of an internal entry somewhere;
763 * will be evenly distributed over M-1 match states
764 * pexit - probability of an internal exit somewhere;
765 * will be distributed over M-1 match states.
768 * HMM probabilities are modified.
771 Plan7FSConfig(struct plan7_s *hmm, float pentry, float pexit)
773 float basep; /* p1 for exits: the base p */
774 int k; /* counter over states */
776 /* Configure special states.
778 hmm->xt[XTN][MOVE] = 1-hmm->p1; /* allow N-terminal tail */
779 hmm->xt[XTN][LOOP] = hmm->p1;
780 hmm->xt[XTE][MOVE] = 0.5; /* allow loops / multihits */
781 hmm->xt[XTE][LOOP] = 0.5;
782 hmm->xt[XTC][MOVE] = 1-hmm->p1; /* allow C-terminal tail */
783 hmm->xt[XTC][LOOP] = hmm->p1;
784 hmm->xt[XTJ][MOVE] = 1.-hmm->p1; /* allow J junction between domains */
785 hmm->xt[XTJ][LOOP] = hmm->p1;
789 hmm->begin[1] = (1. - pentry) * (1. - hmm->tbd1);
790 FSet(hmm->begin+2, hmm->M-1, (pentry * (1.-hmm->tbd1)) / (float)(hmm->M-1));
794 hmm->end[hmm->M] = 1.0;
795 basep = pexit / (float) (hmm->M-1);
796 for (k = 1; k < hmm->M; k++)
797 hmm->end[k] = basep / (1. - basep * (float) (k-1));
798 Plan7RenormalizeExits(hmm);
799 hmm->flags &= ~PLAN7_HASBITS; /* reconfig invalidates log-odds scores */
805 /* Function: Plan7ESTConfig()
807 * Purpose: Configure a Plan7 model for EST Smith/Waterman
810 * OUTDATED; DO NOT USE WITHOUT RECHECKING
812 * Args: hmm - hmm to configure.
813 * aacode - 0..63 vector mapping genetic code to amino acids
814 * estmodel - 20x64 translation matrix, w/ codon bias and substitution error
815 * dna2 - probability of a -1 frameshift in a triplet
816 * dna4 - probability of a +1 frameshift in a triplet
819 Plan7ESTConfig(struct plan7_s *hmm, int *aacode, float **estmodel,
820 float dna2, float dna4)
825 float *tripnull; /* UNFINISHED!!! */
827 /* configure specials */
828 hmm->xt[XTN][MOVE] = 1./351.;
829 hmm->xt[XTN][LOOP] = 350./351.;
830 hmm->xt[XTE][MOVE] = 1.;
831 hmm->xt[XTE][LOOP] = 0.;
832 hmm->xt[XTC][MOVE] = 1./351.;
833 hmm->xt[XTC][LOOP] = 350./351.;
834 hmm->xt[XTJ][MOVE] = 1.;
835 hmm->xt[XTJ][LOOP] = 0.;
836 /* configure entry/exit */
838 FSet(hmm->begin+2, hmm->M-1, 0.5 / ((float)hmm->M - 1.));
839 hmm->end[hmm->M] = 1.;
840 FSet(hmm->end, hmm->M-1, 0.5 / ((float)hmm->M - 1.));
842 /* configure dna triplet/frameshift emissions */
843 for (k = 1; k <= hmm->M; k++)
845 /* translate aa to triplet probabilities */
846 for (x = 0; x < 64; x++) {
847 p = hmm->mat[k][aacode[x]] * estmodel[aacode[x]][x] * (1.-dna2-dna4);
848 hmm->dnam[x][k] = Prob2Score(p, tripnull[x]);
850 p = hmm->ins[k][aacode[x]] * estmodel[aacode[x]][x] * (1.-dna2-dna4);
851 hmm->dnai[x][k] = Prob2Score(p, tripnull[x]);
853 hmm->dnam[64][k] = 0; /* ambiguous codons score 0 (danger?) */
854 hmm->dna2 = Prob2Score(dna2, 1.);
855 hmm->dna4 = Prob2Score(dna4, 1.);
859 /* Function: PrintPlan7Stats()
861 * Purpose: Given a newly constructed HMM and the tracebacks
862 * of the sequences it was trained on, print out all
863 * the interesting information at the end of hmmb
864 * and hmmt runs that convinces the user we actually
867 * Args: fp - where to send the output (stdout, usually)
868 * hmm - the new HMM, probability form
869 * dsq - digitized training seqs
870 * nseq - number of dsq's
871 * tr - array of tracebacks for dsq
876 PrintPlan7Stats(FILE *fp, struct plan7_s *hmm, char **dsq, int nseq,
877 struct p7trace_s **tr)
879 int idx; /* counter for sequences */
880 float score; /* an individual trace score */
881 float total, best, worst; /* for the avg. and range of the scores */
882 float sqsum, stddev; /* for the std. deviation of the scores */
884 P7Logoddsify(hmm, TRUE); /* make sure model scores are ready */
886 /* find individual trace scores */
887 score = P7TraceScore(hmm, dsq[0], tr[0]);
888 total = best = worst = score;
889 sqsum = score * score;
890 for (idx = 1; idx < nseq; idx++) {
891 /* P7PrintTrace(stdout, tr[idx], hmm, dsq[idx]); */
892 score = P7TraceScore(hmm, dsq[idx], tr[idx]);
894 sqsum += score * score;
895 if (score > best) best = score;
896 if (score < worst) worst = score;
899 stddev = (sqsum - (total * total / (float) nseq)) / ((float) nseq - 1.);
900 stddev = (stddev > 0) ? sqrt(stddev) : 0.0;
902 /* print out stuff. */
903 fprintf(fp, "Average score: %10.2f bits\n", total / (float) nseq);
904 fprintf(fp, "Minimum score: %10.2f bits\n", worst);
905 fprintf(fp, "Maximum score: %10.2f bits\n", best);
906 fprintf(fp, "Std. deviation: %10.2f bits\n", stddev);
909 /* Function: DegenerateSymbolScore()
911 * Purpose: Given a sequence character x and an hmm emission probability
912 * vector, calculate the log-odds (base 2) score of
915 * Easy if x is in the emission alphabet, but not so easy
916 * is x is a degenerate symbol. The "correct" Bayesian
917 * philosophy is to calculate score(X) by summing over
918 * p(x) for all x in the degenerate symbol X to get P(X),
919 * doing the same sum over the prior to get F(X), and
920 * doing log_2 (P(X)/F(X)). This gives an X a zero score,
923 * Though this is correct in a formal Bayesian sense --
924 * we have no information on the sequence, so we can't
925 * say if it's random or model, so it scores zero --
926 * it sucks, big time, for scoring biological sequences.
927 * Sequences with lots of X's score near zero, while
928 * real sequences have average scores that are negative --
929 * so the X-laden sequences appear to be lifted out
930 * of the noise of a full histogram of a database search.
931 * Correct or not, this is highly undesirable.
933 * So therefore we calculated the expected score of
934 * the degenerate symbol by summing over all x in X:
935 * e_x log_2 (p(x)/f(x))
936 * where the expectation of x, e_x, is calculated from
939 * Empirically, this works; it also has a wooly hand-waving
940 * probabilistic justification that I'm happy enough about.
942 * Args: p - probabilities of normal symbols
943 * null - null emission model
944 * ambig - index of the degenerate character in Alphabet[]
946 * Return: the integer log odds score of x given the emission
947 * vector and the null model, scaled up by INTSCALE.
950 DegenerateSymbolScore(float *p, float *null, int ambig)
956 for (x = 0; x < Alphabet_size; x++) {
957 if (Degenerate[ambig][x]) {
958 numer += null[x] * sreLOG2(p[x] / null[x]);
962 return (int) (INTSCALE * numer / denom);
965 /*****************************************************************
967 * Plan9/Plan7 interface
969 * Very important code during the evolutionary takeover by Plan7 --
970 * convert between Krogh/Haussler and Plan7 models.
971 *****************************************************************/
973 /* Function: Plan9toPlan7()
975 * Purpose: Convert an old HMM into Plan7. Configures it in
978 * Args: hmm - old ugly plan9 style HMM
979 * ret_plan7 - new wonderful Plan7 HMM
982 * Plan7 HMM is allocated here. Free w/ FreePlan7().
985 Plan9toPlan7(struct plan9_s *hmm, struct plan7_s **ret_plan7)
987 struct plan7_s *plan7;
990 plan7 = AllocPlan7(hmm->M);
992 for (k = 1; k < hmm->M; k++)
994 plan7->t[k][TMM] = hmm->mat[k].t[MATCH];
995 plan7->t[k][TMD] = hmm->mat[k].t[DELETE];
996 plan7->t[k][TMI] = hmm->mat[k].t[INSERT];
997 plan7->t[k][TDM] = hmm->del[k].t[MATCH];
998 plan7->t[k][TDD] = hmm->del[k].t[DELETE];
999 plan7->t[k][TIM] = hmm->ins[k].t[MATCH];
1000 plan7->t[k][TII] = hmm->ins[k].t[INSERT];
1003 for (k = 1; k <= hmm->M; k++)
1004 for (x = 0; x < Alphabet_size; x++)
1005 plan7->mat[k][x] = hmm->mat[k].p[x];
1007 for (k = 1; k < hmm->M; k++)
1008 for (x = 0; x < Alphabet_size; x++)
1009 plan7->ins[k][x] = hmm->ins[k].p[x];
1011 plan7->tbd1 = hmm->mat[0].t[DELETE] / (hmm->mat[0].t[DELETE] + hmm->mat[0].t[MATCH]);
1013 /* We have to make up the null transition p1; use default */
1014 P7DefaultNullModel(plan7->null, &(plan7->p1));
1015 for (x = 0; x < Alphabet_size; x++)
1016 plan7->null[x] = hmm->null[x];
1018 if (hmm->name != NULL)
1019 Plan7SetName(plan7, hmm->name);
1020 if (hmm->flags & HMM_REF) {
1021 strcpy(plan7->rf, hmm->ref);
1022 plan7->flags |= PLAN7_RF;
1024 if (hmm->flags & HMM_CS) {
1025 strcpy(plan7->cs, hmm->cs);
1026 plan7->flags |= PLAN7_CS;
1029 Plan7LSConfig(plan7); /* configure specials for ls-style alignment */
1030 Plan7Renormalize(plan7); /* mainly to correct for missing ID and DI */
1031 plan7->flags |= PLAN7_HASPROB; /* probabilities are valid */
1032 plan7->flags &= ~PLAN7_HASBITS; /* scores are not valid */