1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
12 * SRE, Fri Nov 15 10:00:04 1996
14 * Construction of models from multiple alignments. Three versions:
15 * Handmodelmaker() -- use #=RF annotation to indicate match columns
16 * Fastmodelmaker() -- Krogh/Haussler heuristic
17 * Maxmodelmaker() -- MAP model construction algorithm (Eddy,
20 * The meat of the model construction code is in matassign2hmm().
21 * The three model construction strategies simply label which columns
22 * are supposed to be match states, and then hand this info to
25 * Two wrinkles to watch for:
26 * 1) The alignment is assumed to contain sequence fragments. Look in
27 * fake_tracebacks() for how internal entry/exit points are handled.
28 * 2) Plan7 disallows DI and ID transitions, but an alignment may
29 * imply these. Look in trace_doctor() for how DI and ID transitions
46 /* flags used for matassign[] arrays --
47 * assignment of aligned columns to match/insert states
49 #define ASSIGN_MATCH (1<<0)
50 #define FIRST_MATCH (1<<1)
51 #define LAST_MATCH (1<<2)
52 #define ASSIGN_INSERT (1<<3)
53 #define EXTERNAL_INSERT_N (1<<4)
54 #define EXTERNAL_INSERT_C (1<<5)
56 static int build_cij(char **aseqs, int nseq, int *insopt, int i, int j,
57 float *wgt, float *cij);
58 static int estimate_model_length(MSA *msa);
59 static void matassign2hmm(MSA *msa, char **dsq,
60 int *matassign, struct plan7_s **ret_hmm,
61 struct p7trace_s ***ret_tr);
62 static void fake_tracebacks(char **aseq, int nseq, int alen, int *matassign,
63 struct p7trace_s ***ret_tr);
64 static void trace_doctor(struct p7trace_s *tr, int M, int *ret_ndi,
66 static void annotate_model(struct plan7_s *hmm, int *matassign, MSA *msa);
67 static void print_matassign(int *matassign, int alen);
71 /* Function: P7Handmodelmaker()
73 * Purpose: Manual model construction:
74 * Construct an HMM from an alignment, where the #=RF line
75 * of a HMMER alignment file is given to indicate
76 * the columns assigned to matches vs. inserts.
78 * NOTE: Handmodelmaker() will slightly revise the alignment
79 * if necessary, if the assignment of columns implies
80 * DI and ID transitions.
82 * Returns both the HMM in counts form (ready for applying
83 * Dirichlet priors as the next step), and fake tracebacks
84 * for each aligned sequence.
86 * Args: msa - multiple sequence alignment
87 * dsq - digitized unaligned aseq's
88 * ret_hmm - RETURN: counts-form HMM
89 * ret_tr - RETURN: array of tracebacks for aseq's
92 * ret_hmm and ret_tr alloc'ed here; FreeTrace(tr[i]), free(tr),
96 P7Handmodelmaker(MSA *msa, char **dsq,
97 struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr)
99 int *matassign; /* MAT state assignments if 1; 1..alen */
100 int apos; /* counter for aligned columns */
102 /* Make sure we have all the info about the alignment that we need */
104 Die("Alignment must have RF annotation to hand-build an HMM");
107 matassign = (int *) MallocOrDie (sizeof(int) * (msa->alen+1));
109 /* Determine match assignment from optional annotation
112 for (apos = 0; apos < msa->alen; apos++)
114 matassign[apos+1] = 0;
115 if (!isgap(msa->rf[apos]))
116 matassign[apos+1] |= ASSIGN_MATCH;
118 matassign[apos+1] |= ASSIGN_INSERT;
121 /* Hand matassign off for remainder of model construction
123 /* print_matassign(matassign, msa->alen); */
124 matassign2hmm(msa, dsq, matassign, ret_hmm, ret_tr);
131 /* Function: P7Fastmodelmaker()
133 * Purpose: Heuristic model construction:
134 * Construct an HMM from an alignment using the original
135 * Krogh/Haussler heuristic; any column with more
136 * symbols in it than a given fraction is assigned to
139 * NOTE: Fastmodelmaker() will slightly revise the
140 * alignment if the assignment of columns implies
141 * DI and ID transitions.
143 * Returns the HMM in counts form (ready for applying Dirichlet
144 * priors as the next step). Also returns fake traceback
145 * for each training sequence.
147 * Args: msa - multiple sequence alignment
148 * dsq - digitized unaligned aseq's
149 * maxgap - if more gaps than this, column becomes insert.
150 * ret_hmm - RETURN: counts-form HMM
151 * ret_tr - RETURN: array of tracebacks for aseq's
154 * ret_hmm and ret_tr alloc'ed here; FreeTrace(tr[i]), free(tr),
158 P7Fastmodelmaker(MSA *msa, char **dsq, float maxgap,
159 struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr)
161 int *matassign; /* MAT state assignments if 1; 1..alen */
162 int idx; /* counter over sequences */
163 int apos; /* counter for aligned columns */
164 int ngap; /* number of gaps in a column */
166 /* Allocations: matassign is 1..alen array of bit flags
168 matassign = (int *) MallocOrDie (sizeof(int) * (msa->alen+1));
170 /* Determine match assignment by counting symbols in columns
173 for (apos = 0; apos < msa->alen; apos++) {
174 matassign[apos+1] = 0;
177 for (idx = 0; idx < msa->nseq; idx++)
178 if (isgap(msa->aseq[idx][apos]))
181 if ((float) ngap / (float) msa->nseq > maxgap)
182 matassign[apos+1] |= ASSIGN_INSERT;
184 matassign[apos+1] |= ASSIGN_MATCH;
187 /* Once we have matassign calculated, all modelmakers behave
188 * the same; matassign2hmm() does this stuff (traceback construction,
189 * trace counting) and sets up ret_hmm and ret_tr.
191 matassign2hmm(msa, dsq, matassign, ret_hmm, ret_tr);
198 /* Function: P7Maxmodelmaker()
200 * Purpose: The Unholy Beast of HMM model construction algorithms --
201 * maximum a posteriori construction. A tour de force and
202 * probably overkill. MAP construction for Krogh
203 * HMM-profiles is fairly straightforward, but MAP construction of
204 * Plan 7 HMM-profiles is, er, intricate.
206 * Given a multiple alignment, construct an optimal (MAP) model
207 * architecture. Return a counts-based HMM.
209 * Args: msa - multiple sequence alignment
210 * dsq - digitized, unaligned seqs
211 * maxgap - above this, trailing columns are assigned to C
212 * prior - priors on parameters to use for model construction
213 * null - random sequence model emissions
214 * null_p1 - random sequence model p1 transition
215 * mpri - prior on architecture: probability of new match node
216 * ret_hmm - RETURN: new hmm (counts form)
217 * ret_tr - RETURN: array of tracebacks for aseq's
220 * ret_hmm and ret_tr (if !NULL) must be free'd by the caller.
223 P7Maxmodelmaker(MSA *msa, char **dsq, float maxgap,
224 struct p7prior_s *prior,
225 float *null, float null_p1, float mpri,
226 struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr)
228 int idx; /* counter for seqs */
229 int i, j; /* positions in alignment */
230 int x; /* counter for syms or transitions */
231 float **matc; /* count vectors: [1..alen][0..19] */
232 float cij[8], tij[8]; /* count and score transit vectors */
233 float matp[MAXABET]; /* match emission vector */
234 float insp[MAXABET]; /* insert score vector */
235 float insc[MAXABET]; /* insert count vector */
236 float *sc; /* DP scores [0,1..alen,alen+1] */
237 int *tbck; /* traceback ptrs for sc */
238 int *matassign; /* match assignments [1..alen] */
239 int *insopt; /* number of inserted chars [0..nseq-1] */
240 int first, last; /* positions of first and last cols [1..alen] */
241 float bm1, bm2; /* estimates for start,internal b->m t's */
242 int est_M; /* estimate for the size of the model */
243 float t_me; /* estimate for an internal M->E transition */
244 float new, bestsc; /* new score, best score so far */
245 int code; /* optimization: return code from build_cij() */
246 int ngap; /* gap count in a column */
247 float wgtsum; /* sum of weights; do not assume it is nseq */
251 matc = (float **) MallocOrDie (sizeof(float *) * (msa->alen+1));
252 sc = (float *) MallocOrDie (sizeof(float) * (msa->alen+2));
253 tbck = (int *) MallocOrDie (sizeof(int) * (msa->alen+2));
254 matassign = (int *) MallocOrDie (sizeof(int) * (msa->alen+1));
255 insopt = (int *) MallocOrDie (sizeof(int) * msa->nseq);
256 for (i = 0; i < msa->alen; i++) {
257 matc[i+1] = (float *) MallocOrDie (Alphabet_size * sizeof(float));
258 FSet(matc[i+1], Alphabet_size, 0.);
263 for (i = 0; i < msa->alen; i++)
264 for (idx = 0; idx < msa->nseq; idx++)
265 if (!isgap(msa->aseq[idx][i]))
266 P7CountSymbol(matc[i+1], SymbolIndex(msa->aseq[idx][i]), msa->wgt[idx]);
267 mpri = sreLOG2(mpri);
269 FCopy(insp, prior->i[0], Alphabet_size);
270 FNorm(insp, Alphabet_size);
271 wgtsum = FSum(msa->wgt, msa->nseq);
272 for (x = 0; x < Alphabet_size; x++)
273 insp[x] = sreLOG2(insp[x] / null[x]);
275 /* Estimate the relevant special transitions.
277 est_M = estimate_model_length(msa);
278 t_me = 0.5 / (float) (est_M-1);
280 bm2 = 0.5 / (float) (est_M-1);
281 bm1 = sreLOG2(bm1 / null_p1);
282 bm2 = sreLOG2(bm2 / null_p1);
284 /* Estimate the position of the last match-assigned column
285 * by counting gap frequencies.
288 for (last = msa->alen; last >= 1; last--) {
290 for (idx = 0; idx < msa->nseq; idx++)
291 if (isgap(msa->aseq[idx][last-1])) ngap++;
292 if ((float) ngap / (float) msa->nseq <= maxgap)
301 /* Set ME gaps to '_'
303 for (idx = 0; idx < msa->nseq; idx++)
304 for (i = last; i > 0 && isgap(msa->aseq[idx][i-1]); i--)
305 msa->aseq[idx][i-1] = '_';
307 /* Main recursion moves from right to left.
309 for (i = last-1; i > 0; i--) {
310 /* Calculate match emission scores for i */
311 FCopy(matp, matc[i], Alphabet_size);
312 P7PriorifyEmissionVector(matp, prior, prior->mnum, prior->mq, prior->m, NULL);
313 for (x = 0; x < Alphabet_size; x++)
314 matp[x] = sreLOG2(matp[x] / null[x]);
316 /* Initialize insert counters to zero */
317 FSet(insc, Alphabet_size, 0.);
318 for (idx = 0; idx < msa->nseq; idx++) insopt[idx] = 0;
321 for (j = i+1; j <= last; j++) {
322 /* build transition matrix for column pair i,j */
323 code = build_cij(msa->aseq, msa->nseq, insopt, i, j, msa->wgt, cij);
324 if (code == -1) break; /* no j to our right can work for us */
327 P7PriorifyTransitionVector(tij, prior, prior->tq);
329 tij[TMM] = sreLOG2(tij[TMM] / null_p1);
330 tij[TMI] = sreLOG2(tij[TMI] / null_p1);
331 tij[TMD] = sreLOG2(tij[TMD]);
332 tij[TIM] = sreLOG2(tij[TIM] / null_p1);
333 tij[TII] = sreLOG2(tij[TII] / null_p1);
334 tij[TDM] = sreLOG2(tij[TDM] / null_p1);
335 tij[TDD] = sreLOG2(tij[TDD]);
336 /* calculate the score of using this j. */
337 new = sc[j] + FDot(tij, cij, 7) + FDot(insp, insc, Alphabet_size);
339 SQD_DPRINTF2(("%3d %3d new=%6.2f scj=%6.2f m=%6.2f i=%6.2f t=%6.2f\n",
340 i, j, new, sc[j], FDot(matp, matc[i], Alphabet_size),
341 FDot(insp, insc, Alphabet_size), FDot(tij, cij, 7)));
343 /* keep it if it's better */
349 /* bump insc, insopt insert symbol counters */
350 FAdd(insc, matc[j], Alphabet_size);
351 for (idx = 0; idx < msa->nseq; idx++)
352 if (!isgap(msa->aseq[idx][j-1])) insopt[idx]++;
354 /* add in constant contributions for col i */
355 /* note ad hoc scaling of mpri by wgtsum (us. nseq)*/
356 sc[i] += FDot(matp, matc[i], Alphabet_size) + mpri * wgtsum;
357 } /* end loop over start positions i */
359 /* Termination: place the begin state.
360 * log odds score for S->N->B is all zero except for NB transition, which
361 * is a constant. So we only have to evaluate BM transitions.
364 for (i = 1; i <= last; i++) {
366 for (idx = 0; idx < msa->nseq; idx++) {
367 if (isgap(msa->aseq[idx][j-1]))
368 new += bm2; /* internal B->M transition */
370 new += bm1; /* B->M1 transition */
381 for (i = 1; i <= msa->alen; i++) matassign[i] = ASSIGN_INSERT;
382 for (i = first; i != 0; i = tbck[i]) {
383 matassign[i] &= ~ASSIGN_INSERT;
384 matassign[i] |= ASSIGN_MATCH;
387 /* Hand matassign off for remainder of model construction
389 /* print_matassign(matassign, ainfo->alen); */
390 matassign2hmm(msa, dsq, matassign, ret_hmm, ret_tr);
394 for (i = 1; i <= msa->alen; i++) free(matc[i]);
403 /* Function: build_cij()
405 * Purpose: Construct a counts vector for transitions between
406 * column i and column j in a multiple alignment.
408 * '_' gap characters indicate "external" gaps which
409 * are to be dealt with by B->M and M->E transitions.
410 * These characters must be placed by a preprocessor.
412 * insopt is an "insert optimization" -- an incrementor
413 * which keeps track of the number of insert symbols
416 * Args: aseqs - multiple alignment. [0.nseq-1][0.alen-1]
417 * nseq - number of seqs in aseqs
418 * insopt - number of inserts per seq between i/j [0.nseq-1]
419 * i - i column [1.alen], off by one from aseqs
420 * j - j column [1.alen], off by one from aseqs
421 * wgt - per-seq weights [0.nseq-1]
422 * cij - transition count vectors [0..7]
424 * Return: -1 if an illegal transition was seen for this i/j assignment *and*
425 * we are guaranteed that any j to the right will also
426 * have illegal transitions.
427 * 0 if an illegal transition was seen, but a j further to the
429 * 1 if all transitions were legal.
432 build_cij(char **aseqs, int nseq, int *insopt, int i, int j,
433 float *wgt, float *cij)
435 int idx; /* counter for seqs */
437 i--; /* make i,j relative to aseqs [0..alen-1] */
439 FSet(cij, 8, 0.); /* zero cij */
440 for (idx = 0; idx < nseq; idx++) {
441 if (insopt[idx] > 0) {
442 if (isgap(aseqs[idx][i])) return -1; /* D->I prohibited. */
443 if (isgap(aseqs[idx][j])) return 0; /* I->D prohibited. */
444 cij[TMI] += wgt[idx];
445 cij[TII] += (insopt[idx]-1) * wgt[idx];
446 cij[TIM] += wgt[idx];
448 if (!isgap(aseqs[idx][i])) {
449 if (aseqs[idx][j] == '_') ; /* YO! what to do with trailer? */
450 else if (isgap(aseqs[idx][j])) cij[TMD] += wgt[idx];
451 else cij[TMM] += wgt[idx];
452 } else { /* ignores B->E possibility */
453 if (aseqs[idx][j] == '_') continue;
454 else if (isgap(aseqs[idx][j])) cij[TDD] += wgt[idx];
455 else cij[TDM] += wgt[idx];
463 /* Function: estimate_model_length()
465 * Purpose: Return a decent guess about the length of the model,
466 * based on the lengths of the sequences.
468 * Algorithm is dumb: use weighted average length.
470 * Don't assume that weights sum to nseq!
473 estimate_model_length(MSA *msa)
479 for (idx = 0; idx < msa->nseq; idx++)
481 total += msa->wgt[idx] * DealignedLength(msa->aseq[idx]);
482 wgtsum += msa->wgt[idx];
485 return (int) (total / wgtsum);
489 /* Function: matassign2hmm()
491 * Purpose: Given an assignment of alignment columns to match vs.
492 * insert, finish the final part of the model construction
493 * calculation that is constant between model construction
496 * Args: msa - multiple sequence alignment
497 * dsq - digitized unaligned aseq's
498 * matassign - 1..alen bit flags for column assignments
499 * ret_hmm - RETURN: counts-form HMM
500 * ret_tr - RETURN: array of tracebacks for aseq's
503 * ret_hmm and ret_tr alloc'ed here for the calling
504 * modelmaker function.
507 matassign2hmm(MSA *msa, char **dsq, int *matassign,
508 struct plan7_s **ret_hmm, struct p7trace_s ***ret_tr)
510 struct plan7_s *hmm; /* RETURN: new hmm */
511 struct p7trace_s **tr; /* fake tracebacks for each seq */
512 int M; /* length of new model in match states */
513 int idx; /* counter over sequences */
514 int apos; /* counter for aligned columns */
516 /* how many match states in the HMM? */
518 for (apos = 1; apos <= msa->alen; apos++) {
519 if (matassign[apos] & ASSIGN_MATCH)
522 /* delimit N-terminal tail */
523 for (apos=1; matassign[apos] & ASSIGN_INSERT && apos <= msa->alen; apos++)
524 matassign[apos] |= EXTERNAL_INSERT_N;
525 if (apos <= msa->alen) matassign[apos] |= FIRST_MATCH;
527 /* delimit C-terminal tail */
528 for (apos=msa->alen; matassign[apos] & ASSIGN_INSERT && apos > 0; apos--)
529 matassign[apos] |= EXTERNAL_INSERT_C;
530 if (apos > 0) matassign[apos] |= LAST_MATCH;
532 /* print_matassign(matassign, msa->alen); */
534 /* make fake tracebacks for each seq */
535 fake_tracebacks(msa->aseq, msa->nseq, msa->alen, matassign, &tr);
536 /* build model from tracebacks */
539 for (idx = 0; idx < msa->nseq; idx++) {
540 /* P7PrintTrace(stdout, tr[idx], NULL, NULL); */
541 P7TraceCount(hmm, dsq[idx], msa->wgt[idx], tr[idx]);
543 /* annotate new model */
544 annotate_model(hmm, matassign, msa);
546 /* Set #=RF line of alignment to reflect our assignment
547 * of match, delete. matassign is valid from 1..alen and is off
548 * by one from msa->rf.
550 if (msa->rf != NULL) free(msa->rf);
551 msa->rf = (char *) MallocOrDie (sizeof(char) * (msa->alen + 1));
552 for (apos = 0; apos < msa->alen; apos++)
553 msa->rf[apos] = matassign[apos+1] & ASSIGN_MATCH ? 'x' : '.';
554 msa->rf[msa->alen] = '\0';
556 /* Cleanup and return. */
557 if (ret_tr != NULL) *ret_tr = tr;
558 else { for (idx = 0; idx < msa->nseq; idx++) P7FreeTrace(tr[idx]); free(tr); }
559 if (ret_hmm != NULL) *ret_hmm = hmm; else FreePlan7(hmm);
565 /* Function: fake_tracebacks()
567 * Purpose: From a consensus assignment of columns to MAT/INS, construct fake
568 * tracebacks for each individual sequence.
570 * Note: Fragment tolerant by default. Internal entries are
571 * B->M_x, instead of B->D1->D2->...->M_x; analogously
572 * for internal exits.
574 * Args: aseqs - alignment [0..nseq-1][0..alen-1]
575 * nseq - number of seqs in alignment
576 * alen - length of alignment in columns
577 * matassign - assignment of column; [1..alen] (off one from aseqs)
578 * ret_tr - RETURN: array of tracebacks
581 * ret_tr is alloc'ed here. Caller must free.
584 fake_tracebacks(char **aseq, int nseq, int alen, int *matassign,
585 struct p7trace_s ***ret_tr)
587 struct p7trace_s **tr;
588 int idx; /* counter over sequences */
589 int i; /* position in raw sequence (1..L) */
590 int k; /* position in HMM */
591 int apos; /* position in alignment columns */
592 int tpos; /* position in traceback */
594 tr = (struct p7trace_s **) MallocOrDie (sizeof(struct p7trace_s *) * nseq);
596 for (idx = 0; idx < nseq; idx++)
598 P7AllocTrace(alen+6, &tr[idx]); /* allow room for S,N,B,E,C,T */
600 /* all traces start with S state... */
601 tr[idx]->statetype[0] = STS;
602 tr[idx]->nodeidx[0] = 0;
604 /* ...and transit to N state; N-term tail
605 is emitted on N->N transitions */
606 tr[idx]->statetype[1] = STN;
607 tr[idx]->nodeidx[1] = 0;
613 for (apos = 0; apos < alen; apos++)
615 tr[idx]->statetype[tpos] = STBOGUS; /* bogus, deliberately, to debug */
617 if (matassign[apos+1] & FIRST_MATCH)
619 tr[idx]->statetype[tpos] = STB;
620 tr[idx]->nodeidx[tpos] = 0;
621 tr[idx]->pos[tpos] = 0;
625 if (matassign[apos+1] & ASSIGN_MATCH && ! isgap(aseq[idx][apos]))
627 k++; /* move to next model pos */
628 tr[idx]->statetype[tpos] = STM;
629 tr[idx]->nodeidx[tpos] = k;
630 tr[idx]->pos[tpos] = i;
634 else if (matassign[apos+1] & ASSIGN_MATCH)
636 /* being careful about S/W transitions; no B->D transitions */
637 k++; /* *always* move on model when ASSIGN_MATCH */
638 if (tr[idx]->statetype[tpos-1] != STB)
640 tr[idx]->statetype[tpos] = STD;
641 tr[idx]->nodeidx[tpos] = k;
642 tr[idx]->pos[tpos] = 0;
646 else if (matassign[apos+1] & EXTERNAL_INSERT_N &&
647 ! isgap(aseq[idx][apos]))
648 { /* N-TERMINAL TAIL */
649 tr[idx]->statetype[tpos] = STN;
650 tr[idx]->nodeidx[tpos] = 0;
651 tr[idx]->pos[tpos] = i;
655 else if (matassign[apos+1] & EXTERNAL_INSERT_C &&
656 ! isgap(aseq[idx][apos]))
657 { /* C-TERMINAL TAIL */
658 tr[idx]->statetype[tpos] = STC;
659 tr[idx]->nodeidx[tpos] = 0;
660 tr[idx]->pos[tpos] = i;
664 else if (! isgap(aseq[idx][apos]))
666 tr[idx]->statetype[tpos] = STI;
667 tr[idx]->nodeidx[tpos] = k;
668 tr[idx]->pos[tpos] = i;
673 if (matassign[apos+1] & LAST_MATCH)
675 /* be careful about S/W transitions; may need to roll
676 * back over some D's because there's no D->E transition
678 while (tr[idx]->statetype[tpos-1] == STD)
680 tr[idx]->statetype[tpos] = STE;
681 tr[idx]->nodeidx[tpos] = 0;
682 tr[idx]->pos[tpos] = 0;
684 /* and then transit E->C;
685 alignments that use J are undefined;
686 C-term tail is emitted on C->C transitions */
687 tr[idx]->statetype[tpos] = STC;
688 tr[idx]->nodeidx[tpos] = 0;
689 tr[idx]->pos[tpos] = 0;
693 /* all traces end with T state */
694 tr[idx]->statetype[tpos] = STT;
695 tr[idx]->nodeidx[tpos] = 0;
696 tr[idx]->pos[tpos] = 0;
697 tr[idx]->tlen = ++tpos;
698 /* deal with DI, ID transitions */
700 trace_doctor(tr[idx], k, NULL, NULL);
702 } /* end for sequence # idx */
708 /* Function: trace_doctor()
710 * Purpose: Plan 7 disallows D->I and I->D "chatter" transitions.
711 * However, these transitions may be implied by many
712 * alignments for hand- or heuristic- built HMMs.
713 * trace_doctor() collapses I->D or D->I into a
714 * single M position in the trace.
715 * Similarly, B->I and I->E transitions may be implied
718 * trace_doctor does not examine any scores when it does
719 * this. In ambiguous situations (D->I->D) the symbol
720 * will be pulled arbitrarily to the left, regardless
721 * of whether that's the best column to put it in or not.
723 * Args: tr - trace to doctor
724 * M - length of model that traces are for
725 * ret_ndi - number of DI transitions doctored
726 * ret_nid - number of ID transitions doctored
732 trace_doctor(struct p7trace_s *tr, int mlen, int *ret_ndi, int *ret_nid)
734 int opos; /* position in old trace */
735 int npos; /* position in new trace (<= opos) */
736 int ndi, nid; /* number of DI, ID transitions doctored */
738 /* overwrite the trace from left to right */
741 while (opos < tr->tlen) {
742 /* fix implied D->I transitions; D transforms to M, I pulled in */
743 if (tr->statetype[opos] == STD && tr->statetype[opos+1] == STI) {
744 tr->statetype[npos] = STM;
745 tr->nodeidx[npos] = tr->nodeidx[opos]; /* D transforms to M */
746 tr->pos[npos] = tr->pos[opos+1]; /* insert char moves back */
750 } /* fix implied I->D transitions; D transforms to M, I is pushed in */
751 else if (tr->statetype[opos]== STI && tr->statetype[opos+1]== STD) {
752 tr->statetype[npos] = STM;
753 tr->nodeidx[npos] = tr->nodeidx[opos+1];/* D transforms to M */
754 tr->pos[npos] = tr->pos[opos]; /* insert char moves up */
758 } /* fix implied B->I transitions; pull I back to its M */
759 else if (tr->statetype[opos]== STI && tr->statetype[opos-1]== STB) {
760 tr->statetype[npos] = STM;
761 tr->nodeidx[npos] = tr->nodeidx[opos]; /* offending I transforms to M */
762 tr->pos[npos] = tr->pos[opos];
765 } /* fix implied I->E transitions; push I to next M */
766 else if (tr->statetype[opos]== STI && tr->statetype[opos+1]== STE) {
767 tr->statetype[npos] = STM;
768 tr->nodeidx[npos] = tr->nodeidx[opos]+1;/* offending I transforms to M */
769 tr->pos[npos] = tr->pos[opos];
772 } /* rare: N-N-B-E becomes N-B-M_1-E (swap B,N) */
773 else if (tr->statetype[opos]==STB && tr->statetype[opos+1]==STE
774 && tr->statetype[opos-1]==STN && tr->pos[opos-1] > 0) {
775 tr->statetype[npos] = STM;
776 tr->nodeidx[npos] = 1;
777 tr->pos[npos] = tr->pos[opos-1];
778 tr->statetype[npos-1] = STB;
779 tr->nodeidx[npos-1] = 0;
783 } /* rare: B-E-C-C-x becomes B-M_M-E-C-x (swap E,C) */
784 else if (tr->statetype[opos]==STE && tr->statetype[opos-1]==STB
785 && tr->statetype[opos+1]==STC
786 && tr->statetype[opos+2]==STC) {
787 tr->statetype[npos] = STM;
788 tr->nodeidx[npos] = mlen;
789 tr->pos[npos] = tr->pos[opos+2];
790 tr->statetype[npos+1] = STE;
791 tr->nodeidx[npos+1] = 0;
793 tr->statetype[npos+2] = STC; /* first C must be a nonemitter */
794 tr->nodeidx[npos+2] = 0;
798 } /* everything else is just copied */
800 tr->statetype[npos] = tr->statetype[opos];
801 tr->nodeidx[npos] = tr->nodeidx[opos];
802 tr->pos[npos] = tr->pos[opos];
809 if (ret_ndi != NULL) *ret_ndi = ndi;
810 if (ret_nid != NULL) *ret_nid = nid;
815 /* Function: annotate_model()
817 * Purpose: Add rf, cs optional annotation to a new model.
819 * Args: hmm - new model
820 * matassign - which alignment columns are MAT; [1..alen]
821 * msa - alignment, including annotation to transfer
826 annotate_model(struct plan7_s *hmm, int *matassign, MSA *msa)
828 int apos; /* position in matassign, 1.alen */
829 int k; /* position in model, 1.M */
830 char *pri; /* X-PRM, X-PRI, X-PRT annotation */
832 /* Transfer reference coord annotation from alignment,
835 if (msa->rf != NULL) {
837 for (apos = k = 1; apos <= msa->alen; apos++)
838 if (matassign[apos] & ASSIGN_MATCH) /* ainfo is off by one from HMM */
839 hmm->rf[k++] = (msa->rf[apos-1] == ' ') ? '.' : msa->rf[apos-1];
841 hmm->flags |= PLAN7_RF;
844 /* Transfer consensus structure annotation from alignment,
847 if (msa->ss_cons != NULL) {
849 for (apos = k = 1; apos <= msa->alen; apos++)
850 if (matassign[apos] & ASSIGN_MATCH)
851 hmm->cs[k++] = (msa->ss_cons[apos-1] == ' ') ? '.' : msa->ss_cons[apos-1];
853 hmm->flags |= PLAN7_CS;
856 /* Transfer surface accessibility annotation from alignment,
859 if (msa->sa_cons != NULL) {
861 for (apos = k = 1; apos <= msa->alen; apos++)
862 if (matassign[apos] & ASSIGN_MATCH)
863 hmm->ca[k++] = (msa->sa_cons[apos-1] == ' ') ? '.' : msa->sa_cons[apos-1];
865 hmm->flags |= PLAN7_CA;
868 /* Store the alignment map
870 for (apos = k = 1; apos <= msa->alen; apos++)
871 if (matassign[apos] & ASSIGN_MATCH)
872 hmm->map[k++] = apos;
873 hmm->flags |= PLAN7_MAP;
875 /* Translate and transfer X-PRM annotation.
876 * 0-9,[a-zA-Z] are legal; translate as 0-9,10-35 into hmm->mpri.
877 * Any other char is translated as -1, and this will be interpreted
878 * as a flag that means "unknown", e.g. use the normal mixture Dirichlet
879 * procedure for this column.
881 if ((pri = MSAGetGC(msa, "X-PRM")) != NULL)
883 hmm->mpri = MallocOrDie(sizeof(int) * (hmm->M+1));
884 for (apos = k = 1; apos <= msa->alen; apos++)
885 if (matassign[apos] & ASSIGN_MATCH)
887 if (isdigit((int) pri[apos-1])) hmm->mpri[k] = pri[apos-1] - '0';
888 else if (islower((int) pri[apos-1])) hmm->mpri[k] = pri[apos-1] - 'a' + 10;
889 else if (isupper((int) pri[apos-1])) hmm->mpri[k] = pri[apos-1] - 'A' + 10;
890 else hmm->mpri[k] = -1;
894 /* And again for X-PRI annotation on insert priors:
896 if ((pri = MSAGetGC(msa, "X-PRI")) != NULL)
898 hmm->ipri = MallocOrDie(sizeof(int) * (hmm->M+1));
899 for (apos = k = 1; apos <= msa->alen; apos++)
900 if (matassign[apos] & ASSIGN_MATCH)
902 if (isdigit((int) pri[apos-1])) hmm->ipri[k] = pri[apos-1] - '0';
903 else if (islower((int) pri[apos-1])) hmm->ipri[k] = pri[apos-1] - 'a' + 10;
904 else if (isupper((int) pri[apos-1])) hmm->ipri[k] = pri[apos-1] - 'A' + 10;
905 else hmm->ipri[k] = -1;
909 /* And one last time for X-PRT annotation on transition priors:
911 if ((pri = MSAGetGC(msa, "X-PRT")) != NULL)
913 hmm->tpri = MallocOrDie(sizeof(int) * (hmm->M+1));
914 for (apos = k = 1; apos <= msa->alen; apos++)
915 if (matassign[apos] & ASSIGN_MATCH)
917 if (isdigit((int) pri[apos-1])) hmm->tpri[k] = pri[apos-1] - '0';
918 else if (islower((int) pri[apos-1])) hmm->tpri[k] = pri[apos-1] - 'a' + 10;
919 else if (isupper((int) pri[apos-1])) hmm->tpri[k] = pri[apos-1] - 'A' + 10;
920 else hmm->tpri[k] = -1;
928 print_matassign(int *matassign, int alen)
932 for (apos = 0; apos <= alen; apos++) {
933 printf("%3d %c %c %c\n",
935 (matassign[apos] & ASSIGN_MATCH) ? 'x':' ',
936 (matassign[apos] & FIRST_MATCH || matassign[apos] & LAST_MATCH) ? '<' : ' ',
937 (matassign[apos] & EXTERNAL_INSERT_N ||
938 matassign[apos] & EXTERNAL_INSERT_C) ? '|':' ');