1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
12 * SRE, Sat Nov 16 12:34:57 1996
13 * RCS $Id: trace.c,v 1.1.1.1 2005/03/22 08:34:07 cmzmasek Exp $
15 * Support for Plan 7 traceback data structure, p7trace_s.
32 static void rightjustify(char *s, int n);
34 /* Function: P7AllocTrace(), P7ReallocTrace(), P7FreeTrace()
36 * Purpose: allocation and freeing of traceback structures
39 P7AllocTrace(int tlen, struct p7trace_s **ret_tr)
43 tr = MallocOrDie (sizeof(struct p7trace_s));
44 tr->statetype = MallocOrDie (sizeof(char) * tlen);
45 tr->nodeidx = MallocOrDie (sizeof(int) * tlen);
46 tr->pos = MallocOrDie (sizeof(int) * tlen);
50 P7ReallocTrace(struct p7trace_s *tr, int tlen)
52 tr->statetype = ReallocOrDie (tr->statetype, tlen * sizeof(char));
53 tr->nodeidx = ReallocOrDie (tr->nodeidx, tlen * sizeof(int));
54 tr->pos = ReallocOrDie (tr->pos, tlen * sizeof(int));
57 P7FreeTrace(struct p7trace_s *tr)
65 /* Function: TraceSet()
66 * Date: SRE, Sun Mar 8 12:39:00 1998 [St. Louis]
68 * Purpose: Convenience function; set values at position tpos
72 * Args: tr - trace object to write to
73 * tpos - ptr to position in trace to set
74 * type - statetype e.g. STS, etc.
75 * idx - nodeidx 1..M or 0
76 * pos - seq position 1..L or 0
81 TraceSet(struct p7trace_s *tr, int tpos, char type, int idx, int pos)
83 tr->statetype[tpos] = type;
84 tr->nodeidx[tpos] = idx;
89 /* Function: MergeTraceArrays()
90 * Date: SRE, Sun Jul 5 15:09:10 1998 [St. Louis]
92 * Purpose: Combine two arrays of traces into a single array.
93 * Used in hmmalign to merge traces from a fixed alignment
94 * with traces from individual unaligned seqs.
96 * t1 traces always precede t2 traces in the resulting array.
98 * Args: t1 - first set of traces
99 * n1 - number of traces in t1
100 * t2 - second set of traces
101 * n2 - number of traces in t2
103 * Returns: pointer to new array of traces.
104 * Both t1 and t2 are free'd here! Do not reuse.
107 MergeTraceArrays(struct p7trace_s **t1, int n1, struct p7trace_s **t2, int n2)
109 struct p7trace_s **tr;
110 int i; /* index in traces */
112 tr = MallocOrDie(sizeof(struct p7trace_s *) * (n1+n2));
113 for (i = 0; i < n1; i++) tr[i] = t1[i];
114 for (i = 0; i < n2; i++) tr[n1+i] = t2[i];
122 /* Function: P7ReverseTrace()
123 * Date: SRE, Mon Aug 25 12:57:29 1997; Denver CO.
125 * Purpose: Reverse the arrays in a traceback structure.
126 * Tracebacks from Forward() and Viterbi() are
127 * collected backwards, and call this function
130 * It's possible to reverse the arrays in place
131 * more efficiently; but the realloc/copy strategy
132 * has the advantage of reallocating the trace
133 * into the right size of memory. (Tracebacks
136 * Args: tr - the traceback to reverse. tr->tlen must be set.
142 P7ReverseTrace(struct p7trace_s *tr)
151 statetype = MallocOrDie (sizeof(char)* tr->tlen);
152 nodeidx = MallocOrDie (sizeof(int) * tr->tlen);
153 pos = MallocOrDie (sizeof(int) * tr->tlen);
155 /* Reverse the trace.
157 for (opos = tr->tlen-1, npos = 0; npos < tr->tlen; npos++, opos--)
159 statetype[npos] = tr->statetype[opos];
160 nodeidx[npos] = tr->nodeidx[opos];
161 pos[npos] = tr->pos[opos];
164 /* Swap old, new arrays.
169 tr->statetype = statetype;
170 tr->nodeidx = nodeidx;
176 /* Function: P7TraceCount()
178 * Purpose: Count a traceback into a count-based HMM structure.
179 * (Usually as part of a model parameter re-estimation.)
181 * Args: hmm - counts-based HMM
182 * dsq - digitized sequence that traceback aligns to the HMM (1..L)
183 * wt - weight on the sequence
184 * tr - alignment of seq to HMM
189 P7TraceCount(struct plan7_s *hmm, char *dsq, float wt, struct p7trace_s *tr)
191 int tpos; /* position in tr */
192 int i; /* symbol position in seq */
194 for (tpos = 0; tpos < tr->tlen; tpos++)
199 * Don't bother counting null states N,J,C.
201 if (tr->statetype[tpos] == STM)
202 P7CountSymbol(hmm->mat[tr->nodeidx[tpos]], dsq[i], wt);
203 else if (tr->statetype[tpos] == STI)
204 P7CountSymbol(hmm->ins[tr->nodeidx[tpos]], dsq[i], wt);
206 /* State transition counts
208 switch (tr->statetype[tpos]) {
210 break; /* don't bother; P=1 */
212 switch (tr->statetype[tpos+1]) {
213 case STB: hmm->xt[XTN][MOVE] += wt; break;
214 case STN: hmm->xt[XTN][LOOP] += wt; break;
216 Die("illegal state transition %s->%s in traceback",
217 Statetype(tr->statetype[tpos]),
218 Statetype(tr->statetype[tpos+1]));
222 switch (tr->statetype[tpos+1]) {
223 case STM: hmm->begin[tr->nodeidx[tpos+1]] += wt; break;
224 case STD: hmm->tbd1 += wt; break;
226 Die("illegal state transition %s->%s in traceback",
227 Statetype(tr->statetype[tpos]),
228 Statetype(tr->statetype[tpos+1]));
232 switch (tr->statetype[tpos+1]) {
233 case STM: hmm->t[tr->nodeidx[tpos]][TMM] += wt; break;
234 case STI: hmm->t[tr->nodeidx[tpos]][TMI] += wt; break;
235 case STD: hmm->t[tr->nodeidx[tpos]][TMD] += wt; break;
236 case STE: hmm->end[tr->nodeidx[tpos]] += wt; break;
238 Die("illegal state transition %s->%s in traceback",
239 Statetype(tr->statetype[tpos]),
240 Statetype(tr->statetype[tpos+1]));
244 switch (tr->statetype[tpos+1]) {
245 case STM: hmm->t[tr->nodeidx[tpos]][TIM] += wt; break;
246 case STI: hmm->t[tr->nodeidx[tpos]][TII] += wt; break;
248 Die("illegal state transition %s->%s in traceback",
249 Statetype(tr->statetype[tpos]),
250 Statetype(tr->statetype[tpos+1]));
254 switch (tr->statetype[tpos+1]) {
255 case STM: hmm->t[tr->nodeidx[tpos]][TDM] += wt; break;
256 case STD: hmm->t[tr->nodeidx[tpos]][TDD] += wt; break;
257 case STE: /* ignore; p(D->E) = 1.0 */ break;
259 Die("illegal state transition %s->%s in traceback",
260 Statetype(tr->statetype[tpos]),
261 Statetype(tr->statetype[tpos+1]));
265 switch (tr->statetype[tpos+1]) {
266 case STC: hmm->xt[XTE][MOVE] += wt; break;
267 case STJ: hmm->xt[XTE][LOOP] += wt; break;
269 Die("illegal state transition %s->%s in traceback",
270 Statetype(tr->statetype[tpos]),
271 Statetype(tr->statetype[tpos+1]));
275 switch (tr->statetype[tpos+1]) {
276 case STB: hmm->xt[XTJ][MOVE] += wt; break;
277 case STJ: hmm->xt[XTJ][LOOP] += wt; break;
279 Die("illegal state transition %s->%s in traceback",
280 Statetype(tr->statetype[tpos]),
281 Statetype(tr->statetype[tpos+1]));
285 switch (tr->statetype[tpos+1]) {
286 case STT: hmm->xt[XTC][MOVE] += wt; break;
287 case STC: hmm->xt[XTC][LOOP] += wt; break;
289 Die("illegal state transition %s->%s in traceback",
290 Statetype(tr->statetype[tpos]),
291 Statetype(tr->statetype[tpos+1]));
295 break; /* T is the last. It makes no transitions. */
297 Die("illegal state %s in traceback",
298 Statetype(tr->statetype[tpos]));
304 /* Function: P7TraceScore()
306 * Purpose: Score a traceback and return the score in scaled bits.
308 * Args: hmm - HMM with valid log odds scores.
309 * dsq - digitized sequence that traceback aligns to the HMM (1..L)
310 * tr - alignment of seq to HMM
315 P7TraceScore(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr)
317 int score; /* total score as a scaled integer */
318 int tpos; /* position in tr */
319 int sym; /* digitized symbol in dsq */
321 /* P7PrintTrace(stdout, tr, hmm, dsq); */
323 for (tpos = 0; tpos < tr->tlen-1; tpos++)
325 sym = (int) dsq[tr->pos[tpos]];
328 * Don't bother counting null states N,J,C.
330 if (tr->statetype[tpos] == STM)
331 score += hmm->msc[sym][tr->nodeidx[tpos]];
332 else if (tr->statetype[tpos] == STI)
333 score += hmm->isc[sym][tr->nodeidx[tpos]];
335 /* State transitions.
337 score += TransitionScoreLookup(hmm,
338 tr->statetype[tpos], tr->nodeidx[tpos],
339 tr->statetype[tpos+1], tr->nodeidx[tpos+1]);
341 return Scorify(score);
346 /* Function: P7Traces2Alignment()
348 * Purpose: Convert an array of traceback structures for a set
349 * of sequences into a new multiple alignment.
351 * Insertions are put into lower case and
352 * are not aligned; instead, Nterm is right-justified,
353 * Cterm is left-justified, and internal insertions
354 * are split in half and the halves are justified in
355 * each direction (the objective being to increase
356 * the chances of getting insertions aligned well enough
357 * for them to become a match). SAM gap char conventions
358 * are used: - in match columns, . in insert columns
360 * NOTE: Does not recognize J state.
362 * Args: dsq - digitized unaligned sequences
363 * sqinfo - array of info about the sequences
364 * wgt - weights on seqs
365 * nseq - number of sequences
366 * mlen - length of model (number of match states)
367 * tr - array of tracebacks
368 * matchonly - TRUE if we don't print insert-generated symbols at all
369 * Return: MSA structure; NULL on failure.
370 * Caller responsible for freeing msa with MSAFree(msa);
373 P7Traces2Alignment(char **dsq, SQINFO *sqinfo, float *wgt, int nseq, int mlen,
374 struct p7trace_s **tr, int matchonly)
376 MSA *msa; /* RETURN: new alignment */
377 int idx; /* counter for sequences */
378 int alen; /* width of alignment */
379 int *inserts; /* array of max gaps between aligned columns */
380 int *matmap; /* matmap[k] = apos of match k [1..M] */
381 int nins; /* counter for inserts */
382 int apos; /* position in aligned sequence (0..alen-1)*/
383 int rpos; /* position in raw digital sequence (1..L)*/
384 int tpos; /* position counter in traceback */
385 int statetype; /* type of current state, e.g. STM */
386 int k; /* counter over states in model */
388 /* Here's the problem. We want to align the match states in columns,
389 * but some sequences have inserted symbols in them; we need some
390 * sort of overall knowledge of where the inserts are and how long
391 * they are in order to create the alignment.
393 * Here's our trick. inserts[] is a 0..hmm->M array; inserts[i] stores
394 * the maximum number of times insert substate i was used. This
395 * is the maximum number of gaps to insert between canonical
396 * column i and i+1. inserts[0] is the N-term tail; inserts[M] is
399 * Remember that N and C emit on transition, hence the check for an
400 * N->N or C->C transition before bumping nins.
402 inserts = (int *) MallocOrDie (sizeof(int) * (mlen+1));
403 for (k = 0; k <= mlen; k++)
405 for (idx = 0; idx < nseq; idx++) {
407 for (tpos = 0; tpos < tr[idx]->tlen; tpos++) {
408 switch (tr[idx]->statetype[tpos]) {
409 case STI: nins++; break;
410 case STN: if (tr[idx]->statetype[tpos-1] == STN) nins++; break;
411 case STC: if (tr[idx]->statetype[tpos-1] == STC) nins++; break;
413 case STD: /* M,D: record max. reset ctr. */
414 if (nins > inserts[tr[idx]->nodeidx[tpos]-1])
415 inserts[tr[idx]->nodeidx[tpos]-1] = nins;
418 case STB: /* B; record N-tail max, reset ctr */
419 if (nins > inserts[0])
423 case STT: /* T: record C-tail max */
424 if (nins > inserts[mlen])
425 inserts[mlen] = nins;
427 case STS: case STE: break; /* ignore other states */
429 Die("yo! you don't support J in Traces2Alignment(), remember?");
431 Die("Traces2Alignment reports unrecognized statetype %c",
432 Statetype(tr[idx]->statetype[tpos]));
437 /* Insert compression option. */
439 for (k = 0; k <= mlen; k++)
443 /***********************************************
444 * Construct the alignment
445 ***********************************************/
446 /* calculate alignment length and matmap */
447 matmap= (int *) MallocOrDie (sizeof(int) * (mlen+1));
450 for (k = 1; k <= mlen ; k++) {
452 alen += inserts[k] + 1;
454 /* allocation for new alignment */
455 msa = MSAAlloc(nseq, alen);
457 for (idx = 0; idx < nseq; idx++) {
459 for (apos = 0; apos < alen; apos++)
460 msa->aseq[idx][apos] = '.';
461 for (k = 1; k <= mlen; k++)
462 msa->aseq[idx][matmap[k]] = '-';
463 msa->aseq[idx][alen] = '\0';
464 /* align the sequence */
466 for (tpos = 0; tpos < tr[idx]->tlen; tpos++) {
467 statetype = tr[idx]->statetype[tpos]; /* just for clarity */
468 rpos = tr[idx]->pos[tpos];
469 k = tr[idx]->nodeidx[tpos];
471 if (statetype == STM) {
473 msa->aseq[idx][apos] = Alphabet[(int) dsq[idx][rpos]];
476 else if (statetype == STI) {
478 msa->aseq[idx][apos] = '*'; /* insert compression option */
480 msa->aseq[idx][apos] = (char) tolower((int) Alphabet[(int) dsq[idx][rpos]]);
484 else if ((statetype == STN || statetype == STC) && rpos > 0) {
486 msa->aseq[idx][apos] = '*'; /* insert compression option */
488 msa->aseq[idx][apos] = (char) tolower((int) Alphabet[(int) dsq[idx][rpos]]);
492 else if (statetype == STE)
493 apos = matmap[mlen]+1; /* set position for C-term tail */
496 /* N-terminal extension is right-justified.
497 * Internal inserts are split in half, and C-term is right-justified.
498 * C-terminal extension remains left-justified.
501 rightjustify(msa->aseq[idx], inserts[0]);
503 for (k = 1; k < mlen; k++)
504 if (inserts[k] > 1) {
505 for (nins = 0, apos = matmap[k]+1; islower((int) (msa->aseq[idx][apos])); apos++)
507 nins /= 2; /* split the insertion in half */
508 rightjustify(msa->aseq[idx]+matmap[k]+1+nins, inserts[k]-nins);
514 /***********************************************
515 * Build the rest of the MSA annotation.
516 ***********************************************/
520 msa->au = MallocOrDie(sizeof(char) * (strlen(RELEASE)+7));
521 sprintf(msa->au, "HMMER %s", RELEASE);
522 /* copy sqinfo array and weights */
523 for (idx = 0; idx < nseq; idx++)
525 msa->sqname[idx] = sre_strdup(sqinfo[idx].name, -1);
526 if (sqinfo[idx].flags & SQINFO_ACC)
527 MSASetSeqAccession(msa, idx, sqinfo[idx].acc);
528 if (sqinfo[idx].flags & SQINFO_DESC)
529 MSASetSeqAccession(msa, idx, sqinfo[idx].desc);
531 if (sqinfo[idx].flags & SQINFO_SS) {
532 if (msa->ss == NULL) msa->ss = MallocOrDie(sizeof(char *) * nseq);
533 MakeAlignedString(msa->aseq[idx], alen,
534 sqinfo[idx].ss, &(msa->ss[idx]));
536 if (sqinfo[idx].flags & SQINFO_SA) {
537 if (msa->sa == NULL) msa->sa = MallocOrDie(sizeof(char *) * nseq);
538 MakeAlignedString(msa->aseq[idx], alen,
539 sqinfo[idx].sa, &(msa->sa[idx]));
541 msa->wgt[idx] = wgt[idx];
544 /* #=RF annotation: x for match column, . for insert column
546 msa->rf = (char *) MallocOrDie (sizeof(char) * (alen+1));
547 for (apos = 0; apos < alen; apos++)
549 for (k = 1; k <= mlen; k++)
550 msa->rf[matmap[k]] = 'x';
551 msa->rf[alen] = '\0';
553 /* Currently, we produce no consensus structure.
554 * #=CS, generated from HMM structural annotation, would go here.
562 /* Function: TransitionScoreLookup()
564 * Purpose: Convenience function used in PrintTrace() and TraceScore();
565 * given state types and node indices for a transition,
566 * return the integer score for that transition.
569 TransitionScoreLookup(struct plan7_s *hmm, char st1, int k1,
573 case STS: return 0; /* S never pays */
576 case STB: return hmm->xsc[XTN][MOVE];
577 case STN: return hmm->xsc[XTN][LOOP];
578 default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2));
583 case STM: return hmm->bsc[k2];
584 case STD: return Prob2Score(hmm->tbd1, 1.);
585 default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2));
590 case STM: return hmm->tsc[k1][TMM];
591 case STI: return hmm->tsc[k1][TMI];
592 case STD: return hmm->tsc[k1][TMD];
593 case STE: return hmm->esc[k1];
594 default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2));
599 case STM: return hmm->tsc[k1][TIM];
600 case STI: return hmm->tsc[k1][TII];
601 default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2));
606 case STM: return hmm->tsc[k1][TDM];
607 case STD: return hmm->tsc[k1][TDD];
608 case STE: return 0; /* D_m->E has probability 1.0 by definition in Plan7 */
609 default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2));
614 case STC: return hmm->xsc[XTE][MOVE];
615 case STJ: return hmm->xsc[XTE][LOOP];
616 default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2));
621 case STB: return hmm->xsc[XTJ][MOVE];
622 case STJ: return hmm->xsc[XTJ][LOOP];
623 default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2));
628 case STT: return hmm->xsc[XTC][MOVE];
629 case STC: return hmm->xsc[XTC][LOOP];
630 default: Die("illegal %s->%s transition", Statetype(st1), Statetype(st2));
633 case STT: return 0; /* T makes no transitions */
634 default: Die("illegal state %s in traceback", Statetype(st1));
641 /* Function: CreateFancyAli()
642 * Date: SRE, Mon Oct 27 06:49:44 1997 [Sanger Centre UK]
644 * Purpose: Output of an HMM/sequence alignment, using a
645 * traceback structure. Deliberately similar to
646 * the output of BLAST, to make it easier for
647 * people to adapt their Perl parsers (or what have
648 * you) from BLAST to HMMER.
650 * Args: tr - traceback structure that gives the alignment
652 * dsq - the sequence (digitized form)
653 * name- name of the sequence
655 * Return: allocated, filled fancy alignment structure.
658 CreateFancyAli(struct p7trace_s *tr, struct plan7_s *hmm,
659 char *dsq, char *name)
661 struct fancyali_s *ali; /* alignment to create */
662 int tpos; /* position in trace and alignment */
663 int bestsym; /* index of best symbol at this pos */
664 float mthresh; /* above this P(x), display uppercase */
666 /* Allocate and initialize the five lines of display
668 ali = AllocFancyAli();
671 ali->model = MallocOrDie (sizeof(char) * (tr->tlen+1));
672 ali->mline = MallocOrDie (sizeof(char) * (tr->tlen+1));
673 ali->aseq = MallocOrDie (sizeof(char) * (tr->tlen+1));
675 memset(ali->model, ' ', tr->tlen);
676 memset(ali->mline, ' ', tr->tlen);
677 memset(ali->aseq, ' ', tr->tlen);
679 if (hmm->flags & PLAN7_RF)
681 ali->rfline = (char *) MallocOrDie (sizeof(char) * (tr->tlen+1));
682 memset(ali->rfline, ' ', tr->tlen);
684 if (hmm->flags & PLAN7_CS)
686 ali->csline = (char *) MallocOrDie (sizeof(char) * (tr->tlen+1));
687 memset(ali->csline, ' ', tr->tlen);
690 ali->query = Strdup(hmm->name);
691 ali->target = Strdup(name);
693 if (Alphabet_type == hmmAMINO) mthresh = 0.5;
696 /* Find first, last seq position
697 * HMM start/end positions currently not recorded, because there
698 * might be multiple HMM hits per sequence.
700 for (tpos = 0; tpos < tr->tlen; tpos++)
701 if (tr->pos[tpos] > 0) {
702 ali->sqfrom = tr->pos[tpos];
705 for (tpos = tr->tlen-1; tpos >= 0; tpos--)
706 if (tr->pos[tpos] > 0) {
707 ali->sqto = tr->pos[tpos];
711 /* Fill in the five lines of display
713 for (tpos = 0; tpos < tr->tlen; tpos++) {
714 switch (tr->statetype[tpos]) {
717 ali->model[tpos] = '*';
723 ali->model[tpos] = '-';
724 if (tr->pos[tpos] > 0) {
725 ali->aseq[tpos] = tolower(Alphabet[(int) dsq[tr->pos[tpos]]]);
730 ali->model[tpos] = '>';
734 ali->model[tpos] = '<';
738 if (hmm->flags & PLAN7_RF) ali->rfline[tpos] = hmm->rf[tr->nodeidx[tpos]];
739 if (hmm->flags & PLAN7_CS) ali->csline[tpos] = hmm->cs[tr->nodeidx[tpos]];
740 bestsym = FMax(hmm->mat[tr->nodeidx[tpos]], Alphabet_size);
741 ali->model[tpos] = Alphabet[bestsym];
742 if (hmm->mat[tr->nodeidx[tpos]][bestsym] < mthresh)
743 ali->model[tpos] = tolower(ali->model[tpos]);
744 if (dsq[tr->pos[tpos]] == bestsym)
746 ali->mline[tpos] = Alphabet[(int) dsq[tr->pos[tpos]]];
747 if (hmm->mat[tr->nodeidx[tpos]][bestsym] < mthresh)
748 ali->mline[tpos] = tolower(ali->mline[tpos]);
750 else if (hmm->msc[(int) dsq[tr->pos[tpos]]] [tr->nodeidx[tpos]] > 0)
751 ali->mline[tpos] = '+';
752 ali->aseq[tpos] = Alphabet[(int) dsq[tr->pos[tpos]]];
756 if (hmm->flags & PLAN7_RF) ali->rfline[tpos] = hmm->rf[tr->nodeidx[tpos]];
757 if (hmm->flags & PLAN7_CS) ali->csline[tpos] = hmm->cs[tr->nodeidx[tpos]];
758 bestsym = FMax(hmm->mat[tr->nodeidx[tpos]], Alphabet_size);
759 ali->model[tpos] = Alphabet[bestsym];
760 if (hmm->mat[tr->nodeidx[tpos]][bestsym] < mthresh)
761 ali->model[tpos] = tolower(ali->model[tpos]);
762 ali->aseq[tpos] = '-';
766 ali->model[tpos] = '.';
767 if (hmm->isc[(int) dsq[tr->pos[tpos]]] [tr->nodeidx[tpos]] > 0)
768 ali->mline[tpos] = '+';
769 ali->aseq[tpos] = (char) tolower((int) Alphabet[(int) dsq[tr->pos[tpos]]]);
773 Die("bogus statetype");
774 } /* end switch over statetypes */
775 } /* end loop over tpos */
778 if (hmm->flags & PLAN7_RF) ali->rfline[tpos] = '\0';
779 if (hmm->flags & PLAN7_CS) ali->csline[tpos] = '\0';
780 ali->model[tpos] = '\0';
781 ali->mline[tpos] = '\0';
782 ali->aseq[tpos] = '\0';
787 /* Function: PrintFancyAli()
788 * Date: SRE, Mon Oct 27 06:56:42 1997 [Sanger Centre UK]
790 * Purpose: Print an HMM/sequence alignment from a fancyali_s
791 * structure. Line length controlled by ALILENGTH in
792 * config.h (set to 50).
794 * Args: fp - where to print it (stdout or open FILE)
795 * ali - alignment to print
800 PrintFancyAli(FILE *fp, struct fancyali_s *ali)
802 char buffer[ALILENGTH+1]; /* output line buffer */
807 buffer[ALILENGTH] = '\0';
808 endi = ali->sqfrom - 1;
809 for (pos = 0; pos < ali->len; pos += ALILENGTH)
811 /* coords of target seq for this line */
813 for (i = pos; ali->aseq[i] != '\0' && i < pos + ALILENGTH; i++)
814 if (!isgap(ali->aseq[i])) endi++;
816 if (ali->csline != NULL) {
817 strncpy(buffer, ali->csline+pos, ALILENGTH);
818 fprintf(fp, " %16s %s\n", "CS", buffer);
820 if (ali->rfline != NULL) {
821 strncpy(buffer, ali->rfline+pos, ALILENGTH);
822 fprintf(fp, " %16s %s\n", "RF", buffer);
824 if (ali->model != NULL) {
825 strncpy(buffer, ali->model+pos, ALILENGTH);
826 fprintf(fp, " %16s %s\n", " ", buffer);
828 if (ali->mline != NULL) {
829 strncpy(buffer, ali->mline+pos, ALILENGTH);
830 fprintf(fp, " %16s %s\n", " ", buffer);
832 if (ali->aseq != NULL) {
833 strncpy(buffer, ali->aseq+pos, ALILENGTH);
835 fprintf(fp, " %10.10s %5d %s %-5d\n\n", ali->target, starti, buffer, endi);
837 fprintf(fp, " %10.10s %5s %s %-5s\n\n", ali->target, "-", buffer, "-");
841 /* Cleanup and return
849 /* Function: TraceDecompose()
850 * Date: Sat Aug 30 11:18:40 1997 (Denver CO)
852 * Purpose: Decompose a long multi-hit trace into zero or more
853 * traces without N,C,J transitions: for consistent
854 * scoring and statistical evaluation of single domain
857 * Args: otr - original trace structure
858 * ret_tr - RETURN: array of simpler traces
859 * ret_ntr- RETURN: number of traces.
862 * ret_tr alloc'ed here; free individuals with FreeTrace().
865 TraceDecompose(struct p7trace_s *otr, struct p7trace_s ***ret_tr, int *ret_ntr)
867 struct p7trace_s **tr; /* array of new traces */
868 int ntr; /* number of traces */
869 int i,j; /* position counters in traces */
870 int idx; /* index over ntr subtraces */
872 /* First pass: count begin states to get ntr.
874 for (ntr = 0, i = 0; i < otr->tlen; i++)
875 if (otr->statetype[i] == STB) ntr++;
884 tr = (struct p7trace_s **) MallocOrDie (sizeof(struct p7trace_s *) * ntr);
886 for (idx = 0, i = 0; i < otr->tlen; i++) /* i = position in old trace */
887 if (otr->statetype[i] == STB)
889 for (j = i+1; j < otr->tlen; j++) /* j = tmp; get length of subtrace */
890 if (otr->statetype[j] == STE) break;
891 /* trace = S-N-(B..E)-C-T : len + 4 : j-i+1 + 4*/
892 P7AllocTrace(j-i+5, &(tr[idx]));
893 tr[idx]->tlen = j-i+5;
895 tr[idx]->statetype[0] = STS;
896 tr[idx]->nodeidx[0] = 0;
898 tr[idx]->statetype[1] = STN;
899 tr[idx]->nodeidx[1] = 0;
901 j = 2; /* now j = position in new subtrace */
902 while (1) /* copy subtrace */
904 tr[idx]->statetype[j] = otr->statetype[i];
905 tr[idx]->nodeidx[j] = otr->nodeidx[i];
906 tr[idx]->pos[j] = otr->pos[i];
907 if (otr->statetype[i] == STE) break;
911 tr[idx]->statetype[j] = STC;
912 tr[idx]->nodeidx[j] = 0;
915 tr[idx]->statetype[j] = STT;
916 tr[idx]->nodeidx[j] = 0;
927 /* Function: TraceDomainNumber()
929 * Purpose: Count how many times we traverse the
930 * model in a single Plan7 trace -- equivalent
931 * to counting the number of domains.
933 * (A weakness is that we might discard some of
934 * those domains because they have low scores
935 * below E or T threshold.)
938 TraceDomainNumber(struct p7trace_s *tr)
943 for (i = 0; i < tr->tlen; i++)
944 if (tr->statetype[i] == STB) ndom++;
949 /* Function: TraceSimpleBounds()
951 * Purpose: For a trace that contains only a single
952 * traverse of the model (i.e. something that's
953 * come from TraceDecompose(), or a global
954 * alignment), determine the bounds of
955 * the match on both the sequence [1..L] and the
958 * Args: tr - trace to look at
959 * i1 - RETURN: start point in sequence [1..L]
960 * i2 - RETURN: end point in sequence [1..L]
961 * k1 - RETURN: start point in model [1..M]
962 * k2 - RETURN: end point in model [1..M]
965 TraceSimpleBounds(struct p7trace_s *tr, int *ret_i1, int *ret_i2,
966 int *ret_k1, int *ret_k2)
968 int i1, i2, k1, k2, tpos;
970 i1 = k1 = i2 = k2 = -1;
972 /* Look forwards to find start of match */
973 for (tpos = 0; tpos < tr->tlen; tpos++)
975 if (k1 == -1 && (tr->statetype[tpos] == STM || tr->statetype[tpos] == STD))
976 k1 = tr->nodeidx[tpos];
977 if (tr->statetype[tpos] == STM)
983 if (tpos == tr->tlen || i1 == -1 || k1 == -1)
984 Die("sanity check failed: didn't find a match state in trace");
986 /* Look backwards to find end of match */
987 for (tpos = tr->tlen-1; tpos >= 0; tpos--)
989 if (k2 == -1 && (tr->statetype[tpos] == STM || tr->statetype[tpos] == STD))
990 k2 = tr->nodeidx[tpos];
991 if (tr->statetype[tpos] == STM)
997 if (tpos == tr->tlen || i2 == -1 || k2 == -1)
998 Die("sanity check failed: didn't find a match state in trace");
1007 /* Function: MasterTraceFromMap()
1008 * Date: SRE, Tue Jul 7 18:51:11 1998 [St. Louis]
1010 * Purpose: Convert an alignment map (e.g. hmm->map) to
1011 * a master trace. Used for mapping an alignment
1012 * onto an HMM. Generally precedes a call to
1013 * ImposeMasterTrace(). Compare P7ViterbiAlignAlignment(),
1014 * which aligns an alignment to the model using a
1015 * Viterbi algorithm to get a master trace.
1016 * MasterTraceFromMap() only works if the alignment
1017 * is exactly the one used to train the model.
1019 * Args: map - the map (usually hmm->map is passed) 1..M
1020 * M - length of map (model; usually hmm->M passed)
1021 * alen - length of alignment that map refers to
1023 * Returns: ptr to master trace
1024 * Caller must free: P7FreeTrace().
1027 MasterTraceFromMap(int *map, int M, int alen)
1029 struct p7trace_s *tr; /* RETURN: master trace */
1030 int tpos; /* position in trace */
1031 int apos; /* position in alignment, 1..alen */
1032 int k; /* position in model */
1034 /* Allocate for the trace.
1035 * S-N-B- ... - E-C-T : 6 states + alen is maximum trace,
1036 * because each of alen columns is an N*, M*, I*, or C* metastate.
1037 * No D* metastates possible.
1039 P7AllocTrace(alen+6, &tr);
1041 /* Initialize the trace
1044 TraceSet(tr, tpos, STS, 0, 0); tpos++;
1045 TraceSet(tr, tpos, STN, 0, 0); tpos++;
1049 for (apos = 1; apos < map[1]; apos++) {
1050 TraceSet(tr, tpos, STN, 0, apos); tpos++;
1051 } /* now apos == map[1] */
1052 TraceSet(tr, tpos, STB, 0, 0); tpos++;
1054 for (k = 1; k < M; k++)
1056 TraceSet(tr, tpos, STM, k, apos); tpos++;
1059 for (; apos < map[k+1]; apos++) {
1060 TraceSet(tr, tpos, STI, k, apos); tpos++;
1062 } /* now apos == map[M] and k == M*/
1064 TraceSet(tr, tpos, STM, M, apos); tpos++;
1069 TraceSet(tr, tpos, STE, 0, 0); tpos++;
1070 TraceSet(tr, tpos, STC, 0, 0); tpos++;
1071 for (; apos <= alen; apos++) {
1072 TraceSet(tr, tpos, STC, 0, apos); tpos++;
1075 /* Terminate and return
1077 TraceSet(tr, tpos, STT, 0, 0); tpos++;
1084 /* Function: ImposeMasterTrace()
1085 * Date: SRE, Sun Jul 5 14:27:16 1998 [St. Louis]
1087 * Purpose: Goes with P7ViterbiAlignAlignment(), which gives us
1088 * a "master trace" for a whole alignment. Now, given
1089 * the alignment and the master trace, construct individual
1090 * tracebacks for each sequence. Later we'll hand these
1091 * (and presumably other traces) to P7Traces2Alignment().
1093 * It is possible to generate individual traces that
1094 * are not consistent with Plan7 (e.g. D->I and I->D
1095 * transitions may be present). P7Traces2Alignment()
1096 * can handle such traces; other functions may not.
1097 * See modelmaker.c:trace_doctor() if this is a problem.
1099 * Akin to modelmaker.c:fake_tracebacks().
1101 * Args: aseq - aligned seqs
1102 * nseq - number of aligned seqs
1103 * mtr - master traceback
1104 * ret_tr- RETURN: array of individual tracebacks, one for each aseq
1109 ImposeMasterTrace(char **aseq, int nseq, struct p7trace_s *mtr, struct p7trace_s ***ret_tr)
1111 struct p7trace_s **tr;
1112 int idx; /* counter over sequences */
1113 int i; /* position in raw sequence (1..L) */
1114 int tpos; /* position in traceback */
1115 int mpos; /* position in master trace */
1117 tr = (struct p7trace_s **) MallocOrDie (sizeof(struct p7trace_s *) * nseq);
1119 for (idx = 0; idx < nseq; idx++)
1121 P7AllocTrace(mtr->tlen, &tr[idx]); /* we're guaranteed that individuals len < master len */
1125 for (mpos = 0; mpos < mtr->tlen; mpos++)
1127 switch (mtr->statetype[mpos])
1129 case STS: /* straight copies w/ no emission: S, B, D, E, T*/
1134 TraceSet(tr[idx], tpos, mtr->statetype[mpos], mtr->nodeidx[mpos], 0);
1138 case STM: /* M* implies M or D */
1139 if (isgap(aseq[idx][mtr->pos[mpos]-1]))
1140 TraceSet(tr[idx], tpos, STD, mtr->nodeidx[mpos], 0);
1142 TraceSet(tr[idx], tpos, STM, mtr->nodeidx[mpos], i);
1148 case STI: /* I* implies I or nothing */
1149 if (!isgap(aseq[idx][mtr->pos[mpos]-1])) {
1150 TraceSet(tr[idx], tpos, STI, mtr->nodeidx[mpos], i);
1156 case STJ: /* N,J,C: first N* -> N. After that, N* -> N or nothing. */
1159 if (mtr->pos[mpos] == 0) {
1160 TraceSet(tr[idx], tpos, mtr->statetype[mpos], 0, 0);
1162 } else if (!isgap(aseq[idx][mtr->pos[mpos]-1])) {
1163 TraceSet(tr[idx], tpos, mtr->statetype[mpos], 0, i);
1170 Die("never happens. Trust me.");
1173 tr[idx]->tlen = tpos;
1179 /* Function: rightjustify()
1181 * Purpose: Given a gap-containing string of length n,
1182 * pull all the non-gap characters as far as
1183 * possible to the right, leaving gaps on the
1184 * left side. Used to rearrange the positions
1185 * of insertions in HMMER alignments.
1188 rightjustify(char *s, int n)
1196 if (isgap(s[opos])) opos--;
1197 else s[npos--]=s[opos--];