1 /* Source code from Cambridge visit July 1997
3 * Position-specific matrices.
22 /* Function: MakeStarHMM()
24 * Purpose: Given an HMM with counts, create an HMM according
25 * to the star rule. In star models we typically expect
26 * that the counts have been collected using BLOSUM style
29 * Args: hmm - HMM structure containing counts data
30 * mx - Star vectors, mx[q][x]
31 * pq - vector prior P(q)
32 * nq - number of vectors
33 * pri - Dirichlet priors for other parameters
36 * hmm is converted to probabilities.
39 MakeStarHMM(struct plan7_s *hmm, float **mx, float *pq, int nq, struct p7prior_s *pri)
41 int k; /* counter over model position */
42 int x; /* counter over symbol/transition */
43 float *pxa; /* P(x | a) : our parameter estimate */
44 float *pqa; /* P(q | a) for all q */
45 int q; /* counters over vectors q */
46 int ai; /* counter over symbols */
48 /* Match emissions: Star rule implementation.
50 pxa = (float *) MallocOrDie(sizeof(float) * Alphabet_size);
51 pqa = (float *) MallocOrDie(sizeof(float) * nq);
52 for (k = 1; k <= hmm->M; k++)
54 /* calculate log P(q | a) unnormalized (i.e. + log P(a))*/
55 for (q = 0; q < nq; q++)
58 for (ai = 0; ai < Alphabet_size; ai++)
59 pqa[q] += hmm->mat[k][ai] * log(mx[q][ai]);
61 /* calculate log P(x | a) unnormalized (i.e + log P(a))*/
62 for (x = 0; x < Alphabet_size; x++)
64 pxa[x] = pqa[0] + log(mx[0][x]);
65 for (q = 1; q < nq; q++)
66 pxa[x] = LogSum(pxa[x], (pqa[q] + log(mx[q][x])));
68 /* normalize now to get P(x|a) and store */
69 LogNorm(pxa, Alphabet_size);
70 FCopy(hmm->mat[k], pxa, Alphabet_size);
74 /* Everything else is done according to P7PriorifyHMM()
76 /* Model-dependent transitions are handled simply; Laplace.
78 FSet(hmm->begin+2, hmm->M-1, 0.); /* wipe internal BM entries */
79 FSet(hmm->end+1, hmm->M-1, 0.); /* wipe internal ME exits */
83 /* Main model transitions and insert emissions
85 for (k = 1; k < hmm->M; k++)
87 P7PriorifyTransitionVector(hmm->t[k], pri);
88 P7PriorifyEmissionVector(hmm->ins[k], pri, pri->inum, pri->iq, pri->i, NULL);
91 Plan7Renormalize(hmm);
101 /* Function: MakeIslandHMM()
103 * Purpose: Given a sequence alignment of i = 1..nseq sequences,
104 * with columns j = 1..alen; and a sequence index idx
105 * to build the island from. Return a Plan7 island HMM in
108 * Args: aseqs - alignment
109 * ainfo - alignment info
110 * idx - index of which sequence to build island from
111 * null - random sequence model [0..Alphabet_size-1]
112 * mx - probability matrices mx[q][root b][x]
113 * bpri - priors on root distributions bpri[q][root b]
114 * qpri - prior probability distribution over matrices
115 * nmx - number of joint probability matrices
117 * Return: a new Plan7 HMM
120 MakeIslandHMM(char **aseqs, AINFO *ainfo, int idx,
121 float null[MAXABET], float ***mx, float **bpri,
122 float *qpri, int nmx)
124 struct plan7_s *hmm; /* RETURN: Plan7 HMM */
125 int j; /* column position index */
126 int k; /* model position index */
127 int q; /* counter for matrices */
128 int x; /* counter for symbols */
129 float *mat; /* a match emission probability vector */
130 float **probq; /* posterior P(q | column) */
131 int sym; /* index of a symbol in alphabet */
134 float **pxaq; /* P(x | a,q) vectors, [q][x] */
135 int b; /* counter over root symbols */
137 /* Allocate a model which is the length of the
140 hmm = AllocPlan7(DealignedLength(aseqs[idx]));
141 if (ainfo->sqinfo[idx].flags & SQINFO_NAME)
142 Plan7SetName(hmm, ainfo->sqinfo[idx].name);
143 if (ainfo->sqinfo[idx].flags & SQINFO_DESC)
144 Plan7SetDescription(hmm, ainfo->sqinfo[idx].desc);
145 Plan7SetNullModel(hmm, null, 350./351.); /* p1 made up; shouldn't matter*/
147 mat = (float *) MallocOrDie( sizeof(float) * Alphabet_size);
148 pxaq = FMX2Alloc(nmx, Alphabet_size);
150 /* Calculate the posterior probability distribution
151 * probq (= P(q | col)) over nmx different matrices
152 * at each column j -- probq[0..alen-1][0..nmx-1];
153 * currently does not use the prior on q, but does a
154 * winner-take-all rule.
156 probq = FMX2Alloc(ainfo->alen, nmx);
157 calc_probq(aseqs, ainfo, mx, bpri, qpri, nmx, probq);
161 print_probq(stdout, probq, ainfo->alen, nmx);
163 for (k = 1, j = 0; j < ainfo->alen; j++)
165 if (isgap(aseqs[idx][j])) continue;
167 if (strchr(Alphabet, aseqs[idx][j]) != NULL)
168 sym = SYMIDX(aseqs[idx][j]);
170 Die("MakeIslandHMM() can't handle ambiguous query symbols yet");
173 /* Calculate P(x | a, q) emission vectors for all matrices q
175 for (q = 0; q < nmx; q++)
177 for (x = 0; x < Alphabet_size; x++)
180 for (b = 0; b < 20; b++)
181 pxaq[q][x] += mx[q][b][x] * mx[q][b][sym] * bpri[q][b];
183 FNorm(pxaq[q], Alphabet_size);
186 /* Sum P(x | a, q) emission vectors over matrices q:
187 * P(x | a, col) = \sum_q P(x | a, q, col) P(q | a, col)
188 * = \sum_q P(x | a, q) P(q | col)
190 for (x = 0; x < Alphabet_size; x++)
193 for (q = 0; q < nmx; q++)
194 hmm->mat[k][x] += probq[j][q] * pxaq[q][x];
196 hmm->ins[k][x] = null[x];
199 /* Reference annotation on columns: most probable matrix
202 for (q = 0; q < nmx; q++)
203 if (probq[j][q] > max) { qmax = q; max = probq[j][q]; }
204 hmm->rf[k] = 'a'+(char)qmax; /* q > 9, so convert to char a-z*/
206 /* Consensus annotation on columns: original sequence.
208 hmm->cs[k] = aseqs[idx][j];
213 /* State transitions are set subjectively
216 for (k = 1; k < hmm->M; k++)
218 hmm->t[k][TMM] = 0.97;
219 hmm->t[k][TMI] = 0.02;
220 hmm->t[k][TMD] = 0.01;
221 hmm->t[k][TIM] = 0.20;
222 hmm->t[k][TII] = 0.80;
223 hmm->t[k][TDM] = 0.90;
224 hmm->t[k][TDD] = 0.10;
227 hmm->flags |= PLAN7_HASPROB | PLAN7_RF | PLAN7_CS;
237 /* Function: ReadGJMMatrices()
239 * Purpose: Read GJM's file format for star-based mixture matrices.
240 * Very first line is nq.
241 * First line of a set is P(q), the prior of the matrix.
242 * Second line contains P(b|q), the prior of the root symbols,
243 * _in arbitrary order_ (the root distribution is not over AA's!)
244 * Third line is blank.
245 * Next 20 lines give a 20x20 matrix of conditional probabilities;
246 * rows = root symbols b; cols = leaf symbols x;
247 * mx[row][col] = P(x | b).
249 * Instead of storing as matrices, store as q x r vectors.
252 * mx, pq, nq are returned via passed pointers.
253 * Caller must free FMX2Free(mx)
254 * Caller must free(pq).
257 ReadGJMMatrices(FILE *fp, float ***ret_mx, float **ret_pq, int *ret_nq)
259 float **mx; /* conditional p's [0..nq-1][0..19] */
260 float *pq; /* priors on vectors, [0..nq-1] */
261 int nq, nr; /* number of matrices, rows */
263 float tmppq; /* prior for matrix */
264 int q,r; /* counter for matrices, rows */
265 int x; /* counter for symbols */
266 char *s; /* tmp pointer into buf */
270 if (fgets(buf, 2048, fp) == NULL) Die("read failed");
273 mx = FMX2Alloc(nq*nr, 20);
274 pq = (float *) MallocOrDie (nq*nr * sizeof(float));
277 for (q = 0; q < nq; q++)
279 if (fgets(buf, 2048, fp) == NULL) Die("parse failed");
282 if (fgets(buf, 2048, fp) == NULL) Die("parse failed");
283 s = strtok(buf, "\n\t ");
284 for (r = 0; r < nr; r++)
286 pq[q*nr + r] = atof(s) * tmppq;
287 s = strtok(NULL, "\n\t ");
289 if (fgets(buf, 2048, fp) == NULL) Die("parse failed");
291 for (r = 0; r < 20; r++)
293 if (fgets(buf, 2048, fp) == NULL) Die("parse failed");
294 s = strtok(buf, "\n\t ");
295 for (x = 0; x < 20; x++)
297 mx[q*nr+r][x] = atof(s);
298 s = strtok(NULL, "\n\t ");
301 /* two blank lines */
302 if (fgets(buf, 2048, fp) == NULL) Die("parse failed");
303 if (fgets(buf, 2048, fp) == NULL) Die("parse failed");
314 /* Function: OldReadGJMMatrices()
316 * Purpose: Read GJM's file format for joint probability matrix sets.
319 * mx, qprior, nmx are returned via passed pointers.
320 * Caller must free mx: each matrix by FMX2Free(), then free(mx).
321 * Caller must also free(qprior).
324 OldReadGJMMatrices(FILE *fp, float ****ret_mx, float **ret_qprior, int *ret_nmx)
326 float ***mx; /* joint prob matrix [0..nmx-1][0..19][0..19] */
327 float *qprior; /* priors on matrices, [0..nmx-1] */
328 int nmx; /* number of matrices */
330 int q; /* counter for matrices */
331 int idx; /* index for this matrix seen in file */
332 int r,c; /* counter for row, column */
333 char *s; /* tmp pointer into buf */
335 /* pass one: count matrices */
337 while (fgets(buf, 2048, fp) != NULL)
338 if (Strparse("use [0-9]+ = .+", buf, 0) == 0)
342 qprior = (float *) MallocOrDie (20 * sizeof(float));
343 mx = (float ***) MallocOrDie (nmx * sizeof(float **));
344 for (q = 0; q < nmx; q++)
345 mx[q] = FMX2Alloc(20, 20);
347 /* pass two: parse matrices */
349 while (fgets(buf, 2048, fp) != NULL)
351 if (Strparse("use ([0-9]+) = (.+)", buf, 2) != 0)
353 idx = atoi(sqd_parse[1]);
354 qprior[q] = atof(sqd_parse[2]);
356 /* skip two lines in his new format */
357 if (fgets(buf, 2048, fp) == NULL) Die("ReadGJMMatrices(): parse failed");
358 if (fgets(buf, 2048, fp) == NULL) Die("ReadGJMMatrices(): parse failed");
360 for (r = 0; r < 20; r++)
362 if (fgets(buf, 2048, fp) == NULL)
363 Die("ReadGJMMatrices(): parse failed");
364 s = strtok(buf, "\n\t ");
365 for (c = 0; c < 20; c++)
367 mx[q][r][c] = atof(s);
368 s = strtok(NULL, "\n\t ");
375 *ret_qprior = qprior;
380 /* Function: OldPrintGJMMatrix()
382 * Purpose: (debugging, basically): print out Graeme's
383 * joint probability matrices in log odds integer form.
387 OldPrintGJMMatrix(FILE *fp, float **jmx, float *rnd, int N)
392 for (c = 0; c < N; c++)
393 fprintf(fp, " %c ", Alphabet[c]);
396 for (r = 0; r < N; r++)
398 fprintf(fp, "%c ", Alphabet[r]);
399 for (c = 0; c < N; c++)
401 (int) (10. * sreLOG2(jmx[r][c] / (rnd[r] * rnd[c]))));
405 #endif /* SRE_REMOVED*/
407 /* Function: Joint2SubstitutionMatrix()
409 * Purpose: Convert a joint probability matrix to a substitution
412 * Convention here for substitution matrices is
413 * smx[r][c] = r->c = P(c|r).
415 * We obtain the substitution matrix from the following logic:
416 * P(rc) = P(c|r) P(r);
417 * P(r) = \sum_c P(rc);
418 * thus P(c|r) = P(rc) / \sum_c P(rc)
420 * Args: jmx - NxN P(rc) joint probability matrix
421 * smx - NxN P(c|r) substitution matrix, alloced in caller
422 * N - size of matrices; typically Alphabet_size
428 Joint2SubstitutionMatrix(float **jmx, float **smx, int N)
430 float pr; /* P(r) = \sum_c P(rc) */
431 int r,c; /* counters for rows, columns */
433 for (r = 0; r < N; r++)
435 for (pr = 0., c = 0; c < N; c++)
437 for (c = 0; c < N; c++)
438 smx[r][c] = jmx[r][c] / pr;
444 /* Function: BlosumWeights()
446 * Purpose: Assign weights to a set of aligned sequences
447 * using the BLOSUM rule:
448 * - do single linkage clustering at some pairwise identity
449 * - in each cluster, give each sequence 1/clustsize
452 * Args: aseqs - alignment
453 * N - number of seqs in alignment
454 * maxid - fractional identity (e.g. 0.62 for BLOSUM62)
455 * clust - [0..nseq-1] vector of cluster assignments, filled here (or NULL)
456 * ret_nc - total number of clusters found (or pass NULL)
459 BlosumWeights(char **aseqs, AINFO *ainfo, float maxid, int *clust,int *ret_nc)
461 float **dmx; /* difference matrix */
462 struct phylo_s *tree; /* UPGMA tree */
463 float mindiff; /* minimum distance between clusters */
464 int c; /* counter for clusters */
465 struct intstack_s *stack;
469 mindiff = 1.0 - maxid;
470 /* first we do a difference matrix */
471 MakeDiffMx(aseqs, ainfo->nseq, &dmx);
472 /* then we build a tree */
473 Cluster(dmx, ainfo->nseq, CLUSTER_MIN, &tree);
475 /* Find clusters below mindiff.
478 * -if the parent is > mindiff and current < mindiff, then
479 * make current node a cluster.
481 for (i = 0; i < ainfo->nseq; i++)
483 ainfo->sqinfo[i].weight = 1.0;
484 ainfo->sqinfo[i].flags |= SQINFO_WGT;
487 stack = InitIntStack();
488 PushIntStack(stack, 0); /* push root on stack to start */
490 while (PopIntStack(stack, &node))
492 if ((node == 0 || tree[tree[node].parent-ainfo->nseq].diff > mindiff) &&
493 tree[node].diff < mindiff)
494 { /* we're at a cluster */
495 for (i = 0; i < ainfo->nseq; i++)
496 if (tree[node].is_in[i])
498 ainfo->sqinfo[i].weight = 1.0 / (float) tree[node].incnum;
499 if (clust != NULL) clust[i] = c;
503 else /* we're not a cluster, keep traversing */
505 if (tree[node].right >= ainfo->nseq)
506 PushIntStack(stack, tree[node].right - ainfo->nseq);
510 if (clust != NULL) clust[tree[node].right] = c; /* single seq, wgt 1.0 */
513 if (tree[node].left >= ainfo->nseq)
514 PushIntStack(stack, tree[node].left - ainfo->nseq);
518 if (clust != NULL) clust[tree[node].left] = c;
523 FreePhylo(tree, ainfo->nseq);
525 if (ret_nc != NULL) *ret_nc = c;
532 /* Function: calc_probq()
534 * Purpose: Calculate the posterior probability distribution
535 * P(q | a_j) for every column j in the alignment
536 * and every matrix choice q.
538 * Probabilistic, based on a star topology.
539 * Uses a BLOSUM-like rule to cluster the sequences in
540 * the alignment into groups with some seq identity (62%).
541 * Finds the consensus (majority rule) residue in
542 * each cluster as the representative.
543 * Then P(q | col) comes by Bayes:
544 * = (P(col | q) P(q) / Z
545 * where the likelihood
546 * P(col | q) = \sum_b [\prod_i P(a_i | q,b)] P(b | q)
547 * log P(col | q) = \logsum_b P(b|q) + \sum_i \log(P(a_i | q,b))
549 * Args: aseqs - alignment
550 * ainfo - optional info for alignment
551 * mx - conditional probability matrices [0..nmx-1][root b][x]
552 * bprior- root priors [0..nmx-1][root b]
553 * qprior- prior prob distribution over matrices
554 * nmx - number of matrices
555 * probq - RETURN: posterior probabilities, [0..alen-1][0..nmx-1]
556 * alloc'ed in called, filled in here.
559 * probq is filled in.
562 calc_probq(char **aseqs, AINFO *ainfo, float ***mx, float **bprior,
563 float *qprior, int nmx, float **probq)
565 int q; /* counter over matrices */
566 int a1; /* counter over sequences */
567 int j; /* counter over columns */
568 int *clust; /* assignment of seqs to clusters 0..nseq-1 */
569 int nclust; /* number of clusters */
570 float *wgt; /* weights on seqs, 0..nseq-1 */
571 int *sym; /* symbol indices in a column */
572 float obs[MAXABET]; /* number of symbols observed in a column */
576 float bterm[20]; /* intermediate in calculation, over root b's */
577 int b; /* counter over root symbols */
579 /* Use the BLOSUM rule to calculate weights and clusters
580 * for sequences in the alignment
582 wgt = (float *) MallocOrDie (sizeof(float) * ainfo->nseq);
583 clust = (int *) MallocOrDie (sizeof(int) * ainfo->nseq);
584 BlosumWeights(aseqs, ainfo, 0.62, clust, wgt, &nclust);
586 /* Use the BLOSUM rule to calculate a "likelihood" function
587 * P(column | q) for each column.
589 sym = (int *) MallocOrDie (sizeof(int) * nclust);
590 for (j = 0; j < ainfo->alen; j++)
592 /* Find majority rule symbols in this col */
593 for (i = 0; i < nclust; i++)
595 FSet(obs, Alphabet_size, 0.);
597 for (a1 = 0; a1 < ainfo->nseq; a1++)
599 if (isgap(aseqs[a1][j])) ngap += 0.;
600 else P7CountSymbol(obs, SymbolIndex(aseqs[a1][j]), 1.0);
603 for (x = 0; x < Alphabet_size; x++)
604 if (obs[x] > maxc) { maxc = obs[x]; sym[i] = x; }
605 /* either if no symbols observed, or more gaps than syms: */
606 if (ngap >= maxc) sym[i] = -1;
608 /* Calculate log likelihood + log prior */
609 for (q = 0; q < nmx; q++)
611 for (b = 0; b < 20; b++)
613 bterm[b] = bprior[q][b];
614 for (i = 0; i < nclust; i++)
616 bterm[b] += log(mx[q][b][sym[i]]);
618 probq[j][q] = log(qprior[q]) + FLogSum(bterm, 20);
620 LogNorm(probq[j], nmx); /* normalize -> gives posterior. */
628 /* Function: old_calc_probq() OBSOLETE VERSION
630 * Purpose: Calculate the posterior probability distribution
631 * P(q | a_j) for every column j in the alignment
632 * and every matrix choice q.
634 * Non-probabilistic. Uses a BLOSUM-like rule to
635 * find the single best matrix for a column, then
636 * assigns it a posterior of 1.0.
638 * This was version 1: a competitive learning rule,
639 * posterior either 1.0 or 0.0.
641 * Args: aseqs - alignment
642 * ainfo - optional info for alignment
643 * jmx - *joint* probability matrices [0..nmx-1][0..19][0..19]
644 * qprior- prior prob distribution over matrices [UNUSED]
645 * nmx - number of matrices
646 * probq - RETURN: posterior probabilities, [0..alen-1][0..nmx-1]
647 * alloc'ed in called, filled in here.
650 * probq is filled in.
653 old_calc_probq(char **aseqs, AINFO *ainfo, float ***jmx, float *qprior,
654 int nmx, float **probq)
656 int q; /* counter over matrices */
657 int a1, a2; /* counters over sequences */
658 int j; /* counter over columns */
659 float x; /* BLOSUM-style objective function */
660 float maxx; /* maximum x so far */
661 int maxq; /* maximum q so far */
662 int *clust; /* assignment of seqs to clusters 0..nseq-1 */
663 int nclust; /* number of clusters */
664 float *wgt; /* weights on seqs, 0..nseq-1 */
665 int *sym; /* symbol indices in a column */
668 /* Use the BLOSUM rule to calculate weights and clusters
669 * for sequences in the alignment
671 wgt = (float *) MallocOrDie (sizeof(float) * ainfo->nseq);
672 clust = (int *) MallocOrDie (sizeof(int) * ainfo->nseq);
673 BlosumWeights(aseqs, ainfo, 0.62, clust, wgt, &nclust);
675 /* Use the BLOSUM rule to calculate a "likelihood" function
676 * P(column | q) for each column.
678 sym = (int *) MallocOrDie (sizeof(int) * ainfo->nseq);
679 for (j = 0; j < ainfo->alen; j++)
681 for (a1 = 0; a1 < ainfo->nseq; a1++)
682 if (!isgap(aseqs[a1][j]) &&
683 strchr(Alphabet, aseqs[a1][j]) != NULL)
685 sym[a1] = SYMIDX(aseqs[a1][j]);
686 if (sym[a1] >= Alphabet_size) sym[a1] = -1; /* no degenerates */
691 for (q = 0; q < nmx; q++)
694 for (a1 = 0; a1 < ainfo->nseq; a1++)
695 for (a2 = 0; a2 < ainfo->nseq; a2++)
696 if (sym[a1] >= 0 && sym[a2] >= 0 && clust[a1] != clust[a2])
697 x += wgt[a1] * wgt[a2] * log(jmx[q][sym[a1]][sym[a2]]);
700 printf("%% col %3d mx %c x = %f\n",
701 j+1, 'a'+(char)q, x);
710 FSet(probq[j], nmx, 0.0);
711 probq[j][maxq] = 1.0; /* winner-take-all rule */
720 /* Function: print_probq()
722 * Purpose: Debugging output.
723 * probq is the posterior probability P(q | column) of
724 * a matrix q given an observed alignment column.
725 * Indexed probq[0..alen-1][0..nmx-1].
728 print_probq(FILE *fp, float **probq, int alen, int nmx)
730 int c; /* counter for columns */
731 int q; /* counter for matrices */
733 fputs("### probq debugging output\n", fp);
735 for (q = 0; q < nmx; q++)
736 fprintf(fp, " %c ", 'a'+(char)q);
739 for (c = 0; c < alen; c++)
741 fprintf(fp, "%4d ", c);
742 for (q = 0; q < nmx; q++)
743 fprintf(fp, "%5.3f ", probq[c][q]);