forester/archive/RIO/others/hmmer/src/camJul97.c

   1 /* Source code from Cambridge visit July 1997
   2  *
   3  * Position-specific matrices.
   4  */
   5
   6 #include <stdlib.h>
   7 #include <stdio.h>
   8 #include <string.h>
   9 #include <limits.h>
  10 #include <float.h>
  11 #include <math.h>
  12
  13 #include "funcs.h"
  14 #include "config.h"
  15 #include "structs.h"
  16 #include "squid.h"
  17
  18 #ifdef MEMDEBUG
  19 #include "dbmalloc.h"
  20 #endif
  21
  22 /* Function: MakeStarHMM()
  23  *
  24  * Purpose:  Given an HMM with counts, create an HMM according
  25  *           to the star rule. In star models we typically expect
  26  *           that the counts have been collected using BLOSUM style
  27  *           weights.
  28  *
  29  * Args:     hmm - HMM structure containing counts data
  30  *           mx  - Star vectors, mx[q][x]
  31  *           pq  - vector prior P(q)
  32  *           nq  - number of vectors
  33  *           pri - Dirichlet priors for other parameters
  34  *
  35  * Return:   (void)
  36  *           hmm is converted to probabilities.
  37  */
  38 void
  39 MakeStarHMM(struct plan7_s *hmm, float **mx, float *pq, int nq, struct p7prior_s *pri)
  40 {
  41   int    k;                     /* counter over model position       */
  42   int    x;                     /* counter over symbol/transition    */
  43   float *pxa;                   /* P(x | a) : our parameter estimate */
  44   float *pqa;                   /* P(q | a) for all q                */
  45   int    q;                     /* counters over vectors q           */
  46   int    ai;                    /* counter over symbols              */
  47
  48   /* Match emissions: Star rule implementation.
  49    */
  50   pxa = (float *) MallocOrDie(sizeof(float) * Alphabet_size);
  51   pqa = (float *) MallocOrDie(sizeof(float) * nq);
  52   for (k = 1; k <= hmm->M; k++)
  53     {
  54                         /* calculate log P(q | a) unnormalized (i.e. + log P(a))*/
  55       for (q = 0; q < nq; q++)
  56         {
  57           pqa[q] = log(pq[q]);
  58           for (ai = 0; ai < Alphabet_size; ai++)
  59             pqa[q] += hmm->mat[k][ai] * log(mx[q][ai]);
  60         }
  61                         /* calculate log P(x | a) unnormalized (i.e + log P(a))*/
  62       for (x = 0; x < Alphabet_size; x++)
  63         {
  64           pxa[x] = pqa[0] + log(mx[0][x]);
  65           for (q = 1; q < nq; q++)
  66             pxa[x] = LogSum(pxa[x], (pqa[q] + log(mx[q][x])));
  67         }
  68                         /* normalize now to get P(x|a) and store */
  69       LogNorm(pxa, Alphabet_size);
  70       FCopy(hmm->mat[k], pxa, Alphabet_size);
  71     }
  72
  73
  74   /* Everything else is done according to P7PriorifyHMM()
  75    */
  76   /* Model-dependent transitions are handled simply; Laplace.
  77    */
  78   FSet(hmm->begin+2, hmm->M-1, 0.);     /* wipe internal BM entries */
  79   FSet(hmm->end+1, hmm->M-1, 0.);       /* wipe internal ME exits   */
  80   hmm->tbd1        += 1.0;
  81   hmm->begin[1]    += 1.0;
  82
  83   /* Main model transitions and insert emissions
  84    */
  85   for (k = 1; k < hmm->M; k++)
  86     {
  87       P7PriorifyTransitionVector(hmm->t[k], pri);
  88       P7PriorifyEmissionVector(hmm->ins[k], pri, pri->inum, pri->iq, pri->i, NULL);
  89     }
  90
  91   Plan7Renormalize(hmm);
  92   free(pxa);
  93   free(pqa);
  94   return;
  95 }
  96
  97
  98
  99
 100 #ifdef SRE_REMOVED
 101 /* Function: MakeIslandHMM()
 102  *
 103  * Purpose:  Given a sequence alignment of i = 1..nseq sequences,
 104  *           with columns j = 1..alen; and a sequence index idx
 105  *           to build the island from. Return a Plan7 island HMM in
 106  *           probability form.
 107  *
 108  * Args:     aseqs  - alignment
 109  *           ainfo  - alignment info
 110  *           idx    - index of which sequence to build island from
 111  *           null   - random sequence model [0..Alphabet_size-1]
 112  *           mx     - probability matrices mx[q][root b][x]
 113  *           bpri   - priors on root distributions bpri[q][root b]
 114  *           qpri   - prior probability distribution over matrices
 115  *           nmx    - number of joint probability matrices
 116  *
 117  * Return:   a new Plan7 HMM
 118  */
 119 struct plan7_s *
 120 MakeIslandHMM(char **aseqs, AINFO *ainfo, int idx,
 121               float null[MAXABET], float ***mx, float **bpri,
 122               float *qpri, int nmx)
 123 {
 124   struct plan7_s *hmm;          /* RETURN: Plan7 HMM                   */
 125   int             j;            /* column position index               */
 126   int             k;            /* model position index                */
 127   int             q;            /* counter for matrices                */
 128   int             x;            /* counter for symbols                 */
 129   float          *mat;          /* a match emission probability vector */
 130   float         **probq;        /* posterior P(q | column)             */
 131   int             sym;          /* index of a symbol in alphabet       */
 132   float           max;
 133   int             qmax;
 134   float         **pxaq;         /* P(x | a,q) vectors, [q][x]     */
 135   int             b;            /* counter over root symbols */
 136
 137   /* Allocate a model which is the length of the
 138    * raw sequence.
 139    */
 140   hmm  = AllocPlan7(DealignedLength(aseqs[idx]));
 141   if (ainfo->sqinfo[idx].flags & SQINFO_NAME)
 142     Plan7SetName(hmm, ainfo->sqinfo[idx].name);
 143   if (ainfo->sqinfo[idx].flags & SQINFO_DESC)
 144     Plan7SetDescription(hmm, ainfo->sqinfo[idx].desc);
 145   Plan7SetNullModel(hmm, null, 350./351.); /* p1 made up; shouldn't matter*/
 146
 147   mat = (float *) MallocOrDie( sizeof(float) * Alphabet_size);
 148   pxaq = FMX2Alloc(nmx, Alphabet_size);
 149
 150   /* Calculate the posterior probability distribution
 151    * probq (= P(q | col)) over nmx different matrices
 152    * at each column j -- probq[0..alen-1][0..nmx-1];
 153    * currently does not use the prior on q, but does a
 154    * winner-take-all rule.
 155    */
 156   probq = FMX2Alloc(ainfo->alen, nmx);
 157   calc_probq(aseqs, ainfo, mx, bpri, qpri, nmx, probq);
 158
 159   /* Debugging
 160    */
 161   print_probq(stdout, probq, ainfo->alen, nmx);
 162
 163   for (k = 1, j = 0; j < ainfo->alen; j++)
 164     {
 165       if (isgap(aseqs[idx][j])) continue;
 166
 167       if (strchr(Alphabet, aseqs[idx][j]) != NULL)
 168         sym = SYMIDX(aseqs[idx][j]);
 169       else
 170         Die("MakeIslandHMM() can't handle ambiguous query symbols yet");
 171
 172
 173       /* Calculate P(x | a, q) emission vectors for all matrices q
 174        */
 175       for (q = 0; q < nmx; q++)
 176         {
 177           for (x = 0; x < Alphabet_size; x++)
 178             {
 179               pxaq[q][x] = 0.0;
 180               for (b = 0; b < 20; b++)
 181                 pxaq[q][x] += mx[q][b][x] * mx[q][b][sym] * bpri[q][b];
 182             }
 183           FNorm(pxaq[q], Alphabet_size);
 184         }
 185
 186       /* Sum P(x | a, q) emission vectors over matrices q:
 187        *  P(x | a, col) = \sum_q P(x | a, q, col) P(q | a, col)
 188        *                = \sum_q P(x | a, q) P(q | col)
 189        */
 190       for (x = 0; x < Alphabet_size; x++)
 191         {
 192           hmm->mat[k][x] = 0.;
 193           for (q = 0; q < nmx; q++)
 194             hmm->mat[k][x] += probq[j][q] * pxaq[q][x];
 195           if (k < hmm->M)
 196             hmm->ins[k][x] = null[x];
 197         }
 198
 199       /* Reference annotation on columns: most probable matrix
 200        */
 201       max = -FLT_MAX;
 202       for (q = 0; q < nmx; q++)
 203         if (probq[j][q] > max) { qmax = q; max = probq[j][q]; }
 204       hmm->rf[k] = 'a'+(char)qmax;   /* q > 9, so convert to char a-z*/
 205
 206       /* Consensus annotation on columns: original sequence.
 207        */
 208       hmm->cs[k] = aseqs[idx][j];
 209
 210       k++;
 211     }
 212
 213   /* State transitions are set subjectively
 214    */
 215   hmm->tbd1 = 0.02;
 216   for (k = 1; k < hmm->M; k++)
 217     {
 218       hmm->t[k][TMM] = 0.97;
 219       hmm->t[k][TMI] = 0.02;
 220       hmm->t[k][TMD] = 0.01;
 221       hmm->t[k][TIM] = 0.20;
 222       hmm->t[k][TII] = 0.80;
 223       hmm->t[k][TDM] = 0.90;
 224       hmm->t[k][TDD] = 0.10;
 225     }
 226
 227   hmm->flags |= PLAN7_HASPROB | PLAN7_RF | PLAN7_CS;
 228
 229   FMX2Free(pxaq);
 230   FMX2Free(probq);
 231   free(mat);
 232   return hmm;
 233 }
 234 #endif
 235
 236
 237 /* Function: ReadGJMMatrices()
 238  *
 239  * Purpose:  Read GJM's file format for star-based mixture matrices.
 240  *           Very first line is nq.
 241  *           First line of a set is P(q), the prior of the matrix.
 242  *           Second line contains P(b|q), the prior of the root symbols,
 243  *              _in arbitrary order_ (the root distribution is not over AA's!)
 244  *           Third line is blank.
 245  *           Next 20 lines give a 20x20 matrix of conditional probabilities;
 246  *             rows = root symbols b; cols = leaf symbols x;
 247  *             mx[row][col] = P(x | b).
 248  *
 249  *           Instead of storing as matrices, store as q x r vectors.
 250  *
 251  * Return:   (void)
 252  *           mx, pq, nq are returned via passed pointers.
 253  *           Caller must free FMX2Free(mx)
 254  *           Caller must free(pq).
 255  */
 256 void
 257 ReadGJMMatrices(FILE *fp, float ***ret_mx, float **ret_pq, int *ret_nq)
 258 {
 259   float  **mx;                  /* conditional p's [0..nq-1][0..19] */
 260   float   *pq;                  /* priors on vectors, [0..nq-1] */
 261   int      nq, nr;              /* number of matrices, rows */
 262   char buf[2048];
 263   float tmppq;                  /* prior for matrix */
 264   int  q,r;                     /* counter for matrices, rows */
 265   int  x;                       /* counter for symbols */
 266   char *s;                      /* tmp pointer into buf    */
 267
 268
 269                                 /* allocations */
 270   if (fgets(buf, 2048, fp) == NULL) Die("read failed");
 271   nr  = 20;
 272   nq  = atoi(buf);
 273   mx  = FMX2Alloc(nq*nr, 20);
 274   pq  = (float *) MallocOrDie (nq*nr * sizeof(float));
 275
 276                                 /* parse matrices */
 277   for (q = 0; q < nq; q++)
 278     {
 279       if (fgets(buf, 2048, fp) == NULL) Die("parse failed");
 280       tmppq = atof(buf);
 281
 282       if (fgets(buf, 2048, fp) == NULL) Die("parse failed");
 283       s = strtok(buf, "\n\t ");
 284       for (r = 0; r < nr; r++)
 285         {
 286           pq[q*nr + r] = atof(s) * tmppq;
 287           s = strtok(NULL, "\n\t ");
 288         }
 289       if (fgets(buf, 2048, fp) == NULL) Die("parse failed");
 290
 291       for (r = 0; r < 20; r++)
 292         {
 293           if (fgets(buf, 2048, fp) == NULL) Die("parse failed");
 294           s = strtok(buf, "\n\t ");
 295           for (x = 0; x < 20; x++)
 296             {
 297               mx[q*nr+r][x] = atof(s);
 298               s = strtok(NULL, "\n\t ");
 299             }
 300         }
 301                                 /* two blank lines */
 302       if (fgets(buf, 2048, fp) == NULL) Die("parse failed");
 303       if (fgets(buf, 2048, fp) == NULL) Die("parse failed");
 304     }
 305
 306   *ret_mx  = mx;
 307   *ret_pq  = pq;
 308   *ret_nq  = nq*nr;
 309   return;
 310 }
 311
 312
 313 #ifdef SRE_REMOVED
 314 /* Function: OldReadGJMMatrices()
 315  *
 316  * Purpose:  Read GJM's file format for joint probability matrix sets.
 317  *
 318  * Return:   (void)
 319  *           mx, qprior, nmx are returned via passed pointers.
 320  *           Caller must free mx: each matrix by FMX2Free(), then free(mx).
 321  *           Caller must also free(qprior).
 322  */
 323 void
 324 OldReadGJMMatrices(FILE *fp, float ****ret_mx, float **ret_qprior, int *ret_nmx)
 325 {
 326   float ***mx;                  /* joint prob matrix [0..nmx-1][0..19][0..19] */
 327   float   *qprior;              /* priors on matrices, [0..nmx-1] */
 328   int  nmx;                     /* number of matrices   */
 329   char buf[2048];
 330   int  q;                       /* counter for matrices */
 331   int  idx;                     /* index for this matrix seen in file */
 332   int  r,c;                     /* counter for row, column */
 333   char *s;                      /* tmp pointer into buf    */
 334
 335                                 /* pass one: count matrices */
 336   nmx = 0;
 337   while (fgets(buf, 2048, fp) != NULL)
 338     if (Strparse("use [0-9]+ = .+", buf, 0) == 0)
 339       nmx++;
 340   rewind(fp);
 341                                 /* allocations */
 342   qprior = (float *) MallocOrDie (20 * sizeof(float));
 343   mx     = (float ***) MallocOrDie (nmx * sizeof(float **));
 344   for (q = 0; q < nmx; q++)
 345     mx[q] = FMX2Alloc(20, 20);
 346
 347                                 /* pass two: parse matrices */
 348   q = 0;
 349   while (fgets(buf, 2048, fp) != NULL)
 350     {
 351       if (Strparse("use ([0-9]+) = (.+)", buf, 2) != 0)
 352         continue;
 353       idx       = atoi(sqd_parse[1]);
 354       qprior[q] = atof(sqd_parse[2]);
 355
 356                                 /* skip two lines in his new format */
 357       if (fgets(buf, 2048, fp) == NULL) Die("ReadGJMMatrices(): parse failed");
 358       if (fgets(buf, 2048, fp) == NULL) Die("ReadGJMMatrices(): parse failed");
 359
 360       for (r = 0; r < 20; r++)
 361         {
 362           if (fgets(buf, 2048, fp) == NULL)
 363             Die("ReadGJMMatrices(): parse failed");
 364           s = strtok(buf, "\n\t ");
 365           for (c = 0; c < 20; c++)
 366             {
 367               mx[q][r][c] = atof(s);
 368               s = strtok(NULL, "\n\t ");
 369             }
 370         }
 371       q++;
 372     }
 373
 374   *ret_mx     = mx;
 375   *ret_qprior = qprior;
 376   *ret_nmx    = nmx;
 377   return;
 378 }
 379
 380 /* Function: OldPrintGJMMatrix()
 381  *
 382  * Purpose:  (debugging, basically): print out Graeme's
 383  *           joint probability matrices in log odds integer form.
 384  *
 385  */
 386 void
 387 OldPrintGJMMatrix(FILE *fp, float **jmx, float *rnd, int N)
 388 {
 389   int r, c;
 390
 391   fprintf(fp, "  ");
 392   for (c = 0; c < N; c++)
 393     fprintf(fp, " %c  ", Alphabet[c]);
 394   fprintf(fp, "\n");
 395
 396   for (r = 0; r < N; r++)
 397     {
 398       fprintf(fp, "%c ", Alphabet[r]);
 399       for (c = 0; c < N; c++)
 400         fprintf(fp, "%3d ",
 401                 (int) (10. * sreLOG2(jmx[r][c] / (rnd[r] * rnd[c]))));
 402       fprintf(fp, "\n");
 403     }
 404 }
 405 #endif /* SRE_REMOVED*/
 406
 407 /* Function: Joint2SubstitutionMatrix()
 408  *
 409  * Purpose:  Convert a joint probability matrix to a substitution
 410  *           matrix.
 411  *
 412  *           Convention here for substitution matrices is
 413  *           smx[r][c] = r->c = P(c|r).
 414  *
 415  *           We obtain the substitution matrix from the following logic:
 416  *           P(rc) = P(c|r) P(r);
 417  *           P(r)  = \sum_c P(rc);
 418  *           thus P(c|r) = P(rc) / \sum_c P(rc)
 419  *
 420  * Args:     jmx - NxN P(rc) joint probability matrix
 421  *           smx - NxN P(c|r) substitution matrix, alloced in caller
 422  *           N   - size of matrices; typically Alphabet_size
 423  *
 424  * Return:   (void)
 425  *           smx is filled in.
 426  */
 427 void
 428 Joint2SubstitutionMatrix(float **jmx, float **smx, int N)
 429 {
 430   float pr;                     /* P(r) = \sum_c P(rc)        */
 431   int   r,c;                    /* counters for rows, columns */
 432
 433   for (r = 0; r < N; r++)
 434     {
 435       for (pr = 0., c = 0; c < N; c++)
 436         pr += jmx[r][c];
 437       for (c = 0; c < N; c++)
 438         smx[r][c] = jmx[r][c] / pr;
 439     }
 440 }
 441
 442
 443 #ifdef SRE_REMOVED
 444 /* Function: BlosumWeights()
 445  *
 446  * Purpose:  Assign weights to a set of aligned sequences
 447  *           using the BLOSUM rule:
 448  *             - do single linkage clustering at some pairwise identity
 449  *             - in each cluster, give each sequence 1/clustsize
 450  *               total weight.
 451  *
 452  * Args:     aseqs - alignment
 453  *           N     - number of seqs in alignment
 454  *           maxid - fractional identity (e.g. 0.62 for BLOSUM62)
 455  *           clust - [0..nseq-1] vector of cluster assignments, filled here (or NULL)
 456  *           ret_nc - total number of clusters found  (or pass NULL)
 457  */
 458 void
 459 BlosumWeights(char **aseqs, AINFO *ainfo, float maxid, int *clust,int *ret_nc)
 460 {
 461   float         **dmx;          /* difference matrix */
 462   struct phylo_s *tree;         /* UPGMA tree        */
 463   float           mindiff;      /* minimum distance between clusters */
 464   int             c;            /* counter for clusters */
 465   struct intstack_s *stack;
 466   int             node;
 467   int             i;
 468
 469   mindiff = 1.0 - maxid;
 470                                 /* first we do a difference matrix */
 471   MakeDiffMx(aseqs, ainfo->nseq, &dmx);
 472                                 /* then we build a tree */
 473   Cluster(dmx, ainfo->nseq, CLUSTER_MIN, &tree);
 474
 475   /* Find clusters below mindiff.
 476    * The rule is:
 477    *     -traverse the tree
 478    *     -if the parent is > mindiff and current < mindiff, then
 479    *      make current node a cluster.
 480    */
 481   for (i = 0; i < ainfo->nseq; i++)
 482     {
 483       ainfo->sqinfo[i].weight = 1.0;
 484       ainfo->sqinfo[i].flags |= SQINFO_WGT;
 485     }
 486
 487   stack = InitIntStack();
 488   PushIntStack(stack, 0);       /* push root on stack to start */
 489   c = 0;
 490   while (PopIntStack(stack, &node))
 491     {
 492       if ((node == 0 || tree[tree[node].parent-ainfo->nseq].diff > mindiff) &&
 493           tree[node].diff < mindiff)
 494         {                       /* we're at a cluster */
 495           for (i = 0; i < ainfo->nseq;  i++)
 496             if (tree[node].is_in[i])
 497               {
 498                 ainfo->sqinfo[i].weight = 1.0 / (float) tree[node].incnum;
 499                 if (clust != NULL) clust[i] = c;
 500               }
 501           c++;
 502         }
 503       else                      /* we're not a cluster, keep traversing */
 504         {
 505           if (tree[node].right >= ainfo->nseq)
 506             PushIntStack(stack, tree[node].right - ainfo->nseq);
 507           else
 508             {
 509               c++;
 510               if (clust != NULL) clust[tree[node].right] = c; /* single seq, wgt 1.0 */
 511             }
 512
 513           if (tree[node].left >= ainfo->nseq)
 514             PushIntStack(stack, tree[node].left - ainfo->nseq);
 515           else
 516             {
 517               c++;
 518               if (clust != NULL) clust[tree[node].left] = c;
 519             }
 520         }
 521     }
 522   FreeIntStack(stack);
 523   FreePhylo(tree, ainfo->nseq);
 524   FMX2Free(dmx);
 525   if (ret_nc != NULL) *ret_nc = c;
 526   return;
 527 }
 528 #endif
 529
 530
 531 #ifdef SRE_REMOVED
 532 /* Function: calc_probq()
 533  *
 534  * Purpose:  Calculate the posterior probability distribution
 535  *           P(q | a_j) for every column j in the alignment
 536  *           and every matrix choice q.
 537  *
 538  *           Probabilistic, based on a star topology.
 539  *           Uses a BLOSUM-like rule to cluster the sequences in
 540  *           the alignment into groups with some seq identity (62%).
 541  *           Finds the consensus (majority rule) residue in
 542  *           each cluster as the representative.
 543  *           Then P(q | col) comes by Bayes:
 544  *                 = (P(col | q) P(q) / Z
 545  *           where the likelihood
 546  *                 P(col | q)  = \sum_b [\prod_i P(a_i | q,b)] P(b | q)
 547  *             log P(col | q) = \logsum_b P(b|q) + \sum_i \log(P(a_i | q,b))
 548  *
 549  * Args:     aseqs - alignment
 550  *           ainfo - optional info for alignment
 551  *           mx    - conditional probability matrices [0..nmx-1][root b][x]
 552  *           bprior- root priors [0..nmx-1][root b]
 553  *           qprior- prior prob distribution over matrices
 554  *           nmx   - number of matrices
 555  *           probq - RETURN: posterior probabilities, [0..alen-1][0..nmx-1]
 556  *                   alloc'ed in called, filled in here.
 557  *
 558  * Return:   (void)
 559  *           probq is filled in.
 560  */
 561 static void
 562 calc_probq(char **aseqs, AINFO *ainfo, float ***mx, float **bprior,
 563            float *qprior, int nmx, float **probq)
 564 {
 565   int q;                        /* counter over matrices  */
 566   int a1;                       /* counter over sequences */
 567   int j;                        /* counter over columns   */
 568   int  *clust;                  /* assignment of seqs to clusters 0..nseq-1 */
 569   int   nclust;                 /* number of clusters */
 570   float *wgt;                   /* weights on seqs, 0..nseq-1 */
 571   int   *sym;                   /* symbol indices in a column */
 572   float  obs[MAXABET];          /* number of symbols observed in a column */
 573   int    i, x;
 574   float  maxc;
 575   float  ngap;
 576   float  bterm[20];             /* intermediate in calculation, over root b's */
 577   int    b;                     /* counter over root symbols */
 578
 579   /* Use the BLOSUM rule to calculate weights and clusters
 580    * for sequences in the alignment
 581    */
 582   wgt   = (float *) MallocOrDie (sizeof(float) * ainfo->nseq);
 583   clust = (int *) MallocOrDie (sizeof(int)   * ainfo->nseq);
 584   BlosumWeights(aseqs, ainfo, 0.62, clust, wgt, &nclust);
 585
 586   /* Use the BLOSUM rule to calculate a "likelihood" function
 587    * P(column | q) for each column.
 588    */
 589   sym = (int *) MallocOrDie (sizeof(int) * nclust);
 590   for (j = 0; j < ainfo->alen; j++)
 591     {
 592                                 /* Find majority rule symbols in this col  */
 593       for (i = 0; i < nclust; i++)
 594         {
 595           FSet(obs, Alphabet_size, 0.);
 596           ngap = 0.;
 597           for (a1 = 0; a1 < ainfo->nseq; a1++)
 598             if (clust[a1] == i)
 599               if   (isgap(aseqs[a1][j])) ngap += 0.;
 600               else P7CountSymbol(obs, SymbolIndex(aseqs[a1][j]), 1.0);
 601
 602           maxc = -1.;
 603           for (x = 0; x < Alphabet_size; x++)
 604             if (obs[x] > maxc) { maxc = obs[x]; sym[i] = x; }
 605                 /* either if no symbols observed, or more gaps than syms: */
 606           if (ngap >= maxc) sym[i] = -1;
 607         }
 608                                 /* Calculate log likelihood + log prior */
 609       for (q = 0; q < nmx; q++)
 610         {
 611           for (b = 0; b < 20; b++)
 612             {
 613               bterm[b] = bprior[q][b];
 614               for (i = 0; i < nclust; i++)
 615                 if (sym[i] >= 0)
 616                   bterm[b] += log(mx[q][b][sym[i]]);
 617             }
 618           probq[j][q] = log(qprior[q]) + FLogSum(bterm, 20);
 619         }
 620       LogNorm(probq[j], nmx);   /* normalize -> gives posterior. */
 621     }
 622   free(sym);
 623   free(wgt);
 624   free(clust);
 625 }
 626
 627
 628 /* Function: old_calc_probq() OBSOLETE VERSION
 629  *
 630  * Purpose:  Calculate the posterior probability distribution
 631  *           P(q | a_j) for every column j in the alignment
 632  *           and every matrix choice q.
 633  *
 634  *           Non-probabilistic. Uses a BLOSUM-like rule to
 635  *           find the single best matrix for a column, then
 636  *           assigns it a posterior of 1.0.
 637  *
 638  *           This was version 1: a competitive learning rule,
 639  *           posterior either 1.0 or 0.0.
 640  *
 641  * Args:     aseqs - alignment
 642  *           ainfo - optional info for alignment
 643  *           jmx   - *joint* probability matrices [0..nmx-1][0..19][0..19]
 644  *           qprior- prior prob distribution over matrices [UNUSED]
 645  *           nmx   - number of matrices
 646  *           probq - RETURN: posterior probabilities, [0..alen-1][0..nmx-1]
 647  *                   alloc'ed in called, filled in here.
 648  *
 649  * Return:   (void)
 650  *           probq is filled in.
 651  */
 652 static void
 653 old_calc_probq(char **aseqs, AINFO *ainfo, float ***jmx, float *qprior,
 654            int nmx, float **probq)
 655 {
 656   int q;                        /* counter over matrices  */
 657   int a1, a2;                   /* counters over sequences */
 658   int j;                        /* counter over columns   */
 659   float x;                      /* BLOSUM-style objective function */
 660   float maxx;                   /* maximum x so far */
 661   int   maxq;                   /* maximum q so far */
 662   int  *clust;                  /* assignment of seqs to clusters 0..nseq-1 */
 663   int   nclust;                 /* number of clusters */
 664   float *wgt;                   /* weights on seqs, 0..nseq-1 */
 665   int   *sym;                   /* symbol indices in a column */
 666
 667
 668   /* Use the BLOSUM rule to calculate weights and clusters
 669    * for sequences in the alignment
 670    */
 671   wgt = (float *) MallocOrDie (sizeof(float) * ainfo->nseq);
 672   clust = (int *) MallocOrDie (sizeof(int)   * ainfo->nseq);
 673   BlosumWeights(aseqs, ainfo, 0.62, clust, wgt, &nclust);
 674
 675   /* Use the BLOSUM rule to calculate a "likelihood" function
 676    * P(column | q) for each column.
 677    */
 678   sym = (int *) MallocOrDie (sizeof(int) * ainfo->nseq);
 679   for (j = 0; j < ainfo->alen; j++)
 680     {
 681       for (a1 = 0; a1 < ainfo->nseq; a1++)
 682         if (!isgap(aseqs[a1][j]) &&
 683             strchr(Alphabet, aseqs[a1][j]) != NULL)
 684           {
 685             sym[a1] = SYMIDX(aseqs[a1][j]);
 686             if (sym[a1] >= Alphabet_size) sym[a1] = -1; /* no degenerates */
 687           }
 688         else sym[a1] = -1;
 689
 690       maxx = -FLT_MAX;
 691       for (q = 0; q < nmx; q++)
 692         {
 693           x = 0.;
 694           for (a1 = 0; a1 < ainfo->nseq; a1++)
 695             for (a2 = 0; a2 < ainfo->nseq; a2++)
 696               if (sym[a1] >= 0 && sym[a2] >= 0 && clust[a1] != clust[a2])
 697                 x += wgt[a1] * wgt[a2] * log(jmx[q][sym[a1]][sym[a2]]);
 698
 699 #ifdef SRE_REMOVED
 700           printf("%% col %3d mx %c x = %f\n",
 701                  j+1, 'a'+(char)q, x);
 702 #endif
 703
 704           if (x > maxx)
 705             {
 706               maxx = x;
 707               maxq = q;
 708             }
 709         }
 710       FSet(probq[j], nmx, 0.0);
 711       probq[j][maxq] = 1.0;     /* winner-take-all rule */
 712     }
 713
 714   free(sym);
 715   free(wgt);
 716   free(clust);
 717 }
 718
 719
 720 /* Function: print_probq()
 721  *
 722  * Purpose:  Debugging output.
 723  *           probq is the posterior probability P(q | column) of
 724  *           a matrix q given an observed alignment column.
 725  *           Indexed probq[0..alen-1][0..nmx-1].
 726  */
 727 static void
 728 print_probq(FILE *fp, float **probq, int alen, int nmx)
 729 {
 730   int c;                        /* counter for columns  */
 731   int q;                        /* counter for matrices */
 732
 733   fputs("### probq debugging output\n", fp);
 734   fputs("     ", fp);
 735   for (q = 0; q < nmx; q++)
 736     fprintf(fp, "  %c   ", 'a'+(char)q);
 737   fputs("\n", fp);
 738
 739   for (c = 0; c < alen; c++)
 740     {
 741       fprintf(fp, "%4d ", c);
 742       for (q = 0; q < nmx; q++)
 743         fprintf(fp, "%5.3f ", probq[c][q]);
 744       fputs("\n", fp);
 745     }
 746 }
 747 #endif