forester/archive/RIO/others/hmmer/src/prior.c

   1 /*****************************************************************
   2  * HMMER - Biological sequence analysis with profile HMMs
   3  * Copyright (C) 1992-1999 Washington University School of Medicine
   4  * All Rights Reserved
   5  *
   6  *     This source code is distributed under the terms of the
   7  *     GNU General Public License. See the files COPYING and LICENSE
   8  *     for details.
   9  *****************************************************************/
  10
  11 /* prior.c
  12  * SRE, Mon Nov 18 15:44:08 1996
  13  *
  14  * Support for Dirichlet prior data structure, p7prior_s.
  15  */
  16
  17 #include "config.h"
  18 #include "structs.h"
  19 #include "funcs.h"
  20 #include "squid.h"
  21
  22 #ifdef MEMDEBUG
  23 #include "dbmalloc.h"
  24 #endif
  25
  26 static struct p7prior_s *default_amino_prior(void);
  27 static struct p7prior_s *default_nucleic_prior(void);
  28
  29 /* Function: P7AllocPrior(), P7FreePrior()
  30  *
  31  * Purpose:  Allocation and free'ing of a prior structure.
  32  *           Very simple, but might get more complex someday.
  33  */
  34 struct p7prior_s *
  35 P7AllocPrior(void)
  36 { return (struct p7prior_s *) MallocOrDie (sizeof(struct p7prior_s)); }
  37 void
  38 P7FreePrior(struct p7prior_s *pri)
  39 { free(pri); }
  40
  41
  42 /* Function: P7LaplacePrior()
  43  *
  44  * Purpose:  Create a Laplace plus-one prior. (single component Dirichlets).
  45  *           Global alphabet info is assumed to have been set already.
  46  *
  47  * Args:     (void)
  48  *
  49  * Return:   prior. Allocated here; call FreePrior() to free it.
  50  */
  51 struct p7prior_s *
  52 P7LaplacePrior(void)
  53 {
  54   struct p7prior_s *pri;
  55
  56   pri = P7AllocPrior();
  57   pri->strategy = PRI_DCHLET;
  58
  59   pri->tnum     = 1;
  60   pri->tq[0]    = 1.;
  61   FSet(pri->t[0], 8, 1.);
  62
  63   pri->mnum  = 1;
  64   pri->mq[0] = 1.;
  65   FSet(pri->m[0], Alphabet_size, 1.);
  66
  67   pri->inum  = 1;
  68   pri->iq[0] = 1.;
  69   FSet(pri->i[0], Alphabet_size, 1.);
  70
  71   return pri;
  72 }
  73
  74 /* Function: P7DefaultPrior()
  75  *
  76  * Purpose:  Set up a somewhat more realistic single component
  77  *           Dirichlet prior than Laplace.
  78  */
  79 struct p7prior_s *
  80 P7DefaultPrior(void)
  81 {
  82   switch (Alphabet_type) {
  83   case hmmAMINO:     return default_amino_prior();
  84   case hmmNUCLEIC:   return default_nucleic_prior();
  85   case hmmNOTSETYET: Die("Can't set prior; alphabet type not set yet");
  86   }
  87   /*NOTREACHED*/
  88   return NULL;
  89 }
  90
  91 /* Function: P7ReadPrior()
  92  *
  93  * Purpose:  Input a prior from disk file.
  94  */
  95 struct p7prior_s *
  96 P7ReadPrior(char *prifile)
  97 {
  98   FILE             *fp;
  99   struct p7prior_s *pri;
 100   char             *sptr;
 101   int               q, x;
 102
 103   if ((fp = fopen(prifile, "r")) == NULL)
 104     Die("Failed to open HMMER prior file %s\n", prifile);
 105   pri = P7AllocPrior();
 106
 107   /* First entry is the strategy:
 108    * Only standard Dirichlet prior (simple or mixture) is supported in Plan7 so far
 109    */
 110   sptr = Getword(fp, sqdARG_STRING);
 111   s2upper(sptr);
 112   if      (strcmp(sptr, "DIRICHLET") == 0) pri->strategy = PRI_DCHLET;
 113   else Die("No such prior strategy %s; failed to parse file %s", sptr, prifile);
 114
 115   /* Second entry is the alphabet type:
 116    * Amino or Nucleic
 117    */
 118   sptr = Getword(fp, sqdARG_STRING);
 119   s2upper(sptr);
 120   if (strcmp(sptr, "AMINO") == 0)
 121     {
 122       if (Alphabet_type != hmmAMINO)
 123         Die("HMM and/or sequences are DNA/RNA; can't use protein prior %s", prifile);
 124     }
 125   else if (strcmp(sptr, "NUCLEIC") == 0)
 126     {
 127       if (Alphabet_type != hmmNUCLEIC)
 128         Die("HMM and/or sequences are protein; can't use DNA/RNA prior %s", prifile);
 129     }
 130   else
 131     Die("Alphabet \"%s\" in prior file %s isn't valid.", sptr, prifile);
 132
 133   /* State transition priors:
 134    * # of mixtures.
 135    * then for each mixture:
 136    *    prior P(q)
 137    *    Dirichlet terms for Tmm, Tmi, Tmd, Tim, Tii, Tid, Tdm, Tdi, Tdd
 138    */
 139   pri->tnum = atoi(Getword(fp, sqdARG_INT));
 140   if (pri->tnum < 0)
 141     Die("%d is bad; need at least one state transition mixture component", pri->tnum);
 142   if (pri->tnum > MAXDCHLET)
 143     Die("%d is bad, too many transition components (MAXDCHLET = %d)\n", MAXDCHLET);
 144   for (q = 0; q < pri->tnum; q++)
 145     {
 146       pri->tq[q]    = (float) atof(Getword(fp, sqdARG_FLOAT));
 147       for (x = 0; x < 7; x++)
 148         pri->t[q][x] = (float) atof(Getword(fp, sqdARG_FLOAT));
 149     }
 150
 151   /* Match emission priors:
 152    * # of mixtures.
 153    * then for each mixture:
 154    *    prior P(q)
 155    *    Dirichlet terms for Alphabet_size symbols in Alphabet
 156    */
 157   pri->mnum = atoi(Getword(fp, sqdARG_INT));
 158   if (pri->mnum < 0)
 159     Die("%d is bad; need at least one match emission mixture component", pri->mnum);
 160   if (pri->mnum > MAXDCHLET)
 161     Die("%d is bad; too many match components (MAXDCHLET = %d)\n", pri->mnum, MAXDCHLET);
 162
 163   for (q = 0; q < pri->mnum; q++)
 164     {
 165       pri->mq[q] = (float) atof(Getword(fp, sqdARG_FLOAT));
 166       for (x = 0; x < Alphabet_size; x++)
 167         pri->m[q][x] = (float) atof(Getword(fp, sqdARG_FLOAT));
 168     }
 169
 170   /* Insert emission priors:
 171    * # of mixtures.
 172    * then for each mixture component:
 173    *    prior P(q)
 174    *    Dirichlet terms for Alphabet_size symbols in Alphabet
 175    */
 176   pri->inum = atoi(Getword(fp, sqdARG_INT));
 177   if (pri->inum < 0)
 178     Die("%d is bad; need at least one insert emission mixture component", pri->inum);
 179   if (pri->inum > MAXDCHLET)
 180     Die("%d is bad; too many insert components (MAXDCHLET = %d)\n", pri->inum,  MAXDCHLET);
 181   for (q = 0; q < pri->inum; q++)
 182     {
 183       pri->iq[q]  = (float) atof(Getword(fp, sqdARG_FLOAT));
 184       for (x = 0; x < Alphabet_size; x++)
 185         pri->i[q][x] = (float) atof(Getword(fp, sqdARG_FLOAT));
 186     }
 187
 188   fclose(fp);
 189   return pri;
 190 }
 191
 192
 193 /* Function: PAMPrior()
 194  *
 195  * Purpose:  Produces an ad hoc "Dirichlet mixture" prior for
 196  *           match emissions, using a PAM matrix.
 197  *
 198  *           Side effect notice: PAMPrior() replaces the match
 199  *           emission section of an existing Dirichlet prior,
 200  *           which is /expected/ to be a simple one-component
 201  *           kind of prior. The insert emissions /must/ be a
 202  *           one-component prior (because of details in how
 203  *           PriorifyEmissionVector() is done). However,
 204  *           the transitions /could/ be a mixture Dirichlet prior
 205  *           without causing problems. In other words, the
 206  *           -p and -P options of hmmb can coexist, but there
 207  *           may be conflicts. PAMPrior() checks for these,
 208  *           so there's no serious problem, except that the
 209  *           error message from PAMPrior() might be confusing to
 210  *           a user.
 211  */
 212 void
 213 PAMPrior(char *pamfile, struct p7prior_s *pri, float wt)
 214 {
 215   FILE  *fp;
 216   char  *blastpamfile;            /* BLAST looks in aa/ subdirectory of BLASTMAT */
 217   int  **pam;
 218   float  scale;
 219   int    xi, xj;
 220   int    idx1, idx2;
 221
 222   if (Alphabet_type != hmmAMINO)
 223     Die("PAM prior is only valid for protein sequences");
 224   if (pri->strategy != PRI_DCHLET)
 225     Die("PAM prior may only be applied over an existing Dirichlet prior");
 226   if (pri->inum != 1)
 227     Die("PAM prior requires that the insert emissions be a single Dirichlet");
 228   if (MAXDCHLET < 20)
 229     Die("Whoa, code is misconfigured; MAXDCHLET must be >= 20 for PAM prior");
 230
 231   blastpamfile = FileConcat("aa", pamfile);
 232
 233   if ((fp = fopen(pamfile, "r")) == NULL &&
 234       (fp = EnvFileOpen(pamfile, "BLASTMAT", NULL)) == NULL &&
 235       (fp = EnvFileOpen(blastpamfile, "BLASTMAT", NULL)) == NULL)
 236     Die("Failed to open PAM scoring matrix file %s", pamfile);
 237   if (! ParsePAMFile(fp, &pam, &scale))
 238     Die("Failed to parse PAM scoring matrix file %s", pamfile);
 239   fclose(fp);
 240   free(blastpamfile);
 241
 242   pri->strategy = PRI_PAM;
 243   pri->mnum     = 20;
 244
 245   /* Convert PAM entries back to conditional prob's P(xj | xi),
 246    * which we'll use as "pseudocounts" weighted by wt.
 247    */
 248   for (xi = 0; xi < Alphabet_size; xi++)
 249     for (xj = 0; xj < Alphabet_size; xj++)
 250       {
 251         idx1 = Alphabet[xi] - 'A';
 252         idx2 = Alphabet[xj] - 'A';
 253         pri->m[xi][xj] = aafq[xj] * exp((float) pam[idx1][idx2] * scale);
 254       }
 255
 256   /* Normalize so that rows add up to wt.
 257    * i.e. Sum(xj) mat[xi][xj] = wt for every row xi
 258    */
 259   for (xi = 0; xi < Alphabet_size; xi++)
 260     {
 261       pri->mq[xi] = 1. / Alphabet_size;
 262       FNorm(pri->m[xi], Alphabet_size);
 263       FScale(pri->m[xi], Alphabet_size, wt);
 264     }
 265
 266   Free2DArray((void **)pam,27);
 267 }
 268
 269
 270 /* Function: P7DefaultNullModel()
 271  *
 272  * Purpose:  Set up a default random sequence model, using
 273  *           global aafq[]'s for protein or 1/Alphabet_size for anything
 274  *           else. randomseq is alloc'ed in caller. Alphabet information
 275  *           must already be known.
 276  */
 277 void
 278 P7DefaultNullModel(float *null, float *ret_p1)
 279 {
 280   int x;
 281   if (Alphabet_type == hmmAMINO) {
 282     for (x = 0; x < Alphabet_size; x++)
 283       null[x] = aafq[x];
 284     *ret_p1 = 350./351.;        /* rationale: approx avg protein length. */
 285   } else {
 286     for (x = 0; x < Alphabet_size; x++)
 287       null[x] = 1.0 / (float) Alphabet_size;
 288     *ret_p1 = 1000./1001.;      /* rationale: approx inter-Alu distance. */
 289   }
 290 }
 291
 292 void
 293 P7ReadNullModel(char *rndfile, float *null, float *ret_p1)
 294 {
 295   FILE *fp;
 296   char *s;
 297   int   x;
 298   int   type = 0;
 299
 300   if ((fp = fopen(rndfile, "r")) == NULL)
 301     Die("Failed to open null model file %s\n", rndfile);
 302   if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
 303   s2upper(s);
 304   if      (strcmp(s, "NUCLEIC") == 0) type = hmmNUCLEIC;
 305   else if (strcmp(s, "AMINO")   == 0) type = hmmAMINO;
 306   else    goto FAILURE;
 307                                 /* check/set alphabet type */
 308   if (Alphabet_type == 0)
 309     SetAlphabet(type);
 310   else if (Alphabet_type != type)
 311     Die("Alphabet type conflict; null model in %s is inappropriate\n", rndfile);
 312                                 /* parse the file */
 313   for (x = 0; x < Alphabet_size; x++) {
 314     if ((s = Getword(fp, sqdARG_FLOAT)) == NULL) goto FAILURE;
 315     null[x] = atof(s);
 316   }
 317   if ((s = Getword(fp, sqdARG_FLOAT)) == NULL) goto FAILURE;
 318   *ret_p1 = atof(s);
 319
 320   fclose(fp);
 321   return;
 322
 323 FAILURE:
 324   fclose(fp);
 325   Die("%s is not in HMMER null model file format", rndfile);
 326 }
 327
 328
 329 /* Function: P7PriorifyHMM()
 330  *
 331  * Purpose:  Add pseudocounts to an HMM using Dirichlet priors,
 332  *           and renormalize the HMM.
 333  *
 334  * Args:     hmm -- the HMM to add counts to (counts form)
 335  *           pri -- the Dirichlet prior to use
 336  *
 337  * Return:   (void)
 338  *           HMM returns in probability form.
 339  */
 340 void
 341 P7PriorifyHMM(struct plan7_s *hmm, struct p7prior_s *pri)
 342 {
 343   int k;                        /* counter for model position   */
 344   float d;                      /* a denominator */
 345   float tq[MAXDCHLET];          /* prior distribution over mixtures */
 346   float mq[MAXDCHLET];          /* prior distribution over mixtures */
 347   float iq[MAXDCHLET];          /* prior distribution over mixtures */
 348
 349   /* Model-dependent transitions are handled simply; Laplace.
 350    */
 351   FSet(hmm->begin+2, hmm->M-1, 0.);     /* wipe internal BM entries */
 352   FSet(hmm->end+1, hmm->M-1, 0.);       /* wipe internal ME exits   */
 353   d = hmm->tbd1 + hmm->begin[1] + 2.;
 354   hmm->tbd1        = (hmm->tbd1 + 1.)/ d;
 355   hmm->begin[1]    = (hmm->begin[1] + 1.)/ d;
 356   hmm->end[hmm->M] = 1.0;
 357
 358   /* Main model transitions and emissions
 359    */
 360   for (k = 1; k < hmm->M; k++)
 361     {
 362       /* The following code chunk is experimental.
 363        * Collaboration with Michael Asman, Erik Sonnhammer, CGR Stockholm.
 364        * Only activated if X-PR* annotation has been used, in which
 365        * priors are overridden and a single Dirichlet component is
 366        * specified for each column (using structural annotation).
 367        * If X-PR* annotation is not used, which is usually the case,
 368        * the following code has no effect (observe how the real prior
 369        * distributions are copied into tq, mq, iq).
 370        */
 371       if (hmm->tpri != NULL && hmm->tpri[k] >= 0)
 372         {
 373           if (hmm->tpri[k] >= pri->tnum) Die("X-PRT annotation out of range");
 374           FSet(tq, pri->tnum, 0.0);
 375           tq[hmm->tpri[k]] = 1.0;
 376         }
 377       else
 378         FCopy(tq, pri->tq, pri->tnum);
 379       if (hmm->mpri != NULL && hmm->mpri[k] >= 0)
 380         {
 381           if (hmm->mpri[k] >= pri->mnum) Die("X-PRM annotation out of range");
 382           FSet(mq, pri->mnum, 0.0);
 383           mq[hmm->mpri[k]] = 1.0;
 384         }
 385       else
 386         FCopy(mq, pri->mq, pri->mnum);
 387       if (hmm->ipri != NULL && hmm->ipri[k] >= 0)
 388         {
 389           if (hmm->ipri[k] >= pri->inum) Die("X-PRI annotation out of range");
 390           FSet(iq, pri->inum, 0.0);
 391           iq[hmm->ipri[k]] = 1.0;
 392         }
 393       else
 394         FCopy(iq, pri->iq, pri->inum);
 395
 396       /* This is the main line of the code:
 397        */
 398       P7PriorifyTransitionVector(hmm->t[k], pri, tq);
 399       P7PriorifyEmissionVector(hmm->mat[k], pri, pri->mnum, mq, pri->m, NULL);
 400       P7PriorifyEmissionVector(hmm->ins[k], pri, pri->inum, iq, pri->i, NULL);
 401     }
 402
 403   /* We repeat the above steps just for the final match state, M.
 404    */
 405   if (hmm->mpri != NULL && hmm->mpri[hmm->M] >= 0)
 406     {
 407       if (hmm->mpri[hmm->M] >= pri->mnum) Die("X-PRM annotation out of range");
 408       FSet(mq, pri->mnum, 0.0);
 409       mq[hmm->mpri[hmm->M]] = 1.0;
 410     }
 411   else
 412     FCopy(mq, pri->mq, pri->mnum);
 413
 414   P7PriorifyEmissionVector(hmm->mat[hmm->M], pri, pri->mnum, mq, pri->m, NULL);
 415
 416   /* Now we're done. Convert the counts-based HMM to probabilities.
 417    */
 418   Plan7Renormalize(hmm);
 419 }
 420
 421
 422 /* Function: P7PriorifyEmissionVector()
 423  *
 424  * Purpose:  Add prior pseudocounts to an observed
 425  *           emission count vector and renormalize.
 426  *
 427  *           Can return the posterior mixture probabilities
 428  *           P(q | counts) if ret_mix[MAXDCHLET] is passed.
 429  *           Else, pass NULL.
 430  *
 431  * Args:     vec     - the 4 or 20-long vector of counts to modify
 432  *           pri     - prior data structure
 433  *           num     - pri->mnum or pri->inum; # of mixtures
 434  *           eq      - pri->mq or pri->iq; prior mixture probabilities
 435  *           e       - pri->i or pri->m; Dirichlet components
 436  *           ret_mix - filled with posterior mixture probabilities, or NULL
 437  *
 438  * Return:   (void)
 439  *           The counts in vec are changed and normalized to probabilities.
 440  */
 441 void
 442 P7PriorifyEmissionVector(float *vec, struct p7prior_s *pri,
 443                        int num, float eq[MAXDCHLET], float e[MAXDCHLET][MAXABET],
 444                        float *ret_mix)
 445 {
 446   int   x;                      /* counter over vec                     */
 447   int   q;                      /* counter over mixtures                */
 448   float mix[MAXDCHLET];         /* posterior distribution over mixtures */
 449   float totc;                   /* total counts                         */
 450   float tota;                   /* total alpha terms                    */
 451   float xi;                     /* X_i term, Sjolander eq. 41           */
 452
 453   /* Calculate mix[], which is the posterior probability
 454    * P(q | n) of mixture component q given the count vector n
 455    *
 456    * (side effect note: note that an insert vector in a PAM prior
 457    * is passed with num = 1, bypassing pam prior code; this means
 458    * that inserts cannot be mixture Dirichlets...)
 459    * [SRE, 12/24/00: the above comment is cryptic! what the hell does that
 460    *  mean, inserts can't be mixtures? doesn't seem to be true. it
 461    *  may mean that in a PAM prior, you can't have a mixture for inserts,
 462    *  but I don't even understand that. The insert vectors aren't passed
 463    *  with num=1!!]
 464    */
 465   mix[0] = 1.0;
 466   if (pri->strategy == PRI_DCHLET && num > 1)
 467     {
 468       for (q = 0; q < num; q++)
 469         {
 470           mix[q] =  eq[q] > 0.0 ? log(eq[q]) : -999.;
 471           mix[q] += Logp_cvec(vec, Alphabet_size, e[q]);
 472         }
 473       LogNorm(mix, num);      /* now mix[q] is P(component_q | n) */
 474     }
 475   else if (pri->strategy == PRI_PAM && num > 1)
 476     {           /* pam prior uses aa frequencies as `P(q|n)' */
 477       for (q = 0; q < Alphabet_size; q++)
 478         mix[q] = vec[q];
 479       FNorm(mix, Alphabet_size);
 480     }
 481
 482   /* Convert the counts to probabilities, following Sjolander (1996)
 483    */
 484   totc = FSum(vec, Alphabet_size);
 485   for (x = 0; x < Alphabet_size; x++) {
 486     xi = 0.0;
 487     for (q = 0; q < num; q++) {
 488       tota = FSum(e[q], Alphabet_size);
 489       xi += mix[q] * (vec[x] + e[q][x]) / (totc + tota);
 490     }
 491     vec[x] = xi;
 492   }
 493   FNorm(vec, Alphabet_size);
 494
 495   if (ret_mix != NULL)
 496     for (q = 0; q < num; q++)
 497       ret_mix[q] = mix[q];
 498 }
 499
 500
 501
 502 /* Function: P7PriorifyTransitionVector()
 503  *
 504  * Purpose:  Add prior pseudocounts to transition vector,
 505  *           which contains three different probability vectors
 506  *           for m, d, and i.
 507  *
 508  * Args:     t     - state transitions, counts: 3 for M, 2 for I, 2 for D.
 509  *           prior - Dirichlet prior information
 510  *           tq    - prior distribution over Dirichlet components.
 511  *                   (overrides prior->iq[]; used for alternative
 512  *                   methods of conditioning prior on structural data)
 513  *
 514  * Return:   (void)
 515  *           t is changed, and renormalized -- comes back as
 516  *           probability vectors.
 517  */
 518 void
 519 P7PriorifyTransitionVector(float *t, struct p7prior_s *prior,
 520                            float tq[MAXDCHLET])
 521 {
 522   int   ts;
 523   int   q;
 524   float mix[MAXDCHLET];
 525   float totm, totd, toti;       /* total counts in three transition vecs */
 526   float xi;                     /* Sjolander's X_i term */
 527
 528   mix[0] = 1.0;                 /* default is simple one component */
 529   if ((prior->strategy == PRI_DCHLET || prior->strategy == PRI_PAM) && prior->mnum > 1)
 530     {
 531       for (q = 0; q < prior->tnum; q++)
 532         {
 533           mix[q] =  tq[q] > 0.0 ? log(tq[q]) : -999.;
 534           mix[q] += Logp_cvec(t,   3, prior->t[q]);   /* 3 match  */
 535           mix[q] += Logp_cvec(t+3, 2, prior->t[q]+3); /* 2 insert */
 536           mix[q] += Logp_cvec(t+5, 2, prior->t[q]+5); /* 2 delete */
 537         }
 538       LogNorm(mix, prior->tnum); /* mix[q] is now P(q | counts) */
 539     }
 540                                 /* precalc some denominators */
 541   totm = FSum(t,3);
 542   toti = t[TIM] + t[TII];
 543   totd = t[TDM] + t[TDD];
 544
 545   for (ts = 0; ts < 7; ts++)
 546     {
 547       xi = 0.0;
 548       for (q = 0; q < prior->tnum; q++)
 549         {
 550           switch (ts) {
 551           case TMM: case TMI: case TMD:
 552             xi += mix[q] * (t[ts] + prior->t[q][ts]) /
 553               (totm + FSum(prior->t[q], 3));
 554             break;
 555           case TIM: case TII:
 556             xi += mix[q] * (t[ts] + prior->t[q][ts]) /
 557               (toti + prior->t[q][TIM] + prior->t[q][TII]);
 558             break;
 559           case TDM: case TDD:
 560             xi += mix[q] * (t[ts] + prior->t[q][ts]) /
 561               (totd + prior->t[q][TDM] + prior->t[q][TDD]);
 562             break;
 563           }
 564         }
 565       t[ts] = xi;
 566     }
 567   FNorm(t,   3);                /* match  */
 568   FNorm(t+3, 2);                /* insert */
 569   FNorm(t+5, 2);                /* delete */
 570 }
 571
 572
 573 /* Function: default_amino_prior()
 574  *
 575  * Purpose:  Set the default protein prior.
 576  */
 577 static struct p7prior_s *
 578 default_amino_prior(void)
 579 {
 580   struct p7prior_s *pri;
 581   int q, x;
 582                                 /* default match mixture coefficients */
 583   static float defmq[9] = {
 584     0.178091, 0.056591, 0.0960191, 0.0781233, 0.0834977,
 585     0.0904123, 0.114468, 0.0682132, 0.234585 };
 586
 587                                 /* default match mixture Dirichlet components */
 588   static float defm[9][20] = {
 589     { 0.270671, 0.039848, 0.017576, 0.016415, 0.014268,
 590       0.131916, 0.012391, 0.022599, 0.020358, 0.030727,
 591       0.015315, 0.048298, 0.053803, 0.020662, 0.023612,
 592       0.216147, 0.147226, 0.065438, 0.003758, 0.009621 },
 593     { 0.021465, 0.010300, 0.011741, 0.010883, 0.385651,
 594       0.016416, 0.076196, 0.035329, 0.013921, 0.093517,
 595       0.022034, 0.028593, 0.013086, 0.023011, 0.018866,
 596       0.029156, 0.018153, 0.036100, 0.071770, 0.419641 },
 597     { 0.561459, 0.045448, 0.438366, 0.764167, 0.087364,
 598       0.259114, 0.214940, 0.145928, 0.762204, 0.247320,
 599       0.118662, 0.441564, 0.174822, 0.530840, 0.465529,
 600       0.583402, 0.445586, 0.227050, 0.029510, 0.121090 },
 601     { 0.070143, 0.011140, 0.019479, 0.094657, 0.013162,
 602       0.048038, 0.077000, 0.032939, 0.576639, 0.072293,
 603       0.028240, 0.080372, 0.037661, 0.185037, 0.506783,
 604       0.073732, 0.071587, 0.042532, 0.011254, 0.028723 },
 605     { 0.041103, 0.014794, 0.005610, 0.010216, 0.153602,
 606       0.007797, 0.007175, 0.299635, 0.010849, 0.999446,
 607       0.210189, 0.006127, 0.013021, 0.019798, 0.014509,
 608       0.012049, 0.035799, 0.180085, 0.012744, 0.026466 },
 609     { 0.115607, 0.037381, 0.012414, 0.018179, 0.051778,
 610       0.017255, 0.004911, 0.796882, 0.017074, 0.285858,
 611       0.075811, 0.014548, 0.015092, 0.011382, 0.012696,
 612       0.027535, 0.088333, 0.944340, 0.004373, 0.016741 },
 613     { 0.093461, 0.004737, 0.387252, 0.347841, 0.010822,
 614       0.105877, 0.049776, 0.014963, 0.094276, 0.027761,
 615       0.010040, 0.187869, 0.050018, 0.110039, 0.038668,
 616       0.119471, 0.065802, 0.025430, 0.003215, 0.018742 },
 617     { 0.452171, 0.114613, 0.062460, 0.115702, 0.284246,
 618       0.140204, 0.100358, 0.550230, 0.143995, 0.700649,
 619       0.276580, 0.118569, 0.097470, 0.126673, 0.143634,
 620       0.278983, 0.358482, 0.661750, 0.061533, 0.199373 },
 621     { 0.005193, 0.004039, 0.006722, 0.006121, 0.003468,
 622       0.016931, 0.003647, 0.002184, 0.005019, 0.005990,
 623       0.001473, 0.004158, 0.009055, 0.003630, 0.006583,
 624       0.003172, 0.003690, 0.002967, 0.002772, 0.002686 },
 625   };
 626
 627   pri = P7AllocPrior();
 628   pri->strategy = PRI_DCHLET;
 629
 630   /* Transition priors are subjective, but borrowed from GJM's estimations
 631    * on Pfam
 632    */
 633   pri->tnum     = 1;
 634   pri->tq[0]    = 1.0;
 635   pri->t[0][TMM]   = 0.7939;
 636   pri->t[0][TMI]   = 0.0278;
 637   pri->t[0][TMD]   = 0.0135;
 638   pri->t[0][TIM]   = 0.1551;
 639   pri->t[0][TII]   = 0.1331;
 640   pri->t[0][TDM]   = 0.9002;
 641   pri->t[0][TDD]   = 0.5630;
 642
 643   /* Match emission priors are a mixture Dirichlet,
 644    * from Kimmen Sjolander (Blocks9)
 645    */
 646   pri->mnum  = 9;
 647   for (q = 0; q < pri->mnum; q++)
 648     {
 649       pri->mq[q] = defmq[q];
 650       for (x = 0; x < 20; x++)
 651         pri->m[q][x] = defm[q][x];
 652     }
 653
 654   /* These insert emission priors are subjective. Observed frequencies
 655    * were obtained from PFAM 1.0, 10 Nov 96;
 656    *      see ~/projects/plan7/InsertStatistics.
 657    * Inserts are slightly biased towards polar residues and away from
 658    * hydrophobic residues.
 659    */
 660   pri->inum  = 1;
 661   pri->iq[0] = 1.;
 662   pri->i[0][0]  = 681.;         /* A */
 663   pri->i[0][1]  = 120.;         /* C */
 664   pri->i[0][2]  = 623.;         /* D */
 665   pri->i[0][3]  = 651.;         /* E */
 666   pri->i[0][4]  = 313.;         /* F */
 667   pri->i[0][5]  = 902.;         /* G */
 668   pri->i[0][6]  = 241.;         /* H */
 669   pri->i[0][7]  = 371.;         /* I */
 670   pri->i[0][8]  = 687.;         /* K */
 671   pri->i[0][9]  = 676.;         /* L */
 672   pri->i[0][10] = 143.;         /* M */
 673   pri->i[0][11] = 548.;         /* N */
 674   pri->i[0][12] = 647.;         /* P */
 675   pri->i[0][13] = 415.;         /* Q */
 676   pri->i[0][14] = 551.;         /* R */
 677   pri->i[0][15] = 926.;         /* S */
 678   pri->i[0][16] = 623.;         /* T */
 679   pri->i[0][17] = 505.;         /* V */
 680   pri->i[0][18] = 102.;         /* W */
 681   pri->i[0][19] = 269.;         /* Y */
 682
 683   return pri;
 684 }
 685
 686
 687 /* Function: default_nucleic_prior()
 688  *
 689  * Purpose:  Set the default DNA prior. (for now, almost a Laplace)
 690  */
 691 static struct p7prior_s *
 692 default_nucleic_prior(void)
 693 {
 694   struct p7prior_s *pri;
 695
 696   pri = P7AllocPrior();
 697   pri->strategy = PRI_DCHLET;
 698
 699   /* The use of the Pfam-trained amino acid transition priors
 700    * here is TOTALLY bogus. But it works better than a straight
 701    * Laplace, esp. for Maxmodelmaker(). For example, a Laplace
 702    * prior builds M=1 models for a single sequence GAATTC (at
 703    * one time an open "bug").
 704    */
 705   pri->tnum        = 1;
 706   pri->tq[0]       = 1.;
 707   pri->t[0][TMM]   = 0.7939;
 708   pri->t[0][TMI]   = 0.0278;
 709   pri->t[0][TMD]   = 0.0135;
 710   pri->t[0][TIM]   = 0.1551;
 711   pri->t[0][TII]   = 0.1331;
 712   pri->t[0][TDM]   = 0.9002;
 713   pri->t[0][TDD]   = 0.5630;
 714
 715   pri->mnum  = 1;
 716   pri->mq[0] = 1.;
 717   FSet(pri->m[0], Alphabet_size, 1.);
 718
 719   pri->inum  = 1;
 720   pri->iq[0] = 1.;
 721   FSet(pri->i[0], Alphabet_size, 1.);
 722
 723   return pri;
 724 }
 725