forester/archive/RIO/others/hmmer/src/hmmbuild.c

   1 /************************************************************
   2  * HMMER - Biological sequence analysis with profile HMMs
   3  * Copyright (C) 1992-1999 Washington University School of Medicine
   4  * All Rights Reserved
   5  *
   6  *     This source code is distributed under the terms of the
   7  *     GNU General Public License. See the files COPYING and LICENSE
   8  *     for details.
   9  ************************************************************/
  10
  11 /* hmmbuild.c
  12  * SRE, Mon Nov 18 12:41:29 1996
  13  *
  14  * main() for HMM construction from an alignment.
  15  * CVS $Id: hmmbuild.c,v 1.1.1.1 2005/03/22 08:34:06 cmzmasek Exp $
  16  */
  17
  18 #include <stdio.h>
  19 #include <stdlib.h>
  20 #include <string.h>
  21
  22 #include "structs.h"            /* data structures, macros, #define's   */
  23 #include "config.h"             /* compile-time configuration constants */
  24 #include "funcs.h"              /* function declarations                */
  25 #include "globals.h"            /* alphabet global variables            */
  26 #include "squid.h"              /* general sequence analysis library    */
  27 #include "msa.h"                /* squid's multiple alignment i/o       */
  28
  29 static char banner[] = "hmmbuild - build a hidden Markov model from an alignment";
  30
  31 static char usage[]  = "\
  32 Usage: hmmbuild [-options] <hmmfile output> <alignment file>\n\
  33   Available options are:\n\
  34    -h     : help; print brief help on version and usage\n\
  35    -n <s> : name; name this (first) HMM <s>\n\
  36    -o <f> : re-save annotated alignment to <f>\n\
  37    -A     : append; append this HMM to <hmmfile>\n\
  38    -F     : force; allow overwriting of <hmmfile>\n\
  39 \n\
  40   Alternative search algorithm styles: (default: hmmls domain alignment)\n\
  41    -f     : multi-hit local (hmmfs style)\n\
  42    -g     : global alignment (hmms style, Needleman/Wunsch)\n\
  43    -s     : local alignment (hmmsw style, Smith/Waterman)\n\
  44 ";
  45
  46 static char experts[] = "\
  47   Alternative model construction strategies: (default: MAP)\n\
  48    --fast        : Krogh/Haussler fast heuristic construction (see --gapmax)\n\
  49    --hand        : manual construction (requires annotated alignment)\n\
  50 \n\
  51   Expert customization of parameters and priors:\n\
  52    --null  <f>   : read null (random sequence) model from <f>\n\
  53    --pam   <f>   : heuristic PAM-based prior, using BLAST PAM matrix in <f>\n\
  54    --prior <f>   : read Dirichlet prior parameters from <f>\n\
  55 \n\
  56   Alternative sequence weighting strategies: (default: GSC weights)\n\
  57    --wblosum     : Henikoff simple filter weights (see --idlevel)\n\
  58    --wgsc        : Gerstein/Sonnhammer/Chothia tree weights (default)\n\
  59    --wme         : maximum entropy (ME)\n\
  60    --wpb         : Henikoff position-based weights\n\
  61    --wvoronoi    : Sibbald/Argos Voronoi weights\n\
  62    --wnone       : don't do any weighting\n\
  63    --noeff       : don't use effective sequence number; just use nseq\n\
  64    --pbswitch <n>: set switch from GSC to position-based wgts at > n seqs\n\
  65 \n\
  66   Forcing an alphabet: (normally autodetected)\n\
  67    --amino   : override autodetection, assert that seqs are protein\n\
  68    --nucleic : override autodetection, assert that seqs are DNA/RNA\n\
  69 \n\
  70   Other expert options:\n\
  71    --archpri <x> : set architecture size prior to <x> {0.85} [0..1]\n\
  72    --binary      : save the model in binary format, not ASCII text\n\
  73    --cfile <f>   : save count vectors to <f>\n\
  74    --gapmax <x>  : max fraction of gaps in mat column {0.50} [0..1]\n\
  75    --idlevel <x> : set frac. id level used by eff. nseq and --wblosum {0.62}\n\
  76    --informat <s>: input alignment is in format <s>, not Stockholm\n\
  77    --pamwgt <x>  : set weight on PAM-based prior to <x> {20.}[>=0]\n\
  78    --swentry <x> : set S/W aggregate entry prob. to <x> {0.5}\n\
  79    --swexit <x>  : set S/W aggregate exit prob. to <x>  {0.5}\n\
  80    --verbose     : print boring information\n\
  81 \n";
  82
  83 static struct opt_s OPTIONS[] = {
  84   { "-f", TRUE, sqdARG_NONE },
  85   { "-g", TRUE, sqdARG_NONE },
  86   { "-h", TRUE, sqdARG_NONE },
  87   { "-n", TRUE, sqdARG_STRING},
  88   { "-o", TRUE, sqdARG_STRING},
  89   { "-s", TRUE, sqdARG_NONE },
  90   { "-A", TRUE, sqdARG_NONE },
  91   { "-F", TRUE, sqdARG_NONE },
  92   { "--amino",   FALSE, sqdARG_NONE  },
  93   { "--archpri", FALSE, sqdARG_FLOAT },
  94   { "--binary",  FALSE, sqdARG_NONE  },
  95   { "--cfile",   FALSE, sqdARG_STRING},
  96   { "--fast",    FALSE, sqdARG_NONE},
  97   { "--gapmax",  FALSE, sqdARG_FLOAT },
  98   { "--hand",    FALSE, sqdARG_NONE},
  99   { "--idlevel", FALSE, sqdARG_FLOAT },
 100   { "--informat",FALSE, sqdARG_STRING },
 101   { "--noeff",   FALSE, sqdARG_NONE },
 102   { "--nucleic", FALSE, sqdARG_NONE },
 103   { "--null",    FALSE, sqdARG_STRING },
 104   { "--pam",     FALSE, sqdARG_STRING },
 105   { "--pamwgt",  FALSE, sqdARG_FLOAT },
 106   { "--pbswitch",FALSE, sqdARG_INT },
 107   { "--prior",   FALSE, sqdARG_STRING },
 108   { "--swentry", FALSE, sqdARG_FLOAT },
 109   { "--swexit",  FALSE, sqdARG_FLOAT },
 110   { "--verbose", FALSE, sqdARG_NONE  },
 111   { "--wgsc",    FALSE, sqdARG_NONE },
 112   { "--wblosum", FALSE, sqdARG_NONE },
 113   { "--wme",     FALSE, sqdARG_NONE },
 114   { "--wnone",   FALSE, sqdARG_NONE },
 115   { "--wpb",     FALSE, sqdARG_NONE },
 116   { "--wvoronoi",FALSE, sqdARG_NONE },
 117 };
 118 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
 119
 120 static void print_all_scores(FILE *fp, struct plan7_s *hmm,
 121                              char **dsq, MSA *msa, struct p7trace_s **tr);
 122 static void save_countvectors(FILE *cfp, char *name, struct plan7_s *hmm);
 123 static void position_average_score(struct plan7_s *hmm, char **seq, float *wgt,
 124                                    int nseq, struct p7trace_s **tr, float *pernode,
 125                                    float *ret_avg);
 126 static float frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr,
 127                               float *pernode, float expected);
 128 static void maximum_entropy(struct plan7_s *hmm, char **dsq, MSA *msa,
 129                             float eff_nseq,
 130                             struct p7prior_s *prior, struct p7trace_s **tr);
 131
 132
 133 int
 134 main(int argc, char **argv)
 135 {
 136   char            *seqfile;     /* seqfile to read alignment from          */
 137   int              format;      /* format of seqfile                       */
 138   MSAFILE         *afp;         /* open alignment file                     */
 139   MSA             *msa;         /* a multiple sequence alignment           */
 140   char           **dsq;         /* digitized unaligned aseq's              */
 141   struct plan7_s  *hmm;         /* constructed HMM; written to hmmfile     */
 142   struct p7prior_s *pri;        /* Dirichlet priors to use                 */
 143   struct p7trace_s **tr;        /* fake tracebacks for aseq's              */
 144   char            *hmmfile;     /* file to write HMM to                    */
 145   FILE            *hmmfp;       /* HMM output file handle                  */
 146   char            *name;        /* name of the HMM                         */
 147   int              idx;         /* counter for sequences                   */
 148   float  randomseq[MAXABET];    /* null sequence model                     */
 149   float            p1;          /* null sequence model p1 transition       */
 150   int              nali;        /* count number of alignments/HMMs         */
 151   char             fpopts[3];   /* options to open a file with, e.g. "ab"  */
 152   int              checksum;    /* checksum of the alignment               */
 153
 154   char *optname;                /* name of option found by Getopt()      */
 155   char *optarg;                 /* argument found by Getopt()            */
 156   int   optind;                 /* index in argv[]                       */
 157
 158   enum p7_construction c_strategy; /* construction strategy choice        */
 159   enum p7_weight {              /* weighting strategy */
 160     WGT_NONE, WGT_GSC, WGT_BLOSUM, WGT_PB, WGT_VORONOI, WGT_ME} w_strategy;
 161   enum p7_config {              /* algorithm configuration strategy      */
 162     P7_BASE_CONFIG, P7_LS_CONFIG, P7_FS_CONFIG, P7_SW_CONFIG } cfg_strategy;
 163   float gapmax;                 /* max frac gaps in mat col for -k       */
 164   int   overwrite_protect;      /* TRUE to prevent overwriting HMM file  */
 165   int   verbose;                /* TRUE to show a lot of output          */
 166   char *rndfile;                /* random sequence model file to read    */
 167   char *prifile;                /* Dirichlet prior file to read          */
 168   char *pamfile;                /* PAM matrix file for heuristic prior   */
 169   char *align_ofile;            /* name of output alignment file         */
 170   char *cfile;                  /* output file for count vectors         */
 171   FILE *alignfp;                /* open filehandle for alignment resaves */
 172   FILE *cfp;                    /* open filehandle for count vector saves*/
 173   float archpri;                /* "architecture" prior on model size    */
 174   float pamwgt;                 /* weight on PAM for heuristic prior     */
 175   int   do_append;              /* TRUE to append to hmmfile             */
 176   int   do_binary;              /* TRUE to write in binary format        */
 177   float blosumlevel;            /* BLOSUM frac id filtering level [0.62] */
 178   float swentry;                /* S/W aggregate entry probability       */
 179   float swexit;                 /* S/W aggregate exit probability        */
 180   int   do_eff;                 /* TRUE to set an effective seq number   */
 181   float eff_nseq;               /* effective sequence number             */
 182   int   pbswitch;               /* nseq >= this, switchover to PB weights*/
 183   char *setname;                /* NULL, or ptr to HMM name to set       */
 184   int   gapmax_set;             /* TRUE if gapmax was set on commandline */
 185
 186   /***********************************************
 187    * Parse command line
 188    ***********************************************/
 189
 190   format            = MSAFILE_UNKNOWN;        /* autodetect format by default. */
 191   c_strategy        = P7_MAP_CONSTRUCTION;
 192   w_strategy        = WGT_GSC;
 193   blosumlevel       = 0.62;
 194   cfg_strategy      = P7_LS_CONFIG;
 195   gapmax            = 0.5;
 196   overwrite_protect = TRUE;
 197   verbose           = FALSE;
 198   rndfile           = NULL;
 199   prifile           = NULL;
 200   pamfile           = NULL;
 201   align_ofile       = NULL;
 202   alignfp           = NULL;
 203   cfile             = NULL;
 204   cfp               = NULL;
 205   archpri           = 0.85;
 206   pamwgt            = 20.;
 207   Alphabet_type     = hmmNOTSETYET;     /* initially unknown */
 208   name              = NULL;
 209   do_append         = FALSE;
 210   swentry           = 0.5;
 211   swexit            = 0.5;
 212   do_eff            = TRUE;
 213   do_binary         = FALSE;
 214   pbswitch          = 1000;
 215   setname           = NULL;
 216   gapmax_set        = FALSE;
 217
 218   while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
 219                 &optind, &optname, &optarg))  {
 220     if      (strcmp(optname, "-f") == 0) cfg_strategy      = P7_FS_CONFIG;
 221     else if (strcmp(optname, "-g") == 0) cfg_strategy      = P7_BASE_CONFIG;
 222     else if (strcmp(optname, "-n") == 0) setname           = optarg;
 223     else if (strcmp(optname, "-o") == 0) align_ofile       = optarg;
 224     else if (strcmp(optname, "-s") == 0) cfg_strategy      = P7_SW_CONFIG;
 225     else if (strcmp(optname, "-A") == 0) do_append         = TRUE;
 226     else if (strcmp(optname, "-F") == 0) overwrite_protect = FALSE;
 227     else if (strcmp(optname, "--amino")   == 0) SetAlphabet(hmmAMINO);
 228     else if (strcmp(optname, "--archpri") == 0) archpri       = atof(optarg);
 229     else if (strcmp(optname, "--binary")  == 0) do_binary     = TRUE;
 230     else if (strcmp(optname, "--cfile")   == 0) cfile         = optarg;
 231     else if (strcmp(optname, "--fast")    == 0) c_strategy    = P7_FAST_CONSTRUCTION;
 232     else if (strcmp(optname, "--gapmax")  == 0) { gapmax      = atof(optarg); gapmax_set = TRUE; }
 233     else if (strcmp(optname, "--hand")    == 0) c_strategy    = P7_HAND_CONSTRUCTION;
 234     else if (strcmp(optname, "--idlevel") == 0) blosumlevel   = atof(optarg);
 235     else if (strcmp(optname, "--noeff")   == 0) do_eff        = FALSE;
 236     else if (strcmp(optname, "--nucleic") == 0) SetAlphabet(hmmNUCLEIC);
 237     else if (strcmp(optname, "--null")    == 0) rndfile       = optarg;
 238     else if (strcmp(optname, "--pam")     == 0) pamfile       = optarg;
 239     else if (strcmp(optname, "--pamwgt")  == 0) pamwgt        = atof(optarg);
 240     else if (strcmp(optname, "--pbswitch")== 0) pbswitch      = atoi(optarg);
 241     else if (strcmp(optname, "--prior")   == 0) prifile       = optarg;
 242     else if (strcmp(optname, "--swentry") == 0) swentry       = atof(optarg);
 243     else if (strcmp(optname, "--swexit")  == 0) swexit        = atof(optarg);
 244     else if (strcmp(optname, "--verbose") == 0) verbose       = TRUE;
 245     else if (strcmp(optname, "--wgsc")    == 0) w_strategy    = WGT_GSC;
 246     else if (strcmp(optname, "--wblosum") == 0) w_strategy    = WGT_BLOSUM;
 247     else if (strcmp(optname, "--wme")     == 0) w_strategy    = WGT_ME;
 248     else if (strcmp(optname, "--wpb")     == 0) w_strategy    = WGT_PB;
 249     else if (strcmp(optname, "--wnone")   == 0) w_strategy    = WGT_NONE;
 250     else if (strcmp(optname, "--wvoronoi")== 0) w_strategy    = WGT_VORONOI;
 251     else if (strcmp(optname, "--informat") == 0) {
 252       format = String2SeqfileFormat(optarg);
 253       if (format == MSAFILE_UNKNOWN)
 254         Die("unrecognized sequence file format \"%s\"", optarg);
 255       if (! IsAlignmentFormat(format))
 256         Die("%s is an unaligned format, can't read as an alignment", optarg);
 257     }
 258     else if (strcmp(optname, "-h") == 0) {
 259       Banner(stdout, banner);
 260       puts(usage);
 261       puts(experts);
 262       exit(EXIT_SUCCESS);
 263     }
 264   }
 265   if (argc - optind != 2)
 266     Die("Incorrect number of arguments.\n%s\n", usage);
 267
 268   hmmfile = argv[optind++];
 269   seqfile = argv[optind++];
 270
 271   if (gapmax < 0. || gapmax > 1.)
 272     Die("--gapmax must be a value from 0 to 1\n%s\n", usage);
 273   if (archpri < 0. || archpri > 1.)
 274     Die("--archpri must be a value from 0 to 1\n%s\n", usage);
 275   if (overwrite_protect && !do_append && FileExists(hmmfile))
 276     Die("HMM file %s already exists. Rename or delete it.", hmmfile);
 277   if (overwrite_protect && align_ofile != NULL && FileExists(align_ofile))
 278     Die("Alignment resave file %s exists. Rename or delete it.", align_ofile);
 279   if (gapmax_set && c_strategy  != P7_FAST_CONSTRUCTION)
 280     Die("using --gapmax only makes sense if you use --fast");
 281
 282   /***********************************************
 283    * Preliminaries: open our files for i/o
 284    ***********************************************/
 285
 286                                 /* Open the alignment */
 287   if ((afp = MSAFileOpen(seqfile, format, NULL)) == NULL)
 288     Die("Alignment file %s could not be opened for reading", seqfile);
 289
 290                                 /* Open the HMM output file */
 291   if (do_append) strcpy(fpopts, "a");
 292   else           strcpy(fpopts, "w");
 293   if (do_binary) strcat(fpopts, "b");
 294   if ((hmmfp = fopen(hmmfile, fpopts)) == NULL)
 295     Die("Failed to open HMM file %s for %s\n", hmmfile,
 296         do_append ? "appending" : "writing");
 297
 298                                 /* Open the count vector save file */
 299   cfp = NULL;
 300   if (cfile != NULL)
 301     if ((cfp = fopen(cfile, "w")) == NULL)
 302       Die("Failed to open count vector file %s for writing\n", cfile);
 303
 304                                 /* Open the alignment resave file */
 305   alignfp = NULL;
 306   if (align_ofile != NULL)
 307     if ((alignfp = fopen(align_ofile, "w")) == NULL)
 308       Die("Failed to open alignment resave file %s for writing\n", align_ofile);
 309
 310   /***********************************************
 311    * Show the banner
 312    ***********************************************/
 313
 314   Banner(stdout, banner);
 315   printf("Alignment file:                    %s\n",
 316          seqfile);
 317   printf("File format:                       %s\n",
 318          SeqfileFormat2String(afp->format));
 319
 320   printf("Search algorithm configuration:    ");
 321   if      (cfg_strategy == P7_BASE_CONFIG)   puts("Global alignment (hmms)");
 322   else if (cfg_strategy == P7_SW_CONFIG)     {
 323     puts("Local  (hmmsw)");
 324     printf("S/W aggregate entry probability:   %.2f\n", swentry);
 325     printf("S/W aggregate exit probability:    %.2f\n", swexit);
 326   }
 327   else if (cfg_strategy == P7_LS_CONFIG)     puts("Multiple domain (hmmls)");
 328   else if (cfg_strategy == P7_FS_CONFIG)     {
 329     puts("Multiple local (hmmfs)");
 330     printf("S/W aggregate entry probability:   %.2f\n", swentry);
 331     printf("S/W aggregate exit probability:    %.2f\n", swexit);
 332   }
 333
 334   printf("Model construction strategy:       ");
 335   if (c_strategy == P7_HAND_CONSTRUCTION)    puts("Manual, from #=RF annotation");
 336   else if (c_strategy==P7_FAST_CONSTRUCTION) printf("Fast/ad hoc (gapmax %.2f)\n", gapmax);
 337   else                                       printf("MAP (gapmax hint: %.2f)\n", gapmax);
 338
 339   printf("Null model used:                   %s\n",
 340          (rndfile == NULL) ? "(default)" : rndfile);
 341
 342   printf("Prior used:                        %s\n",
 343          (prifile == NULL) ? "(default)" : prifile);
 344
 345   printf("Sequence weighting method:         ");
 346   if      (w_strategy == WGT_NONE)   puts("none");
 347   else if (w_strategy == WGT_GSC)    puts("G/S/C tree weights");
 348   else if (w_strategy == WGT_BLOSUM) printf("BLOSUM filter at %.2f id\n", blosumlevel);
 349   else if (w_strategy == WGT_PB)     puts("Henikoff position-based");
 350   else if (w_strategy == WGT_VORONOI)puts("Sibbald/Argos Voronoi");
 351   else if (w_strategy == WGT_ME)     puts("Maximum entropy");
 352
 353   printf("New HMM file:                      %s %s\n",
 354          hmmfile, do_append? "[appending]" : "");
 355   if (cfile != NULL)
 356     printf("Count vectors saved to:            %s\n", cfile);
 357   if (align_ofile != NULL)
 358     printf("Annotated alignment(s) resaved to: %s\n", align_ofile);
 359   printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n");
 360
 361
 362   /***********************************************
 363    * Get alignment(s), build HMMs one at a time
 364    ***********************************************/
 365
 366   nali = 0;
 367   while ((msa = MSAFileRead(afp)) != NULL)
 368     {
 369       /* Print some stuff about what we're about to do.
 370        */
 371       if (msa->name != NULL) printf("Alignment:           %s\n",  msa->name);
 372       else                   printf("Alignment:           #%d\n", nali+1);
 373       printf                       ("Number of sequences: %d\n",  msa->nseq);
 374       printf                       ("Number of columns:   %d\n",  msa->alen);
 375       puts("");
 376       fflush(stdout);
 377
 378       /* Make alignment upper case, because some symbol counting
 379        * things are case-sensitive.
 380        */
 381       for (idx = 0; idx < msa->nseq; idx++)
 382         s2upper(msa->aseq[idx]);
 383
 384       /* Set up the alphabet globals:
 385        * either already set by --amino or --nucleic, or
 386        * we guess based on the first alignment we see
 387        */
 388       if (Alphabet_type == hmmNOTSETYET)
 389         DetermineAlphabet(msa->aseq, msa->nseq);
 390
 391       /* Do some initialization the first time through.
 392        * This code must be delayed until after we've seen the
 393        * first alignment, because we have to see the alphabet type first
 394        */
 395       if (nali == 0)
 396         {
 397                                 /* Set up Dirichlet priors */
 398           if (prifile == NULL)  pri = P7DefaultPrior();
 399           else                  pri = P7ReadPrior(prifile);
 400
 401           if (pamfile != NULL)  PAMPrior(pamfile, pri, pamwgt);
 402
 403                                 /* Set up the null/random seq model */
 404           if (rndfile == NULL)  P7DefaultNullModel(randomseq, &p1);
 405           else                  P7ReadNullModel(rndfile, randomseq, &p1);
 406         }
 407
 408       /* Prepare unaligned digitized sequences for internal use
 409        */
 410       DigitizeAlignment(msa, &dsq);
 411
 412       /* In some respects we treat DNA more crudely for now;
 413        * for example, we can't do eff seq #, because it's
 414        * calibrated for protein.
 415        */
 416       if (Alphabet_type == hmmNUCLEIC)
 417         do_eff = FALSE;
 418
 419       /* Determine "effective sequence number".
 420        * The BlosumWeights() routine is now an efficient O(N)
 421        * memory clustering algorithm that doesn't blow up on,
 422        * say, Pfam's GP120 alignment (13000+ sequences)
 423        */
 424       eff_nseq = (float) msa->nseq;
 425       if (do_eff)
 426         {
 427           float *wgt;
 428           printf("%-40s ... ", "Determining effective sequence number");
 429           fflush(stdout);
 430                                 /* dummy weights array to feed BlosumWeights*/
 431           wgt = MallocOrDie(sizeof(float) * msa->nseq);
 432           BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, wgt);
 433           eff_nseq = FSum(wgt, msa->nseq);
 434
 435           free(wgt);
 436           printf("done. [%.0f]\n", eff_nseq);
 437         }
 438
 439
 440       /* Weight the sequences (optional),
 441        */
 442       if (w_strategy == WGT_GSC     ||
 443           w_strategy == WGT_BLOSUM  ||
 444           w_strategy == WGT_VORONOI ||
 445           w_strategy == WGT_PB)
 446         {
 447           printf("%-40s ... ", "Weighting sequences heuristically");
 448           fflush(stdout);
 449
 450           if (w_strategy != WGT_PB && msa->nseq >= pbswitch)
 451             {
 452               printf("[big alignment! doing PB]... ");
 453               PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt);
 454             }
 455           else if (w_strategy == WGT_GSC)
 456             GSCWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt);
 457           else if (w_strategy == WGT_BLOSUM)
 458             BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, msa->wgt);
 459           else if (w_strategy == WGT_PB)
 460             PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt);
 461           else if (w_strategy ==  WGT_VORONOI)
 462             VoronoiWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt);
 463
 464           printf("done.\n");
 465         }
 466
 467       /* Set the effective sequence number (if do_eff is FALSE, eff_nseq
 468        * was set to nseq).
 469        */
 470       FNorm(msa->wgt,  msa->nseq);
 471       FScale(msa->wgt, msa->nseq, eff_nseq);
 472
 473       /* Build a model architecture.
 474        * If we're not doing MD or ME, that's all we need to do.
 475        * We get an allocated, counts-based HMM back.
 476        *
 477        * Because the architecture algorithms are allowed to change
 478        * gap characters in the alignment, we have to calculate the
 479        * alignment checksum before we enter the algorithms.
 480        */
 481       printf("%-40s ... ", "Constructing model architecture");
 482       fflush(stdout);
 483       checksum = GCGMultchecksum(msa->aseq, msa->nseq);
 484       if (c_strategy == P7_FAST_CONSTRUCTION)
 485         P7Fastmodelmaker(msa, dsq, gapmax, &hmm, &tr);
 486       else if (c_strategy == P7_HAND_CONSTRUCTION)
 487         P7Handmodelmaker(msa, dsq, &hmm, &tr);
 488       else
 489         P7Maxmodelmaker(msa, dsq, gapmax,
 490                         pri, randomseq, p1, archpri, &hmm, &tr);
 491       hmm->checksum = checksum;
 492       printf("done.\n");
 493
 494       /* Save the count vectors if asked. Used primarily for
 495        * making the data files for training priors.
 496        */
 497       if (cfile != NULL)
 498         {
 499           printf("%-40s ... ", "Saving count vector file");
 500           fflush(stdout);
 501           save_countvectors(cfp,
 502                             (msa->name != NULL ? msa->name : "-"),
 503                             hmm);
 504           printf("done. [%s]\n", cfile);
 505         }
 506
 507       /* Record the null model in the HMM;
 508        * add prior contributions in pseudocounts and renormalize.
 509        */
 510       printf("%-40s ... ", "Converting counts to probabilities");
 511       fflush(stdout);
 512       Plan7SetNullModel(hmm, randomseq, p1);
 513       P7PriorifyHMM(hmm, pri);
 514       printf("done.\n");
 515
 516       /* Model configuration, temporary.
 517        * hmmbuild assumes that it's given an alignment of single domains,
 518        * and the alignment may contain fragments. So, for the purpose of
 519        * scoring the sequences (or, optionally, MD/ME weighting),
 520        * configure the model into hmmsw mode. Later we'll
 521        * configure the model according to how the user wants to
 522        * use it.
 523        */
 524       Plan7SWConfig(hmm, 0.5, 0.5);
 525
 526       /* Do model-dependent "weighting" strategies.
 527        */
 528       if (w_strategy == WGT_ME)
 529         {
 530           printf("\n%-40s ...\n", "Maximum entropy weighting, iterative");
 531           maximum_entropy(hmm, dsq, msa, eff_nseq, pri, tr);
 532           printf("----------------------------------------------\n\n");
 533         }
 534
 535       /* Give the model a name.
 536        * We deal with this differently depending on whether
 537        * we're in an alignment database or a single alignment.
 538        *
 539        * If a single alignment, priority is:
 540        *      1. Use -n <name> if set.
 541        *      2. Use msa->name (avail in Stockholm or SELEX formats only)
 542        *      3. If all else fails, use alignment file name without
 543        *         filename extension (e.g. "globins.slx" gets named "globins"
 544        *
 545        * If a multiple MSA database (e.g. Stockholm/Pfam),
 546        * only msa->name is applied. -n is not allowed.
 547        * if msa->name is unavailable, or -n was used,
 548        * a fatal error is thrown.
 549        *
 550        * Because we can't tell whether we've got more than one
 551        * alignment 'til we're on the second one, these fatal errors
 552        * only happen after the first HMM has already been built.
 553        * Oh well.
 554        */
 555       printf("%-40s ... ", "Setting model name, etc.");
 556       fflush(stdout);
 557       if (nali == 0)            /* first (only?) HMM in file:  */
 558         {
 559           if      (setname != NULL)   name = Strdup(setname);
 560           else if (msa->name != NULL) name = Strdup(msa->name);
 561           else                        name = FileTail(seqfile, TRUE);
 562         }
 563       else
 564         {
 565           if (setname != NULL)
 566             Die("Oops. Wait. You can't use -n with an alignment database.");
 567           else if (msa->name != NULL) name = Strdup(msa->name);
 568           else
 569             Die("Oops. Wait. I need name annotation on each alignment.\n");
 570         }
 571       Plan7SetName(hmm, name);
 572       free(name);
 573
 574       /* Transfer other information from the alignment to
 575        * the HMM. This typically only works for SELEX format
 576        * alignments, so these things are conditional/optional.
 577        */
 578       if (msa->acc  != NULL) Plan7SetAccession(hmm,   msa->acc);
 579       if (msa->desc != NULL) Plan7SetDescription(hmm, msa->desc);
 580
 581       if (msa->flags & MSA_SET_GA)
 582         { hmm->flags |= PLAN7_GA; hmm->ga1 = msa->ga1; hmm->ga2 = msa->ga2; }
 583       if (msa->flags & MSA_SET_TC)
 584         { hmm->flags |= PLAN7_TC; hmm->tc1 = msa->tc1; hmm->tc2 = msa->tc2; }
 585       if (msa->flags & MSA_SET_NC)
 586         { hmm->flags |= PLAN7_NC; hmm->nc1 = msa->nc1; hmm->nc2 = msa->nc2; }
 587
 588       /* Record some other miscellaneous information in the HMM,
 589        * like how/when we built it.
 590        */
 591       Plan7ComlogAppend(hmm, argc, argv);
 592       Plan7SetCtime(hmm);
 593       hmm->nseq = msa->nseq;
 594       printf("done. [%s]\n", hmm->name);
 595
 596       /* Print information for the user
 597        */
 598       printf("\nConstructed a profile HMM (length %d)\n", hmm->M);
 599       PrintPlan7Stats(stdout, hmm, dsq, msa->nseq, tr);
 600       printf("\n");
 601
 602       /* Configure the model for chosen algorithm
 603        */
 604       printf("%-40s ... ", "Finalizing model configuration");
 605       fflush(stdout);
 606       switch (cfg_strategy) {
 607       case P7_BASE_CONFIG:  Plan7GlobalConfig(hmm);              break;
 608       case P7_SW_CONFIG:    Plan7SWConfig(hmm, swentry, swexit); break;
 609       case P7_LS_CONFIG:    Plan7LSConfig(hmm);                  break;
 610       case P7_FS_CONFIG:    Plan7FSConfig(hmm, swentry, swexit); break;
 611       default:              Die("bogus configuration choice");
 612       }
 613       printf("done.\n");
 614
 615       /* Save new HMM to disk: open a file for appending or writing.
 616        */
 617       printf("%-40s ... ", "Saving model to file");
 618       fflush(stdout);
 619       if (do_binary) WriteBinHMM(hmmfp, hmm);
 620       else           WriteAscHMM(hmmfp, hmm);
 621       printf("done.\n");
 622
 623                                 /* the annotated alignment may be resaved */
 624       if (alignfp != NULL)
 625         {
 626           MSA    *new_msa;
 627           SQINFO *sqinfo;
 628
 629           printf("%-40s ... ", "Saving annotated alignment");
 630           fflush(stdout);
 631           sqinfo  = MSAToSqinfo(msa);
 632           new_msa = P7Traces2Alignment(dsq, sqinfo, msa->wgt, msa->nseq,
 633                                        hmm->M, tr, FALSE);
 634
 635           WriteStockholm(alignfp, new_msa);
 636           MSAFree(new_msa);
 637           for (idx = 0; idx < msa->nseq; idx++)
 638             FreeSequence(NULL, &(sqinfo[idx]));
 639           free(sqinfo);
 640           printf("done.\n");
 641         }
 642
 643       /* Verbose output; show scores for each sequence
 644        */
 645       if (verbose)
 646         print_all_scores(stdout, hmm, dsq, msa, tr);
 647
 648       /* Clean up before moving on to next alignment
 649        */
 650       for (idx = 0; idx < msa->nseq; idx++) P7FreeTrace(tr[idx]);
 651       free(tr);
 652       FreePlan7(hmm);
 653       MSAFree(msa);
 654       Free2DArray((void **) dsq, msa->nseq);
 655       fflush(hmmfp);
 656       if (cfp != NULL)     fflush(cfp);
 657       if (alignfp != NULL) fflush(alignfp);
 658
 659       puts("//\n");
 660       nali++;
 661     }
 662
 663
 664
 665   /* Clean up and exit
 666    */
 667   MSAFileClose(afp);
 668   fclose(hmmfp);
 669   if (cfp != NULL)     fclose(cfp);
 670   if (alignfp != NULL) fclose(alignfp);
 671   P7FreePrior(pri);
 672   SqdClean();
 673   return 0;
 674 }
 675
 676
 677 /* Function: print_all_scores()
 678  *
 679  * Purpose:  For each training sequence, print its score under
 680  *           the final model.
 681  *
 682  * Args:     fp   - where to print the output (usu. stdout)
 683  *           hmm  - newly constructed HMM, with prob's.
 684  *           dsq  - digitized unaligned training sequences.
 685  *           msa  - alignment and associated info
 686  *           tr   - array of tracebacks
 687  *
 688  * Return:   (void)
 689  */
 690 static void
 691 print_all_scores(FILE *fp, struct plan7_s *hmm,
 692                  char **dsq, MSA *msa, struct p7trace_s **tr)
 693 {
 694   int idx;                      /* counter for sequences */
 695
 696                                 /* make sure model scores are ready */
 697   P7Logoddsify(hmm, TRUE);
 698                                 /* header */
 699   fputs("**\n", fp);
 700   fputs("Individual training sequence scores:\n", fp);
 701                                 /* score for each sequence */
 702   for (idx = 0; idx < msa->nseq; idx++)
 703     {
 704       fprintf(fp, "%7.2f %-12s %s\n",
 705               P7TraceScore(hmm, dsq[idx], tr[idx]),
 706               msa->sqname[idx],
 707               (MSAGetSeqDescription(msa,idx) != NULL) ?
 708                MSAGetSeqDescription(msa,idx) : "");
 709       P7PrintTrace(fp, tr[idx], hmm, dsq[idx]);
 710     }
 711   fputs("\n", fp);
 712 }
 713
 714
 715
 716 /* Function: save_countvectors()
 717  *
 718  * Purpose:  Save emission/transition count vectors to a file.
 719  *           Used for gathering the data on which to train a
 720  *           prior (e.g. mixture Dirichlet, etc.)
 721  *
 722  *           The format of the file is one vector per line:
 723  *           M <f> <f> ...: 20 match emission counts in order AC..WY.
 724  *                          followed by two chars of CS, CA annotation.
 725  *           I <f> <f> ...: 20 insert emission counts in order AC..WY.
 726  *                          followed by two chars of CS, CA annotation.
 727  *           T <f> <f> ...: 7 transition counts in order TMM, TMI, TMD,
 728  *                            TIM, TII, TDM, TDD. (see structs.h)
 729  *                            followed by four chars of structure
 730  *                            annotation: CS, CS of M+1; CA, CA of M+1.
 731  *
 732  * Args:     cfp    - open counts file
 733  *           name   - name of alignment or HMM to associate with these vectors
 734  *           hmm    - counts-based HMM
 735  */
 736 static void
 737 save_countvectors(FILE *cfp, char *name, struct plan7_s *hmm)
 738 {
 739   int k, x;
 740                                 /* match emission vectors */
 741   for (k = 1; k <= hmm->M; k++)
 742     {
 743       fputs("M ", cfp);
 744       for (x = 0; x < Alphabet_size; x++)
 745         fprintf(cfp, "%8.2f ", hmm->mat[k][x]);
 746
 747       fprintf(cfp, "%15s %6d %6d ", name, hmm->map[k], k);
 748       if ((hmm->flags & PLAN7_CS) && hmm->flags & PLAN7_CA)
 749         fprintf(cfp, "%c %c", hmm->cs[k], hmm->ca[k]);
 750       else
 751         fputs("- -", cfp);
 752       fputs("\n", cfp);
 753     }
 754                                 /* insert emission vectors */
 755   for (k = 1; k < hmm->M; k++)
 756     {
 757       fputs("I ", cfp);
 758       for (x = 0; x < Alphabet_size; x++)
 759         fprintf(cfp, "%8.2f ", hmm->ins[k][x]);
 760
 761       fprintf(cfp, "%15s %6d %6d ", name, hmm->map[k], k);
 762       if ((hmm->flags & PLAN7_CS) && hmm->flags & PLAN7_CA)
 763         fprintf(cfp, "%c %c", hmm->cs[k], hmm->ca[k]);
 764       else
 765         fputs("- -", cfp);
 766
 767       fputs("\n", cfp);
 768     }
 769                                 /* transition vectors */
 770     for (k = 1; k < hmm->M; k++)
 771     {
 772       fputs("T ", cfp);
 773
 774       for (x = 0; x < 7; x++)
 775         fprintf(cfp, "%8.2f ", hmm->t[k][x]);
 776
 777       fprintf(cfp, "%15s %6d %6d ", name, hmm->map[k], k);
 778       if ((hmm->flags & PLAN7_CS) && hmm->flags & PLAN7_CA)
 779         fprintf(cfp, "%c %c %c %c",
 780                 hmm->cs[k], hmm->cs[k+1],
 781                 hmm->ca[k], hmm->ca[k+1]);
 782       else
 783         fputs("- -", cfp);
 784       fputs("\n", cfp);
 785     }
 786 }
 787
 788
 789 /* Function: position_average_score()
 790  * Date:     Wed Dec 31 09:36:35 1997 [StL]
 791  *
 792  * Purpose:  Calculate scores from tracebacks, keeping them
 793  *           in a position specific array. The final array
 794  *           is normalized position-specifically too, according
 795  *           to how many sequences contributed data to this
 796  *           position. Used for compensating for sequence
 797  *           fragments in ME and MD score optimization.
 798  *           Very much ad hoc.
 799  *
 800  *           Code related to (derived from) TraceScore().
 801  *
 802  * Args:     hmm       - HMM structure, scores valid
 803  *           dsq       - digitized unaligned sequences
 804  *           wgt       - weights on the sequences
 805  *           nseq      - number of sequences
 806  *           tr        - array of nseq tracebacks that aligns each dsq to hmm
 807  *           pernode   - RETURN: [0]1..M array of position-specific avg scores
 808  *           ret_avg   - RETURN: overall average full-length, one-domain score
 809  *
 810  * Return:   1 on success, 0 on failure.
 811  *           pernode is malloc'ed [0]1..M by CALLER and filled here.
 812  */
 813 static void
 814 position_average_score(struct plan7_s    *hmm,
 815                        char             **dsq,
 816                        float             *wgt,
 817                        int                nseq,
 818                        struct p7trace_s **tr,
 819                        float             *pernode,
 820                        float             *ret_avg)
 821 {
 822   int    pos;                   /* position in seq */
 823   int    sym;
 824   int    tpos;                  /* position in trace/state sequence */
 825   float *counts;                /* counts at each position */
 826   float  avg;                   /* RETURN: average overall */
 827   int    k;                     /* counter for model position */
 828   int    idx;                   /* counter for sequence number */
 829
 830   /* Allocations
 831    */
 832   counts = MallocOrDie ((hmm->M+1) * sizeof(float));
 833   FSet(pernode, hmm->M+1, 0.);
 834   FSet(counts,  hmm->M+1, 0.);
 835
 836   /* Loop over traces, accumulate weighted scores per position
 837    */
 838   for (idx = 0; idx < nseq; idx++)
 839     for (tpos = 0; tpos < tr[idx]->tlen; tpos++)
 840       {
 841         pos = tr[idx]->pos[tpos];
 842         sym = (int) dsq[idx][tr[idx]->pos[tpos]];
 843         k   = tr[idx]->nodeidx[tpos];
 844
 845         /* Counts: how many times did we use this model position 1..M?
 846          * (weighted)
 847          */
 848         if (tr[idx]->statetype[tpos] == STM || tr[idx]->statetype[tpos] == STD)
 849           counts[k] += wgt[idx];
 850
 851         /* Emission scores.
 852          */
 853         if (tr[idx]->statetype[tpos] == STM)
 854           pernode[k] += wgt[idx] * Scorify(hmm->msc[sym][k]);
 855         else if (tr[idx]->statetype[tpos] == STI)
 856           pernode[k] += wgt[idx] * Scorify(hmm->isc[sym][k]);
 857
 858         /* Transition scores.
 859          */
 860         if (tr[idx]->statetype[tpos] == STM ||
 861             tr[idx]->statetype[tpos] == STD ||
 862             tr[idx]->statetype[tpos] == STI)
 863           pernode[k] += wgt[idx] *
 864             Scorify(TransitionScoreLookup(hmm, tr[idx]->statetype[tpos], tr[idx]->nodeidx[tpos],
 865                                           tr[idx]->statetype[tpos+1],tr[idx]->nodeidx[tpos+1]));
 866       }
 867
 868   /* Divide accumulated scores by accumulated weighted counts
 869    */
 870   avg = 0.;
 871   for (k = 1; k <= hmm->M; k++)
 872     {
 873       pernode[k] /= counts[k];
 874       avg += pernode[k];
 875     }
 876
 877   free(counts);
 878   *ret_avg = avg;
 879   return;
 880 }
 881
 882
 883 /* Function: frag_trace_score()
 884  * Date:     SRE, Wed Dec 31 10:03:47 1997 [StL]
 885  *
 886  * Purpose:  Allow MD/ME optimization to be used for alignments
 887  *           that include fragments and multihits -- estimate a full-length
 888  *           per-domain score.
 889  *
 890  *
 891  *
 892  * Return:   "corrected" score.
 893  */
 894 static float
 895 frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr,
 896                  float *pernode, float expected)
 897 {
 898   float sc;                     /* corrected score  */
 899   float fragexp;                /* expected score for a trace like this */
 900   int   tpos;                   /* position in trace */
 901
 902                                 /* get uncorrected score */
 903   sc = P7TraceScore(hmm, dsq, tr);
 904
 905                                /* calc expected score for trace like this */
 906   fragexp = 0.;
 907   for (tpos = 0; tpos < tr->tlen; tpos++)
 908     if (tr->statetype[tpos] == STM || tr->statetype[tpos] == STD)
 909       fragexp += pernode[tr->nodeidx[tpos]];
 910
 911                                 /* correct for multihits */
 912   fragexp /= (float) TraceDomainNumber(tr);
 913
 914                                 /* extrapolate to full-length, one-hit score */
 915   sc = sc * expected / fragexp;
 916   return sc;
 917 }
 918
 919
 920 /* Function: maximum_entropy()
 921  * Date:     SRE, Fri Jan  2 10:56:00 1998 [StL]
 922  *
 923  * Purpose:  Optimizes a model according to maximum entropy weighting.
 924  *           See Krogh and Mitchison (1995).
 925  *
 926  *           [Actually, we do minimum relative entropy, rather than
 927  *           maximum entropy. Same thing, though we refer to "ME"
 928  *           weights and models. The optimization is a steepest
 929  *           descents minimization of the relative entropy.]
 930  *
 931  *           Expects to be called shortly after a Maxmodelmaker()
 932  *           or Handmodelmaker(), so that both a new model architecture
 933  *           (with MAP parameters) and fake tracebacks are available.
 934  *
 935  *           Prints a summary of optimization progress to stdout.
 936  *
 937  * Args:     hmm     - model. allocated, set with initial MAP parameters.
 938  *           dsq     - dealigned digitized seqs the model is based on
 939  *           ainfo   - extra info for aseqs
 940  *           nseq    - number of aseqs
 941  *           eff_nseq- effective sequence number; weights normalize up to this.
 942  *           prior   - prior distributions for parameterizing model
 943  *           tr      - array of fake traces for each sequence
 944  *
 945  * Return:   (void)
 946  *           hmm changed to an ME HMM
 947  *           ainfo changed, contains ME weights
 948  */
 949 static void
 950 maximum_entropy(struct plan7_s *hmm, char **dsq, MSA *msa,
 951                 float eff_nseq, struct p7prior_s *prior, struct p7trace_s **tr)
 952 {
 953   float *wgt;                  /* current best set of ME weights   */
 954   float *new_wgt;              /* new set of ME weights to try     */
 955   float *sc;                    /* log-odds score of each sequence */
 956   float *grad;                  /* gradient                        */
 957   float  epsilon;               /* steepness of descent            */
 958   float  relative_entropy;      /* current best relative entropy   */
 959   float  new_entropy;           /* relative entropy at new weights */
 960   float  last_new_entropy;      /* last new_entropy we calc'ed     */
 961   float  use_epsilon;           /* current epsilon value in use    */
 962   int    idx;                   /* counter over sequences          */
 963   int    i1, i2;                /* counters for iterations         */
 964
 965   float  converge_criterion;
 966   float  minw, maxw;            /* min, max weight                 */
 967   int    posw, highw;           /* number of positive weights      */
 968   float  mins, maxs, avgs;      /* min, max, avg score             */
 969   float *pernode;               /* expected score per node of HMM  */
 970   float  expscore;              /* expected score of complete HMM  */
 971   int    max_iter;              /* bulletproof against infinite loop bugs */
 972
 973   epsilon  = 0.2;                /* works fine */
 974   max_iter = 666;
 975
 976   /* Allocations
 977    */
 978   sc      = MallocOrDie (sizeof(float) * msa->nseq);
 979   wgt     = MallocOrDie (sizeof(float) * msa->nseq);
 980   new_wgt = MallocOrDie (sizeof(float) * msa->nseq);
 981   grad    = MallocOrDie (sizeof(float) * msa->nseq);
 982   pernode = MallocOrDie (sizeof(float) * (hmm->M+1));
 983
 984   /* Initialization. Start with all weights == 1.0.
 985    * Find relative entropy and gradient.
 986    */
 987   Plan7SWConfig(hmm, 0.5, 0.5);
 988   P7Logoddsify(hmm, TRUE);
 989
 990   FSet(wgt, msa->nseq, 1.0);
 991   position_average_score(hmm, dsq, wgt, msa->nseq, tr, pernode,&expscore);
 992   for (idx = 0; idx < msa->nseq; idx++)
 993     sc[idx] = frag_trace_score(hmm, dsq[idx], tr[idx], pernode, expscore);
 994   relative_entropy = FSum(sc, msa->nseq) / (float) msa->nseq;
 995   for (idx = 0; idx < msa->nseq; idx++)
 996     grad[idx] = relative_entropy - sc[idx];
 997
 998
 999   printf("iter avg-sc min-sc max-sc min-wgt max-wgt +wgt ++wgt rel.ent convergence\n");
1000   printf("---- ------ ------ ------ ------- ------- ---- ----- ------- -----------\n");
1001   mins = maxs = avgs = sc[0];
1002   for (idx = 1; idx < msa->nseq; idx++)
1003     {
1004       if (sc[idx] < mins) mins = sc[idx];
1005       if (sc[idx] > maxs) maxs = sc[idx];
1006       avgs += sc[idx];
1007     }
1008   avgs /= (float) msa->nseq;
1009   printf("%4d %6.1f %6.1f %6.1f %7.2f %7.2f %4d %5d %7.2f %8s\n",
1010          0, avgs, mins, maxs, 1.0, 1.0, msa->nseq, 0, relative_entropy, "-");
1011
1012
1013   /* Steepest descents optimization;
1014    * iterate until relative entropy converges.
1015    */
1016   i1 = 0;
1017   while (++i1 < max_iter)
1018     {
1019       /* Gradient gives us a line of steepest descents.
1020        * (Roughly speaking, anyway. We actually have a constraint
1021        * that weights are nonnegative and normalized, and the
1022        * gradient doesn't take these into account.)
1023        * Look along this line, a distance of epsilon * gradient:
1024        * if new point is better, accept; if new point is worse,
1025        * move back along the line by half the distance and re-evaluate.
1026        */
1027       use_epsilon = epsilon;
1028       new_entropy = relative_entropy + 1.0;    /* just ensure new > old */
1029
1030       i2 = 0;
1031       while (new_entropy > relative_entropy && ++i2 < max_iter)
1032         {
1033           last_new_entropy = new_entropy;
1034
1035                                 /* find a new point in weight space */
1036           for (idx = 0; idx < msa->nseq; idx++)
1037             {
1038               new_wgt[idx] = wgt[idx] + use_epsilon * grad[idx];
1039               if (new_wgt[idx] < 0.) new_wgt[idx] = 0.0;
1040             }
1041           FNorm(new_wgt, msa->nseq);
1042           FScale(new_wgt, msa->nseq, (float) msa->nseq);
1043
1044                                 /* Make new HMM using these weights */
1045           ZeroPlan7(hmm);
1046           for (idx = 0; idx < msa->nseq; idx++)
1047             P7TraceCount(hmm, dsq[idx], new_wgt[idx], tr[idx]);
1048           P7PriorifyHMM(hmm, prior);
1049
1050
1051                                 /* Evaluate new point */
1052           Plan7SWConfig(hmm, 0.5, 0.5);
1053           P7Logoddsify(hmm, TRUE);
1054           position_average_score(hmm, dsq, new_wgt, msa->nseq, tr, pernode, &expscore);
1055           for (idx = 0; idx < msa->nseq; idx++)
1056             sc[idx]      = frag_trace_score(hmm, dsq[idx], tr[idx], pernode, expscore);
1057           new_entropy = FDot(sc, new_wgt, msa->nseq) / (float) msa->nseq;
1058
1059           use_epsilon /= 2.0;
1060           /* Failsafe: we're not converging. Set epsilon to zero,
1061            * do one more round.
1062            */
1063           if (use_epsilon < 1e-6) use_epsilon = 0.0;
1064           if (use_epsilon == 0.0) break;
1065
1066           /* Failsafe: avoid infinite loops. Sometimes the
1067              new entropy converges without ever being better
1068              than the previous point, probably as a result
1069              of minor roundoff error. */
1070           if (last_new_entropy == new_entropy) break;
1071         }
1072       if (i2 == max_iter) printf("   -- exceeded maximum iterations; giving up --\n");
1073
1074       /* Evaluate convergence before accepting the new weights;
1075        * then, accept the new point and evaluate the gradient there.
1076        */
1077       converge_criterion = fabs((relative_entropy-new_entropy)/relative_entropy);
1078       relative_entropy = new_entropy;
1079       FCopy(wgt, new_wgt, msa->nseq);
1080       for (idx = 0; idx < msa->nseq; idx++)
1081         grad[idx] = relative_entropy - sc[idx];
1082
1083       /* Print some statistics about this iteration
1084        */
1085       mins = maxs = avgs = sc[0];
1086       minw = maxw = wgt[0];
1087       posw = (wgt[0] > 0.0) ? 1 : 0;
1088       highw = (wgt[0] > 1.0) ? 1 : 0;
1089       for (idx = 1; idx < msa->nseq; idx++)
1090         {
1091           if (sc[idx] < mins) mins = sc[idx];
1092           if (sc[idx] > maxs) maxs = sc[idx];
1093           if (wgt[idx] < minw) minw = wgt[idx];
1094           if (wgt[idx] > maxw) maxw = wgt[idx];
1095           if (wgt[idx] > 0.0)  posw++;
1096           if (wgt[idx] > 1.0)  highw++;
1097           avgs += sc[idx];
1098         }
1099       avgs /= (float) msa->nseq;
1100       printf("%4d %6.1f %6.1f %6.1f %7.2f %7.2f %4d %5d %7.2f %8.5f\n",
1101              i1,
1102              avgs, mins, maxs,
1103              minw, maxw, posw, highw,
1104              relative_entropy, converge_criterion);
1105
1106       if (converge_criterion < 1e-5) break;
1107     }
1108   if (i1 == max_iter) printf("   -- exceeded maximum iterations; giving up --\n");
1109
1110   /* Renormalize weights to sum to eff_nseq, and save.
1111    */
1112   FNorm(wgt, msa->nseq);
1113   FScale(wgt, msa->nseq, (float) eff_nseq);
1114   FCopy(msa->wgt, wgt, msa->nseq);
1115                         /* Make final HMM using these adjusted weights */
1116   ZeroPlan7(hmm);
1117   for (idx = 0; idx < msa->nseq; idx++)
1118     P7TraceCount(hmm, dsq[idx], wgt[idx], tr[idx]);
1119   P7PriorifyHMM(hmm, prior);
1120
1121   /* Cleanup and return
1122    */
1123   free(pernode);
1124   free(new_wgt);
1125   free(grad);
1126   free(wgt);
1127   free(sc);
1128   return;
1129 }