1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
11 /* Derived from code developed by Ian Holmes (Sanger Centre and UC Berkeley)
12 * Copyright (C) 1998 Ian Holmes
13 * Distributed under the GNU General Public License
20 #include "structs.h" /* data structures, macros, #define's */
21 #include "config.h" /* compile-time configuration constants */
22 #include "funcs.h" /* function declarations */
23 #include "globals.h" /* alphabet global variables */
24 #include "squid.h" /* general sequence analysis library */
26 static char banner[] = "hmmbuild - build a hidden Markov model from an alignment";
28 static char usage[] = "\
29 Usage: hmmbuildpost [-options] <alignment file>\n\
30 Available options are:\n\
31 -h : help; print brief help on version and usage\n\
32 -n <s> : name; name this HMM <s>\n\
33 -r <hmmfile> : read HMM from <file> instead of building\n\
34 -m <hmmfile> : save HMM to <file>\n\
35 -o <file> : re-save annotated alignment to <file>\n\
36 -A : append; append this HMM to <hmmfile>\n\
37 -F : force; allow overwriting of <hmmfile>\n\
39 Alternative search algorithm styles: (default: hmmls domain alignment)\n\
40 -f : multi-hit local (hmmfs style)\n\
41 -g : global alignment (hmms style, Needleman/Wunsch)\n\
42 -s : local alignment (hmmsw style, Smith/Waterman)\n\
45 static char experts[] = "\
46 Optional re-alignment of sequences to model:\n\
47 --viterbi : standard max-likelihood (Viterbi) algorithm\n\
48 --optacc : optimal accuracy algorithm\n\
50 Alternative model construction strategies: (default: MAP)\n\
51 --fast : Krogh/Haussler fast heuristic construction (see --gapmax)\n\
52 --hand : manual construction (requires SELEX file, #=RF annotation)\n\
54 Expert customization of parameters and priors:\n\
55 --null <file> : read null (random sequence) model from <file>\n\
56 --pam <file> : heuristic PAM-based prior, using BLAST PAM matrix in <file>\n\
57 --prior <file> : read Dirichlet prior parameters from <file>\n\
59 Alternative sequence weighting strategies: (default: GSC weights)\n\
60 --wblosum : Henikoff simple filter weights (see --idlevel)\n\
61 --wgsc : Gerstein/Sonnhammer/Chothia tree weights (default)\n\
62 --wme : maximum entropy (ME)\n\
63 --wvoronoi : Sibbald/Argos Voronoi weights\n\
64 --wnone : don't do any weighting\n\
65 --noeff : don't use effective sequence number; just use nseq\n\
67 Forcing an alphabet: (normally autodetected)\n\
68 --amino : override autodetection, assert that seqs are protein\n\
69 --nucleic : override autodetection, assert that seqs are DNA/RNA\n\
71 Other expert options:\n\
72 --archpri <x> : set architecture size prior to <x> {0.85} [0..1]\n\
73 --binary : save the model in binary format, not ASCII text\n\
74 --cfile <file>: save count vectors to <file>\n\
75 --gapmax <x> : max fraction of gaps in mat column {0.50} [0..1]\n\
76 --idlevel <x> : set frac. id level used by eff. nseq and --wblosum {0.62}\n\
77 --informat <s>: input alignment is in format <s>, not Stockholm\n\
78 --pamwgt <x> : set weight on PAM-based prior to <x> {20.}[>=0]\n\
79 --swentry <x> : set S/W aggregate entry prob. to <x> {0.5}\n\
80 --swexit <x> : set S/W aggregate exit prob. to <x> {0.5}\n\
81 --verbose : print a lot of boring information\n\
84 static struct opt_s OPTIONS[] = {
85 { "-f", TRUE, sqdARG_NONE },
86 { "-g", TRUE, sqdARG_NONE },
87 { "-h", TRUE, sqdARG_NONE },
88 { "-n", TRUE, sqdARG_STRING},
89 { "-r", TRUE, sqdARG_STRING},
90 { "-m", TRUE, sqdARG_STRING},
91 { "-o", TRUE, sqdARG_STRING},
92 { "-s", TRUE, sqdARG_NONE },
93 { "-A", TRUE, sqdARG_NONE },
94 { "-F", TRUE, sqdARG_NONE },
95 { "--amino", FALSE, sqdARG_NONE },
96 { "--archpri", FALSE, sqdARG_FLOAT },
97 { "--binary", FALSE, sqdARG_NONE },
98 { "--cfile", FALSE, sqdARG_STRING},
99 { "--fast", FALSE, sqdARG_NONE},
100 { "--gapmax", FALSE, sqdARG_FLOAT },
101 { "--hand", FALSE, sqdARG_NONE},
102 { "--idlevel", FALSE, sqdARG_FLOAT },
103 { "--informat",FALSE, sqdARG_STRING },
104 { "--noeff", FALSE, sqdARG_NONE },
105 { "--nucleic", FALSE, sqdARG_NONE },
106 { "--null", FALSE, sqdARG_STRING },
107 { "--optacc", FALSE, sqdARG_NONE },
108 { "--pam", FALSE, sqdARG_STRING },
109 { "--pamwgt", FALSE, sqdARG_FLOAT },
110 { "--prior", FALSE, sqdARG_STRING },
111 { "--swentry", FALSE, sqdARG_FLOAT },
112 { "--swexit", FALSE, sqdARG_FLOAT },
113 { "--verbose", FALSE, sqdARG_NONE },
114 { "--viterbi", FALSE, sqdARG_NONE },
115 { "--wgsc", FALSE, sqdARG_NONE },
116 { "--wblosum", FALSE, sqdARG_NONE },
117 { "--wme", FALSE, sqdARG_NONE },
118 { "--wnone", FALSE, sqdARG_NONE },
119 { "--wvoronoi",FALSE, sqdARG_NONE },
121 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
123 static void save_model(struct plan7_s *hmm, char *hmmfile, int do_append, int do_binary);
124 static void print_all_scores(FILE *fp, struct plan7_s *hmm,
125 AINFO *ainfo, char **dsq, int nseq,
126 struct p7trace_s **tr);
127 static void save_countvectors(char *cfile, struct plan7_s *hmm);
128 static void position_average_score(struct plan7_s *hmm, char **seq, float *wgt,
129 int nseq, struct p7trace_s **tr, float *pernode,
131 static float frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr,
132 float *pernode, float expected);
133 static void maximum_entropy(struct plan7_s *hmm, char **dsq, AINFO *ainfo,
134 int nseq, float eff_nseq,
135 struct p7prior_s *prior, struct p7trace_s **tr);
137 extern void Postcode(int L, struct dpmatrix_s *mx, struct p7trace_s *tr);
140 main(int argc, char **argv)
142 char *seqfile; /* seqfile to read alignment from */
143 int format; /* format of seqfile */
144 MSAFILE *afp; /* open alignment file */
145 MSA *msa; /* a multiple sequence alignment */
146 char **dsq; /* digitized unaligned aseq's */
147 struct plan7_s *hmm; /* constructed HMM; written to hmmfile */
148 struct p7prior_s *pri; /* Dirichlet priors to use */
149 struct p7trace_s **tr; /* fake tracebacks for aseq's */
150 char *readfile; /* file to read HMM from */
151 HMMFILE *hmmfp; /* opened hmmfile for reading */
152 char *hmmfile; /* file to write HMM to */
153 FILE *fp; /* OUTPUT file handle (misc.) */
154 char *name; /* name of the HMM */
155 int idx; /* counter for sequences */
156 float randomseq[MAXABET]; /* null sequence model */
157 float p1; /* null sequence model p1 transition */
159 char *optname; /* name of option found by Getopt() */
160 char *optarg; /* argument found by Getopt() */
161 int optind; /* index in argv[] */
162 enum p7_construction c_strategy; /* construction strategy choice */
163 enum p7_weight { /* weighting strategy */
164 WGT_NONE, WGT_GSC, WGT_BLOSUM, WGT_VORONOI, WGT_ME} w_strategy;
165 enum p7_config { /* algorithm configuration strategy */
166 P7_BASE_CONFIG, P7_LS_CONFIG, P7_FS_CONFIG, P7_SW_CONFIG } cfg_strategy;
167 float gapmax; /* max frac gaps in mat col for -k */
168 int overwrite_protect; /* TRUE to prevent overwriting HMM file */
169 enum realignment_strategy { /* re-alignment strategy */
170 REALIGN_NONE, REALIGN_VITERBI, REALIGN_OPTACC } r_strategy;
171 int verbose; /* TRUE to show a lot of output */
172 char *align_ofile; /* name of output alignment file */
173 char *rndfile; /* random sequence model file to read */
174 char *prifile; /* Dirichlet prior file to read */
175 char *pamfile; /* PAM matrix file for heuristic prior */
176 char *cfile; /* output file for count vectors */
177 float archpri; /* "architecture" prior on model size */
178 float pamwgt; /* weight on PAM for heuristic prior */
179 int do_append; /* TRUE to append to hmmfile */
180 int do_binary; /* TRUE to write in binary format */
181 float blosumlevel; /* BLOSUM frac id filtering level [0.62] */
182 float swentry; /* S/W aggregate entry probability */
183 float swexit; /* S/W aggregate exit probability */
184 int do_eff; /* TRUE to set an effective seq number */
185 float eff_nseq; /* effective sequence number */
189 struct dpmatrix_s *forward_mx; /* Forward matrix */
190 struct dpmatrix_s *backward_mx; /* Backward matrix */
191 struct dpmatrix_s *posterior_mx; /* Posterior matrix */
192 struct dpmatrix_s *optacc_mx; /* Optimal accuracy matrix */
194 /***********************************************
196 ***********************************************/
198 format = MSAFILE_UNKNOWN;
199 c_strategy = P7_MAP_CONSTRUCTION;
200 w_strategy = WGT_GSC;
202 cfg_strategy = P7_LS_CONFIG;
204 overwrite_protect = TRUE;
205 r_strategy = REALIGN_NONE;
216 Alphabet_type = hmmNOTSETYET; /* initially unknown */
224 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
225 &optind, &optname, &optarg)) {
226 if (strcmp(optname, "-f") == 0) cfg_strategy = P7_FS_CONFIG;
227 else if (strcmp(optname, "-g") == 0) cfg_strategy = P7_BASE_CONFIG;
228 else if (strcmp(optname, "-n") == 0) name = Strdup(optarg);
229 else if (strcmp(optname, "-r") == 0) readfile = optarg;
230 else if (strcmp(optname, "-m") == 0) hmmfile = optarg;
231 else if (strcmp(optname, "-o") == 0) align_ofile = optarg;
232 else if (strcmp(optname, "-r") == 0) rndfile = optarg;
233 else if (strcmp(optname, "-s") == 0) cfg_strategy = P7_SW_CONFIG;
234 else if (strcmp(optname, "-A") == 0) do_append = TRUE;
235 else if (strcmp(optname, "-F") == 0) overwrite_protect = FALSE;
236 else if (strcmp(optname, "--amino") == 0) SetAlphabet(hmmAMINO);
237 else if (strcmp(optname, "--archpri") == 0) archpri = atof(optarg);
238 else if (strcmp(optname, "--binary") == 0) do_binary = TRUE;
239 else if (strcmp(optname, "--cfile") == 0) cfile = optarg;
240 else if (strcmp(optname, "--fast") == 0) c_strategy = P7_FAST_CONSTRUCTION;
241 else if (strcmp(optname, "--hand") == 0) c_strategy = P7_HAND_CONSTRUCTION;
242 else if (strcmp(optname, "--gapmax") == 0) gapmax = atof(optarg);
243 else if (strcmp(optname, "--idlevel") == 0) blosumlevel = atof(optarg);
244 else if (strcmp(optname, "--noeff") == 0) do_eff = FALSE;
245 else if (strcmp(optname, "--nucleic") == 0) SetAlphabet(hmmNUCLEIC);
246 else if (strcmp(optname, "--optacc") == 0) r_strategy = REALIGN_OPTACC;
247 else if (strcmp(optname, "--pam") == 0) pamfile = optarg;
248 else if (strcmp(optname, "--pamwgt") == 0) pamwgt = atof(optarg);
249 else if (strcmp(optname, "--prior") == 0) prifile = optarg;
250 else if (strcmp(optname, "--swentry") == 0) swentry = atof(optarg);
251 else if (strcmp(optname, "--swexit") == 0) swexit = atof(optarg);
252 else if (strcmp(optname, "--verbose") == 0) verbose = TRUE;
253 else if (strcmp(optname, "--viterbi") == 0) r_strategy = REALIGN_VITERBI;
254 else if (strcmp(optname, "--wgsc") == 0) w_strategy = WGT_GSC;
255 else if (strcmp(optname, "--wblosum") == 0) w_strategy = WGT_BLOSUM;
256 else if (strcmp(optname, "--wme") == 0) w_strategy = WGT_ME;
257 else if (strcmp(optname, "--wnone") == 0) w_strategy = WGT_NONE;
258 else if (strcmp(optname, "--wvoronoi")== 0) w_strategy = WGT_VORONOI;
259 else if (strcmp(optname, "--informat") == 0) {
260 format = String2SeqfileFormat(optarg);
261 if (format == MSAFILE_UNKNOWN)
262 Die("unrecognized sequence file format \"%s\"", optarg);
263 if (! IsAlignmentFormat(format))
264 Die("%s is an unaligned format, can't read as an alignment", optarg);
266 else if (strcmp(optname, "-h") == 0) {
267 Banner(stdout, banner);
273 if (argc - optind != 1)
274 Die("Incorrect number of arguments.\n%s\n", usage);
276 seqfile = argv[optind++];
278 if (readfile != NULL && r_strategy == REALIGN_NONE)
279 r_strategy = REALIGN_VITERBI;
281 if (gapmax < 0. || gapmax > 1.)
282 Die("--gapmax must be a value from 0 to 1\n%s\n", usage);
283 if (archpri < 0. || archpri > 1.)
284 Die("--archpri must be a value from 0 to 1\n%s\n", usage);
285 if (overwrite_protect && hmmfile && !do_append && FileExists(hmmfile))
286 Die("HMM file %s already exists. Rename or delete it.", hmmfile);
287 if (overwrite_protect && align_ofile != NULL && FileExists(align_ofile))
288 Die("Alignment resave file %s exists. Rename or delete it.", align_ofile);
290 /***********************************************
292 ***********************************************/
294 /* Open the alignment */
295 if ((afp = MSAFileOpen(seqfile, format, NULL)) == NULL)
296 Die("Alignment file %s could not be opened for reading", seqfile);
298 /* read the alignment from file */
299 if ((msa = MSAFileRead(afp)) == NULL)
300 Die("Failed to read aligned sequence file %s", seqfile);
301 for (idx = 0; idx < msa->nseq; idx++)
302 s2upper(msa->aseq[idx]);
304 /* Set up the alphabet globals */
305 if (Alphabet_type == hmmNOTSETYET)
306 DetermineAlphabet(msa->aseq, msa->nseq);
308 /* Set up Dirichlet priors */
309 if (prifile == NULL) pri = P7DefaultPrior();
310 else pri = P7ReadPrior(prifile);
312 if (pamfile != NULL) PAMPrior(pamfile, pri, pamwgt);
314 /* Set up the null/random seq model */
315 if (rndfile == NULL) P7DefaultNullModel(randomseq, &p1);
316 else P7ReadNullModel(rndfile, randomseq, &p1);
318 /* Prepare sequences for internal use */
319 DigitizeAlignment(msa, &dsq);
321 /* In some respects we treat DNA more crudely... */
322 if (Alphabet_type == hmmNUCLEIC)
324 do_eff = FALSE; /* don't do effective seq #; it's calibrated for protein */
327 /***********************************************
328 * Either read in an HMM or build from alignment,
329 * depending on user specifications.
330 ***********************************************/
332 if (readfile != NULL) {
334 /***********************************************
335 * Open HMM file (might be in HMMERDB or current directory).
336 * Read a single HMM from it.
337 ***********************************************/
339 if ((hmmfp = HMMFileOpen(readfile, "HMMERDB")) == NULL)
340 Die("Failed to open HMM file %s\n%s", readfile, usage);
341 if (!HMMFileRead(hmmfp, &hmm))
342 Die("Failed to read any HMMs from %s\n", readfile);
345 Die("HMM file %s corrupt or in incorrect format? Parse failed", readfile);
347 tr = (struct p7trace_s **) MallocOrDie (sizeof(struct p7trace_s *) * msa->nseq);
348 for (idx = 0; idx < msa->nseq; idx++)
353 /***********************************************
355 ***********************************************/
357 /* Determine the effective sequence number to use (optional)
359 eff_nseq = (float) msa->nseq;
363 printf("%-40s ... ", "Determining effective sequence number");
365 /* dummy weights array to feed BlosumWeights*/
366 wgt = MallocOrDie(sizeof(float) * msa->nseq);
367 BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, wgt);
368 eff_nseq = FSum(wgt, msa->nseq);
371 printf("done. [%.0f]\n", eff_nseq);
375 /* Weight the sequences (optional),
377 /* Weight the sequences (optional),
379 if (w_strategy == WGT_GSC ||
380 w_strategy == WGT_BLOSUM ||
381 w_strategy == WGT_VORONOI)
383 printf("%-40s ... ", "Weighting sequences heuristically");
386 if (w_strategy == WGT_GSC)
387 GSCWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt);
388 else if (w_strategy == WGT_BLOSUM)
389 BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, msa->wgt);
390 else if (w_strategy == WGT_VORONOI)
391 VoronoiWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt);
396 /* Set the effective sequence number (if do_eff is FALSE, eff_nseq
399 FNorm(msa->wgt, msa->nseq);
400 FScale(msa->wgt, msa->nseq, eff_nseq);
403 /* Build a model architecture.
404 * If we're not doing MD or ME, that's all we need to do.
405 * We get an allocated, counts-based HMM back.
407 printf("%-40s ... ", "Constructing model architecture");
409 checksum = GCGMultchecksum(msa->aseq, msa->nseq);
410 if (c_strategy == P7_FAST_CONSTRUCTION)
411 P7Fastmodelmaker(msa, dsq, gapmax, &hmm, &tr);
412 else if (c_strategy == P7_HAND_CONSTRUCTION)
413 P7Handmodelmaker(msa, dsq, &hmm, &tr);
415 P7Maxmodelmaker(msa, dsq, gapmax,
416 pri, randomseq, p1, archpri, &hmm, &tr);
417 hmm->checksum = checksum;
420 /* Save the count vectors if asked. Used primarily for
421 * making the data files for training priors.
425 save_countvectors(cfile, hmm);
428 /* Record the null model in the HMM;
429 * add prior contributions in pseudocounts and renormalize.
431 Plan7SetNullModel(hmm, randomseq, p1);
432 P7PriorifyHMM(hmm, pri);
435 /* Model configuration, temporary.
436 * hmmbuild assumes that it's given an alignment of single domains,
437 * and the alignment may contain fragments. So, for the purpose of
438 * scoring the sequences (or, optionally, MD/ME weighting),
439 * configure the model into hmmsw mode. Later we'll
440 * configure the model according to how the user wants to
443 Plan7SWConfig(hmm, 0.5, 0.5);
445 /* Do model-dependent "weighting" strategies.
448 if (w_strategy == WGT_ME)
450 maximum_entropy(hmm, dsq, &ainfo, ainfo.nseq, eff_nseq, pri, tr);
454 /* Give the model a name; by default, the name of the alignment file
455 * without any filename extension (i.e. "globins.slx" becomes "globins"
457 if (name == NULL) name = FileTail(seqfile, TRUE);
458 Plan7SetName(hmm, name);
459 Plan7ComlogAppend(hmm, argc, argv);
461 hmm->nseq = msa->nseq;
464 /* Configure the model for chosen algorithm
466 switch (cfg_strategy) {
467 case P7_BASE_CONFIG: Plan7GlobalConfig(hmm); break;
468 case P7_SW_CONFIG: Plan7SWConfig(hmm, swentry, swexit); break;
469 case P7_LS_CONFIG: Plan7LSConfig(hmm); break;
470 case P7_FS_CONFIG: Plan7FSConfig(hmm, swentry, swexit); break;
471 default: Die("bogus configuration choice");
476 /* Optionally save new HMM to disk: open a file for appending or writing.
478 P7Logoddsify(hmm, TRUE);
480 save_model(hmm, hmmfile, do_append, do_binary);
482 /* Display posterior probabilities for each sequence,
483 re-aligning them to the model if user requested that
486 for (idx = 0; idx < msa->nseq; idx++) {
487 printf ("#\n# Sequence %d: %s\n#\n", idx + 1, msa->sqname[idx]);
489 len = DealignedLength(msa->aseq[idx]);
490 if (P7ViterbiSize(len, hmm->M) * 2 > RAMLIMIT)
491 Die("insufficient memory");
493 (void) P7Forward (dsq[idx], len, hmm, &forward_mx);
494 (void) P7Backward (dsq[idx],len, hmm, &backward_mx);
496 if (r_strategy == REALIGN_VITERBI) {
498 if (tr[idx]) P7FreeTrace (tr[idx]);
500 if (P7ViterbiSize(len, hmm->M) * 3 <= RAMLIMIT)
501 (void) P7Viterbi(dsq[idx], len, hmm, &(tr[idx]));
503 (void) P7SmallViterbi(dsq[idx], len, hmm, &(tr[idx]));
505 } else if (r_strategy == REALIGN_OPTACC) {
507 if (tr[idx]) P7FreeTrace (tr[idx]);
509 if (P7ViterbiSize(len, hmm->M) * 4 > RAMLIMIT)
510 Die("insufficient memory");
512 posterior_mx = AllocPlan7Matrix (len + 1, hmm->M, 0, 0, 0, 0);
513 P7EmitterPosterior (len, hmm, forward_mx, backward_mx,
516 optacc_mx = AllocPlan7Matrix (len + 1, hmm->M, 0, 0, 0, 0);
517 (void) P7FillOptimalAccuracy (len, hmm->M, posterior_mx,
518 optacc_mx, &(tr[idx]));
520 FreePlan7Matrix (posterior_mx);
521 FreePlan7Matrix (optacc_mx);
525 posterior_mx = AllocPlan7Matrix (len + 1, hmm->M, 0, 0, 0, 0);
526 P7EmitterPosterior (len, hmm, forward_mx, backward_mx,
529 Postcode(len, posterior_mx, tr[idx]);
530 /* DisplayPlan7Matrix(dsq[idx], len, hmm, posterior_mx); */
533 /* DisplayPlan7PostAlign (len, hmm,
534 forward_mx, backward_mx,
538 FreePlan7Matrix (backward_mx);
539 FreePlan7Matrix (forward_mx);
543 /* the annotated alignment may be resaved */
544 if (align_ofile != NULL) {
548 sqinfo = MSAToSqinfo(msa);
549 new_msa = P7Traces2Alignment(dsq, sqinfo, msa->wgt, msa->nseq,
551 if ((fp = fopen(align_ofile, "w")) == NULL) {
552 Warn("Failed to open alignment resave file %s; using stdout instead",
556 WriteStockholm(fp, new_msa);
558 for (idx = 0; idx < msa->nseq; idx++)
559 FreeSequence(NULL, &(sqinfo[idx]));
561 if (fp != stdout) fclose(fp);
564 /* Verbose output; show scores for each sequence
568 print_all_scores(stdout, hmm, dsq, msq, tr);
573 for (idx = 0; idx < msa->nseq; idx++) P7FreeTrace(tr[idx]);
577 Free2DArray((void **) dsq, msa->nseq);
584 /* Function: save_model()
586 * Purpose: Save the new model to a file.
588 * Args: hmm - model to save
589 * hmmfile - file to save to (if NULL, use stdout)
590 * do_append - TRUE to append to file
591 * do_binary - TRUE to write a binary file
596 save_model(struct plan7_s *hmm, char *hmmfile, int do_append, int do_binary)
604 /* check that it looks like an HMM file */
605 #ifdef REMOVED /* This code induces an unresolved Linux/SGI NFS bug! */
606 if (FileExists(hmmfile))
609 hmmfp = HMMFileOpen(hmmfile, NULL);
611 Warn("%s not an HMM file; can't append to it; using stdout instead",
614 puts(""); /* do a newline before stdout HMM starts */
621 if ((fp = fopen(hmmfile, "a")) == NULL) {
622 Warn("hey, where'd your HMM file go? Using stdout instead.");
624 puts(""); /* do a newline before stdout HMM starts */
629 if ((fp = fopen(hmmfile, "w")) == NULL) {
630 Warn("Failed to open HMM save file %s; using stdout instead", hmmfile);
632 puts(""); /* do a newline before stdout HMM starts */
636 if (do_binary) WriteBinHMM(fp, hmm);
637 else WriteAscHMM(fp, hmm);
639 if (fp != stdout) fclose(fp);
647 /* Function: print_all_scores()
649 * Purpose: For each training sequence, print its score under
652 * Args: fp - where to print the output (usu. stdout)
653 * hmm - newly constructed HMM, with prob's.
654 * ainfo- info with aseq
655 * dsq - digitized unaligned training sequences.
656 * nseq - number of training sequences
657 * tr - array of tracebacks
662 print_all_scores(FILE *fp, struct plan7_s *hmm,
663 AINFO *ainfo, char **dsq, int nseq, struct p7trace_s **tr)
665 int idx; /* counter for sequences */
667 /* make sure model scores are ready */
668 P7Logoddsify(hmm, TRUE);
671 fputs("Individual training sequence scores:\n", fp);
672 /* score for each sequence */
673 for (idx = 0; idx < nseq; idx++)
675 fprintf(fp, "%7.2f %-12s %s\n",
676 P7TraceScore(hmm, dsq[idx], tr[idx]),
677 ainfo->sqinfo[idx].name,
678 (ainfo->sqinfo[idx].flags & SQINFO_DESC) ?
679 ainfo->sqinfo[idx].desc : "");
680 P7PrintTrace(fp, tr[idx], hmm, dsq[idx]);
687 /* Function: save_countvectors()
689 * Purpose: Save emission/transition count vectors to a file.
690 * Used for gathering the data on which to train a
691 * prior (e.g. mixture Dirichlet, etc.)
693 * The format of the file is one vector per line:
694 * M <f> <f> ...: 20 match emission counts in order AC..WY.
695 * I <f> <f> ...: 20 insert emission counts in order AC..WY.
696 * T <f> <f> ...: 7 transition counts in order TMM, TMI, TMD,
697 * TIM, TII, TDM, TDD. (see structs.h)
699 * Args: cfile - counts file to make
700 * hmm - counts-based HMM
703 save_countvectors(char *cfile, struct plan7_s *hmm)
708 if ((fp = fopen(cfile, "w")) == NULL)
709 Die("failed to open count vector file %s for writing", cfile);
711 /* match emission vectors */
712 for (k = 1; k <= hmm->M; k++)
715 for (x = 0; x < Alphabet_size; x++)
716 fprintf(fp, "%.2f ", hmm->mat[k][x]);
719 /* insert emission vectors */
720 for (k = 1; k < hmm->M; k++)
723 for (x = 0; x < Alphabet_size; x++)
724 fprintf(fp, "%.2f ", hmm->ins[k][x]);
727 /* transition vectors */
728 for (k = 1; k < hmm->M; k++)
731 for (x = 0; x < 7; x++)
732 fprintf(fp, "%.2f ", hmm->t[k][x]);
740 /* Function: position_average_score()
741 * Date: Wed Dec 31 09:36:35 1997 [StL]
743 * Purpose: Calculate scores from tracebacks, keeping them
744 * in a position specific array. The final array
745 * is normalized position-specifically too, according
746 * to how many sequences contributed data to this
747 * position. Used for compensating for sequence
748 * fragments in ME and MD score optimization.
751 * Code related to (derived from) TraceScore().
753 * Args: hmm - HMM structure, scores valid
754 * dsq - digitized unaligned sequences
755 * wgt - weights on the sequences
756 * nseq - number of sequences
757 * tr - array of nseq tracebacks that aligns each dsq to hmm
758 * pernode - RETURN: [0]1..M array of position-specific avg scores
759 * ret_avg - RETURN: overall average full-length, one-domain score
761 * Return: 1 on success, 0 on failure.
762 * pernode is malloc'ed [0]1..M by CALLER and filled here.
765 position_average_score(struct plan7_s *hmm,
769 struct p7trace_s **tr,
773 int pos; /* position in seq */
775 int tpos; /* position in trace/state sequence */
776 float *counts; /* counts at each position */
777 float avg; /* RETURN: average overall */
778 int k; /* counter for model position */
779 int idx; /* counter for sequence number */
783 counts = MallocOrDie ((hmm->M+1) * sizeof(float));
784 FSet(pernode, hmm->M+1, 0.);
785 FSet(counts, hmm->M+1, 0.);
787 /* Loop over traces, accumulate weighted scores per position
789 for (idx = 0; idx < nseq; idx++)
790 for (tpos = 0; tpos < tr[idx]->tlen; tpos++)
792 pos = tr[idx]->pos[tpos];
793 sym = (int) dsq[idx][tr[idx]->pos[tpos]];
794 k = tr[idx]->nodeidx[tpos];
796 /* Counts: how many times did we use this model position 1..M?
799 if (tr[idx]->statetype[tpos] == STM || tr[idx]->statetype[tpos] == STD)
800 counts[k] += wgt[idx];
804 if (tr[idx]->statetype[tpos] == STM)
805 pernode[k] += wgt[idx] * Scorify(hmm->msc[sym][k]);
806 else if (tr[idx]->statetype[tpos] == STI)
807 pernode[k] += wgt[idx] * Scorify(hmm->isc[sym][k]);
809 /* Transition scores.
811 if (tr[idx]->statetype[tpos] == STM ||
812 tr[idx]->statetype[tpos] == STD ||
813 tr[idx]->statetype[tpos] == STI)
814 pernode[k] += wgt[idx] *
815 Scorify(TransitionScoreLookup(hmm, tr[idx]->statetype[tpos], tr[idx]->nodeidx[tpos],
816 tr[idx]->statetype[tpos+1],tr[idx]->nodeidx[tpos+1]));
819 /* Divide accumulated scores by accumulated weighted counts
822 for (k = 1; k <= hmm->M; k++)
824 pernode[k] /= counts[k];
834 /* Function: frag_trace_score()
835 * Date: SRE, Wed Dec 31 10:03:47 1997 [StL]
837 * Purpose: Allow MD/ME optimization to be used for alignments
838 * that include fragments and multihits -- estimate a full-length
843 * Return: "corrected" score.
846 frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr,
847 float *pernode, float expected)
849 float sc; /* corrected score */
850 float fragexp; /* expected score for a trace like this */
851 int tpos; /* position in trace */
853 /* get uncorrected score */
854 sc = P7TraceScore(hmm, dsq, tr);
856 /* calc expected score for trace like this */
858 for (tpos = 0; tpos < tr->tlen; tpos++)
859 if (tr->statetype[tpos] == STM || tr->statetype[tpos] == STD)
860 fragexp += pernode[tr->nodeidx[tpos]];
862 /* correct for multihits */
863 fragexp /= (float) TraceDomainNumber(tr);
865 /* extrapolate to full-length, one-hit score */
866 sc = sc * expected / fragexp;
871 /* Function: maximum_entropy()
872 * Date: SRE, Fri Jan 2 10:56:00 1998 [StL]
874 * Purpose: Optimizes a model according to maximum entropy weighting.
875 * See Krogh and Mitchison (1995).
877 * [Actually, we do minimum relative entropy, rather than
878 * maximum entropy. Same thing, though we refer to "ME"
879 * weights and models. The optimization is a steepest
880 * descents minimization of the relative entropy.]
882 * Expects to be called shortly after a Maxmodelmaker()
883 * or Handmodelmaker(), so that both a new model architecture
884 * (with MAP parameters) and fake tracebacks are available.
886 * Prints a summary of optimization progress to stdout.
888 * Args: hmm - model. allocated, set with initial MAP parameters.
889 * dsq - dealigned digitized seqs the model is based on
890 * ainfo - extra info for aseqs
891 * nseq - number of aseqs
892 * eff_nseq- effective sequence number; weights normalize up to this.
893 * prior - prior distributions for parameterizing model
894 * tr - array of fake traces for each sequence
897 * hmm changed to an ME HMM
898 * ainfo changed, contains ME weights
901 maximum_entropy(struct plan7_s *hmm, char **dsq, AINFO *ainfo, int nseq,
902 float eff_nseq, struct p7prior_s *prior, struct p7trace_s **tr)
904 float *wgt; /* current best set of ME weights */
905 float *new_wgt; /* new set of ME weights to try */
906 float *sc; /* log-odds score of each sequence */
907 float *grad; /* gradient */
908 float epsilon; /* steepness of descent */
909 float relative_entropy; /* current best relative entropy */
910 float new_entropy; /* relative entropy at new weights */
911 float last_new_entropy; /* last new_entropy we calc'ed */
912 float use_epsilon; /* current epsilon value in use */
913 int idx; /* counter over sequences */
914 int i1, i2; /* counters for iterations */
916 float converge_criterion;
917 float minw, maxw; /* min, max weight */
918 int posw, highw; /* number of positive weights */
919 float mins, maxs, avgs; /* min, max, avg score */
920 float *pernode; /* expected score per node of HMM */
921 float expscore; /* expected score of complete HMM */
922 int max_iter; /* bulletproof against infinite loop bugs */
924 epsilon = 0.2; /* works fine */
929 sc = MallocOrDie (sizeof(float) * nseq);
930 wgt = MallocOrDie (sizeof(float) * nseq);
931 new_wgt = MallocOrDie (sizeof(float) * nseq);
932 grad = MallocOrDie (sizeof(float) * nseq);
933 pernode = MallocOrDie (sizeof(float) * (hmm->M+1));
935 /* Initialization. Start with all weights == 1.0.
936 * Find relative entropy and gradient.
938 Plan7SWConfig(hmm, 0.5, 0.5);
939 P7Logoddsify(hmm, TRUE);
941 FSet(wgt, nseq, 1.0);
942 position_average_score(hmm, dsq, wgt, nseq, tr, pernode, &expscore);
943 for (idx = 0; idx < nseq; idx++)
944 sc[idx] = frag_trace_score(hmm, dsq[idx], tr[idx], pernode, expscore);
945 relative_entropy = FSum(sc, nseq) / (float) nseq;
946 for (idx = 0; idx < nseq; idx++)
947 grad[idx] = relative_entropy - sc[idx];
951 * printf statements commented out:
953 * printf("iter avg-sc min-sc max-sc min-wgt max-wgt +wgt ++wgt rel.ent convergence\n");
954 * printf("---- ------ ------ ------ ------- ------- ---- ----- ------- -----------\n");
957 mins = maxs = avgs = sc[0];
958 for (idx = 1; idx < nseq; idx++)
960 if (sc[idx] < mins) mins = sc[idx];
961 if (sc[idx] > maxs) maxs = sc[idx];
967 * printf statement commented out:
969 * printf("%4d %6.1f %6.1f %6.1f %7.2f %7.2f %4d %5d %7.2f %8s\n",
970 * 0, avgs, mins, maxs, 1.0, 1.0, nseq, 0, relative_entropy, "-");
975 /* Steepest descents optimization;
976 * iterate until relative entropy converges.
979 while (++i1 < max_iter)
981 /* Gradient gives us a line of steepest descents.
982 * (Roughly speaking, anyway. We actually have a constraint
983 * that weights are nonnegative and normalized, and the
984 * gradient doesn't take these into account.)
985 * Look along this line, a distance of epsilon * gradient:
986 * if new point is better, accept; if new point is worse,
987 * move back along the line by half the distance and re-evaluate.
989 use_epsilon = epsilon;
990 new_entropy = relative_entropy + 1.0; /* just ensure new > old */
993 while (new_entropy > relative_entropy && ++i2 < max_iter)
995 last_new_entropy = new_entropy;
997 /* find a new point in weight space */
998 for (idx = 0; idx < nseq; idx++)
1000 new_wgt[idx] = wgt[idx] + use_epsilon * grad[idx];
1001 if (new_wgt[idx] < 0.) new_wgt[idx] = 0.0;
1003 FNorm(new_wgt, nseq);
1004 FScale(new_wgt, nseq, (float) nseq);
1006 /* Make new HMM using these weights */
1008 for (idx = 0; idx < nseq; idx++)
1009 P7TraceCount(hmm, dsq[idx], new_wgt[idx], tr[idx]);
1010 P7PriorifyHMM(hmm, prior);
1013 /* Evaluate new point */
1014 Plan7SWConfig(hmm, 0.5, 0.5);
1015 P7Logoddsify(hmm, TRUE);
1016 position_average_score(hmm, dsq, new_wgt, nseq, tr, pernode, &expscore);
1017 for (idx = 0; idx < nseq; idx++)
1018 sc[idx] = frag_trace_score(hmm, dsq[idx], tr[idx], pernode, expscore);
1019 new_entropy = FDot(sc, new_wgt, nseq) / nseq;
1022 /* Failsafe: we're not converging. Set epsilon to zero,
1023 * do one more round.
1025 if (use_epsilon < 1e-6) use_epsilon = 0.0;
1026 if (use_epsilon == 0.0) break;
1028 /* Failsafe: avoid infinite loops. Sometimes the
1029 new entropy converges without ever being better
1030 than the previous point, probably as a result
1031 of minor roundoff error. */
1032 if (last_new_entropy == new_entropy) break;
1035 * printf statement commented out:
1037 * if (i2 == max_iter) printf(" -- exceeded maximum iterations; giving up --\n");
1041 /* Evaluate convergence before accepting the new weights;
1042 * then, accept the new point and evaluate the gradient there.
1044 converge_criterion = fabs((relative_entropy-new_entropy)/relative_entropy);
1045 relative_entropy = new_entropy;
1046 FCopy(wgt, new_wgt, nseq);
1047 for (idx = 0; idx < nseq; idx++)
1048 grad[idx] = relative_entropy - sc[idx];
1050 /* Print some statistics about this iteration
1052 mins = maxs = avgs = sc[0];
1053 minw = maxw = wgt[0];
1054 posw = (wgt[0] > 0.0) ? 1 : 0;
1055 highw = (wgt[0] > 1.0) ? 1 : 0;
1056 for (idx = 1; idx < nseq; idx++)
1058 if (sc[idx] < mins) mins = sc[idx];
1059 if (sc[idx] > maxs) maxs = sc[idx];
1060 if (wgt[idx] < minw) minw = wgt[idx];
1061 if (wgt[idx] > maxw) maxw = wgt[idx];
1062 if (wgt[idx] > 0.0) posw++;
1063 if (wgt[idx] > 1.0) highw++;
1070 * printf statement commented out:
1072 * printf("%4d %6.1f %6.1f %6.1f %7.2f %7.2f %4d %5d %7.2f %8.5f\n",
1075 * minw, maxw, posw, highw,
1076 * relative_entropy, converge_criterion);
1080 if (converge_criterion < 1e-5) break;
1083 * printf statement commented out:
1085 * if (i1 == max_iter) printf(" -- exceeded maximum iterations; giving up --\n");
1089 /* Renormalize weights to sum to eff_nseq, and save.
1092 FScale(wgt, nseq, (float) eff_nseq);
1093 FCopy(ainfo->wgt, wgt, nseq);
1094 /* Make final HMM using these adjusted weights */
1096 for (idx = 0; idx < nseq; idx++)
1097 P7TraceCount(hmm, dsq[idx], wgt[idx], tr[idx]);
1098 P7PriorifyHMM(hmm, prior);
1100 /* Cleanup and return