1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
12 * SRE, Mon Nov 18 12:41:29 1996
14 * main() for HMM construction from an alignment.
15 * CVS $Id: hmmbuild.c,v 1.1.1.1 2005/03/22 08:34:06 cmzmasek Exp $
22 #include "structs.h" /* data structures, macros, #define's */
23 #include "config.h" /* compile-time configuration constants */
24 #include "funcs.h" /* function declarations */
25 #include "globals.h" /* alphabet global variables */
26 #include "squid.h" /* general sequence analysis library */
27 #include "msa.h" /* squid's multiple alignment i/o */
29 static char banner[] = "hmmbuild - build a hidden Markov model from an alignment";
31 static char usage[] = "\
32 Usage: hmmbuild [-options] <hmmfile output> <alignment file>\n\
33 Available options are:\n\
34 -h : help; print brief help on version and usage\n\
35 -n <s> : name; name this (first) HMM <s>\n\
36 -o <f> : re-save annotated alignment to <f>\n\
37 -A : append; append this HMM to <hmmfile>\n\
38 -F : force; allow overwriting of <hmmfile>\n\
40 Alternative search algorithm styles: (default: hmmls domain alignment)\n\
41 -f : multi-hit local (hmmfs style)\n\
42 -g : global alignment (hmms style, Needleman/Wunsch)\n\
43 -s : local alignment (hmmsw style, Smith/Waterman)\n\
46 static char experts[] = "\
47 Alternative model construction strategies: (default: MAP)\n\
48 --fast : Krogh/Haussler fast heuristic construction (see --gapmax)\n\
49 --hand : manual construction (requires annotated alignment)\n\
51 Expert customization of parameters and priors:\n\
52 --null <f> : read null (random sequence) model from <f>\n\
53 --pam <f> : heuristic PAM-based prior, using BLAST PAM matrix in <f>\n\
54 --prior <f> : read Dirichlet prior parameters from <f>\n\
56 Alternative sequence weighting strategies: (default: GSC weights)\n\
57 --wblosum : Henikoff simple filter weights (see --idlevel)\n\
58 --wgsc : Gerstein/Sonnhammer/Chothia tree weights (default)\n\
59 --wme : maximum entropy (ME)\n\
60 --wpb : Henikoff position-based weights\n\
61 --wvoronoi : Sibbald/Argos Voronoi weights\n\
62 --wnone : don't do any weighting\n\
63 --noeff : don't use effective sequence number; just use nseq\n\
64 --pbswitch <n>: set switch from GSC to position-based wgts at > n seqs\n\
66 Forcing an alphabet: (normally autodetected)\n\
67 --amino : override autodetection, assert that seqs are protein\n\
68 --nucleic : override autodetection, assert that seqs are DNA/RNA\n\
70 Other expert options:\n\
71 --archpri <x> : set architecture size prior to <x> {0.85} [0..1]\n\
72 --binary : save the model in binary format, not ASCII text\n\
73 --cfile <f> : save count vectors to <f>\n\
74 --gapmax <x> : max fraction of gaps in mat column {0.50} [0..1]\n\
75 --idlevel <x> : set frac. id level used by eff. nseq and --wblosum {0.62}\n\
76 --informat <s>: input alignment is in format <s>, not Stockholm\n\
77 --pamwgt <x> : set weight on PAM-based prior to <x> {20.}[>=0]\n\
78 --swentry <x> : set S/W aggregate entry prob. to <x> {0.5}\n\
79 --swexit <x> : set S/W aggregate exit prob. to <x> {0.5}\n\
80 --verbose : print boring information\n\
83 static struct opt_s OPTIONS[] = {
84 { "-f", TRUE, sqdARG_NONE },
85 { "-g", TRUE, sqdARG_NONE },
86 { "-h", TRUE, sqdARG_NONE },
87 { "-n", TRUE, sqdARG_STRING},
88 { "-o", TRUE, sqdARG_STRING},
89 { "-s", TRUE, sqdARG_NONE },
90 { "-A", TRUE, sqdARG_NONE },
91 { "-F", TRUE, sqdARG_NONE },
92 { "--amino", FALSE, sqdARG_NONE },
93 { "--archpri", FALSE, sqdARG_FLOAT },
94 { "--binary", FALSE, sqdARG_NONE },
95 { "--cfile", FALSE, sqdARG_STRING},
96 { "--fast", FALSE, sqdARG_NONE},
97 { "--gapmax", FALSE, sqdARG_FLOAT },
98 { "--hand", FALSE, sqdARG_NONE},
99 { "--idlevel", FALSE, sqdARG_FLOAT },
100 { "--informat",FALSE, sqdARG_STRING },
101 { "--noeff", FALSE, sqdARG_NONE },
102 { "--nucleic", FALSE, sqdARG_NONE },
103 { "--null", FALSE, sqdARG_STRING },
104 { "--pam", FALSE, sqdARG_STRING },
105 { "--pamwgt", FALSE, sqdARG_FLOAT },
106 { "--pbswitch",FALSE, sqdARG_INT },
107 { "--prior", FALSE, sqdARG_STRING },
108 { "--swentry", FALSE, sqdARG_FLOAT },
109 { "--swexit", FALSE, sqdARG_FLOAT },
110 { "--verbose", FALSE, sqdARG_NONE },
111 { "--wgsc", FALSE, sqdARG_NONE },
112 { "--wblosum", FALSE, sqdARG_NONE },
113 { "--wme", FALSE, sqdARG_NONE },
114 { "--wnone", FALSE, sqdARG_NONE },
115 { "--wpb", FALSE, sqdARG_NONE },
116 { "--wvoronoi",FALSE, sqdARG_NONE },
118 #define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))
120 static void print_all_scores(FILE *fp, struct plan7_s *hmm,
121 char **dsq, MSA *msa, struct p7trace_s **tr);
122 static void save_countvectors(FILE *cfp, char *name, struct plan7_s *hmm);
123 static void position_average_score(struct plan7_s *hmm, char **seq, float *wgt,
124 int nseq, struct p7trace_s **tr, float *pernode,
126 static float frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr,
127 float *pernode, float expected);
128 static void maximum_entropy(struct plan7_s *hmm, char **dsq, MSA *msa,
130 struct p7prior_s *prior, struct p7trace_s **tr);
134 main(int argc, char **argv)
136 char *seqfile; /* seqfile to read alignment from */
137 int format; /* format of seqfile */
138 MSAFILE *afp; /* open alignment file */
139 MSA *msa; /* a multiple sequence alignment */
140 char **dsq; /* digitized unaligned aseq's */
141 struct plan7_s *hmm; /* constructed HMM; written to hmmfile */
142 struct p7prior_s *pri; /* Dirichlet priors to use */
143 struct p7trace_s **tr; /* fake tracebacks for aseq's */
144 char *hmmfile; /* file to write HMM to */
145 FILE *hmmfp; /* HMM output file handle */
146 char *name; /* name of the HMM */
147 int idx; /* counter for sequences */
148 float randomseq[MAXABET]; /* null sequence model */
149 float p1; /* null sequence model p1 transition */
150 int nali; /* count number of alignments/HMMs */
151 char fpopts[3]; /* options to open a file with, e.g. "ab" */
152 int checksum; /* checksum of the alignment */
154 char *optname; /* name of option found by Getopt() */
155 char *optarg; /* argument found by Getopt() */
156 int optind; /* index in argv[] */
158 enum p7_construction c_strategy; /* construction strategy choice */
159 enum p7_weight { /* weighting strategy */
160 WGT_NONE, WGT_GSC, WGT_BLOSUM, WGT_PB, WGT_VORONOI, WGT_ME} w_strategy;
161 enum p7_config { /* algorithm configuration strategy */
162 P7_BASE_CONFIG, P7_LS_CONFIG, P7_FS_CONFIG, P7_SW_CONFIG } cfg_strategy;
163 float gapmax; /* max frac gaps in mat col for -k */
164 int overwrite_protect; /* TRUE to prevent overwriting HMM file */
165 int verbose; /* TRUE to show a lot of output */
166 char *rndfile; /* random sequence model file to read */
167 char *prifile; /* Dirichlet prior file to read */
168 char *pamfile; /* PAM matrix file for heuristic prior */
169 char *align_ofile; /* name of output alignment file */
170 char *cfile; /* output file for count vectors */
171 FILE *alignfp; /* open filehandle for alignment resaves */
172 FILE *cfp; /* open filehandle for count vector saves*/
173 float archpri; /* "architecture" prior on model size */
174 float pamwgt; /* weight on PAM for heuristic prior */
175 int do_append; /* TRUE to append to hmmfile */
176 int do_binary; /* TRUE to write in binary format */
177 float blosumlevel; /* BLOSUM frac id filtering level [0.62] */
178 float swentry; /* S/W aggregate entry probability */
179 float swexit; /* S/W aggregate exit probability */
180 int do_eff; /* TRUE to set an effective seq number */
181 float eff_nseq; /* effective sequence number */
182 int pbswitch; /* nseq >= this, switchover to PB weights*/
183 char *setname; /* NULL, or ptr to HMM name to set */
184 int gapmax_set; /* TRUE if gapmax was set on commandline */
186 /***********************************************
188 ***********************************************/
190 format = MSAFILE_UNKNOWN; /* autodetect format by default. */
191 c_strategy = P7_MAP_CONSTRUCTION;
192 w_strategy = WGT_GSC;
194 cfg_strategy = P7_LS_CONFIG;
196 overwrite_protect = TRUE;
207 Alphabet_type = hmmNOTSETYET; /* initially unknown */
218 while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
219 &optind, &optname, &optarg)) {
220 if (strcmp(optname, "-f") == 0) cfg_strategy = P7_FS_CONFIG;
221 else if (strcmp(optname, "-g") == 0) cfg_strategy = P7_BASE_CONFIG;
222 else if (strcmp(optname, "-n") == 0) setname = optarg;
223 else if (strcmp(optname, "-o") == 0) align_ofile = optarg;
224 else if (strcmp(optname, "-s") == 0) cfg_strategy = P7_SW_CONFIG;
225 else if (strcmp(optname, "-A") == 0) do_append = TRUE;
226 else if (strcmp(optname, "-F") == 0) overwrite_protect = FALSE;
227 else if (strcmp(optname, "--amino") == 0) SetAlphabet(hmmAMINO);
228 else if (strcmp(optname, "--archpri") == 0) archpri = atof(optarg);
229 else if (strcmp(optname, "--binary") == 0) do_binary = TRUE;
230 else if (strcmp(optname, "--cfile") == 0) cfile = optarg;
231 else if (strcmp(optname, "--fast") == 0) c_strategy = P7_FAST_CONSTRUCTION;
232 else if (strcmp(optname, "--gapmax") == 0) { gapmax = atof(optarg); gapmax_set = TRUE; }
233 else if (strcmp(optname, "--hand") == 0) c_strategy = P7_HAND_CONSTRUCTION;
234 else if (strcmp(optname, "--idlevel") == 0) blosumlevel = atof(optarg);
235 else if (strcmp(optname, "--noeff") == 0) do_eff = FALSE;
236 else if (strcmp(optname, "--nucleic") == 0) SetAlphabet(hmmNUCLEIC);
237 else if (strcmp(optname, "--null") == 0) rndfile = optarg;
238 else if (strcmp(optname, "--pam") == 0) pamfile = optarg;
239 else if (strcmp(optname, "--pamwgt") == 0) pamwgt = atof(optarg);
240 else if (strcmp(optname, "--pbswitch")== 0) pbswitch = atoi(optarg);
241 else if (strcmp(optname, "--prior") == 0) prifile = optarg;
242 else if (strcmp(optname, "--swentry") == 0) swentry = atof(optarg);
243 else if (strcmp(optname, "--swexit") == 0) swexit = atof(optarg);
244 else if (strcmp(optname, "--verbose") == 0) verbose = TRUE;
245 else if (strcmp(optname, "--wgsc") == 0) w_strategy = WGT_GSC;
246 else if (strcmp(optname, "--wblosum") == 0) w_strategy = WGT_BLOSUM;
247 else if (strcmp(optname, "--wme") == 0) w_strategy = WGT_ME;
248 else if (strcmp(optname, "--wpb") == 0) w_strategy = WGT_PB;
249 else if (strcmp(optname, "--wnone") == 0) w_strategy = WGT_NONE;
250 else if (strcmp(optname, "--wvoronoi")== 0) w_strategy = WGT_VORONOI;
251 else if (strcmp(optname, "--informat") == 0) {
252 format = String2SeqfileFormat(optarg);
253 if (format == MSAFILE_UNKNOWN)
254 Die("unrecognized sequence file format \"%s\"", optarg);
255 if (! IsAlignmentFormat(format))
256 Die("%s is an unaligned format, can't read as an alignment", optarg);
258 else if (strcmp(optname, "-h") == 0) {
259 Banner(stdout, banner);
265 if (argc - optind != 2)
266 Die("Incorrect number of arguments.\n%s\n", usage);
268 hmmfile = argv[optind++];
269 seqfile = argv[optind++];
271 if (gapmax < 0. || gapmax > 1.)
272 Die("--gapmax must be a value from 0 to 1\n%s\n", usage);
273 if (archpri < 0. || archpri > 1.)
274 Die("--archpri must be a value from 0 to 1\n%s\n", usage);
275 if (overwrite_protect && !do_append && FileExists(hmmfile))
276 Die("HMM file %s already exists. Rename or delete it.", hmmfile);
277 if (overwrite_protect && align_ofile != NULL && FileExists(align_ofile))
278 Die("Alignment resave file %s exists. Rename or delete it.", align_ofile);
279 if (gapmax_set && c_strategy != P7_FAST_CONSTRUCTION)
280 Die("using --gapmax only makes sense if you use --fast");
282 /***********************************************
283 * Preliminaries: open our files for i/o
284 ***********************************************/
286 /* Open the alignment */
287 if ((afp = MSAFileOpen(seqfile, format, NULL)) == NULL)
288 Die("Alignment file %s could not be opened for reading", seqfile);
290 /* Open the HMM output file */
291 if (do_append) strcpy(fpopts, "a");
292 else strcpy(fpopts, "w");
293 if (do_binary) strcat(fpopts, "b");
294 if ((hmmfp = fopen(hmmfile, fpopts)) == NULL)
295 Die("Failed to open HMM file %s for %s\n", hmmfile,
296 do_append ? "appending" : "writing");
298 /* Open the count vector save file */
301 if ((cfp = fopen(cfile, "w")) == NULL)
302 Die("Failed to open count vector file %s for writing\n", cfile);
304 /* Open the alignment resave file */
306 if (align_ofile != NULL)
307 if ((alignfp = fopen(align_ofile, "w")) == NULL)
308 Die("Failed to open alignment resave file %s for writing\n", align_ofile);
310 /***********************************************
312 ***********************************************/
314 Banner(stdout, banner);
315 printf("Alignment file: %s\n",
317 printf("File format: %s\n",
318 SeqfileFormat2String(afp->format));
320 printf("Search algorithm configuration: ");
321 if (cfg_strategy == P7_BASE_CONFIG) puts("Global alignment (hmms)");
322 else if (cfg_strategy == P7_SW_CONFIG) {
323 puts("Local (hmmsw)");
324 printf("S/W aggregate entry probability: %.2f\n", swentry);
325 printf("S/W aggregate exit probability: %.2f\n", swexit);
327 else if (cfg_strategy == P7_LS_CONFIG) puts("Multiple domain (hmmls)");
328 else if (cfg_strategy == P7_FS_CONFIG) {
329 puts("Multiple local (hmmfs)");
330 printf("S/W aggregate entry probability: %.2f\n", swentry);
331 printf("S/W aggregate exit probability: %.2f\n", swexit);
334 printf("Model construction strategy: ");
335 if (c_strategy == P7_HAND_CONSTRUCTION) puts("Manual, from #=RF annotation");
336 else if (c_strategy==P7_FAST_CONSTRUCTION) printf("Fast/ad hoc (gapmax %.2f)\n", gapmax);
337 else printf("MAP (gapmax hint: %.2f)\n", gapmax);
339 printf("Null model used: %s\n",
340 (rndfile == NULL) ? "(default)" : rndfile);
342 printf("Prior used: %s\n",
343 (prifile == NULL) ? "(default)" : prifile);
345 printf("Sequence weighting method: ");
346 if (w_strategy == WGT_NONE) puts("none");
347 else if (w_strategy == WGT_GSC) puts("G/S/C tree weights");
348 else if (w_strategy == WGT_BLOSUM) printf("BLOSUM filter at %.2f id\n", blosumlevel);
349 else if (w_strategy == WGT_PB) puts("Henikoff position-based");
350 else if (w_strategy == WGT_VORONOI)puts("Sibbald/Argos Voronoi");
351 else if (w_strategy == WGT_ME) puts("Maximum entropy");
353 printf("New HMM file: %s %s\n",
354 hmmfile, do_append? "[appending]" : "");
356 printf("Count vectors saved to: %s\n", cfile);
357 if (align_ofile != NULL)
358 printf("Annotated alignment(s) resaved to: %s\n", align_ofile);
359 printf("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n");
362 /***********************************************
363 * Get alignment(s), build HMMs one at a time
364 ***********************************************/
367 while ((msa = MSAFileRead(afp)) != NULL)
369 /* Print some stuff about what we're about to do.
371 if (msa->name != NULL) printf("Alignment: %s\n", msa->name);
372 else printf("Alignment: #%d\n", nali+1);
373 printf ("Number of sequences: %d\n", msa->nseq);
374 printf ("Number of columns: %d\n", msa->alen);
378 /* Make alignment upper case, because some symbol counting
379 * things are case-sensitive.
381 for (idx = 0; idx < msa->nseq; idx++)
382 s2upper(msa->aseq[idx]);
384 /* Set up the alphabet globals:
385 * either already set by --amino or --nucleic, or
386 * we guess based on the first alignment we see
388 if (Alphabet_type == hmmNOTSETYET)
389 DetermineAlphabet(msa->aseq, msa->nseq);
391 /* Do some initialization the first time through.
392 * This code must be delayed until after we've seen the
393 * first alignment, because we have to see the alphabet type first
397 /* Set up Dirichlet priors */
398 if (prifile == NULL) pri = P7DefaultPrior();
399 else pri = P7ReadPrior(prifile);
401 if (pamfile != NULL) PAMPrior(pamfile, pri, pamwgt);
403 /* Set up the null/random seq model */
404 if (rndfile == NULL) P7DefaultNullModel(randomseq, &p1);
405 else P7ReadNullModel(rndfile, randomseq, &p1);
408 /* Prepare unaligned digitized sequences for internal use
410 DigitizeAlignment(msa, &dsq);
412 /* In some respects we treat DNA more crudely for now;
413 * for example, we can't do eff seq #, because it's
414 * calibrated for protein.
416 if (Alphabet_type == hmmNUCLEIC)
419 /* Determine "effective sequence number".
420 * The BlosumWeights() routine is now an efficient O(N)
421 * memory clustering algorithm that doesn't blow up on,
422 * say, Pfam's GP120 alignment (13000+ sequences)
424 eff_nseq = (float) msa->nseq;
428 printf("%-40s ... ", "Determining effective sequence number");
430 /* dummy weights array to feed BlosumWeights*/
431 wgt = MallocOrDie(sizeof(float) * msa->nseq);
432 BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, wgt);
433 eff_nseq = FSum(wgt, msa->nseq);
436 printf("done. [%.0f]\n", eff_nseq);
440 /* Weight the sequences (optional),
442 if (w_strategy == WGT_GSC ||
443 w_strategy == WGT_BLOSUM ||
444 w_strategy == WGT_VORONOI ||
445 w_strategy == WGT_PB)
447 printf("%-40s ... ", "Weighting sequences heuristically");
450 if (w_strategy != WGT_PB && msa->nseq >= pbswitch)
452 printf("[big alignment! doing PB]... ");
453 PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt);
455 else if (w_strategy == WGT_GSC)
456 GSCWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt);
457 else if (w_strategy == WGT_BLOSUM)
458 BlosumWeights(msa->aseq, msa->nseq, msa->alen, blosumlevel, msa->wgt);
459 else if (w_strategy == WGT_PB)
460 PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt);
461 else if (w_strategy == WGT_VORONOI)
462 VoronoiWeights(msa->aseq, msa->nseq, msa->alen, msa->wgt);
467 /* Set the effective sequence number (if do_eff is FALSE, eff_nseq
470 FNorm(msa->wgt, msa->nseq);
471 FScale(msa->wgt, msa->nseq, eff_nseq);
473 /* Build a model architecture.
474 * If we're not doing MD or ME, that's all we need to do.
475 * We get an allocated, counts-based HMM back.
477 * Because the architecture algorithms are allowed to change
478 * gap characters in the alignment, we have to calculate the
479 * alignment checksum before we enter the algorithms.
481 printf("%-40s ... ", "Constructing model architecture");
483 checksum = GCGMultchecksum(msa->aseq, msa->nseq);
484 if (c_strategy == P7_FAST_CONSTRUCTION)
485 P7Fastmodelmaker(msa, dsq, gapmax, &hmm, &tr);
486 else if (c_strategy == P7_HAND_CONSTRUCTION)
487 P7Handmodelmaker(msa, dsq, &hmm, &tr);
489 P7Maxmodelmaker(msa, dsq, gapmax,
490 pri, randomseq, p1, archpri, &hmm, &tr);
491 hmm->checksum = checksum;
494 /* Save the count vectors if asked. Used primarily for
495 * making the data files for training priors.
499 printf("%-40s ... ", "Saving count vector file");
501 save_countvectors(cfp,
502 (msa->name != NULL ? msa->name : "-"),
504 printf("done. [%s]\n", cfile);
507 /* Record the null model in the HMM;
508 * add prior contributions in pseudocounts and renormalize.
510 printf("%-40s ... ", "Converting counts to probabilities");
512 Plan7SetNullModel(hmm, randomseq, p1);
513 P7PriorifyHMM(hmm, pri);
516 /* Model configuration, temporary.
517 * hmmbuild assumes that it's given an alignment of single domains,
518 * and the alignment may contain fragments. So, for the purpose of
519 * scoring the sequences (or, optionally, MD/ME weighting),
520 * configure the model into hmmsw mode. Later we'll
521 * configure the model according to how the user wants to
524 Plan7SWConfig(hmm, 0.5, 0.5);
526 /* Do model-dependent "weighting" strategies.
528 if (w_strategy == WGT_ME)
530 printf("\n%-40s ...\n", "Maximum entropy weighting, iterative");
531 maximum_entropy(hmm, dsq, msa, eff_nseq, pri, tr);
532 printf("----------------------------------------------\n\n");
535 /* Give the model a name.
536 * We deal with this differently depending on whether
537 * we're in an alignment database or a single alignment.
539 * If a single alignment, priority is:
540 * 1. Use -n <name> if set.
541 * 2. Use msa->name (avail in Stockholm or SELEX formats only)
542 * 3. If all else fails, use alignment file name without
543 * filename extension (e.g. "globins.slx" gets named "globins"
545 * If a multiple MSA database (e.g. Stockholm/Pfam),
546 * only msa->name is applied. -n is not allowed.
547 * if msa->name is unavailable, or -n was used,
548 * a fatal error is thrown.
550 * Because we can't tell whether we've got more than one
551 * alignment 'til we're on the second one, these fatal errors
552 * only happen after the first HMM has already been built.
555 printf("%-40s ... ", "Setting model name, etc.");
557 if (nali == 0) /* first (only?) HMM in file: */
559 if (setname != NULL) name = Strdup(setname);
560 else if (msa->name != NULL) name = Strdup(msa->name);
561 else name = FileTail(seqfile, TRUE);
566 Die("Oops. Wait. You can't use -n with an alignment database.");
567 else if (msa->name != NULL) name = Strdup(msa->name);
569 Die("Oops. Wait. I need name annotation on each alignment.\n");
571 Plan7SetName(hmm, name);
574 /* Transfer other information from the alignment to
575 * the HMM. This typically only works for SELEX format
576 * alignments, so these things are conditional/optional.
578 if (msa->acc != NULL) Plan7SetAccession(hmm, msa->acc);
579 if (msa->desc != NULL) Plan7SetDescription(hmm, msa->desc);
581 if (msa->flags & MSA_SET_GA)
582 { hmm->flags |= PLAN7_GA; hmm->ga1 = msa->ga1; hmm->ga2 = msa->ga2; }
583 if (msa->flags & MSA_SET_TC)
584 { hmm->flags |= PLAN7_TC; hmm->tc1 = msa->tc1; hmm->tc2 = msa->tc2; }
585 if (msa->flags & MSA_SET_NC)
586 { hmm->flags |= PLAN7_NC; hmm->nc1 = msa->nc1; hmm->nc2 = msa->nc2; }
588 /* Record some other miscellaneous information in the HMM,
589 * like how/when we built it.
591 Plan7ComlogAppend(hmm, argc, argv);
593 hmm->nseq = msa->nseq;
594 printf("done. [%s]\n", hmm->name);
596 /* Print information for the user
598 printf("\nConstructed a profile HMM (length %d)\n", hmm->M);
599 PrintPlan7Stats(stdout, hmm, dsq, msa->nseq, tr);
602 /* Configure the model for chosen algorithm
604 printf("%-40s ... ", "Finalizing model configuration");
606 switch (cfg_strategy) {
607 case P7_BASE_CONFIG: Plan7GlobalConfig(hmm); break;
608 case P7_SW_CONFIG: Plan7SWConfig(hmm, swentry, swexit); break;
609 case P7_LS_CONFIG: Plan7LSConfig(hmm); break;
610 case P7_FS_CONFIG: Plan7FSConfig(hmm, swentry, swexit); break;
611 default: Die("bogus configuration choice");
615 /* Save new HMM to disk: open a file for appending or writing.
617 printf("%-40s ... ", "Saving model to file");
619 if (do_binary) WriteBinHMM(hmmfp, hmm);
620 else WriteAscHMM(hmmfp, hmm);
623 /* the annotated alignment may be resaved */
629 printf("%-40s ... ", "Saving annotated alignment");
631 sqinfo = MSAToSqinfo(msa);
632 new_msa = P7Traces2Alignment(dsq, sqinfo, msa->wgt, msa->nseq,
635 WriteStockholm(alignfp, new_msa);
637 for (idx = 0; idx < msa->nseq; idx++)
638 FreeSequence(NULL, &(sqinfo[idx]));
643 /* Verbose output; show scores for each sequence
646 print_all_scores(stdout, hmm, dsq, msa, tr);
648 /* Clean up before moving on to next alignment
650 for (idx = 0; idx < msa->nseq; idx++) P7FreeTrace(tr[idx]);
654 Free2DArray((void **) dsq, msa->nseq);
656 if (cfp != NULL) fflush(cfp);
657 if (alignfp != NULL) fflush(alignfp);
669 if (cfp != NULL) fclose(cfp);
670 if (alignfp != NULL) fclose(alignfp);
677 /* Function: print_all_scores()
679 * Purpose: For each training sequence, print its score under
682 * Args: fp - where to print the output (usu. stdout)
683 * hmm - newly constructed HMM, with prob's.
684 * dsq - digitized unaligned training sequences.
685 * msa - alignment and associated info
686 * tr - array of tracebacks
691 print_all_scores(FILE *fp, struct plan7_s *hmm,
692 char **dsq, MSA *msa, struct p7trace_s **tr)
694 int idx; /* counter for sequences */
696 /* make sure model scores are ready */
697 P7Logoddsify(hmm, TRUE);
700 fputs("Individual training sequence scores:\n", fp);
701 /* score for each sequence */
702 for (idx = 0; idx < msa->nseq; idx++)
704 fprintf(fp, "%7.2f %-12s %s\n",
705 P7TraceScore(hmm, dsq[idx], tr[idx]),
707 (MSAGetSeqDescription(msa,idx) != NULL) ?
708 MSAGetSeqDescription(msa,idx) : "");
709 P7PrintTrace(fp, tr[idx], hmm, dsq[idx]);
716 /* Function: save_countvectors()
718 * Purpose: Save emission/transition count vectors to a file.
719 * Used for gathering the data on which to train a
720 * prior (e.g. mixture Dirichlet, etc.)
722 * The format of the file is one vector per line:
723 * M <f> <f> ...: 20 match emission counts in order AC..WY.
724 * followed by two chars of CS, CA annotation.
725 * I <f> <f> ...: 20 insert emission counts in order AC..WY.
726 * followed by two chars of CS, CA annotation.
727 * T <f> <f> ...: 7 transition counts in order TMM, TMI, TMD,
728 * TIM, TII, TDM, TDD. (see structs.h)
729 * followed by four chars of structure
730 * annotation: CS, CS of M+1; CA, CA of M+1.
732 * Args: cfp - open counts file
733 * name - name of alignment or HMM to associate with these vectors
734 * hmm - counts-based HMM
737 save_countvectors(FILE *cfp, char *name, struct plan7_s *hmm)
740 /* match emission vectors */
741 for (k = 1; k <= hmm->M; k++)
744 for (x = 0; x < Alphabet_size; x++)
745 fprintf(cfp, "%8.2f ", hmm->mat[k][x]);
747 fprintf(cfp, "%15s %6d %6d ", name, hmm->map[k], k);
748 if ((hmm->flags & PLAN7_CS) && hmm->flags & PLAN7_CA)
749 fprintf(cfp, "%c %c", hmm->cs[k], hmm->ca[k]);
754 /* insert emission vectors */
755 for (k = 1; k < hmm->M; k++)
758 for (x = 0; x < Alphabet_size; x++)
759 fprintf(cfp, "%8.2f ", hmm->ins[k][x]);
761 fprintf(cfp, "%15s %6d %6d ", name, hmm->map[k], k);
762 if ((hmm->flags & PLAN7_CS) && hmm->flags & PLAN7_CA)
763 fprintf(cfp, "%c %c", hmm->cs[k], hmm->ca[k]);
769 /* transition vectors */
770 for (k = 1; k < hmm->M; k++)
774 for (x = 0; x < 7; x++)
775 fprintf(cfp, "%8.2f ", hmm->t[k][x]);
777 fprintf(cfp, "%15s %6d %6d ", name, hmm->map[k], k);
778 if ((hmm->flags & PLAN7_CS) && hmm->flags & PLAN7_CA)
779 fprintf(cfp, "%c %c %c %c",
780 hmm->cs[k], hmm->cs[k+1],
781 hmm->ca[k], hmm->ca[k+1]);
789 /* Function: position_average_score()
790 * Date: Wed Dec 31 09:36:35 1997 [StL]
792 * Purpose: Calculate scores from tracebacks, keeping them
793 * in a position specific array. The final array
794 * is normalized position-specifically too, according
795 * to how many sequences contributed data to this
796 * position. Used for compensating for sequence
797 * fragments in ME and MD score optimization.
800 * Code related to (derived from) TraceScore().
802 * Args: hmm - HMM structure, scores valid
803 * dsq - digitized unaligned sequences
804 * wgt - weights on the sequences
805 * nseq - number of sequences
806 * tr - array of nseq tracebacks that aligns each dsq to hmm
807 * pernode - RETURN: [0]1..M array of position-specific avg scores
808 * ret_avg - RETURN: overall average full-length, one-domain score
810 * Return: 1 on success, 0 on failure.
811 * pernode is malloc'ed [0]1..M by CALLER and filled here.
814 position_average_score(struct plan7_s *hmm,
818 struct p7trace_s **tr,
822 int pos; /* position in seq */
824 int tpos; /* position in trace/state sequence */
825 float *counts; /* counts at each position */
826 float avg; /* RETURN: average overall */
827 int k; /* counter for model position */
828 int idx; /* counter for sequence number */
832 counts = MallocOrDie ((hmm->M+1) * sizeof(float));
833 FSet(pernode, hmm->M+1, 0.);
834 FSet(counts, hmm->M+1, 0.);
836 /* Loop over traces, accumulate weighted scores per position
838 for (idx = 0; idx < nseq; idx++)
839 for (tpos = 0; tpos < tr[idx]->tlen; tpos++)
841 pos = tr[idx]->pos[tpos];
842 sym = (int) dsq[idx][tr[idx]->pos[tpos]];
843 k = tr[idx]->nodeidx[tpos];
845 /* Counts: how many times did we use this model position 1..M?
848 if (tr[idx]->statetype[tpos] == STM || tr[idx]->statetype[tpos] == STD)
849 counts[k] += wgt[idx];
853 if (tr[idx]->statetype[tpos] == STM)
854 pernode[k] += wgt[idx] * Scorify(hmm->msc[sym][k]);
855 else if (tr[idx]->statetype[tpos] == STI)
856 pernode[k] += wgt[idx] * Scorify(hmm->isc[sym][k]);
858 /* Transition scores.
860 if (tr[idx]->statetype[tpos] == STM ||
861 tr[idx]->statetype[tpos] == STD ||
862 tr[idx]->statetype[tpos] == STI)
863 pernode[k] += wgt[idx] *
864 Scorify(TransitionScoreLookup(hmm, tr[idx]->statetype[tpos], tr[idx]->nodeidx[tpos],
865 tr[idx]->statetype[tpos+1],tr[idx]->nodeidx[tpos+1]));
868 /* Divide accumulated scores by accumulated weighted counts
871 for (k = 1; k <= hmm->M; k++)
873 pernode[k] /= counts[k];
883 /* Function: frag_trace_score()
884 * Date: SRE, Wed Dec 31 10:03:47 1997 [StL]
886 * Purpose: Allow MD/ME optimization to be used for alignments
887 * that include fragments and multihits -- estimate a full-length
892 * Return: "corrected" score.
895 frag_trace_score(struct plan7_s *hmm, char *dsq, struct p7trace_s *tr,
896 float *pernode, float expected)
898 float sc; /* corrected score */
899 float fragexp; /* expected score for a trace like this */
900 int tpos; /* position in trace */
902 /* get uncorrected score */
903 sc = P7TraceScore(hmm, dsq, tr);
905 /* calc expected score for trace like this */
907 for (tpos = 0; tpos < tr->tlen; tpos++)
908 if (tr->statetype[tpos] == STM || tr->statetype[tpos] == STD)
909 fragexp += pernode[tr->nodeidx[tpos]];
911 /* correct for multihits */
912 fragexp /= (float) TraceDomainNumber(tr);
914 /* extrapolate to full-length, one-hit score */
915 sc = sc * expected / fragexp;
920 /* Function: maximum_entropy()
921 * Date: SRE, Fri Jan 2 10:56:00 1998 [StL]
923 * Purpose: Optimizes a model according to maximum entropy weighting.
924 * See Krogh and Mitchison (1995).
926 * [Actually, we do minimum relative entropy, rather than
927 * maximum entropy. Same thing, though we refer to "ME"
928 * weights and models. The optimization is a steepest
929 * descents minimization of the relative entropy.]
931 * Expects to be called shortly after a Maxmodelmaker()
932 * or Handmodelmaker(), so that both a new model architecture
933 * (with MAP parameters) and fake tracebacks are available.
935 * Prints a summary of optimization progress to stdout.
937 * Args: hmm - model. allocated, set with initial MAP parameters.
938 * dsq - dealigned digitized seqs the model is based on
939 * ainfo - extra info for aseqs
940 * nseq - number of aseqs
941 * eff_nseq- effective sequence number; weights normalize up to this.
942 * prior - prior distributions for parameterizing model
943 * tr - array of fake traces for each sequence
946 * hmm changed to an ME HMM
947 * ainfo changed, contains ME weights
950 maximum_entropy(struct plan7_s *hmm, char **dsq, MSA *msa,
951 float eff_nseq, struct p7prior_s *prior, struct p7trace_s **tr)
953 float *wgt; /* current best set of ME weights */
954 float *new_wgt; /* new set of ME weights to try */
955 float *sc; /* log-odds score of each sequence */
956 float *grad; /* gradient */
957 float epsilon; /* steepness of descent */
958 float relative_entropy; /* current best relative entropy */
959 float new_entropy; /* relative entropy at new weights */
960 float last_new_entropy; /* last new_entropy we calc'ed */
961 float use_epsilon; /* current epsilon value in use */
962 int idx; /* counter over sequences */
963 int i1, i2; /* counters for iterations */
965 float converge_criterion;
966 float minw, maxw; /* min, max weight */
967 int posw, highw; /* number of positive weights */
968 float mins, maxs, avgs; /* min, max, avg score */
969 float *pernode; /* expected score per node of HMM */
970 float expscore; /* expected score of complete HMM */
971 int max_iter; /* bulletproof against infinite loop bugs */
973 epsilon = 0.2; /* works fine */
978 sc = MallocOrDie (sizeof(float) * msa->nseq);
979 wgt = MallocOrDie (sizeof(float) * msa->nseq);
980 new_wgt = MallocOrDie (sizeof(float) * msa->nseq);
981 grad = MallocOrDie (sizeof(float) * msa->nseq);
982 pernode = MallocOrDie (sizeof(float) * (hmm->M+1));
984 /* Initialization. Start with all weights == 1.0.
985 * Find relative entropy and gradient.
987 Plan7SWConfig(hmm, 0.5, 0.5);
988 P7Logoddsify(hmm, TRUE);
990 FSet(wgt, msa->nseq, 1.0);
991 position_average_score(hmm, dsq, wgt, msa->nseq, tr, pernode,&expscore);
992 for (idx = 0; idx < msa->nseq; idx++)
993 sc[idx] = frag_trace_score(hmm, dsq[idx], tr[idx], pernode, expscore);
994 relative_entropy = FSum(sc, msa->nseq) / (float) msa->nseq;
995 for (idx = 0; idx < msa->nseq; idx++)
996 grad[idx] = relative_entropy - sc[idx];
999 printf("iter avg-sc min-sc max-sc min-wgt max-wgt +wgt ++wgt rel.ent convergence\n");
1000 printf("---- ------ ------ ------ ------- ------- ---- ----- ------- -----------\n");
1001 mins = maxs = avgs = sc[0];
1002 for (idx = 1; idx < msa->nseq; idx++)
1004 if (sc[idx] < mins) mins = sc[idx];
1005 if (sc[idx] > maxs) maxs = sc[idx];
1008 avgs /= (float) msa->nseq;
1009 printf("%4d %6.1f %6.1f %6.1f %7.2f %7.2f %4d %5d %7.2f %8s\n",
1010 0, avgs, mins, maxs, 1.0, 1.0, msa->nseq, 0, relative_entropy, "-");
1013 /* Steepest descents optimization;
1014 * iterate until relative entropy converges.
1017 while (++i1 < max_iter)
1019 /* Gradient gives us a line of steepest descents.
1020 * (Roughly speaking, anyway. We actually have a constraint
1021 * that weights are nonnegative and normalized, and the
1022 * gradient doesn't take these into account.)
1023 * Look along this line, a distance of epsilon * gradient:
1024 * if new point is better, accept; if new point is worse,
1025 * move back along the line by half the distance and re-evaluate.
1027 use_epsilon = epsilon;
1028 new_entropy = relative_entropy + 1.0; /* just ensure new > old */
1031 while (new_entropy > relative_entropy && ++i2 < max_iter)
1033 last_new_entropy = new_entropy;
1035 /* find a new point in weight space */
1036 for (idx = 0; idx < msa->nseq; idx++)
1038 new_wgt[idx] = wgt[idx] + use_epsilon * grad[idx];
1039 if (new_wgt[idx] < 0.) new_wgt[idx] = 0.0;
1041 FNorm(new_wgt, msa->nseq);
1042 FScale(new_wgt, msa->nseq, (float) msa->nseq);
1044 /* Make new HMM using these weights */
1046 for (idx = 0; idx < msa->nseq; idx++)
1047 P7TraceCount(hmm, dsq[idx], new_wgt[idx], tr[idx]);
1048 P7PriorifyHMM(hmm, prior);
1051 /* Evaluate new point */
1052 Plan7SWConfig(hmm, 0.5, 0.5);
1053 P7Logoddsify(hmm, TRUE);
1054 position_average_score(hmm, dsq, new_wgt, msa->nseq, tr, pernode, &expscore);
1055 for (idx = 0; idx < msa->nseq; idx++)
1056 sc[idx] = frag_trace_score(hmm, dsq[idx], tr[idx], pernode, expscore);
1057 new_entropy = FDot(sc, new_wgt, msa->nseq) / (float) msa->nseq;
1060 /* Failsafe: we're not converging. Set epsilon to zero,
1061 * do one more round.
1063 if (use_epsilon < 1e-6) use_epsilon = 0.0;
1064 if (use_epsilon == 0.0) break;
1066 /* Failsafe: avoid infinite loops. Sometimes the
1067 new entropy converges without ever being better
1068 than the previous point, probably as a result
1069 of minor roundoff error. */
1070 if (last_new_entropy == new_entropy) break;
1072 if (i2 == max_iter) printf(" -- exceeded maximum iterations; giving up --\n");
1074 /* Evaluate convergence before accepting the new weights;
1075 * then, accept the new point and evaluate the gradient there.
1077 converge_criterion = fabs((relative_entropy-new_entropy)/relative_entropy);
1078 relative_entropy = new_entropy;
1079 FCopy(wgt, new_wgt, msa->nseq);
1080 for (idx = 0; idx < msa->nseq; idx++)
1081 grad[idx] = relative_entropy - sc[idx];
1083 /* Print some statistics about this iteration
1085 mins = maxs = avgs = sc[0];
1086 minw = maxw = wgt[0];
1087 posw = (wgt[0] > 0.0) ? 1 : 0;
1088 highw = (wgt[0] > 1.0) ? 1 : 0;
1089 for (idx = 1; idx < msa->nseq; idx++)
1091 if (sc[idx] < mins) mins = sc[idx];
1092 if (sc[idx] > maxs) maxs = sc[idx];
1093 if (wgt[idx] < minw) minw = wgt[idx];
1094 if (wgt[idx] > maxw) maxw = wgt[idx];
1095 if (wgt[idx] > 0.0) posw++;
1096 if (wgt[idx] > 1.0) highw++;
1099 avgs /= (float) msa->nseq;
1100 printf("%4d %6.1f %6.1f %6.1f %7.2f %7.2f %4d %5d %7.2f %8.5f\n",
1103 minw, maxw, posw, highw,
1104 relative_entropy, converge_criterion);
1106 if (converge_criterion < 1e-5) break;
1108 if (i1 == max_iter) printf(" -- exceeded maximum iterations; giving up --\n");
1110 /* Renormalize weights to sum to eff_nseq, and save.
1112 FNorm(wgt, msa->nseq);
1113 FScale(wgt, msa->nseq, (float) eff_nseq);
1114 FCopy(msa->wgt, wgt, msa->nseq);
1115 /* Make final HMM using these adjusted weights */
1117 for (idx = 0; idx < msa->nseq; idx++)
1118 P7TraceCount(hmm, dsq[idx], wgt[idx], tr[idx]);
1119 P7PriorifyHMM(hmm, prior);
1121 /* Cleanup and return