1 /************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 ************************************************************/
13 * Input/output of HMMs.
15 * As of HMMER 2.0, HMMs are saved by default in a tabular ASCII format
16 * as log-odds or log probabilities scaled to an integer. A binary save
17 * file format is also available which is faster to access (a
18 * consideration which might be important for HMM library applications).
19 * HMMs can be concatenated into HMM libraries.
21 * A comment on loss of accuracy. Storing a number as a scaled log
22 * probability guarantees us an error of about 0.035% or
23 * less in the retrieved probability. We are relatively invulnerable
24 * to the truncation errors which HMMER 1.8 was vulnerable to.
26 * Magic numbers (both for the ASCII and binary save formats) are used
27 * to label save files with a major version number. This simplifies the task of
28 * backwards compatibility as new versions of the program are created.
29 * Reverse but not forward compatibility is guaranteed. I.e. HMMER 2.0
30 * can read `1.7' save files, but not vice versa. Note that the major
31 * version number in the save files is NOT the version of the software
32 * that generated it; rather, the number of the last major version in which
33 * save format changed.
35 ******************************************************************
41 * struct plan7_s *hmm;
42 * char env[] = "HMMERDB"; (a la BLASTDB)
44 * hmmfp = HMMFileOpen(hmmfile, env) NULL on failure
45 * while (HMMFileRead(hmmfp, &hmm)) 0 if no more HMMs
46 * if (hmm == NULL) Die(); NULL on file parse failure
50 * HMMFileClose(hmmfp);
52 *****************************************************************
57 * struct plan7_s *hmm;
59 * WriteAscHMM(ofp, hmm); to write/append an HMM to open file
60 * or WriteBinHMM(ofp, hmm); to write/append binary format HMM to open file
62 *****************************************************************
64 * V1.0: original implementation
65 * V1.1: regularizers removed from model structure
66 * V1.7: ref and cs annotation lines added from alignment, one
67 * char per match state 1..M
68 * V1.9: null model and name added to HMM structure. ASCII format changed
69 * to compact tabular one.
70 * V2.0: Plan7. Essentially complete rewrite.
78 #include <unistd.h> /* to get SEEK_CUR definition on silly Suns */
87 /* Magic numbers identifying binary formats.
88 * Do not change the old magics! Necessary for backwards compatibility.
90 static unsigned int v10magic = 0xe8ededb1; /* v1.0 binary: "hmm1" + 0x80808080 */
91 static unsigned int v10swap = 0xb1edede8; /* byteswapped v1.0 */
92 static unsigned int v11magic = 0xe8ededb2; /* v1.1 binary: "hmm2" + 0x80808080 */
93 static unsigned int v11swap = 0xb2edede8; /* byteswapped v1.1 */
94 static unsigned int v17magic = 0xe8ededb3; /* v1.7 binary: "hmm3" + 0x80808080 */
95 static unsigned int v17swap = 0xb3edede8; /* byteswapped v1.7 */
96 static unsigned int v19magic = 0xe8ededb4; /* V1.9 binary: "hmm4" + 0x80808080 */
97 static unsigned int v19swap = 0xb4edede8; /* V1.9 binary, byteswapped */
98 static unsigned int v20magic = 0xe8ededb5; /* V2.0 binary: "hmm5" + 0x80808080 */
99 static unsigned int v20swap = 0xb5edede8; /* V2.0 binary, byteswapped */
101 /* Old HMMER 1.x file formats.
103 #define HMMER1_0B 1 /* binary HMMER 1.0 */
104 #define HMMER1_0F 2 /* flat ascii HMMER 1.0 */
105 #define HMMER1_1B 3 /* binary HMMER 1.1 */
106 #define HMMER1_1F 4 /* flat ascii HMMER 1.1 */
107 #define HMMER1_7B 5 /* binary HMMER 1.7 */
108 #define HMMER1_7F 6 /* flat ascii HMMER 1.7 */
109 #define HMMER1_9B 7 /* HMMER 1.9 binary */
110 #define HMMER1_9F 8 /* HMMER 1.9 flat ascii */
112 static int read_asc20hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
113 static int read_bin20hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
114 static int read_asc19hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
115 static int read_bin19hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
116 static int read_asc17hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
117 static int read_bin17hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
118 static int read_asc11hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
119 static int read_bin11hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
120 static int read_asc10hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
121 static int read_bin10hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);
123 static void byteswap(char *swap, int nbytes);
124 static char *prob2ascii(float p, float null);
125 static float ascii2prob(char *s, float null);
126 static void write_bin_string(FILE *fp, char *s);
127 static int read_bin_string(FILE *fp, int doswap, char **ret_s);
128 static void multiline(FILE *fp, char *pfx, char *s);
130 static struct plan9_s *read_plan9_binhmm(FILE *fp, int version, int swapped);
131 static struct plan9_s *read_plan9_aschmm(FILE *fp, int version);
133 /*****************************************************************
134 * HMM input API functions:
139 *****************************************************************/
141 /* Function: HMMFileOpen()
143 * Purpose: Open an HMM file for reading. The file may be either
144 * an index for a library of HMMs, or an HMM.
146 * Args: hmmfile - name of file
147 * env - NULL, or environment variable for HMM database.
149 * Return: Valid HMMFILE *, or NULL on failure.
152 HMMFileOpen(char *hmmfile, char *env)
158 char *dir; /* dir name in which HMM file was found */
161 hmmfp = (HMMFILE *) MallocOrDie (sizeof(HMMFILE));
163 hmmfp->parser = NULL;
164 hmmfp->is_binary = FALSE;
165 hmmfp->byteswap = FALSE;
166 hmmfp->is_seekable= TRUE; /* always; right now, an HMM must always be in a file. */
168 /* Open the file. Look in current directory.
169 * If that doesn't work, check environment var for
170 * a second possible directory (usually the location
171 * of a system-wide HMM library).
172 * Using dir name if necessary, construct correct SSI file name.
176 if ((hmmfp->f = fopen(hmmfile, "r")) != NULL)
178 ssifile = MallocOrDie(sizeof(char) * (strlen(hmmfile) + 5));
179 sprintf(ssifile, "%s.ssi", hmmfile);
181 if ((hmmfp->mode = SSIRecommendMode(hmmfile)) == -1)
182 Die("SSIRecommendMode() failed");
184 else if ((hmmfp->f = EnvFileOpen(hmmfile, env, &dir)) != NULL)
187 full = FileConcat(dir, hmmfile);
189 ssifile = MallocOrDie(sizeof(char) * (strlen(full) + strlen(hmmfile) + 5));
190 sprintf(ssifile, "%s.ssi", full);
192 if ((hmmfp->mode = SSIRecommendMode(full)) == -1)
193 Die("SSIRecommendMode() failed");
200 /* Open the SSI index file. If it doesn't exist, or it's corrupt, or
201 * some error happens, hmmfp->ssi stays NULL.
203 SQD_DPRINTF1(("Opening ssifile %s...\n", ssifile));
204 SSIOpen(ssifile, &(hmmfp->ssi));
207 /* Initialize the disk offset stuff.
209 status = SSIGetFilePosition(hmmfp->f, hmmfp->mode, &(hmmfp->offset));
210 if (status != 0) Die("SSIGetFilePosition() failed");
212 /* Check for binary or byteswapped binary format
213 * by peeking at first 4 bytes.
215 if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) {
221 if (magic == v20magic) {
222 hmmfp->parser = read_bin20hmm;
223 hmmfp->is_binary = TRUE;
226 else if (magic == v20swap) {
227 SQD_DPRINTF1(("Opened a HMMER 2.0 binary file [byteswapped]\n"));
228 hmmfp->parser = read_bin20hmm;
229 hmmfp->is_binary = TRUE;
230 hmmfp->byteswap = TRUE;
233 else if (magic == v19magic) {
234 hmmfp->parser = read_bin19hmm;
235 hmmfp->is_binary = TRUE;
238 else if (magic == v19swap) {
239 hmmfp->parser = read_bin19hmm;
240 hmmfp->is_binary = TRUE;
241 hmmfp->byteswap = TRUE;
244 else if (magic == v17magic) {
245 hmmfp->parser = read_bin17hmm;
246 hmmfp->is_binary = TRUE;
249 else if (magic == v17swap) {
250 hmmfp->parser = read_bin17hmm;
251 hmmfp->is_binary = TRUE;
252 hmmfp->byteswap = TRUE;
255 else if (magic == v11magic) {
256 hmmfp->parser = read_bin11hmm;
257 hmmfp->is_binary = TRUE;
260 else if (magic == v11swap) {
261 hmmfp->parser = read_bin11hmm;
262 hmmfp->is_binary = TRUE;
263 hmmfp->byteswap = TRUE;
266 else if (magic == v10magic) {
267 hmmfp->parser = read_bin10hmm;
268 hmmfp->is_binary = TRUE;
271 else if (magic == v10swap) {
272 hmmfp->parser = read_bin10hmm;
273 hmmfp->is_binary = TRUE;
274 hmmfp->byteswap = TRUE;
277 /* else we fall thru; it may be an ASCII file. */
279 /* If magic looks binary but we don't recognize it, choke and die.
281 if (magic & 0x80000000) {
283 %s appears to be a binary but format is not recognized\n\
284 It may be from a HMMER version more recent than yours,\n\
285 or may be a different kind of binary altogether.\n", hmmfile);
290 /* Check for ASCII format by peeking at first word.
292 if (fgets(buf, 512, hmmfp->f) == NULL) {
298 if (strncmp("HMMER2.0", buf, 8) == 0) {
299 hmmfp->parser = read_asc20hmm;
301 } else if (strncmp("HMMER v1.9", buf, 10) == 0) {
302 hmmfp->parser = read_asc19hmm;
304 } else if (strncmp("# HMM v1.7", buf, 10) == 0) {
305 hmmfp->parser = read_asc17hmm;
307 } else if (strncmp("# HMM v1.1", buf, 10) == 0) {
308 hmmfp->parser = read_asc11hmm;
310 } else if (strncmp("# HMM v1.0", buf, 10) == 0) {
311 hmmfp->parser = read_asc10hmm;
315 /* If we haven't recognized it yet, it's bogus.
321 HMMFileRead(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
324 /* Set the disk position marker. */
325 if (hmmfp->is_seekable) {
326 status = SSIGetFilePosition(hmmfp->f, hmmfp->mode, &(hmmfp->offset));
327 if (status != 0) Die("SSIGetFilePosition() failed");
329 /* Parse the HMM and return it. */
330 return (*hmmfp->parser)(hmmfp, ret_hmm);
333 HMMFileClose(HMMFILE *hmmfp)
335 if (hmmfp->f != NULL) fclose(hmmfp->f);
336 if (hmmfp->ssi != NULL) SSIClose(hmmfp->ssi);
340 HMMFileRewind(HMMFILE *hmmfp)
345 HMMFilePositionByName(HMMFILE *hmmfp, char *name)
347 SSIOFFSET offset; /* offset in hmmfile, from SSI */
348 int fh; /* ignored. */
350 if (hmmfp->ssi == NULL) return 0;
351 if (SSIGetOffsetByName(hmmfp->ssi, name, &fh, &offset) != 0) return 0;
352 if (SSISetFilePosition(hmmfp->f, &offset) != 0) return 0;
356 HMMFilePositionByIndex(HMMFILE *hmmfp, int idx)
357 { /* idx runs from 0..nhmm-1 */
358 int fh; /* file handle is ignored; only one HMM file */
359 SSIOFFSET offset; /* file position of HMM */
361 if (hmmfp->ssi == NULL) return 0;
362 if (SSIGetOffsetByNumber(hmmfp->ssi, idx, &fh, &offset) != 0) return 0;
363 if (SSISetFilePosition(hmmfp->f, &offset) != 0) return 0;
367 /*****************************************************************
372 *****************************************************************/
374 /* Function: WriteAscHMM()
376 * Purpose: Save an HMM in flat text ASCII format.
378 * Args: fp - open file for writing
382 WriteAscHMM(FILE *fp, struct plan7_s *hmm)
384 int k; /* counter for nodes */
385 int x; /* counter for symbols */
386 int ts; /* counter for state transitions */
388 fprintf(fp, "HMMER2.0 [%s]\n", RELEASE); /* magic header */
390 /* write header information
392 fprintf(fp, "NAME %s\n", hmm->name);
393 if (hmm->flags & PLAN7_ACC)
394 fprintf(fp, "ACC %s\n", hmm->acc);
395 if (hmm->flags & PLAN7_DESC)
396 fprintf(fp, "DESC %s\n", hmm->desc);
397 fprintf(fp, "LENG %d\n", hmm->M);
398 fprintf(fp, "ALPH %s\n",
399 (Alphabet_type == hmmAMINO) ? "Amino":"Nucleic");
400 fprintf(fp, "RF %s\n", (hmm->flags & PLAN7_RF) ? "yes" : "no");
401 fprintf(fp, "CS %s\n", (hmm->flags & PLAN7_CS) ? "yes" : "no");
402 fprintf(fp, "MAP %s\n", (hmm->flags & PLAN7_MAP) ? "yes" : "no");
403 multiline(fp, "COM ", hmm->comlog);
404 fprintf(fp, "NSEQ %d\n", hmm->nseq);
405 fprintf(fp, "DATE %s\n", hmm->ctime);
406 fprintf(fp, "CKSUM %d\n", hmm->checksum);
407 if (hmm->flags & PLAN7_GA)
408 fprintf(fp, "GA %.1f %.1f\n", hmm->ga1, hmm->ga2);
409 if (hmm->flags & PLAN7_TC)
410 fprintf(fp, "TC %.1f %.1f\n", hmm->tc1, hmm->tc2);
411 if (hmm->flags & PLAN7_NC)
412 fprintf(fp, "NC %.1f %.1f\n", hmm->nc1, hmm->nc2);
417 for (k = 0; k < 4; k++)
418 for (x = 0; x < 2; x++)
419 fprintf(fp, "%6s ", prob2ascii(hmm->xt[k][x], 1.0));
422 /* Save the null model first, so HMM readers can decode
423 * log odds scores on the fly. Save as log odds probabilities
424 * relative to 1/Alphabet_size (flat distribution)
426 fprintf(fp, "NULT ");
427 fprintf(fp, "%6s ", prob2ascii(hmm->p1, 1.0)); /* p1 */
428 fprintf(fp, "%6s\n", prob2ascii(1.0-hmm->p1, 1.0)); /* p2 */
430 for (x = 0; x < Alphabet_size; x++)
431 fprintf(fp, "%6s ", prob2ascii(hmm->null[x], 1/(float)(Alphabet_size)));
436 if (hmm->flags & PLAN7_STATS)
437 fprintf(fp, "EVD %10f %10f\n", hmm->mu, hmm->lambda);
442 for (x = 0; x < Alphabet_size; x++) fprintf(fp, " %c ", Alphabet[x]);
444 fprintf(fp, " %6s %6s %6s %6s %6s %6s %6s %6s %6s\n",
445 "m->m", "m->i", "m->d", "i->m", "i->i", "d->m", "d->d", "b->m", "m->e");
447 /* Print HMM parameters (main section of the save file)
449 fprintf(fp, " %6s %6s ", prob2ascii(1-hmm->tbd1, 1.0), "*");
450 fprintf(fp, "%6s\n", prob2ascii(hmm->tbd1, 1.0));
451 for (k = 1; k <= hmm->M; k++)
453 /* Line 1: k, match emissions, map */
454 fprintf(fp, " %5d ", k);
455 for (x = 0; x < Alphabet_size; x++)
456 fprintf(fp, "%6s ", prob2ascii(hmm->mat[k][x], hmm->null[x]));
457 if (hmm->flags & PLAN7_MAP) fprintf(fp, "%5d", hmm->map[k]);
459 /* Line 2: RF and insert emissions */
460 fprintf(fp, " %5c ", hmm->flags & PLAN7_RF ? hmm->rf[k] : '-');
461 for (x = 0; x < Alphabet_size; x++)
462 fprintf(fp, "%6s ", (k < hmm->M) ? prob2ascii(hmm->ins[k][x], hmm->null[x]) : "*");
464 /* Line 3: CS and transition probs */
465 fprintf(fp, " %5c ", hmm->flags & PLAN7_CS ? hmm->cs[k] : '-');
466 for (ts = 0; ts < 7; ts++)
467 fprintf(fp, "%6s ", (k < hmm->M) ? prob2ascii(hmm->t[k][ts], 1.0) : "*");
468 fprintf(fp, "%6s ", prob2ascii(hmm->begin[k], 1.0));
469 fprintf(fp, "%6s ", prob2ascii(hmm->end[k], 1.0));
476 /* Function: WriteBinHMM()
478 * Purpose: Write an HMM in binary format.
481 WriteBinHMM(FILE *fp, struct plan7_s *hmm)
485 /* ye olde magic number */
486 fwrite((char *) &(v20magic), sizeof(unsigned int), 1, fp);
490 fwrite((char *) &(hmm->flags), sizeof(int), 1, fp);
491 write_bin_string(fp, hmm->name);
492 if (hmm->flags & PLAN7_ACC) write_bin_string(fp, hmm->acc);
493 if (hmm->flags & PLAN7_DESC) write_bin_string(fp, hmm->desc);
494 fwrite((char *) &(hmm->M), sizeof(int), 1, fp);
495 fwrite((char *) &(Alphabet_type), sizeof(int), 1, fp);
496 if (hmm->flags & PLAN7_RF) fwrite((char *) hmm->rf, sizeof(char), hmm->M+1, fp);
497 if (hmm->flags & PLAN7_CS) fwrite((char *) hmm->cs, sizeof(char), hmm->M+1, fp);
498 if (hmm->flags & PLAN7_MAP) fwrite((char *) hmm->map, sizeof(int), hmm->M+1, fp);
499 write_bin_string(fp, hmm->comlog);
500 fwrite((char *) &(hmm->nseq), sizeof(int), 1, fp);
501 write_bin_string(fp, hmm->ctime);
502 fwrite((char *) &(hmm->checksum), sizeof(int), 1, fp);
503 if (hmm->flags & PLAN7_GA) {
504 fwrite((char *) &(hmm->ga1), sizeof(float), 1, fp);
505 fwrite((char *) &(hmm->ga2), sizeof(float), 1, fp);
507 if (hmm->flags & PLAN7_TC) {
508 fwrite((char *) &(hmm->tc1), sizeof(float), 1, fp);
509 fwrite((char *) &(hmm->tc2), sizeof(float), 1, fp);
511 if (hmm->flags & PLAN7_NC) {
512 fwrite((char *) &(hmm->nc1), sizeof(float), 1, fp);
513 fwrite((char *) &(hmm->nc2), sizeof(float), 1, fp);
517 for (k = 0; k < 4; k++)
518 fwrite((char *) hmm->xt[k], sizeof(float), 2, fp);
521 fwrite((char *)&(hmm->p1), sizeof(float), 1, fp);
522 fwrite((char *) hmm->null, sizeof(float), Alphabet_size, fp);
525 if (hmm->flags & PLAN7_STATS) {
526 fwrite((char *) &(hmm->mu), sizeof(float), 1, fp);
527 fwrite((char *) &(hmm->lambda), sizeof(float), 1, fp);
530 /* entry/exit probabilities
532 fwrite((char *)&(hmm->tbd1),sizeof(float), 1, fp);
533 fwrite((char *) hmm->begin, sizeof(float), hmm->M+1, fp);
534 fwrite((char *) hmm->end, sizeof(float), hmm->M+1, fp);
538 for (k = 1; k <= hmm->M; k++)
539 fwrite((char *) hmm->mat[k], sizeof(float), Alphabet_size, fp);
540 for (k = 1; k < hmm->M; k++)
541 fwrite((char *) hmm->ins[k], sizeof(float), Alphabet_size, fp);
542 for (k = 1; k < hmm->M; k++)
543 fwrite((char *) hmm->t[k], sizeof(float), 7, fp);
547 /*****************************************************************
549 * Internal: HMM file parsers for various releases of HMMER.
551 * read_{asc,bin}xxhmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
553 * Upon return, *ret_hmm is an allocated Plan7 HMM.
554 * Return 0 if no more HMMs in the file (normal).
555 * Return 1 and *ret_hmm = something if we got an HMM (normal)
556 * Return 1 if an error occurs (meaning "I tried to
557 * read something...") and *ret_hmm == NULL (meaning
558 * "...but it wasn't an HMM"). I know, this is a funny
559 * way to handle errors.
561 *****************************************************************/
564 read_asc20hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
572 int atype; /* alphabet type, hmmAMINO or hmmNUCLEIC */
575 if (feof(hmmfp->f) || fgets(buffer, 512, hmmfp->f) == NULL) return 0;
576 if (strncmp(buffer, "HMMER2.0", 8) != 0) goto FAILURE;
578 /* Get the header information: tag/value pairs in any order,
579 * ignore unknown tags, stop when "HMM" is reached (signaling
580 * start of main model)
582 hmm = AllocPlan7Shell();
584 while (fgets(buffer, 512, hmmfp->f) != NULL) {
585 if (strncmp(buffer, "NAME ", 5) == 0) Plan7SetName(hmm, buffer+6);
586 else if (strncmp(buffer, "ACC ", 5) == 0) Plan7SetAccession(hmm, buffer+6);
587 else if (strncmp(buffer, "DESC ", 5) == 0) Plan7SetDescription(hmm, buffer+6);
588 else if (strncmp(buffer, "LENG ", 5) == 0) M = atoi(buffer+6);
589 else if (strncmp(buffer, "NSEQ ", 5) == 0) hmm->nseq = atoi(buffer+6);
590 else if (strncmp(buffer, "ALPH ", 5) == 0)
591 { /* Alphabet type */
593 if (strncmp(buffer+6, "AMINO", 5) == 0) atype = hmmAMINO;
594 else if (strncmp(buffer+6, "NUCLEIC", 7) == 0) atype = hmmNUCLEIC;
597 if (Alphabet_type == hmmNOTSETYET) SetAlphabet(atype);
598 else if (atype != Alphabet_type)
599 Die("Alphabet mismatch error.\nI thought we were working with %s, but tried to read a %s HMM.\n", AlphabetType2String(Alphabet_type), AlphabetType2String(atype));
601 else if (strncmp(buffer, "RF ", 5) == 0)
602 { /* Reference annotation present? */
603 if (sre_toupper(*(buffer+6)) == 'Y') hmm->flags |= PLAN7_RF;
605 else if (strncmp(buffer, "CS ", 5) == 0)
606 { /* Consensus annotation present? */
607 if (sre_toupper(*(buffer+6)) == 'Y') hmm->flags |= PLAN7_CS;
609 else if (strncmp(buffer, "MAP ", 5) == 0)
610 { /* Map annotation present? */
611 if (sre_toupper(*(buffer+6)) == 'Y') hmm->flags |= PLAN7_MAP;
613 else if (strncmp(buffer, "COM ", 5) == 0)
614 { /* Command line log */
615 StringChop(buffer+6);
616 if (hmm->comlog == NULL)
617 hmm->comlog = Strdup(buffer+6);
620 hmm->comlog = ReallocOrDie(hmm->comlog, sizeof(char *) *
621 (strlen(hmm->comlog) + 1 + strlen(buffer+6)));
622 strcat(hmm->comlog, "\n");
623 strcat(hmm->comlog, buffer+6);
626 else if (strncmp(buffer, "DATE ", 5) == 0)
627 { /* Date file created */
628 StringChop(buffer+6);
629 hmm->ctime= Strdup(buffer+6);
631 else if (strncmp(buffer, "GA ", 5) == 0)
633 if ((s = strtok(buffer+6, " \t\n")) == NULL) goto FAILURE;
635 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
637 hmm->flags |= PLAN7_GA;
639 else if (strncmp(buffer, "TC ", 5) == 0)
641 if ((s = strtok(buffer+6, " \t\n")) == NULL) goto FAILURE;
643 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
645 hmm->flags |= PLAN7_TC;
647 else if (strncmp(buffer, "NC ", 5) == 0)
649 if ((s = strtok(buffer+6, " \t\n")) == NULL) goto FAILURE;
651 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
653 hmm->flags |= PLAN7_NC;
655 else if (strncmp(buffer, "XT ", 5) == 0)
656 { /* Special transition section */
657 if ((s = strtok(buffer+6, " \t\n")) == NULL) goto FAILURE;
658 for (k = 0; k < 4; k++)
659 for (x = 0; x < 2; x++)
661 if (s == NULL) goto FAILURE;
662 hmm->xt[k][x] = ascii2prob(s, 1.0);
663 s = strtok(NULL, " \t\n");
666 else if (strncmp(buffer, "NULT ", 5) == 0)
667 { /* Null model transitions */
668 if ((s = strtok(buffer+6, " \t\n")) == NULL) goto FAILURE;
669 hmm->p1 = ascii2prob(s, 1.);
670 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
671 hmm->p1 = hmm->p1 / (hmm->p1 + ascii2prob(s, 1.0));
673 else if (strncmp(buffer, "NULE ", 5) == 0)
674 { /* Null model emissions */
675 if (Alphabet_type == hmmNOTSETYET)
676 Die("ALPH must precede NULE in HMM save files");
677 s = strtok(buffer+6, " \t\n");
678 for (x = 0; x < Alphabet_size; x++) {
679 if (s == NULL) goto FAILURE;
680 hmm->null[x] = ascii2prob(s, 1./(float)Alphabet_size);
681 s = strtok(NULL, " \t\n");
684 else if (strncmp(buffer, "EVD ", 5) == 0)
685 { /* EVD parameters */
686 hmm->flags |= PLAN7_STATS;
687 if ((s = strtok(buffer+6, " \t\n")) == NULL) goto FAILURE;
689 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
690 hmm->lambda = atof(s);
692 else if (strncmp(buffer, "CKSUM", 5) == 0) hmm->checksum = atoi(buffer+6);
693 else if (strncmp(buffer, "HMM ", 5) == 0) break;
696 /* partial check for mandatory fields */
697 if (feof(hmmfp->f)) goto FAILURE;
698 if (M < 1) goto FAILURE;
699 if (hmm->name == NULL) goto FAILURE;
700 if (Alphabet_type == hmmNOTSETYET) goto FAILURE;
702 /* Main model section. Read as integer log odds, convert
705 AllocPlan7Body(hmm, M);
706 /* skip an annotation line */
707 if (fgets(buffer, 512, hmmfp->f) == NULL) goto FAILURE;
708 /* parse tbd1 line */
709 if (fgets(buffer, 512, hmmfp->f) == NULL) goto FAILURE;
710 if ((s = strtok(buffer, " \t\n")) == NULL) goto FAILURE;
711 p = ascii2prob(s, 1.0);
712 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
713 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
714 hmm->tbd1 = ascii2prob(s, 1.0);
715 hmm->tbd1 = hmm->tbd1 / (p + hmm->tbd1);
718 for (k = 1; k <= hmm->M; k++) {
719 /* Line 1: k, match emissions, map */
720 if (fgets(buffer, 512, hmmfp->f) == NULL) goto FAILURE;
721 if ((s = strtok(buffer, " \t\n")) == NULL) goto FAILURE;
722 if (atoi(s) != k) goto FAILURE;
723 for (x = 0; x < Alphabet_size; x++) {
724 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
725 hmm->mat[k][x] = ascii2prob(s, hmm->null[x]);
727 if (hmm->flags & PLAN7_MAP) {
728 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
729 hmm->map[k] = atoi(s);
731 /* Line 2: RF and insert emissions */
732 if (fgets(buffer, 512, hmmfp->f) == NULL) goto FAILURE;
733 if ((s = strtok(buffer, " \t\n")) == NULL) goto FAILURE;
734 if (hmm->flags & PLAN7_RF) hmm->rf[k] = *s;
736 for (x = 0; x < Alphabet_size; x++) {
737 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
738 hmm->ins[k][x] = ascii2prob(s, hmm->null[x]);
741 /* Line 3: CS and transitions */
742 if (fgets(buffer, 512, hmmfp->f) == NULL) goto FAILURE;
743 if ((s = strtok(buffer, " \t\n")) == NULL) goto FAILURE;
744 if (hmm->flags & PLAN7_CS) hmm->cs[k] = *s;
745 for (x = 0; x < 7; x++) {
746 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
747 if (k < hmm->M) hmm->t[k][x] = ascii2prob(s, 1.0);
749 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
750 hmm->begin[k] = ascii2prob(s, 1.0);
751 if ((s = strtok(NULL, " \t\n")) == NULL) goto FAILURE;
752 hmm->end[k] = ascii2prob(s, 1.0);
754 } /* end loop over main model */
756 /* Advance to record separator
758 while (fgets(buffer, 512, hmmfp->f) != NULL)
759 if (strncmp(buffer, "//", 2) == 0) break;
761 Plan7Renormalize(hmm); /* Paracel reported bug 6/11/99 */
763 /* Set flags and return
765 hmm->flags |= PLAN7_HASPROB; /* probabilities are valid */
766 hmm->flags &= ~PLAN7_HASBITS; /* scores are not valid */
772 if (hmm != NULL) FreePlan7(hmm);
779 read_bin20hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
790 if (feof(hmmfp->f)) return 0;
791 if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) return 0;
793 if (hmmfp->byteswap) byteswap((char *)&magic, sizeof(unsigned int));
794 if (magic != v20magic) goto FAILURE;
795 /* allocate HMM shell for header info */
796 hmm = AllocPlan7Shell();
798 if (! fread((char *) &(hmm->flags), sizeof(int), 1, hmmfp->f)) goto FAILURE;
799 if (hmmfp->byteswap) byteswap((char *)&(hmm->flags), sizeof(int));
801 if (! read_bin_string(hmmfp->f, hmmfp->byteswap, &(hmm->name))) goto FAILURE;
803 /* optional accession */
804 if ((hmm->flags & PLAN7_ACC) &&
805 ! read_bin_string(hmmfp->f, hmmfp->byteswap, &(hmm->acc))) goto FAILURE;
806 /* optional description */
807 if ((hmm->flags & PLAN7_DESC) &&
808 ! read_bin_string(hmmfp->f, hmmfp->byteswap, &(hmm->desc))) goto FAILURE;
809 /* length of model */
810 if (! fread((char *) &hmm->M, sizeof(int), 1, hmmfp->f)) goto FAILURE;
811 if (hmmfp->byteswap) byteswap((char *)&(hmm->M), sizeof(int));
813 if (! fread((char *) &type, sizeof(int), 1, hmmfp->f)) goto FAILURE;
814 if (hmmfp->byteswap) byteswap((char *)&type, sizeof(int));
815 if (Alphabet_type == hmmNOTSETYET) SetAlphabet(type);
816 else if (type != Alphabet_type)
817 Die("Alphabet mismatch error.\nI thought we were working with %s, but tried to read a %s HMM.\n", AlphabetType2String(Alphabet_type), AlphabetType2String(type));
819 /* now allocate for rest of model */
820 AllocPlan7Body(hmm, hmm->M);
822 /* optional #=RF alignment annotation */
823 if ((hmm->flags & PLAN7_RF) &&
824 !fread((char *) hmm->rf, sizeof(char), hmm->M+1, hmmfp->f)) goto FAILURE;
825 hmm->rf[hmm->M+1] = '\0';
826 /* optional #=CS alignment annotation */
827 if ((hmm->flags & PLAN7_CS) &&
828 !fread((char *) hmm->cs, sizeof(char), hmm->M+1, hmmfp->f)) goto FAILURE;
829 hmm->cs[hmm->M+1] = '\0';
830 /* optional alignment map annotation */
831 if ((hmm->flags & PLAN7_MAP) &&
832 !fread((char *) hmm->map, sizeof(int), hmm->M+1, hmmfp->f)) goto FAILURE;
834 for (k = 1; k <= hmm->M; k++)
835 byteswap((char*)&(hmm->map[k]), sizeof(int));
836 /* command line log */
837 if (!read_bin_string(hmmfp->f, hmmfp->byteswap, &(hmm->comlog))) goto FAILURE;
839 if (!fread((char *) &(hmm->nseq),sizeof(int), 1, hmmfp->f)) goto FAILURE;
840 if (hmmfp->byteswap) byteswap((char *)&(hmm->nseq), sizeof(int));
842 if (!read_bin_string(hmmfp->f, hmmfp->byteswap, &(hmm->ctime))) goto FAILURE;
844 if (!fread((char *) &(hmm->checksum),sizeof(int), 1, hmmfp->f)) goto FAILURE;
845 if (hmmfp->byteswap) byteswap((char *)&(hmm->checksum), sizeof(int));
847 /* Pfam gathering thresholds */
848 if (hmm->flags & PLAN7_GA) {
849 if (! fread((char *) &(hmm->ga1), sizeof(float), 1, hmmfp->f)) goto FAILURE;
850 if (! fread((char *) &(hmm->ga2), sizeof(float), 1, hmmfp->f)) goto FAILURE;
851 if (hmmfp->byteswap) {
852 byteswap((char *) &(hmm->ga1), sizeof(float));
853 byteswap((char *) &(hmm->ga2), sizeof(float));
856 /* Pfam trusted cutoffs */
857 if (hmm->flags & PLAN7_TC) {
858 if (! fread((char *) &(hmm->tc1), sizeof(float), 1, hmmfp->f)) goto FAILURE;
859 if (! fread((char *) &(hmm->tc2), sizeof(float), 1, hmmfp->f)) goto FAILURE;
860 if (hmmfp->byteswap) {
861 byteswap((char *) &(hmm->tc1), sizeof(float));
862 byteswap((char *) &(hmm->tc2), sizeof(float));
865 /* Pfam noise cutoffs */
866 if (hmm->flags & PLAN7_NC) {
867 if (! fread((char *) &(hmm->nc1), sizeof(float), 1, hmmfp->f)) goto FAILURE;
868 if (! fread((char *) &(hmm->nc2), sizeof(float), 1, hmmfp->f)) goto FAILURE;
869 if (hmmfp->byteswap) {
870 byteswap((char *) &(hmm->nc1), sizeof(float));
871 byteswap((char *) &(hmm->nc2), sizeof(float));
876 for (k = 0; k < 4; k++)
878 if (! fread((char *) hmm->xt[k], sizeof(float), 2, hmmfp->f)) goto FAILURE;
879 if (hmmfp->byteswap) {
880 for (x = 0; x < 2; x++)
881 byteswap((char *)&(hmm->xt[k][x]), sizeof(float));
886 if (!fread((char *) &(hmm->p1),sizeof(float), 1, hmmfp->f)) goto FAILURE;
887 if (!fread((char *)hmm->null,sizeof(float),Alphabet_size,hmmfp->f))goto FAILURE;
890 if (hmm->flags & PLAN7_STATS) {
891 if (! fread((char *) &(hmm->mu), sizeof(float), 1, hmmfp->f))goto FAILURE;
892 if (! fread((char *) &(hmm->lambda), sizeof(float), 1, hmmfp->f))goto FAILURE;
894 if (hmmfp->byteswap) {
895 byteswap((char *)&(hmm->mu), sizeof(float));
896 byteswap((char *)&(hmm->lambda), sizeof(float));
900 /* entry/exit probabilities
902 if (! fread((char *)&(hmm->tbd1), sizeof(float), 1, hmmfp->f)) goto FAILURE;
903 if (! fread((char *) hmm->begin, sizeof(float), hmm->M+1, hmmfp->f)) goto FAILURE;
904 if (! fread((char *) hmm->end, sizeof(float), hmm->M+1, hmmfp->f)) goto FAILURE;
907 for (k = 1; k <= hmm->M; k++)
908 if (! fread((char *) hmm->mat[k], sizeof(float), Alphabet_size, hmmfp->f)) goto FAILURE;
909 for (k = 1; k < hmm->M; k++)
910 if (! fread((char *) hmm->ins[k], sizeof(float), Alphabet_size, hmmfp->f)) goto FAILURE;
911 for (k = 1; k < hmm->M; k++)
912 if (! fread((char *) hmm->t[k], sizeof(float), 7, hmmfp->f)) goto FAILURE;
916 if (hmmfp->byteswap) {
917 for (x = 0; x < Alphabet_size; x++)
918 byteswap((char *) &(hmm->null[x]), sizeof(float));
919 byteswap((char *)&(hmm->p1), sizeof(float));
920 byteswap((char *)&(hmm->tbd1), sizeof(float));
922 for (k = 1; k <= hmm->M; k++)
924 for (x = 0; x < Alphabet_size; x++)
925 byteswap((char *)&(hmm->mat[k][x]), sizeof(float));
927 for (x = 0; x < Alphabet_size; x++)
928 byteswap((char *)&(hmm->ins[k][x]), sizeof(float));
929 byteswap((char *)&(hmm->begin[k]), sizeof(float));
930 byteswap((char *)&(hmm->end[k]), sizeof(float));
932 for (x = 0; x < 7; x++)
933 byteswap((char *)&(hmm->t[k][x]), sizeof(float));
938 /* set flags and return
940 hmm->flags |= PLAN7_HASPROB; /* probabilities are valid */
941 hmm->flags &= ~PLAN7_HASBITS; /* scores are not yet valid */
946 if (hmm != NULL) FreePlan7(hmm);
955 /* Function: read_asc19hmm()
956 * Date: Tue Apr 7 17:11:29 1998 [StL]
958 * Purpose: Read ASCII-format tabular (1.9 and later) save files.
960 * HMMER 1.9 was only used internally at WashU, as far as
961 * I know, so this code shouldn't be terribly important
965 read_asc19hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
971 int M; /* length of model */
972 int k; /* state number */
973 int x; /* symbol number */
974 int atype; /* Alphabet type */
978 if (feof(fp) || fgets(buffer, 512, fp) == NULL) return 0;
979 if (strncmp(buffer, "HMMER v1.9", 10) != 0) goto FAILURE;
981 hmm = AllocPlan7Shell();
982 /* read M from first line */
983 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; M = atoi(s); /* model length */
984 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; /* ignore alphabet size */
985 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; Plan7SetName(hmm, s); /* name */
986 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE; /* alphabet type */
988 if (strcmp(s, "AMINO") == 0) atype = hmmAMINO;
989 else if (strcmp(s, "NUCLEIC") == 0) atype = hmmNUCLEIC;
992 if (Alphabet_type == hmmNOTSETYET) SetAlphabet(atype);
993 else if (atype != Alphabet_type)
994 Die("Alphabet mismatch error.\nI thought we were working with %s, but tried to read a %s HMM.\n", AlphabetType2String(Alphabet_type), AlphabetType2String(atype));
996 /* read alphabet, make sure it's Plan7-compatible... */
997 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
998 if (strncmp(s, Alphabet, Alphabet_size) != 0) goto FAILURE;
1000 /* whether we have ref, cs info */
1001 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
1002 if (strcmp(s, "yes") == 0) hmm->flags |= PLAN7_RF;
1003 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
1004 if (strcmp(s, "yes") == 0) hmm->flags |= PLAN7_CS;
1006 /* null model. 1.9 has emissions only. invent transitions. */
1007 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
1008 if (strcmp(s, "null") != 0) goto FAILURE;
1009 for (x = 0; x < Alphabet_size; x++) {
1010 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
1011 hmm->null[x] = ascii2prob(s, 1.0);
1013 hmm->p1 = (Alphabet_type == hmmAMINO)? 350./351. : 1000./1001.;
1015 /* Done with header; check some stuff before proceeding
1017 if (feof(hmmfp->f)) goto FAILURE;
1018 if (M < 1) goto FAILURE;
1019 if (hmm->name == NULL) goto FAILURE;
1020 if (Alphabet_type == hmmNOTSETYET) goto FAILURE;
1022 /* Allocate the model. Set up the probabilities that Plan9
1025 AllocPlan7Body(hmm, M);
1029 /* The zero row has: 4 or 20 unused scores for nonexistent M0 state
1030 * then: B->M, tbd1, a B->I that Plan7 doesn't have;
1031 * three unused D-> transitions; then three I0 transitions that Plan7 doesn't have;
1032 * then two unused rf, cs annotations.
1034 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; /* position index ignored */
1035 for (x = 0; x < Alphabet_size; x++)
1036 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; /* emissions ignored */
1037 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
1038 hmm->begin[1] = ascii2prob(s, 1.0);
1039 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
1040 hmm->tbd1 = ascii2prob(s, 1.0);
1042 hmm->begin[1] = hmm->begin[1] / (hmm->begin[1] + hmm->tbd1);
1043 hmm->tbd1 = hmm->tbd1 / (hmm->begin[1] + hmm->tbd1);
1044 /* skip rest of line, seven integer fields, two char fields */
1045 for (x = 0; x < 7; x++)
1046 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
1047 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
1048 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
1050 /* main model: table of emissions, transitions, annotation */
1051 for (k = 1; k <= hmm->M; k++)
1053 /* position index ignored */
1054 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
1055 /* match emissions */
1056 for (x = 0; x < Alphabet_size; x++) {
1057 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
1058 hmm->mat[k][x] = ascii2prob(s, hmm->null[x]);
1060 /* nine transitions; two are ignored */
1061 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
1062 if (k < hmm->M) hmm->t[k][TMM] = ascii2prob(s, 1.0);
1063 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
1064 if (k < hmm->M) hmm->t[k][TMD] = (k == hmm->M) ? 0.0 : ascii2prob(s, 1.0);
1065 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
1066 if (k < hmm->M) hmm->t[k][TMI] = ascii2prob(s, 1.0);
1068 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
1069 if (k < hmm->M) hmm->t[k][TDM] = ascii2prob(s, 1.0);
1070 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
1071 if (k < hmm->M) hmm->t[k][TDD] = (k == hmm->M) ? 0.0 : ascii2prob(s, 1.0);
1072 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;/* TDI ignored. */
1074 /* no insert state at k == M, be careful */
1075 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
1076 if (k < hmm->M) hmm->t[k][TIM] = ascii2prob(s, 1.0);
1077 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; /* TID ignored. */
1078 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
1079 if (k < hmm->M) hmm->t[k][TII] = ascii2prob(s, 1.0);
1082 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
1083 if (hmm->flags & PLAN7_RF) hmm->rf[k] = *s;
1084 if ((s = Getword(fp, sqdARG_STRING)) == NULL) goto FAILURE;
1085 if (hmm->flags & PLAN7_CS) hmm->cs[k] = *s;
1087 /* table of insert emissions;
1088 * Plan7 has no insert state at 0 or M */
1089 for (k = 0; k <= hmm->M; k++)
1091 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE; /* position index ignored */
1092 for (x = 0; x < Alphabet_size; x++) {
1093 if ((s = Getword(fp, sqdARG_INT)) == NULL) goto FAILURE;
1094 if (k > 0 && k < hmm->M)
1095 hmm->ins[k][x] = ascii2prob(s, hmm->null[x]);
1099 /* Set flags and return
1101 hmm->flags |= PLAN7_HASPROB; /* probabilities are valid */
1102 hmm->flags &= ~PLAN7_HASBITS; /* scores are not valid */
1103 Plan7Renormalize(hmm);
1104 hmm->comlog = Strdup("[converted from an old Plan9 HMM]");
1110 if (hmm != NULL) FreePlan7(hmm);
1116 read_bin19hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
1119 struct plan7_s *hmm; /* plan7 HMM */
1120 struct plan9_s *p9hmm; /* old style 1.x HMM */
1122 /* Read the magic number; if we don't see it, then we
1123 * must be out of data in the file.
1125 if (feof(hmmfp->f)) return 0;
1126 if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) return 0;
1128 p9hmm = read_plan9_binhmm(hmmfp->f, HMMER1_9B, hmmfp->byteswap);
1129 if (p9hmm == NULL) { *ret_hmm = NULL; return 1; }
1131 Plan9toPlan7(p9hmm, &hmm);
1133 hmm->comlog = Strdup("[converted from an old Plan9 HMM]");
1141 read_asc17hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
1143 struct plan7_s *hmm; /* plan7 HMM */
1144 struct plan9_s *p9hmm; /* old style 1.x HMM */
1147 /* Read the magic header; if we don't see it, then
1148 * we must be out of data in the file.
1150 if (feof(hmmfp->f) || fgets(buffer, 512, hmmfp->f) == NULL) return 0;
1152 p9hmm = read_plan9_aschmm(hmmfp->f, HMMER1_7F);
1153 if (p9hmm == NULL) { *ret_hmm = NULL; return 1; }
1155 Plan9toPlan7(p9hmm, &hmm);
1157 hmm->comlog = Strdup("[converted from an old Plan9 HMM]");
1161 Plan7Renormalize(hmm);
1167 read_bin17hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
1170 struct plan7_s *hmm; /* plan7 HMM */
1171 struct plan9_s *p9hmm; /* old style 1.x HMM */
1173 /* Read the magic number; if we don't see it, then we
1174 * must be out of data in the file.
1176 if (feof(hmmfp->f)) return 0;
1177 if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) return 0;
1179 p9hmm = read_plan9_binhmm(hmmfp->f, HMMER1_7B, hmmfp->byteswap);
1180 if (p9hmm == NULL) { *ret_hmm = NULL; return 1; }
1182 Plan9toPlan7(p9hmm, &hmm);
1184 hmm->comlog = Strdup("[converted from an old Plan9 HMM]");
1193 read_asc11hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
1195 Die("1.1 ASCII HMMs unsupported");
1199 read_bin11hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
1202 struct plan7_s *hmm; /* plan7 HMM */
1203 struct plan9_s *p9hmm; /* old style 1.x HMM */
1205 /* Read the magic number; if we don't see it, then we
1206 * must be out of data in the file.
1208 if (feof(hmmfp->f)) return 0;
1209 if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) return 0;
1211 p9hmm = read_plan9_binhmm(hmmfp->f, HMMER1_1B, hmmfp->byteswap);
1212 if (p9hmm == NULL) { *ret_hmm = NULL; return 1; }
1214 Plan9toPlan7(p9hmm, &hmm);
1216 hmm->comlog = Strdup("[converted from an old Plan9 HMM]");
1225 read_asc10hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
1227 Die("1.0 ASCII HMMs unsupported");
1232 read_bin10hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm)
1235 struct plan7_s *hmm; /* plan7 HMM */
1236 struct plan9_s *p9hmm; /* old style 1.x HMM */
1238 /* Read the magic number; if we don't see it, then we
1239 * must be out of data in the file.
1241 if (feof(hmmfp->f)) return 0;
1242 if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) return 0;
1244 p9hmm = read_plan9_binhmm(hmmfp->f, HMMER1_0B, hmmfp->byteswap);
1245 if (p9hmm == NULL) { *ret_hmm = NULL; return 1; }
1247 Plan9toPlan7(p9hmm, &hmm);
1249 hmm->comlog = Strdup("[converted from an old Plan9 HMM]");
1257 /*****************************************************************
1258 * Some miscellaneous utility functions
1259 *****************************************************************/
1261 /* Function: prob2ascii()
1263 * Purpose: Format a probability for output to an ASCII save
1264 * file. Returns a ptr to a static internal buffer.
1268 prob2ascii(float p, float null)
1270 static char buffer[8];
1272 if (p == 0.0) return "*";
1273 sprintf(buffer, "%6d", Prob2Score(p, null));
1278 /* Function: ascii2prob()
1280 * Purpose: Convert a saved string back to a probability.
1283 ascii2prob(char *s, float null)
1285 return (*s == '*') ? 0. : Score2Prob(atoi(s), null);
1288 /* Function: byteswap()
1290 * Purpose: Swap between big-endian and little-endian.
1292 * int foo = 0x12345678;
1293 * byteswap((char *) &foo, sizeof(int));
1294 * printf("%x\n", foo)
1297 * I don't fully understand byte-swapping issues.
1298 * However, I have tested this on chars through floats,
1299 * on various machines:
1300 * SGI IRIX 4.0.5, SunOS 4.1.3, DEC Alpha OSF/1, Alliant
1302 * Note: this is only a partial solution to the problem of
1303 * binary file portability. 32 bit integers are assumed by HMMER,
1304 * for instance. This should be true for all UNIX, VAX, and WinNT
1305 * platforms, I believe.
1307 * Date: Sun Feb 12 10:26:22 1995
1310 byteswap(char *swap, int nbytes)
1315 for (x = 0; x < nbytes / 2; x++)
1317 byte = swap[nbytes - x - 1];
1318 swap[nbytes - x - 1] = swap[x];
1323 /* Function: write_bin_string()
1324 * Date: SRE, Wed Oct 29 13:49:27 1997 [TWA 721 over Canada]
1326 * Purpose: Write a string in binary save format: an integer
1327 * for the string length (including \0), followed by
1331 write_bin_string(FILE *fp, char *s)
1336 len = strlen(s) + 1;
1337 fwrite((char *) &len, sizeof(int), 1, fp);
1338 fwrite((char *) s, sizeof(char), len, fp);
1343 fwrite((char *) &len, sizeof(int), 1, fp);
1347 /* Function: read_bin_string()
1348 * Date: SRE, Wed Oct 29 14:03:23 1997 [TWA 721]
1350 * Purpose: Read in a string from a binary file, where
1351 * the first integer is the length (including '\0').
1353 * Args: fp - FILE to read from
1354 * doswap - TRUE to byteswap
1355 * ret_s - string to read into
1357 * Return: 0 on failure. ret_s is malloc'ed here.
1360 read_bin_string(FILE *fp, int doswap, char **ret_s)
1365 if (! fread((char *) &len, sizeof(int), 1, fp)) return 0;
1366 if (doswap) byteswap((char *)&len, sizeof(int));
1367 s = MallocOrDie (sizeof(char) * (len));
1368 if (! fread((char *) s, sizeof(char), len, fp))
1378 /* Function: multiline()
1379 * Date: Mon Jan 5 14:57:50 1998 [StL]
1381 * Purpose: Given a record (like the comlog) that contains
1382 * multiple lines, print it as multiple lines with
1383 * a given prefix. e.g.:
1385 * given: "COM ", "foo\nbar\nbaz"
1391 * Used to print the command log to ASCII save files.
1393 * Args: fp: FILE to print to
1394 * pfx: prefix for each line
1395 * s: line to break up and print; tolerates a NULL
1400 multiline(FILE *fp, char *pfx, char *s)
1405 if (s == NULL) return;
1407 sptr = strtok(buf, "\n");
1408 while (sptr != NULL)
1410 fprintf(fp, "%s%s\n", pfx, sptr);
1411 sptr = strtok(NULL, "\n");
1417 /*****************************************************************
1418 * HMMER 1.x save file reading functions, modified from the
1420 *****************************************************************/
1423 /* Function: read_plan9_binhmm()
1425 * Read old (Plan9) binary HMM save files from HMMER 1.9 and earlier.
1426 * V1.0 saved regularizer and sympvec info, which V1.1 ignores.
1427 * V1.7 and later may include optional ref, cs annotation lines.
1428 * V1.9 added name, null model.
1430 * Returns pointer to the HMM on success; NULL
1431 * on failure. Sets global alphabet information based on
1432 * whether it reads 4 or 20 as alphabet size (don't rely
1433 * on ancient HMMER macro definitions).
1435 static struct plan9_s *
1436 read_plan9_binhmm(FILE *fp, int version, int swapped)
1438 struct plan9_s *hmm;
1439 int M; /* length of model */
1440 int k; /* state number */
1441 int x; /* symbol or transition number */
1442 int len; /* length of variable length string */
1443 int asize; /* alphabet size */
1444 int atype; /* alphabet type (read but ignored) */
1445 char abet[20]; /* alphabet (read but ignored) */
1447 /* read M and alphabet size */
1448 if (! fread((char *) &(M), sizeof(int), 1, fp)) return NULL;
1449 if (! fread((char *) &asize, sizeof(int), 1, fp)) return NULL;
1451 byteswap((char *) &M, sizeof(int));
1452 byteswap((char *) &asize, sizeof(int));
1455 /* Set global alphabet information
1457 if (asize == 4) atype = hmmNUCLEIC;
1458 else if (asize == 20) atype = hmmAMINO;
1459 else Die("A nonbiological alphabet size of %d; so I can't convert plan9 to plan7", asize);
1460 if (Alphabet_type == hmmNOTSETYET) SetAlphabet(atype);
1461 else if (atype != Alphabet_type)
1462 Die("Alphabet mismatch error.\nI thought we were working with %s, but tried to read a %s HMM.\n", AlphabetType2String(Alphabet_type), AlphabetType2String(atype));
1464 /* now, create space for hmm */
1465 if ((hmm = P9AllocHMM(M)) == NULL)
1466 Die("malloc failed for reading hmm in\n");
1468 /* version 1.9+ files have a name */
1469 if (version == HMMER1_9B) {
1470 if (! fread((char *) &len, sizeof(int), 1, fp)) return NULL;
1471 if (swapped) byteswap((char *) &len, sizeof(int));
1472 hmm->name = (char *) ReallocOrDie (hmm->name, sizeof(char) * (len+1));
1473 if (! fread((char *) hmm->name, sizeof(char), len, fp)) return NULL;
1474 hmm->name[len] = '\0';
1477 /* read alphabet_type and alphabet, but ignore: we've already set them */
1478 if (! fread((char *) &atype, sizeof(int), 1, fp)) return NULL;
1479 if (! fread((char *) abet, sizeof(char), Alphabet_size, fp)) return NULL;
1481 /* skip the random symbol frequencies in V1.0 */
1482 if (version == HMMER1_0B)
1483 fseek(fp, (long) (sizeof(float) * Alphabet_size), SEEK_CUR);
1485 /* Get optional info in V1.7 and later
1487 if (version == HMMER1_7B || version == HMMER1_9B)
1489 if (! fread((char *) &(hmm->flags), sizeof(int), 1, fp)) return NULL;
1490 if (swapped) byteswap((char *) &hmm->flags, sizeof(int));
1491 if ((hmm->flags & HMM_REF) &&
1492 ! fread((char *) hmm->ref, sizeof(char), hmm->M+1, fp)) return NULL;
1493 hmm->ref[hmm->M+1] = '\0';
1494 if ((hmm->flags & HMM_CS) &&
1495 ! fread((char *) hmm->cs, sizeof(char), hmm->M+1, fp)) return NULL;
1496 hmm->cs[hmm->M+1] = '\0';
1499 /* Get the null model in V1.9 and later
1501 if (version == HMMER1_9B)
1503 if (! fread((char *) hmm->null, sizeof(float), Alphabet_size, fp)) return NULL;
1505 for (x = 0; x < Alphabet_size; x++)
1506 byteswap((char *) &(hmm->null[x]), sizeof(float));
1508 else P9DefaultNullModel(hmm->null);
1510 /* everything else is states */
1511 for (k = 0; k <= hmm->M; k++)
1513 /* get match state info */
1514 if (! fread((char *) &(hmm->mat[k].t[MATCH]), sizeof(float), 1, fp)) return NULL;
1515 if (! fread((char *) &(hmm->mat[k].t[DELETE]), sizeof(float), 1, fp)) return NULL;
1516 if (! fread((char *) &(hmm->mat[k].t[INSERT]), sizeof(float), 1, fp)) return NULL;
1517 if (! fread((char *) hmm->mat[k].p, sizeof(float), Alphabet_size, fp)) return NULL
1520 byteswap((char *) &(hmm->mat[k].t[MATCH]), sizeof(float));
1521 byteswap((char *) &(hmm->mat[k].t[DELETE]), sizeof(float));
1522 byteswap((char *) &(hmm->mat[k].t[INSERT]), sizeof(float));
1523 for (x = 0; x < Alphabet_size; x++)
1524 byteswap((char *) &(hmm->mat[k].p[x]), sizeof(float));
1527 /* skip the regularizer info in V1.0 */
1528 if (version == HMMER1_0B)
1529 fseek(fp, (long)(sizeof(float) * (3 + Alphabet_size)), SEEK_CUR);
1531 /* get delete state info */
1532 if (! fread((char *) &(hmm->del[k].t[MATCH]), sizeof(float), 1, fp)) return NULL;
1533 if (! fread((char *) &(hmm->del[k].t[DELETE]), sizeof(float), 1, fp)) return NULL;
1534 if (! fread((char *) &(hmm->del[k].t[INSERT]), sizeof(float), 1, fp)) return NULL;
1536 byteswap((char *) &(hmm->del[k].t[MATCH]), sizeof(float));
1537 byteswap((char *) &(hmm->del[k].t[DELETE]), sizeof(float));
1538 byteswap((char *) &(hmm->del[k].t[INSERT]), sizeof(float));
1541 /* skip the regularizer info in V1.0 */
1542 if (version == HMMER1_0B)
1543 fseek(fp, (long)(sizeof(float) * 3), SEEK_CUR);
1545 /* get insert state info */
1546 if (! fread((char *) &(hmm->ins[k].t[MATCH]), sizeof(float), 1, fp)) return NULL;
1547 if (! fread((char *) &(hmm->ins[k].t[DELETE]), sizeof(float), 1, fp)) return NULL;
1548 if (! fread((char *) &(hmm->ins[k].t[INSERT]), sizeof(float), 1, fp)) return NULL;
1549 if (! fread((char *) hmm->ins[k].p, sizeof(float), Alphabet_size, fp)) return NULL
1552 byteswap((char *) &(hmm->ins[k].t[MATCH]), sizeof(float));
1553 byteswap((char *) &(hmm->ins[k].t[DELETE]), sizeof(float));
1554 byteswap((char *) &(hmm->ins[k].t[INSERT]), sizeof(float));
1555 for (x = 0; x < Alphabet_size; x++)
1556 byteswap((char *) &(hmm->ins[k].p[x]), sizeof(float));
1559 /* skip the regularizer info in V1.0 */
1560 if (version == HMMER1_0B)
1561 fseek(fp, (long)(sizeof(float) * (3 + Alphabet_size)), SEEK_CUR);
1568 /* Function: read_plan9_aschmm()
1570 * Purpose: Read ASCII-format save files from 1.8.4 and earlier.
1571 * V1.0 contained sympvec and regularizers; these are ignored
1573 * V1.7 and later contain ref and cs annotation.
1575 * Args: fp - open save file, header has been read already
1576 * version - HMMER1_7F, for instance
1578 * Returns ptr to the (allocated) new HMM on success,
1579 * or NULL on failure.
1581 static struct plan9_s *
1582 read_plan9_aschmm(FILE *fp, int version)
1584 struct plan9_s *hmm;
1585 int M; /* length of model */
1589 int k; /* state number */
1590 int i; /* symbol number */
1591 int asize; /* Alphabet size */
1592 int atype; /* Alphabet type */
1594 /* read M from first line */
1595 if (fgets(buffer, 512, fp) == NULL) return NULL;
1596 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1597 if (!isdigit((int) (*s))) return NULL;
1599 /* read alphabet_length */
1600 if (fgets(buffer, 512, fp) == NULL) return NULL;
1601 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1602 if (!isdigit((int) (*s))) return NULL;
1605 /* Set global alphabet information
1607 if (asize == 4) atype = hmmNUCLEIC;
1608 else if (asize == 20) atype = hmmAMINO;
1609 else Die("A nonbiological alphabet size of %d; so I can't convert plan9 to plan7", asize);
1610 if (Alphabet_type == hmmNOTSETYET) SetAlphabet(atype);
1611 else if (atype != Alphabet_type)
1612 Die("Alphabet mismatch error.\nI thought we were working with %s, but tried to read a %s HMM.\n", AlphabetType2String(Alphabet_type), AlphabetType2String(atype));
1614 /* now, create space for hmm */
1615 if ((hmm = P9AllocHMM(M)) == NULL)
1616 Die("malloc failed for reading hmm in\n");
1618 /* read alphabet_type but ignore */
1619 if (fgets(buffer, 512, fp) == NULL) return NULL;
1620 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1621 if (!isdigit((int) (*s))) return NULL;
1622 /* read alphabet but ignore */
1623 if (fgets(buffer, 512, fp) == NULL) return NULL;
1624 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1626 /* skip the random symbol frequencies in V1.0 files. now unused */
1627 if (version == HMMER1_0F)
1628 for (i = 0; i < Alphabet_size; i++)
1629 if (fgets(buffer, 512, fp) == NULL) return NULL;
1631 /* V1.7 has lines for whether we have valid ref, cs info
1633 if (version == HMMER1_7F)
1635 if (fgets(buffer, 512, fp) == NULL) return NULL;
1636 if (strncmp(buffer, "yes", 3) == 0) hmm->flags |= HMM_REF;
1637 if (fgets(buffer, 512, fp) == NULL) return NULL;
1638 if (strncmp(buffer, "yes", 3) == 0) hmm->flags |= HMM_CS;
1641 /* everything else is states */
1642 while (fgets(buffer, 512, fp) != NULL)
1644 /* get state type and index info */
1645 if ((statetype = strtok(buffer, " \t\n")) == NULL) return NULL;
1646 if ((s = strtok((char *) NULL, " \t\n")) == NULL) return NULL;
1647 if (!isdigit((int) (*s))) return NULL;
1649 if (k < 0 || k > hmm->M+1) return NULL;
1651 if (strcmp(statetype, "###MATCH_STATE") == 0)
1653 /* V1.7: get ref, cs info: */
1654 /* ###MATCH_STATE 16 (x) (H) */
1655 if (version == HMMER1_7F)
1657 s = strtok(NULL, "\n");
1658 while (*s != '(' && *s != '\0') s++;
1659 if (*s != '(') return NULL;
1660 hmm->ref[k] = *(s+1);
1661 while (*s != '(' && *s != '\0') s++;
1662 if (*s != '(') return NULL;
1663 hmm->cs[k] = *(s+1);
1666 if (fgets(buffer, 512, fp) == NULL) return NULL;
1667 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1668 hmm->mat[k].t[MATCH] = (float) atof(s);
1670 if (fgets(buffer, 512, fp) == NULL) return NULL;
1671 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1672 hmm->mat[k].t[DELETE] = (float) atof(s);
1674 if (fgets(buffer, 512, fp) == NULL) return NULL;
1675 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1676 hmm->mat[k].t[INSERT] = (float) atof(s);
1678 for (i = 0; i < Alphabet_size; i++)
1680 if (fgets(buffer, 512, fp) == NULL) return NULL;
1681 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1682 hmm->mat[k].p[i] = (float) atof(s);
1685 /* Skip all regularizer info for V1.0 */
1686 if (version == HMMER1_0F)
1687 for (i = 0; i < Alphabet_size + 3; i++)
1688 if (fgets(buffer, 512, fp) == NULL) return NULL;
1691 else if (strcmp(statetype, "###INSERT_STATE") == 0)
1693 if (fgets(buffer, 512, fp) == NULL) return NULL;
1694 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1695 hmm->ins[k].t[MATCH] = (float) atof(s);
1697 if (fgets(buffer, 512, fp) == NULL) return NULL;
1698 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1699 hmm->ins[k].t[DELETE] = (float) atof(s);
1701 if (fgets(buffer, 512, fp) == NULL) return NULL;
1702 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1703 hmm->ins[k].t[INSERT] = (float) atof(s);
1705 for (i = 0; i < Alphabet_size; i++)
1707 if (fgets(buffer, 512, fp) == NULL) return NULL;
1708 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1709 hmm->ins[k].p[i] = (float) atof(s);
1712 /* Skip all regularizer info in V1.0 files */
1713 if (version == HMMER1_0F)
1714 for (i = 0; i < Alphabet_size + 3; i++)
1715 if (fgets(buffer, 512, fp) == NULL) return NULL;
1718 else if (strcmp(statetype, "###DELETE_STATE") == 0)
1720 if (fgets(buffer, 512, fp) == NULL) return NULL;
1721 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1722 hmm->del[k].t[MATCH] = (float) atof(s);
1724 if (fgets(buffer, 512, fp) == NULL) return NULL;
1725 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1726 hmm->del[k].t[DELETE] = (float) atof(s);
1728 if (fgets(buffer, 512, fp) == NULL) return NULL;
1729 if ((s = strtok(buffer, " \t\n")) == NULL) return NULL;
1730 hmm->del[k].t[INSERT] = (float) atof(s);
1732 /* Skip all regularizer info in V1.0 files*/
1733 if (version == HMMER1_0F)
1734 for (i = 0; i < 3; i++)
1735 if (fgets(buffer, 512, fp) == NULL) return NULL;
1741 P9DefaultNullModel(hmm->null);