1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
12 * Interfaces for GSI "generic sequence index" files.
13 * broken away from sqio.c and extended: SRE, Wed Aug 5 10:32:53 1998
17 * 1 + <nfiles> + <nkeys> total records.
18 * Each record = 38 bytes.
20 * one header record : <"GSI" (32)> <nfiles (2)> <nkeys (4)>
21 * <nfiles> file records : <filename (32)> <fileno (2)> <fmt (4)>
22 * <nkeys> key records : <key (32)> <fileno (2)> <offset(4)>
24 * Matches up with my Perl scripts that create GSI files.
26 * RCS $Id: gsi.c,v 1.1.1.1 2005/03/22 08:34:18 cmzmasek Exp $
33 #include <unistd.h> /* needed for poor crippled SunOS */
40 /*****************************************************************
41 * GSI index file access routines
42 *****************************************************************/
44 /* Function: GSIOpen()
46 * Purpose: Open a GSI file. Returns the number of records in
47 * the file and a file pointer. Returns NULL on failure.
48 * The file pointer should be fclose()'d normally.
51 GSIOpen(char *gsifile)
54 char magic[GSI_KEYSIZE];
56 gsi = (GSIFILE *) MallocOrDie (sizeof(GSIFILE));
57 if ((gsi->gsifp = fopen(gsifile, "r")) == NULL)
58 { free(gsi); squid_errno = SQERR_NOFILE; return NULL; }
60 if (! fread(magic, sizeof(char), GSI_KEYSIZE, gsi->gsifp))
61 { free(gsi); squid_errno = SQERR_NODATA; return NULL; }
62 if (strcmp(magic, "GSI") != 0)
63 { free(gsi); squid_errno = SQERR_FORMAT; return NULL; }
65 if (! fread(&(gsi->nfiles), sizeof(sqd_uint16), 1, gsi->gsifp))
66 { free(gsi); squid_errno = SQERR_NODATA; return NULL; }
67 if (! fread(&(gsi->recnum), sizeof(sqd_uint32), 1, gsi->gsifp))
68 { free(gsi); squid_errno = SQERR_NODATA; return NULL; }
70 gsi->nfiles = sre_ntoh16(gsi->nfiles); /* convert from network short */
71 gsi->recnum = sre_ntoh32(gsi->recnum); /* convert from network long */
76 /* Function: GSIGetRecord()
78 * Purpose: Each non-header record of a GSI index files consists
79 * of 38 bytes: 32 bytes of character string, a 2 byte
80 * short, and a 4 byte long. This function returns the
83 * Args: gsi - open GSI index file, correctly positioned at a record
84 * f1 - char[32], allocated by caller (or NULL if unwanted)
85 * f2 - pointer to short (or NULL if unwanted)
86 * f3 - pointer to long (or NULL if unwanted)
88 * Return: 0 on failure and sets squid_errno.
91 GSIGetRecord(GSIFILE *gsi, char *f1, sqd_uint16 *f2, sqd_uint32 *f3)
93 if (f1 == NULL) fseek(gsi->gsifp, GSI_KEYSIZE, SEEK_CUR);
94 else if (! fread(f1, GSI_KEYSIZE, 1, gsi->gsifp))
95 { squid_errno = SQERR_NODATA; return 0; }
97 if (f2 == NULL) fseek(gsi->gsifp, sizeof(sqd_uint16), SEEK_CUR);
98 else if (! fread(f2, sizeof(sqd_uint16), 1, gsi->gsifp))
99 { squid_errno = SQERR_NODATA; return 0; }
101 if (f3 == NULL) fseek(gsi->gsifp, sizeof(sqd_uint32), SEEK_CUR);
102 else if (! fread(f3, sizeof(sqd_uint32), 1, gsi->gsifp))
103 { squid_errno = SQERR_NODATA; return 0; }
105 if (f2 != NULL) *f2 = sre_ntoh16(*f2);
106 if (f3 != NULL) *f3 = sre_ntoh32(*f3);
112 /* Function: GSIGetOffset()
114 * Purpose: From a key (sequence name), find a disk offset
115 * in an open general sequence index file by binary
116 * search. Presumably GSI indexing could be even faster
117 * if we used hashing.
119 * Args: gsi - GSI index file, opened by GSIOpen()
120 * key - name of key to retrieve indices for
121 * ret_seqfile - pre-alloced char[32] array for seqfile name
122 * ret_fmt - format of seqfile
123 * ret_offset - return: disk offset in seqfile.
126 GSIGetOffset(GSIFILE *gsi, char *key, char *ret_seqfile,
127 int *ret_format, long *ret_offset)
129 sqd_uint32 left, right, mid;
131 char name[GSI_KEYSIZE + 1];
136 name[GSI_KEYSIZE] = '\0';
138 left = gsi->nfiles + 1;
139 right = gsi->nfiles + gsi->recnum;
140 mid = (left + right) / 2;
141 fseek(gsi->gsifp, mid * GSI_RECSIZE, SEEK_SET);
143 while (GSIGetRecord(gsi, name, &filenum, &offset))
145 cmp = strcmp(name, key);
146 if (cmp == 0) break; /* found it! */
147 else if (left >= right) return 0; /* oops, missed it; fail. */
148 else if (cmp < 0) left = mid + 1; /* it's right of mid */
149 else if (cmp > 0) right = mid - 1; /* it's left of mid */
150 mid = (left + right) / 2;
151 fseek(gsi->gsifp, mid * GSI_RECSIZE, SEEK_SET);
154 /* Using file number, look up the sequence file and format.
156 fseek(gsi->gsifp, filenum * GSI_RECSIZE, SEEK_SET);
157 GSIGetRecord(gsi, ret_seqfile, NULL, &fmt);
158 *ret_format = (int) fmt;
159 *ret_offset = (long) offset;
164 /* Function: GSIClose()
166 * Purpose: Close an open GSI sequence index file.
169 GSIClose(GSIFILE *gsi)
176 /*****************************************************************
177 * GSI index construction routines
178 * SRE, Wed Nov 10 11:49:14 1999 [St. Louis]
181 * g = GSIAllocIndex();
183 * [foreach filename, <32 char, no directory path]
184 * GSIAddFileToIndex(g, filename);
186 * [foreach key, <32 char, w/ filenum 1..nfiles, w/ 32bit offset]
187 * GSIAddKeyToIndex(g, key, filenum, offset);
190 * GSIWriteIndex(fp, g);
192 *****************************************************************/
196 struct gsiindex_s *g;
198 g = MallocOrDie(sizeof(struct gsiindex_s));
199 g->filenames = MallocOrDie(sizeof(char *) * 10);
200 g->fmt = MallocOrDie(sizeof(int) * 10);
201 g->elems = MallocOrDie(sizeof(struct gsikey_s) * 100);
207 GSIFreeIndex(struct gsiindex_s *g)
210 for (i = 0; i < g->nfiles; i++) free(g->filenames[i]);
217 GSIAddFileToIndex(struct gsiindex_s *g, char *filename, int fmt)
221 len = strlen(filename);
222 if (len >= GSI_KEYSIZE) Die("File name too long to be indexed.");
223 g->filenames[g->nfiles] = sre_strdup(filename, len);
224 g->fmt[g->nfiles] = fmt;
226 if (g->nfiles % 10 == 0) {
227 g->filenames = ReallocOrDie(g->filenames, sizeof(char *) * (g->nfiles + 10));
228 g->fmt = ReallocOrDie(g->fmt, sizeof(int) * (g->nfiles + 10));
232 GSIAddKeyToIndex(struct gsiindex_s *g, char *key, int filenum, long offset)
234 if (strlen(key) >= GSI_KEYSIZE) Die("key too long in GSI index");
235 if (filenum > SQD_UINT16_MAX) Die("too many files in GSI index");
236 if (offset > SQD_UINT32_MAX) Die("offset too big in GSI index");
238 strncpy(g->elems[g->nkeys].key, key, GSI_KEYSIZE-1);
239 g->elems[g->nkeys].key[GSI_KEYSIZE-1] = '\0';
240 g->elems[g->nkeys].filenum = (sqd_uint16) filenum;
241 g->elems[g->nkeys].offset = (sqd_uint32) offset;
244 if (g->nkeys % 100 == 0)
245 g->elems = ReallocOrDie(g->elems, sizeof(struct gsikey_s) * (g->nkeys + 100));
248 gsi_keysorter(const void *k1, const void *k2)
250 struct gsikey_s *key1;
251 struct gsikey_s *key2;
252 key1 = (struct gsikey_s *) k1;
253 key2 = (struct gsikey_s *) k2;
254 return strcmp(key1->key, key2->key);
257 GSISortIndex(struct gsiindex_s *g)
259 qsort((void *) g->elems, g->nkeys, sizeof(struct gsikey_s), gsi_keysorter);
262 GSIWriteIndex(FILE *fp, struct gsiindex_s *g)
268 if (g->nfiles > SQD_UINT16_MAX) Die("Too many files in GSI index.");
269 if (g->nkeys > SQD_UINT32_MAX) Die("Too many keys in GSI index.");
271 GSIWriteHeader(fp, g->nfiles, g->nkeys);
272 for (i = 0; i < g->nfiles; i++)
273 GSIWriteFileRecord(fp, g->filenames[i], i+1, g->fmt[i]);
274 for (i = 0; i < g->nkeys; i++)
275 GSIWriteKeyRecord(fp, g->elems[i].key, g->elems[i].filenum, g->elems[i].offset);
282 /* Function: GSIWriteHeader()
283 * Date: SRE, Wed Aug 5 10:36:02 1998 [St. Louis]
285 * Purpose: Write the first record to an open GSI file:
286 * "GSI" <nfiles> <nkeys>
288 * Args: fp - open file to write to.
289 * nfiles - number of files indexed
290 * nkeys - number of keys indexed
295 GSIWriteHeader(FILE *fp, int nfiles, long nkeys)
297 char key[GSI_KEYSIZE];
301 /* beware potential range errors!
303 if (nfiles > SQD_UINT16_MAX) Die("GSI: nfiles out of range");
304 if (nkeys > SQD_UINT32_MAX) Die("GSI: nkeys out of range");
306 f1 = (sqd_uint16) nfiles;
307 f2 = (sqd_uint32) nkeys;
312 if (fwrite(key, 1, GSI_KEYSIZE, fp) < GSI_KEYSIZE) PANIC;
313 if (fwrite(&f1, 2, 1, fp) < 1) PANIC;
314 if (fwrite(&f2, 4, 1, fp) < 1) PANIC;
318 /* Function: GSIWriteFileRecord()
319 * Date: SRE, Wed Aug 5 10:45:51 1998 [St. Louis]
321 * Purpose: Write a file record to an open GSI file.
323 * Args: fp - open GSI file
324 * fname - file name (max 31 characters)
326 * fmt - file format (e.g. kPearson, etc.)
328 * Returns: 0 on failure. 1 on success.
331 GSIWriteFileRecord(FILE *fp, char *fname, int idx, int fmt)
336 if (strlen(fname) >= GSI_KEYSIZE) return 0;
337 if (idx > SQD_UINT16_MAX) Die("GSI: file index out of range");
338 if (fmt > SQD_UINT32_MAX) Die("GSI: format index out of range");
340 f1 = (sqd_uint16) idx;
341 f2 = (sqd_uint32) fmt;
345 if (fwrite(fname, 1, GSI_KEYSIZE, fp) < GSI_KEYSIZE) PANIC;
346 if (fwrite(&f1, 2, 1, fp) < 1) PANIC;
347 if (fwrite(&f2, 4, 1, fp) < 1) PANIC;
352 /* Function: GSIWriteKeyRecord()
353 * Date: SRE, Wed Aug 5 10:52:30 1998 [St. Louis]
355 * Purpose: Write a key record to a GSI file.
357 * Args: fp - open GSI file for writing
358 * key - key (max 31 char + \0)
359 * fileidx - which file number to find this key in
360 * offset - offset for this key
362 * Returns: 1 on success, else 0.
363 * will fail if key >= 32 chars, for instance.
366 GSIWriteKeyRecord(FILE *fp, char *key, int fileidx, long offset)
371 if (strlen(key) >= GSI_KEYSIZE) return 0;
372 if (fileidx > SQD_UINT16_MAX) Die("GSI: file index out of range");
373 if (offset > SQD_UINT32_MAX) Die("GSI: offset out of range");
375 f1 = (sqd_uint16) fileidx;
376 f2 = (sqd_uint32) offset;
380 if (fwrite(key, 1, GSI_KEYSIZE, fp) < GSI_KEYSIZE) PANIC;
381 if (fwrite(&f1, 2, 1, fp) < 1) PANIC;
382 if (fwrite(&f2, 4, 1, fp) < 1) PANIC;