1 /*****************************************************************
2 * HMMER - Biological sequence analysis with profile HMMs
3 * Copyright (C) 1992-1999 Washington University School of Medicine
6 * This source code is distributed under the terms of the
7 * GNU General Public License. See the files COPYING and LICENSE
9 *****************************************************************/
12 * SRE, Sat May 1 14:49:08 1999
14 * "generic key index" module: emulation of Perl hashes.
15 * Maps keys (ASCII char strings) to array index. Dynamically
16 * resizes the hash table.
19 * - hash table can only grow; no provision for deleting keys
20 * or downsizing the hash table.
21 * - Maximum hash table size set at 100003. Performance
22 * will degrade for key sets much larger than this.
23 * - Assumes that integers are 32 bits (or greater).
25 * Defines a typedef'd structure:
26 * gki - a key index hash table.
28 * GKIInit() - start a hash table.
29 * GKIStoreKey() - store a new key, get a unique index.
30 * GKIKeyIndex() - retrieve an existing key's index.
31 * GKIFree() - free a hash table.
32 * GKIStatus() - Debugging: prints internal status of a hash struct
35 * Note that there are no dependencies on squid; the gki.c/gki.h
36 * pair are base ANSI C and can be reused anywhere.
37 *****************************************************************
39 * API for storing/reading stuff:
40 * moral equivalent of Perl's $foo{$key} = whatever, $bar{$key} = whatever:
50 * idx = GKIStoreKey(hash, key);
51 * (reallocate foo, bar as needed)
52 * foo[idx] = whatever;
53 * bar[idx] = whatever;
57 * idx = GKIKeyIndex(hash, key);
58 * if (idx == -1) {no_such_key; }
59 * (do something with) foo[idx];
60 * (do something with) bar[idx];
64 *****************************************************************
66 * Timings on wrasse for 45402 keys in /usr/dict/words using
68 * 250 msec store (6 usec/store)
69 * 140 msec retrieve (3 usec/retrieve)
70 * and using the 13408 names of Pfam's GP120.full alignment:
71 * 70 msec store (5 usec/store)
72 * 50 msec retrieve (4 usec/retrieve)
74 * RCS $Id: gki.c,v 1.1.1.1 2005/03/22 08:34:18 cmzmasek Exp $
87 * Best hash table sizes are prime numbers (see Knuth vol 3, Sorting
89 * gki_primes[] defines the ascending order of hash table sizes
90 * that we use in upsizing the hash table dynamically.
91 * useful site for testing primes:
92 * http://www.idbsu.edu/people/jbrennan/algebra/numbers/sieve.html
93 * Because of the way gki_hashvalue works, the largest number
94 * must be < INT_MAX / 128 / 128 : 131072 on a 32 bit machine.
96 static int gki_primes[] = { 101, 1009, 10007, 100003 };
98 #define GKI_ALPHABETSIZE 128
100 static GKI *gki_alloc(int primelevel);
101 static int gki_hashvalue(GKI *hash, char *key);
102 static int gki_upsize(GKI *old);
105 /* Function: GKIInit()
106 * Date: SRE, Sat May 1 11:12:24 1999 [May Day geek-out]
108 * Purpose: Initialize a hash table for key indexing.
109 * Simply a wrapper around a level 0 gki_alloc().
113 * Returns: An allocated hash table structure.
114 * Caller frees with GKIFree().
124 /* Function: GKIFree()
125 * Date: SRE, Sat May 1 11:13:26 1999 [May Day geek-out]
127 * Purpose: Free a key index hash table.
129 * Args: hash - the gki structure
132 * hash table is destroyed.
137 struct gki_elem *ptr;
140 if (hash == NULL) return; /* tolerate a NULL */
142 for (i = 0; i < hash->nhash; i++)
143 while (hash->table[i] != NULL)
145 ptr = hash->table[i]->nxt;
146 /* NULL keys can occur after we've gki_upsize'd */
147 if (hash->table[i]->key != NULL) free(hash->table[i]->key);
148 free(hash->table[i]);
149 hash->table[i] = ptr;
155 /* Function: GKIStoreKey()
156 * Date: SRE, Sat May 1 11:16:48 1999 [May Day geek-out]
158 * Purpose: Store a key in the key index hash table.
159 * Associate it with a unique "key index", counting
160 * from 0. (It's this index that lets us map
161 * the hashed keys to indexed C arrays, (clumsily)
162 * emulating Perl's hashes.)
164 * Does *not* check to see if the key's already
165 * in the table, so it's possible to store multiple
166 * copies of a key with different indices; probably
167 * not what you want, so if you're not sure the
168 * key is unique, check the table first with
171 * Args: hash - GKI structure to store the key in
172 * key - string to store
174 * Returns: the new key's index. Since it's always the
175 * last one in the current array, this index is
176 * just hash->nkeys-1.
177 * On a malloc failure, returns -1.
178 * hash table is modified.
181 GKIStoreKey(GKI *hash, char *key)
184 struct gki_elem *ptr;
186 val = gki_hashvalue(hash, key);
188 ptr = hash->table[val];
189 hash->table[val] = MallocOrDie(sizeof(struct gki_elem));
190 hash->table[val]->key = MallocOrDie(sizeof(char) * (strlen(key)+1));
191 strcpy(hash->table[val]->key, key);
193 hash->table[val]->idx = hash->nkeys;
194 hash->table[val]->nxt = ptr;
197 /* time to upsize? */
198 if (hash->nkeys > 3*hash->nhash && hash->primelevel < GKI_NPRIMES-1)
201 return hash->nkeys-1;
204 /* Function: GKIKeyIndex()
205 * Date: SRE, Sat May 1 11:20:42 1999 [May Day geek-out]
207 * Purpose: Look up a key in the hash table. Return
208 * its index (0..nkeys-1), else -1 if the key
209 * isn't in the hash (yet).
211 * Args: hash - the GKI hash table to search in
212 * key - the key to look up
214 * Returns: -1 if key is not found;
215 * index of key if it is found (range 0..nkeys-1).
216 * hash table is unchanged.
219 GKIKeyIndex(GKI *hash, char *key)
221 struct gki_elem *ptr;
224 val = gki_hashvalue(hash, key);
225 for (ptr = hash->table[val]; ptr != NULL; ptr = ptr->nxt)
226 if (strcmp(key, ptr->key) == 0) return ptr->idx;
230 /* Function: GKIStatus()
231 * Date: SRE, Sat May 1 11:11:13 1999 [St. Louis]
233 * Purpose: (DEBUGGING) How are we doing? Calculate some
234 * simple statistics for the hash table.
236 * Args: hash - the GKI hash table to look at
239 * Prints diagnostics on stdout.
240 * hash table is unchanged.
245 struct gki_elem *ptr;
250 int minkeys = INT_MAX;
252 for (i = 0; i < hash->nhash; i++)
255 for (ptr = hash->table[i]; ptr != NULL; ptr = ptr->nxt)
258 if (nkeys == 0) nempty++;
259 if (nkeys > maxkeys) maxkeys = nkeys;
260 if (nkeys < minkeys) minkeys = nkeys;
263 printf("Total keys: %d\n", hash->nkeys);
264 printf("Hash table size: %d\n", hash->nhash);
265 printf("Average occupancy: %.1f\n", (float) hash->nkeys / (float) hash->nhash);
266 printf("Unoccupied slots: %d\n", nempty);
267 printf("Most in one slot: %d\n", maxkeys);
268 printf("Least in one slot: %d\n", minkeys);
273 /* Function: gki_alloc()
274 * Date: SRE, Sat May 1 11:55:47 1999 [May Day geek-out]
276 * Purpose: Allocate a hash table structure with the
277 * size given by primelevel.
279 * Args: primelevel - level 0..GKI_NPRIMES-1, specifying
280 * the size of the table; see gki_primes[]
283 * Returns: An allocated hash table structure.
284 * Caller frees with GKIFree().
287 gki_alloc(int primelevel)
292 if (primelevel < 0 || primelevel >= GKI_NPRIMES)
293 Die("bad primelevel in gki_alloc()");
294 hash = MallocOrDie(sizeof(GKI));
296 hash->primelevel = primelevel;
297 hash->nhash = gki_primes[hash->primelevel];
298 hash->table = MallocOrDie(sizeof(struct gki_elem) * hash->nhash);
299 for (i = 0; i < hash->nhash; i++)
300 hash->table[i] = NULL;
306 /* Function: gki_hashvalue()
307 * Date: SRE, Sat May 1 11:14:10 1999 [May Day geek-out]
309 * Purpose: Calculate the hash value for a key. Usually
310 * we expect a one-word key, but the function will
311 * hash any ASCII string effectively. The hash function
312 * is a simple one (see p. 233 of Sedgewick,
314 * Slightly optimized: does two characters at a time
315 * before doing the modulo; this gives us a significant
318 * Args: hash - the gki structure (we need to know the hash table size)
319 * key - a string to calculate the hash value for
321 * Returns: a hash value, in the range 0..hash->nhash-1.
322 * hash table is unmodified.
325 gki_hashvalue(GKI *hash, char *key)
329 for (; *key != '\0'; key++)
331 val = GKI_ALPHABETSIZE*val + *key;
332 if (*(++key) == '\0') { val = val % hash->nhash; break; }
333 val = (GKI_ALPHABETSIZE*val + *key) % hash->nhash;
338 /* Function: gki_upsize()
339 * Date: SRE, Sat May 1 11:46:07 1999 [May Day geek-out]
341 * Purpose: Grow the hash table to the next available size.
343 * Args: old - the GKI hash table to reallocate.
345 * Returns: 1 on success (the hash table is changed);
346 * 0 on failure; the table is already at its maximum size,
347 * and the hash table is returned unchanged.
354 struct gki_elem *optr;
355 struct gki_elem *nptr;
358 if (old->primelevel >= GKI_NPRIMES-1) return 0;
359 new = gki_alloc(old->primelevel+1);
361 /* Read the old, store in the new, while *not changing*
362 * any key indices. Because of the way the lists are
363 * treated as LIFO stacks, all the lists are reversed
364 * in the new structure.
366 for (i = 0; i < old->nhash; i++)
368 optr = old->table[i];
371 val = gki_hashvalue(new, optr->key);
373 nptr = new->table[val];
374 new->table[val] = optr;
376 new->table[val]->nxt = nptr;
381 /* Now swap within the interior of the structures, so the old
382 * structure is updated to the new structure.
383 * (nkeys is identical, so we don't need to swap that element.)
385 old->primelevel = new->primelevel;
386 old->nhash = new->nhash;
387 old->table = new->table;