/***************************************************************** * SQUID - a library of functions for biological sequence analysis * Copyright (C) 1992-2002 Washington University School of Medicine * * This source code is freely distributed under the terms of the * GNU General Public License. See the files COPYRIGHT and LICENSE * for details. *****************************************************************/ /* gki.c * SRE, Sat May 1 14:49:08 1999 * * "generic key index" module: emulation of Perl hashes. * Maps keys (ASCII char strings) to array index. Dynamically * resizes the hash table. * * Limitations: * - hash table can only grow; no provision for deleting keys * or downsizing the hash table. * - Maximum hash table size set at 100003. Performance * will degrade for key sets much larger than this. * - Assumes that integers are 32 bits (or greater). * * Defines a typedef'd structure: * gki - a key index hash table. * Provides functions: * GKIInit() - start a hash table. * GKIStoreKey() - store a new key, get a unique index. * GKIKeyIndex() - retrieve an existing key's index. * GKIFree() - free a hash table. * GKIStatus() - Debugging: prints internal status of a hash struct * * * Note that there are no dependencies on squid; the gki.c/gki.h * pair are base ANSI C and can be reused anywhere. ***************************************************************** * * API for storing/reading stuff: * moral equivalent of Perl's $foo{$key} = whatever, $bar{$key} = whatever: * #include "gki.h" * * gki *hash; * int idx; * char *key; * * hash = GKIInit(); * (Storing:) * (foreach key) { * idx = GKIStoreKey(hash, key); * (reallocate foo, bar as needed) * foo[idx] = whatever; * bar[idx] = whatever; * } * (Reading:) * (foreach key) { * idx = GKIKeyIndex(hash, key); * if (idx == -1) {no_such_key; } * (do something with) foo[idx]; * (do something with) bar[idx]; * } * GKIFree(); * ***************************************************************** * * Timings on wrasse for 45402 keys in /usr/dict/words using * Tests/test_gki: * 250 msec store (6 usec/store) * 140 msec retrieve (3 usec/retrieve) * and using the 13408 names of Pfam's GP120.full alignment: * 70 msec store (5 usec/store) * 50 msec retrieve (4 usec/retrieve) * * RCS $Id: gki.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: gki.c,v 1.3 2000/12/21 23:42:59 eddy Exp) */ #include #include #include #include #include "squid.h" #include "gki.h" /* * Best hash table sizes are prime numbers (see Knuth vol 3, Sorting * and Searching). * gki_primes[] defines the ascending order of hash table sizes * that we use in upsizing the hash table dynamically. * useful site for testing primes: * http://www.idbsu.edu/people/jbrennan/algebra/numbers/sieve.html * Because of the way gki_hashvalue works, the largest number * must be < INT_MAX / 128 / 128 : 131072 on a 32 bit machine. */ static int gki_primes[] = { 101, 1009, 10007, 100003 }; #define GKI_NPRIMES 4 #define GKI_ALPHABETSIZE 128 static GKI *gki_alloc(int primelevel); static int gki_hashvalue(GKI *hash, char *key); static int gki_upsize(GKI *old); /* Function: GKIInit() * Date: SRE, Sat May 1 11:12:24 1999 [May Day geek-out] * * Purpose: Initialize a hash table for key indexing. * Simply a wrapper around a level 0 gki_alloc(). * * Args: (void) * * Returns: An allocated hash table structure. * Caller frees with GKIFree(). */ GKI * GKIInit(void) { GKI *hash; hash = gki_alloc(0); return hash; } /* Function: GKIFree() * Date: SRE, Sat May 1 11:13:26 1999 [May Day geek-out] * * Purpose: Free a key index hash table. * * Args: hash - the gki structure * * Returns: (void). * hash table is destroyed. */ void GKIFree(GKI *hash) { struct gki_elem *ptr; int i; if (hash == NULL) return; /* tolerate a NULL */ for (i = 0; i < hash->nhash; i++) while (hash->table[i] != NULL) { ptr = hash->table[i]->nxt; /* NULL keys can occur after we've gki_upsize'd */ if (hash->table[i]->key != NULL) free(hash->table[i]->key); free(hash->table[i]); hash->table[i] = ptr; } free(hash->table); free(hash); } /* Function: GKIStoreKey() * Date: SRE, Sat May 1 11:16:48 1999 [May Day geek-out] * * Purpose: Store a key in the key index hash table. * Associate it with a unique "key index", counting * from 0. (It's this index that lets us map * the hashed keys to indexed C arrays, (clumsily) * emulating Perl's hashes.) * * Does *not* check to see if the key's already * in the table, so it's possible to store multiple * copies of a key with different indices; probably * not what you want, so if you're not sure the * key is unique, check the table first with * GKIKeyIndex(). * * Args: hash - GKI structure to store the key in * key - string to store * * Returns: the new key's index. Since it's always the * last one in the current array, this index is * just hash->nkeys-1. * On a malloc failure, returns -1. * hash table is modified. */ int GKIStoreKey(GKI *hash, char *key) { int val; struct gki_elem *ptr; val = gki_hashvalue(hash, key); ptr = hash->table[val]; hash->table[val] = MallocOrDie(sizeof(struct gki_elem)); hash->table[val]->key = MallocOrDie(sizeof(char) * (strlen(key)+1)); strcpy(hash->table[val]->key, key); hash->table[val]->idx = hash->nkeys; hash->table[val]->nxt = ptr; hash->nkeys++; /* time to upsize? */ if (hash->nkeys > 3*hash->nhash && hash->primelevel < GKI_NPRIMES-1) gki_upsize(hash); return hash->nkeys-1; } /* Function: GKIKeyIndex() * Date: SRE, Sat May 1 11:20:42 1999 [May Day geek-out] * * Purpose: Look up a key in the hash table. Return * its index (0..nkeys-1), else -1 if the key * isn't in the hash (yet). * * Args: hash - the GKI hash table to search in * key - the key to look up * * Returns: -1 if key is not found; * index of key if it is found (range 0..nkeys-1). * hash table is unchanged. */ int GKIKeyIndex(GKI *hash, char *key) { struct gki_elem *ptr; int val; val = gki_hashvalue(hash, key); for (ptr = hash->table[val]; ptr != NULL; ptr = ptr->nxt) if (strcmp(key, ptr->key) == 0) return ptr->idx; return -1; } /* Function: GKIStatus() * Date: SRE, Sat May 1 11:11:13 1999 [St. Louis] * * Purpose: (DEBUGGING) How are we doing? Calculate some * simple statistics for the hash table. * * Args: hash - the GKI hash table to look at * * Returns: (void) * Prints diagnostics on stdout. * hash table is unchanged. */ void GKIStatus(GKI *hash) { struct gki_elem *ptr; int i; int nkeys; int nempty = 0; int maxkeys = -1; int minkeys = INT_MAX; for (i = 0; i < hash->nhash; i++) { nkeys = 0; for (ptr = hash->table[i]; ptr != NULL; ptr = ptr->nxt) nkeys++; if (nkeys == 0) nempty++; if (nkeys > maxkeys) maxkeys = nkeys; if (nkeys < minkeys) minkeys = nkeys; } printf("Total keys: %d\n", hash->nkeys); printf("Hash table size: %d\n", hash->nhash); printf("Average occupancy: %.1f\n", (float) hash->nkeys / (float) hash->nhash); printf("Unoccupied slots: %d\n", nempty); printf("Most in one slot: %d\n", maxkeys); printf("Least in one slot: %d\n", minkeys); } /* Function: gki_alloc() * Date: SRE, Sat May 1 11:55:47 1999 [May Day geek-out] * * Purpose: Allocate a hash table structure with the * size given by primelevel. * * Args: primelevel - level 0..GKI_NPRIMES-1, specifying * the size of the table; see gki_primes[] * array. * * Returns: An allocated hash table structure. * Caller frees with GKIFree(). */ static GKI * gki_alloc(int primelevel) { GKI *hash; int i; if (primelevel < 0 || primelevel >= GKI_NPRIMES) Die("bad primelevel in gki_alloc()"); hash = MallocOrDie(sizeof(GKI)); hash->primelevel = primelevel; hash->nhash = gki_primes[hash->primelevel]; hash->table = MallocOrDie(sizeof(struct gki_elem) * hash->nhash); for (i = 0; i < hash->nhash; i++) hash->table[i] = NULL; hash->nkeys = 0; return hash; } /* Function: gki_hashvalue() * Date: SRE, Sat May 1 11:14:10 1999 [May Day geek-out] * * Purpose: Calculate the hash value for a key. Usually * we expect a one-word key, but the function will * hash any ASCII string effectively. The hash function * is a simple one (see p. 233 of Sedgewick, * Algorithms in C). * Slightly optimized: does two characters at a time * before doing the modulo; this gives us a significant * speedup. * * Args: hash - the gki structure (we need to know the hash table size) * key - a string to calculate the hash value for * * Returns: a hash value, in the range 0..hash->nhash-1. * hash table is unmodified. */ static int gki_hashvalue(GKI *hash, char *key) { int val = 0; for (; *key != '\0'; key++) { val = GKI_ALPHABETSIZE*val + *key; if (*(++key) == '\0') { val = val % hash->nhash; break; } val = (GKI_ALPHABETSIZE*val + *key) % hash->nhash; } return val; } /* Function: gki_upsize() * Date: SRE, Sat May 1 11:46:07 1999 [May Day geek-out] * * Purpose: Grow the hash table to the next available size. * * Args: old - the GKI hash table to reallocate. * * Returns: 1 on success (the hash table is changed); * 0 on failure; the table is already at its maximum size, * and the hash table is returned unchanged. */ static int gki_upsize(GKI *old) { GKI *new; int i; struct gki_elem *optr; struct gki_elem *nptr; int val; if (old->primelevel >= GKI_NPRIMES-1) return 0; new = gki_alloc(old->primelevel+1); /* Read the old, store in the new, while *not changing* * any key indices. Because of the way the lists are * treated as LIFO stacks, all the lists are reversed * in the new structure. */ for (i = 0; i < old->nhash; i++) { optr = old->table[i]; while (optr != NULL) { val = gki_hashvalue(new, optr->key); nptr = new->table[val]; new->table[val] = optr; optr = optr->nxt; new->table[val]->nxt = nptr; } } free(old->table); /* Now swap within the interior of the structures, so the old * structure is updated to the new structure. * (nkeys is identical, so we don't need to swap that element.) */ old->primelevel = new->primelevel; old->nhash = new->nhash; old->table = new->table; free(new); return 1; }