+++ /dev/null
-#include <stdio.h> /* printf */
-#include <stdlib.h> /* malloc */
-#include <string.h> /* strcmp, strlen */
-
-#include "trie.h"
-
-/* The following is necessary to make sure that trie.pyd won't link
- * to msvcrt.dll in addition to msvcr71.dll on Windows.
- * See Bug #1767 on Bugzilla.
- */
-#ifdef __MINGW32__
-# define strdup _strdup
-#endif
-
-struct _Transition; /* Forward declaration, needed in _Trie. */
-
-
-/* _Trie is a recursive data structure. A _Trie contains zero or more
- * _Transitions that lead to more _Tries. The transitions are stored
- * in alphabetical order of the suffix member of the data structure.
- * _Trie also contains a pointer called value where the user can store
- * arbitrary data. If value is NULL, then no data is stored here.
- */
-struct _Trie {
- struct _Transition *transitions;
- unsigned char num_transitions;
- void *value; /* specified by user, never freed or allocated by me! */
-};
-
-/* _Transition holds information about the transitions leading from
- * one _Trie to another. The trie structure here is different from
- * typical ones, because the transitions between nodes can contain
- * strings of arbitrary length, not just single characters. Suffix is
- * the string that is matched from one node to the next.
- */
-typedef struct _Transition {
- unsigned char *suffix;
- Trie next;
-} *Transition;
-
-
-#define MAX_KEY_LENGTH 1000
-static unsigned char KEY[MAX_KEY_LENGTH];
-
-
-Trie Trie_new(void) {
- Trie trie;
-
- if(!(trie = (Trie)malloc(sizeof(struct _Trie))))
- return NULL;
- trie->transitions = NULL;
- trie->num_transitions = 0;
- trie->value = NULL;
- return trie;
-}
-
-int Trie_set(Trie trie, const unsigned char *key, const void *value) {
- int i;
- Transition transition=NULL;
- unsigned char *suffix=NULL;
- int retval = 0;
- int first, last, mid;
-
- if(!key[0]) {
- trie->value = (void *)value;
- return 0;
- }
-
- /* Insert the key in alphabetical order. Do a binary search to
- find the proper place. */
- first = 0;
- last = trie->num_transitions-1;
- i = -1;
- while(first <= last) {
- mid = (first+last)/2;
- transition = &trie->transitions[mid];
- suffix = transition->suffix;
- if(key[0] < suffix[0])
- last = mid-1;
- else if(key[0] > suffix[0])
- first = mid+1;
- else {
- i = mid;
- break;
- }
- }
-
- /* If no place was found for it, then the indexes will be in the
- order last,first. Place it at index first. */
- if(i == -1)
- i = first;
-
- /* If nothing matches, then insert a new trie here. */
- if((i >= trie->num_transitions) || (key[0] != suffix[0])) {
- unsigned char *new_suffix=NULL;
- Trie newtrie=NULL;
- Transition new_transitions=NULL;
-
- /* Create some variables for the new transition. I'm going to
- allocate these first so that if I can detect memory errors
- before I mess up the data structure of the transitions.
- */
- if(!(new_suffix = (unsigned char *)strdup(key)))
- goto insert_memerror;
- if(!(newtrie = Trie_new()))
- goto insert_memerror;
-
- /* Create some space for the next transition. Allocate some
- memory and shift the old transitions over to make room for
- this one.
- */
- if(!(new_transitions = malloc(sizeof(struct _Transition) *
- (trie->num_transitions+1))))
- goto insert_memerror;
- memcpy(new_transitions, trie->transitions,
- sizeof(struct _Transition)*i);
- memcpy(&new_transitions[i+1], &trie->transitions[i],
- sizeof(struct _Transition)*(trie->num_transitions-i));
- free(trie->transitions);
- trie->transitions = new_transitions;
- new_transitions = NULL;
- trie->num_transitions += 1;
-
- /* Initialize the new transition. */
- transition = &trie->transitions[i];
- transition->suffix = new_suffix;
- transition->next = newtrie;
- transition->next->value = (void *)value;
-
- if(0) {
- insert_memerror:
- if(new_transitions) free(new_transitions);
- if(newtrie) free(newtrie);
- if(new_suffix) free(new_suffix);
- return 1;
- }
- }
- /* There are three cases where the key and suffix share some
- letters.
- 1. suffix is proper substring of key.
- 2. key is proper substring of suffix.
- 3. neither is proper substring of other.
-
- For cases 2 and 3, I need to first split up the transition
- based on the number of characters shared. Then, I can insert
- the rest of the key into the next trie.
- */
- else {
- /* Count the number of characters shared between key
- and suffix. */
- int chars_shared = 0;
- while(key[chars_shared] && key[chars_shared] == suffix[chars_shared])
- chars_shared++;
-
- /* Case 2 or 3, split this sucker! */
- if(chars_shared < strlen(suffix)) {
- Trie newtrie=NULL;
- unsigned char *new_suffix1=NULL, *new_suffix2=NULL;
-
- if(!(new_suffix1 = (unsigned char *)malloc(chars_shared+1)))
- goto split_memerror;
- strncpy(new_suffix1, key, chars_shared);
- new_suffix1[chars_shared] = 0;
- if(!(new_suffix2 = (unsigned char *)strdup(suffix+chars_shared)))
- goto split_memerror;
- if(!(newtrie = Trie_new()))
- goto split_memerror;
- if(!(newtrie->transitions =
- (Transition)malloc(sizeof(struct _Transition))))
- goto split_memerror;
- newtrie->num_transitions = 1;
- newtrie->transitions[0].next = transition->next;
- newtrie->transitions[0].suffix = new_suffix2;
-
- free(transition->suffix);
- transition->suffix = new_suffix1;
- transition->next = newtrie;
-
- if(0) {
- split_memerror:
- if(newtrie && newtrie->transitions) free(newtrie->transitions);
- if(newtrie) free(newtrie);
- if(new_suffix2) free(new_suffix2);
- if(new_suffix1) free(new_suffix1);
- return 1;
- }
- }
- retval = Trie_set(transition->next, key+chars_shared, value);
- }
-
- return retval;
-}
-
-void Trie_del(Trie trie) {
- int i;
- if(!trie)
- return;
- for(i=0; i<trie->num_transitions; i++) {
- Transition transition = &trie->transitions[i];
- if(transition->suffix)
- free(transition->suffix);
- Trie_del(transition->next);
- }
- free(trie);
-}
-
-void *Trie_get(const Trie trie, const unsigned char *key) {
- int first, last, mid;
-
- if(!key[0]) {
- return trie->value;
- }
-
- /* The transitions are stored in alphabetical order. Do a binary
- * search to find the proper one.
- */
- first = 0;
- last = trie->num_transitions-1;
- while(first <= last) {
- Transition transition;
- unsigned char *suffix;
- int c;
- mid = (first+last)/2;
- transition = &trie->transitions[mid];
- suffix = transition->suffix;
- /* If suffix is a substring of key, then get the value from
- the next trie.
- */
- c = strncmp(key, suffix, strlen(suffix));
- if(c < 0)
- last = mid-1;
- else if(c > 0)
- first = mid+1;
- else
- return Trie_get(transition->next, key+strlen(suffix));
- }
- return NULL;
-}
-
-
-/* Mutually recursive, so need to make a forward declaration. */
-void
-_get_approximate_trie(const Trie trie, const unsigned char *key, const int k,
- void (*callback)(const unsigned char *key,
- const void *value,
- const int mismatches,
- void *data),
- void *data,
- const int mismatches,
- unsigned char *current_key, const int max_key
- );
-
-void
-_get_approximate_transition(const unsigned char *key,
- const int k,
- const Transition transition,
- const unsigned char *suffix,
- void (*callback)(const unsigned char *key,
- const void *value,
- const int mismatches,
- void *data),
- void *data,
- const int mismatches,
- unsigned char *current_key, const int max_key
- )
-{
- int i;
- int prev_keylen = strlen(current_key);
-
- /* Short circuit optimization. If there's too many characters to
- possibly be a match, then don't even try to match things. */
- if((int)(strlen(suffix) - strlen(key)) > k)
- return;
-
- /* Match as many characters as possible. */
- i = 0;
- while(suffix[i] && (key[i] == suffix[i])) {
- i++;
- }
- /* Check to make sure the key is not too long. BUG: If it is,
- fails silently. */
- if((prev_keylen+i) >= max_key)
- return;
- strncat(current_key, suffix, i);
-
- /* If all the letters in the suffix matched, then move to the
- next trie. */
- if(!suffix[i]) {
- _get_approximate_trie(transition->next, &key[i], k, callback, data,
- mismatches, current_key, max_key);
- }
- /* Otherwise, try out different kinds of mismatches. */
- else if(k) {
- int new_keylen = prev_keylen+i;
-
- /* Letter replacement, skip the next letter in both the key and
- suffix. */
- if((new_keylen+1 < max_key) && key[i] && suffix[i]) {
- current_key[new_keylen] = suffix[i];
- current_key[new_keylen+1] = 0;
- _get_approximate_transition(&key[i+1], k-1,
- transition, &suffix[i+1],
- callback, data,
- mismatches+1, current_key, max_key);
- current_key[new_keylen] = 0;
- }
-
- /* Insertion in key, skip the next letter in the key. */
- if(key[i]) {
- _get_approximate_transition(&key[i+1], k-1,
- transition, &suffix[i],
- callback, data,
- mismatches+1, current_key, max_key);
- }
-
- /* Deletion from key, skip the next letter in the suffix. */
- if((new_keylen+1 < max_key) && suffix[i]) {
- current_key[new_keylen] = suffix[i];
- current_key[new_keylen+1] = 0;
- _get_approximate_transition(&key[i], k-1,
- transition, &suffix[i+1],
- callback, data,
- mismatches+1, current_key, max_key);
- current_key[new_keylen] = 0;
- }
- }
- current_key[prev_keylen] = 0;
-}
-
-void
-_get_approximate_trie(const Trie trie, const unsigned char *key, const int k,
- void (*callback)(const unsigned char *key,
- const void *value,
- const int mismatches,
- void *data),
- void *data,
- const int mismatches,
- unsigned char *current_key, const int max_key
- )
-{
- int i;
-
- /* If there's no more key to match, then I'm done. */
- if(!key[0]) {
- if(trie->value)
- (*callback)(current_key, trie->value, mismatches, data);
- }
- /* If there are no more mismatches allowed, then fall back to the
- faster Trie_get. */
- else if(!k) {
- void *value = Trie_get(trie, key);
- if(value) {
- int l = strlen(current_key);
- /* Make sure I have enough space for the full key. */
- if(l + strlen(key) < max_key) {
- strcat(current_key, key);
- (*callback)(current_key, value, mismatches, data);
- current_key[l] = 0;
- }
- /* BUG: Ran out of space for the key. This fails
- silently, but should signal an error. */
- }
- }
- /* If there are no more transitions, then all the characters left
- in the key are mismatches. */
- else if(!trie->num_transitions) {
- if(trie->value && (strlen(key) <= k)) {
- (*callback)(current_key, trie->value,
- mismatches+strlen(key), data);
- }
- }
- /* Otherwise, try to match each of the transitions. */
- else {
- for(i=0; i<trie->num_transitions; i++) {
- Transition transition = &trie->transitions[i];
- unsigned char *suffix = transition->suffix;
- _get_approximate_transition(key, k, transition, suffix,
- callback, data,
- mismatches, current_key, max_key);
- }
- }
-
-}
-
-
-void
-Trie_get_approximate(const Trie trie, const unsigned char *key, const int k,
- void (*callback)(const unsigned char *key,
- const void *value,
- const int mismatches,
- void *data),
- void *data
- )
-{
- KEY[0] = 0;
- _get_approximate_trie(trie, key, k, callback, data, 0, KEY,MAX_KEY_LENGTH);
-}
-
-int Trie_len(const Trie trie)
-{
- int i;
- int length = 0;
-
- if(!trie)
- return 0;
- if(trie->value)
- length += 1;
- for(i=0; i<trie->num_transitions; i++) {
- length += Trie_len(trie->transitions[i].next);
- }
- return length;
-}
-
-int Trie_has_key(const Trie trie, const unsigned char *key)
-{
- return Trie_get(trie, key) != NULL;
-}
-
-int Trie_has_prefix(const Trie trie, const unsigned char *prefix)
-{
- int first, last, mid;
-
- if(!prefix[0]) {
- return 1;
- }
-
- /* The transitions are stored in alphabetical order. Do a binary
- * search to find the proper one.
- */
- first = 0;
- last = trie->num_transitions-1;
- while(first <= last) {
- Transition transition;
- unsigned char *suffix;
- int suffixlen, prefixlen, minlen;
- int c;
- mid = (first+last)/2;
- transition = &trie->transitions[mid];
- suffix = transition->suffix;
- suffixlen = strlen(suffix);
- prefixlen = strlen(prefix);
- minlen = (suffixlen < prefixlen) ? suffixlen : prefixlen;
- c = strncmp(prefix, suffix, minlen);
- if(c < 0)
- last = mid-1;
- else if(c > 0)
- first = mid+1;
- else
- return Trie_has_prefix(transition->next, prefix+minlen);
- }
- return 0;
-}
-
-static void
-_iterate_helper(const Trie trie,
- void (*callback)(const unsigned char *key,
- const void *value,
- void *data),
- void *data,
- unsigned char *current_key, const int max_key)
-{
- int i;
- if(trie->value)
- (*callback)(current_key, trie->value, data);
- for(i=0; i<trie->num_transitions; i++) {
- Transition transition = &trie->transitions[i];
- unsigned char *suffix = transition->suffix;
- int keylen = strlen(current_key);
-
- if(keylen + strlen(suffix) >= max_key) {
- /* BUG: This will fail silently. It should raise some
- sort of error. */
- continue;
- }
- strcat(current_key, suffix);
- _iterate_helper(transition->next, callback, data,
- current_key, max_key);
- current_key[keylen] = 0;
- }
-}
-
-void
-Trie_iterate(const Trie trie,
- void (*callback)(const unsigned char *key,
- const void *value,
- void *data),
- void *data)
-{
- KEY[0] = 0;
- _iterate_helper(trie, callback, data, KEY, MAX_KEY_LENGTH);
-}
-
-static void
-_with_prefix_helper(const Trie trie, const unsigned char *prefix,
- void (*callback)(const unsigned char *key,
- const void *value,
- void *data),
- void *data,
- unsigned char *current_key, const int max_key)
-{
- int first, last, mid;
-
- if(!prefix[0]) {
- _iterate_helper(trie, callback, data, current_key, max_key);
- return;
- }
-
- /* The transitions are stored in alphabetical order. Do a binary
- * search to find the proper one.
- */
- first = 0;
- last = trie->num_transitions-1;
- while(first <= last) {
- Transition transition;
- unsigned char *suffix;
- int suffixlen, prefixlen, minlen;
- int c;
- mid = (first+last)/2;
- transition = &trie->transitions[mid];
- suffix = transition->suffix;
- suffixlen = strlen(suffix);
- prefixlen = strlen(prefix);
- minlen = (suffixlen < prefixlen) ? suffixlen : prefixlen;
- c = strncmp(prefix, suffix, minlen);
- if(c < 0)
- last = mid-1;
- else if(c > 0)
- first = mid+1;
- else {
- int keylen = strlen(current_key);
- if(keylen + minlen >= max_key) {
- /* BUG: This will fail silently. It should raise some
- sort of error. */
- break;
- }
- strncat(current_key, suffix, minlen);
- _with_prefix_helper(transition->next, prefix+minlen,
- callback, data, current_key, max_key);
- current_key[keylen] = 0;
- break;
- }
- }
-}
-
-void
-Trie_with_prefix(const Trie trie, const unsigned char *prefix,
- void (*callback)(const unsigned char *key,
- const void *value,
- void *data),
- void *data
- )
-{
- KEY[0] = 0;
- _with_prefix_helper(trie, prefix, callback, data, KEY, MAX_KEY_LENGTH);
-}
-
-
-
-/* Need to declare _serialize_transition here so it can be called from
- _serialize_trie. */
-int _serialize_transition(const Transition transition,
- int (*write)(const void *towrite, const int length,
- void *data),
- int (*write_value)(const void *value, void *data),
- void *data);
-
-/* This library also provides code for flattening tries so that they
- * can be saved and read back in later. The format of a serialized
- * trie is:
- * TYPE NBYTES DESCRIPTION
- * byte 1 Whether or not there is a value
- * variable variable If there is a value, let the client store it.
- * byte 1 Number of transitions for this Trie.
- * transition variable
- * int 4 Number of characters in the suffix.
- * suffix variable the suffix for this transition
- * byte 1 Whether or not there is a trie
- * trie variable Recursively points to another trie.
- *
- * The number of bytes and the endian may vary from platform to
- * platform.
- */
-
-int _serialize_trie(const Trie trie,
- int (*write)(const void *towrite, const int length,
- void *data),
- int (*write_value)(const void *value, void *data),
- void *data)
-{
- int i;
- unsigned char has_value;
-
- has_value = (trie->value != NULL);
- if(!(*write)(&has_value, sizeof(has_value), data))
- return 0;
- if(has_value) {
- if(!(*write_value)(trie->value, data))
- return 0;
- }
-
- if(!(*write)(&trie->num_transitions, sizeof(trie->num_transitions), data))
- return 0;
- for(i=0; i<trie->num_transitions; i++) {
- if(!_serialize_transition(&trie->transitions[i],
- write, write_value, data))
- return 0;
- }
-
- return 1;
-}
-
-int _serialize_transition(const Transition transition,
- int (*write)(const void *towrite, const int length,
- void *data),
- int (*write_value)(const void *value, void *data),
- void *data)
-{
- int suffixlen;
- unsigned char has_trie;
-
- suffixlen = strlen(transition->suffix);
- if(!(*write)(&suffixlen, sizeof(suffixlen), data))
- return 0;
- if(!(*write)(transition->suffix, suffixlen, data))
- return 0;
-
- has_trie = (transition->next != NULL);
- if(!(*write)(&has_trie, sizeof(has_trie), data))
- return 0;
- if(has_trie) {
- if(!_serialize_trie(transition->next, write, write_value, data))
- return 0;
- }
- return 1;
-}
-
-int Trie_serialize(const Trie trie,
- int (*write)(const void *towrite, const int length,
- void *data),
- int (*write_value)(const void *value, void *data),
- void *data)
-{
- int success = _serialize_trie(trie, write, write_value, data);
- (*write)(NULL, 0, data);
- return success;
-}
-
-int _deserialize_transition(Transition transition,
- int (*read)(void *wasread, const int length,
- void *data),
- void *(*read_value)(void *data),
- void *data);
-
-int _deserialize_trie(Trie trie,
- int (*read)(void *wasread, const int length, void *data),
- void *(*read_value)(void *data),
- void *data)
-{
- int i;
- unsigned char has_value;
-
- if(!(*read)(&has_value, sizeof(has_value), data))
- goto _deserialize_trie_error;
- if(has_value != 0 && has_value != 1)
- goto _deserialize_trie_error;
- if(has_value) {
- if(!(trie->value = (*read_value)(data)))
- goto _deserialize_trie_error;
- }
- if(!(*read)(&trie->num_transitions, sizeof(trie->num_transitions), data))
- goto _deserialize_trie_error;
- if(!(trie->transitions =
- malloc(trie->num_transitions*sizeof(struct _Transition))))
- goto _deserialize_trie_error;
- for(i=0; i<trie->num_transitions; i++) {
- if(!_deserialize_transition(&trie->transitions[i],
- read, read_value, data))
- goto _deserialize_trie_error;
- }
- return 1;
-
- _deserialize_trie_error:
- trie->num_transitions = 0;
- if(trie->transitions) {
- free(trie->transitions);
- trie->transitions = NULL;
- }
- trie->value = NULL;
- return 0;
-}
-
-int _deserialize_transition(Transition transition,
- int (*read)(void *wasread, const int length,
- void *data),
- void *(*read_value)(void *data),
- void *data)
-{
- int suffixlen;
- unsigned char has_trie;
-
- if(!(*read)(&suffixlen, sizeof(suffixlen), data))
- goto _deserialize_transition_error;
- if(suffixlen < 0 || suffixlen >= MAX_KEY_LENGTH)
- goto _deserialize_transition_error;
- if(!(*read)(KEY, suffixlen, data))
- goto _deserialize_transition_error;
- KEY[suffixlen] = 0;
- if(!(transition->suffix = (unsigned char *)strdup(KEY)))
- goto _deserialize_transition_error;
- if(!(*read)(&has_trie, sizeof(has_trie), data))
- goto _deserialize_transition_error;
- if(has_trie != 0 && has_trie != 1)
- goto _deserialize_transition_error;
- if(has_trie) {
- transition->next = Trie_new();
- if(!_deserialize_trie(transition->next, read, read_value, data))
- goto _deserialize_transition_error;
- }
- return 1;
-
- _deserialize_transition_error:
- if(transition->suffix) {
- free(transition->suffix);
- transition->suffix = NULL;
- }
- if(transition->next) {
- Trie_del(transition->next);
- transition->next = NULL;
- }
- return 0;
-}
-
-Trie Trie_deserialize(int (*read)(void *wasread, const int length, void *data),
- void *(*read_value)(void *data),
- void *data)
-{
- Trie trie = Trie_new();
- if(!_deserialize_trie(trie, read, read_value, data)) {
- Trie_del(trie);
- return NULL;
- }
- return trie;
-}
-
-void test(void) {
- Trie trie;
-
- printf("Hello world!\n");
-
- trie = Trie_new();
- printf("New trie %p\n", trie);
- Trie_set(trie, "hello world", "s1");
- Trie_set(trie, "bye", "s2");
- Trie_set(trie, "hell sucks", "s3");
- Trie_set(trie, "hebee", "s4");
-
- printf("%s\n", (char *)Trie_get(trie, "hello world"));
- printf("%s\n", (char *)Trie_get(trie, "bye"));
- printf("%s\n", (char *)Trie_get(trie, "hell sucks"));
- printf("%s\n", (char *)Trie_get(trie, "hebee"));
-
- Trie_set(trie, "blah", "s5");
- printf("%s\n", (char *)Trie_get(trie, "blah"));
-
- printf("%p\n", Trie_get(trie, "foobar"));
- printf("%d\n", Trie_len(trie));
-
- Trie_set(trie, "blah", "snew");
- printf("%s\n", (char *)Trie_get(trie, "blah"));
-
- Trie_del(trie);
-}
-
-#if 0
-int main() {
- test();
-}
-#endif