#include /* printf */ #include /* malloc */ #include /* strcmp, strlen */ #include "trie.h" /* The following is necessary to make sure that trie.pyd won't link * to msvcrt.dll in addition to msvcr71.dll on Windows. * See Bug #1767 on Bugzilla. */ #ifdef __MINGW32__ # define strdup _strdup #endif struct _Transition; /* Forward declaration, needed in _Trie. */ /* _Trie is a recursive data structure. A _Trie contains zero or more * _Transitions that lead to more _Tries. The transitions are stored * in alphabetical order of the suffix member of the data structure. * _Trie also contains a pointer called value where the user can store * arbitrary data. If value is NULL, then no data is stored here. */ struct _Trie { struct _Transition *transitions; unsigned char num_transitions; void *value; /* specified by user, never freed or allocated by me! */ }; /* _Transition holds information about the transitions leading from * one _Trie to another. The trie structure here is different from * typical ones, because the transitions between nodes can contain * strings of arbitrary length, not just single characters. Suffix is * the string that is matched from one node to the next. */ typedef struct _Transition { unsigned char *suffix; Trie next; } *Transition; #define MAX_KEY_LENGTH 1000 static unsigned char KEY[MAX_KEY_LENGTH]; Trie Trie_new(void) { Trie trie; if(!(trie = (Trie)malloc(sizeof(struct _Trie)))) return NULL; trie->transitions = NULL; trie->num_transitions = 0; trie->value = NULL; return trie; } int Trie_set(Trie trie, const unsigned char *key, const void *value) { int i; Transition transition=NULL; unsigned char *suffix=NULL; int retval = 0; int first, last, mid; if(!key[0]) { trie->value = (void *)value; return 0; } /* Insert the key in alphabetical order. Do a binary search to find the proper place. */ first = 0; last = trie->num_transitions-1; i = -1; while(first <= last) { mid = (first+last)/2; transition = &trie->transitions[mid]; suffix = transition->suffix; if(key[0] < suffix[0]) last = mid-1; else if(key[0] > suffix[0]) first = mid+1; else { i = mid; break; } } /* If no place was found for it, then the indexes will be in the order last,first. Place it at index first. */ if(i == -1) i = first; /* If nothing matches, then insert a new trie here. */ if((i >= trie->num_transitions) || (key[0] != suffix[0])) { unsigned char *new_suffix=NULL; Trie newtrie=NULL; Transition new_transitions=NULL; /* Create some variables for the new transition. I'm going to allocate these first so that if I can detect memory errors before I mess up the data structure of the transitions. */ if(!(new_suffix = (unsigned char *)strdup(key))) goto insert_memerror; if(!(newtrie = Trie_new())) goto insert_memerror; /* Create some space for the next transition. Allocate some memory and shift the old transitions over to make room for this one. */ if(!(new_transitions = malloc(sizeof(struct _Transition) * (trie->num_transitions+1)))) goto insert_memerror; memcpy(new_transitions, trie->transitions, sizeof(struct _Transition)*i); memcpy(&new_transitions[i+1], &trie->transitions[i], sizeof(struct _Transition)*(trie->num_transitions-i)); free(trie->transitions); trie->transitions = new_transitions; new_transitions = NULL; trie->num_transitions += 1; /* Initialize the new transition. */ transition = &trie->transitions[i]; transition->suffix = new_suffix; transition->next = newtrie; transition->next->value = (void *)value; if(0) { insert_memerror: if(new_transitions) free(new_transitions); if(newtrie) free(newtrie); if(new_suffix) free(new_suffix); return 1; } } /* There are three cases where the key and suffix share some letters. 1. suffix is proper substring of key. 2. key is proper substring of suffix. 3. neither is proper substring of other. For cases 2 and 3, I need to first split up the transition based on the number of characters shared. Then, I can insert the rest of the key into the next trie. */ else { /* Count the number of characters shared between key and suffix. */ int chars_shared = 0; while(key[chars_shared] && key[chars_shared] == suffix[chars_shared]) chars_shared++; /* Case 2 or 3, split this sucker! */ if(chars_shared < strlen(suffix)) { Trie newtrie=NULL; unsigned char *new_suffix1=NULL, *new_suffix2=NULL; if(!(new_suffix1 = (unsigned char *)malloc(chars_shared+1))) goto split_memerror; strncpy(new_suffix1, key, chars_shared); new_suffix1[chars_shared] = 0; if(!(new_suffix2 = (unsigned char *)strdup(suffix+chars_shared))) goto split_memerror; if(!(newtrie = Trie_new())) goto split_memerror; if(!(newtrie->transitions = (Transition)malloc(sizeof(struct _Transition)))) goto split_memerror; newtrie->num_transitions = 1; newtrie->transitions[0].next = transition->next; newtrie->transitions[0].suffix = new_suffix2; free(transition->suffix); transition->suffix = new_suffix1; transition->next = newtrie; if(0) { split_memerror: if(newtrie && newtrie->transitions) free(newtrie->transitions); if(newtrie) free(newtrie); if(new_suffix2) free(new_suffix2); if(new_suffix1) free(new_suffix1); return 1; } } retval = Trie_set(transition->next, key+chars_shared, value); } return retval; } void Trie_del(Trie trie) { int i; if(!trie) return; for(i=0; inum_transitions; i++) { Transition transition = &trie->transitions[i]; if(transition->suffix) free(transition->suffix); Trie_del(transition->next); } free(trie); } void *Trie_get(const Trie trie, const unsigned char *key) { int first, last, mid; if(!key[0]) { return trie->value; } /* The transitions are stored in alphabetical order. Do a binary * search to find the proper one. */ first = 0; last = trie->num_transitions-1; while(first <= last) { Transition transition; unsigned char *suffix; int c; mid = (first+last)/2; transition = &trie->transitions[mid]; suffix = transition->suffix; /* If suffix is a substring of key, then get the value from the next trie. */ c = strncmp(key, suffix, strlen(suffix)); if(c < 0) last = mid-1; else if(c > 0) first = mid+1; else return Trie_get(transition->next, key+strlen(suffix)); } return NULL; } /* Mutually recursive, so need to make a forward declaration. */ void _get_approximate_trie(const Trie trie, const unsigned char *key, const int k, void (*callback)(const unsigned char *key, const void *value, const int mismatches, void *data), void *data, const int mismatches, unsigned char *current_key, const int max_key ); void _get_approximate_transition(const unsigned char *key, const int k, const Transition transition, const unsigned char *suffix, void (*callback)(const unsigned char *key, const void *value, const int mismatches, void *data), void *data, const int mismatches, unsigned char *current_key, const int max_key ) { int i; int prev_keylen = strlen(current_key); /* Short circuit optimization. If there's too many characters to possibly be a match, then don't even try to match things. */ if((int)(strlen(suffix) - strlen(key)) > k) return; /* Match as many characters as possible. */ i = 0; while(suffix[i] && (key[i] == suffix[i])) { i++; } /* Check to make sure the key is not too long. BUG: If it is, fails silently. */ if((prev_keylen+i) >= max_key) return; strncat(current_key, suffix, i); /* If all the letters in the suffix matched, then move to the next trie. */ if(!suffix[i]) { _get_approximate_trie(transition->next, &key[i], k, callback, data, mismatches, current_key, max_key); } /* Otherwise, try out different kinds of mismatches. */ else if(k) { int new_keylen = prev_keylen+i; /* Letter replacement, skip the next letter in both the key and suffix. */ if((new_keylen+1 < max_key) && key[i] && suffix[i]) { current_key[new_keylen] = suffix[i]; current_key[new_keylen+1] = 0; _get_approximate_transition(&key[i+1], k-1, transition, &suffix[i+1], callback, data, mismatches+1, current_key, max_key); current_key[new_keylen] = 0; } /* Insertion in key, skip the next letter in the key. */ if(key[i]) { _get_approximate_transition(&key[i+1], k-1, transition, &suffix[i], callback, data, mismatches+1, current_key, max_key); } /* Deletion from key, skip the next letter in the suffix. */ if((new_keylen+1 < max_key) && suffix[i]) { current_key[new_keylen] = suffix[i]; current_key[new_keylen+1] = 0; _get_approximate_transition(&key[i], k-1, transition, &suffix[i+1], callback, data, mismatches+1, current_key, max_key); current_key[new_keylen] = 0; } } current_key[prev_keylen] = 0; } void _get_approximate_trie(const Trie trie, const unsigned char *key, const int k, void (*callback)(const unsigned char *key, const void *value, const int mismatches, void *data), void *data, const int mismatches, unsigned char *current_key, const int max_key ) { int i; /* If there's no more key to match, then I'm done. */ if(!key[0]) { if(trie->value) (*callback)(current_key, trie->value, mismatches, data); } /* If there are no more mismatches allowed, then fall back to the faster Trie_get. */ else if(!k) { void *value = Trie_get(trie, key); if(value) { int l = strlen(current_key); /* Make sure I have enough space for the full key. */ if(l + strlen(key) < max_key) { strcat(current_key, key); (*callback)(current_key, value, mismatches, data); current_key[l] = 0; } /* BUG: Ran out of space for the key. This fails silently, but should signal an error. */ } } /* If there are no more transitions, then all the characters left in the key are mismatches. */ else if(!trie->num_transitions) { if(trie->value && (strlen(key) <= k)) { (*callback)(current_key, trie->value, mismatches+strlen(key), data); } } /* Otherwise, try to match each of the transitions. */ else { for(i=0; inum_transitions; i++) { Transition transition = &trie->transitions[i]; unsigned char *suffix = transition->suffix; _get_approximate_transition(key, k, transition, suffix, callback, data, mismatches, current_key, max_key); } } } void Trie_get_approximate(const Trie trie, const unsigned char *key, const int k, void (*callback)(const unsigned char *key, const void *value, const int mismatches, void *data), void *data ) { KEY[0] = 0; _get_approximate_trie(trie, key, k, callback, data, 0, KEY,MAX_KEY_LENGTH); } int Trie_len(const Trie trie) { int i; int length = 0; if(!trie) return 0; if(trie->value) length += 1; for(i=0; inum_transitions; i++) { length += Trie_len(trie->transitions[i].next); } return length; } int Trie_has_key(const Trie trie, const unsigned char *key) { return Trie_get(trie, key) != NULL; } int Trie_has_prefix(const Trie trie, const unsigned char *prefix) { int first, last, mid; if(!prefix[0]) { return 1; } /* The transitions are stored in alphabetical order. Do a binary * search to find the proper one. */ first = 0; last = trie->num_transitions-1; while(first <= last) { Transition transition; unsigned char *suffix; int suffixlen, prefixlen, minlen; int c; mid = (first+last)/2; transition = &trie->transitions[mid]; suffix = transition->suffix; suffixlen = strlen(suffix); prefixlen = strlen(prefix); minlen = (suffixlen < prefixlen) ? suffixlen : prefixlen; c = strncmp(prefix, suffix, minlen); if(c < 0) last = mid-1; else if(c > 0) first = mid+1; else return Trie_has_prefix(transition->next, prefix+minlen); } return 0; } static void _iterate_helper(const Trie trie, void (*callback)(const unsigned char *key, const void *value, void *data), void *data, unsigned char *current_key, const int max_key) { int i; if(trie->value) (*callback)(current_key, trie->value, data); for(i=0; inum_transitions; i++) { Transition transition = &trie->transitions[i]; unsigned char *suffix = transition->suffix; int keylen = strlen(current_key); if(keylen + strlen(suffix) >= max_key) { /* BUG: This will fail silently. It should raise some sort of error. */ continue; } strcat(current_key, suffix); _iterate_helper(transition->next, callback, data, current_key, max_key); current_key[keylen] = 0; } } void Trie_iterate(const Trie trie, void (*callback)(const unsigned char *key, const void *value, void *data), void *data) { KEY[0] = 0; _iterate_helper(trie, callback, data, KEY, MAX_KEY_LENGTH); } static void _with_prefix_helper(const Trie trie, const unsigned char *prefix, void (*callback)(const unsigned char *key, const void *value, void *data), void *data, unsigned char *current_key, const int max_key) { int first, last, mid; if(!prefix[0]) { _iterate_helper(trie, callback, data, current_key, max_key); return; } /* The transitions are stored in alphabetical order. Do a binary * search to find the proper one. */ first = 0; last = trie->num_transitions-1; while(first <= last) { Transition transition; unsigned char *suffix; int suffixlen, prefixlen, minlen; int c; mid = (first+last)/2; transition = &trie->transitions[mid]; suffix = transition->suffix; suffixlen = strlen(suffix); prefixlen = strlen(prefix); minlen = (suffixlen < prefixlen) ? suffixlen : prefixlen; c = strncmp(prefix, suffix, minlen); if(c < 0) last = mid-1; else if(c > 0) first = mid+1; else { int keylen = strlen(current_key); if(keylen + minlen >= max_key) { /* BUG: This will fail silently. It should raise some sort of error. */ break; } strncat(current_key, suffix, minlen); _with_prefix_helper(transition->next, prefix+minlen, callback, data, current_key, max_key); current_key[keylen] = 0; break; } } } void Trie_with_prefix(const Trie trie, const unsigned char *prefix, void (*callback)(const unsigned char *key, const void *value, void *data), void *data ) { KEY[0] = 0; _with_prefix_helper(trie, prefix, callback, data, KEY, MAX_KEY_LENGTH); } /* Need to declare _serialize_transition here so it can be called from _serialize_trie. */ int _serialize_transition(const Transition transition, int (*write)(const void *towrite, const int length, void *data), int (*write_value)(const void *value, void *data), void *data); /* This library also provides code for flattening tries so that they * can be saved and read back in later. The format of a serialized * trie is: * TYPE NBYTES DESCRIPTION * byte 1 Whether or not there is a value * variable variable If there is a value, let the client store it. * byte 1 Number of transitions for this Trie. * transition variable * int 4 Number of characters in the suffix. * suffix variable the suffix for this transition * byte 1 Whether or not there is a trie * trie variable Recursively points to another trie. * * The number of bytes and the endian may vary from platform to * platform. */ int _serialize_trie(const Trie trie, int (*write)(const void *towrite, const int length, void *data), int (*write_value)(const void *value, void *data), void *data) { int i; unsigned char has_value; has_value = (trie->value != NULL); if(!(*write)(&has_value, sizeof(has_value), data)) return 0; if(has_value) { if(!(*write_value)(trie->value, data)) return 0; } if(!(*write)(&trie->num_transitions, sizeof(trie->num_transitions), data)) return 0; for(i=0; inum_transitions; i++) { if(!_serialize_transition(&trie->transitions[i], write, write_value, data)) return 0; } return 1; } int _serialize_transition(const Transition transition, int (*write)(const void *towrite, const int length, void *data), int (*write_value)(const void *value, void *data), void *data) { int suffixlen; unsigned char has_trie; suffixlen = strlen(transition->suffix); if(!(*write)(&suffixlen, sizeof(suffixlen), data)) return 0; if(!(*write)(transition->suffix, suffixlen, data)) return 0; has_trie = (transition->next != NULL); if(!(*write)(&has_trie, sizeof(has_trie), data)) return 0; if(has_trie) { if(!_serialize_trie(transition->next, write, write_value, data)) return 0; } return 1; } int Trie_serialize(const Trie trie, int (*write)(const void *towrite, const int length, void *data), int (*write_value)(const void *value, void *data), void *data) { int success = _serialize_trie(trie, write, write_value, data); (*write)(NULL, 0, data); return success; } int _deserialize_transition(Transition transition, int (*read)(void *wasread, const int length, void *data), void *(*read_value)(void *data), void *data); int _deserialize_trie(Trie trie, int (*read)(void *wasread, const int length, void *data), void *(*read_value)(void *data), void *data) { int i; unsigned char has_value; if(!(*read)(&has_value, sizeof(has_value), data)) goto _deserialize_trie_error; if(has_value != 0 && has_value != 1) goto _deserialize_trie_error; if(has_value) { if(!(trie->value = (*read_value)(data))) goto _deserialize_trie_error; } if(!(*read)(&trie->num_transitions, sizeof(trie->num_transitions), data)) goto _deserialize_trie_error; if(!(trie->transitions = malloc(trie->num_transitions*sizeof(struct _Transition)))) goto _deserialize_trie_error; for(i=0; inum_transitions; i++) { if(!_deserialize_transition(&trie->transitions[i], read, read_value, data)) goto _deserialize_trie_error; } return 1; _deserialize_trie_error: trie->num_transitions = 0; if(trie->transitions) { free(trie->transitions); trie->transitions = NULL; } trie->value = NULL; return 0; } int _deserialize_transition(Transition transition, int (*read)(void *wasread, const int length, void *data), void *(*read_value)(void *data), void *data) { int suffixlen; unsigned char has_trie; if(!(*read)(&suffixlen, sizeof(suffixlen), data)) goto _deserialize_transition_error; if(suffixlen < 0 || suffixlen >= MAX_KEY_LENGTH) goto _deserialize_transition_error; if(!(*read)(KEY, suffixlen, data)) goto _deserialize_transition_error; KEY[suffixlen] = 0; if(!(transition->suffix = (unsigned char *)strdup(KEY))) goto _deserialize_transition_error; if(!(*read)(&has_trie, sizeof(has_trie), data)) goto _deserialize_transition_error; if(has_trie != 0 && has_trie != 1) goto _deserialize_transition_error; if(has_trie) { transition->next = Trie_new(); if(!_deserialize_trie(transition->next, read, read_value, data)) goto _deserialize_transition_error; } return 1; _deserialize_transition_error: if(transition->suffix) { free(transition->suffix); transition->suffix = NULL; } if(transition->next) { Trie_del(transition->next); transition->next = NULL; } return 0; } Trie Trie_deserialize(int (*read)(void *wasread, const int length, void *data), void *(*read_value)(void *data), void *data) { Trie trie = Trie_new(); if(!_deserialize_trie(trie, read, read_value, data)) { Trie_del(trie); return NULL; } return trie; } void test(void) { Trie trie; printf("Hello world!\n"); trie = Trie_new(); printf("New trie %p\n", trie); Trie_set(trie, "hello world", "s1"); Trie_set(trie, "bye", "s2"); Trie_set(trie, "hell sucks", "s3"); Trie_set(trie, "hebee", "s4"); printf("%s\n", (char *)Trie_get(trie, "hello world")); printf("%s\n", (char *)Trie_get(trie, "bye")); printf("%s\n", (char *)Trie_get(trie, "hell sucks")); printf("%s\n", (char *)Trie_get(trie, "hebee")); Trie_set(trie, "blah", "s5"); printf("%s\n", (char *)Trie_get(trie, "blah")); printf("%p\n", Trie_get(trie, "foobar")); printf("%d\n", Trie_len(trie)); Trie_set(trie, "blah", "snew"); printf("%s\n", (char *)Trie_get(trie, "blah")); Trie_del(trie); } #if 0 int main() { test(); } #endif