1 #include <stdio.h> /* printf */
2 #include <stdlib.h> /* malloc */
3 #include <string.h> /* strcmp, strlen */
7 /* The following is necessary to make sure that trie.pyd won't link
8 * to msvcrt.dll in addition to msvcr71.dll on Windows.
9 * See Bug #1767 on Bugzilla.
12 # define strdup _strdup
15 struct _Transition; /* Forward declaration, needed in _Trie. */
18 /* _Trie is a recursive data structure. A _Trie contains zero or more
19 * _Transitions that lead to more _Tries. The transitions are stored
20 * in alphabetical order of the suffix member of the data structure.
21 * _Trie also contains a pointer called value where the user can store
22 * arbitrary data. If value is NULL, then no data is stored here.
25 struct _Transition *transitions;
26 unsigned char num_transitions;
27 void *value; /* specified by user, never freed or allocated by me! */
30 /* _Transition holds information about the transitions leading from
31 * one _Trie to another. The trie structure here is different from
32 * typical ones, because the transitions between nodes can contain
33 * strings of arbitrary length, not just single characters. Suffix is
34 * the string that is matched from one node to the next.
36 typedef struct _Transition {
37 unsigned char *suffix;
42 #define MAX_KEY_LENGTH 1000
43 static unsigned char KEY[MAX_KEY_LENGTH];
49 if(!(trie = (Trie)malloc(sizeof(struct _Trie))))
51 trie->transitions = NULL;
52 trie->num_transitions = 0;
57 int Trie_set(Trie trie, const unsigned char *key, const void *value) {
59 Transition transition=NULL;
60 unsigned char *suffix=NULL;
65 trie->value = (void *)value;
69 /* Insert the key in alphabetical order. Do a binary search to
70 find the proper place. */
72 last = trie->num_transitions-1;
74 while(first <= last) {
76 transition = &trie->transitions[mid];
77 suffix = transition->suffix;
78 if(key[0] < suffix[0])
80 else if(key[0] > suffix[0])
88 /* If no place was found for it, then the indexes will be in the
89 order last,first. Place it at index first. */
93 /* If nothing matches, then insert a new trie here. */
94 if((i >= trie->num_transitions) || (key[0] != suffix[0])) {
95 unsigned char *new_suffix=NULL;
97 Transition new_transitions=NULL;
99 /* Create some variables for the new transition. I'm going to
100 allocate these first so that if I can detect memory errors
101 before I mess up the data structure of the transitions.
103 if(!(new_suffix = (unsigned char *)strdup(key)))
104 goto insert_memerror;
105 if(!(newtrie = Trie_new()))
106 goto insert_memerror;
108 /* Create some space for the next transition. Allocate some
109 memory and shift the old transitions over to make room for
112 if(!(new_transitions = malloc(sizeof(struct _Transition) *
113 (trie->num_transitions+1))))
114 goto insert_memerror;
115 memcpy(new_transitions, trie->transitions,
116 sizeof(struct _Transition)*i);
117 memcpy(&new_transitions[i+1], &trie->transitions[i],
118 sizeof(struct _Transition)*(trie->num_transitions-i));
119 free(trie->transitions);
120 trie->transitions = new_transitions;
121 new_transitions = NULL;
122 trie->num_transitions += 1;
124 /* Initialize the new transition. */
125 transition = &trie->transitions[i];
126 transition->suffix = new_suffix;
127 transition->next = newtrie;
128 transition->next->value = (void *)value;
132 if(new_transitions) free(new_transitions);
133 if(newtrie) free(newtrie);
134 if(new_suffix) free(new_suffix);
138 /* There are three cases where the key and suffix share some
140 1. suffix is proper substring of key.
141 2. key is proper substring of suffix.
142 3. neither is proper substring of other.
144 For cases 2 and 3, I need to first split up the transition
145 based on the number of characters shared. Then, I can insert
146 the rest of the key into the next trie.
149 /* Count the number of characters shared between key
151 int chars_shared = 0;
152 while(key[chars_shared] && key[chars_shared] == suffix[chars_shared])
155 /* Case 2 or 3, split this sucker! */
156 if(chars_shared < strlen(suffix)) {
158 unsigned char *new_suffix1=NULL, *new_suffix2=NULL;
160 if(!(new_suffix1 = (unsigned char *)malloc(chars_shared+1)))
162 strncpy(new_suffix1, key, chars_shared);
163 new_suffix1[chars_shared] = 0;
164 if(!(new_suffix2 = (unsigned char *)strdup(suffix+chars_shared)))
166 if(!(newtrie = Trie_new()))
168 if(!(newtrie->transitions =
169 (Transition)malloc(sizeof(struct _Transition))))
171 newtrie->num_transitions = 1;
172 newtrie->transitions[0].next = transition->next;
173 newtrie->transitions[0].suffix = new_suffix2;
175 free(transition->suffix);
176 transition->suffix = new_suffix1;
177 transition->next = newtrie;
181 if(newtrie && newtrie->transitions) free(newtrie->transitions);
182 if(newtrie) free(newtrie);
183 if(new_suffix2) free(new_suffix2);
184 if(new_suffix1) free(new_suffix1);
188 retval = Trie_set(transition->next, key+chars_shared, value);
194 void Trie_del(Trie trie) {
198 for(i=0; i<trie->num_transitions; i++) {
199 Transition transition = &trie->transitions[i];
200 if(transition->suffix)
201 free(transition->suffix);
202 Trie_del(transition->next);
207 void *Trie_get(const Trie trie, const unsigned char *key) {
208 int first, last, mid;
214 /* The transitions are stored in alphabetical order. Do a binary
215 * search to find the proper one.
218 last = trie->num_transitions-1;
219 while(first <= last) {
220 Transition transition;
221 unsigned char *suffix;
223 mid = (first+last)/2;
224 transition = &trie->transitions[mid];
225 suffix = transition->suffix;
226 /* If suffix is a substring of key, then get the value from
229 c = strncmp(key, suffix, strlen(suffix));
235 return Trie_get(transition->next, key+strlen(suffix));
241 /* Mutually recursive, so need to make a forward declaration. */
243 _get_approximate_trie(const Trie trie, const unsigned char *key, const int k,
244 void (*callback)(const unsigned char *key,
246 const int mismatches,
249 const int mismatches,
250 unsigned char *current_key, const int max_key
254 _get_approximate_transition(const unsigned char *key,
256 const Transition transition,
257 const unsigned char *suffix,
258 void (*callback)(const unsigned char *key,
260 const int mismatches,
263 const int mismatches,
264 unsigned char *current_key, const int max_key
268 int prev_keylen = strlen(current_key);
270 /* Short circuit optimization. If there's too many characters to
271 possibly be a match, then don't even try to match things. */
272 if((int)(strlen(suffix) - strlen(key)) > k)
275 /* Match as many characters as possible. */
277 while(suffix[i] && (key[i] == suffix[i])) {
280 /* Check to make sure the key is not too long. BUG: If it is,
282 if((prev_keylen+i) >= max_key)
284 strncat(current_key, suffix, i);
286 /* If all the letters in the suffix matched, then move to the
289 _get_approximate_trie(transition->next, &key[i], k, callback, data,
290 mismatches, current_key, max_key);
292 /* Otherwise, try out different kinds of mismatches. */
294 int new_keylen = prev_keylen+i;
296 /* Letter replacement, skip the next letter in both the key and
298 if((new_keylen+1 < max_key) && key[i] && suffix[i]) {
299 current_key[new_keylen] = suffix[i];
300 current_key[new_keylen+1] = 0;
301 _get_approximate_transition(&key[i+1], k-1,
302 transition, &suffix[i+1],
304 mismatches+1, current_key, max_key);
305 current_key[new_keylen] = 0;
308 /* Insertion in key, skip the next letter in the key. */
310 _get_approximate_transition(&key[i+1], k-1,
311 transition, &suffix[i],
313 mismatches+1, current_key, max_key);
316 /* Deletion from key, skip the next letter in the suffix. */
317 if((new_keylen+1 < max_key) && suffix[i]) {
318 current_key[new_keylen] = suffix[i];
319 current_key[new_keylen+1] = 0;
320 _get_approximate_transition(&key[i], k-1,
321 transition, &suffix[i+1],
323 mismatches+1, current_key, max_key);
324 current_key[new_keylen] = 0;
327 current_key[prev_keylen] = 0;
331 _get_approximate_trie(const Trie trie, const unsigned char *key, const int k,
332 void (*callback)(const unsigned char *key,
334 const int mismatches,
337 const int mismatches,
338 unsigned char *current_key, const int max_key
343 /* If there's no more key to match, then I'm done. */
346 (*callback)(current_key, trie->value, mismatches, data);
348 /* If there are no more mismatches allowed, then fall back to the
351 void *value = Trie_get(trie, key);
353 int l = strlen(current_key);
354 /* Make sure I have enough space for the full key. */
355 if(l + strlen(key) < max_key) {
356 strcat(current_key, key);
357 (*callback)(current_key, value, mismatches, data);
360 /* BUG: Ran out of space for the key. This fails
361 silently, but should signal an error. */
364 /* If there are no more transitions, then all the characters left
365 in the key are mismatches. */
366 else if(!trie->num_transitions) {
367 if(trie->value && (strlen(key) <= k)) {
368 (*callback)(current_key, trie->value,
369 mismatches+strlen(key), data);
372 /* Otherwise, try to match each of the transitions. */
374 for(i=0; i<trie->num_transitions; i++) {
375 Transition transition = &trie->transitions[i];
376 unsigned char *suffix = transition->suffix;
377 _get_approximate_transition(key, k, transition, suffix,
379 mismatches, current_key, max_key);
387 Trie_get_approximate(const Trie trie, const unsigned char *key, const int k,
388 void (*callback)(const unsigned char *key,
390 const int mismatches,
396 _get_approximate_trie(trie, key, k, callback, data, 0, KEY,MAX_KEY_LENGTH);
399 int Trie_len(const Trie trie)
408 for(i=0; i<trie->num_transitions; i++) {
409 length += Trie_len(trie->transitions[i].next);
414 int Trie_has_key(const Trie trie, const unsigned char *key)
416 return Trie_get(trie, key) != NULL;
419 int Trie_has_prefix(const Trie trie, const unsigned char *prefix)
421 int first, last, mid;
427 /* The transitions are stored in alphabetical order. Do a binary
428 * search to find the proper one.
431 last = trie->num_transitions-1;
432 while(first <= last) {
433 Transition transition;
434 unsigned char *suffix;
435 int suffixlen, prefixlen, minlen;
437 mid = (first+last)/2;
438 transition = &trie->transitions[mid];
439 suffix = transition->suffix;
440 suffixlen = strlen(suffix);
441 prefixlen = strlen(prefix);
442 minlen = (suffixlen < prefixlen) ? suffixlen : prefixlen;
443 c = strncmp(prefix, suffix, minlen);
449 return Trie_has_prefix(transition->next, prefix+minlen);
455 _iterate_helper(const Trie trie,
456 void (*callback)(const unsigned char *key,
460 unsigned char *current_key, const int max_key)
464 (*callback)(current_key, trie->value, data);
465 for(i=0; i<trie->num_transitions; i++) {
466 Transition transition = &trie->transitions[i];
467 unsigned char *suffix = transition->suffix;
468 int keylen = strlen(current_key);
470 if(keylen + strlen(suffix) >= max_key) {
471 /* BUG: This will fail silently. It should raise some
475 strcat(current_key, suffix);
476 _iterate_helper(transition->next, callback, data,
477 current_key, max_key);
478 current_key[keylen] = 0;
483 Trie_iterate(const Trie trie,
484 void (*callback)(const unsigned char *key,
490 _iterate_helper(trie, callback, data, KEY, MAX_KEY_LENGTH);
494 _with_prefix_helper(const Trie trie, const unsigned char *prefix,
495 void (*callback)(const unsigned char *key,
499 unsigned char *current_key, const int max_key)
501 int first, last, mid;
504 _iterate_helper(trie, callback, data, current_key, max_key);
508 /* The transitions are stored in alphabetical order. Do a binary
509 * search to find the proper one.
512 last = trie->num_transitions-1;
513 while(first <= last) {
514 Transition transition;
515 unsigned char *suffix;
516 int suffixlen, prefixlen, minlen;
518 mid = (first+last)/2;
519 transition = &trie->transitions[mid];
520 suffix = transition->suffix;
521 suffixlen = strlen(suffix);
522 prefixlen = strlen(prefix);
523 minlen = (suffixlen < prefixlen) ? suffixlen : prefixlen;
524 c = strncmp(prefix, suffix, minlen);
530 int keylen = strlen(current_key);
531 if(keylen + minlen >= max_key) {
532 /* BUG: This will fail silently. It should raise some
536 strncat(current_key, suffix, minlen);
537 _with_prefix_helper(transition->next, prefix+minlen,
538 callback, data, current_key, max_key);
539 current_key[keylen] = 0;
546 Trie_with_prefix(const Trie trie, const unsigned char *prefix,
547 void (*callback)(const unsigned char *key,
554 _with_prefix_helper(trie, prefix, callback, data, KEY, MAX_KEY_LENGTH);
559 /* Need to declare _serialize_transition here so it can be called from
561 int _serialize_transition(const Transition transition,
562 int (*write)(const void *towrite, const int length,
564 int (*write_value)(const void *value, void *data),
567 /* This library also provides code for flattening tries so that they
568 * can be saved and read back in later. The format of a serialized
570 * TYPE NBYTES DESCRIPTION
571 * byte 1 Whether or not there is a value
572 * variable variable If there is a value, let the client store it.
573 * byte 1 Number of transitions for this Trie.
574 * transition variable
575 * int 4 Number of characters in the suffix.
576 * suffix variable the suffix for this transition
577 * byte 1 Whether or not there is a trie
578 * trie variable Recursively points to another trie.
580 * The number of bytes and the endian may vary from platform to
584 int _serialize_trie(const Trie trie,
585 int (*write)(const void *towrite, const int length,
587 int (*write_value)(const void *value, void *data),
591 unsigned char has_value;
593 has_value = (trie->value != NULL);
594 if(!(*write)(&has_value, sizeof(has_value), data))
597 if(!(*write_value)(trie->value, data))
601 if(!(*write)(&trie->num_transitions, sizeof(trie->num_transitions), data))
603 for(i=0; i<trie->num_transitions; i++) {
604 if(!_serialize_transition(&trie->transitions[i],
605 write, write_value, data))
612 int _serialize_transition(const Transition transition,
613 int (*write)(const void *towrite, const int length,
615 int (*write_value)(const void *value, void *data),
619 unsigned char has_trie;
621 suffixlen = strlen(transition->suffix);
622 if(!(*write)(&suffixlen, sizeof(suffixlen), data))
624 if(!(*write)(transition->suffix, suffixlen, data))
627 has_trie = (transition->next != NULL);
628 if(!(*write)(&has_trie, sizeof(has_trie), data))
631 if(!_serialize_trie(transition->next, write, write_value, data))
637 int Trie_serialize(const Trie trie,
638 int (*write)(const void *towrite, const int length,
640 int (*write_value)(const void *value, void *data),
643 int success = _serialize_trie(trie, write, write_value, data);
644 (*write)(NULL, 0, data);
648 int _deserialize_transition(Transition transition,
649 int (*read)(void *wasread, const int length,
651 void *(*read_value)(void *data),
654 int _deserialize_trie(Trie trie,
655 int (*read)(void *wasread, const int length, void *data),
656 void *(*read_value)(void *data),
660 unsigned char has_value;
662 if(!(*read)(&has_value, sizeof(has_value), data))
663 goto _deserialize_trie_error;
664 if(has_value != 0 && has_value != 1)
665 goto _deserialize_trie_error;
667 if(!(trie->value = (*read_value)(data)))
668 goto _deserialize_trie_error;
670 if(!(*read)(&trie->num_transitions, sizeof(trie->num_transitions), data))
671 goto _deserialize_trie_error;
672 if(!(trie->transitions =
673 malloc(trie->num_transitions*sizeof(struct _Transition))))
674 goto _deserialize_trie_error;
675 for(i=0; i<trie->num_transitions; i++) {
676 if(!_deserialize_transition(&trie->transitions[i],
677 read, read_value, data))
678 goto _deserialize_trie_error;
682 _deserialize_trie_error:
683 trie->num_transitions = 0;
684 if(trie->transitions) {
685 free(trie->transitions);
686 trie->transitions = NULL;
692 int _deserialize_transition(Transition transition,
693 int (*read)(void *wasread, const int length,
695 void *(*read_value)(void *data),
699 unsigned char has_trie;
701 if(!(*read)(&suffixlen, sizeof(suffixlen), data))
702 goto _deserialize_transition_error;
703 if(suffixlen < 0 || suffixlen >= MAX_KEY_LENGTH)
704 goto _deserialize_transition_error;
705 if(!(*read)(KEY, suffixlen, data))
706 goto _deserialize_transition_error;
708 if(!(transition->suffix = (unsigned char *)strdup(KEY)))
709 goto _deserialize_transition_error;
710 if(!(*read)(&has_trie, sizeof(has_trie), data))
711 goto _deserialize_transition_error;
712 if(has_trie != 0 && has_trie != 1)
713 goto _deserialize_transition_error;
715 transition->next = Trie_new();
716 if(!_deserialize_trie(transition->next, read, read_value, data))
717 goto _deserialize_transition_error;
721 _deserialize_transition_error:
722 if(transition->suffix) {
723 free(transition->suffix);
724 transition->suffix = NULL;
726 if(transition->next) {
727 Trie_del(transition->next);
728 transition->next = NULL;
733 Trie Trie_deserialize(int (*read)(void *wasread, const int length, void *data),
734 void *(*read_value)(void *data),
737 Trie trie = Trie_new();
738 if(!_deserialize_trie(trie, read, read_value, data)) {
748 printf("Hello world!\n");
751 printf("New trie %p\n", trie);
752 Trie_set(trie, "hello world", "s1");
753 Trie_set(trie, "bye", "s2");
754 Trie_set(trie, "hell sucks", "s3");
755 Trie_set(trie, "hebee", "s4");
757 printf("%s\n", (char *)Trie_get(trie, "hello world"));
758 printf("%s\n", (char *)Trie_get(trie, "bye"));
759 printf("%s\n", (char *)Trie_get(trie, "hell sucks"));
760 printf("%s\n", (char *)Trie_get(trie, "hebee"));
762 Trie_set(trie, "blah", "s5");
763 printf("%s\n", (char *)Trie_get(trie, "blah"));
765 printf("%p\n", Trie_get(trie, "foobar"));
766 printf("%d\n", Trie_len(trie));
768 Trie_set(trie, "blah", "snew");
769 printf("%s\n", (char *)Trie_get(trie, "blah"));