Mac binaries
[jabaws.git] / website / archive / binaries / mac / src / globplot / biopython-1.50 / Bio / trie.c
diff --git a/website/archive/binaries/mac/src/globplot/biopython-1.50/Bio/trie.c b/website/archive/binaries/mac/src/globplot/biopython-1.50/Bio/trie.c
new file mode 100644 (file)
index 0000000..d73d784
--- /dev/null
@@ -0,0 +1,778 @@
+#include <stdio.h>    /* printf */
+#include <stdlib.h>   /* malloc */
+#include <string.h>   /* strcmp, strlen */
+
+#include "trie.h"
+
+/* The following is necessary to make sure that trie.pyd won't link
+ * to msvcrt.dll in addition to msvcr71.dll on Windows.
+ * See Bug #1767 on Bugzilla.
+ */
+#ifdef __MINGW32__
+#  define strdup _strdup
+#endif
+
+struct _Transition;   /* Forward declaration, needed in _Trie. */
+
+
+/* _Trie is a recursive data structure.  A _Trie contains zero or more
+ * _Transitions that lead to more _Tries.  The transitions are stored
+ * in alphabetical order of the suffix member of the data structure.
+ * _Trie also contains a pointer called value where the user can store
+ * arbitrary data.  If value is NULL, then no data is stored here.
+ */
+struct _Trie {
+    struct _Transition *transitions;
+    unsigned char num_transitions;
+    void *value;   /* specified by user, never freed or allocated by me! */
+};
+
+/* _Transition holds information about the transitions leading from
+ * one _Trie to another.  The trie structure here is different from
+ * typical ones, because the transitions between nodes can contain
+ * strings of arbitrary length, not just single characters.  Suffix is
+ * the string that is matched from one node to the next.
+ */
+typedef struct _Transition {
+    unsigned char *suffix;
+    Trie next;
+} *Transition;
+
+
+#define MAX_KEY_LENGTH 1000
+static unsigned char KEY[MAX_KEY_LENGTH];
+
+
+Trie Trie_new(void) {
+    Trie trie;
+
+    if(!(trie = (Trie)malloc(sizeof(struct _Trie))))
+       return NULL;
+    trie->transitions = NULL;
+    trie->num_transitions = 0;
+    trie->value = NULL;
+    return trie;
+}
+
+int Trie_set(Trie trie, const unsigned char *key, const void *value) {
+    int i;
+    Transition transition=NULL;
+    unsigned char *suffix=NULL;
+    int retval = 0;
+    int first, last, mid;
+
+    if(!key[0]) {
+       trie->value = (void *)value;
+       return 0;
+    }
+
+    /* Insert the key in alphabetical order.  Do a binary search to
+       find the proper place. */
+    first = 0;
+    last = trie->num_transitions-1;
+    i = -1;
+    while(first <= last) {
+       mid = (first+last)/2;
+       transition = &trie->transitions[mid];
+       suffix = transition->suffix;
+       if(key[0] < suffix[0])
+           last = mid-1;
+       else if(key[0] > suffix[0])
+           first = mid+1;
+       else {
+           i = mid;
+           break;
+       }
+    }
+
+    /* If no place was found for it, then the indexes will be in the
+       order last,first.  Place it at index first. */
+    if(i == -1)
+       i = first;
+
+    /* If nothing matches, then insert a new trie here. */
+    if((i >= trie->num_transitions) || (key[0] != suffix[0])) {
+       unsigned char *new_suffix=NULL;
+       Trie newtrie=NULL;
+       Transition new_transitions=NULL;
+
+       /* Create some variables for the new transition.  I'm going to
+          allocate these first so that if I can detect memory errors
+          before I mess up the data structure of the transitions.
+       */
+       if(!(new_suffix = (unsigned char *)strdup(key)))
+           goto insert_memerror;
+       if(!(newtrie = Trie_new()))
+           goto insert_memerror;
+
+       /* Create some space for the next transition.  Allocate some
+          memory and shift the old transitions over to make room for
+          this one.
+       */
+       if(!(new_transitions = malloc(sizeof(struct _Transition) *
+                                     (trie->num_transitions+1))))
+           goto insert_memerror;
+       memcpy(new_transitions, trie->transitions,
+              sizeof(struct _Transition)*i);
+       memcpy(&new_transitions[i+1], &trie->transitions[i],
+              sizeof(struct _Transition)*(trie->num_transitions-i));
+       free(trie->transitions);
+       trie->transitions = new_transitions;
+       new_transitions = NULL;
+       trie->num_transitions += 1;
+
+       /* Initialize the new transition. */
+       transition = &trie->transitions[i];
+       transition->suffix = new_suffix;
+       transition->next = newtrie;
+       transition->next->value = (void *)value;
+
+       if(0) {
+       insert_memerror:
+           if(new_transitions) free(new_transitions);
+           if(newtrie) free(newtrie);
+           if(new_suffix) free(new_suffix);
+           return 1;
+       }
+    } 
+    /* There are three cases where the key and suffix share some
+       letters. 
+       1.  suffix is proper substring of key.
+       2.  key is proper substring of suffix.
+       3.  neither is proper substring of other.
+
+       For cases 2 and 3, I need to first split up the transition
+       based on the number of characters shared.  Then, I can insert
+       the rest of the key into the next trie.
+    */
+    else {
+       /* Count the number of characters shared between key
+          and suffix. */
+       int chars_shared = 0;
+       while(key[chars_shared] && key[chars_shared] == suffix[chars_shared])
+           chars_shared++;
+
+       /* Case 2 or 3, split this sucker! */
+       if(chars_shared < strlen(suffix)) {
+           Trie newtrie=NULL;
+           unsigned char *new_suffix1=NULL, *new_suffix2=NULL;
+
+           if(!(new_suffix1 = (unsigned char *)malloc(chars_shared+1)))
+               goto split_memerror;
+           strncpy(new_suffix1, key, chars_shared);
+           new_suffix1[chars_shared] = 0;
+           if(!(new_suffix2 = (unsigned char *)strdup(suffix+chars_shared)))
+               goto split_memerror;
+           if(!(newtrie = Trie_new()))
+               goto split_memerror;
+           if(!(newtrie->transitions = 
+                (Transition)malloc(sizeof(struct _Transition))))
+               goto split_memerror;
+           newtrie->num_transitions = 1;
+           newtrie->transitions[0].next = transition->next;
+           newtrie->transitions[0].suffix = new_suffix2;
+
+           free(transition->suffix);
+           transition->suffix = new_suffix1;
+           transition->next = newtrie;
+
+           if(0) {
+           split_memerror:
+               if(newtrie && newtrie->transitions) free(newtrie->transitions);
+               if(newtrie) free(newtrie);
+               if(new_suffix2) free(new_suffix2);
+               if(new_suffix1) free(new_suffix1);
+               return 1;
+           }
+       }
+       retval = Trie_set(transition->next, key+chars_shared, value);
+    }
+
+    return retval;
+}
+
+void Trie_del(Trie trie) {
+    int i;
+    if(!trie)
+       return;
+    for(i=0; i<trie->num_transitions; i++) {
+       Transition transition = &trie->transitions[i];
+       if(transition->suffix)
+           free(transition->suffix);
+       Trie_del(transition->next);
+    }
+    free(trie);
+}
+
+void *Trie_get(const Trie trie, const unsigned char *key) {
+    int first, last, mid;
+
+    if(!key[0]) {
+       return trie->value;
+    }
+
+    /* The transitions are stored in alphabetical order.  Do a binary
+     * search to find the proper one.
+     */
+    first = 0;
+    last = trie->num_transitions-1;
+    while(first <= last) {
+       Transition transition;
+       unsigned char *suffix;
+       int c;
+       mid = (first+last)/2;
+       transition = &trie->transitions[mid];
+       suffix = transition->suffix;
+       /* If suffix is a substring of key, then get the value from
+          the next trie.
+       */
+       c = strncmp(key, suffix, strlen(suffix));
+       if(c < 0)
+           last = mid-1;
+       else if(c > 0)
+           first = mid+1;
+       else
+           return Trie_get(transition->next, key+strlen(suffix));
+    }
+    return NULL;
+}
+
+
+/* Mutually recursive, so need to make a forward declaration. */
+void
+_get_approximate_trie(const Trie trie, const unsigned char *key, const int k,
+                     void (*callback)(const unsigned char *key, 
+                                      const void *value,
+                                      const int mismatches,
+                                      void *data),
+                     void *data, 
+                     const int mismatches,
+                     unsigned char *current_key, const int max_key
+                     );
+
+void 
+_get_approximate_transition(const unsigned char *key, 
+                           const int k,
+                           const Transition transition, 
+                           const unsigned char *suffix,
+                           void (*callback)(const unsigned char *key, 
+                                            const void *value,
+                                            const int mismatches,
+                                            void *data),
+                           void *data, 
+                           const int mismatches,
+                           unsigned char *current_key, const int max_key
+                           )
+{
+    int i;
+    int prev_keylen = strlen(current_key);
+
+    /* Short circuit optimization.  If there's too many characters to
+       possibly be a match, then don't even try to match things. */
+    if((int)(strlen(suffix) - strlen(key)) > k)
+       return;
+
+    /* Match as many characters as possible. */
+    i = 0;
+    while(suffix[i] && (key[i] == suffix[i])) {
+       i++;
+    }
+    /* Check to make sure the key is not too long.  BUG: If it is,
+       fails silently. */
+    if((prev_keylen+i) >= max_key)
+       return;
+    strncat(current_key, suffix, i);
+
+    /* If all the letters in the suffix matched, then move to the
+       next trie. */
+    if(!suffix[i]) {
+       _get_approximate_trie(transition->next, &key[i], k, callback, data,
+                             mismatches, current_key, max_key);
+    }
+    /* Otherwise, try out different kinds of mismatches. */
+    else if(k) {
+       int new_keylen = prev_keylen+i;
+
+       /* Letter replacement, skip the next letter in both the key and
+          suffix. */
+       if((new_keylen+1 < max_key) && key[i] && suffix[i]) {
+           current_key[new_keylen] = suffix[i];
+           current_key[new_keylen+1] = 0;
+           _get_approximate_transition(&key[i+1], k-1,
+                                       transition, &suffix[i+1],
+                                       callback, data,
+                                       mismatches+1, current_key, max_key);
+           current_key[new_keylen] = 0;
+       }
+
+       /* Insertion in key, skip the next letter in the key. */
+       if(key[i]) {
+           _get_approximate_transition(&key[i+1], k-1, 
+                                       transition, &suffix[i],
+                                       callback, data,
+                                       mismatches+1, current_key, max_key);
+       }
+
+       /* Deletion from key, skip the next letter in the suffix. */
+       if((new_keylen+1 < max_key) && suffix[i]) {
+           current_key[new_keylen] = suffix[i];
+           current_key[new_keylen+1] = 0;
+           _get_approximate_transition(&key[i], k-1,
+                                       transition, &suffix[i+1],
+                                       callback, data,
+                                       mismatches+1, current_key, max_key);
+           current_key[new_keylen] = 0;
+       }
+    }
+    current_key[prev_keylen] = 0;
+}
+
+void
+_get_approximate_trie(const Trie trie, const unsigned char *key, const int k,
+                     void (*callback)(const unsigned char *key, 
+                                      const void *value,
+                                      const int mismatches,
+                                      void *data),
+                     void *data, 
+                     const int mismatches,
+                     unsigned char *current_key, const int max_key
+                     )
+{
+    int i;
+
+    /* If there's no more key to match, then I'm done. */
+    if(!key[0]) {
+       if(trie->value)
+           (*callback)(current_key, trie->value, mismatches, data);
+    }
+    /* If there are no more mismatches allowed, then fall back to the
+       faster Trie_get. */
+    else if(!k) {
+       void *value = Trie_get(trie, key);
+       if(value) {
+           int l = strlen(current_key);
+           /* Make sure I have enough space for the full key. */
+           if(l + strlen(key) < max_key) {
+               strcat(current_key, key);
+               (*callback)(current_key, value, mismatches, data);
+               current_key[l] = 0;
+           }
+           /* BUG: Ran out of space for the key.  This fails
+              silently, but should signal an error. */
+       }
+    }
+    /* If there are no more transitions, then all the characters left
+       in the key are mismatches. */
+    else if(!trie->num_transitions) {
+       if(trie->value && (strlen(key) <= k)) {
+           (*callback)(current_key, trie->value, 
+                       mismatches+strlen(key), data);
+       }
+    }
+    /* Otherwise, try to match each of the transitions. */
+    else {
+       for(i=0; i<trie->num_transitions; i++) {
+           Transition transition = &trie->transitions[i];
+           unsigned char *suffix = transition->suffix;
+           _get_approximate_transition(key, k, transition, suffix,
+                                       callback, data, 
+                                       mismatches, current_key, max_key);
+       }
+    }
+
+}
+
+
+void 
+Trie_get_approximate(const Trie trie, const unsigned char *key, const int k,
+                    void (*callback)(const unsigned char *key, 
+                                     const void *value,
+                                     const int mismatches,
+                                     void *data),
+                    void *data
+                    )
+{
+    KEY[0] = 0;
+    _get_approximate_trie(trie, key, k, callback, data, 0, KEY,MAX_KEY_LENGTH);
+}
+
+int Trie_len(const Trie trie) 
+{
+    int i;
+    int length = 0;
+    
+    if(!trie)
+       return 0;
+    if(trie->value)
+       length += 1;
+    for(i=0; i<trie->num_transitions; i++) {
+       length += Trie_len(trie->transitions[i].next);
+    }
+    return length;
+}
+
+int Trie_has_key(const Trie trie, const unsigned char *key) 
+{
+    return Trie_get(trie, key) != NULL;
+}
+
+int Trie_has_prefix(const Trie trie, const unsigned char *prefix) 
+{
+    int first, last, mid;
+
+    if(!prefix[0]) {
+       return 1;
+    }
+
+    /* The transitions are stored in alphabetical order.  Do a binary
+     * search to find the proper one.
+     */
+    first = 0;
+    last = trie->num_transitions-1;
+    while(first <= last) {
+       Transition transition;
+       unsigned char *suffix;
+       int suffixlen, prefixlen, minlen;
+       int c;
+       mid = (first+last)/2;
+       transition = &trie->transitions[mid];
+       suffix = transition->suffix;
+       suffixlen = strlen(suffix);
+       prefixlen = strlen(prefix);
+       minlen = (suffixlen < prefixlen) ? suffixlen : prefixlen;
+       c = strncmp(prefix, suffix, minlen);
+       if(c < 0)
+           last = mid-1;
+       else if(c > 0)
+           first = mid+1;
+       else
+           return Trie_has_prefix(transition->next, prefix+minlen);
+    }
+    return 0;
+}
+
+static void 
+_iterate_helper(const Trie trie, 
+               void (*callback)(const unsigned char *key, 
+                                const void *value,
+                                void *data),
+               void *data,
+               unsigned char *current_key, const int max_key)
+{
+    int i;
+    if(trie->value)
+       (*callback)(current_key, trie->value, data);
+    for(i=0; i<trie->num_transitions; i++) {
+       Transition transition = &trie->transitions[i];
+       unsigned char *suffix = transition->suffix;
+       int keylen = strlen(current_key);
+
+       if(keylen + strlen(suffix) >= max_key) {
+           /* BUG: This will fail silently.  It should raise some
+              sort of error. */
+           continue;
+       }
+       strcat(current_key, suffix);
+       _iterate_helper(transition->next, callback, data, 
+                       current_key, max_key);
+       current_key[keylen] = 0;
+    }
+}
+
+void 
+Trie_iterate(const Trie trie, 
+            void (*callback)(const unsigned char *key, 
+                             const void *value,
+                             void *data),
+            void *data)
+{
+    KEY[0] = 0;
+    _iterate_helper(trie, callback, data, KEY, MAX_KEY_LENGTH);
+}
+
+static void
+_with_prefix_helper(const Trie trie, const unsigned char *prefix,
+                   void (*callback)(const unsigned char *key, 
+                                    const void *value,
+                                    void *data),
+                   void *data,
+                   unsigned char *current_key, const int max_key)
+{
+    int first, last, mid;
+
+    if(!prefix[0]) {
+       _iterate_helper(trie, callback, data, current_key, max_key);
+       return;
+    }
+
+    /* The transitions are stored in alphabetical order.  Do a binary
+     * search to find the proper one.
+     */
+    first = 0;
+    last = trie->num_transitions-1;
+    while(first <= last) {
+       Transition transition;
+       unsigned char *suffix;
+       int suffixlen, prefixlen, minlen;
+       int c;
+       mid = (first+last)/2;
+       transition = &trie->transitions[mid];
+       suffix = transition->suffix;
+       suffixlen = strlen(suffix);
+       prefixlen = strlen(prefix);
+       minlen = (suffixlen < prefixlen) ? suffixlen : prefixlen;
+       c = strncmp(prefix, suffix, minlen);
+       if(c < 0)
+           last = mid-1;
+       else if(c > 0)
+           first = mid+1;
+       else {
+           int keylen = strlen(current_key);
+           if(keylen + minlen >= max_key) {
+               /* BUG: This will fail silently.  It should raise some
+                  sort of error. */
+               break;
+           }
+           strncat(current_key, suffix, minlen);
+           _with_prefix_helper(transition->next, prefix+minlen,
+                               callback, data, current_key, max_key);
+           current_key[keylen] = 0;
+           break;
+       }
+    }
+}
+
+void 
+Trie_with_prefix(const Trie trie, const unsigned char *prefix,
+                void (*callback)(const unsigned char *key, 
+                                 const void *value,
+                                 void *data),
+                void *data
+                )
+{
+    KEY[0] = 0;
+    _with_prefix_helper(trie, prefix, callback, data, KEY, MAX_KEY_LENGTH);
+}
+
+
+
+/* Need to declare _serialize_transition here so it can be called from
+   _serialize_trie. */
+int _serialize_transition(const Transition transition, 
+                         int (*write)(const void *towrite, const int length,
+                                      void *data),
+                         int (*write_value)(const void *value, void *data),
+                         void *data);
+
+/* This library also provides code for flattening tries so that they
+ * can be saved and read back in later.  The format of a serialized
+ * trie is:
+ * TYPE        NBYTES    DESCRIPTION
+ * byte        1         Whether or not there is a value
+ * variable    variable  If there is a value, let the client store it.
+ * byte        1         Number of transitions for this Trie.
+ * transition  variable
+ *   int       4         Number of characters in the suffix.
+ *   suffix    variable  the suffix for this transition
+ *   byte      1         Whether or not there is a trie
+ *   trie      variable  Recursively points to another trie.
+ * 
+ * The number of bytes and the endian may vary from platform to
+ * platform.
+ */
+
+int _serialize_trie(const Trie trie, 
+                   int (*write)(const void *towrite, const int length,
+                                void *data),
+                   int (*write_value)(const void *value, void *data),
+                   void *data)
+{
+    int i;
+    unsigned char has_value;
+
+    has_value = (trie->value != NULL);
+    if(!(*write)(&has_value, sizeof(has_value), data))
+       return 0;
+    if(has_value) {
+       if(!(*write_value)(trie->value, data))
+           return 0;
+    }
+
+    if(!(*write)(&trie->num_transitions, sizeof(trie->num_transitions), data))
+       return 0;
+    for(i=0; i<trie->num_transitions; i++) {
+       if(!_serialize_transition(&trie->transitions[i], 
+                                 write, write_value, data))
+           return 0;
+    }
+
+    return 1;
+}
+
+int _serialize_transition(const Transition transition, 
+                         int (*write)(const void *towrite, const int length,
+                                      void *data),
+                         int (*write_value)(const void *value, void *data),
+                         void *data)
+{
+    int suffixlen;
+    unsigned char has_trie;
+
+    suffixlen = strlen(transition->suffix);
+    if(!(*write)(&suffixlen, sizeof(suffixlen), data))
+       return 0;
+    if(!(*write)(transition->suffix, suffixlen, data))
+       return 0;
+
+    has_trie = (transition->next != NULL);
+    if(!(*write)(&has_trie, sizeof(has_trie), data))
+       return 0;
+    if(has_trie) {
+       if(!_serialize_trie(transition->next, write, write_value, data))
+           return 0;
+    }
+    return 1;
+}
+
+int Trie_serialize(const Trie trie, 
+                  int (*write)(const void *towrite, const int length, 
+                               void *data),
+                  int (*write_value)(const void *value, void *data),
+                  void *data)
+{
+    int success = _serialize_trie(trie, write, write_value, data);
+    (*write)(NULL, 0, data);
+    return success;
+}
+
+int _deserialize_transition(Transition transition,
+                           int (*read)(void *wasread, const int length, 
+                                       void *data),
+                           void *(*read_value)(void *data),
+                           void *data);
+
+int _deserialize_trie(Trie trie, 
+                     int (*read)(void *wasread, const int length, void *data),
+                     void *(*read_value)(void *data),
+                     void *data)
+{
+    int i;
+    unsigned char has_value;
+
+    if(!(*read)(&has_value, sizeof(has_value), data))
+       goto _deserialize_trie_error;
+    if(has_value != 0 && has_value != 1)
+       goto _deserialize_trie_error;
+    if(has_value) {
+       if(!(trie->value = (*read_value)(data)))
+           goto _deserialize_trie_error;
+    }
+    if(!(*read)(&trie->num_transitions, sizeof(trie->num_transitions), data))
+       goto _deserialize_trie_error;
+    if(!(trie->transitions = 
+        malloc(trie->num_transitions*sizeof(struct _Transition))))
+       goto _deserialize_trie_error;
+    for(i=0; i<trie->num_transitions; i++) {
+       if(!_deserialize_transition(&trie->transitions[i], 
+                                   read, read_value, data))
+           goto _deserialize_trie_error;
+    }
+    return 1;
+   
+ _deserialize_trie_error:
+    trie->num_transitions = 0;
+    if(trie->transitions) {
+       free(trie->transitions);
+       trie->transitions = NULL;
+    }
+    trie->value = NULL;
+    return 0;
+}
+
+int _deserialize_transition(Transition transition,
+                           int (*read)(void *wasread, const int length, 
+                                       void *data),
+                           void *(*read_value)(void *data),
+                           void *data)
+{
+    int suffixlen;
+    unsigned char has_trie;
+    
+    if(!(*read)(&suffixlen, sizeof(suffixlen), data))
+       goto _deserialize_transition_error;
+    if(suffixlen < 0 || suffixlen >= MAX_KEY_LENGTH)
+       goto _deserialize_transition_error;
+    if(!(*read)(KEY, suffixlen, data))
+       goto _deserialize_transition_error;
+    KEY[suffixlen] = 0;
+    if(!(transition->suffix = (unsigned char *)strdup(KEY)))
+       goto _deserialize_transition_error;
+    if(!(*read)(&has_trie, sizeof(has_trie), data))
+       goto _deserialize_transition_error;
+    if(has_trie != 0 && has_trie != 1)
+       goto _deserialize_transition_error;
+    if(has_trie) {
+       transition->next = Trie_new();
+       if(!_deserialize_trie(transition->next, read, read_value, data))
+           goto _deserialize_transition_error;
+    }
+    return 1;
+
+ _deserialize_transition_error:
+    if(transition->suffix) {
+       free(transition->suffix);
+       transition->suffix = NULL;
+    }
+    if(transition->next) {
+       Trie_del(transition->next);
+       transition->next = NULL;
+    }
+    return 0;
+}
+
+Trie Trie_deserialize(int (*read)(void *wasread, const int length, void *data),
+                     void *(*read_value)(void *data),
+                     void *data)
+{
+    Trie trie = Trie_new();
+    if(!_deserialize_trie(trie, read, read_value, data)) {
+       Trie_del(trie);
+       return NULL;
+    }
+    return trie;
+}
+
+void test(void) {
+    Trie trie;
+
+    printf("Hello world!\n");
+
+    trie = Trie_new();
+    printf("New trie %p\n", trie);
+    Trie_set(trie, "hello world", "s1");
+    Trie_set(trie, "bye", "s2");
+    Trie_set(trie, "hell sucks", "s3");
+    Trie_set(trie, "hebee", "s4");
+
+    printf("%s\n", (char *)Trie_get(trie, "hello world"));
+    printf("%s\n", (char *)Trie_get(trie, "bye"));
+    printf("%s\n", (char *)Trie_get(trie, "hell sucks"));
+    printf("%s\n", (char *)Trie_get(trie, "hebee"));
+
+    Trie_set(trie, "blah", "s5");
+    printf("%s\n", (char *)Trie_get(trie, "blah"));
+
+    printf("%p\n", Trie_get(trie, "foobar"));
+    printf("%d\n", Trie_len(trie));
+
+    Trie_set(trie, "blah", "snew");
+    printf("%s\n", (char *)Trie_get(trie, "blah"));
+
+    Trie_del(trie);
+}
+
+#if 0
+int main() {
+    test();
+}
+#endif