--- /dev/null
+"""
+Given a trie, find all occurrences of a word in the trie in a string.
+
+Like searching a string for a substring, except that the substring is
+any word in a trie.
+
+Functions:
+match Find longest key in a trie matching the beginning of the string.
+match_all Find all keys in a trie matching the beginning of the string.
+find Find keys in a trie matching anywhere in a string.
+find_words Find keys in a trie matching whole words in a string.
+
+"""
+import string
+import re
+
+def match(string, trie):
+ """match(string, trie) -> longest key or None
+
+ Find the longest key in the trie that matches the beginning of the
+ string.
+
+ """
+ longest = None
+ for i in range(len(string)):
+ substr = string[:i+1]
+ if not trie.has_prefix(substr):
+ break
+ if trie.has_key(substr):
+ longest = substr
+ return longest
+
+def match_all(string, trie):
+ """match_all(string, trie) -> list of keys
+
+ Find all the keys in the trie that matches the beginning of the
+ string.
+
+ """
+ matches = []
+ for i in range(len(string)):
+ substr = string[:i+1]
+ if not trie.has_prefix(substr):
+ break
+ if trie.has_key(substr):
+ matches.append(substr)
+ return matches
+
+def find(string, trie):
+ """find(string, trie) -> list of tuples (key, start, end)
+
+ Find all the keys in the trie that match anywhere in the string.
+
+ """
+ results = []
+ start = 0 # index to start the search
+ while start < len(string):
+ # Look for a match.
+ keys = match_all(string[start:], trie)
+ for key in keys:
+ results.append((key, start, start+len(key)))
+ start += 1
+ return results
+
+DEFAULT_BOUNDARY_CHARS = string.punctuation + string.whitespace
+
+def find_words(string, trie):
+ """find_words(string, trie) -> list of tuples (key, start, end)
+
+ Find all the keys in the trie that match full words in the string.
+ Word boundaries are defined as any punctuation or whitespace.
+
+ """
+ _boundary_re = re.compile(r"[%s]+" % re.escape(DEFAULT_BOUNDARY_CHARS))
+
+ results = []
+ start = 0 # index of word boundary
+ while start < len(string):
+ # Look for a match.
+ keys = match_all(string[start:], trie)
+ for key in keys:
+ l = len(key)
+ # Make sure it ends at a boundary.
+ if start+l == len(string) or \
+ _boundary_re.match(string[start+l]):
+ results.append((key, start, start+l))
+ # Move forward to the next boundary.
+ m = _boundary_re.search(string, start)
+ if m is None:
+ break
+ start = m.end()
+ return results