+++ /dev/null
-"""
-Given a trie, find all occurrences of a word in the trie in a string.
-
-Like searching a string for a substring, except that the substring is
-any word in a trie.
-
-Functions:
-match Find longest key in a trie matching the beginning of the string.
-match_all Find all keys in a trie matching the beginning of the string.
-find Find keys in a trie matching anywhere in a string.
-find_words Find keys in a trie matching whole words in a string.
-
-"""
-import string
-import re
-
-def match(string, trie):
- """match(string, trie) -> longest key or None
-
- Find the longest key in the trie that matches the beginning of the
- string.
-
- """
- longest = None
- for i in range(len(string)):
- substr = string[:i+1]
- if not trie.has_prefix(substr):
- break
- if trie.has_key(substr):
- longest = substr
- return longest
-
-def match_all(string, trie):
- """match_all(string, trie) -> list of keys
-
- Find all the keys in the trie that matches the beginning of the
- string.
-
- """
- matches = []
- for i in range(len(string)):
- substr = string[:i+1]
- if not trie.has_prefix(substr):
- break
- if trie.has_key(substr):
- matches.append(substr)
- return matches
-
-def find(string, trie):
- """find(string, trie) -> list of tuples (key, start, end)
-
- Find all the keys in the trie that match anywhere in the string.
-
- """
- results = []
- start = 0 # index to start the search
- while start < len(string):
- # Look for a match.
- keys = match_all(string[start:], trie)
- for key in keys:
- results.append((key, start, start+len(key)))
- start += 1
- return results
-
-DEFAULT_BOUNDARY_CHARS = string.punctuation + string.whitespace
-
-def find_words(string, trie):
- """find_words(string, trie) -> list of tuples (key, start, end)
-
- Find all the keys in the trie that match full words in the string.
- Word boundaries are defined as any punctuation or whitespace.
-
- """
- _boundary_re = re.compile(r"[%s]+" % re.escape(DEFAULT_BOUNDARY_CHARS))
-
- results = []
- start = 0 # index of word boundary
- while start < len(string):
- # Look for a match.
- keys = match_all(string[start:], trie)
- for key in keys:
- l = len(key)
- # Make sure it ends at a boundary.
- if start+l == len(string) or \
- _boundary_re.match(string[start+l]):
- results.append((key, start, start+l))
- # Move forward to the next boundary.
- m = _boundary_re.search(string, start)
- if m is None:
- break
- start = m.end()
- return results