Parsers/spark.py

   1 #  Copyright (c) 1998-2000 John Aycock
   2 #
   3 #  Permission is hereby granted, free of charge, to any person obtaining
   4 #  a copy of this software and associated documentation files (the
   5 #  "Software"), to deal in the Software without restriction, including
   6 #  without limitation the rights to use, copy, modify, merge, publish,
   7 #  distribute, sublicense, and/or sell copies of the Software, and to
   8 #  permit persons to whom the Software is furnished to do so, subject to
   9 #  the following conditions:
  10 #
  11 #  The above copyright notice and this permission notice shall be
  12 #  included in all copies or substantial portions of the Software.
  13 #
  14 #  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  15 #  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  16 #  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  17 #  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  18 #  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  19 #  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  20 #  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  21
  22 __version__ = 'SPARK-0.6.1'
  23
  24 import re
  25 import sys
  26
  27 def _namelist(instance):
  28     namelist, namedict, classlist = [], {}, [instance.__class__]
  29     for c in classlist:
  30         for b in c.__bases__:
  31             classlist.append(b)
  32         for name in dir(c):
  33             if name not in namedict:
  34                 namelist.append(name)
  35                 namedict[name] = 1
  36     return namelist
  37
  38 class GenericScanner:
  39     def __init__(self):
  40         pattern = self.reflect()
  41         self.re = re.compile(pattern, re.VERBOSE)
  42
  43         self.index2func = {}
  44         for name, number in self.re.groupindex.items():
  45             self.index2func[number-1] = getattr(self, 't_' + name)
  46
  47     def makeRE(self, name):
  48         doc = getattr(self, name).__doc__
  49         rv = '(?P<%s>%s)' % (name[2:], doc)
  50         return rv
  51
  52     def reflect(self):
  53         rv = []
  54         for name in _namelist(self):
  55             if name[:2] == 't_' and name != 't_default':
  56                 rv.append(self.makeRE(name))
  57
  58         rv.append(self.makeRE('t_default'))
  59         return '|'.join(rv)
  60
  61     def error(self, s, pos):
  62         print "Lexical error at position %s" % pos
  63         raise SystemExit
  64
  65     def tokenize(self, s):
  66         pos = 0
  67         n = len(s)
  68         while pos < n:
  69             m = self.re.match(s, pos)
  70             if m is None:
  71                 self.error(s, pos)
  72
  73             groups = m.groups()
  74             for i in range(len(groups)):
  75                 if groups[i] and i in self.index2func:
  76                     self.index2func[i](groups[i])
  77             pos = m.end()
  78
  79     def t_default(self, s):
  80         r'( . | \n )+'
  81         pass
  82
  83 class GenericParser:
  84     def __init__(self, start):
  85         self.rules = {}
  86         self.rule2func = {}
  87         self.rule2name = {}
  88         self.collectRules()
  89         self.startRule = self.augment(start)
  90         self.ruleschanged = 1
  91
  92     _START = 'START'
  93     _EOF = 'EOF'
  94
  95     #
  96     #  A hook for GenericASTBuilder and GenericASTMatcher.
  97     #
  98     def preprocess(self, rule, func):   return rule, func
  99
 100     def addRule(self, doc, func):
 101         rules = doc.split()
 102
 103         index = []
 104         for i in range(len(rules)):
 105             if rules[i] == '::=':
 106                 index.append(i-1)
 107         index.append(len(rules))
 108
 109         for i in range(len(index)-1):
 110             lhs = rules[index[i]]
 111             rhs = rules[index[i]+2:index[i+1]]
 112             rule = (lhs, tuple(rhs))
 113
 114             rule, fn = self.preprocess(rule, func)
 115
 116             if lhs in self.rules:
 117                 self.rules[lhs].append(rule)
 118             else:
 119                 self.rules[lhs] = [ rule ]
 120             self.rule2func[rule] = fn
 121             self.rule2name[rule] = func.__name__[2:]
 122         self.ruleschanged = 1
 123
 124     def collectRules(self):
 125         for name in _namelist(self):
 126             if name[:2] == 'p_':
 127                 func = getattr(self, name)
 128                 doc = func.__doc__
 129                 self.addRule(doc, func)
 130
 131     def augment(self, start):
 132         #
 133         #  Tempting though it is, this isn't made into a call
 134         #  to self.addRule() because the start rule shouldn't
 135         #  be subject to preprocessing.
 136         #
 137         startRule = (self._START, ( start, self._EOF ))
 138         self.rule2func[startRule] = lambda args: args[0]
 139         self.rules[self._START] = [ startRule ]
 140         self.rule2name[startRule] = ''
 141         return startRule
 142
 143     def makeFIRST(self):
 144         union = {}
 145         self.first = {}
 146
 147         for rulelist in self.rules.values():
 148             for lhs, rhs in rulelist:
 149                 if lhs not in self.first:
 150                     self.first[lhs] = {}
 151
 152                 if len(rhs) == 0:
 153                     self.first[lhs][None] = 1
 154                     continue
 155
 156                 sym = rhs[0]
 157                 if sym not in self.rules:
 158                     self.first[lhs][sym] = 1
 159                 else:
 160                     union[(sym, lhs)] = 1
 161         changes = 1
 162         while changes:
 163             changes = 0
 164             for src, dest in union.keys():
 165                 destlen = len(self.first[dest])
 166                 self.first[dest].update(self.first[src])
 167                 if len(self.first[dest]) != destlen:
 168                     changes = 1
 169
 170     #
 171     #  An Earley parser, as per J. Earley, "An Efficient Context-Free
 172     #  Parsing Algorithm", CACM 13(2), pp. 94-102.  Also J. C. Earley,
 173     #  "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis,
 174     #  Carnegie-Mellon University, August 1968, p. 27.
 175     #
 176
 177     def typestring(self, token):
 178         return None
 179
 180     def error(self, token):
 181         print "Syntax error at or near `%s' token" % token
 182         raise SystemExit
 183
 184     def parse(self, tokens):
 185         tree = {}
 186         tokens.append(self._EOF)
 187         states = { 0: [ (self.startRule, 0, 0) ] }
 188
 189         if self.ruleschanged:
 190             self.makeFIRST()
 191
 192         for i in xrange(len(tokens)):
 193             states[i+1] = []
 194
 195             if states[i] == []:
 196                 break
 197             self.buildState(tokens[i], states, i, tree)
 198
 199         #_dump(tokens, states)
 200
 201         if i < len(tokens)-1 or states[i+1] != [(self.startRule, 2, 0)]:
 202             del tokens[-1]
 203             self.error(tokens[i-1])
 204         rv = self.buildTree(tokens, tree, ((self.startRule, 2, 0), i+1))
 205         del tokens[-1]
 206         return rv
 207
 208     def buildState(self, token, states, i, tree):
 209         needsCompletion = {}
 210         state = states[i]
 211         predicted = {}
 212
 213         for item in state:
 214             rule, pos, parent = item
 215             lhs, rhs = rule
 216
 217             #
 218             #  A -> a . (completer)
 219             #
 220             if pos == len(rhs):
 221                 if len(rhs) == 0:
 222                     needsCompletion[lhs] = (item, i)
 223
 224                 for pitem in states[parent]:
 225                     if pitem is item:
 226                         break
 227
 228                     prule, ppos, pparent = pitem
 229                     plhs, prhs = prule
 230
 231                     if prhs[ppos:ppos+1] == (lhs,):
 232                         new = (prule,
 233                                ppos+1,
 234                                pparent)
 235                         if new not in state:
 236                             state.append(new)
 237                             tree[(new, i)] = [(item, i)]
 238                         else:
 239                             tree[(new, i)].append((item, i))
 240                 continue
 241
 242             nextSym = rhs[pos]
 243
 244             #
 245             #  A -> a . B (predictor)
 246             #
 247             if nextSym in self.rules:
 248                 #
 249                 #  Work on completer step some more; for rules
 250                 #  with empty RHS, the "parent state" is the
 251                 #  current state we're adding Earley items to,
 252                 #  so the Earley items the completer step needs
 253                 #  may not all be present when it runs.
 254                 #
 255                 if nextSym in needsCompletion:
 256                     new = (rule, pos+1, parent)
 257                     olditem_i = needsCompletion[nextSym]
 258                     if new not in state:
 259                         state.append(new)
 260                         tree[(new, i)] = [olditem_i]
 261                     else:
 262                         tree[(new, i)].append(olditem_i)
 263
 264                 #
 265                 #  Has this been predicted already?
 266                 #
 267                 if nextSym in predicted:
 268                     continue
 269                 predicted[nextSym] = 1
 270
 271                 ttype = token is not self._EOF and \
 272                     self.typestring(token) or \
 273                     None
 274                 if ttype is not None:
 275                     #
 276                     #  Even smarter predictor, when the
 277                     #  token's type is known.  The code is
 278                     #  grungy, but runs pretty fast.  Three
 279                     #  cases are looked for: rules with
 280                     #  empty RHS; first symbol on RHS is a
 281                     #  terminal; first symbol on RHS is a
 282                     #  nonterminal (and isn't nullable).
 283                     #
 284                     for prule in self.rules[nextSym]:
 285                         new = (prule, 0, i)
 286                         prhs = prule[1]
 287                         if len(prhs) == 0:
 288                             state.append(new)
 289                             continue
 290                         prhs0 = prhs[0]
 291                         if prhs0 not in self.rules:
 292                             if prhs0 != ttype:
 293                                 continue
 294                             else:
 295                                 state.append(new)
 296                                 continue
 297                         first = self.first[prhs0]
 298                         if None not in first and \
 299                            ttype not in first:
 300                             continue
 301                         state.append(new)
 302                     continue
 303
 304                 for prule in self.rules[nextSym]:
 305                     #
 306                     #  Smarter predictor, as per Grune &
 307                     #  Jacobs' _Parsing Techniques_.  Not
 308                     #  as good as FIRST sets though.
 309                     #
 310                     prhs = prule[1]
 311                     if len(prhs) > 0 and \
 312                        prhs[0] not in self.rules and \
 313                        token != prhs[0]:
 314                         continue
 315                     state.append((prule, 0, i))
 316
 317             #
 318             #  A -> a . c (scanner)
 319             #
 320             elif token == nextSym:
 321                 #assert new not in states[i+1]
 322                 states[i+1].append((rule, pos+1, parent))
 323
 324     def buildTree(self, tokens, tree, root):
 325         stack = []
 326         self.buildTree_r(stack, tokens, -1, tree, root)
 327         return stack[0]
 328
 329     def buildTree_r(self, stack, tokens, tokpos, tree, root):
 330         (rule, pos, parent), state = root
 331
 332         while pos > 0:
 333             want = ((rule, pos, parent), state)
 334             if want not in tree:
 335                 #
 336                 #  Since pos > 0, it didn't come from closure,
 337                 #  and if it isn't in tree[], then there must
 338                 #  be a terminal symbol to the left of the dot.
 339                 #  (It must be from a "scanner" step.)
 340                 #
 341                 pos = pos - 1
 342                 state = state - 1
 343                 stack.insert(0, tokens[tokpos])
 344                 tokpos = tokpos - 1
 345             else:
 346                 #
 347                 #  There's a NT to the left of the dot.
 348                 #  Follow the tree pointer recursively (>1
 349                 #  tree pointers from it indicates ambiguity).
 350                 #  Since the item must have come about from a
 351                 #  "completer" step, the state where the item
 352                 #  came from must be the parent state of the
 353                 #  item the tree pointer points to.
 354                 #
 355                 children = tree[want]
 356                 if len(children) > 1:
 357                     child = self.ambiguity(children)
 358                 else:
 359                     child = children[0]
 360
 361                 tokpos = self.buildTree_r(stack,
 362                               tokens, tokpos,
 363                               tree, child)
 364                 pos = pos - 1
 365                 (crule, cpos, cparent), cstate = child
 366                 state = cparent
 367
 368         lhs, rhs = rule
 369         result = self.rule2func[rule](stack[:len(rhs)])
 370         stack[:len(rhs)] = [result]
 371         return tokpos
 372
 373     def ambiguity(self, children):
 374         #
 375         #  XXX - problem here and in collectRules() if the same
 376         #    rule appears in >1 method.  But in that case the
 377         #    user probably gets what they deserve :-)  Also
 378         #    undefined results if rules causing the ambiguity
 379         #    appear in the same method.
 380         #
 381         sortlist = []
 382         name2index = {}
 383         for i in range(len(children)):
 384             ((rule, pos, parent), index) = children[i]
 385             lhs, rhs = rule
 386             name = self.rule2name[rule]
 387             sortlist.append((len(rhs), name))
 388             name2index[name] = i
 389         sortlist.sort()
 390         list = map(lambda (a,b): b, sortlist)
 391         return children[name2index[self.resolve(list)]]
 392
 393     def resolve(self, list):
 394         #
 395         #  Resolve ambiguity in favor of the shortest RHS.
 396         #  Since we walk the tree from the top down, this
 397         #  should effectively resolve in favor of a "shift".
 398         #
 399         return list[0]
 400
 401 #
 402 #  GenericASTBuilder automagically constructs a concrete/abstract syntax tree
 403 #  for a given input.  The extra argument is a class (not an instance!)
 404 #  which supports the "__setslice__" and "__len__" methods.
 405 #
 406 #  XXX - silently overrides any user code in methods.
 407 #
 408
 409 class GenericASTBuilder(GenericParser):
 410     def __init__(self, AST, start):
 411         GenericParser.__init__(self, start)
 412         self.AST = AST
 413
 414     def preprocess(self, rule, func):
 415         rebind = lambda lhs, self=self: \
 416                 lambda args, lhs=lhs, self=self: \
 417                     self.buildASTNode(args, lhs)
 418         lhs, rhs = rule
 419         return rule, rebind(lhs)
 420
 421     def buildASTNode(self, args, lhs):
 422         children = []
 423         for arg in args:
 424             if isinstance(arg, self.AST):
 425                 children.append(arg)
 426             else:
 427                 children.append(self.terminal(arg))
 428         return self.nonterminal(lhs, children)
 429
 430     def terminal(self, token):  return token
 431
 432     def nonterminal(self, type, args):
 433         rv = self.AST(type)
 434         rv[:len(args)] = args
 435         return rv
 436
 437 #
 438 #  GenericASTTraversal is a Visitor pattern according to Design Patterns.  For
 439 #  each node it attempts to invoke the method n_<node type>, falling
 440 #  back onto the default() method if the n_* can't be found.  The preorder
 441 #  traversal also looks for an exit hook named n_<node type>_exit (no default
 442 #  routine is called if it's not found).  To prematurely halt traversal
 443 #  of a subtree, call the prune() method -- this only makes sense for a
 444 #  preorder traversal.  Node type is determined via the typestring() method.
 445 #
 446
 447 class GenericASTTraversalPruningException:
 448     pass
 449
 450 class GenericASTTraversal:
 451     def __init__(self, ast):
 452         self.ast = ast
 453
 454     def typestring(self, node):
 455         return node.type
 456
 457     def prune(self):
 458         raise GenericASTTraversalPruningException
 459
 460     def preorder(self, node=None):
 461         if node is None:
 462             node = self.ast
 463
 464         try:
 465             name = 'n_' + self.typestring(node)
 466             if hasattr(self, name):
 467                 func = getattr(self, name)
 468                 func(node)
 469             else:
 470                 self.default(node)
 471         except GenericASTTraversalPruningException:
 472             return
 473
 474         for kid in node:
 475             self.preorder(kid)
 476
 477         name = name + '_exit'
 478         if hasattr(self, name):
 479             func = getattr(self, name)
 480             func(node)
 481
 482     def postorder(self, node=None):
 483         if node is None:
 484             node = self.ast
 485
 486         for kid in node:
 487             self.postorder(kid)
 488
 489         name = 'n_' + self.typestring(node)
 490         if hasattr(self, name):
 491             func = getattr(self, name)
 492             func(node)
 493         else:
 494             self.default(node)
 495
 496
 497     def default(self, node):
 498         pass
 499
 500 #
 501 #  GenericASTMatcher.  AST nodes must have "__getitem__" and "__cmp__"
 502 #  implemented.
 503 #
 504 #  XXX - makes assumptions about how GenericParser walks the parse tree.
 505 #
 506
 507 class GenericASTMatcher(GenericParser):
 508     def __init__(self, start, ast):
 509         GenericParser.__init__(self, start)
 510         self.ast = ast
 511
 512     def preprocess(self, rule, func):
 513         rebind = lambda func, self=self: \
 514                 lambda args, func=func, self=self: \
 515                     self.foundMatch(args, func)
 516         lhs, rhs = rule
 517         rhslist = list(rhs)
 518         rhslist.reverse()
 519
 520         return (lhs, tuple(rhslist)), rebind(func)
 521
 522     def foundMatch(self, args, func):
 523         func(args[-1])
 524         return args[-1]
 525
 526     def match_r(self, node):
 527         self.input.insert(0, node)
 528         children = 0
 529
 530         for child in node:
 531             if children == 0:
 532                 self.input.insert(0, '(')
 533             children = children + 1
 534             self.match_r(child)
 535
 536         if children > 0:
 537             self.input.insert(0, ')')
 538
 539     def match(self, ast=None):
 540         if ast is None:
 541             ast = self.ast
 542         self.input = []
 543
 544         self.match_r(ast)
 545         self.parse(self.input)
 546
 547     def resolve(self, list):
 548         #
 549         #  Resolve ambiguity in favor of the longest RHS.
 550         #
 551         return list[-1]
 552
 553 def _dump(tokens, states):
 554     for i in range(len(states)):
 555         print 'state', i
 556         for (lhs, rhs), pos, parent in states[i]:
 557             print '\t', lhs, '::=',
 558             print ' '.join(rhs[:pos]),
 559             print '.',
 560             print ' '.join(rhs[pos:]),
 561             print ',', parent
 562         if i < len(tokens):
 563             print
 564             print 'token', str(tokens[i])
 565             print