3219 lines
110 KiB
Python
3219 lines
110 KiB
Python
# Lrparsing.py is a LR(1) parser hiding behind a pythonic interface. It takes
|
|
# as input a grammar and a string to be parsed, and outputs the parse tree.
|
|
#
|
|
# Copyright (c) 2013,2014,2015,2016,2017,2018 Russell Stuart.
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU Affero General Public License as published
|
|
# by the Free Software Foundation, either version 3 of the License, or (at
|
|
# your option) any later version.
|
|
#
|
|
# The copyright holders grant you an additional permission under Section 7
|
|
# of the GNU Affero General Public License, version 3, exempting you from
|
|
# the requirement in Section 6 of the GNU General Public License, version 3,
|
|
# to accompany Corresponding Source with Installation Information for the
|
|
# Program or any work based on the Program. You are still required to
|
|
# comply with all other Section 6 requirements to provide Corresponding
|
|
# Source.
|
|
#
|
|
# This program is distributed in the hope that it will be useful, but
|
|
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
# Affero General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU Affero General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
#
|
|
import ast
|
|
import collections
|
|
import hashlib
|
|
import itertools
|
|
import re
|
|
import string
|
|
import sys
|
|
|
|
|
|
#
|
|
# Python2/3 compatibility hacks.
|
|
#
|
|
# Written this odd way to preserve 100% test coverage.
|
|
#
|
|
StandardError = Exception if sys.version_info >= (3,) else StandardError
|
|
python3_metaclass = (lambda cls: cls) if sys.version_info < (3,) else (
|
|
lambda cls: cls.__metaclass__(
|
|
cls.__name__, cls.__bases__, dict(cls.__dict__)))
|
|
string_types = basestring if sys.version_info < (3,) else str
|
|
string_maketrans = (
|
|
string.maketrans if sys.version_info < (3,) else str.maketrans)
|
|
to_str = lambda s: s if isinstance(s, string_types) else str(s)
|
|
|
|
|
|
#
|
|
# Common base class for all exceptions here.
|
|
#
|
|
class LrParsingError(StandardError):
|
|
pass
|
|
|
|
|
|
#
|
|
# Raised if the Grammar isn't valid.
|
|
#
|
|
class GrammarError(LrParsingError):
|
|
pass
|
|
|
|
|
|
#
|
|
# Base class for errors raised at parsing time.
|
|
#
|
|
class ParsingError(LrParsingError):
|
|
pass
|
|
|
|
|
|
#
|
|
# Raised if an invalid syntax is given.
|
|
#
|
|
class TokenError(ParsingError):
|
|
|
|
def __init__(self, message, data, offset, line, column):
|
|
self.data = data
|
|
self.offset = offset
|
|
self.line = line
|
|
self.column = column
|
|
super(TokenError, self).__init__(message, data, offset, line, column)
|
|
|
|
|
|
#
|
|
# Raised if an invalid syntax is given.
|
|
#
|
|
class ParseError(ParsingError):
|
|
input_token = None
|
|
stack = None
|
|
|
|
def __init__(self, input_token, stack):
|
|
def comma_or(lst):
|
|
strs = sorted("%s" % l for l in lst)
|
|
if len(strs) == 1:
|
|
return strs[0]
|
|
return ', '.join(strs[:-1]) + ' or ' + strs[-1]
|
|
self.input_token = input_token
|
|
self.stack = stack
|
|
lr1_state = stack[-1][0]
|
|
if len(lr1_state.actions) >= 10:
|
|
msg = "Got unexpected %s" % (input_token[0],)
|
|
else:
|
|
msg = "Got %s when expecting %s" % (
|
|
input_token[0], comma_or(lr1_state.actions))
|
|
if len(lr1_state.rules) < 10:
|
|
msg += " while trying to match %s" % (comma_or(lr1_state.rules))
|
|
msg += " in state %d" % (lr1_state.id,)
|
|
position = input_token[0].position(input_token)
|
|
if position:
|
|
msg = position + ": " + msg
|
|
super(ParseError, self).__init__(msg)
|
|
|
|
|
|
#
|
|
# Print a set of symbols.
|
|
#
|
|
def str_symbol_set(symbol_set):
|
|
return '[%s]' % (','.join(sorted(str(symbol) for symbol in symbol_set)),)
|
|
|
|
|
|
#
|
|
# An LR(0) Item. An item is just a production and the position the parser
|
|
# is up to in parsing it - the dot_pos. Eg:
|
|
#
|
|
# sym0 sym1 . sym2
|
|
#
|
|
# means the parser is processing a production wants to see the symbols
|
|
# "sym0 sym1 sym2" in order, and it has seen sym1 and sym2.
|
|
#
|
|
class Lr0Item(object):
|
|
__slots__ = ('dot_pos', '_key', 'lr0_item', 'production')
|
|
|
|
def __new__(cls, production, dot_pos, cache):
|
|
lr0_item = (production, dot_pos)
|
|
result = cache.get(lr0_item, None)
|
|
if result is None:
|
|
result = super(Lr0Item, cls).__new__(cls)
|
|
result.lr0_item = lr0_item
|
|
result.dot_pos = dot_pos
|
|
result.production = production
|
|
result._key = (
|
|
(str(result.production.lhs),) +
|
|
tuple(str(sym) for sym in result.production.rhs) +
|
|
(result.dot_pos,)
|
|
)
|
|
cache[result.lr0_item] = result
|
|
return result
|
|
|
|
def __repr__(self):
|
|
rhs = self.production.rhs
|
|
ll = lambda s, e: [str(symbol) for symbol in rhs[s:e]]
|
|
prod = ll(0, self.dot_pos) + ['^'] + ll(self.dot_pos, len(rhs))
|
|
return "%s = %s" % (self.production.lhs, ' '.join(prod))
|
|
|
|
def key(cls, lr0_item):
|
|
return lr0_item._key
|
|
key = classmethod(key)
|
|
|
|
|
|
#
|
|
# An item in an LR(1) grammar. It is just a LR(0) item, together with the
|
|
# set a of tokens that could follow the production called the lookahead.
|
|
# Eg, given the Item():
|
|
#
|
|
# L ::= sym0 sym1 sym2 . [tokA, tokB]
|
|
#
|
|
# The parser has seen all the symbols in this production, so if the next token
|
|
# to be processed is tokA or tokB, the production can be reduced (ie replaced)
|
|
# with it's left hand side, ie L
|
|
#
|
|
class Lr1Item(object):
|
|
__slots__ = ('dot_pos', 'lookahead', 'lr0_item', 'production')
|
|
|
|
def __init__(self, lr0_item, lookahead):
|
|
self.lr0_item = lr0_item
|
|
self.production = self.lr0_item.production
|
|
self.dot_pos = self.lr0_item.dot_pos
|
|
self.lookahead = lookahead
|
|
|
|
def lr1_shift(self, cache):
|
|
lr0_item = Lr0Item(self.production, self.dot_pos + 1, cache)
|
|
return Lr1Item(lr0_item, set(self.lookahead))
|
|
|
|
def __repr__(self):
|
|
return "%r %s" % (self.lr0_item, str_symbol_set(self.lookahead),)
|
|
|
|
def sorted(cls, iterable):
|
|
return sorted(iterable, key=lambda item: Lr0Item.key(item.lr0_item))
|
|
sorted = classmethod(sorted)
|
|
|
|
|
|
#
|
|
# An Lr0Kernel. A "kernel" is the state of the parser. It is just a set of
|
|
# items (Lr0Items in the case of an Lr0Kernels). The initial kernel is just
|
|
# the grammar's start production, eg:
|
|
#
|
|
# G ::= . E <eoi>
|
|
#
|
|
# Successive kernels are generated by looking every production that can
|
|
# be reached from a previous kernel if a particular given token is seen.
|
|
# For example, if the rest of the grammar is:
|
|
#
|
|
# E ::= E / E
|
|
# E ::= n
|
|
#
|
|
# Then after seeing an n, the Lr0Kernel would be:
|
|
#
|
|
# E ::= n .
|
|
#
|
|
class Lr0Kernel(object):
|
|
__slots__ = ("lr0_items",)
|
|
|
|
def __new__(cls, lr1_items, cache):
|
|
lr0_items = frozenset(item.lr0_item for item in lr1_items)
|
|
result = cache.get(lr0_items, None)
|
|
if result is None:
|
|
result = super(Lr0Kernel, cls).__new__(cls)
|
|
result.lr0_items = lr0_items
|
|
cache[result.lr0_items] = result
|
|
return result
|
|
|
|
|
|
#
|
|
# An Lr1State is the compiled version of an ItemSet. Ie, all the
|
|
# information the Parser doesn't need has been discarded.
|
|
#
|
|
class Lr1State(int):
|
|
if sys.version_info < (3,):
|
|
__slots__ = ('actions', 'gotos', 'id', 'rules')
|
|
|
|
def __new__(cls, id, actions, gotos, rules):
|
|
result = super(Lr1State, cls).__new__(cls, id)
|
|
result.actions = actions
|
|
result.gotos = gotos
|
|
result.id = id
|
|
result.rules = rules
|
|
return result
|
|
|
|
def __repr__(self):
|
|
def p(act):
|
|
if len(act) == 1:
|
|
return "shift %d" % act
|
|
if len(act) == 2 or act[2] is None:
|
|
return "reduce %d %d" % act[:2]
|
|
return "reduce %d %d %s" % act[:3]
|
|
result = [str(self)]
|
|
if self.actions:
|
|
result.append(" -- actions")
|
|
for token in sorted(self.actions, key=lambda sym: str(sym)):
|
|
result.append(" %s: %s" % (token, p(self.actions[token],)))
|
|
if self.gotos:
|
|
result.append(" -- gotos")
|
|
for nonterm_number in sorted(self.gotos, key=lambda sym: str(sym)):
|
|
result.append(
|
|
" %s: %s" %
|
|
(nonterm_number, self.gotos[nonterm_number],))
|
|
return '\n'.join(result)
|
|
|
|
def __str__(self):
|
|
return "Lr1State:%d" % (int(self),)
|
|
|
|
def to_flat(self, grammar):
|
|
actions = {}
|
|
for token, action in self.actions.items():
|
|
if len(action) == 1:
|
|
new_action = action[0]
|
|
elif action[2] is None:
|
|
new_action = tuple(action[:2])
|
|
else:
|
|
new_action = (action[0], action[1], action[2].name)
|
|
actions[token.name] = new_action
|
|
rules = sorted(rule.name for rule in self.rules)
|
|
return (actions, self.gotos, rules)
|
|
PYTHON_VAR_RE = re.compile(
|
|
"(?i)^[a-z_][a-z_0-9]*(?:[.][a-z_][a-z_0-9]*)*$")
|
|
|
|
def from_flat(cls, index, flat, rules, token_registry):
|
|
actions = {}
|
|
for token_name, action in flat[0].items():
|
|
if not isinstance(action, tuple):
|
|
new_action = (action,)
|
|
elif len(action) == 2:
|
|
new_action = (action[0], action[1], None, None)
|
|
else:
|
|
new_action = (action[0], action[1], rules[action[2]], None)
|
|
token = token_registry[token_name]
|
|
actions[token] = new_action
|
|
target_rules = (rules[rule_name] for rule_name in flat[2])
|
|
return cls(index, actions, flat[1], set(target_rules))
|
|
from_flat = classmethod(from_flat)
|
|
|
|
def sorted(cls, table):
|
|
return sorted(table)
|
|
sorted = classmethod(sorted)
|
|
|
|
|
|
#
|
|
# An ItemSet is a state in the LR(1) grammar. An ItemSet is identical in
|
|
# concept to an LR(0) kernel, but it consists of Lr1Item's rather than
|
|
# Lr0Item's. In other words, the items contain the tokens they expect to
|
|
# follow the production. These tokens are shown in []'s.
|
|
#
|
|
# Following on from the example above:
|
|
#
|
|
# G ::= . E <eoi>, [__empty__]
|
|
#
|
|
# And if an n is accepted:
|
|
#
|
|
# E ::= n ., [<eoi>, /]
|
|
#
|
|
class ItemSet(object):
|
|
ID = 0
|
|
__slots__ = (
|
|
'actions', '_closure', '_goto_cache', 'gotos', 'id',
|
|
'_kernel', '_lhs_prio', 'lr0_kernel', 'prio',)
|
|
|
|
def __init__(self, items, cache):
|
|
items = tuple(items)
|
|
self._kernel = dict((item[0].lr0_item, item[0]) for item in items)
|
|
self.prio = dict((item[0].lr0_item, item[1]) for item in items)
|
|
self.lr0_kernel = Lr0Kernel(iter(self), cache)
|
|
self.actions = None
|
|
self.gotos = {}
|
|
self._closure = None
|
|
self._goto_cache = None
|
|
self._lhs_prio = None
|
|
self.id = self.ID
|
|
self.__class__.ID += 1
|
|
|
|
def __str__(self):
|
|
return "ItemSet:%d" % (self.id,)
|
|
|
|
def __repr__(self):
|
|
result = [str(self)]
|
|
for kernel_item in Lr1Item.sorted(iter(self)):
|
|
line = " %r %s" % (kernel_item, self.repr_prio(kernel_item))
|
|
result.append(line)
|
|
if self._closure:
|
|
result.append(" -- closure")
|
|
for closure_item in Lr1Item.sorted(self._closure.values()):
|
|
line = " %r %s" % (
|
|
closure_item, self.repr_prio(closure_item))
|
|
result.append(line)
|
|
if self.actions:
|
|
result.append(" -- actions")
|
|
for token in sorted(self.actions, key=lambda action: str(action)):
|
|
actions = self.actions[token]
|
|
if isinstance(actions, Action):
|
|
lst = repr(actions)
|
|
else:
|
|
lst = ', '.join(sorted(repr(act) for act in actions))
|
|
result.append(" %s: %s" % (token, lst,))
|
|
if self.gotos:
|
|
result.append(" -- gotos")
|
|
for symbol in sorted(self.gotos, key=lambda symbol: str(symbol)):
|
|
result.append(" %s: %s" % (symbol, self.gotos[symbol],))
|
|
return '\n'.join(result)
|
|
|
|
def repr_prio(self, item):
|
|
result = []
|
|
for prio, lookahead in sorted(self.prio[item.lr0_item].items()):
|
|
result.append("%r:%s" % (prio, str_symbol_set(lookahead),))
|
|
return '{%s}' % ', '.join(result)
|
|
|
|
#
|
|
# Return all lhs tokens we generate.
|
|
#
|
|
def rules(self):
|
|
return set(item.production.lhs.get_rule() for item in self)
|
|
rules = property(rules)
|
|
|
|
#
|
|
# Compute the closure for the passed items. Ie, given the item
|
|
# {A ::= a . B c}, add {B ::= . C d} and repeat for B. In other words,
|
|
# the kernel plus its closure contains every production we can be
|
|
# expanding, and the position in them.
|
|
#
|
|
# X ::= a ^ b c, [l]
|
|
#
|
|
# b ::= ^ f g, [follow(c)]
|
|
#
|
|
def _close_kernel_items(self, items, cache):
|
|
modified = False
|
|
queue = collections.deque(items)
|
|
empty_token = cache['__empty__']
|
|
while queue:
|
|
#
|
|
# Find the next symbol that will be consumed by item.
|
|
#
|
|
item = queue.popleft()
|
|
rhs, dot_pos = item.production.rhs, item.dot_pos
|
|
if dot_pos >= len(rhs):
|
|
continue
|
|
symbol = rhs[dot_pos]
|
|
if isinstance(symbol, TokenSymbol):
|
|
continue
|
|
#
|
|
# Find all tokens we could possibly see after consuming that
|
|
# symbol.
|
|
#
|
|
first_set = self.symbol_seq_first_set(rhs[dot_pos + 1:], cache)
|
|
if empty_token not in first_set:
|
|
lookahead = set(first_set)
|
|
no_empty = first_set
|
|
else:
|
|
no_empty = first_set - empty_token.first_set
|
|
lookahead = no_empty | item.lookahead
|
|
#
|
|
# If the next symbol is a nonterm, then add all of its productions
|
|
# to the closure.
|
|
#
|
|
for production in symbol.productions:
|
|
lr0_item = Lr0Item(production, 0, cache)
|
|
existing = self._closure.get(lr0_item, None)
|
|
if existing is not None:
|
|
extra = lookahead - existing.lookahead
|
|
existing.lookahead |= extra
|
|
closure = existing
|
|
else:
|
|
extra = lookahead
|
|
closure = Lr1Item(lr0_item, lookahead)
|
|
self._closure[lr0_item] = closure
|
|
self.prio[closure.lr0_item] = {}
|
|
if not extra:
|
|
continue
|
|
queue.append(closure)
|
|
modified = True
|
|
#
|
|
# Push the priority through the closure. In other words, if
|
|
# the rhs of this production assigned this lhs a priority,
|
|
# then add this priority to the list we inherited from the
|
|
# ItemSet's that goto us. Eg, given:
|
|
#
|
|
# START = Prio(b,c) __end_of_input__, b.prio==0 and c.prio==1
|
|
# b = Prio(d+'X', e+'Y'), so d.prio==0 and e.prio==1
|
|
# c = Prio(e+'X', d+'Y'), so d.prio==1 and e.prio==0
|
|
# d = 'T'
|
|
# e = 'T'
|
|
#
|
|
# The prio of START is always (), so b's prio will be (0,)
|
|
# ie "() + (0,)", and c's prio will be (1,) and after that
|
|
# we expect both to be followed by __end_of_input__. From b's
|
|
# production d's prio will be b's plus 0, ie
|
|
# "(0,) + (0,) = (0,0,)", but only when followed by a 'X' (ie
|
|
# the lookahead is 'X'). From c's production d's prio will be
|
|
# (1,1), but only when followed by a 'Y'.
|
|
#
|
|
# Thus for 'd' we end up with these priorities:
|
|
#
|
|
# { (0,0,): set('X'), (0,1,): set('Y') }
|
|
#
|
|
# and for 'e' we end up with:
|
|
#
|
|
# { (1,0): set('X'), (1,1): set('Y') }
|
|
#
|
|
# If the resulting parser is given the input string:
|
|
#
|
|
# 'T' 'Y' __end_of_input__.
|
|
#
|
|
# First it has to decide whether 'T' is a 'd' or an 'e' as
|
|
# both will resolve to 'T'. A normal LR(1) would be stuck
|
|
# with a reduce/reduce conflict, but since the d's priority
|
|
# for 'Y' (0,1) < e's priority (1,1), we chose 'd'. We don't
|
|
# do that choosing here, but we create the priorities so it
|
|
# can happen should a conflict arise.
|
|
#
|
|
rank = closure.production.lhs.rank
|
|
priority = closure.production.lhs.priority
|
|
if priority is not None:
|
|
rank -= 1
|
|
append = (0,) * max(0, rank - item.production.lhs.rank)
|
|
if priority is not None:
|
|
append += (priority,)
|
|
prio_items = self.prio[item.lr0_item].items()
|
|
for itemset_prio, item_prio in prio_items:
|
|
existing_sets = self.prio[closure.lr0_item]
|
|
if empty_token not in first_set:
|
|
add = no_empty
|
|
else:
|
|
add = no_empty | extra & item_prio
|
|
existing_prio = itemset_prio[:rank] + append
|
|
if existing_prio not in existing_sets:
|
|
existing_sets[existing_prio] = add
|
|
else:
|
|
existing_sets[existing_prio] |= add
|
|
return modified
|
|
|
|
#
|
|
# Compute the first_set for a sequence of Symbols. If the sequence is:
|
|
#
|
|
# sym0 sym1 sym2
|
|
#
|
|
# The first_set is the same as the first_set of sym0, and if the sym0
|
|
# can be empty then the first_set of sym1 and so on.
|
|
#
|
|
def symbol_seq_first_set(cls, symbol_sequence, cache):
|
|
syms = tuple(symbol_sequence)
|
|
result = cache.get(syms, None)
|
|
if result is None:
|
|
empty_token = cache['__empty__']
|
|
first_set = set(empty_token.first_set)
|
|
for symbol in syms:
|
|
first_set |= symbol.first_set
|
|
if empty_token not in symbol.first_set:
|
|
first_set -= empty_token.first_set
|
|
break
|
|
result = frozenset(first_set)
|
|
cache[syms] = result
|
|
return result
|
|
symbol_seq_first_set = classmethod(symbol_seq_first_set)
|
|
|
|
#
|
|
# Iterating over us returns the Lr1Item's in our kernel.
|
|
#
|
|
def __iter__(self):
|
|
return iter(self._kernel.values())
|
|
|
|
#
|
|
# Return a generator for the kernel + closure.
|
|
#
|
|
def all_items(self):
|
|
return itertools.chain(self, self._closure.values())
|
|
|
|
#
|
|
# Compute the closure for ourselves.
|
|
#
|
|
def compute_closure(self, cache):
|
|
if self._closure is None:
|
|
empty_token = cache['__empty__']
|
|
for lr0_item, old_prio_dict in self.prio.items():
|
|
prio_dict = {}
|
|
last_lookahead = empty_token.first_set
|
|
for prio, lookahead in sorted(old_prio_dict.items()):
|
|
if lookahead != last_lookahead:
|
|
prio_dict[prio] = lookahead
|
|
last_lookahead = lookahead
|
|
self.prio[lr0_item] = prio_dict
|
|
self._closure = {}
|
|
self._close_kernel_items(iter(self), cache)
|
|
|
|
#
|
|
# Calculate the kernel of the goto set, given a particular symbol.
|
|
#
|
|
def goto_sets(self, cache):
|
|
if self._goto_cache is None:
|
|
dot_symbols = collections.defaultdict(set)
|
|
for item in self.all_items():
|
|
dot_pos, rhs = item.dot_pos, item.production.rhs
|
|
if dot_pos < len(rhs):
|
|
dot_symbols[rhs[dot_pos]].add(item)
|
|
self._goto_cache = {}
|
|
for symbol in sorted(dot_symbols, key=lambda symbol: symbol.id):
|
|
gen = (
|
|
(item.lr1_shift(cache), self.prio[item.lr0_item])
|
|
for item in dot_symbols[symbol])
|
|
item_set = ItemSet(gen, cache)
|
|
self._goto_cache[symbol] = item_set
|
|
return self._goto_cache
|
|
|
|
#
|
|
# Check for reduce/reduce compatibility between ItemSet's. The two
|
|
# ItemSet's passed must have the same same lr0 kernel. ItemSet's with
|
|
# different lr0 kernels are never compatible.
|
|
#
|
|
# When to reduce and what to is determined by the lookahead:
|
|
#
|
|
# S ::= W; S ::= X; S ::= Y; S ::= Z
|
|
# W ::= a P i
|
|
# X ::= a Q j
|
|
# Y ::= b P j
|
|
# Z ::= b Q i
|
|
# P ::= c
|
|
# Q ::= c
|
|
#
|
|
# Produces these ItemSets, among others:
|
|
#
|
|
# WX: P -> c ., [i] (i-->reduce(P)); Q -> c ., [j] (j-->reduce(Q))
|
|
# YZ: P -> c ., [j] (j-->reduce(Q)); Q -> c ., [i] (i-->reduce(P))
|
|
#
|
|
# In this case merging WX and YZ would produce reduce conflicts because if
|
|
# we see the token i, we can't both reduce(P) and reduce(Q).
|
|
#
|
|
def compatible(self, other):
|
|
#
|
|
# Compatible() isn't always called, so lazily evaluate
|
|
# ItemSet._lhs_prio.
|
|
#
|
|
def lhs_prio(item_set):
|
|
if item_set._lhs_prio is None:
|
|
all_lookaheads = None
|
|
lhs_prio = {}
|
|
for lr0_item in item_set._kernel:
|
|
for prio, lookahead in item_set.prio[lr0_item].items():
|
|
key = (lr0_item.production.lhs, prio)
|
|
if key not in lhs_prio:
|
|
lhs_prio[key] = lookahead
|
|
else:
|
|
lhs_prio[key] |= lookahead
|
|
if all_lookaheads is None:
|
|
all_lookaheads = lookahead
|
|
else:
|
|
all_lookaheads |= lookahead
|
|
item_set._lhs_prio = (lhs_prio, all_lookaheads)
|
|
return item_set._lhs_prio
|
|
if self is other:
|
|
return True
|
|
#
|
|
# For an LR(1) scheme we are saying for a given lhs in self its
|
|
# lookahead set can only share lookahead tokens with the same lhs in
|
|
# other.
|
|
#
|
|
# For us it gets a little more complex, as we are carrying
|
|
# ItemSet.prio's and they have to be compatible as well. However,
|
|
# turns out this reduces to insisting that (lhs, prio) combinations
|
|
# can't share lookaheads suffices.
|
|
#
|
|
self_lhs_prio, self_all_lookaheads = lhs_prio(self)
|
|
other_lhs_prio, other_all_lookaheads = lhs_prio(other)
|
|
common = self_all_lookaheads & other_all_lookaheads
|
|
for key, self_lookaheads in self_lhs_prio.items():
|
|
other_lookaheads = other_lhs_prio.get(key, None)
|
|
if other_lookaheads is not None:
|
|
if (self_lookaheads & common) != (other_lookaheads & common):
|
|
return False
|
|
return True
|
|
|
|
#
|
|
# Merge two ItemSet's. They must be ItemSet.compatible(). Return True
|
|
# if the merge altered 'self', so it needs a new closure computed.
|
|
#
|
|
def merge(self, other, cache):
|
|
closure_items = []
|
|
for item in self:
|
|
other_item = other._kernel[item.lr0_item]
|
|
modified = False
|
|
item_prio = self.prio[item.lr0_item]
|
|
expanded = False
|
|
for prio, other_lookahead in other.prio[item.lr0_item].items():
|
|
self_lookahead = self.prio[item.lr0_item].get(prio, None)
|
|
if self_lookahead is None:
|
|
expanded = expanded or item_prio.copy()
|
|
expanded[prio] = other_lookahead
|
|
elif not self_lookahead >= other_lookahead:
|
|
expanded = expanded or item_prio.copy()
|
|
expanded[prio] |= other_lookahead
|
|
if expanded:
|
|
self.prio[item.lr0_item] = expanded
|
|
modified = True
|
|
if modified:
|
|
item.lookahead |= other_item.lookahead
|
|
closure_items.append(item)
|
|
if not closure_items:
|
|
return False
|
|
self._goto_cache = None
|
|
self._lhs_prio = None
|
|
self._close_kernel_items(closure_items, cache)
|
|
return True
|
|
|
|
def sorted(cls, table):
|
|
return sorted(table, key=lambda item_set: item_set.id)
|
|
sorted = classmethod(sorted)
|
|
|
|
|
|
#
|
|
# LR parser actions. These have to be tuples because that is what the
|
|
# lr1_parser expects in the optimised case. These are for the non-optimised
|
|
# case, so we carry more information for debugging and priority resolution.
|
|
# Nonetheless these must be backwards compatible with what
|
|
# Grammar.optimise_parsing_table() produces.
|
|
#
|
|
class Action(tuple):
|
|
__slots__ = ()
|
|
|
|
#
|
|
# Actions for an owning item are placed into a set. Identical actions
|
|
# can't be in that set, so __hash__ and __eq__ must be implemented
|
|
# accordingly.
|
|
#
|
|
def __hash__(self):
|
|
raise NotImplementedError()
|
|
|
|
def __eq__(self, other):
|
|
raise NotImplementedError()
|
|
|
|
#
|
|
# Sort order. Only important for tests where we need a repeatable test
|
|
# outcomes.
|
|
#
|
|
def __lt__(self, other):
|
|
return self.key() < other.key()
|
|
|
|
#
|
|
# This function returns three things:
|
|
#
|
|
# (lhs, low, high)
|
|
#
|
|
# where:
|
|
#
|
|
# lhs Is the symbol we will reduce to, or None if there could be
|
|
# several. This is used to compare associativity.
|
|
#
|
|
# low The lowest priority for the passed token.
|
|
#
|
|
# high The highest priority for the passed token.
|
|
def precedence(self, token, item_set):
|
|
raise NotImplementedError()
|
|
|
|
|
|
#
|
|
# A Shift action - consume a token, ie move it onto the stack.
|
|
#
|
|
class ShiftAction(Action):
|
|
__slots__ = ()
|
|
|
|
def __new__(cls, *next_state):
|
|
result = Action.__new__(cls, next_state)
|
|
return result
|
|
|
|
def __repr__(self):
|
|
return "shift %s" % self[0]
|
|
|
|
def __hash__(self):
|
|
return hash(self[0])
|
|
|
|
def __eq__(self, other):
|
|
return isinstance(other, ShiftAction) and self[0] == other[0]
|
|
|
|
#
|
|
# Key used for sorting. Used only to get repeatable tests.
|
|
#
|
|
def key(self):
|
|
return "shift", self[0].id
|
|
|
|
#
|
|
# In a shift:
|
|
#
|
|
# lhs Is the lhs of all items if they are the same, otherwise None.
|
|
#
|
|
# low The lowest prio for the token in the kernel.
|
|
#
|
|
# high The highest prio for the token in the kernel.
|
|
#
|
|
def precedence(self, token, item_set):
|
|
my_item_set = self[0]
|
|
lhs = next(iter(my_item_set)).production.lhs
|
|
low = (1e100,)
|
|
high = ()
|
|
for lr1_item in my_item_set:
|
|
if lhs != lr1_item.production.lhs:
|
|
lhs = None
|
|
items = my_item_set.prio[lr1_item.lr0_item].items()
|
|
for prio, lookahead in items:
|
|
if token in lookahead:
|
|
if low > prio:
|
|
low = prio
|
|
if high < prio:
|
|
high = prio
|
|
return lhs, low, high
|
|
|
|
|
|
#
|
|
# A reduce action - ie the top of the stack is a production we recognise.
|
|
# Replace it with it's lhs.
|
|
#
|
|
class ReduceAction(Action):
|
|
__slots__ = ()
|
|
|
|
def __new__(cls, lr1_item):
|
|
lhs = lr1_item.production.lhs
|
|
output = lhs if isinstance(lhs, Rule) and lhs.name[0] != '_' else None
|
|
me = (lhs, len(lr1_item.production.rhs), output, lr1_item)
|
|
result = Action.__new__(cls, me)
|
|
return result
|
|
|
|
def __repr__(self):
|
|
return "reduce %s = %s" % (
|
|
self[0], ' '.join(str(sym) for sym in self[3].production.rhs))
|
|
|
|
def __hash__(self):
|
|
return hash(self[:3])
|
|
|
|
def __eq__(self, other):
|
|
return isinstance(other, ReduceAction) and self[:3] == other[:3]
|
|
|
|
#
|
|
# Key used for sorting. Used only to get repeatable tests.
|
|
#
|
|
def key(self):
|
|
return "reduce", self[0].id, self[3].dot_pos
|
|
|
|
#
|
|
# In a reduce:
|
|
#
|
|
# lhs Is the lhs of the target production.
|
|
#
|
|
# low The lowest prio for the token for this Lr1Item.
|
|
#
|
|
# high The highest prio for the token for this Lr1Item.
|
|
#
|
|
def precedence(self, token, item_set):
|
|
lr1_item = self[3]
|
|
lhs = lr1_item.production.lhs
|
|
low = (1e100,)
|
|
high = ()
|
|
for prio, lookahead in item_set.prio[lr1_item.lr0_item].items():
|
|
if token in lookahead:
|
|
if low > prio:
|
|
low = prio
|
|
if high < prio:
|
|
high = prio
|
|
return lhs, low, high
|
|
|
|
|
|
#
|
|
# The thing that implements the grammar - the Parser.
|
|
#
|
|
class Parser(object):
|
|
VERSION = "0.1"
|
|
comments = None # object, TokenSymbol(), list of tokens.
|
|
empty_token = None # object, MetaToken("__empty__")
|
|
eoi_token = None # object, MetaToken("__end_of_input__")
|
|
epoch_symbol = None # object, Rule(), <G> = START __end_of_input__
|
|
parser_name = None # string, Name of the parser
|
|
parsing_table = None # tuple, (Lr1State(), ...)
|
|
rules = None # dict, {"name": Rule(), ...}
|
|
token_registry = None # object, TokenRegistry()
|
|
unused_symbols = None # set, SymbolSet(Rule(), ...)
|
|
whitespace = None # string, Characters defined to be whitespace
|
|
|
|
def __init__(self, parser_name, dct):
|
|
def new_meta(name):
|
|
meta = MetaToken(name)
|
|
return meta.resolve_symbol(name, self.rules, self.token_registry)
|
|
#
|
|
# Step 1 is to replace all productions with a Rule() equivalent and
|
|
# find the TokenRegistry.
|
|
#
|
|
self.parser_name = parser_name
|
|
rule_symbols, self.rules, token_registry = (
|
|
self.catalogue_symbols(dct))
|
|
if token_registry is None:
|
|
self.token_registry = TokenRegistry()
|
|
else:
|
|
token_registry.restore_dicts()
|
|
self.token_registry = token_registry()
|
|
for name in sorted(self.rules):
|
|
self.resolve_rule(self.rules[name], rule_symbols)
|
|
self.empty_token = new_meta("__empty__")
|
|
self.eoi_token = new_meta("__end_of_input__")
|
|
for name in sorted(self.rules):
|
|
self.resolve_symbol(self.rules[name])
|
|
if name in self.token_registry:
|
|
msg = "A token and symbol share the same name %r"
|
|
raise GrammarError(msg % name)
|
|
#
|
|
# Create the starting production for the grammar.
|
|
#
|
|
start_symbol = dct.get("START", None)
|
|
if start_symbol is None:
|
|
raise GrammarError("No START symbol defined")
|
|
if not isinstance(start_symbol, Rule):
|
|
raise GrammarError("START is not a Nonterm")
|
|
if len(start_symbol.dict) != 0:
|
|
raise GrammarError("START symbol may not have dictionary elements")
|
|
epoch_symbol = Sequence()
|
|
epoch_symbol.nested = [start_symbol, self.eoi_token]
|
|
self.epoch_symbol = Rule('<%s>' % self.parser_name, epoch_symbol)
|
|
epoch_symbol.parent = self.epoch_symbol
|
|
if self.epoch_symbol.name in self.rules:
|
|
msg = "Symbol name %r is reserved" % self.epoch_symbol.name
|
|
raise GrammarError(msg)
|
|
self.rules[self.epoch_symbol.name] = self.epoch_symbol
|
|
for rule in self.rules.values():
|
|
rule.resolved = True
|
|
#
|
|
# Get the special cased tokens.
|
|
#
|
|
self.whitespace = dct.get("WHITESPACE", None)
|
|
self.token_registry.compile_tokens(self.whitespace)
|
|
comments = dct.get("COMMENTS", None)
|
|
if comments is None:
|
|
self.comment_tokens = None
|
|
else:
|
|
error = not isinstance(comments, Rule)
|
|
if not error:
|
|
comment_symbol = comments.nested[0]
|
|
if isinstance(comment_symbol, TokenSymbol):
|
|
self.comment_tokens = comments.nested
|
|
elif isinstance(comment_symbol, Choice):
|
|
error = all(
|
|
not isinstance(sym, TokenSymbol)
|
|
for sym in comments.nested[0].nested)
|
|
if not error:
|
|
self.comment_tokens = comments.nested[0].nested
|
|
if error:
|
|
raise GrammarError("COMMENTS must be Token | Token ...")
|
|
del self.rules['COMMENTS']
|
|
|
|
#
|
|
# Compile the grammar into an LR(1) parse_table, or raise a GrammarError
|
|
# if there is a problem with the grammar.
|
|
#
|
|
def compile_grammar(self):
|
|
if self.parsing_table:
|
|
return
|
|
#
|
|
# Initialise the grammar by creating the start production. Then
|
|
# compile it.
|
|
#
|
|
used_symbols = self.epoch_symbol.compile_grammar(self.empty_token)
|
|
self.unused_symbols = frozenset(
|
|
symbol for symbol in self.rules.values()
|
|
if symbol not in used_symbols)
|
|
#
|
|
# Resolve first sets.
|
|
#
|
|
self.calc_first_sets(self.epoch_symbol)
|
|
#
|
|
# Create the parser.
|
|
#
|
|
start_state, lr0_item_sets = self.compute_lr1_items(self.epoch_symbol)
|
|
table = self.compute_parsing_table(lr0_item_sets)
|
|
self.parsing_table = (start_state, table)
|
|
self.normalise_item_set_id(self.parsing_table)
|
|
self.disambiguate(table)
|
|
|
|
#
|
|
# Return the optimised_grammar suitable for passing to compile_grammar().
|
|
#
|
|
def pre_compile_grammar(self, grammar_class, pre_compiled=None):
|
|
def from_flat(i, flat):
|
|
return Lr1State.from_flat(i, flat, self.rules, self.token_registry)
|
|
#
|
|
# If it has already been pre compiled just return.
|
|
#
|
|
if self.parsing_table is not None:
|
|
if isinstance(self.parsing_table[0], int):
|
|
return None
|
|
#
|
|
# If we don't have a table see if we can use the pre_compiled version.
|
|
#
|
|
if pre_compiled:
|
|
if isinstance(pre_compiled, string_types):
|
|
pre_compiled = ast.literal_eval(pre_compiled)
|
|
if pre_compiled[0] == self.grammar_hash():
|
|
optimised_start_state = pre_compiled[1]
|
|
optimised_parsing_table = tuple(
|
|
from_flat(i - 2, pre_compiled[i])
|
|
for i in range(2, len(pre_compiled)))
|
|
self.parsing_table = (
|
|
optimised_start_state, optimised_parsing_table)
|
|
return None
|
|
#
|
|
# Optimise it.
|
|
#
|
|
self.compile_grammar()
|
|
self.parsing_table = self.optimise_parsing_table(self.parsing_table)
|
|
flattened = tuple(
|
|
state.to_flat(grammar_class)
|
|
for state in self.parsing_table[1])
|
|
return repr((self.grammar_hash(), self.parsing_table[0]) + flattened)
|
|
|
|
#
|
|
# Make the start state item_set.id==0 and the remainder following
|
|
# sequentially.
|
|
#
|
|
def normalise_item_set_id(cls, parsing_table):
|
|
start_state, table = parsing_table
|
|
mapping = iter(zip(ItemSet.sorted(table), itertools.count(0)))
|
|
first = next(mapping)
|
|
for item_set, id in itertools.chain((first,), mapping):
|
|
item_set.id = id
|
|
start_state.id, first[0].id = first[0].id, start_state.id
|
|
normalise_item_set_id = classmethod(normalise_item_set_id)
|
|
|
|
#
|
|
# Debug dump of nonterms.
|
|
#
|
|
def repr_productions(self):
|
|
def r(nonterm):
|
|
all_nonterms.add(nonterm)
|
|
for symbol in nonterm.nested:
|
|
if isinstance(symbol, Nonterm) and symbol not in all_nonterms:
|
|
r(symbol)
|
|
all_nonterms = set()
|
|
for rule in self.rules.values():
|
|
r(rule)
|
|
all_nonterms = [n for n in sorted(all_nonterms, key=lambda t: str(t))]
|
|
result = []
|
|
i = 0
|
|
for nonterm in all_nonterms:
|
|
if not nonterm.productions:
|
|
continue
|
|
rank = ".%d" % nonterm.rank if nonterm.rank else ""
|
|
result.append(
|
|
"%-6s: %r" % ("%d%s" % (i, rank), nonterm.productions[0],))
|
|
for p in nonterm.productions[1:]:
|
|
result.append(" %r" % (p,))
|
|
i += 1
|
|
return '\n'.join(result)
|
|
|
|
#
|
|
# Debug dump of the parsing table.
|
|
#
|
|
def repr_parse_table(self, state=None):
|
|
if self.parsing_table is None:
|
|
return ''
|
|
a_state = next(iter(self.parsing_table[1]))
|
|
if state is None:
|
|
func = lambda item_set: True
|
|
elif state >= 0:
|
|
func = lambda item_set: item_set.id == state
|
|
else:
|
|
is_state = lambda s: (s if isinstance(s, int) else s.id) == -state
|
|
action_state = lambda act: is_state(tuple(act)[0])
|
|
func = lambda item_set: (
|
|
is_state(item_set) or
|
|
any(is_state(g) for g in item_set.gotos.values()) or
|
|
any(action_state(a) for a in item_set.actions.values()))
|
|
item_sets = (i for i in self.parsing_table[1] if func(i))
|
|
result = []
|
|
for item_set in a_state.sorted(item_sets):
|
|
result.append(repr(item_set))
|
|
result.append("")
|
|
if result:
|
|
del result[-1]
|
|
return '\n'.join(result)
|
|
|
|
#
|
|
# Dump the grammar.
|
|
#
|
|
def repr_grammar(self):
|
|
result = [repr(rule) for name, rule in sorted(self.rules.items())]
|
|
return '\n'.join(result)
|
|
|
|
#
|
|
# Parse a feed.
|
|
#
|
|
def parse(self, input, tree_factory=None, on_error=None, log=None):
|
|
if self.parsing_table is None:
|
|
self.compile_grammar()
|
|
token_feed = self.token_registry.tokeniser(input, self.whitespace)
|
|
return self.lr1_parser(token_feed, tree_factory, on_error, log)
|
|
|
|
#
|
|
# Build up a catalogue of all symbols.
|
|
#
|
|
def catalogue_symbols(cls, dct):
|
|
#
|
|
# Move one symbols dict to another.
|
|
#
|
|
def move_dict(to, frm):
|
|
for key in list(frm.dict):
|
|
to.dict[key] = frm.dict[key]
|
|
del frm.dict[key]
|
|
|
|
#
|
|
# Create a new Rule() for this rule.
|
|
#
|
|
def catalogue(name, field):
|
|
rule = Rule(name, field)
|
|
rule_symbols[field] = rule
|
|
symbols[name] = rule
|
|
dct[name] = rule
|
|
return rule
|
|
|
|
rule_symbols = {}
|
|
symbols = {}
|
|
token_registry = None
|
|
for name, field in sorted(dct.items()):
|
|
if isinstance(field, Symbol):
|
|
if isinstance(field, Ref):
|
|
raise GrammarError("Ref(%r) hasn't been defined" % name)
|
|
if field not in rule_symbols:
|
|
move_dict(catalogue(name, field), field)
|
|
elif name == "START":
|
|
catalogue(name, rule_symbols[field])
|
|
elif rule_symbols[field].name == "START":
|
|
rule = rule_symbols[field]
|
|
field_rule = catalogue(name, field)
|
|
move_dict(field_rule, rule)
|
|
catalogue(rule.name, field_rule)
|
|
else:
|
|
msg = (
|
|
"You have \"%s = %s\" or " +
|
|
"\"%s = TokenRegistry.tok; " +
|
|
"%s = TokenRegistry.tok\".\n" +
|
|
"Only START maybe assigned directly to " +
|
|
"another Symbol. A workaround is %s = %s * 1"
|
|
)
|
|
raise GrammarError(msg % (
|
|
name, rule_symbols[field],
|
|
name, rule_symbols[field],
|
|
name, rule_symbols[field],))
|
|
elif isinstance(field, type) and TokenRegistry in field.__bases__:
|
|
if token_registry is None:
|
|
token_registry = field
|
|
else:
|
|
msg = "Can't have more than one %s"
|
|
raise GrammarError(msg, TokenRegistry.__name__)
|
|
return rule_symbols, symbols, token_registry
|
|
catalogue_symbols = classmethod(catalogue_symbols)
|
|
|
|
#
|
|
# Replace all occurrences of a declared symbol with it's Rule().
|
|
#
|
|
def resolve_rule(self, rule, rule_symbols):
|
|
def r(nested):
|
|
for i, symbol in zip(itertools.count(), nested):
|
|
if not isinstance(symbol, Rule):
|
|
resolved = rule_symbols.get(symbol, None)
|
|
if resolved is not None:
|
|
nested[i] = resolved
|
|
else:
|
|
r(symbol.nested)
|
|
for i, symbol in zip(itertools.count(), rule.nested):
|
|
# resolved = rule_symbols[symbol]
|
|
# if resolved is not rule:
|
|
# rule.nested[i] = resolved
|
|
assert rule_symbols[symbol] is rule
|
|
if not isinstance(symbol, Rule):
|
|
r(symbol.nested)
|
|
|
|
#
|
|
# Scan the entire grammar, allowing nodes in the parse tree to replace
|
|
# themselves with other nodes. Eg, Ref's with the Symbol they are
|
|
# referencing.
|
|
#
|
|
def resolve_symbol(self, rule):
|
|
def r(parent, symbol):
|
|
resolved = symbol.resolve_symbol(name, self.rules, token_registry)
|
|
if isinstance(resolved, Rule):
|
|
return resolved
|
|
resolved.parent = parent
|
|
for i, sym in zip(itertools.count(), resolved.nested):
|
|
if not isinstance(sym, Rule):
|
|
resolved.nested[i] = r(resolved, sym)
|
|
return resolved
|
|
name = rule.name
|
|
token_registry = self.token_registry
|
|
for i, sym in zip(itertools.count(), rule.nested):
|
|
rule.nested[i] = r(rule, sym)
|
|
|
|
#
|
|
# Compute the first sets for all symbols.
|
|
#
|
|
def calc_first_sets(self, epoch_symbol):
|
|
#
|
|
# Collect all nonterminals used by the grammar.
|
|
#
|
|
def r(nonterm):
|
|
for prod in nonterm.productions:
|
|
for sym in prod.rhs:
|
|
if isinstance(sym, Nonterm) and sym not in nonterms:
|
|
nonterms.add(sym)
|
|
nonterm_list.append(sym)
|
|
r(sym)
|
|
#
|
|
# We use nonterm_list to make it deterministic.
|
|
#
|
|
nonterms = set()
|
|
nonterm_list = []
|
|
r(epoch_symbol)
|
|
changed = True
|
|
while changed:
|
|
changed = any(sym.merge_first_set(self) for sym in nonterm_list)
|
|
|
|
#
|
|
# Compute the collection of sets of LR(1) items.
|
|
#
|
|
# Wikipedia is a good reference:
|
|
# http://en.wikipedia.org/wiki/Canonical_LR_parser
|
|
#
|
|
def compute_lr1_items(self, epoch_symbol):
|
|
#
|
|
# Initialise the parse table by creating the start ItemSet. It
|
|
# contains one item:
|
|
#
|
|
# FINISH ::= ^ START $., <e>
|
|
#
|
|
cache = {'__empty__': self.empty_token}
|
|
start_production = epoch_symbol.productions[0]
|
|
start_item_lr0 = Lr0Item(start_production, 0, cache)
|
|
start_item_lookahead = set(self.empty_token.first_set)
|
|
start_item_lr1 = Lr1Item(start_item_lr0, start_item_lookahead)
|
|
start_item_set = ItemSet(((start_item_lr1, {(): set()}),), cache)
|
|
start_item_set.compute_closure(cache)
|
|
lr0_item_sets = collections.defaultdict(list)
|
|
lr0_item_sets[start_item_set.lr0_kernel].append(start_item_set)
|
|
#
|
|
# Now compute new ItemSet's from the ones we have created, until we
|
|
# all we create is duplicates.
|
|
#
|
|
worklist = collections.deque([start_item_set])
|
|
while worklist:
|
|
item_set = worklist.popleft()
|
|
goto_sets = item_set.goto_sets(cache).items()
|
|
for symbol, goto_set in sorted(goto_sets, key=lambda i: i[0].id):
|
|
lr1_merge = None
|
|
for lr1_item_set in lr0_item_sets[goto_set.lr0_kernel]:
|
|
if lr1_item_set.compatible(goto_set):
|
|
lr1_merge = lr1_item_set
|
|
break
|
|
if lr1_merge is None:
|
|
goto_set.compute_closure(cache)
|
|
worklist.append(goto_set)
|
|
lr0_item_sets[goto_set.lr0_kernel].append(goto_set)
|
|
elif lr1_merge.merge(goto_set, cache):
|
|
if lr1_merge in worklist:
|
|
worklist.remove(lr1_merge)
|
|
worklist.appendleft(lr1_merge)
|
|
return start_item_set, lr0_item_sets
|
|
|
|
#
|
|
# Compute LR(1) actions.
|
|
#
|
|
def compute_parsing_table(cls, lr0_item_sets):
|
|
#
|
|
# First assign a unique ID to each item_set. This ID will be it's
|
|
# index into actions[] and gotos[].
|
|
#
|
|
table = dict(
|
|
(item_set, item_set)
|
|
for item_set_list in lr0_item_sets.values()
|
|
for item_set in item_set_list)
|
|
for item_set in table:
|
|
#
|
|
# Compute actions.
|
|
#
|
|
actions = collections.defaultdict(set)
|
|
goto_sets = item_set.goto_sets(None)
|
|
for item in item_set.all_items():
|
|
dot_pos, rhs = item.dot_pos, item.production.rhs
|
|
if dot_pos == len(rhs):
|
|
for token in item.lookahead:
|
|
actions[token].add(ReduceAction(item))
|
|
elif isinstance(rhs[item.dot_pos], TokenSymbol):
|
|
token = rhs[item.dot_pos]
|
|
goto_set = goto_sets[token]
|
|
found = False
|
|
for lr1_item_set in lr0_item_sets[goto_set.lr0_kernel]:
|
|
if lr1_item_set.compatible(goto_set):
|
|
actions[token].add(ShiftAction(lr1_item_set))
|
|
found = True
|
|
break
|
|
assert found, repr(token)
|
|
#
|
|
# Turn the action lists into tuples.
|
|
#
|
|
item_set.actions = dict(actions)
|
|
for token in actions:
|
|
item_set.actions[token] = tuple(item_set.actions[token])
|
|
#
|
|
# Compute goto's.
|
|
#
|
|
gotos = item_set.gotos
|
|
for symbol in goto_sets:
|
|
if not isinstance(symbol, Nonterm):
|
|
continue
|
|
goto_set = goto_sets[symbol]
|
|
for lr1_item_set in lr0_item_sets[goto_set.lr0_kernel]:
|
|
if lr1_item_set.compatible(goto_set):
|
|
assert symbol not in gotos
|
|
gotos[symbol] = lr1_item_set
|
|
break
|
|
return table
|
|
compute_parsing_table = classmethod(compute_parsing_table)
|
|
|
|
#
|
|
# Look for action ambiguities and resolve them if possible.
|
|
#
|
|
def disambiguate(cls, item_sets):
|
|
action_list = [
|
|
(item_set, act)
|
|
for item_set in item_sets
|
|
for act in item_set.actions.items()]
|
|
for item_set, (token, actions) in action_list:
|
|
if len(actions) == 1:
|
|
item_set.actions[token] = actions[0]
|
|
continue
|
|
#
|
|
# Multiple actions are ambiguities. Compare every action with
|
|
# all others in the hope that Prio() and Assoc() can eliminate
|
|
# all bar one.
|
|
#
|
|
new_actions = sorted(actions) # Repeatability for testing
|
|
i = 0
|
|
while i < len(new_actions) - 1:
|
|
act_0 = new_actions[i]
|
|
j = i + 1
|
|
while j < len(new_actions):
|
|
act_1 = new_actions[j]
|
|
keep = cls.resolve_ambiguity(item_set, token, act_0, act_1)
|
|
if "1" not in keep:
|
|
del new_actions[j]
|
|
j -= 1
|
|
j += 1
|
|
if "0" not in keep:
|
|
del new_actions[i]
|
|
i -= 1
|
|
break
|
|
i += 1
|
|
#
|
|
# Since we don't support GLR(1) grammars yet (ie, we don't do
|
|
# Split()), resolve_ambiguity() must have not more than one result
|
|
# left. It's possible Nonassoc eliminates all of them.
|
|
#
|
|
if not new_actions:
|
|
del item_set.actions[token]
|
|
else:
|
|
assert len(new_actions) == 1
|
|
item_set.actions[token] = new_actions[0]
|
|
disambiguate = classmethod(disambiguate)
|
|
|
|
#
|
|
# Compute how to resolve an action conflict. Returns the actions to
|
|
# keep, or "err".
|
|
#
|
|
def resolve_ambiguity(cls, item_set, token, action_0, action_1):
|
|
#
|
|
# Print a nice and hopefully useful error message when we can't
|
|
# resolve a conflict.
|
|
#
|
|
def err(reason):
|
|
#
|
|
# Print an action in a nice looking way.
|
|
#
|
|
def explain_action(action):
|
|
if isinstance(action, ReduceAction):
|
|
rhs = ' '.join(str(s) for s in action[3].production.rhs)
|
|
lhs = action[3].production.lhs
|
|
return [" replace the sequence [%s] with %s" % (rhs, lhs)]
|
|
next_state_lr1_items = list(action[0])
|
|
one_of = "" if len(next_state_lr1_items) == 1 else " one of"
|
|
msg = [
|
|
" accept the %s in the hope it will match%s:"
|
|
% (token, one_of)
|
|
]
|
|
msg.extend([
|
|
" " + repr(lr1_item.lr0_item)
|
|
for lr1_item in Lr1Item.sorted(next_state_lr1_items)])
|
|
return msg
|
|
|
|
#
|
|
# Is the passed item relevant to the action?
|
|
#
|
|
def relevant(item, action):
|
|
if isinstance(action, ReduceAction):
|
|
return action[3] is item
|
|
return (item.dot_pos < len(item.production.rhs) and
|
|
item.production.rhs[item.dot_pos] == token)
|
|
#
|
|
# Produce a nicely formatted error message.
|
|
#
|
|
msg = ["Conflict: %s" % (reason,)]
|
|
msg.append("While trying to recognise state %d:" % (item_set.id,))
|
|
for item in item_set.all_items():
|
|
if relevant(item, action_0) or relevant(item, action_1):
|
|
msg.append(" %r" % (item.lr0_item,))
|
|
msg.append("on seeing a %s I could not decide between:" % (token,))
|
|
msg.extend(explain_action(action_0))
|
|
msg.append("and")
|
|
msg.extend(explain_action(action_1))
|
|
msg.append("Please remove this ambiguity from your grammar")
|
|
raise GrammarError('\n'.join(msg))
|
|
|
|
#
|
|
# Decide the associativity of a symbol.
|
|
#
|
|
def assoc(nonterm):
|
|
if isinstance(nonterm, Assoc):
|
|
return nonterm
|
|
return None
|
|
#
|
|
# Ambiguities can arise from productions like this:
|
|
#
|
|
# e = e op e
|
|
# op = op1 | op2
|
|
#
|
|
# When confronted with the token string:
|
|
#
|
|
# e op1 e ^ op2 e
|
|
#
|
|
# We get a shift / reduce conflict at the indicated position.
|
|
# The choice is really between two different parse trees:
|
|
#
|
|
# Shift: (e op1 (e op2 e))
|
|
# Reduce: ((e op1 e) op2 e)
|
|
#
|
|
# This can be resolved in two ways. If priorities are allocated to
|
|
# the clashing productions, then we can choose the action the
|
|
# grammar writer preferred based on which of 'op1' or 'op2' was
|
|
# used:
|
|
#
|
|
# e = Prio(e op1 e, e op2 e)
|
|
# which is equivalent to:
|
|
# e = e op1 e, Priority=0
|
|
# e = e op2 e, Priority=1
|
|
#
|
|
# Or, the grammar writer can specify associativity, ie saying he always
|
|
# wants ((e <op> 2) <op> e) regardless of 'op' (left associative) or he
|
|
# wants (e op (e op e)) regardless of 'op' (right associative). For
|
|
# left associativity it is like this:
|
|
#
|
|
# e = e << op << e
|
|
#
|
|
# A reduce/reduce conflict can happen if the clash happens higher up
|
|
# in the parse tree:
|
|
#
|
|
# e0 = Prio(e1, e2)
|
|
# e1 = 'n'
|
|
# e2 = 'n'
|
|
#
|
|
# When parsing:
|
|
#
|
|
# 'n' ^
|
|
#
|
|
# We have two reduction choices, e1='n' and e2='n'. These productions
|
|
# don't have priorities allocated directly, but the parse table builder
|
|
# will have pushed the priorities e0 allocated down to them. Nested
|
|
# priorities are ranked (see allocate_rank()), and this ranking makes
|
|
# priorities globally comparable.
|
|
#
|
|
lhs_0, low_0, high_0 = action_0.precedence(token, item_set)
|
|
lhs_1, low_1, high_1 = action_1.precedence(token, item_set)
|
|
if low_0 < low_1:
|
|
return "0"
|
|
if low_0 > low_1:
|
|
return "1"
|
|
#
|
|
# The priorities are the same. Try using the associativity.
|
|
#
|
|
assert (
|
|
not isinstance(action_0, ShiftAction) or
|
|
not isinstance(action_1, ShiftAction))
|
|
if (isinstance(action_0, ReduceAction) and
|
|
isinstance(action_1, ReduceAction)):
|
|
return err("Reduce/Reduce")
|
|
lhs_0_assoc, lhs_1_assoc = assoc(lhs_0), assoc(lhs_1)
|
|
lhs = lhs_0_assoc if lhs_0_assoc is not None else lhs_1_assoc
|
|
if not isinstance(lhs, Assoc):
|
|
return err("Shift/Reduce and no associativity")
|
|
if isinstance(lhs_1_assoc, Assoc) and lhs.assoc != lhs_1_assoc.assoc:
|
|
return err("Shift/Reduce and conflicting associativity")
|
|
#
|
|
# The actions have met the associativity pre-conditions so we can
|
|
# resolve the conflict.
|
|
#
|
|
if lhs.assoc == 'l': # Left()
|
|
return "1" if isinstance(action_0, ShiftAction) else "0"
|
|
if lhs.assoc == 'r': # Right()
|
|
return "0" if isinstance(action_0, ShiftAction) else "1"
|
|
assert lhs.assoc == 'n', lhs_0.assoc
|
|
return "" # Nonassoc() - Association is a parse error
|
|
resolve_ambiguity = classmethod(resolve_ambiguity)
|
|
|
|
#
|
|
# This optional step creates a parsing_table with all the information
|
|
# in the ItemSet() pruned.
|
|
#
|
|
def optimise_parsing_table(self, parsing_table):
|
|
start_state, table = parsing_table
|
|
#
|
|
# Create a mapping of item_set: int, with start_state mapping to 0.
|
|
#
|
|
state_number = dict(zip(ItemSet.sorted(table), itertools.count()))
|
|
item_set_0 = next(i for i in state_number if state_number[i] == 0)
|
|
state_number[item_set_0] = state_number[start_state]
|
|
state_number[start_state] = 0
|
|
#
|
|
# Map each item set to it's number.
|
|
#
|
|
optimised = []
|
|
for item_set in sorted(table, key=lambda i: state_number[i]):
|
|
actions = {}
|
|
for token in item_set.actions:
|
|
action = item_set.actions[token]
|
|
if isinstance(action, ShiftAction):
|
|
new_action = (state_number[action[0]],)
|
|
else:
|
|
nonterm_nr = action[0].id
|
|
len_rhs = action[1]
|
|
output = action[2]
|
|
new_action = (nonterm_nr, len_rhs, output, None)
|
|
actions[token] = new_action
|
|
gotos = dict(
|
|
(nonterm.id, state_number[itm_set])
|
|
for nonterm, itm_set in item_set.gotos.items())
|
|
state_id = len(optimised)
|
|
lr1_state = Lr1State(state_id, actions, gotos, item_set.rules)
|
|
optimised.append(lr1_state)
|
|
#
|
|
# And we are done.
|
|
#
|
|
return 0, optimised
|
|
|
|
#
|
|
# The grammar hash.
|
|
#
|
|
def grammar_hash(self):
|
|
grammar_tree = '; '.join(sorted(
|
|
repr(self.rules[name]) for name in self.rules))
|
|
hsh = hashlib.sha512()
|
|
hsh.update(grammar_tree.encode())
|
|
hsh.update(self.VERSION.encode())
|
|
return hsh.hexdigest()
|
|
|
|
#
|
|
# The LR(1) table driven parser.
|
|
#
|
|
def lr1_parser(self, token_feed, tree_factory, on_error, log):
|
|
#
|
|
# Print the current stack, for log.
|
|
#
|
|
def print_stack(stk):
|
|
return " ".join(
|
|
"%s=%s" % (s[0], s[1][0][0] if s[1] else '()') for s in stk)
|
|
|
|
#
|
|
# The lengths I am prepared to go in the name of fast path
|
|
# efficiency frightens me a times.
|
|
#
|
|
def insert_error_recovery_tuple():
|
|
while recovery_stack:
|
|
input_tuple = next(recovery_stack[-1], EOF)
|
|
if input_tuple is not EOF:
|
|
yield input_tuple
|
|
else:
|
|
recovery_stack.pop()
|
|
iterator[0] = original_input
|
|
yield next(iterator[0], EOF)
|
|
|
|
EOF = object()
|
|
recovery_stack = []
|
|
original_input = itertools.chain(token_feed, ((self.eoi_token,),))
|
|
iterator = [original_input]
|
|
start_state, table = self.parsing_table
|
|
state = table[start_state]
|
|
stack = [(state, ((self.empty_token,),))]
|
|
if not self.comment_tokens:
|
|
comments = ()
|
|
else:
|
|
comments = frozenset(self.comment_tokens)
|
|
input_tuple = next(iterator[0], EOF)
|
|
while input_tuple is not EOF:
|
|
token = input_tuple[0]
|
|
if token in comments:
|
|
input_tuple = next(iterator[0], EOF)
|
|
continue
|
|
while True:
|
|
try:
|
|
action = state.actions[token]
|
|
except KeyError:
|
|
#
|
|
# A Parse error. Does he want to do error recovery?
|
|
#
|
|
if on_error is None:
|
|
raise ParseError(input_tuple, stack)
|
|
insert = on_error(iterator[0], input_tuple, stack)
|
|
if insert is None:
|
|
raise ParseError(input_tuple, stack)
|
|
recovery_stack.append(iter(insert))
|
|
iterator = [insert_error_recovery_tuple()]
|
|
break
|
|
#
|
|
# A shift?
|
|
#
|
|
if len(action) == 1:
|
|
if token is self.eoi_token:
|
|
break
|
|
if tree_factory:
|
|
input_tuple = tree_factory(input_tuple)
|
|
state = table[action[0]]
|
|
stack.append((state, (input_tuple,)))
|
|
if log:
|
|
log("shift %s; %s" % (token, print_stack(stack[1:]),))
|
|
break
|
|
#
|
|
# A reduce.
|
|
#
|
|
goto, pop_count, output, _ = action
|
|
if pop_count == 0:
|
|
tail = ()
|
|
nodes = ()
|
|
else:
|
|
tail = stack[-pop_count:]
|
|
del stack[-pop_count:]
|
|
nodes = sum((s[1] for s in tail), ())
|
|
if output is not None:
|
|
nodes = (output,) + nodes
|
|
if tree_factory:
|
|
nodes = tree_factory(nodes)
|
|
nodes = (nodes,)
|
|
state = table[stack[-1][0].gotos[goto]]
|
|
stack.append((state, nodes))
|
|
if log:
|
|
log(
|
|
"reduce %s; %s -- %s" %
|
|
(token, print_stack(tail), print_stack(stack[1:])))
|
|
input_tuple = next(iterator[0], EOF)
|
|
return stack[-1][1][0]
|
|
|
|
|
|
#
|
|
# The base class for Symbol's in the grammar: a token or a non-terminal.
|
|
#
|
|
class Symbol(object):
|
|
DICT_METHODS = (
|
|
"__contains__", "__delitem__", "__getitem__", "__iter__",
|
|
"__len__", "__setitem__")
|
|
__slots__ = ('dict', 'id', 'first_set', 'nested', 'parent') + DICT_METHODS
|
|
SYMBOL_PRECEDENCE = 0
|
|
|
|
def __init__(self):
|
|
self.dict = {}
|
|
self.first_set = frozenset()
|
|
self.nested = ()
|
|
self.parent = None
|
|
#
|
|
# The original idea was to inherit from dict. Doing so meant Symbol
|
|
# wasn't hashable and since it is used extensively in sets and as
|
|
# keys to dict's overriding __hash__ so it was hash'able caused a
|
|
# 10% slowdown in generating the parse table. So now we just emulate
|
|
# a dict.
|
|
#
|
|
self.__contains__ = self.dict.__contains__
|
|
self.__delitem__ = self.dict.__delitem__
|
|
self.__getitem__ = self.dict.__getitem__
|
|
self.__iter__ = self.dict.__iter__
|
|
self.__len__ = self.dict.__len__
|
|
self.__setitem__ = self.dict.__setitem__
|
|
|
|
def __add__(self, other):
|
|
return OpPlus(self, other)
|
|
|
|
def __radd__(self, other):
|
|
return OpPlus(other, self)
|
|
|
|
def __mul__(self, other):
|
|
if other is Opt or other is Some or other is Many or other is Repeat:
|
|
return other(self)
|
|
if isinstance(other, int):
|
|
return Repeat(self, other, other)
|
|
if (not isinstance(other, tuple) or len(other) > 2 or
|
|
any(not isinstance(a, int) for a in other)):
|
|
msg = (
|
|
"right operand of * must be one of: " +
|
|
"Opt, Some, Many, Repeat, (), (min,), (min,max)")
|
|
raise GrammarError(msg)
|
|
return Repeat(self, *other)
|
|
|
|
def __rmul__(self, other):
|
|
if other is Opt or other is Some or other is Many or other is Repeat:
|
|
return other(self)
|
|
if isinstance(other, int):
|
|
return Repeat(self, other, other)
|
|
if (not isinstance(other, tuple) or len(other) > 2 or
|
|
any(not isinstance(a, int) for a in other)):
|
|
msg = (
|
|
"left operand of * must be one of: " +
|
|
"Opt, Some, Many, Repeat, (), (min,), (min,max)")
|
|
raise GrammarError(msg)
|
|
return Repeat(self, *other)
|
|
|
|
def __lshift__(self, other):
|
|
return OpLshift(self, other)
|
|
|
|
def __rlshift__(self, other):
|
|
return OpLshift(other, self)
|
|
|
|
def __rshift__(self, other):
|
|
return OpRshift(self, other)
|
|
|
|
def __rrshift__(self, other):
|
|
return OpRshift(other, self)
|
|
|
|
def __or__(self, other):
|
|
return OpOr(self, other)
|
|
|
|
def __ror__(self, other):
|
|
return OpOr(other, self)
|
|
|
|
def __nonzero__(self):
|
|
return True
|
|
__bool__ = __nonzero__ # For Python3
|
|
|
|
def cast(cls, value):
|
|
value = cls.CAST.get(type(value), lambda x: x)(value)
|
|
if not isinstance(value, Symbol):
|
|
raise GrammarError("%r can't be a Symbol" % value)
|
|
return value
|
|
|
|
def compile_symbol(self, comp):
|
|
raise NotImplementedError()
|
|
|
|
def resolve_symbol(self, name, rules, token_registry):
|
|
return self
|
|
|
|
#
|
|
# Return a unique name for the symbol.
|
|
#
|
|
def __str__(self):
|
|
if self.parent is None:
|
|
return self.__class__.__name__
|
|
same_as_me = tuple(
|
|
sym
|
|
for sym in self.parent.nested
|
|
if sym.__class__.__name__ == self.__class__.__name__)
|
|
if len(same_as_me) < 2:
|
|
my_name = self.__class__.__name__
|
|
else:
|
|
index = same_as_me.index(self)
|
|
my_name = "%s%d" % (self.__class__.__name__, index)
|
|
return "%s.%s" % (self.parent, my_name,)
|
|
|
|
def __repr__(self):
|
|
return "%s(%s)" % (self.__class__.__name__, self.repr_nested())
|
|
|
|
def repr_nested(self):
|
|
return ', '.join(sym.nested_repr() for sym in self.nested)
|
|
|
|
def nested_repr(self):
|
|
return repr(self)
|
|
|
|
def get_rule(self):
|
|
rule = self
|
|
while rule.parent is not None:
|
|
rule = rule.parent
|
|
assert isinstance(rule, Rule)
|
|
return rule
|
|
|
|
|
|
#
|
|
# A forward reference to a symbol that will be defined later.
|
|
#
|
|
class Ref(Symbol):
|
|
__slots__ = ('referenced_name',)
|
|
|
|
def __init__(self, referenced_name):
|
|
super(Ref, self).__init__()
|
|
self.referenced_name = referenced_name
|
|
|
|
def resolve_symbol(self, name, rules, token_registry):
|
|
if self.referenced_name not in rules:
|
|
raise GrammarError("%s references undefined %r" % (name, self))
|
|
if len(self.dict) != 0:
|
|
msg = "Ref(%s) may not have dictionary elements"
|
|
raise GrammarError(msg % (self.referenced_name,))
|
|
return rules[self.referenced_name]
|
|
|
|
def __repr__(self):
|
|
return "%s(%r)" % (self.__class__.__name__, self.referenced_name)
|
|
|
|
|
|
#
|
|
# A reference to the Rule() of the current production.
|
|
#
|
|
class This(Symbol):
|
|
__slots__ = ()
|
|
|
|
def resolve_symbol(self, name, rules, token_registry):
|
|
return rules[name]
|
|
|
|
|
|
THIS = This()
|
|
|
|
|
|
#
|
|
# Combine several lists productions as a sequence. In other words, say we
|
|
# had several nonterms which were combined as a sequence:
|
|
#
|
|
# a = a0 + a1 + a2
|
|
#
|
|
# a0, a1 and a2 have their own lists of productions. We must produce the
|
|
# productions for a from those lists. So:
|
|
#
|
|
# Given:
|
|
# a0 = [(a,),(b,)]
|
|
# a1 = [(c,d),(e,f)]
|
|
# a2 = [(g,)]
|
|
#
|
|
# Return:
|
|
# [(a,c,d,g), (a,e,f,g), (b,c,d,g), (b,e,f,g)]
|
|
#
|
|
def seq(*args):
|
|
if not isinstance(args[-1], int):
|
|
repeats = 1
|
|
else:
|
|
repeats = args[-1]
|
|
args = args[:-1]
|
|
args = [[(p,)] if isinstance(p, Symbol) else p for p in args]
|
|
result = list(
|
|
tuple(itertools.chain.from_iterable(prod))
|
|
for prod in itertools.product(*args, repeat=repeats))
|
|
return result
|
|
|
|
|
|
#
|
|
# A non-terminal: A symbol that appears on the left hand side of a rule.
|
|
#
|
|
class Nonterm(Symbol):
|
|
__slots__ = ('rank', 'productions', 'priority')
|
|
|
|
def __init__(self, *args):
|
|
super(Nonterm, self).__init__()
|
|
for symbol in args:
|
|
if isinstance(symbol, Rule) and symbol.resolved:
|
|
msg = "Can not import %s from another Grammar" % (
|
|
symbol.name,)
|
|
raise GrammarError(msg)
|
|
if isinstance(symbol, MetaToken):
|
|
msg = "%s can not be used in a production" % (symbol.name,)
|
|
raise GrammarError(msg)
|
|
self.nested = [self.cast(symbol) for symbol in args]
|
|
self.priority = None
|
|
self.productions = ()
|
|
self.rank = None
|
|
|
|
#
|
|
# Compile the grammar. In other words turn it into a series of LR
|
|
# productions. Start from the start symbol and work our way down.
|
|
# Record what symbols were referenced, as unreferenced ones are
|
|
# probably an error.
|
|
#
|
|
def compile_grammar(self, empty_token):
|
|
#
|
|
# A recursive function that visits all symbols in the grammar,
|
|
# compiling them.
|
|
#
|
|
def comp(symbol):
|
|
if not isinstance(symbol, Rule):
|
|
return symbol.compile_symbol(comp)
|
|
if symbol not in seen:
|
|
seen.add(symbol)
|
|
symbol.emit(symbol.compile_symbol(comp))
|
|
return [(symbol,)]
|
|
seen = set()
|
|
comp(self)
|
|
self.allocate_symbol_id(empty_token)
|
|
self.allocate_rank()
|
|
return seen
|
|
|
|
#
|
|
# Allocate Nonterm.rank, so priorities can be calculated during the
|
|
# parser generator process. Lets say we have:
|
|
#
|
|
# START = Prio(a, b)
|
|
# a = Prio(b, c)
|
|
# b = 'n'
|
|
# c = Prio(START, a)
|
|
#
|
|
# So START has allocated 'b' a priority of 1, but 'a' has allocated 'b' a
|
|
# priority of 0. The question is how do we compare these? The answer
|
|
# used here is to rank the priorities based on how close they are to
|
|
# the START symbol. START is 0 away from itself, so it has the most
|
|
# preferred rank. 'a' and 'b' have ranks of 1, because they are referred
|
|
# to directly by START, and 'c' has a rank of 2.
|
|
#
|
|
# The parser uses this rank to create a priority tuple: (p0, p1, p2, ...),
|
|
# where p0 is the assigned by rank 0, p1 is the priority assigned by rank 1
|
|
# and so on.
|
|
#
|
|
# So in answering the question earlier, START's priority for 'b' is:
|
|
# (START = b,) which is (1,),
|
|
# and 'a's priority for 'b' is:
|
|
# (START = a, a = b,) which is (0,0,).
|
|
#
|
|
# When parsing this sequence of symbols:
|
|
#
|
|
# 'n' __end_of_input__
|
|
#
|
|
# The parser will go through these states.
|
|
#
|
|
# STACK INPUT POSSIBLE ACTIONS
|
|
# [] 'n' shift 'n'
|
|
# ['n'] __end_of_input__ reduce b = 'n'
|
|
# [b] __end_of_input__ reduce a = b, reduce START = b
|
|
#
|
|
# The priority of a = b is (0,0), and the priority of START = b is (1,).
|
|
# Since (0,0) < (1,), the parser will chose a = b.
|
|
#
|
|
def allocate_rank(self):
|
|
#
|
|
# First determine the height of each Nonterm in the production tree
|
|
# by doing a a breadth first pass over it. Collect the tree structure
|
|
# as we go.
|
|
#
|
|
height = 0
|
|
refers = collections.defaultdict(set)
|
|
refers[self].add(self)
|
|
symbol_heights = {self: height}
|
|
height += 1
|
|
queue = [self]
|
|
while queue:
|
|
queue, process = [], queue
|
|
for nonterm in process:
|
|
refers[nonterm] |= frozenset()
|
|
for production in nonterm.productions:
|
|
for symbol in production.rhs:
|
|
if isinstance(symbol, Nonterm):
|
|
if nonterm is not symbol:
|
|
refers[symbol].add(nonterm)
|
|
if symbol not in symbol_heights:
|
|
symbol_heights[symbol] = height
|
|
queue.append(symbol)
|
|
height += 1
|
|
|
|
#
|
|
# Turn it into a hierarchy. The idea is we do a topological sort on
|
|
# the graph using the tree structure we accumulated, breaking any
|
|
# cycles using height.
|
|
#
|
|
def remove(no_refers):
|
|
no_prio = set(
|
|
s for s in no_refers
|
|
if not isinstance(s, Prio.Prioritised))
|
|
if not no_prio:
|
|
rank[0] += 1
|
|
else:
|
|
no_refers = no_prio
|
|
for s in no_refers:
|
|
s.rank = rank[0]
|
|
del refers[s]
|
|
for s in refers:
|
|
refers[s] -= no_refers
|
|
rank = [0]
|
|
while refers:
|
|
no_refers = set(s for s, r in refers.items() if not r)
|
|
while refers and no_refers:
|
|
remove(no_refers)
|
|
no_refers = set(s for s, r in refers.items() if not r)
|
|
#
|
|
# If that didn't consume every token we have a cycle. Break the
|
|
# cycle by considering the node closest to the START symbol (ie
|
|
# lowest height) as havingt the heighest priority.
|
|
#
|
|
if refers:
|
|
low = min(symbol_heights[s] for s in refers)
|
|
lowest = set(s for s in refers if symbol_heights[s] == low)
|
|
remove(lowest)
|
|
# print sorted("%d %s" % (sym.rank, sym) for sym in symbol_heights)
|
|
|
|
#
|
|
# Allocate each symbol a unique id. This is used to make things
|
|
# deterministic when we iterate through hash tables and sets.
|
|
#
|
|
def allocate_symbol_id(self, empty_token):
|
|
queue = collections.deque([self, empty_token])
|
|
all_symbols = set()
|
|
while queue:
|
|
symbol = queue.popleft()
|
|
if symbol in all_symbols:
|
|
continue
|
|
symbol.id = len(all_symbols)
|
|
all_symbols.add(symbol)
|
|
if isinstance(symbol, Nonterm):
|
|
for production in symbol.productions:
|
|
queue.extend(production.rhs)
|
|
|
|
#
|
|
# Generate the Production() objects for the Nonterm().
|
|
#
|
|
def emit(self, productions):
|
|
self.productions = [Production(self, p) for p in productions]
|
|
|
|
#
|
|
# Merge the first_sets of all productions into ours. Return True if
|
|
# the first_set was changed.
|
|
#
|
|
def merge_first_set(self, parser):
|
|
want_empty = False
|
|
have_empty = parser.empty_token in self.first_set
|
|
result = self.first_set
|
|
for production in self.productions:
|
|
#
|
|
# For all A = a + b, merge first(a) into first(A).
|
|
#
|
|
for symbol in production.rhs:
|
|
result |= symbol.first_set
|
|
if parser.empty_token not in symbol.first_set:
|
|
break
|
|
have_empty = True
|
|
else:
|
|
want_empty = True
|
|
if want_empty:
|
|
if not have_empty:
|
|
result |= parser.empty_token.first_set
|
|
else:
|
|
if have_empty:
|
|
result -= parser.empty_token.first_set
|
|
if len(result) != len(self.first_set):
|
|
self.first_set = result
|
|
return True
|
|
return False
|
|
|
|
#
|
|
# For nonterms that accept lists, print in a nice way.
|
|
#
|
|
def nonterm_repr(self, delimiter):
|
|
def repr_sym(symbol):
|
|
if (len(symbol.nested) < 2 or
|
|
self.SYMBOL_PRECEDENCE <= symbol.SYMBOL_PRECEDENCE):
|
|
return symbol.nested_repr()
|
|
return '(' + symbol.nested_repr() + ')'
|
|
if len(self.nested) < 2:
|
|
return "%s(%s)" % (
|
|
self.__class__.__name__,
|
|
', '.join(sym.nested_repr() for sym in self.nested),)
|
|
nested = (repr_sym(symbol) for symbol in self.nested)
|
|
return (' ' + delimiter + ' ').join(nested)
|
|
|
|
|
|
#
|
|
# A symbol on the left hand side of a grammar rule.
|
|
#
|
|
class Rule(Nonterm):
|
|
__slots__ = ('name', 'resolved')
|
|
|
|
def __init__(self, name, symbol):
|
|
super(Rule, self).__init__(symbol)
|
|
self.name = name
|
|
self.resolved = False
|
|
|
|
def compile_symbol(self, comp):
|
|
return comp(*self.nested)
|
|
|
|
def __str__(self):
|
|
return self.name
|
|
|
|
def __repr__(self):
|
|
if isinstance(self.nested[0], Rule):
|
|
return "%s = %s" % (self.name, self.nested[0].name)
|
|
return "%s = %r" % (self.name, self.nested[0])
|
|
|
|
def nested_repr(self):
|
|
return "%s" % (self.name,)
|
|
|
|
|
|
#
|
|
# The arguments are a single production. Thus:
|
|
#
|
|
# L = Sequence(sym0, sym1, sym2)
|
|
#
|
|
# Yields the production:
|
|
#
|
|
# L ::= sym0 sym1 sym2
|
|
#
|
|
class Sequence(Nonterm):
|
|
__slots__ = ()
|
|
SYMBOL_PRECEDENCE = 3
|
|
|
|
def __init__(self, *args):
|
|
super(Sequence, self).__init__(*args)
|
|
|
|
def compile_symbol(self, comp):
|
|
return seq(*[comp(symbol) for symbol in self.nested])
|
|
|
|
def __repr__(self):
|
|
return self.nonterm_repr('+')
|
|
|
|
|
|
#
|
|
# The arguments are alternate productions. Thus
|
|
#
|
|
# L = Choice(sym0, sym1, sym2)
|
|
#
|
|
# Yields the productions:
|
|
#
|
|
# L ::= sym0
|
|
# L ::= sym1
|
|
# L ::= sym2
|
|
#
|
|
class Choice(Nonterm):
|
|
__slots__ = ()
|
|
SYMBOL_PRECEDENCE = 1
|
|
|
|
def __init__(self, *args):
|
|
super(Choice, self).__init__(*args)
|
|
|
|
def compile_symbol(self, comp):
|
|
return sum([comp(sym) for sym in self.nested], [])
|
|
|
|
def __repr__(self):
|
|
return self.nonterm_repr('|')
|
|
|
|
|
|
#
|
|
# Binary operations. Repeated applications of the same binary operation
|
|
# yield a single list. The op is then applied to that list. Thus:
|
|
#
|
|
# L = sym0 + sym1 + sym2 | sym3 | sym4
|
|
#
|
|
# Gets turned into:
|
|
#
|
|
# L = Alternate(Sequence(sym0, sym1, sym2), sym3, sym4).
|
|
#
|
|
class BinOp(Nonterm):
|
|
__slots__ = ()
|
|
|
|
def __init__(self, arg1, arg2):
|
|
super(BinOp, self).__init__(arg1, arg2)
|
|
|
|
def combine(self):
|
|
def r(arg1, arg2):
|
|
a1 = r(*arg1.nested) if type(self) == type(arg1) else [arg1]
|
|
a2 = r(*arg2.nested) if type(self) == type(arg2) else [arg2]
|
|
return a1 + a2
|
|
return r(*self.nested)
|
|
|
|
|
|
#
|
|
# Constructed for: sym | sym.
|
|
#
|
|
class OpOr(BinOp):
|
|
__slots__ = ()
|
|
|
|
def resolve_symbol(self, name, rules, token_registry):
|
|
return Choice(*self.combine())
|
|
|
|
|
|
#
|
|
# Constructed for: sym + sym.
|
|
#
|
|
class OpPlus(BinOp):
|
|
__slots__ = ()
|
|
|
|
def resolve_symbol(self, name, rules, token_registry):
|
|
return Sequence(*self.combine())
|
|
|
|
|
|
#
|
|
# Constructed for: sym << sym.
|
|
#
|
|
class OpLshift(BinOp):
|
|
__slots__ = ()
|
|
|
|
def resolve_symbol(self, name, rules, token_registry):
|
|
return Left(*self.combine())
|
|
|
|
|
|
#
|
|
# Constructed for: sym >> sym.
|
|
#
|
|
class OpRshift(BinOp):
|
|
__slots__ = ()
|
|
|
|
def resolve_symbol(self, name, rules, token_registry):
|
|
return Right(*self.combine())
|
|
|
|
|
|
#
|
|
# A priority list. A priority list is passed a list of symbols:
|
|
# S = Prio(sym0, sym1, sym2)
|
|
# is identical to:
|
|
# S = sym0 | sym1 | sym2
|
|
# with the side effect that in the event of a conflict we will choose
|
|
# sym0 over sym1 over sym2.
|
|
#
|
|
class Prio(Nonterm):
|
|
#
|
|
# This node holds priorities.
|
|
#
|
|
class Prioritised(Nonterm):
|
|
__slots__ = ()
|
|
|
|
def __init__(self, priority, symbol):
|
|
super(Prio.Prioritised, self).__init__(symbol)
|
|
self.priority = priority
|
|
|
|
def compile_symbol(self, comp):
|
|
self.emit(comp(*self.nested))
|
|
return [(self,)]
|
|
|
|
__slots__ = ()
|
|
|
|
def __init__(self, *args):
|
|
super(Prio, self).__init__(*args)
|
|
|
|
#
|
|
# Priority nodes allocate a separate node for each sub-node. We
|
|
# subsume nested Prio's.
|
|
#
|
|
def resolve_symbol(self, name, rules, token_registry):
|
|
def r(prio):
|
|
for symbol in prio.nested:
|
|
if isinstance(symbol, Prio):
|
|
r(symbol)
|
|
else:
|
|
children.append(self.Prioritised(next(index), symbol))
|
|
index = itertools.count()
|
|
children = []
|
|
r(self)
|
|
self.nested = children
|
|
return self
|
|
|
|
def compile_symbol(self, comp):
|
|
self.emit(sum([comp(sym) for sym in self.nested], []))
|
|
return [(self,)]
|
|
|
|
def __repr__(self):
|
|
if len(self.nested) < 2:
|
|
return "%s(%s)" % (
|
|
self.__class__.__name__,
|
|
', '.join(sym.nested_repr() for sym in self.nested))
|
|
return "(%s)" % (', '.join(sym.nested_repr() for sym in self.nested),)
|
|
|
|
|
|
#
|
|
# This node holds associativity: ie left, right or not allowed.
|
|
#
|
|
class Assoc(Nonterm):
|
|
__slots__ = ('assoc',)
|
|
|
|
def __init__(self, assoc, *args):
|
|
if assoc not in 'lnr':
|
|
raise GrammarError("Unknown associativity %r" % (assoc,))
|
|
super(Assoc, self).__init__(*args)
|
|
self.assoc = assoc
|
|
|
|
def compile_symbol(self, comp):
|
|
self.emit(seq(*[comp(symbol) for symbol in self.nested]))
|
|
return [(self,)]
|
|
|
|
def __repr__(self):
|
|
if type(self) is not Assoc:
|
|
return super(Assoc, self).__repr__()
|
|
return "%s(%s, %s)" % (
|
|
self.__class__.__name__, self.assoc, self.repr_nested(),)
|
|
|
|
|
|
#
|
|
# Force left associativity.
|
|
#
|
|
class Left(Assoc):
|
|
__slots__ = ()
|
|
SYMBOL_PRECEDENCE = 2
|
|
|
|
def __init__(self, *args):
|
|
super(Left, self).__init__('l', *args)
|
|
|
|
def __repr__(self):
|
|
return self.nonterm_repr('<<')
|
|
|
|
|
|
#
|
|
# Force right associativity.
|
|
#
|
|
class Right(Assoc):
|
|
__slots__ = ()
|
|
SYMBOL_PRECEDENCE = 2
|
|
|
|
def __init__(self, *args):
|
|
super(Right, self).__init__('r', *args)
|
|
|
|
def __repr__(self):
|
|
return self.nonterm_repr('>>')
|
|
|
|
|
|
#
|
|
# Force non associative.
|
|
#
|
|
class Nonassoc(Assoc):
|
|
__slots__ = ()
|
|
|
|
def __init__(self, *args):
|
|
super(Nonassoc, self).__init__('n', *args)
|
|
|
|
|
|
#
|
|
# Construct a list of productions separated by a delimiter.
|
|
#
|
|
class List(Nonterm):
|
|
__slots__ = ('max', 'min', 'opt',)
|
|
|
|
def __init__(self, symbol, delimiter, min=None, max=None, opt=None):
|
|
super(List, self).__init__(symbol, delimiter)
|
|
self.min = 0 if min is None else min
|
|
self.max = max
|
|
self.opt = opt
|
|
if max is not None and max < self.min:
|
|
raise GrammarError("min may not be greater than max")
|
|
|
|
def compile_symbol(self, comp):
|
|
symbol = comp(self.nested[0])
|
|
delimiter = comp(self.nested[1])
|
|
productions = []
|
|
if self.min == 0:
|
|
productions.append(())
|
|
#
|
|
# A fixed max means we can just list all the possibilities like this:
|
|
#
|
|
# [(), (sym,), (sym,delim,sym), (sym,delim,sym,delim,sym)]
|
|
#
|
|
if self.max is not None:
|
|
for repeat in range(max(0, self.min - 1), self.max):
|
|
prod = seq(symbol, seq(delimiter, symbol, repeat))
|
|
productions.extend(prod)
|
|
if self.opt:
|
|
productions.extend(seq(prod, delimiter))
|
|
return productions
|
|
#
|
|
# There is no upper maximum, so we need recursion:
|
|
#
|
|
# For right assoc:
|
|
# [(sym,delim,sym,delim,S)]
|
|
# S ::= [(sym,), (sym,delim,S)]
|
|
#
|
|
# For left assoc:
|
|
# [(S,delim,sym,delim,sym)]
|
|
# S ::= [(sym,), (S,delim,sym)]
|
|
#
|
|
my_productions = list(symbol)
|
|
if self.parent is None or not isinstance(self.parent, Assoc):
|
|
assoc = None
|
|
else:
|
|
assoc = self.parent.assoc
|
|
repeat = max(0, self.min - 1)
|
|
if assoc == 'r':
|
|
prod = seq(seq(symbol, delimiter, repeat), self)
|
|
productions.extend(prod)
|
|
my_productions.extend(seq(symbol, delimiter, self))
|
|
if self.opt:
|
|
my_productions.extend(seq(symbol, delimiter))
|
|
elif assoc is None or assoc == 'l':
|
|
prod = seq(self, seq(delimiter, symbol, repeat))
|
|
productions.extend(prod)
|
|
if self.opt:
|
|
productions.extend(seq(prod, delimiter))
|
|
my_productions.extend(seq(self, delimiter, symbol))
|
|
else:
|
|
msg = "Can't implement %s associativity on %r"
|
|
raise GrammarError(msg % (assoc, self))
|
|
self.emit(my_productions)
|
|
return productions
|
|
|
|
def __repr__(self):
|
|
if self.opt is not None:
|
|
return "%s(%s, %r, %r, %r)" % (
|
|
self.__class__.__name__, self.repr_nested(),
|
|
self.min, self.max, self.opt)
|
|
if self.max is not None:
|
|
return "%s(%s, %r, %r)" % (
|
|
self.__class__.__name__, self.repr_nested(),
|
|
self.min, self.max)
|
|
if self.min != 0:
|
|
return "%s(%s, %r)" % (
|
|
self.__class__.__name__, self.repr_nested(),
|
|
self.min)
|
|
return "%s(%s)" % (self.__class__.__name__, self.repr_nested())
|
|
|
|
|
|
#
|
|
# Repeats. This class handles all forms of repeats.
|
|
#
|
|
class Repeat(Nonterm):
|
|
__slots__ = ('min', 'max',)
|
|
|
|
def __init__(self, symbol, min=None, max=None):
|
|
super(Repeat, self).__init__(symbol)
|
|
self.min = 0 if min is None else min
|
|
self.max = max
|
|
if max is not None and max < self.min:
|
|
raise GrammarError("min may not be greater than max")
|
|
|
|
def compile_symbol(self, comp):
|
|
symbol = comp(*self.nested)
|
|
#
|
|
# If we have both min and max repeats we can just enumerate the
|
|
# results like this:
|
|
#
|
|
# [ (), (symbol,), (symbol,symbol,), ... ]
|
|
#
|
|
if self.max is not None:
|
|
prods = (seq(symbol, rpt) for rpt in range(self.min, self.max + 1))
|
|
return sum(prods, [])
|
|
#
|
|
# There is no maximum, so we must use recursion.
|
|
#
|
|
# For right associative:
|
|
# [ (symbol,symbol,S) ]
|
|
# S ::= [symbol, (symbol, S) ]
|
|
#
|
|
# For left associative:
|
|
# [ (S,symbol,symbol) ]
|
|
# S ::= [symbol, (S,symbol) ]
|
|
#
|
|
if self.parent is None or not isinstance(self.parent, Assoc):
|
|
assoc = None
|
|
else:
|
|
assoc = self.parent.assoc
|
|
if self.min == 0:
|
|
productions = [(), (self,)]
|
|
else:
|
|
productions = seq(seq(symbol, max(0, self.min - 1)), self)
|
|
if assoc == 'r':
|
|
my_productions = symbol + seq(symbol, self)
|
|
elif assoc is None or assoc == 'l':
|
|
my_productions = symbol + seq(self, symbol)
|
|
else:
|
|
msg = "Can't implement %s associativity on %r"
|
|
raise GrammarError(msg % (assoc, self))
|
|
self.emit(my_productions)
|
|
return productions
|
|
|
|
def __repr__(self):
|
|
nested = self.nested[0].nested_repr()
|
|
if len(self.nested[0].nested) >= 2:
|
|
nested = '(%s)' % (nested,)
|
|
if type(self) is not Repeat:
|
|
return "%s * %s" % (nested, self.__class__.__name__)
|
|
if self.min == self.max:
|
|
return "%s * %r" % (nested, self.min)
|
|
if self.max is not None:
|
|
return "%s * (%r, %r)" % (nested, self.min, self.max)
|
|
if self.min != 0:
|
|
return "%s * (%r,)" % (nested, self.min)
|
|
return "%s * ()" % (nested,)
|
|
|
|
|
|
#
|
|
# Optional - ie 0 or 1.
|
|
#
|
|
class Opt(Repeat):
|
|
__slots__ = ()
|
|
|
|
def __init__(self, symbol):
|
|
super(Opt, self).__init__(symbol, 0, 1)
|
|
|
|
|
|
#
|
|
# 1 or more.
|
|
#
|
|
class Some(Repeat):
|
|
__slots__ = ()
|
|
|
|
def __init__(self, symbol):
|
|
super(Some, self).__init__(symbol, 1, None)
|
|
|
|
|
|
#
|
|
# 0 or more.
|
|
#
|
|
class Many(Repeat):
|
|
__slots__ = ()
|
|
|
|
def __init__(self, symbol):
|
|
super(Many, self).__init__(symbol, 0, None)
|
|
|
|
|
|
#
|
|
# Generate tokens by splitting a string.
|
|
#
|
|
class Tokens(Nonterm):
|
|
__slots__ = ()
|
|
|
|
def __init__(self, literals, keywords=None, case=None):
|
|
tokens = []
|
|
if literals and literals.strip():
|
|
tokens.extend([
|
|
Token(literal, case=case)
|
|
for literal in literals.strip().split()])
|
|
if keywords and keywords.strip():
|
|
tokens.extend([
|
|
Keyword(keyword, case)
|
|
for keyword in keywords.strip().split()])
|
|
super(Tokens, self).__init__(*tokens)
|
|
|
|
def resolve_symbol(self, name, rules, token_registry):
|
|
return Choice(*self.nested)
|
|
|
|
|
|
#
|
|
# The Tokeniser() breaks up input into tokens.
|
|
#
|
|
class Tokeniser(object):
|
|
literals = None # dict, {"literal": token, ...}
|
|
regex = None # object, re.compile() - compiled token recognisers
|
|
re_groups = None # tuple, (int, ...)
|
|
re_list = None # tuple, (Token(), ...)
|
|
registry = None # object, TokenRegistry() that owns us
|
|
unrecognised = None # object, The UnrecognisedToken()
|
|
re_flags = re.DOTALL | re.MULTILINE
|
|
|
|
ANCHOR_RE = re.compile(r'(?:[^[\\]|\\.|\[\^?\]?(?:\\.|[^]\\])*\])*\\[AZ]')
|
|
BACKREF_RE = re.compile(
|
|
r'(?:[^[\\]|\\[^0-9]|\[\^?\]?(?:\\.|[^]\\])*\])*(\\[0-9]+)')
|
|
CAPTURE_RE = re.compile(r'[^[\\(]|\\.|\[\^?\]?(?:\\.|[^]\\])*\]|\(\?')
|
|
|
|
def compile_tokens(self, token_registry, whitespace):
|
|
self.registry = token_registry
|
|
#
|
|
# Whitespace must be a string.
|
|
#
|
|
if whitespace is not None:
|
|
if not isinstance(whitespace, string_types):
|
|
raise GrammarError("WHITESPACE must be a string")
|
|
all_tokens = (
|
|
token
|
|
for token in self.registry.values()
|
|
if isinstance(token, Token))
|
|
key = lambda t: (to_str(t.literal), to_str(t.re))
|
|
all_tokens = sorted(all_tokens, key=key, reverse=True)
|
|
patterns = [token for token in all_tokens if token.re is not None]
|
|
self.literals = {}
|
|
self.unrecognised = next(
|
|
(t for t in all_tokens if t.re is None and t.literal is None),
|
|
None)
|
|
#
|
|
# It's amazing what unit testing turns up. Would anybody really use
|
|
# the inbuilt tokeniser without recognising a single token?
|
|
#
|
|
self.re_list = []
|
|
self.re_groups = []
|
|
if not patterns:
|
|
pattern = "x(?<=y)" # An re that never matches
|
|
else:
|
|
#
|
|
# Currently Python's re module returns the first re that matches
|
|
# when given the sequence a|b|c. We always want it to match the
|
|
# longest possible literal. In Python 2.7 putting the longest
|
|
# literals first makes that happen.
|
|
#
|
|
# Backreferences are allowed, but since grammar writer has no idea
|
|
# what order we will put them in we have to renumber them.
|
|
#
|
|
longest = lambda token: (token.literal is not None, -len(token.re))
|
|
ordered_patterns = []
|
|
backref_base = 0
|
|
for token in sorted(patterns, key=longest):
|
|
base = backref_base
|
|
backref_base += 1
|
|
token_re = '(?:()(?:%s))' % token.re
|
|
self.re_groups.append(backref_base)
|
|
self.re_list.append(token)
|
|
backref_matches = tuple(self.BACKREF_RE.finditer(token_re))
|
|
for match in reversed(backref_matches):
|
|
backref_no = int(match.group(1)[1:], 10) + backref_base
|
|
token_re = "%s\\%d%s" % (
|
|
token_re[:match.start(1)],
|
|
backref_no,
|
|
token_re[match.end(1):])
|
|
backref_base = base + len(self.CAPTURE_RE.sub('', token_re))
|
|
ordered_patterns.append(token_re)
|
|
pattern = '|'.join(ordered_patterns)
|
|
self.regex = re.compile(pattern, self.re_flags)
|
|
self.re_list = tuple(self.re_list)
|
|
self.re_groups = tuple(self.re_groups)
|
|
#
|
|
# Gather all literals and checking there are no duplicates.
|
|
#
|
|
all_re = {}
|
|
for token in all_tokens:
|
|
#
|
|
# Ensure the re isn't duplicated.
|
|
#
|
|
if token.re is not None:
|
|
if token.re in all_re:
|
|
msg = "Token's %r and %r define the same re"
|
|
raise GrammarError(msg % (token, all_re[token.re]))
|
|
all_re[token.re] = token
|
|
if token.literal is not None:
|
|
self.literals[token.literal] = token
|
|
#
|
|
# Ensure the literal is matched by exactly one Token.
|
|
# This also ensures there are no duplicate literals.
|
|
#
|
|
matches = []
|
|
for re_tok in patterns:
|
|
if self.ANCHOR_RE.match(re_tok.re):
|
|
continue
|
|
match = re.match(re_tok.re, token.literal)
|
|
if match is not None:
|
|
if (re_tok.literal is None or
|
|
match.group() == token.literal):
|
|
matches.append((re_tok, match))
|
|
if not matches:
|
|
msg = "Keyword %r does not match any re"
|
|
raise GrammarError(msg % (token,))
|
|
if len(matches) == 1:
|
|
token.owner = matches[0][0]
|
|
else:
|
|
re_matches = [
|
|
m[0] for m in matches if m[0].literal is None]
|
|
if re_matches:
|
|
msg = (
|
|
"Literal token %s should be a Keyword " +
|
|
"as it matches re token %s")
|
|
res = ', '.join(str(match) for match in re_matches)
|
|
raise GrammarError(msg % (token, res))
|
|
matches = [m for m in matches if m is not token]
|
|
msg = "duplicate literal %r and %r"
|
|
raise GrammarError(msg % (matches[0][0], token,))
|
|
if not any(m.group() == token.literal for t, m in matches):
|
|
msg = "Token.re %r partially matches %r of Keyword %r"
|
|
raise GrammarError(msg % (matches[0], match, token))
|
|
|
|
#
|
|
# This generator is a filter. It takes a string generator as an argument
|
|
# and generates tokens ready to be fed into the parser. The strings
|
|
# returned by the generator are assumed to be entire tokens.
|
|
#
|
|
def tokeniser(self, input, whitespace=None):
|
|
#
|
|
# pos = [position_in_stream, line_number, column_number]
|
|
#
|
|
def update_position(data):
|
|
ldata = len(data)
|
|
pos[0] += ldata
|
|
matches = list(re.finditer("(?:\n\r?|\r\n?)", data))
|
|
if not matches:
|
|
pos[2] += ldata
|
|
else:
|
|
pos[1] += len(matches)
|
|
pos[2] = ldata - matches[-1].end() + 1
|
|
pos = [0, 1, 1]
|
|
#
|
|
# Normalise the parameters.
|
|
#
|
|
iterator = iter((input,) if isinstance(input, string_types) else input)
|
|
if whitespace == "":
|
|
is_whitespace = lambda s: False
|
|
last_whitespace = lambda s: len(s)
|
|
else:
|
|
spaces = " \f\n\r\t\v" if whitespace is None else whitespace
|
|
is_whitespace = lambda s: not s.lstrip(spaces)
|
|
trans = string_maketrans(spaces, spaces[0] * len(spaces))
|
|
last_whitespace = lambda s: s.translate(trans).rfind(spaces[0])
|
|
#
|
|
# Loop until end of the stream.
|
|
#
|
|
cur_tok = next(iterator, None)
|
|
cur_isstr = isinstance(cur_tok, string_types)
|
|
nxt_tok = next(iterator, None)
|
|
nxt_isstr = isinstance(nxt_tok, string_types)
|
|
while cur_tok is not None:
|
|
buf = ""
|
|
#
|
|
# Loop while the current token is a string.
|
|
#
|
|
while cur_isstr:
|
|
#
|
|
# If the next token isn't a string we must parse all of the
|
|
# input, otherwise we only parse up to the last space. This
|
|
# somewhat reduces the chance of tokens being truncated across
|
|
# iterator boundaries.
|
|
#
|
|
last = last_whitespace(cur_tok) if nxt_isstr else len(cur_tok)
|
|
if last == -1:
|
|
buf += cur_tok
|
|
else:
|
|
last += len(buf)
|
|
buf += cur_tok
|
|
offset = 0
|
|
while offset < last:
|
|
match = self.regex.search(buf, offset, last)
|
|
if match is not None:
|
|
start, end = match.span()
|
|
else:
|
|
if nxt_isstr:
|
|
break
|
|
start, end = last, last
|
|
#
|
|
# The only thing that can separate one token and the
|
|
# next is whitespace.
|
|
#
|
|
if offset < start:
|
|
in_between = buf[offset:start]
|
|
if not is_whitespace(in_between):
|
|
if self.unrecognised is None:
|
|
msg = (
|
|
"Unrecognised token %r " +
|
|
"at line %d column %d"
|
|
)
|
|
raise TokenError(
|
|
msg % (in_between, pos[1], pos[2]),
|
|
in_between, pos[0], pos[1], pos[2])
|
|
yield (
|
|
self.unrecognised, in_between,
|
|
pos[0], pos[1], pos[2])
|
|
update_position(in_between)
|
|
if start == last:
|
|
break
|
|
#
|
|
# Found some data that matches a token. Identify what
|
|
# token it matches.
|
|
#
|
|
data = match.group()
|
|
try:
|
|
token = self.literals[data]
|
|
except KeyError:
|
|
try:
|
|
token = self.literals[data.lower()]
|
|
if token.case:
|
|
token = None
|
|
except KeyError:
|
|
token = None
|
|
if token is None:
|
|
idx = match.group(*self.re_groups).index('')
|
|
token = self.re_list[idx]
|
|
if token.refine is not None:
|
|
token = token.refine(self.registry, data)
|
|
yield token, data, pos[0], pos[1], pos[2]
|
|
update_position(data)
|
|
offset = end
|
|
buf = buf[offset:]
|
|
cur_tok = nxt_tok
|
|
cur_isstr = nxt_isstr
|
|
nxt_tok = next(iterator, None)
|
|
nxt_isstr = isinstance(nxt_tok, string_types)
|
|
#
|
|
# If we are given non-strings pass it straight on.
|
|
#
|
|
while cur_tok is not None and not cur_isstr:
|
|
yield cur_tok
|
|
cur_tok = nxt_tok
|
|
cur_isstr = nxt_isstr
|
|
nxt_tok = next(iterator, None)
|
|
nxt_isstr = isinstance(nxt_tok, string_types)
|
|
|
|
|
|
#
|
|
# Meta class that does the work for a TokenRegistry.
|
|
#
|
|
class TokenRegistryMeta(type):
|
|
|
|
def __new__(cls, name, bases, dct):
|
|
registry = super(TokenRegistryMeta, cls).__new__(cls, name, bases, dct)
|
|
if dct.get("__metaclass__", None) is not cls:
|
|
registry.save_dicts()
|
|
return registry
|
|
|
|
|
|
#
|
|
# Put your token definitions in a class that inherits from this one.
|
|
#
|
|
class TokenRegistry(dict):
|
|
__dicts = None
|
|
__metaclass__ = TokenRegistryMeta
|
|
__tokeniser = None
|
|
|
|
def __init__(self):
|
|
for name, token_symbol in self.__class__.__dict__.items():
|
|
if isinstance(token_symbol, TokenSymbol):
|
|
qualified_name = "%s.%s" % (self.__class__.__name__, name)
|
|
token_symbol.set_name(qualified_name)
|
|
self._resolve_token_(token_symbol)
|
|
|
|
#
|
|
# Save the registered token's dicts, as they may be overwritten
|
|
# by the Grammar. This is called by the meta class, so it happens
|
|
# before the Grammar has a chance to get it's fingers into the pie.
|
|
#
|
|
def save_dicts(cls):
|
|
cls.__dicts = {}
|
|
for token_symbol in cls.__dict__.values():
|
|
if isinstance(token_symbol, TokenSymbol):
|
|
cls.__dicts[token_symbol] = token_symbol.dict
|
|
token_symbol.dict = {}
|
|
save_dicts = classmethod(save_dicts)
|
|
|
|
#
|
|
# Return the dict's of all registered tokens.
|
|
#
|
|
def restore_dicts(cls):
|
|
for token_symbol, token_dict in cls.__dicts.items():
|
|
token_symbol.dict = token_dict
|
|
del cls.__dicts
|
|
restore_dicts = classmethod(restore_dicts)
|
|
|
|
#
|
|
# Resolve duplicate tokens.
|
|
#
|
|
def _resolve_token_(self, token_symbol):
|
|
alias = self.get(token_symbol.name, None)
|
|
if alias is None:
|
|
alias = token_symbol
|
|
super(TokenRegistry, self).__setitem__(alias.name, alias)
|
|
if alias is not token_symbol:
|
|
alias.merge(token_symbol)
|
|
return alias
|
|
|
|
#
|
|
# Compile the re that recognises the tokens.
|
|
#
|
|
def compile_tokens(self, whitespace=None):
|
|
self.__tokeniser = Tokeniser()
|
|
self.__tokeniser.compile_tokens(self, whitespace)
|
|
|
|
#
|
|
# Return a generator for tokens in the grammar.
|
|
#
|
|
def tokeniser(self, input, whitespace=None):
|
|
return self.__tokeniser.tokeniser(input, whitespace)
|
|
|
|
def __setitem__(self, key, value):
|
|
raise NotImplementedError()
|
|
|
|
def __delitem__(self, key):
|
|
raise NotImplementedError()
|
|
|
|
|
|
TokenRegistry = python3_metaclass(TokenRegistry)
|
|
|
|
|
|
#
|
|
# A Token in the grammar. An instance of a TokenSymbol() defines one kind
|
|
# of token.
|
|
#
|
|
class TokenSymbol(Symbol):
|
|
__slots__ = ('name', 'named',)
|
|
|
|
def __init__(self, name):
|
|
super(TokenSymbol, self).__init__()
|
|
self.name = name
|
|
self.named = False
|
|
self.first_set = frozenset((self,))
|
|
|
|
def resolve_symbol(self, name, rules, token_definitions):
|
|
return token_definitions._resolve_token_(self)
|
|
|
|
#
|
|
# Absorb another token definition into this one.
|
|
#
|
|
def merge(self, other):
|
|
msg = "Token %r doesn't support merging with token %r"
|
|
raise GrammarError(msg % (self, other))
|
|
|
|
def compile_symbol(self, comp):
|
|
return [(self,)]
|
|
|
|
#
|
|
# Set the name of this token.
|
|
#
|
|
def set_name(self, name):
|
|
if self.name is None:
|
|
self.name = name
|
|
elif self.name != name:
|
|
msg = "Can not rename token %r to %r" % (self.name, name)
|
|
raise GrammarError(msg)
|
|
self.named = True
|
|
|
|
def __repr__(self):
|
|
return str(self)
|
|
|
|
def __str__(self):
|
|
return str(self.name)
|
|
|
|
#
|
|
# Given a tuple returned by the tokeniser, return an English
|
|
# description of where we are in the input stream.
|
|
#
|
|
def position(self, token_tuple):
|
|
return None
|
|
|
|
|
|
#
|
|
# MetaToken's are used internally by the Parser. They are re-usable
|
|
# by multiple grammar's.
|
|
#
|
|
class MetaToken(TokenSymbol):
|
|
__slots__ = ()
|
|
|
|
def __init__(self, name):
|
|
super(MetaToken, self).__init__(name)
|
|
|
|
def __repr__(self):
|
|
return str(self.name)
|
|
|
|
def __str__(self):
|
|
return self.name
|
|
|
|
def position(self, token_tuple):
|
|
return self.name
|
|
|
|
|
|
#
|
|
# A user generated token.
|
|
#
|
|
class UserToken(TokenSymbol):
|
|
__slots__ = ()
|
|
|
|
def __init__(self, name=None):
|
|
super(UserToken, self).__init__(name)
|
|
|
|
def resolve_symbol(self, name, rules, token_definitions):
|
|
if self.name is None:
|
|
msg = "A %s must be assigned a name using a %s" % (
|
|
self.__class__.__name__, TokenRegistry.__name__)
|
|
raise GrammarError(msg)
|
|
return super(UserToken, self).resolve_symbol(
|
|
name, rules, token_definitions)
|
|
|
|
def merge(self, other):
|
|
if not isinstance(other, UserToken):
|
|
super(UserToken, self).merge(other)
|
|
|
|
|
|
#
|
|
# A Token() built by the inbuilt tokeniser. A token comes in two
|
|
# varieties:
|
|
#
|
|
# - A token defined by a regular expression.
|
|
# - A keyword, which is a special purposed token.
|
|
#
|
|
class Token(TokenSymbol):
|
|
_RE = re
|
|
KEYWORD = object()
|
|
UNRECOGNISED = object()
|
|
__slots__ = ('case', 'literal', 'owner', 're', 'refine')
|
|
|
|
def __init__(
|
|
self, literal=None, re=None, case=None, kind=None, refine=None):
|
|
if kind is self.KEYWORD:
|
|
if re is not None:
|
|
raise GrammarError("A keyword must not have an re")
|
|
if refine is not None:
|
|
raise GrammarError("A keyword can not be refined")
|
|
if literal is None:
|
|
raise GrammarError("A keyword must have a literal")
|
|
elif kind is self.UNRECOGNISED:
|
|
if literal is not None or re is not None:
|
|
msg = "The UnrecognisedToken can't have a literal or re"
|
|
raise GrammarError(msg)
|
|
elif kind is not None:
|
|
raise GrammarError("Unrecognised Token kind %r" % (kind,))
|
|
else:
|
|
if literal is None and re is None:
|
|
raise GrammarError("A Token must have a literal or an re")
|
|
if literal is not None:
|
|
if re is not None:
|
|
msg = "A Token can't have both a literal and a re"
|
|
raise GrammarError(msg)
|
|
if refine is not None:
|
|
raise GrammarError("A literal can't be refined")
|
|
self.literal = literal
|
|
self.re = re
|
|
self.named = False
|
|
super(Token, self).__init__(str(self))
|
|
self.case = case if case is not None else True
|
|
self.refine = refine
|
|
self.owner = self
|
|
if kind is not None:
|
|
pass
|
|
elif re is not None:
|
|
self._RE.compile(re)
|
|
else:
|
|
self.re = self._RE.escape(self.literal)
|
|
if not self.case and self.literal.lower() != self.literal.upper():
|
|
def either_case(match):
|
|
char = match.group()
|
|
return "[%c%c]" % (char.upper(), char.lower())
|
|
self.re = self._RE.sub("[a-zA-Z]", either_case, self.re)
|
|
|
|
def __repr__(self):
|
|
if self.literal is not None:
|
|
result = repr(self.literal)
|
|
elif self.re is not None:
|
|
result = self.repr_re()
|
|
else:
|
|
result = "%s()" % (UnrecognisedToken.__name__,)
|
|
if self.named:
|
|
return '%s=%s' % (self, result,)
|
|
return result
|
|
|
|
def __str__(self):
|
|
if self.named:
|
|
return super(Token, self).__str__()
|
|
if self.literal:
|
|
return repr(self.literal)
|
|
if self.re is not None:
|
|
return self.repr_re()
|
|
return '<unrecognised_token>'
|
|
|
|
def repr_re(self):
|
|
return "/%s/" % (repr(self.re)[1:-1].replace("/", "\\/"),)
|
|
|
|
def merge(self, other):
|
|
#
|
|
# We are only merge with like.
|
|
#
|
|
if not isinstance(other, Token):
|
|
TokenSymbol.merge(self, other)
|
|
if self.refine is None:
|
|
self.refine = other.refine
|
|
elif other.refine is not None and other.refine != self.refine:
|
|
msg = "Token %r defined with conflicting refine's %r and %r"
|
|
raise GrammarError(msg % (self.name, self.refine, other.refine))
|
|
|
|
#
|
|
# Set the name of this token.
|
|
#
|
|
def set_name(self, name):
|
|
if self.named and self.name != name:
|
|
msg = "Can not rename token %r to %r" % (self.name, name)
|
|
raise GrammarError(msg)
|
|
self.name = name
|
|
self.named = True
|
|
|
|
#
|
|
# Our tokeniser puts the line and column in the tuple.
|
|
#
|
|
def position(self, token_tuple):
|
|
if len(token_tuple) < 5 or None in token_tuple[3:5]:
|
|
return super(Token, self).position(token_tuple)
|
|
return "line %d column %d" % token_tuple[3:5]
|
|
|
|
|
|
#
|
|
# The Unrecognised Token.
|
|
#
|
|
def UnrecognisedToken():
|
|
return Token(kind=Token.UNRECOGNISED)
|
|
|
|
|
|
#
|
|
# A Keyword is literal without a regexp, ie it must match an existing regexp.
|
|
#
|
|
def Keyword(literal, case=None):
|
|
return Token(literal, case=case, kind=Token.KEYWORD)
|
|
|
|
|
|
#
|
|
# How we handle non-Symbol types in Symbol expression.
|
|
#
|
|
Symbol.CAST = {
|
|
str: Token,
|
|
tuple: lambda t: Prio(*t),
|
|
}
|
|
if sys.version_info < (3,):
|
|
Symbol.CAST[unicode] = Token
|
|
|
|
|
|
class Production(object):
|
|
__slots__ = ("lhs", "rhs")
|
|
|
|
def __init__(self, lhs, rhs):
|
|
self.lhs = lhs
|
|
self.rhs = tuple(rhs)
|
|
|
|
def __repr__(self):
|
|
return (
|
|
"%s = %s" %
|
|
(self.lhs, " ".join(str(elm) for elm in self.rhs),))
|
|
|
|
|
|
#
|
|
# Parser Constructor.
|
|
#
|
|
class GrammarMeta(type):
|
|
|
|
def __new__(cls, name, bases, dct):
|
|
if "_parser_" in dct:
|
|
raise GrammarError("_parser_ is reserved in Gramma's.")
|
|
if dct.get("__metaclass__", None) is not cls:
|
|
dct["_parser_"] = Parser(name, dct)
|
|
return super(GrammarMeta, cls).__new__(cls, name, bases, dct)
|
|
|
|
|
|
#
|
|
# The base class for Parsers.
|
|
#
|
|
class Grammar(object):
|
|
__metaclass__ = GrammarMeta
|
|
|
|
def compile_grammar(cls):
|
|
compile_grammar(cls)
|
|
compile_grammar = classmethod(compile_grammar)
|
|
|
|
def epoch_symbol(cls):
|
|
return epoch_symbol(cls)
|
|
epoch_symbol = classmethod(epoch_symbol)
|
|
|
|
def parse(cls, input, tree_factory=None, on_error=None, log=None):
|
|
return parse(cls, input, tree_factory, on_error, log)
|
|
parse = classmethod(parse)
|
|
|
|
def pre_compile_grammar(cls, pre_compiled=None):
|
|
return pre_compile_grammar(cls, pre_compiled)
|
|
pre_compile_grammar = classmethod(pre_compile_grammar)
|
|
|
|
def repr_grammar(cls):
|
|
return repr_grammar(cls)
|
|
repr_grammar = classmethod(repr_grammar)
|
|
|
|
def repr_parse_table(cls, state=None):
|
|
return repr_parse_table(cls, state)
|
|
repr_parse_table = classmethod(repr_parse_table)
|
|
|
|
def repr_parse_tree(cls, tree, indent=None):
|
|
return repr_parse_tree(tree, indent)
|
|
repr_parse_tree = classmethod(repr_parse_tree)
|
|
|
|
def repr_productions(cls):
|
|
return repr_productions(cls)
|
|
repr_productions = classmethod(repr_productions)
|
|
|
|
def unused_rules(cls):
|
|
return unused_rules(cls)
|
|
unused_rules = classmethod(unused_rules)
|
|
|
|
|
|
Grammar = python3_metaclass(Grammar)
|
|
|
|
|
|
def compile_grammar(grammar):
|
|
grammar._parser_.compile_grammar()
|
|
|
|
|
|
def epoch_symbol(grammar):
|
|
return grammar._parser_.epoch_symbol
|
|
|
|
|
|
def parse(grammar, input, tree_factory=None, on_error=None, log=None):
|
|
return grammar._parser_.parse(input, tree_factory, on_error, log)
|
|
|
|
|
|
def pre_compile_grammar(grammar, pre_compiled=None):
|
|
return grammar._parser_.pre_compile_grammar(grammar, pre_compiled)
|
|
|
|
|
|
def repr_grammar(grammar):
|
|
return grammar._parser_.repr_grammar()
|
|
|
|
|
|
def repr_parse_table(grammar, state=None):
|
|
return grammar._parser_.repr_parse_table(state)
|
|
|
|
|
|
def repr_productions(grammar):
|
|
return grammar._parser_.repr_productions()
|
|
|
|
|
|
def unused_rules(grammar):
|
|
return grammar._parser_.unused_symbols
|
|
|
|
|
|
def repr_parse_tree(tree, indent=None):
|
|
def indent_tree(tree, padding):
|
|
#
|
|
# Append tokens and empty productions to the current line.
|
|
#
|
|
def extend_line(line, prod):
|
|
while True:
|
|
while prod and isinstance(prod[0][0], TokenSymbol):
|
|
line.append(repr_token(prod.popleft()))
|
|
if not prod or len(prod[0]) != 1:
|
|
break
|
|
line.append("(%s)" % (prod.popleft()[0],))
|
|
repr_token = (
|
|
lambda t: repr(t[1]) if isinstance(t[0], Token) else str(t[0]))
|
|
#
|
|
# Were we passed a token?
|
|
#
|
|
if isinstance(tree[0], TokenSymbol):
|
|
return [repr_token(tree)]
|
|
#
|
|
# List singleton productions (ie productions of the form rule1 = rule2)
|
|
# on the same line.
|
|
#
|
|
line, result = [], []
|
|
nesting = 1
|
|
while len(tree) == 2 and not isinstance(tree[1][0], TokenSymbol):
|
|
line.append("(%s" % (tree[0],))
|
|
nesting += 1
|
|
tree = tree[1]
|
|
line.append("(%s" % (tree[0],))
|
|
#
|
|
# If the remainder of the symbols are tokens just list them as well.
|
|
#
|
|
prod = collections.deque(tree[1:])
|
|
extend_line(line, prod)
|
|
if not prod:
|
|
result.append('%s%s' % (padding, ' '.join(line)))
|
|
else:
|
|
#
|
|
# If we have rule1 = ... ^ rule2, then list them as:
|
|
#
|
|
# (rule1 ... rule2
|
|
# rule2-child0
|
|
# ...)
|
|
#
|
|
# If we have rule1 = ... ^ rule2 ... then list them as:
|
|
#
|
|
# (rule1 ...
|
|
# (rule2
|
|
# ...)
|
|
# ...)
|
|
#
|
|
if len(prod) > 1:
|
|
result.append('%s%s' % (padding, ' '.join(line)))
|
|
elif (len(prod) == 1 and
|
|
all(isinstance(t[0], TokenSymbol) for t in prod[0][1:])):
|
|
last = prod.popleft()
|
|
line.append("(%s" % last[0])
|
|
line.extend(repr_token(t) for t in last[1:])
|
|
line[-1] += ")"
|
|
result.append('%s%s' % (padding, ' '.join(line)))
|
|
else:
|
|
line.append('(%s' % (prod[0][0],))
|
|
result.append('%s%s' % (padding, ' '.join(line)))
|
|
prod = collections.deque(prod[0][1:])
|
|
if len(prod[0]) == 1 or isinstance(prod[0][0], TokenSymbol):
|
|
line = []
|
|
extend_line(line, prod)
|
|
result.append('%s%s' % (padding + indent, ' '.join(line)))
|
|
while prod:
|
|
result.extend(indent_tree(prod.popleft(), padding + indent))
|
|
line = [result[-1]]
|
|
extend_line(line, prod)
|
|
result[-1] = ' '.join(line)
|
|
result[-1] += ')' * nesting
|
|
return result
|
|
eol = ' ' if indent is False else '\n'
|
|
indent = " " if indent is None else indent or ""
|
|
return eol.join(indent_tree(tree, ""))
|
|
|
|
# vim: set shiftwidth=4 expandtab softtabstop=8 :
|