# Lrparsing.py is a LR(1) parser hiding behind a pythonic interface. It takes
# as input a grammar and a string to be parsed, and outputs the parse tree.
#
# Copyright (c) 2013,2014,2015,2016,2017,2018 Russell Stuart.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# The copyright holders grant you an additional permission under Section 7
# of the GNU Affero General Public License, version 3, exempting you from
# the requirement in Section 6 of the GNU General Public License, version 3,
# to accompany Corresponding Source with Installation Information for the
# Program or any work based on the Program. You are still required to
# comply with all other Section 6 requirements to provide Corresponding
# Source.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see .
#
import ast
import collections
import hashlib
import itertools
import re
import string
import sys
#
# Python2/3 compatibility hacks.
#
# Written this odd way to preserve 100% test coverage.
#
# On Python 3 StandardError no longer exists, so alias it to Exception;
# on Python 2 keep the builtin so exception hierarchies match old code.
StandardError = Exception if sys.version_info >= (3,) else StandardError
# Apply a class's declared __metaclass__ by hand on Python 3, where the
# Python 2 __metaclass__ attribute is ignored by the class machinery.
python3_metaclass = (lambda cls: cls) if sys.version_info < (3,) else (
    lambda cls: cls.__metaclass__(
        cls.__name__, cls.__bases__, dict(cls.__dict__)))
# The "any string" base type: basestring on Python 2, str on Python 3.
string_types = basestring if sys.version_info < (3,) else str
# maketrans moved from the string module to the str type in Python 3.
string_maketrans = (
    string.maketrans if sys.version_info < (3,) else str.maketrans)
# Coerce a value to a string, leaving existing strings untouched.
to_str = lambda s: s if isinstance(s, string_types) else str(s)
#
# Common base class for all exceptions here.
#
class LrParsingError(StandardError):
    """Base class of every exception raised by this module."""
    pass
#
# Raised if the Grammar isn't valid.
#
class GrammarError(LrParsingError):
    """Raised at compile time when the grammar itself is invalid."""
    pass
#
# Base class for errors raised at parsing time.
#
class ParsingError(LrParsingError):
    """Base class for errors raised while parsing an input string."""
    pass
#
# Raised if an invalid syntax is given.
#
class TokenError(ParsingError):
    """Raised by the tokeniser when the input contains a character
    sequence that doesn't match any token.  The offending location is
    recorded both as attributes and in the exception args."""
    def __init__(self, message, data, offset, line, column):
        super(TokenError, self).__init__(message, data, offset, line, column)
        self.data = data
        self.offset = offset
        self.line = line
        self.column = column
#
# Raised if an invalid syntax is given.
#
class ParseError(ParsingError):
    """Raised when the token stream doesn't match the grammar.  Keeps
    the offending token and the parser stack at the point of failure,
    and formats a human readable message from the current LR(1) state."""
    input_token = None          # The token that could not be accepted.
    stack = None                # The parser stack when the error arose.
    def __init__(self, input_token, stack):
        def comma_or(lst):
            # "a, b or c" style rendering of a collection of symbols.
            names = sorted("%s" % entry for entry in lst)
            if len(names) == 1:
                return names[0]
            return ', '.join(names[:-1]) + ' or ' + names[-1]
        self.input_token = input_token
        self.stack = stack
        state = stack[-1][0]
        # Only enumerate the expected tokens when the list is short
        # enough to be readable.
        if len(state.actions) < 10:
            msg = "Got %s when expecting %s" % (
                input_token[0], comma_or(state.actions))
        else:
            msg = "Got unexpected %s" % (input_token[0],)
        if len(state.rules) < 10:
            msg += " while trying to match %s" % (comma_or(state.rules))
        msg += " in state %d" % (state.id,)
        position = input_token[0].position(input_token)
        if position:
            msg = position + ": " + msg
        super(ParseError, self).__init__(msg)
#
# Print a set of symbols.
#
def str_symbol_set(symbol_set):
    """Render an iterable of symbols as a sorted, comma separated list
    enclosed in square brackets, eg "[a,b,c]"."""
    names = sorted(str(symbol) for symbol in symbol_set)
    return '[' + ','.join(names) + ']'
#
# An LR(0) Item. An item is just a production and the position the parser
# is up to in parsing it - the dot_pos. Eg:
#
# sym0 sym1 . sym2
#
# means the parser is processing a production wants to see the symbols
# "sym0 sym1 sym2" in order, and it has seen sym1 and sym2.
#
class Lr0Item(object):
__slots__ = ('dot_pos', '_key', 'lr0_item', 'production')
def __new__(cls, production, dot_pos, cache):
lr0_item = (production, dot_pos)
result = cache.get(lr0_item, None)
if result is None:
result = super(Lr0Item, cls).__new__(cls)
result.lr0_item = lr0_item
result.dot_pos = dot_pos
result.production = production
result._key = (
(str(result.production.lhs),) +
tuple(str(sym) for sym in result.production.rhs) +
(result.dot_pos,)
)
cache[result.lr0_item] = result
return result
def __repr__(self):
rhs = self.production.rhs
ll = lambda s, e: [str(symbol) for symbol in rhs[s:e]]
prod = ll(0, self.dot_pos) + ['^'] + ll(self.dot_pos, len(rhs))
return "%s = %s" % (self.production.lhs, ' '.join(prod))
def key(cls, lr0_item):
return lr0_item._key
key = classmethod(key)
#
# An item in an LR(1) grammar. It is just a LR(0) item, together with the
# set a of tokens that could follow the production called the lookahead.
# Eg, given the Item():
#
# L ::= sym0 sym1 sym2 . [tokA, tokB]
#
# The parser has seen all the symbols in this production, so if the next token
# to be processed is tokA or tokB, the production can be reduced (ie replaced)
# with it's left hand side, ie L
#
class Lr1Item(object):
    """An LR(1) item: an Lr0Item plus the lookahead - the set of tokens
    that may legally follow the production."""
    __slots__ = ('dot_pos', 'lookahead', 'lr0_item', 'production')
    def __init__(self, lr0_item, lookahead):
        self.lr0_item = lr0_item
        self.lookahead = lookahead
        self.production = lr0_item.production
        self.dot_pos = lr0_item.dot_pos
    def lr1_shift(self, cache):
        """Return a new Lr1Item with the dot advanced one symbol; the
        lookahead is copied, not shared."""
        shifted = Lr0Item(self.production, self.dot_pos + 1, cache)
        return Lr1Item(shifted, set(self.lookahead))
    def __repr__(self):
        return "%r %s" % (self.lr0_item, str_symbol_set(self.lookahead),)
    @classmethod
    def sorted(cls, iterable):
        """Deterministic ordering, keyed on the underlying Lr0Item."""
        return sorted(iterable, key=lambda item: Lr0Item.key(item.lr0_item))
#
# An Lr0Kernel. A "kernel" is the state of the parser. It is just a set of
# items (Lr0Items in the case of an Lr0Kernels). The initial kernel is just
# the grammar's start production, eg:
#
# G ::= . E
#
# Successive kernels are generated by looking every production that can
# be reached from a previous kernel if a particular given token is seen.
# For example, if the rest of the grammar is:
#
# E ::= E / E
# E ::= n
#
# Then after seeing an n, the Lr0Kernel would be:
#
# E ::= n .
#
class Lr0Kernel(object):
    """A parser state viewed as a frozenset of Lr0Items.  Interned via
    the supplied cache so equal kernels share one object."""
    __slots__ = ("lr0_items",)
    def __new__(cls, lr1_items, cache):
        lr0_items = frozenset(item.lr0_item for item in lr1_items)
        cached = cache.get(lr0_items)
        if cached is not None:
            return cached
        result = super(Lr0Kernel, cls).__new__(cls)
        result.lr0_items = lr0_items
        cache[lr0_items] = result
        return result
#
# An Lr1State is the compiled version of an ItemSet. Ie, all the
# information the Parser doesn't need has been discarded.
#
class Lr1State(int):
    """The compiled, stripped-down form of an ItemSet: just the id (the
    int value itself), the action table, the goto table and the rules
    being matched.  Must remain format-compatible with the output of
    Grammar.optimise_parsing_table()."""
    # Python 3 forbids non-empty __slots__ on int subclasses (int became
    # variable-sized), so this memory optimisation is Python 2 only.
    if sys.version_info < (3,):
        __slots__ = ('actions', 'gotos', 'id', 'rules')
    def __new__(cls, id, actions, gotos, rules):
        result = super(Lr1State, cls).__new__(cls, id)
        result.actions = actions
        result.gotos = gotos
        result.id = id
        result.rules = rules
        return result
    def __repr__(self):
        """Multi-line debug dump of the actions and gotos tables."""
        def p(act):
            # A 1-tuple is a shift; longer tuples are reduces whose
            # third element (if any) names the output rule.
            if len(act) == 1:
                return "shift %d" % act
            if len(act) == 2 or act[2] is None:
                return "reduce %d %d" % act[:2]
            return "reduce %d %d %s" % act[:3]
        result = [str(self)]
        if self.actions:
            result.append(" -- actions")
            for token in sorted(self.actions, key=lambda sym: str(sym)):
                result.append(" %s: %s" % (token, p(self.actions[token],)))
        if self.gotos:
            result.append(" -- gotos")
            for nonterm_number in sorted(self.gotos, key=lambda sym: str(sym)):
                result.append(
                    " %s: %s" %
                    (nonterm_number, self.gotos[nonterm_number],))
        return '\n'.join(result)
    def __str__(self):
        return "Lr1State:%d" % (int(self),)
    def to_flat(self, grammar):
        """Flatten this state into plain Python literals (names instead
        of objects) suitable for repr()/ast.literal_eval round trips."""
        actions = {}
        for token, action in self.actions.items():
            if len(action) == 1:
                new_action = action[0]
            elif action[2] is None:
                new_action = tuple(action[:2])
            else:
                new_action = (action[0], action[1], action[2].name)
            actions[token.name] = new_action
        rules = sorted(rule.name for rule in self.rules)
        return (actions, self.gotos, rules)
    # Matches a dotted chain of Python identifiers, case-insensitively.
    PYTHON_VAR_RE = re.compile(
        "(?i)^[a-z_][a-z_0-9]*(?:[.][a-z_][a-z_0-9]*)*$")
    def from_flat(cls, index, flat, rules, token_registry):
        """Inverse of to_flat(): rebuild a state from the flattened form,
        resolving rule and token names back to their objects."""
        actions = {}
        for token_name, action in flat[0].items():
            if not isinstance(action, tuple):
                new_action = (action,)
            elif len(action) == 2:
                new_action = (action[0], action[1], None, None)
            else:
                new_action = (action[0], action[1], rules[action[2]], None)
            token = token_registry[token_name]
            actions[token] = new_action
        target_rules = (rules[rule_name] for rule_name in flat[2])
        return cls(index, actions, flat[1], set(target_rules))
    from_flat = classmethod(from_flat)
    def sorted(cls, table):
        """States sort by their int value, ie their id."""
        return sorted(table)
    sorted = classmethod(sorted)
#
# An ItemSet is a state in the LR(1) grammar. An ItemSet is identical in
# concept to an LR(0) kernel, but it consists of Lr1Item's rather than
# Lr0Item's. In other words, the items contain the tokens they expect to
# follow the production. These tokens are shown in []'s.
#
# Following on from the example above:
#
# G ::= . E , [__empty__]
#
# And if an n is accepted:
#
# E ::= n ., [, /]
#
class ItemSet(object):
    """A state in the LR(1) machine under construction: a kernel of
    Lr1Items plus the derived closure, actions, gotos and the priority
    bookkeeping used to resolve reduce/reduce conflicts."""
    # Class-wide counter used to hand out unique ItemSet.id values.
    ID = 0
    __slots__ = (
        'actions', '_closure', '_goto_cache', 'gotos', 'id',
        '_kernel', '_lhs_prio', 'lr0_kernel', 'prio',)
    def __init__(self, items, cache):
        """items is an iterable of (Lr1Item, prio_dict) pairs; cache is
        the shared compile-time intern cache (also holds '__empty__')."""
        items = tuple(items)
        self._kernel = dict((item[0].lr0_item, item[0]) for item in items)
        self.prio = dict((item[0].lr0_item, item[1]) for item in items)
        self.lr0_kernel = Lr0Kernel(iter(self), cache)
        self.actions = None
        self.gotos = {}
        self._closure = None
        self._goto_cache = None
        self._lhs_prio = None
        self.id = self.ID
        self.__class__.ID += 1
    def __str__(self):
        return "ItemSet:%d" % (self.id,)
    def __repr__(self):
        """Multi-line debug dump: kernel, closure, actions and gotos."""
        result = [str(self)]
        for kernel_item in Lr1Item.sorted(iter(self)):
            line = " %r %s" % (kernel_item, self.repr_prio(kernel_item))
            result.append(line)
        if self._closure:
            result.append(" -- closure")
            for closure_item in Lr1Item.sorted(self._closure.values()):
                line = " %r %s" % (
                    closure_item, self.repr_prio(closure_item))
                result.append(line)
        if self.actions:
            result.append(" -- actions")
            for token in sorted(self.actions, key=lambda action: str(action)):
                actions = self.actions[token]
                if isinstance(actions, Action):
                    lst = repr(actions)
                else:
                    lst = ', '.join(sorted(repr(act) for act in actions))
                result.append(" %s: %s" % (token, lst,))
        if self.gotos:
            result.append(" -- gotos")
            for symbol in sorted(self.gotos, key=lambda symbol: str(symbol)):
                result.append(" %s: %s" % (symbol, self.gotos[symbol],))
        return '\n'.join(result)
    def repr_prio(self, item):
        """Render item's priority dict as "{prio:[lookaheads], ...}"."""
        result = []
        for prio, lookahead in sorted(self.prio[item.lr0_item].items()):
            result.append("%r:%s" % (prio, str_symbol_set(lookahead),))
        return '{%s}' % ', '.join(result)
    #
    # Return all lhs tokens we generate.
    #
    def rules(self):
        return set(item.production.lhs.get_rule() for item in self)
    rules = property(rules)
    #
    # Compute the closure for the passed items. Ie, given the item
    # {A ::= a . B c}, add {B ::= . C d} and repeat for B. In other words,
    # the kernel plus its closure contains every production we can be
    # expanding, and the position in them.
    #
    # X ::= a ^ b c, [l]
    #
    # b ::= ^ f g, [follow(c)]
    #
    def _close_kernel_items(self, items, cache):
        """Expand items into self._closure, propagating lookaheads and
        priorities.  Returns True if any closure item changed."""
        modified = False
        queue = collections.deque(items)
        empty_token = cache['__empty__']
        while queue:
            #
            # Find the next symbol that will be consumed by item.
            #
            item = queue.popleft()
            rhs, dot_pos = item.production.rhs, item.dot_pos
            if dot_pos >= len(rhs):
                continue
            symbol = rhs[dot_pos]
            if isinstance(symbol, TokenSymbol):
                continue
            #
            # Find all tokens we could possibly see after consuming that
            # symbol.
            #
            first_set = self.symbol_seq_first_set(rhs[dot_pos + 1:], cache)
            if empty_token not in first_set:
                lookahead = set(first_set)
                no_empty = first_set
            else:
                # The rest of the rhs can derive empty, so the item's
                # own lookahead applies as well.
                no_empty = first_set - empty_token.first_set
                lookahead = no_empty | item.lookahead
            #
            # If the next symbol is a nonterm, then add all of its productions
            # to the closure.
            #
            for production in symbol.productions:
                lr0_item = Lr0Item(production, 0, cache)
                existing = self._closure.get(lr0_item, None)
                if existing is not None:
                    extra = lookahead - existing.lookahead
                    existing.lookahead |= extra
                    closure = existing
                else:
                    extra = lookahead
                    closure = Lr1Item(lr0_item, lookahead)
                    self._closure[lr0_item] = closure
                    self.prio[closure.lr0_item] = {}
                # Nothing new learnt for this closure item - stop here.
                if not extra:
                    continue
                queue.append(closure)
                modified = True
                #
                # Push the priority through the closure. In other words, if
                # the rhs of this production assigned this lhs a priority,
                # then add this priority to the list we inherited from the
                # ItemSet's that goto us. Eg, given:
                #
                # START = Prio(b,c) __end_of_input__, b.prio==0 and c.prio==1
                # b = Prio(d+'X', e+'Y'), so d.prio==0 and e.prio==1
                # c = Prio(e+'X', d+'Y'), so d.prio==1 and e.prio==0
                # d = 'T'
                # e = 'T'
                #
                # The prio of START is always (), so b's prio will be (0,)
                # ie "() + (0,)", and c's prio will be (1,) and after that
                # we expect both to be followed by __end_of_input__. From b's
                # production d's prio will be b's plus 0, ie
                # "(0,) + (0,) = (0,0,)", but only when followed by a 'X' (ie
                # the lookahead is 'X'). From c's production d's prio will be
                # (1,1), but only when followed by a 'Y'.
                #
                # Thus for 'd' we end up with these priorities:
                #
                # { (0,0,): set('X'), (0,1,): set('Y') }
                #
                # and for 'e' we end up with:
                #
                # { (1,0): set('X'), (1,1): set('Y') }
                #
                # If the resulting parser is given the input string:
                #
                # 'T' 'Y' __end_of_input__.
                #
                # First it has to decide whether 'T' is a 'd' or an 'e' as
                # both will resolve to 'T'. A normal LR(1) would be stuck
                # with a reduce/reduce conflict, but since the d's priority
                # for 'Y' (0,1) < e's priority (1,1), we chose 'd'. We don't
                # do that choosing here, but we create the priorities so it
                # can happen should a conflict arise.
                #
                rank = closure.production.lhs.rank
                priority = closure.production.lhs.priority
                if priority is not None:
                    rank -= 1
                append = (0,) * max(0, rank - item.production.lhs.rank)
                if priority is not None:
                    append += (priority,)
                prio_items = self.prio[item.lr0_item].items()
                for itemset_prio, item_prio in prio_items:
                    existing_sets = self.prio[closure.lr0_item]
                    if empty_token not in first_set:
                        add = no_empty
                    else:
                        # Note: & binds tighter than |.
                        add = no_empty | extra & item_prio
                    existing_prio = itemset_prio[:rank] + append
                    if existing_prio not in existing_sets:
                        existing_sets[existing_prio] = add
                    else:
                        existing_sets[existing_prio] |= add
        return modified
    #
    # Compute the first_set for a sequence of Symbols. If the sequence is:
    #
    # sym0 sym1 sym2
    #
    # The first_set is the same as the first_set of sym0, and if the sym0
    # can be empty then the first_set of sym1 and so on.
    #
    def symbol_seq_first_set(cls, symbol_sequence, cache):
        """Memoised first-set of a symbol sequence; results are interned
        in the shared cache keyed by the tuple of symbols."""
        syms = tuple(symbol_sequence)
        result = cache.get(syms, None)
        if result is None:
            empty_token = cache['__empty__']
            first_set = set(empty_token.first_set)
            for symbol in syms:
                first_set |= symbol.first_set
                # Stop at the first symbol that can't derive empty.
                if empty_token not in symbol.first_set:
                    first_set -= empty_token.first_set
                    break
            result = frozenset(first_set)
            cache[syms] = result
        return result
    symbol_seq_first_set = classmethod(symbol_seq_first_set)
    #
    # Iterating over us returns the Lr1Item's in our kernel.
    #
    def __iter__(self):
        return iter(self._kernel.values())
    #
    # Return a generator for the kernel + closure.
    #
    def all_items(self):
        return itertools.chain(self, self._closure.values())
    #
    # Compute the closure for ourselves.
    #
    def compute_closure(self, cache):
        """Lazily compute self._closure, first dropping consecutive
        duplicate lookahead sets from the priority dicts."""
        if self._closure is None:
            empty_token = cache['__empty__']
            for lr0_item, old_prio_dict in self.prio.items():
                prio_dict = {}
                last_lookahead = empty_token.first_set
                for prio, lookahead in sorted(old_prio_dict.items()):
                    if lookahead != last_lookahead:
                        prio_dict[prio] = lookahead
                        last_lookahead = lookahead
                self.prio[lr0_item] = prio_dict
            self._closure = {}
            self._close_kernel_items(iter(self), cache)
    #
    # Calculate the kernel of the goto set, given a particular symbol.
    #
    def goto_sets(self, cache):
        """Return (and cache) {symbol: ItemSet} giving the kernel of the
        state reached from here on consuming each possible symbol."""
        if self._goto_cache is None:
            dot_symbols = collections.defaultdict(set)
            for item in self.all_items():
                dot_pos, rhs = item.dot_pos, item.production.rhs
                if dot_pos < len(rhs):
                    dot_symbols[rhs[dot_pos]].add(item)
            self._goto_cache = {}
            # Sorted by symbol.id so construction order is deterministic.
            for symbol in sorted(dot_symbols, key=lambda symbol: symbol.id):
                gen = (
                    (item.lr1_shift(cache), self.prio[item.lr0_item])
                    for item in dot_symbols[symbol])
                item_set = ItemSet(gen, cache)
                self._goto_cache[symbol] = item_set
        return self._goto_cache
    #
    # Check for reduce/reduce compatibility between ItemSet's. The two
    # ItemSet's passed must have the same same lr0 kernel. ItemSet's with
    # different lr0 kernels are never compatible.
    #
    # When to reduce and what to is determined by the lookahead:
    #
    # S ::= W; S ::= X; S ::= Y; S ::= Z
    # W ::= a P i
    # X ::= a Q j
    # Y ::= b P j
    # Z ::= b Q i
    # P ::= c
    # Q ::= c
    #
    # Produces these ItemSets, among others:
    #
    # WX: P -> c ., [i] (i-->reduce(P)); Q -> c ., [j] (j-->reduce(Q))
    # YZ: P -> c ., [j] (j-->reduce(Q)); Q -> c ., [i] (i-->reduce(P))
    #
    # In this case merging WX and YZ would produce reduce conflicts because if
    # we see the token i, we can't both reduce(P) and reduce(Q).
    #
    def compatible(self, other):
        """Return True if merging other into self cannot introduce a
        reduce/reduce conflict (see the discussion above)."""
        #
        # Compatible() isn't always called, so lazily evaluate
        # ItemSet._lhs_prio.
        #
        def lhs_prio(item_set):
            if item_set._lhs_prio is None:
                all_lookaheads = None
                lhs_prio = {}
                for lr0_item in item_set._kernel:
                    for prio, lookahead in item_set.prio[lr0_item].items():
                        key = (lr0_item.production.lhs, prio)
                        if key not in lhs_prio:
                            lhs_prio[key] = lookahead
                        else:
                            lhs_prio[key] |= lookahead
                        if all_lookaheads is None:
                            all_lookaheads = lookahead
                        else:
                            all_lookaheads |= lookahead
                item_set._lhs_prio = (lhs_prio, all_lookaheads)
            return item_set._lhs_prio
        if self is other:
            return True
        #
        # For an LR(1) scheme we are saying for a given lhs in self its
        # lookahead set can only share lookahead tokens with the same lhs in
        # other.
        #
        # For us it gets a little more complex, as we are carrying
        # ItemSet.prio's and they have to be compatible as well. However,
        # turns out this reduces to insisting that (lhs, prio) combinations
        # can't share lookaheads suffices.
        #
        self_lhs_prio, self_all_lookaheads = lhs_prio(self)
        other_lhs_prio, other_all_lookaheads = lhs_prio(other)
        common = self_all_lookaheads & other_all_lookaheads
        for key, self_lookaheads in self_lhs_prio.items():
            other_lookaheads = other_lhs_prio.get(key, None)
            if other_lookaheads is not None:
                if (self_lookaheads & common) != (other_lookaheads & common):
                    return False
        return True
    #
    # Merge two ItemSet's. They must be ItemSet.compatible(). Return True
    # if the merge altered 'self', so it needs a new closure computed.
    #
    def merge(self, other, cache):
        closure_items = []
        for item in self:
            other_item = other._kernel[item.lr0_item]
            modified = False
            item_prio = self.prio[item.lr0_item]
            # expanded becomes a copy of item_prio only if something has
            # to change, so untouched items allocate nothing.
            expanded = False
            for prio, other_lookahead in other.prio[item.lr0_item].items():
                self_lookahead = self.prio[item.lr0_item].get(prio, None)
                if self_lookahead is None:
                    expanded = expanded or item_prio.copy()
                    expanded[prio] = other_lookahead
                elif not self_lookahead >= other_lookahead:
                    expanded = expanded or item_prio.copy()
                    expanded[prio] |= other_lookahead
            if expanded:
                self.prio[item.lr0_item] = expanded
                modified = True
            if modified:
                item.lookahead |= other_item.lookahead
                closure_items.append(item)
        if not closure_items:
            return False
        # The cached derived data is now stale; recompute the closure
        # for just the items that changed.
        self._goto_cache = None
        self._lhs_prio = None
        self._close_kernel_items(closure_items, cache)
        return True
    def sorted(cls, table):
        """Deterministic ordering of ItemSets, by id."""
        return sorted(table, key=lambda item_set: item_set.id)
    sorted = classmethod(sorted)
#
# LR parser actions. These have to be tuples because that is what the
# lr1_parser expects in the optimised case. These are for the non-optimised
# case, so we carry more information for debugging and priority resolution.
# Nonetheless these must be backwards compatible with what
# Grammar.optimise_parsing_table() produces.
#
class Action(tuple):
    """Base class of LR parser actions.

    Actions are tuples because that is what the optimised lr1_parser
    expects; these subclasses carry extra information for debugging and
    priority resolution, and must stay backwards compatible with what
    Grammar.optimise_parsing_table() produces.
    """
    __slots__ = ()
    def __hash__(self):
        """Subclasses must define hashing/equality so that identical
        actions collapse when stored in an owning item's set."""
        raise NotImplementedError()
    def __eq__(self, other):
        raise NotImplementedError()
    def __lt__(self, other):
        """Sort order - only needed to make test output repeatable."""
        return self.key() < other.key()
    def precedence(self, token, item_set):
        """Return (lhs, low, high):

        lhs   the symbol reduced to, or None if there could be several
              (used to compare associativity);
        low   the lowest priority for the passed token;
        high  the highest priority for the passed token.
        """
        raise NotImplementedError()
#
# A Shift action - consume a token, ie move it onto the stack.
#
class ShiftAction(Action):
    """A shift action: consume a token and move to the ItemSet stored
    as self[0]."""
    __slots__ = ()
    def __new__(cls, *next_state):
        return Action.__new__(cls, next_state)
    def __repr__(self):
        return "shift %s" % self[0]
    def __hash__(self):
        return hash(self[0])
    def __eq__(self, other):
        return isinstance(other, ShiftAction) and self[0] == other[0]
    def key(self):
        """Sort key - only used to make test output repeatable."""
        return "shift", self[0].id
    def precedence(self, token, item_set):
        """For a shift: lhs is the common lhs of all items in the target
        ItemSet (None if they differ); low/high are the lowest/highest
        prio recorded for *token* in that kernel."""
        target = self[0]
        lhs = next(iter(target)).production.lhs
        low = (1e100,)
        high = ()
        for lr1_item in target:
            if lr1_item.production.lhs != lhs:
                lhs = None
            for prio, lookahead in target.prio[lr1_item.lr0_item].items():
                if token not in lookahead:
                    continue
                low = min(low, prio)
                high = max(high, prio)
        return lhs, low, high
#
# A reduce action - ie the top of the stack is a production we recognise.
# Replace it with it's lhs.
#
class ReduceAction(Action):
    """A reduce action: the top of the stack is a recognised production,
    so replace it with the production's lhs.  Tuple layout is
    (lhs, rhs_length, output_rule_or_None, lr1_item)."""
    __slots__ = ()
    def __new__(cls, lr1_item):
        lhs = lr1_item.production.lhs
        if isinstance(lhs, Rule) and lhs.name[0] != '_':
            output = lhs
        else:
            # Underscore-prefixed and non-Rule lhs's don't appear in the
            # output parse tree.
            output = None
        payload = (lhs, len(lr1_item.production.rhs), output, lr1_item)
        return Action.__new__(cls, payload)
    def __repr__(self):
        rhs = ' '.join(str(sym) for sym in self[3].production.rhs)
        return "reduce %s = %s" % (self[0], rhs)
    def __hash__(self):
        return hash(self[:3])
    def __eq__(self, other):
        return isinstance(other, ReduceAction) and self[:3] == other[:3]
    def key(self):
        """Sort key - only used to make test output repeatable."""
        return "reduce", self[0].id, self[3].dot_pos
    def precedence(self, token, item_set):
        """For a reduce: lhs is the target production's lhs; low/high
        are the lowest/highest prio recorded for *token* against this
        Lr1Item in *item_set*."""
        lr1_item = self[3]
        low = (1e100,)
        high = ()
        for prio, lookahead in item_set.prio[lr1_item.lr0_item].items():
            if token in lookahead:
                low = min(low, prio)
                high = max(high, prio)
        return lr1_item.production.lhs, low, high
#
# The thing that implements the grammar - the Parser.
#
class Parser(object):
    """The thing that implements the grammar - the Parser."""
    VERSION = "0.1"
    comments = None             # object, TokenSymbol(), list of tokens.
    empty_token = None          # object, MetaToken("__empty__")
    eoi_token = None            # object, MetaToken("__end_of_input__")
    epoch_symbol = None         # object, Rule(), = START __end_of_input__
    parser_name = None          # string, Name of the parser
    parsing_table = None        # tuple, (Lr1State(), ...)
    rules = None                # dict, {"name": Rule(), ...}
    token_registry = None       # object, TokenRegistry()
    unused_symbols = None       # set, SymbolSet(Rule(), ...)
    whitespace = None           # string, Characters defined to be whitespace
    def __init__(self, parser_name, dct):
        """Build the Parser from *dct*, the grammar class's attribute
        dictionary: catalogue the rules, resolve symbol references,
        create the epoch (start) production, and process the special
        WHITESPACE and COMMENTS entries.  Raises GrammarError on an
        invalid grammar."""
        def new_meta(name):
            # Create and register one of the special "__*__" tokens.
            meta = MetaToken(name)
            return meta.resolve_symbol(name, self.rules, self.token_registry)
        #
        # Step 1 is to replace all productions with a Rule() equivalent and
        # find the TokenRegistry.
        #
        self.parser_name = parser_name
        rule_symbols, self.rules, token_registry = (
            self.catalogue_symbols(dct))
        if token_registry is None:
            self.token_registry = TokenRegistry()
        else:
            token_registry.restore_dicts()
            self.token_registry = token_registry()
        for name in sorted(self.rules):
            self.resolve_rule(self.rules[name], rule_symbols)
        self.empty_token = new_meta("__empty__")
        self.eoi_token = new_meta("__end_of_input__")
        for name in sorted(self.rules):
            self.resolve_symbol(self.rules[name])
            if name in self.token_registry:
                msg = "A token and symbol share the same name %r"
                raise GrammarError(msg % name)
        #
        # Create the starting production for the grammar.
        #
        start_symbol = dct.get("START", None)
        if start_symbol is None:
            raise GrammarError("No START symbol defined")
        if not isinstance(start_symbol, Rule):
            raise GrammarError("START is not a Nonterm")
        if len(start_symbol.dict) != 0:
            raise GrammarError("START symbol may not have dictionary elements")
        epoch_symbol = Sequence()
        epoch_symbol.nested = [start_symbol, self.eoi_token]
        self.epoch_symbol = Rule('<%s>' % self.parser_name, epoch_symbol)
        epoch_symbol.parent = self.epoch_symbol
        if self.epoch_symbol.name in self.rules:
            msg = "Symbol name %r is reserved" % self.epoch_symbol.name
            raise GrammarError(msg)
        self.rules[self.epoch_symbol.name] = self.epoch_symbol
        for rule in self.rules.values():
            rule.resolved = True
        #
        # Get the special cased tokens.
        #
        self.whitespace = dct.get("WHITESPACE", None)
        self.token_registry.compile_tokens(self.whitespace)
        comments = dct.get("COMMENTS", None)
        if comments is None:
            self.comment_tokens = None
        else:
            error = not isinstance(comments, Rule)
            if not error:
                comment_symbol = comments.nested[0]
                if isinstance(comment_symbol, TokenSymbol):
                    self.comment_tokens = comments.nested
                elif isinstance(comment_symbol, Choice):
                    error = all(
                        not isinstance(sym, TokenSymbol)
                        for sym in comments.nested[0].nested)
                    if not error:
                        self.comment_tokens = comments.nested[0].nested
                # NOTE(review): if comment_symbol is neither a TokenSymbol
                # nor a Choice, error stays False and comment_tokens is
                # never assigned here - looks like a silent fall-through;
                # confirm against upstream before changing.
            if error:
                raise GrammarError("COMMENTS must be Token | Token ...")
            del self.rules['COMMENTS']
#
# Compile the grammar into an LR(1) parse_table, or raise a GrammarError
# if there is a problem with the grammar.
#
    def compile_grammar(self):
        """Compile the grammar into an LR(1) parsing table, or raise a
        GrammarError if there is a problem with the grammar.  A no-op if
        the table has already been built."""
        if self.parsing_table:
            return
        #
        # Initialise the grammar by creating the start production. Then
        # compile it.
        #
        used_symbols = self.epoch_symbol.compile_grammar(self.empty_token)
        self.unused_symbols = frozenset(
            symbol for symbol in self.rules.values()
            if symbol not in used_symbols)
        #
        # Resolve first sets.
        #
        self.calc_first_sets(self.epoch_symbol)
        #
        # Create the parser.
        #
        start_state, lr0_item_sets = self.compute_lr1_items(self.epoch_symbol)
        table = self.compute_parsing_table(lr0_item_sets)
        self.parsing_table = (start_state, table)
        self.normalise_item_set_id(self.parsing_table)
        self.disambiguate(table)
#
# Return the optimised_grammar suitable for passing to compile_grammar().
#
    def pre_compile_grammar(self, grammar_class, pre_compiled=None):
        """Return the optimised grammar as a repr() string suitable for
        passing back in via *pre_compiled*, or None if an existing or
        supplied pre-compiled table was used.  The flat format is keyed
        by grammar_hash() so a stale pre_compiled blob is ignored."""
        def from_flat(i, flat):
            return Lr1State.from_flat(i, flat, self.rules, self.token_registry)
        #
        # If it has already been pre compiled just return.
        #
        if self.parsing_table is not None:
            if isinstance(self.parsing_table[0], int):
                return None
        #
        # If we don't have a table see if we can use the pre_compiled version.
        #
        if pre_compiled:
            if isinstance(pre_compiled, string_types):
                # literal_eval: safe on untrusted strings, unlike eval().
                pre_compiled = ast.literal_eval(pre_compiled)
            if pre_compiled[0] == self.grammar_hash():
                optimised_start_state = pre_compiled[1]
                optimised_parsing_table = tuple(
                    from_flat(i - 2, pre_compiled[i])
                    for i in range(2, len(pre_compiled)))
                self.parsing_table = (
                    optimised_start_state, optimised_parsing_table)
                return None
        #
        # Optimise it.
        #
        self.compile_grammar()
        self.parsing_table = self.optimise_parsing_table(self.parsing_table)
        flattened = tuple(
            state.to_flat(grammar_class)
            for state in self.parsing_table[1])
        return repr((self.grammar_hash(), self.parsing_table[0]) + flattened)
#
# Make the start state item_set.id==0 and the remainder following
# sequentially.
#
def normalise_item_set_id(cls, parsing_table):
start_state, table = parsing_table
mapping = iter(zip(ItemSet.sorted(table), itertools.count(0)))
first = next(mapping)
for item_set, id in itertools.chain((first,), mapping):
item_set.id = id
start_state.id, first[0].id = first[0].id, start_state.id
normalise_item_set_id = classmethod(normalise_item_set_id)
#
# Debug dump of nonterms.
#
def repr_productions(self):
def r(nonterm):
all_nonterms.add(nonterm)
for symbol in nonterm.nested:
if isinstance(symbol, Nonterm) and symbol not in all_nonterms:
r(symbol)
all_nonterms = set()
for rule in self.rules.values():
r(rule)
all_nonterms = [n for n in sorted(all_nonterms, key=lambda t: str(t))]
result = []
i = 0
for nonterm in all_nonterms:
if not nonterm.productions:
continue
rank = ".%d" % nonterm.rank if nonterm.rank else ""
result.append(
"%-6s: %r" % ("%d%s" % (i, rank), nonterm.productions[0],))
for p in nonterm.productions[1:]:
result.append(" %r" % (p,))
i += 1
return '\n'.join(result)
#
# Debug dump of the parsing table.
#
def repr_parse_table(self, state=None):
if self.parsing_table is None:
return ''
a_state = next(iter(self.parsing_table[1]))
if state is None:
func = lambda item_set: True
elif state >= 0:
func = lambda item_set: item_set.id == state
else:
is_state = lambda s: (s if isinstance(s, int) else s.id) == -state
action_state = lambda act: is_state(tuple(act)[0])
func = lambda item_set: (
is_state(item_set) or
any(is_state(g) for g in item_set.gotos.values()) or
any(action_state(a) for a in item_set.actions.values()))
item_sets = (i for i in self.parsing_table[1] if func(i))
result = []
for item_set in a_state.sorted(item_sets):
result.append(repr(item_set))
result.append("")
if result:
del result[-1]
return '\n'.join(result)
#
# Dump the grammar.
#
def repr_grammar(self):
result = [repr(rule) for name, rule in sorted(self.rules.items())]
return '\n'.join(result)
#
# Parse a feed.
#
def parse(self, input, tree_factory=None, on_error=None, log=None):
if self.parsing_table is None:
self.compile_grammar()
token_feed = self.token_registry.tokeniser(input, self.whitespace)
return self.lr1_parser(token_feed, tree_factory, on_error, log)
#
# Build up a catalogue of all symbols.
#
def catalogue_symbols(cls, dct):
#
# Move one symbols dict to another.
#
def move_dict(to, frm):
for key in list(frm.dict):
to.dict[key] = frm.dict[key]
del frm.dict[key]
#
# Create a new Rule() for this rule.
#
def catalogue(name, field):
rule = Rule(name, field)
rule_symbols[field] = rule
symbols[name] = rule
dct[name] = rule
return rule
rule_symbols = {}
symbols = {}
token_registry = None
for name, field in sorted(dct.items()):
if isinstance(field, Symbol):
if isinstance(field, Ref):
raise GrammarError("Ref(%r) hasn't been defined" % name)
if field not in rule_symbols:
move_dict(catalogue(name, field), field)
elif name == "START":
catalogue(name, rule_symbols[field])
elif rule_symbols[field].name == "START":
rule = rule_symbols[field]
field_rule = catalogue(name, field)
move_dict(field_rule, rule)
catalogue(rule.name, field_rule)
else:
msg = (
"You have \"%s = %s\" or " +
"\"%s = TokenRegistry.tok; " +
"%s = TokenRegistry.tok\".\n" +
"Only START maybe assigned directly to " +
"another Symbol. A workaround is %s = %s * 1"
)
raise GrammarError(msg % (
name, rule_symbols[field],
name, rule_symbols[field],
name, rule_symbols[field],))
elif isinstance(field, type) and TokenRegistry in field.__bases__:
if token_registry is None:
token_registry = field
else:
msg = "Can't have more than one %s"
raise GrammarError(msg, TokenRegistry.__name__)
return rule_symbols, symbols, token_registry
catalogue_symbols = classmethod(catalogue_symbols)
#
# Replace all occurrences of a declared symbol with it's Rule().
#
def resolve_rule(self, rule, rule_symbols):
def r(nested):
for i, symbol in zip(itertools.count(), nested):
if not isinstance(symbol, Rule):
resolved = rule_symbols.get(symbol, None)
if resolved is not None:
nested[i] = resolved
else:
r(symbol.nested)
for i, symbol in zip(itertools.count(), rule.nested):
# resolved = rule_symbols[symbol]
# if resolved is not rule:
# rule.nested[i] = resolved
assert rule_symbols[symbol] is rule
if not isinstance(symbol, Rule):
r(symbol.nested)
#
# Scan the entire grammar, allowing nodes in the parse tree to replace
# themselves with other nodes. Eg, Ref's with the Symbol they are
# referencing.
#
def resolve_symbol(self, rule):
def r(parent, symbol):
resolved = symbol.resolve_symbol(name, self.rules, token_registry)
if isinstance(resolved, Rule):
return resolved
resolved.parent = parent
for i, sym in zip(itertools.count(), resolved.nested):
if not isinstance(sym, Rule):
resolved.nested[i] = r(resolved, sym)
return resolved
name = rule.name
token_registry = self.token_registry
for i, sym in zip(itertools.count(), rule.nested):
rule.nested[i] = r(rule, sym)
#
# Compute the first sets for all symbols.
#
    def calc_first_sets(self, epoch_symbol):
        """Compute the FIRST sets of the nonterminals reachable from
        epoch_symbol, iterating until a fixed point is reached."""
        #
        # Collect all nonterminals used by the grammar.
        #
        def r(nonterm):
            for prod in nonterm.productions:
                for sym in prod.rhs:
                    if isinstance(sym, Nonterm) and sym not in nonterms:
                        nonterms.add(sym)
                        nonterm_list.append(sym)
                        r(sym)
        #
        # We use nonterm_list to make it deterministic.
        #
        nonterms = set()
        nonterm_list = []
        r(epoch_symbol)
        changed = True
        while changed:
            # NOTE: any() short-circuits on the first symbol whose set
            # changed, so a pass may stop early; the loop only exits after
            # a complete pass reports no change, ie at the fixed point.
            changed = any(sym.merge_first_set(self) for sym in nonterm_list)
#
# Compute the collection of sets of LR(1) items.
#
# Wikipedia is a good reference:
# http://en.wikipedia.org/wiki/Canonical_LR_parser
#
    def compute_lr1_items(self, epoch_symbol):
        """Build the collection of sets of LR(1) items.

        Returns (start_item_set, lr0_item_sets), where lr0_item_sets maps
        an LR(0) kernel to the list of LR(1) item sets sharing that kernel.
        """
        #
        # Initialise the parse table by creating the start ItemSet.  It
        # contains one item:
        #
        #   FINISH ::= ^ START $.,
        #
        cache = {'__empty__': self.empty_token}
        start_production = epoch_symbol.productions[0]
        start_item_lr0 = Lr0Item(start_production, 0, cache)
        start_item_lookahead = set(self.empty_token.first_set)
        start_item_lr1 = Lr1Item(start_item_lr0, start_item_lookahead)
        start_item_set = ItemSet(((start_item_lr1, {(): set()}),), cache)
        start_item_set.compute_closure(cache)
        lr0_item_sets = collections.defaultdict(list)
        lr0_item_sets[start_item_set.lr0_kernel].append(start_item_set)
        #
        # Now compute new ItemSet's from the ones we have created, until
        # all we create is duplicates.
        #
        worklist = collections.deque([start_item_set])
        while worklist:
            item_set = worklist.popleft()
            goto_sets = item_set.goto_sets(cache).items()
            # Sort by symbol id so construction is deterministic.
            for symbol, goto_set in sorted(goto_sets, key=lambda i: i[0].id):
                lr1_merge = None
                for lr1_item_set in lr0_item_sets[goto_set.lr0_kernel]:
                    if lr1_item_set.compatible(goto_set):
                        lr1_merge = lr1_item_set
                        break
                if lr1_merge is None:
                    # A brand new item set: close it and queue it.
                    goto_set.compute_closure(cache)
                    worklist.append(goto_set)
                    lr0_item_sets[goto_set.lr0_kernel].append(goto_set)
                elif lr1_merge.merge(goto_set, cache):
                    # Merging changed an existing set, so it must be
                    # reprocessed: put it at the front of the queue.
                    if lr1_merge in worklist:
                        worklist.remove(lr1_merge)
                    worklist.appendleft(lr1_merge)
        return start_item_set, lr0_item_sets
#
# Compute LR(1) actions.
#
    def compute_parsing_table(cls, lr0_item_sets):
        """Compute the LR(1) shift/reduce actions and goto's for every
        item set, storing them on the ItemSet()'s.  Returns the table as
        an identity dict of item sets."""
        #
        # First assign a unique ID to each item_set.  This ID will be its
        # index into actions[] and gotos[].
        #
        table = dict(
            (item_set, item_set)
            for item_set_list in lr0_item_sets.values()
            for item_set in item_set_list)
        for item_set in table:
            #
            # Compute actions.
            #
            actions = collections.defaultdict(set)
            goto_sets = item_set.goto_sets(None)
            for item in item_set.all_items():
                dot_pos, rhs = item.dot_pos, item.production.rhs
                if dot_pos == len(rhs):
                    # Dot at the end of the rhs: reduce by this item's
                    # production on every lookahead token.
                    for token in item.lookahead:
                        actions[token].add(ReduceAction(item))
                elif isinstance(rhs[item.dot_pos], TokenSymbol):
                    # Dot before a token: shift to the compatible LR(1)
                    # item set sharing the goto set's LR(0) kernel.
                    token = rhs[item.dot_pos]
                    goto_set = goto_sets[token]
                    found = False
                    for lr1_item_set in lr0_item_sets[goto_set.lr0_kernel]:
                        if lr1_item_set.compatible(goto_set):
                            actions[token].add(ShiftAction(lr1_item_set))
                            found = True
                            break
                    assert found, repr(token)
            #
            # Turn the action lists into tuples.
            #
            item_set.actions = dict(actions)
            for token in actions:
                item_set.actions[token] = tuple(item_set.actions[token])
            #
            # Compute goto's.
            #
            gotos = item_set.gotos
            for symbol in goto_sets:
                if not isinstance(symbol, Nonterm):
                    continue
                goto_set = goto_sets[symbol]
                for lr1_item_set in lr0_item_sets[goto_set.lr0_kernel]:
                    if lr1_item_set.compatible(goto_set):
                        assert symbol not in gotos
                        gotos[symbol] = lr1_item_set
                        break
        return table
    compute_parsing_table = classmethod(compute_parsing_table)
#
# Look for action ambiguities and resolve them if possible.
#
    def disambiguate(cls, item_sets):
        """Reduce every multi-action (ambiguous) table entry to a single
        action using resolve_ambiguity(), or delete the entry entirely
        when Nonassoc eliminates every candidate."""
        action_list = [
            (item_set, act)
            for item_set in item_sets
            for act in item_set.actions.items()]
        for item_set, (token, actions) in action_list:
            if len(actions) == 1:
                # Unambiguous: unwrap the 1-tuple.
                item_set.actions[token] = actions[0]
                continue
            #
            # Multiple actions are ambiguities.  Compare every action with
            # all others in the hope that Prio() and Assoc() can eliminate
            # all bar one.
            #
            new_actions = sorted(actions) # Repeatability for testing
            i = 0
            while i < len(new_actions) - 1:
                act_0 = new_actions[i]
                j = i + 1
                while j < len(new_actions):
                    act_1 = new_actions[j]
                    keep = cls.resolve_ambiguity(item_set, token, act_0, act_1)
                    if "1" not in keep:
                        # act_1 loses: delete it and compensate j.
                        del new_actions[j]
                        j -= 1
                    j += 1
                    if "0" not in keep:
                        # act_0 loses: delete it, back up i, and rescan.
                        del new_actions[i]
                        i -= 1
                        break
                i += 1
            #
            # Since we don't support GLR(1) grammars yet (ie, we don't do
            # Split()), resolve_ambiguity() must have not more than one result
            # left.  It's possible Nonassoc eliminates all of them.
            #
            if not new_actions:
                del item_set.actions[token]
            else:
                assert len(new_actions) == 1
                item_set.actions[token] = new_actions[0]
    disambiguate = classmethod(disambiguate)
#
# Compute how to resolve an action conflict. Returns the actions to
# keep, or "err".
#
    def resolve_ambiguity(cls, item_set, token, action_0, action_1):
        """Decide which of two conflicting actions to keep.

        Returns "0" (keep action_0), "1" (keep action_1), or "" (keep
        neither - Nonassoc makes the input a parse error).  Raises
        GrammarError via err() when the conflict can't be resolved.
        """
        #
        # Print a nice and hopefully useful error message when we can't
        # resolve a conflict.
        #
        def err(reason):
            #
            # Print an action in a nice looking way.
            #
            def explain_action(action):
                if isinstance(action, ReduceAction):
                    rhs = ' '.join(str(s) for s in action[3].production.rhs)
                    lhs = action[3].production.lhs
                    return [" replace the sequence [%s] with %s" % (rhs, lhs)]
                next_state_lr1_items = list(action[0])
                one_of = "" if len(next_state_lr1_items) == 1 else " one of"
                msg = [
                    " accept the %s in the hope it will match%s:"
                    % (token, one_of)
                ]
                msg.extend([
                    " " + repr(lr1_item.lr0_item)
                    for lr1_item in Lr1Item.sorted(next_state_lr1_items)])
                return msg
            #
            # Is the passed item relevant to the action?
            #
            def relevant(item, action):
                if isinstance(action, ReduceAction):
                    return action[3] is item
                return (item.dot_pos < len(item.production.rhs) and
                    item.production.rhs[item.dot_pos] == token)
            #
            # Produce a nicely formatted error message.
            #
            msg = ["Conflict: %s" % (reason,)]
            msg.append("While trying to recognise state %d:" % (item_set.id,))
            for item in item_set.all_items():
                if relevant(item, action_0) or relevant(item, action_1):
                    msg.append(" %r" % (item.lr0_item,))
            msg.append("on seeing a %s I could not decide between:" % (token,))
            msg.extend(explain_action(action_0))
            msg.append("and")
            msg.extend(explain_action(action_1))
            msg.append("Please remove this ambiguity from your grammar")
            raise GrammarError('\n'.join(msg))
        #
        # Decide the associativity of a symbol.
        #
        def assoc(nonterm):
            if isinstance(nonterm, Assoc):
                return nonterm
            return None
        #
        # Ambiguities can arise from productions like this:
        #
        #   e = e op e
        #   op = op1 | op2
        #
        # When confronted with the token string:
        #
        #   e op1 e ^ op2 e
        #
        # We get a shift / reduce conflict at the indicated position.
        # The choice is really between two different parse trees:
        #
        #   Shift:  (e op1 (e op2 e))
        #   Reduce: ((e op1 e) op2 e)
        #
        # This can be resolved in two ways.  If priorities are allocated to
        # the clashing productions, then we can choose the action the
        # grammar writer preferred based on which of 'op1' or 'op2' was
        # used:
        #
        #   e = Prio(e op1 e, e op2 e)
        # which is equivalent to:
        #   e = e op1 e, Priority=0
        #   e = e op2 e, Priority=1
        #
        # Or, the grammar writer can specify associativity, ie saying he always
        # wants ((e op e) op e) regardless of 'op' (left associative) or he
        # wants (e op (e op e)) regardless of 'op' (right associative).  For
        # left associativity it is like this:
        #
        #   e = e << op << e
        #
        # A reduce/reduce conflict can happen if the clash happens higher up
        # in the parse tree:
        #
        #   e0 = Prio(e1, e2)
        #   e1 = 'n'
        #   e2 = 'n'
        #
        # When parsing:
        #
        #   'n' ^
        #
        # We have two reduction choices, e1='n' and e2='n'.  These productions
        # don't have priorities allocated directly, but the parse table builder
        # will have pushed the priorities e0 allocated down to them.  Nested
        # priorities are ranked (see allocate_rank()), and this ranking makes
        # priorities globally comparable.
        #
        lhs_0, low_0, high_0 = action_0.precedence(token, item_set)
        lhs_1, low_1, high_1 = action_1.precedence(token, item_set)
        # Lower priority value wins.
        if low_0 < low_1:
            return "0"
        if low_0 > low_1:
            return "1"
        #
        # The priorities are the same.  Try using the associativity.
        #
        assert (
            not isinstance(action_0, ShiftAction) or
            not isinstance(action_1, ShiftAction))
        if (isinstance(action_0, ReduceAction) and
                isinstance(action_1, ReduceAction)):
            return err("Reduce/Reduce")
        lhs_0_assoc, lhs_1_assoc = assoc(lhs_0), assoc(lhs_1)
        lhs = lhs_0_assoc if lhs_0_assoc is not None else lhs_1_assoc
        if not isinstance(lhs, Assoc):
            return err("Shift/Reduce and no associativity")
        if isinstance(lhs_1_assoc, Assoc) and lhs.assoc != lhs_1_assoc.assoc:
            return err("Shift/Reduce and conflicting associativity")
        #
        # The actions have met the associativity pre-conditions so we can
        # resolve the conflict.
        #
        if lhs.assoc == 'l': # Left()
            return "1" if isinstance(action_0, ShiftAction) else "0"
        if lhs.assoc == 'r': # Right()
            return "0" if isinstance(action_0, ShiftAction) else "1"
        assert lhs.assoc == 'n', lhs_0.assoc
        return "" # Nonassoc() - Association is a parse error
    resolve_ambiguity = classmethod(resolve_ambiguity)
#
# This optional step creates a parsing_table with all the information
# in the ItemSet() pruned.
#
    def optimise_parsing_table(self, parsing_table):
        """Prune the parsing table down to compact Lr1State()'s.

        Returns (0, states): the start state number (always 0) and the
        list of Lr1State()'s indexed by state number.
        """
        start_state, table = parsing_table
        #
        # Create a mapping of item_set: int, with start_state mapping to 0.
        #
        state_number = dict(zip(ItemSet.sorted(table), itertools.count()))
        # Swap the numbers so start_state gets 0 and whoever had 0 gets
        # start_state's old number.
        item_set_0 = next(i for i in state_number if state_number[i] == 0)
        state_number[item_set_0] = state_number[start_state]
        state_number[start_state] = 0
        #
        # Map each item set to its number.
        #
        optimised = []
        for item_set in sorted(table, key=lambda i: state_number[i]):
            actions = {}
            for token in item_set.actions:
                action = item_set.actions[token]
                if isinstance(action, ShiftAction):
                    # A shift becomes a 1-tuple: (target state number,).
                    new_action = (state_number[action[0]],)
                else:
                    # A reduce becomes a 4-tuple: the id used to index the
                    # goto table, the number of stack entries to pop, the
                    # output symbol, and a padding None.
                    nonterm_nr = action[0].id
                    len_rhs = action[1]
                    output = action[2]
                    new_action = (nonterm_nr, len_rhs, output, None)
                actions[token] = new_action
            gotos = dict(
                (nonterm.id, state_number[itm_set])
                for nonterm, itm_set in item_set.gotos.items())
            state_id = len(optimised)
            lr1_state = Lr1State(state_id, actions, gotos, item_set.rules)
            optimised.append(lr1_state)
        #
        # And we are done.
        #
        return 0, optimised
#
# The grammar hash.
#
def grammar_hash(self):
grammar_tree = '; '.join(sorted(
repr(self.rules[name]) for name in self.rules))
hsh = hashlib.sha512()
hsh.update(grammar_tree.encode())
hsh.update(self.VERSION.encode())
return hsh.hexdigest()
#
# The LR(1) table driven parser.
#
    def lr1_parser(self, token_feed, tree_factory, on_error, log):
        """The LR(1) table driven parse loop.

        token_feed yields input tuples whose first element is a token
        symbol; tree_factory, if given, post-processes each shifted token
        tuple and each reduction; on_error, if given, may supply recovery
        tokens on a parse error; log, if given, receives a trace of each
        shift/reduce.  Returns the completed parse tree.
        """
        #
        # Print the current stack, for log.
        #
        def print_stack(stk):
            return " ".join(
                "%s=%s" % (s[0], s[1][0][0] if s[1] else '()') for s in stk)
        #
        # The lengths I am prepared to go in the name of fast path
        # efficiency frightens me at times.
        #
        def insert_error_recovery_tuple():
            # Yield the tuples injected by on_error(); once exhausted,
            # switch iterator[0] back to the original input stream.
            while recovery_stack:
                input_tuple = next(recovery_stack[-1], EOF)
                if input_tuple is not EOF:
                    yield input_tuple
                else:
                    recovery_stack.pop()
            iterator[0] = original_input
            yield next(iterator[0], EOF)
        EOF = object()  # unique end-of-stream sentinel
        recovery_stack = []
        original_input = itertools.chain(token_feed, ((self.eoi_token,),))
        iterator = [original_input]
        start_state, table = self.parsing_table
        state = table[start_state]
        stack = [(state, ((self.empty_token,),))]
        if not self.comment_tokens:
            comments = ()
        else:
            comments = frozenset(self.comment_tokens)
        input_tuple = next(iterator[0], EOF)
        while input_tuple is not EOF:
            token = input_tuple[0]
            if token in comments:
                # Comment tokens are dropped before the grammar sees them.
                input_tuple = next(iterator[0], EOF)
                continue
            while True:
                try:
                    action = state.actions[token]
                except KeyError:
                    #
                    # A Parse error.  Does he want to do error recovery?
                    #
                    if on_error is None:
                        raise ParseError(input_tuple, stack)
                    insert = on_error(iterator[0], input_tuple, stack)
                    if insert is None:
                        raise ParseError(input_tuple, stack)
                    recovery_stack.append(iter(insert))
                    iterator = [insert_error_recovery_tuple()]
                    break
                #
                # A shift?
                #
                if len(action) == 1:
                    if token is self.eoi_token:
                        # End of input was accepted: parse complete.
                        break
                    if tree_factory:
                        input_tuple = tree_factory(input_tuple)
                    state = table[action[0]]
                    stack.append((state, (input_tuple,)))
                    if log:
                        log("shift %s; %s" % (token, print_stack(stack[1:]),))
                    break
                #
                # A reduce.
                #
                goto, pop_count, output, _ = action
                if pop_count == 0:
                    tail = ()
                    nodes = ()
                else:
                    tail = stack[-pop_count:]
                    del stack[-pop_count:]
                    nodes = sum((s[1] for s in tail), ())
                if output is not None:
                    # Build the parse tree node and wrap it in a 1-tuple so
                    # it occupies a single slot in its parent.
                    nodes = (output,) + nodes
                    if tree_factory:
                        nodes = tree_factory(nodes)
                    nodes = (nodes,)
                state = table[stack[-1][0].gotos[goto]]
                stack.append((state, nodes))
                if log:
                    log(
                        "reduce %s; %s -- %s" %
                        (token, print_stack(tail), print_stack(stack[1:])))
            input_tuple = next(iterator[0], EOF)
        return stack[-1][1][0]
#
# The base class for Symbol's in the grammar: a token or a non-terminal.
#
class Symbol(object):
    """Base class for everything appearing in a grammar: tokens and
    non-terminals.  Also emulates a dict (see __init__)."""
    # The dict methods forwarded to self.dict.  They appear in __slots__
    # so the per-instance bound methods assigned in __init__ are found
    # when the corresponding operations are used on a Symbol.
    DICT_METHODS = (
        "__contains__", "__delitem__", "__getitem__", "__iter__",
        "__len__", "__setitem__")
    __slots__ = ('dict', 'id', 'first_set', 'nested', 'parent') + DICT_METHODS
    SYMBOL_PRECEDENCE = 0
    def __init__(self):
        self.dict = {}
        self.first_set = frozenset()
        self.nested = ()
        self.parent = None
        #
        # The original idea was to inherit from dict.  Doing so meant Symbol
        # wasn't hashable and since it is used extensively in sets and as
        # keys to dict's overriding __hash__ so it was hash'able caused a
        # 10% slowdown in generating the parse table.  So now we just emulate
        # a dict.
        #
        self.__contains__ = self.dict.__contains__
        self.__delitem__ = self.dict.__delitem__
        self.__getitem__ = self.dict.__getitem__
        self.__iter__ = self.dict.__iter__
        self.__len__ = self.dict.__len__
        self.__setitem__ = self.dict.__setitem__
    # Operator overloads build grammar nodes: sym + sym, sym | sym,
    # sym << sym (left assoc), sym >> sym (right assoc), sym * repeats.
    def __add__(self, other):
        return OpPlus(self, other)
    def __radd__(self, other):
        return OpPlus(other, self)
    def __mul__(self, other):
        # sym * Opt/Some/Many/Repeat, sym * int, or sym * (min[, max]).
        if other is Opt or other is Some or other is Many or other is Repeat:
            return other(self)
        if isinstance(other, int):
            return Repeat(self, other, other)
        if (not isinstance(other, tuple) or len(other) > 2 or
                any(not isinstance(a, int) for a in other)):
            msg = (
                "right operand of * must be one of: " +
                "Opt, Some, Many, Repeat, (), (min,), (min,max)")
            raise GrammarError(msg)
        return Repeat(self, *other)
    def __rmul__(self, other):
        # Mirror of __mul__ for int * sym and (min[, max]) * sym.
        if other is Opt or other is Some or other is Many or other is Repeat:
            return other(self)
        if isinstance(other, int):
            return Repeat(self, other, other)
        if (not isinstance(other, tuple) or len(other) > 2 or
                any(not isinstance(a, int) for a in other)):
            msg = (
                "left operand of * must be one of: " +
                "Opt, Some, Many, Repeat, (), (min,), (min,max)")
            raise GrammarError(msg)
        return Repeat(self, *other)
    def __lshift__(self, other):
        return OpLshift(self, other)
    def __rlshift__(self, other):
        return OpLshift(other, self)
    def __rshift__(self, other):
        return OpRshift(self, other)
    def __rrshift__(self, other):
        return OpRshift(other, self)
    def __or__(self, other):
        return OpOr(self, other)
    def __ror__(self, other):
        return OpOr(other, self)
    def __nonzero__(self):
        # Always truthy, even when the emulated dict is empty.
        return True
    __bool__ = __nonzero__ # For Python3
    def cast(cls, value):
        # Coerce value to a Symbol via the CAST table (defined elsewhere);
        # raise if no conversion yields a Symbol.
        value = cls.CAST.get(type(value), lambda x: x)(value)
        if not isinstance(value, Symbol):
            raise GrammarError("%r can't be a Symbol" % value)
        return value
    def compile_symbol(self, comp):
        # Subclasses must produce their list of productions.
        raise NotImplementedError()
    def resolve_symbol(self, name, rules, token_registry):
        # Default: a symbol resolves to itself.  Subclasses (eg Ref)
        # may return a replacement node.
        return self
    #
    # Return a unique name for the symbol.
    #
    def __str__(self):
        if self.parent is None:
            return self.__class__.__name__
        # Disambiguate siblings of the same class with an index suffix.
        same_as_me = tuple(
            sym
            for sym in self.parent.nested
            if sym.__class__.__name__ == self.__class__.__name__)
        if len(same_as_me) < 2:
            my_name = self.__class__.__name__
        else:
            index = same_as_me.index(self)
            my_name = "%s%d" % (self.__class__.__name__, index)
        return "%s.%s" % (self.parent, my_name,)
    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.repr_nested())
    def repr_nested(self):
        return ', '.join(sym.nested_repr() for sym in self.nested)
    def nested_repr(self):
        return repr(self)
    def get_rule(self):
        """Return the Rule() at the root of this symbol's tree."""
        rule = self
        while rule.parent is not None:
            rule = rule.parent
        assert isinstance(rule, Rule)
        return rule
#
# A forward reference to a symbol that will be defined later.
#
class Ref(Symbol):
    """A forward reference, by name, to a rule defined later."""
    __slots__ = ('referenced_name',)
    def __init__(self, referenced_name):
        super(Ref, self).__init__()
        self.referenced_name = referenced_name
    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.referenced_name)
    def resolve_symbol(self, name, rules, token_registry):
        """Swap this placeholder for the Rule it names."""
        if self.referenced_name not in rules:
            raise GrammarError("%s references undefined %r" % (name, self))
        if self.dict:
            msg = "Ref(%s) may not have dictionary elements"
            raise GrammarError(msg % (self.referenced_name,))
        return rules[self.referenced_name]
#
# A reference to the Rule() of the current production.
#
class This(Symbol):
    """A self-reference: resolves to the Rule() currently being defined."""
    __slots__ = ()
    def resolve_symbol(self, name, rules, token_registry):
        # 'name' is the rule being defined, so THIS becomes that Rule.
        return rules[name]
# The singleton instance used in grammars.
THIS = This()
#
# Combine several lists of productions as a sequence. In other words, say we
# had several nonterms which were combined as a sequence:
#
# a = a0 + a1 + a2
#
# a0, a1 and a2 have their own lists of productions. We must produce the
# productions for a from those lists. So:
#
# Given:
# a0 = [(a,),(b,)]
# a1 = [(c,d),(e,f)]
# a2 = [(g,)]
#
# Return:
# [(a,c,d,g), (a,e,f,g), (b,c,d,g), (b,e,f,g)]
#
def seq(*args):
    """Cross product of production lists, concatenating each combination.

    A trailing int argument gives the number of times the whole sequence
    is repeated (default 1).  A bare Symbol argument is treated as a
    production list containing only itself.
    """
    repeats = 1
    if isinstance(args[-1], int):
        repeats = args[-1]
        args = args[:-1]
    prod_lists = [[(p,)] if isinstance(p, Symbol) else p for p in args]
    return [
        tuple(itertools.chain.from_iterable(combo))
        for combo in itertools.product(*prod_lists, repeat=repeats)]
#
# A non-terminal: A symbol that appears on the left hand side of a rule.
#
class Nonterm(Symbol):
    """A symbol that can appear on the left hand side of a rule."""
    __slots__ = ('rank', 'productions', 'priority')
    def __init__(self, *args):
        super(Nonterm, self).__init__()
        for symbol in args:
            if isinstance(symbol, Rule) and symbol.resolved:
                msg = "Can not import %s from another Grammar" % (
                    symbol.name,)
                raise GrammarError(msg)
            if isinstance(symbol, MetaToken):
                msg = "%s can not be used in a production" % (symbol.name,)
                raise GrammarError(msg)
        self.nested = [self.cast(symbol) for symbol in args]
        self.priority = None
        self.productions = ()
        self.rank = None
    #
    # Compile the grammar.  In other words turn it into a series of LR
    # productions.  Start from the start symbol and work our way down.
    # Record what symbols were referenced, as unreferenced ones are
    # probably an error.
    #
    def compile_grammar(self, empty_token):
        """Compile the grammar rooted at this symbol into productions.
        Returns the set of Rule()'s visited."""
        #
        # A recursive function that visits all symbols in the grammar,
        # compiling them.
        #
        def comp(symbol):
            if not isinstance(symbol, Rule):
                return symbol.compile_symbol(comp)
            if symbol not in seen:
                seen.add(symbol)
                symbol.emit(symbol.compile_symbol(comp))
            return [(symbol,)]
        seen = set()
        comp(self)
        self.allocate_symbol_id(empty_token)
        self.allocate_rank()
        return seen
    #
    # Allocate Nonterm.rank, so priorities can be calculated during the
    # parser generator process.  Let's say we have:
    #
    #   START = Prio(a, b)
    #   a = Prio(b, c)
    #   b = 'n'
    #   c = Prio(START, a)
    #
    # So START has allocated 'b' a priority of 1, but 'a' has allocated 'b' a
    # priority of 0.  The question is how do we compare these?  The answer
    # used here is to rank the priorities based on how close they are to
    # the START symbol.  START is 0 away from itself, so it has the most
    # preferred rank.  'a' and 'b' have ranks of 1, because they are referred
    # to directly by START, and 'c' has a rank of 2.
    #
    # The parser uses this rank to create a priority tuple: (p0, p1, p2, ...),
    # where p0 is the priority assigned by rank 0, p1 is the priority
    # assigned by rank 1 and so on.
    #
    # So in answering the question earlier, START's priority for 'b' is:
    #   (START = b,) which is (1,),
    # and 'a's priority for 'b' is:
    #   (START = a, a = b,) which is (0,0,).
    #
    # When parsing this sequence of symbols:
    #
    #   'n' __end_of_input__
    #
    # The parser will go through these states.
    #
    #   STACK INPUT POSSIBLE ACTIONS
    #   [] 'n' shift 'n'
    #   ['n'] __end_of_input__ reduce b = 'n'
    #   [b] __end_of_input__ reduce a = b, reduce START = b
    #
    # The priority of a = b is (0,0), and the priority of START = b is (1,).
    # Since (0,0) < (1,), the parser will choose a = b.
    #
    def allocate_rank(self):
        """Assign every reachable Nonterm a rank: its distance from this
        (the start) symbol, used to make priorities globally comparable."""
        #
        # First determine the height of each Nonterm in the production tree
        # by doing a breadth first pass over it.  Collect the tree structure
        # as we go.
        #
        height = 0
        refers = collections.defaultdict(set)
        refers[self].add(self)
        symbol_heights = {self: height}
        height += 1
        queue = [self]
        while queue:
            queue, process = [], queue
            for nonterm in process:
                # Ensure every visited nonterm has a refers entry.
                refers[nonterm] |= frozenset()
                for production in nonterm.productions:
                    for symbol in production.rhs:
                        if isinstance(symbol, Nonterm):
                            if nonterm is not symbol:
                                refers[symbol].add(nonterm)
                            if symbol not in symbol_heights:
                                symbol_heights[symbol] = height
                                queue.append(symbol)
            height += 1
        #
        # Turn it into a hierarchy.  The idea is we do a topological sort on
        # the graph using the tree structure we accumulated, breaking any
        # cycles using height.
        #
        def remove(no_refers):
            # Assign the current rank to the given unreferenced symbols and
            # delete them from the graph.  Prioritised nodes are processed
            # first without consuming a rank of their own.
            no_prio = set(
                s for s in no_refers
                if not isinstance(s, Prio.Prioritised))
            if not no_prio:
                rank[0] += 1
            else:
                no_refers = no_prio
            for s in no_refers:
                s.rank = rank[0]
                del refers[s]
            for s in refers:
                refers[s] -= no_refers
        rank = [0]
        while refers:
            no_refers = set(s for s, r in refers.items() if not r)
            while refers and no_refers:
                remove(no_refers)
                no_refers = set(s for s, r in refers.items() if not r)
            #
            # If that didn't consume every token we have a cycle.  Break the
            # cycle by considering the node closest to the START symbol (ie
            # lowest height) as having the highest priority.
            #
            if refers:
                low = min(symbol_heights[s] for s in refers)
                lowest = set(s for s in refers if symbol_heights[s] == low)
                remove(lowest)
        # print sorted("%d %s" % (sym.rank, sym) for sym in symbol_heights)
    #
    # Allocate each symbol a unique id.  This is used to make things
    # deterministic when we iterate through hash tables and sets.
    #
    def allocate_symbol_id(self, empty_token):
        """Breadth-first walk assigning each reachable symbol a unique,
        deterministic integer id."""
        queue = collections.deque([self, empty_token])
        all_symbols = set()
        while queue:
            symbol = queue.popleft()
            if symbol in all_symbols:
                continue
            symbol.id = len(all_symbols)
            all_symbols.add(symbol)
            if isinstance(symbol, Nonterm):
                for production in symbol.productions:
                    queue.extend(production.rhs)
    #
    # Generate the Production() objects for the Nonterm().
    #
    def emit(self, productions):
        self.productions = [Production(self, p) for p in productions]
    #
    # Merge the first_sets of all productions into ours.  Return True if
    # the first_set was changed.
    #
    def merge_first_set(self, parser):
        want_empty = False
        have_empty = parser.empty_token in self.first_set
        result = self.first_set
        for production in self.productions:
            #
            # For all A = a + b, merge first(a) into first(A).
            #
            for symbol in production.rhs:
                result |= symbol.first_set
                if parser.empty_token not in symbol.first_set:
                    # symbol can't be empty, so nothing after it can start A.
                    break
                have_empty = True
            else:
                # Every symbol in the rhs can be empty, so A can be too.
                want_empty = True
        if want_empty:
            if not have_empty:
                result |= parser.empty_token.first_set
        else:
            if have_empty:
                result -= parser.empty_token.first_set
        if len(result) != len(self.first_set):
            self.first_set = result
            return True
        return False
    #
    # For nonterms that accept lists, print in a nice way.
    #
    def nonterm_repr(self, delimiter):
        def repr_sym(symbol):
            # Parenthesise lower-precedence children.
            if (len(symbol.nested) < 2 or
                    self.SYMBOL_PRECEDENCE <= symbol.SYMBOL_PRECEDENCE):
                return symbol.nested_repr()
            return '(' + symbol.nested_repr() + ')'
        if len(self.nested) < 2:
            return "%s(%s)" % (
                self.__class__.__name__,
                ', '.join(sym.nested_repr() for sym in self.nested),)
        nested = (repr_sym(symbol) for symbol in self.nested)
        return (' ' + delimiter + ' ').join(nested)
#
# A symbol on the left hand side of a grammar rule.
#
class Rule(Nonterm):
    """A named grammar rule: the symbol on the left hand side of a rule."""
    __slots__ = ('name', 'resolved')
    def __init__(self, name, symbol):
        super(Rule, self).__init__(symbol)
        self.name = name
        self.resolved = False
    def compile_symbol(self, comp):
        # A Rule compiles to whatever its single nested symbol compiles to.
        return comp(*self.nested)
    def __str__(self):
        return self.name
    def __repr__(self):
        target = self.nested[0]
        rhs = target.name if isinstance(target, Rule) else repr(target)
        return "%s = %s" % (self.name, rhs)
    def nested_repr(self):
        return self.name
#
# The arguments are a single production. Thus:
#
# L = Sequence(sym0, sym1, sym2)
#
# Yields the production:
#
# L ::= sym0 sym1 sym2
#
class Sequence(Nonterm):
    """Concatenation: L = Sequence(a, b, c) yields L ::= a b c."""
    __slots__ = ()
    SYMBOL_PRECEDENCE = 3
    def __init__(self, *args):
        super(Sequence, self).__init__(*args)
    def compile_symbol(self, comp):
        compiled = [comp(symbol) for symbol in self.nested]
        return seq(*compiled)
    def __repr__(self):
        return self.nonterm_repr('+')
#
# The arguments are alternate productions. Thus
#
# L = Choice(sym0, sym1, sym2)
#
# Yields the productions:
#
# L ::= sym0
# L ::= sym1
# L ::= sym2
#
class Choice(Nonterm):
    """Alternation: L = Choice(a, b) yields L ::= a and L ::= b."""
    __slots__ = ()
    SYMBOL_PRECEDENCE = 1
    def __init__(self, *args):
        super(Choice, self).__init__(*args)
    def compile_symbol(self, comp):
        # Each alternative contributes its own productions.
        productions = []
        for symbol in self.nested:
            productions.extend(comp(symbol))
        return productions
    def __repr__(self):
        return self.nonterm_repr('|')
#
# Binary operations. Repeated applications of the same binary operation
# yield a single list. The op is then applied to that list. Thus:
#
# L = sym0 + sym1 + sym2 | sym3 | sym4
#
# Gets turned into:
#
# L = Alternate(Sequence(sym0, sym1, sym2), sym3, sym4).
#
class BinOp(Nonterm):
    """Base for binary operators; repeated applications of the same
    operator are flattened into a single operand list by combine()."""
    __slots__ = ()
    def __init__(self, arg1, arg2):
        super(BinOp, self).__init__(arg1, arg2)
    def combine(self):
        def flatten(node):
            if type(node) == type(self):
                left, right = node.nested
                return flatten(left) + flatten(right)
            return [node]
        return flatten(self)
#
# Constructed for: sym | sym.
#
class OpOr(BinOp):
    """Constructed for: sym | sym.  Resolves to a Choice of the flattened
    operands."""
    __slots__ = ()
    def resolve_symbol(self, name, rules, token_registry):
        return Choice(*self.combine())
#
# Constructed for: sym + sym.
#
class OpPlus(BinOp):
    """Constructed for: sym + sym.  Resolves to a Sequence of the
    flattened operands."""
    __slots__ = ()
    def resolve_symbol(self, name, rules, token_registry):
        return Sequence(*self.combine())
#
# Constructed for: sym << sym.
#
class OpLshift(BinOp):
    """Constructed for: sym << sym.  Resolves to a Left (left
    associative) node over the flattened operands."""
    __slots__ = ()
    def resolve_symbol(self, name, rules, token_registry):
        return Left(*self.combine())
#
# Constructed for: sym >> sym.
#
class OpRshift(BinOp):
    """Constructed for: sym >> sym.  Resolves to a Right (right
    associative) node over the flattened operands."""
    __slots__ = ()
    def resolve_symbol(self, name, rules, token_registry):
        return Right(*self.combine())
#
# A priority list. A priority list is passed a list of symbols:
# S = Prio(sym0, sym1, sym2)
# is identical to:
# S = sym0 | sym1 | sym2
# with the side effect that in the event of a conflict we will choose
# sym0 over sym1 over sym2.
#
class Prio(Nonterm):
    """A priority list: like Choice, but on a conflict earlier
    alternatives beat later ones."""
    #
    # This node holds priorities.
    #
    class Prioritised(Nonterm):
        __slots__ = ()
        def __init__(self, priority, symbol):
            super(Prio.Prioritised, self).__init__(symbol)
            self.priority = priority
        def compile_symbol(self, comp):
            self.emit(comp(*self.nested))
            return [(self,)]
    __slots__ = ()
    def __init__(self, *args):
        super(Prio, self).__init__(*args)
    def resolve_symbol(self, name, rules, token_registry):
        """Wrap each alternative in a Prioritised node, numbering them in
        order; nested Prio's are subsumed into this one."""
        counter = itertools.count()
        children = []
        def flatten(node):
            for child in node.nested:
                if isinstance(child, Prio):
                    flatten(child)
                else:
                    children.append(self.Prioritised(next(counter), child))
        flatten(self)
        self.nested = children
        return self
    def compile_symbol(self, comp):
        productions = []
        for symbol in self.nested:
            productions.extend(comp(symbol))
        self.emit(productions)
        return [(self,)]
    def __repr__(self):
        parts = ', '.join(sym.nested_repr() for sym in self.nested)
        if len(self.nested) < 2:
            return "%s(%s)" % (self.__class__.__name__, parts)
        return "(%s)" % (parts,)
#
# This node holds associativity: ie left, right or not allowed.
#
class Assoc(Nonterm):
    """Associativity annotation: 'l' (left), 'r' (right) or 'n' (none)."""
    __slots__ = ('assoc',)
    def __init__(self, assoc, *args):
        if assoc not in 'lnr':
            raise GrammarError("Unknown associativity %r" % (assoc,))
        super(Assoc, self).__init__(*args)
        self.assoc = assoc
    def compile_symbol(self, comp):
        # Compile the operands as a sequence, emitted under this node so
        # the conflict resolver can see the associativity.
        self.emit(seq(*[comp(symbol) for symbol in self.nested]))
        return [(self,)]
    def __repr__(self):
        # Subclasses (Left/Right) render themselves with their operator.
        if type(self) is not Assoc:
            return super(Assoc, self).__repr__()
        return "%s(%s, %s)" % (
            self.__class__.__name__, self.assoc, self.repr_nested(),)
#
# Force left associativity.
#
class Left(Assoc):
    """Force left associativity; built by the << operator."""
    __slots__ = ()
    SYMBOL_PRECEDENCE = 2
    def __init__(self, *args):
        super(Left, self).__init__('l', *args)
    def __repr__(self):
        return self.nonterm_repr('<<')
#
# Force right associativity.
#
class Right(Assoc):
    """Force right associativity; built by the >> operator."""
    __slots__ = ()
    SYMBOL_PRECEDENCE = 2
    def __init__(self, *args):
        super(Right, self).__init__('r', *args)
    def __repr__(self):
        return self.nonterm_repr('>>')
#
# Force non associative.
#
class Nonassoc(Assoc):
    """Force non associativity: chaining the operands is a parse error."""
    __slots__ = ()
    def __init__(self, *args):
        super(Nonassoc, self).__init__('n', *args)
#
# Construct a list of productions separated by a delimiter.
#
class List(Nonterm):
    """A list of symbol separated by delimiter, with optional min/max
    lengths and (opt) an optional trailing delimiter."""
    __slots__ = ('max', 'min', 'opt',)
    def __init__(self, symbol, delimiter, min=None, max=None, opt=None):
        super(List, self).__init__(symbol, delimiter)
        self.min = 0 if min is None else min
        self.max = max
        self.opt = opt
        if max is not None and max < self.min:
            raise GrammarError("min may not be greater than max")
    def compile_symbol(self, comp):
        symbol = comp(self.nested[0])
        delimiter = comp(self.nested[1])
        productions = []
        if self.min == 0:
            # The empty list is allowed.
            productions.append(())
        #
        # A fixed max means we can just list all the possibilities like this:
        #
        #   [(), (sym,), (sym,delim,sym), (sym,delim,sym,delim,sym)]
        #
        if self.max is not None:
            for repeat in range(max(0, self.min - 1), self.max):
                prod = seq(symbol, seq(delimiter, symbol, repeat))
                productions.extend(prod)
                if self.opt:
                    # Also allow a trailing delimiter.
                    productions.extend(seq(prod, delimiter))
            return productions
        #
        # There is no upper maximum, so we need recursion:
        #
        # For right assoc:
        #   [(sym,delim,sym,delim,S)]
        #   S ::= [(sym,), (sym,delim,S)]
        #
        # For left assoc:
        #   [(S,delim,sym,delim,sym)]
        #   S ::= [(sym,), (S,delim,sym)]
        #
        my_productions = list(symbol)
        if self.parent is None or not isinstance(self.parent, Assoc):
            assoc = None
        else:
            assoc = self.parent.assoc
        repeat = max(0, self.min - 1)
        if assoc == 'r':
            prod = seq(seq(symbol, delimiter, repeat), self)
            productions.extend(prod)
            my_productions.extend(seq(symbol, delimiter, self))
            if self.opt:
                my_productions.extend(seq(symbol, delimiter))
        elif assoc is None or assoc == 'l':
            prod = seq(self, seq(delimiter, symbol, repeat))
            productions.extend(prod)
            if self.opt:
                productions.extend(seq(prod, delimiter))
            my_productions.extend(seq(self, delimiter, symbol))
        else:
            msg = "Can't implement %s associativity on %r"
            raise GrammarError(msg % (assoc, self))
        self.emit(my_productions)
        return productions
    def __repr__(self):
        # Render with only the arguments that differ from the defaults.
        if self.opt is not None:
            return "%s(%s, %r, %r, %r)" % (
                self.__class__.__name__, self.repr_nested(),
                self.min, self.max, self.opt)
        if self.max is not None:
            return "%s(%s, %r, %r)" % (
                self.__class__.__name__, self.repr_nested(),
                self.min, self.max)
        if self.min != 0:
            return "%s(%s, %r)" % (
                self.__class__.__name__, self.repr_nested(),
                self.min)
        return "%s(%s)" % (self.__class__.__name__, self.repr_nested())
#
# Repeats. This class handles all forms of repeats.
#
class Repeat(Nonterm):
    """Repeats of a symbol, between min and max times (max None means
    unbounded).  Handles all forms of repeats, including Opt/Some/Many."""
    __slots__ = ('min', 'max',)
    def __init__(self, symbol, min=None, max=None):
        super(Repeat, self).__init__(symbol)
        self.min = 0 if min is None else min
        self.max = max
        if max is not None and max < self.min:
            raise GrammarError("min may not be greater than max")
    def compile_symbol(self, comp):
        symbol = comp(*self.nested)
        #
        # If we have both min and max repeats we can just enumerate the
        # results like this:
        #
        #   [ (), (symbol,), (symbol,symbol,), ... ]
        #
        if self.max is not None:
            prods = (seq(symbol, rpt) for rpt in range(self.min, self.max + 1))
            return sum(prods, [])
        #
        # There is no maximum, so we must use recursion.
        #
        # For right associative:
        #   [ (symbol,symbol,S) ]
        #   S ::= [symbol, (symbol, S) ]
        #
        # For left associative:
        #   [ (S,symbol,symbol) ]
        #   S ::= [symbol, (S,symbol) ]
        #
        if self.parent is None or not isinstance(self.parent, Assoc):
            assoc = None
        else:
            assoc = self.parent.assoc
        if self.min == 0:
            productions = [(), (self,)]
        else:
            productions = seq(seq(symbol, max(0, self.min - 1)), self)
        if assoc == 'r':
            my_productions = symbol + seq(symbol, self)
        elif assoc is None or assoc == 'l':
            my_productions = symbol + seq(self, symbol)
        else:
            msg = "Can't implement %s associativity on %r"
            raise GrammarError(msg % (assoc, self))
        self.emit(my_productions)
        return productions
    def __repr__(self):
        # Render in the "sym * ..." operator form used to build repeats.
        nested = self.nested[0].nested_repr()
        if len(self.nested[0].nested) >= 2:
            nested = '(%s)' % (nested,)
        if type(self) is not Repeat:
            return "%s * %s" % (nested, self.__class__.__name__)
        if self.min == self.max:
            return "%s * %r" % (nested, self.min)
        if self.max is not None:
            return "%s * (%r, %r)" % (nested, self.min, self.max)
        if self.min != 0:
            return "%s * (%r,)" % (nested, self.min)
        return "%s * ()" % (nested,)
#
# Optional - ie 0 or 1.
#
class Opt(Repeat):
    """Match the symbol zero or one times."""
    __slots__ = ()

    def __init__(self, symbol):
        super(Opt, self).__init__(symbol, min=0, max=1)
#
# 1 or more.
#
class Some(Repeat):
    """Match the symbol one or more times."""
    __slots__ = ()

    def __init__(self, symbol):
        super(Some, self).__init__(symbol, min=1, max=None)
#
# 0 or more.
#
class Many(Repeat):
    """Match the symbol zero or more times."""
    __slots__ = ()

    def __init__(self, symbol):
        super(Many, self).__init__(symbol, min=0, max=None)
#
# Generate tokens by splitting a string.
#
class Tokens(Nonterm):
    """Build a choice of Tokens/Keywords from whitespace separated strings.

    literals is a string of whitespace separated literal tokens; keywords
    likewise for Keywords.  case is passed through to every token built.
    """
    __slots__ = ()

    def __init__(self, literals, keywords=None, case=None):
        tokens = []
        if literals and literals.strip():
            for word in literals.strip().split():
                tokens.append(Token(word, case=case))
        if keywords and keywords.strip():
            for word in keywords.strip().split():
                tokens.append(Keyword(word, case))
        super(Tokens, self).__init__(*tokens)

    def resolve_symbol(self, name, rules, token_registry):
        # A Tokens() matches any one of the tokens it was built from.
        return Choice(*self.nested)
#
# The Tokeniser() breaks up input into tokens.
#
class Tokeniser(object):
    """The inbuilt tokeniser.

    compile_tokens() folds every Token in the owning TokenRegistry into
    one big alternation re; tokeniser() then uses that re to turn a
    stream of strings into (token, data, stream_offset, line, column)
    tuples ready for the parser.
    """
    literals = None             # dict, {"literal": token, ...}
    regex = None                # object, re.compile() - compiled token recognisers
    re_groups = None            # tuple, (int, ...)
    re_list = None              # tuple, (Token(), ...)
    registry = None             # object, TokenRegistry() that owns us
    unrecognised = None         # object, The UnrecognisedToken()
    re_flags = re.DOTALL | re.MULTILINE
    # Matches an entire re that contains a \A or \Z anchor, skipping over
    # escapes and character classes so a [\A] isn't mistaken for one.
    ANCHOR_RE = re.compile(r'(?:[^[\\]|\\.|\[\^?\]?(?:\\.|[^]\\])*\])*\\[AZ]')
    # Captures the first backreference (\1 ...) in an re, again skipping
    # escapes and character classes.
    BACKREF_RE = re.compile(
        r'(?:[^[\\]|\\[^0-9]|\[\^?\]?(?:\\.|[^]\\])*\])*(\\[0-9]+)')
    # Consumes everything except a capturing "(" - what survives .sub('')
    # is one "(" per capture group, so len() counts the groups.
    CAPTURE_RE = re.compile(r'[^[\\(]|\\.|\[\^?\]?(?:\\.|[^]\\])*\]|\(\?')
    def compile_tokens(self, token_registry, whitespace):
        """Build the master re recognising every Token in token_registry.

        whitespace is the string of characters allowed to separate
        tokens, or None for the default.  Raises GrammarError if the
        token set is inconsistent.
        """
        self.registry = token_registry
        #
        # Whitespace must be a string.
        #
        if whitespace is not None:
            if not isinstance(whitespace, string_types):
                raise GrammarError("WHITESPACE must be a string")
        all_tokens = (
            token
            for token in self.registry.values()
            if isinstance(token, Token))
        # Sort so the generated re is stable from run to run.
        key = lambda t: (to_str(t.literal), to_str(t.re))
        all_tokens = sorted(all_tokens, key=key, reverse=True)
        patterns = [token for token in all_tokens if token.re is not None]
        self.literals = {}
        # A token with neither re nor literal is the UnrecognisedToken()
        # that soaks up input nothing else matches (None if absent).
        self.unrecognised = next(
            (t for t in all_tokens if t.re is None and t.literal is None),
            None)
        #
        # It's amazing what unit testing turns up.  Would anybody really use
        # the inbuilt tokeniser without recognising a single token?
        #
        self.re_list = []
        self.re_groups = []
        if not patterns:
            pattern = "x(?<=y)"         # An re that never matches
        else:
            #
            # Currently Python's re module returns the first re that matches
            # when given the sequence a|b|c.  We always want it to match the
            # longest possible literal.  In Python 2.7 putting the longest
            # literals first makes that happen.
            #
            # Backreferences are allowed, but since grammar writer has no idea
            # what order we will put them in we have to renumber them.
            #
            longest = lambda token: (token.literal is not None, -len(token.re))
            ordered_patterns = []
            backref_base = 0
            for token in sorted(patterns, key=longest):
                base = backref_base
                backref_base += 1
                # The leading empty group "()" is a marker: the alternative
                # whose marker group captured '' is the one that matched.
                token_re = '(?:()(?:%s))' % token.re
                self.re_groups.append(backref_base)
                self.re_list.append(token)
                # Renumber this token's backreferences to account for the
                # capture groups of preceding alternatives.  Substitute
                # right-to-left so earlier match offsets stay valid.
                backref_matches = tuple(self.BACKREF_RE.finditer(token_re))
                for match in reversed(backref_matches):
                    backref_no = int(match.group(1)[1:], 10) + backref_base
                    token_re = "%s\\%d%s" % (
                        token_re[:match.start(1)],
                        backref_no,
                        token_re[match.end(1):])
                # Skip past however many capture groups this alternative
                # contributed to the combined pattern.
                backref_base = base + len(self.CAPTURE_RE.sub('', token_re))
                ordered_patterns.append(token_re)
            pattern = '|'.join(ordered_patterns)
        self.regex = re.compile(pattern, self.re_flags)
        self.re_list = tuple(self.re_list)
        self.re_groups = tuple(self.re_groups)
        #
        # Gather all literals and checking there are no duplicates.
        #
        all_re = {}
        for token in all_tokens:
            #
            # Ensure the re isn't duplicated.
            #
            if token.re is not None:
                if token.re in all_re:
                    msg = "Token's %r and %r define the same re"
                    raise GrammarError(msg % (token, all_re[token.re]))
                all_re[token.re] = token
            if token.literal is not None:
                self.literals[token.literal] = token
                #
                # Ensure the literal is matched by exactly one Token.
                # This also ensures there are no duplicate literals.
                #
                matches = []
                for re_tok in patterns:
                    # Skip \A / \Z anchored res: re.match() on the bare
                    # literal can't evaluate them meaningfully.
                    if self.ANCHOR_RE.match(re_tok.re):
                        continue
                    match = re.match(re_tok.re, token.literal)
                    if match is not None:
                        if (re_tok.literal is None or
                                match.group() == token.literal):
                            matches.append((re_tok, match))
                if not matches:
                    msg = "Keyword %r does not match any re"
                    raise GrammarError(msg % (token,))
                if len(matches) == 1:
                    token.owner = matches[0][0]
                else:
                    re_matches = [
                        m[0] for m in matches if m[0].literal is None]
                    if re_matches:
                        msg = (
                            "Literal token %s should be a Keyword " +
                            "as it matches re token %s")
                        res = ', '.join(str(match) for match in re_matches)
                        raise GrammarError(msg % (token, res))
                    # NOTE(review): "m is not token" compares a
                    # (re_tok, match) tuple with a Token, so it is always
                    # true and this filter is a no-op - it looks like
                    # "m[0] is not token" was intended, so the error below
                    # may name the token as its own duplicate.
                    matches = [m for m in matches if m is not token]
                    msg = "duplicate literal %r and %r"
                    raise GrammarError(msg % (matches[0][0], token,))
                # NOTE(review): "match" here is left over from the loop
                # above, so the reported re may not be the partial one.
                if not any(m.group() == token.literal for t, m in matches):
                    msg = "Token.re %r partially matches %r of Keyword %r"
                    raise GrammarError(msg % (matches[0], match, token))
    #
    # This generator is a filter.  It takes a string generator as an argument
    # and generates tokens ready to be fed into the parser.  The strings
    # returned by the generator are assumed to be entire tokens.
    #
    def tokeniser(self, input, whitespace=None):
        """Generate (token, data, stream_offset, line, column) tuples.

        input is a single string or an iterable mixing strings with
        pre-made token tuples; non-string items are passed through
        unchanged.  Raises TokenError on unmatchable input unless an
        UnrecognisedToken is registered.
        """
        #
        # pos = [position_in_stream, line_number, column_number]
        #
        def update_position(data):
            ldata = len(data)
            pos[0] += ldata
            # \n, \r, \n\r and \r\n each count as one line ending.
            matches = list(re.finditer("(?:\n\r?|\r\n?)", data))
            if not matches:
                pos[2] += ldata
            else:
                pos[1] += len(matches)
                pos[2] = ldata - matches[-1].end() + 1
        pos = [0, 1, 1]
        #
        # Normalise the parameters.
        #
        iterator = iter((input,) if isinstance(input, string_types) else input)
        if whitespace == "":
            is_whitespace = lambda s: False
            last_whitespace = lambda s: len(s)
        else:
            spaces = " \f\n\r\t\v" if whitespace is None else whitespace
            is_whitespace = lambda s: not s.lstrip(spaces)
            # Map every whitespace character onto spaces[0] so a single
            # rfind() locates the last whitespace in a string.
            trans = string_maketrans(spaces, spaces[0] * len(spaces))
            last_whitespace = lambda s: s.translate(trans).rfind(spaces[0])
        #
        # Loop until end of the stream.
        #
        cur_tok = next(iterator, None)
        cur_isstr = isinstance(cur_tok, string_types)
        nxt_tok = next(iterator, None)
        nxt_isstr = isinstance(nxt_tok, string_types)
        while cur_tok is not None:
            buf = ""
            #
            # Loop while the current token is a string.
            #
            while cur_isstr:
                #
                # If the next token isn't a string we must parse all of the
                # input, otherwise we only parse up to the last space.  This
                # somewhat reduces the chance of tokens being truncated across
                # iterator boundaries.
                #
                last = last_whitespace(cur_tok) if nxt_isstr else len(cur_tok)
                if last == -1:
                    # No whitespace at all: accumulate and read more input.
                    buf += cur_tok
                else:
                    last += len(buf)
                    buf += cur_tok
                    offset = 0
                    while offset < last:
                        match = self.regex.search(buf, offset, last)
                        if match is not None:
                            start, end = match.span()
                        else:
                            if nxt_isstr:
                                break
                            start, end = last, last
                        #
                        # The only thing that can separate one token and the
                        # next is whitespace.
                        #
                        if offset < start:
                            in_between = buf[offset:start]
                            if not is_whitespace(in_between):
                                if self.unrecognised is None:
                                    msg = (
                                        "Unrecognised token %r " +
                                        "at line %d column %d"
                                    )
                                    raise TokenError(
                                        msg % (in_between, pos[1], pos[2]),
                                        in_between, pos[0], pos[1], pos[2])
                                yield (
                                    self.unrecognised, in_between,
                                    pos[0], pos[1], pos[2])
                            update_position(in_between)
                            if start == last:
                                break
                        #
                        # Found some data that matches a token.  Identify what
                        # token it matches.
                        #
                        data = match.group()
                        try:
                            token = self.literals[data]
                        except KeyError:
                            try:
                                # Case insensitive literals are stored lower
                                # cased; reject the hit if the stored token
                                # is actually case sensitive.
                                token = self.literals[data.lower()]
                                if token.case:
                                    token = None
                            except KeyError:
                                token = None
                        if token is None:
                            # Not a literal: the alternative whose marker
                            # group captured '' is the one that matched.
                            idx = match.group(*self.re_groups).index('')
                            token = self.re_list[idx]
                        if token.refine is not None:
                            token = token.refine(self.registry, data)
                        yield token, data, pos[0], pos[1], pos[2]
                        update_position(data)
                        offset = end
                    # Keep the unconsumed tail for the next chunk.
                    buf = buf[offset:]
                cur_tok = nxt_tok
                cur_isstr = nxt_isstr
                nxt_tok = next(iterator, None)
                nxt_isstr = isinstance(nxt_tok, string_types)
            #
            # If we are given non-strings pass it straight on.
            #
            while cur_tok is not None and not cur_isstr:
                yield cur_tok
                cur_tok = nxt_tok
                cur_isstr = nxt_isstr
                nxt_tok = next(iterator, None)
                nxt_isstr = isinstance(nxt_tok, string_types)
#
# Meta class that does the work for a TokenRegistry.
#
class TokenRegistryMeta(type):
    """Metaclass of TokenRegistry: snapshots token dicts at class creation."""

    def __new__(cls, name, bases, dct):
        new_registry = super(TokenRegistryMeta, cls).__new__(
            cls, name, bases, dct)
        # The base registry class declares "__metaclass__" as this very
        # metaclass and is skipped; real registries get their dicts saved.
        if dct.get("__metaclass__", None) is not cls:
            new_registry.save_dicts()
        return new_registry
#
# Put your token definitions in a class that inherits from this one.
#
class TokenRegistry(dict):
    """Name space for a grammar's tokens.

    Subclass it and assign TokenSymbol instances to class attributes.
    The registry maps each token's qualified name to the single instance
    that represents it, merging duplicate definitions.
    """
    __dicts = None              # dict, {token_symbol: dict} saved by save_dicts()
    __metaclass__ = TokenRegistryMeta
    __tokeniser = None          # object, Tokeniser() built by compile_tokens()
    def __init__(self):
        # Give every token assigned to a class attribute its qualified
        # "ClassName.attr" name, then register it.
        for name, token_symbol in self.__class__.__dict__.items():
            if isinstance(token_symbol, TokenSymbol):
                qualified_name = "%s.%s" % (self.__class__.__name__, name)
                token_symbol.set_name(qualified_name)
                self._resolve_token_(token_symbol)
    #
    # Save the registered token's dicts, as they may be overwritten
    # by the Grammar.  This is called by the meta class, so it happens
    # before the Grammar has a chance to get it's fingers into the pie.
    #
    def save_dicts(cls):
        cls.__dicts = {}
        for token_symbol in cls.__dict__.values():
            if isinstance(token_symbol, TokenSymbol):
                cls.__dicts[token_symbol] = token_symbol.dict
                token_symbol.dict = {}
    save_dicts = classmethod(save_dicts)
    #
    # Return the dict's of all registered tokens.
    #
    def restore_dicts(cls):
        for token_symbol, token_dict in cls.__dicts.items():
            token_symbol.dict = token_dict
        del cls.__dicts
    restore_dicts = classmethod(restore_dicts)
    #
    # Resolve duplicate tokens.
    #
    def _resolve_token_(self, token_symbol):
        # The first token registered under a name wins; later tokens with
        # the same name are merged into it and callers get the survivor.
        alias = self.get(token_symbol.name, None)
        if alias is None:
            alias = token_symbol
            super(TokenRegistry, self).__setitem__(alias.name, alias)
        if alias is not token_symbol:
            alias.merge(token_symbol)
        return alias
    #
    # Compile the re that recognises the tokens.
    #
    def compile_tokens(self, whitespace=None):
        self.__tokeniser = Tokeniser()
        self.__tokeniser.compile_tokens(self, whitespace)
    #
    # Return a generator for tokens in the grammar.
    #
    def tokeniser(self, input, whitespace=None):
        return self.__tokeniser.tokeniser(input, whitespace)
    # The registry is read-only to the outside world.
    def __setitem__(self, key, value):
        raise NotImplementedError()
    def __delitem__(self, key):
        raise NotImplementedError()
# Apply the metaclass in a way that works on both Python 2 and 3.
TokenRegistry = python3_metaclass(TokenRegistry)
#
# A Token in the grammar. An instance of a TokenSymbol() defines one kind
# of token.
#
class TokenSymbol(Symbol):
    """Base class for every kind of token a grammar can use.

    One instance defines one kind of token.  Subclasses are MetaToken,
    UserToken and Token.
    """
    __slots__ = ('name', 'named',)

    def __init__(self, name):
        super(TokenSymbol, self).__init__()
        self.name = name
        self.named = False
        # A token's first set is just itself.
        self.first_set = frozenset((self,))

    def resolve_symbol(self, name, rules, token_definitions):
        # Collapse duplicate definitions onto the registered instance.
        return token_definitions._resolve_token_(self)

    #
    # Absorb another token definition into this one.
    #
    def merge(self, other):
        msg = "Token %r doesn't support merging with token %r"
        raise GrammarError(msg % (self, other))

    def compile_symbol(self, comp):
        # A token compiles to a single one-symbol production.
        return [(self,)]

    #
    # Set the name of this token.
    #
    def set_name(self, name):
        if self.name is not None and self.name != name:
            msg = "Can not rename token %r to %r" % (self.name, name)
            raise GrammarError(msg)
        if self.name is None:
            self.name = name
        self.named = True

    def __repr__(self):
        return str(self)

    def __str__(self):
        return str(self.name)

    #
    # Given a tuple returned by the tokeniser, return an English
    # description of where we are in the input stream.
    #
    def position(self, token_tuple):
        return None
#
# MetaToken's are used internally by the Parser. They are re-usable
# by multiple grammar's.
#
class MetaToken(TokenSymbol):
    """A token used internally by the Parser; re-usable across grammars."""
    __slots__ = ()

    def __init__(self, name):
        super(MetaToken, self).__init__(name)

    def __repr__(self):
        return str(self.name)

    def __str__(self):
        return self.name

    def position(self, token_tuple):
        # Meta tokens have no stream position; describe them by name.
        return self.name
#
# A user generated token.
#
class UserToken(TokenSymbol):
    """A token generated by the user's own tokeniser, not the inbuilt one."""
    __slots__ = ()

    def __init__(self, name=None):
        super(UserToken, self).__init__(name)

    def resolve_symbol(self, name, rules, token_definitions):
        # A user token only gets a name by being placed in a TokenRegistry.
        if self.name is None:
            msg = "A %s must be assigned a name using a %s" % (
                self.__class__.__name__, TokenRegistry.__name__)
            raise GrammarError(msg)
        return super(UserToken, self).resolve_symbol(
            name, rules, token_definitions)

    def merge(self, other):
        # Merging two user tokens is a silent no-op; anything else is
        # rejected by the base class.
        if isinstance(other, UserToken):
            return
        super(UserToken, self).merge(other)
#
# A Token() built by the inbuilt tokeniser. A token comes in two
# varieties:
#
# - A token defined by a regular expression.
# - A keyword, which is a special purposed token.
#
class Token(TokenSymbol):
    """A token recognised by the inbuilt tokeniser.

    A Token is defined either by a literal (a fixed string) or an re; a
    Keyword is a literal that must also be matched by some other token's
    re, and the UnrecognisedToken has neither literal nor re.
    """
    _RE = re                    # the re module
    KEYWORD = object()          # kind marker, see Keyword()
    UNRECOGNISED = object()     # kind marker, see UnrecognisedToken()
    __slots__ = ('case', 'literal', 'owner', 're', 'refine')
    def __init__(
            self, literal=None, re=None, case=None, kind=None, refine=None):
        """literal/re define the token; case=False makes a literal case
        insensitive; kind is KEYWORD, UNRECOGNISED or None; refine is a
        callable(registry, data) the tokeniser uses to replace the token.
        """
        # Validate the permitted argument combinations for each kind.
        if kind is self.KEYWORD:
            if re is not None:
                raise GrammarError("A keyword must not have an re")
            if refine is not None:
                raise GrammarError("A keyword can not be refined")
            if literal is None:
                raise GrammarError("A keyword must have a literal")
        elif kind is self.UNRECOGNISED:
            if literal is not None or re is not None:
                msg = "The UnrecognisedToken can't have a literal or re"
                raise GrammarError(msg)
        elif kind is not None:
            raise GrammarError("Unrecognised Token kind %r" % (kind,))
        else:
            if literal is None and re is None:
                raise GrammarError("A Token must have a literal or an re")
            if literal is not None:
                if re is not None:
                    msg = "A Token can't have both a literal and a re"
                    raise GrammarError(msg)
                if refine is not None:
                    raise GrammarError("A literal can't be refined")
        # str(self), used as the token's name below, reads literal, re
        # and named - so they must be assigned before the base __init__.
        self.literal = literal
        self.re = re
        self.named = False
        super(Token, self).__init__(str(self))
        self.case = case if case is not None else True
        self.refine = refine
        self.owner = self
        if kind is not None:
            # Keywords and the UnrecognisedToken carry no re of their own.
            pass
        elif re is not None:
            # Fail early if the supplied re does not compile.
            self._RE.compile(re)
        else:
            # A literal: derive its re by escaping it; if case insensitive
            # let each letter match either case.
            self.re = self._RE.escape(self.literal)
            if not self.case and self.literal.lower() != self.literal.upper():
                def either_case(match):
                    char = match.group()
                    return "[%c%c]" % (char.upper(), char.lower())
                self.re = self._RE.sub("[a-zA-Z]", either_case, self.re)
    def __repr__(self):
        if self.literal is not None:
            result = repr(self.literal)
        elif self.re is not None:
            result = self.repr_re()
        else:
            result = "%s()" % (UnrecognisedToken.__name__,)
        if self.named:
            return '%s=%s' % (self, result,)
        return result
    def __str__(self):
        if self.named:
            return super(Token, self).__str__()
        if self.literal:
            return repr(self.literal)
        if self.re is not None:
            return self.repr_re()
        return ''
    def repr_re(self):
        # Display the re as /pattern/, escaping embedded slashes.
        return "/%s/" % (repr(self.re)[1:-1].replace("/", "\\/"),)
    def merge(self, other):
        """Absorb another Token definition, reconciling refine callbacks."""
        #
        # We are only merge with like.
        #
        if not isinstance(other, Token):
            TokenSymbol.merge(self, other)
        if self.refine is None:
            self.refine = other.refine
        elif other.refine is not None and other.refine != self.refine:
            msg = "Token %r defined with conflicting refine's %r and %r"
            raise GrammarError(msg % (self.name, self.refine, other.refine))
    #
    # Set the name of this token.
    #
    def set_name(self, name):
        if self.named and self.name != name:
            msg = "Can not rename token %r to %r" % (self.name, name)
            raise GrammarError(msg)
        self.name = name
        self.named = True
    #
    # Our tokeniser puts the line and column in the tuple.
    #
    def position(self, token_tuple):
        if len(token_tuple) < 5 or None in token_tuple[3:5]:
            return super(Token, self).position(token_tuple)
        return "line %d column %d" % token_tuple[3:5]
#
# The Unrecognised Token.
#
def UnrecognisedToken():
    """Factory for the token that soaks up input no other token matches."""
    return Token(kind=Token.UNRECOGNISED)
#
# A Keyword is literal without a regexp, ie it must match an existing regexp.
#
def Keyword(literal, case=None):
    """Factory for a keyword: a literal that some other token's re matches."""
    return Token(literal, case=case, kind=Token.KEYWORD)
#
# How we handle non-Symbol types in Symbol expression.
#
Symbol.CAST = {
    str: Token,                 # bare strings become literal Tokens
    tuple: lambda t: Prio(*t),  # tuples become Prio(...) symbols
}
if sys.version_info < (3,):
    # Python 2 unicode strings cast to literal Tokens too.
    Symbol.CAST[unicode] = Token
class Production(object):
    """One grammar production: lhs derives the sequence of symbols rhs."""
    __slots__ = ("lhs", "rhs")

    def __init__(self, lhs, rhs):
        self.lhs = lhs
        self.rhs = tuple(rhs)

    def __repr__(self):
        rhs_text = " ".join(str(symbol) for symbol in self.rhs)
        return "%s = %s" % (self.lhs, rhs_text)
#
# Parser Constructor.
#
class GrammarMeta(type):
    """Metaclass of Grammar: builds a Parser for every grammar subclass.

    The Parser is stored on the class as "_parser_", which is why that
    attribute name is reserved.
    """
    def __new__(cls, name, bases, dct):
        if "_parser_" in dct:
            raise GrammarError("_parser_ is reserved in Grammars.")
        # The base Grammar class declares "__metaclass__" as this very
        # metaclass and is skipped; only real grammars get a Parser.
        if dct.get("__metaclass__", None) is not cls:
            dct["_parser_"] = Parser(name, dct)
        return super(GrammarMeta, cls).__new__(cls, name, bases, dct)
#
# The base class for Parsers.
#
class Grammar(object):
    """The class a user's grammar must inherit from.

    GrammarMeta attaches a Parser to each subclass as cls._parser_.
    Every classmethod below simply forwards to the module level function
    of the same name; the "name = classmethod(name)" assignment style
    matches the conventions used throughout this file.
    """
    __metaclass__ = GrammarMeta
    def compile_grammar(cls):
        compile_grammar(cls)
    compile_grammar = classmethod(compile_grammar)
    def epoch_symbol(cls):
        return epoch_symbol(cls)
    epoch_symbol = classmethod(epoch_symbol)
    def parse(cls, input, tree_factory=None, on_error=None, log=None):
        return parse(cls, input, tree_factory, on_error, log)
    parse = classmethod(parse)
    def pre_compile_grammar(cls, pre_compiled=None):
        return pre_compile_grammar(cls, pre_compiled)
    pre_compile_grammar = classmethod(pre_compile_grammar)
    def repr_grammar(cls):
        return repr_grammar(cls)
    repr_grammar = classmethod(repr_grammar)
    def repr_parse_table(cls, state=None):
        return repr_parse_table(cls, state)
    repr_parse_table = classmethod(repr_parse_table)
    def repr_parse_tree(cls, tree, indent=None):
        return repr_parse_tree(tree, indent)
    repr_parse_tree = classmethod(repr_parse_tree)
    def repr_productions(cls):
        return repr_productions(cls)
    repr_productions = classmethod(repr_productions)
    def unused_rules(cls):
        return unused_rules(cls)
    unused_rules = classmethod(unused_rules)
# Apply the metaclass in a way that works on both Python 2 and 3.
Grammar = python3_metaclass(Grammar)
def compile_grammar(grammar):
    """Module level equivalent of Grammar.compile_grammar()."""
    parser = grammar._parser_
    parser.compile_grammar()
def epoch_symbol(grammar):
    """Module level equivalent of Grammar.epoch_symbol()."""
    parser = grammar._parser_
    return parser.epoch_symbol
def parse(grammar, input, tree_factory=None, on_error=None, log=None):
    """Module level equivalent of Grammar.parse()."""
    parser = grammar._parser_
    return parser.parse(input, tree_factory, on_error, log)
def pre_compile_grammar(grammar, pre_compiled=None):
    """Module level equivalent of Grammar.pre_compile_grammar()."""
    parser = grammar._parser_
    return parser.pre_compile_grammar(grammar, pre_compiled)
def repr_grammar(grammar):
    """Module level equivalent of Grammar.repr_grammar()."""
    parser = grammar._parser_
    return parser.repr_grammar()
def repr_parse_table(grammar, state=None):
    """Module level equivalent of Grammar.repr_parse_table()."""
    parser = grammar._parser_
    return parser.repr_parse_table(state)
def repr_productions(grammar):
    """Module level equivalent of Grammar.repr_productions()."""
    parser = grammar._parser_
    return parser.repr_productions()
def unused_rules(grammar):
    """Module level equivalent of Grammar.unused_rules()."""
    parser = grammar._parser_
    return parser.unused_symbols
def repr_parse_tree(tree, indent=None):
    """Return a printable representation of a parse tree.

    tree is a tuple whose first element is the Symbol and whose remaining
    elements are its children (token tuples or nested trees).  indent is
    the string used for one level of indentation, None for the default
    of one space, or False to put the entire tree on a single line.
    """
    def indent_tree(tree, padding):
        #
        # Append tokens and empty productions to the current line.
        #
        def extend_line(line, prod):
            while True:
                while prod and isinstance(prod[0][0], TokenSymbol):
                    line.append(repr_token(prod.popleft()))
                if not prod or len(prod[0]) != 1:
                    break
                line.append("(%s)" % (prod.popleft()[0],))
        # Tokens from the inbuilt tokeniser display their matched data,
        # anything else displays its symbol.
        repr_token = (
            lambda t: repr(t[1]) if isinstance(t[0], Token) else str(t[0]))
        #
        # Were we passed a token?
        #
        if isinstance(tree[0], TokenSymbol):
            return [repr_token(tree)]
        #
        # List singleton productions (ie productions of the form rule1 = rule2)
        # on the same line.
        #
        line, result = [], []
        nesting = 1
        while len(tree) == 2 and not isinstance(tree[1][0], TokenSymbol):
            line.append("(%s" % (tree[0],))
            nesting += 1
            tree = tree[1]
        line.append("(%s" % (tree[0],))
        #
        # If the remainder of the symbols are tokens just list them as well.
        #
        prod = collections.deque(tree[1:])
        extend_line(line, prod)
        if not prod:
            result.append('%s%s' % (padding, ' '.join(line)))
        else:
            #
            # If we have rule1 = ... ^ rule2, then list them as:
            #
            #   (rule1 ... rule2
            #       rule2-child0
            #       ...)
            #
            # If we have rule1 = ... ^ rule2 ... then list them as:
            #
            #   (rule1 ...
            #       (rule2
            #           ...)
            #       ...)
            #
            if len(prod) > 1:
                result.append('%s%s' % (padding, ' '.join(line)))
            elif (len(prod) == 1 and
                    all(isinstance(t[0], TokenSymbol) for t in prod[0][1:])):
                # The sole child's children are all tokens - fold the
                # entire child onto the current line.
                last = prod.popleft()
                line.append("(%s" % last[0])
                line.extend(repr_token(t) for t in last[1:])
                line[-1] += ")"
                result.append('%s%s' % (padding, ' '.join(line)))
            else:
                # Descend into the sole child, indenting its children.
                line.append('(%s' % (prod[0][0],))
                result.append('%s%s' % (padding, ' '.join(line)))
                prod = collections.deque(prod[0][1:])
                if len(prod[0]) == 1 or isinstance(prod[0][0], TokenSymbol):
                    line = []
                    extend_line(line, prod)
                    result.append('%s%s' % (padding + indent, ' '.join(line)))
            # Recurse into the remaining children, folding trailing
            # tokens onto the last generated line.
            while prod:
                result.extend(indent_tree(prod.popleft(), padding + indent))
                line = [result[-1]]
                extend_line(line, prod)
                result[-1] = ' '.join(line)
        result[-1] += ')' * nesting
        return result
    eol = ' ' if indent is False else '\n'
    indent = " " if indent is None else indent or ""
    return eol.join(indent_tree(tree, ""))
# vim: set shiftwidth=4 expandtab softtabstop=8 :