# Lrparsing.py is a LR(1) parser hiding behind a pythonic interface. It takes # as input a grammar and a string to be parsed, and outputs the parse tree. # # Copyright (c) 2013,2014,2015,2016,2017,2018 Russell Stuart. # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published # by the Free Software Foundation, either version 3 of the License, or (at # your option) any later version. # # The copyright holders grant you an additional permission under Section 7 # of the GNU Affero General Public License, version 3, exempting you from # the requirement in Section 6 of the GNU General Public License, version 3, # to accompany Corresponding Source with Installation Information for the # Program or any work based on the Program. You are still required to # comply with all other Section 6 requirements to provide Corresponding # Source. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # import ast import collections import hashlib import itertools import re import string import sys # # Python2/3 compatibility hacks. # # Written this odd way to preserve 100% test coverage. # StandardError = Exception if sys.version_info >= (3,) else StandardError python3_metaclass = (lambda cls: cls) if sys.version_info < (3,) else ( lambda cls: cls.__metaclass__( cls.__name__, cls.__bases__, dict(cls.__dict__))) string_types = basestring if sys.version_info < (3,) else str string_maketrans = ( string.maketrans if sys.version_info < (3,) else str.maketrans) to_str = lambda s: s if isinstance(s, string_types) else str(s) # # Common base class for all exceptions here. 
# class LrParsingError(StandardError): pass # # Raised if the Grammar isn't valid. # class GrammarError(LrParsingError): pass # # Base class for errors raised at parsing time. # class ParsingError(LrParsingError): pass # # Raised if an invalid syntax is given. # class TokenError(ParsingError): def __init__(self, message, data, offset, line, column): self.data = data self.offset = offset self.line = line self.column = column super(TokenError, self).__init__(message, data, offset, line, column) # # Raised if an invalid syntax is given. # class ParseError(ParsingError): input_token = None stack = None def __init__(self, input_token, stack): def comma_or(lst): strs = sorted("%s" % l for l in lst) if len(strs) == 1: return strs[0] return ', '.join(strs[:-1]) + ' or ' + strs[-1] self.input_token = input_token self.stack = stack lr1_state = stack[-1][0] if len(lr1_state.actions) >= 10: msg = "Got unexpected %s" % (input_token[0],) else: msg = "Got %s when expecting %s" % ( input_token[0], comma_or(lr1_state.actions)) if len(lr1_state.rules) < 10: msg += " while trying to match %s" % (comma_or(lr1_state.rules)) msg += " in state %d" % (lr1_state.id,) position = input_token[0].position(input_token) if position: msg = position + ": " + msg super(ParseError, self).__init__(msg) # # Print a set of symbols. # def str_symbol_set(symbol_set): return '[%s]' % (','.join(sorted(str(symbol) for symbol in symbol_set)),) # # An LR(0) Item. An item is just a production and the position the parser # is up to in parsing it - the dot_pos. Eg: # # sym0 sym1 . sym2 # # means the parser is processing a production wants to see the symbols # "sym0 sym1 sym2" in order, and it has seen sym1 and sym2. 
# class Lr0Item(object): __slots__ = ('dot_pos', '_key', 'lr0_item', 'production') def __new__(cls, production, dot_pos, cache): lr0_item = (production, dot_pos) result = cache.get(lr0_item, None) if result is None: result = super(Lr0Item, cls).__new__(cls) result.lr0_item = lr0_item result.dot_pos = dot_pos result.production = production result._key = ( (str(result.production.lhs),) + tuple(str(sym) for sym in result.production.rhs) + (result.dot_pos,) ) cache[result.lr0_item] = result return result def __repr__(self): rhs = self.production.rhs ll = lambda s, e: [str(symbol) for symbol in rhs[s:e]] prod = ll(0, self.dot_pos) + ['^'] + ll(self.dot_pos, len(rhs)) return "%s = %s" % (self.production.lhs, ' '.join(prod)) def key(cls, lr0_item): return lr0_item._key key = classmethod(key) # # An item in an LR(1) grammar. It is just a LR(0) item, together with the # set a of tokens that could follow the production called the lookahead. # Eg, given the Item(): # # L ::= sym0 sym1 sym2 . [tokA, tokB] # # The parser has seen all the symbols in this production, so if the next token # to be processed is tokA or tokB, the production can be reduced (ie replaced) # with it's left hand side, ie L # class Lr1Item(object): __slots__ = ('dot_pos', 'lookahead', 'lr0_item', 'production') def __init__(self, lr0_item, lookahead): self.lr0_item = lr0_item self.production = self.lr0_item.production self.dot_pos = self.lr0_item.dot_pos self.lookahead = lookahead def lr1_shift(self, cache): lr0_item = Lr0Item(self.production, self.dot_pos + 1, cache) return Lr1Item(lr0_item, set(self.lookahead)) def __repr__(self): return "%r %s" % (self.lr0_item, str_symbol_set(self.lookahead),) def sorted(cls, iterable): return sorted(iterable, key=lambda item: Lr0Item.key(item.lr0_item)) sorted = classmethod(sorted) # # An Lr0Kernel. A "kernel" is the state of the parser. It is just a set of # items (Lr0Items in the case of an Lr0Kernels). 
# The initial kernel is just
# the grammar's start production, eg:
#
#   G ::= . E
#
# Successive kernels are generated by looking every production that can
# be reached from a previous kernel if a particular given token is seen.
# For example, if the rest of the grammar is:
#
#   E ::= E / E
#   E ::= n
#
# Then after seeing an n, the Lr0Kernel would be:
#
#   E ::= n .
#
class Lr0Kernel(object):
    __slots__ = ("lr0_items",)

    # Lr0Kernel's are interned via 'cache', keyed on the frozenset of the
    # Lr0Items appearing in the passed Lr1Items.
    def __new__(cls, lr1_items, cache):
        lr0_items = frozenset(item.lr0_item for item in lr1_items)
        result = cache.get(lr0_items, None)
        if result is None:
            result = super(Lr0Kernel, cls).__new__(cls)
            result.lr0_items = lr0_items
            cache[result.lr0_items] = result
        return result


#
# An Lr1State is the compiled version of an ItemSet.  Ie, all the
# information the Parser doesn't need has been discarded.
#
class Lr1State(int):
    # Python 2's int subclasses allow __slots__ with instance attributes;
    # Python 3's do not, so there __slots__ is omitted.
    if sys.version_info < (3,):
        __slots__ = ('actions', 'gotos', 'id', 'rules')

    # An Lr1State *is* its integer id, with the action / goto tables and
    # the rules being matched attached as attributes.
    def __new__(cls, id, actions, gotos, rules):
        result = super(Lr1State, cls).__new__(cls, id)
        result.actions = actions
        result.gotos = gotos
        result.id = id
        result.rules = rules
        return result

    def __repr__(self):
        # Render a single action tuple: 1-tuple is a shift, longer tuples
        # are reduces (with an optional output rule name).
        def p(act):
            if len(act) == 1:
                return "shift %d" % act
            if len(act) == 2 or act[2] is None:
                return "reduce %d %d" % act[:2]
            return "reduce %d %d %s" % act[:3]
        result = [str(self)]
        if self.actions:
            result.append(" -- actions")
            for token in sorted(self.actions, key=lambda sym: str(sym)):
                result.append(" %s: %s" % (token, p(self.actions[token],)))
        if self.gotos:
            result.append(" -- gotos")
            for nonterm_number in sorted(self.gotos, key=lambda sym: str(sym)):
                result.append(
                    " %s: %s" % (nonterm_number, self.gotos[nonterm_number],))
        return '\n'.join(result)

    def __str__(self):
        return "Lr1State:%d" % (int(self),)

    # Flatten this state into plain python data (dicts / tuples / strings)
    # so it can be repr()'ed and later rebuilt by from_flat().
    def to_flat(self, grammar):
        actions = {}
        for token, action in self.actions.items():
            if len(action) == 1:
                new_action = action[0]
            elif action[2] is None:
                new_action = tuple(action[:2])
            else:
                new_action = (action[0], action[1], action[2].name)
            actions[token.name] = new_action
        rules = sorted(rule.name for rule in self.rules)
        return (actions, self.gotos, rules)

    PYTHON_VAR_RE = re.compile(
        "(?i)^[a-z_][a-z_0-9]*(?:[.][a-z_][a-z_0-9]*)*$")

    # Rebuild an Lr1State from the output of to_flat(), resolving rule and
    # token names back to their objects via 'rules' and 'token_registry'.
    def from_flat(cls, index, flat, rules, token_registry):
        actions = {}
        for token_name, action in flat[0].items():
            if not isinstance(action, tuple):
                new_action = (action,)
            elif len(action) == 2:
                new_action = (action[0], action[1], None, None)
            else:
                new_action = (action[0], action[1], rules[action[2]], None)
            token = token_registry[token_name]
            actions[token] = new_action
        target_rules = (rules[rule_name] for rule_name in flat[2])
        return cls(index, actions, flat[1], set(target_rules))
    from_flat = classmethod(from_flat)

    # Lr1State's are ints, so they sort naturally.
    def sorted(cls, table):
        return sorted(table)
    sorted = classmethod(sorted)


#
# An ItemSet is a state in the LR(1) grammar.  An ItemSet is identical in
# concept to an LR(0) kernel, but it consists of Lr1Item's rather than
# Lr0Item's.  In other words, the items contain the tokens they expect to
# follow the production.  These tokens are shown in []'s.
#
# Following on from the example above:
#
#   G ::= .
# E , [__empty__]
#
# And if an n is accepted:
#
#   E ::= n ., [, /]
#
class ItemSet(object):
    ID = 0              # Next id to hand out; bumped on every construction.

    __slots__ = (
        'actions', '_closure', '_goto_cache', 'gotos', 'id',
        '_kernel', '_lhs_prio', 'lr0_kernel', 'prio',)

    # 'items' is an iterable of (Lr1Item, prio_dict) pairs making up the
    # kernel.  prio_dict maps a priority tuple to the set of lookahead
    # tokens that priority applies to.
    def __init__(self, items, cache):
        items = tuple(items)
        self._kernel = dict((item[0].lr0_item, item[0]) for item in items)
        self.prio = dict((item[0].lr0_item, item[1]) for item in items)
        self.lr0_kernel = Lr0Kernel(iter(self), cache)
        self.actions = None
        self.gotos = {}
        self._closure = None
        self._goto_cache = None
        self._lhs_prio = None
        self.id = self.ID
        self.__class__.ID += 1

    def __str__(self):
        return "ItemSet:%d" % (self.id,)

    def __repr__(self):
        result = [str(self)]
        for kernel_item in Lr1Item.sorted(iter(self)):
            line = " %r %s" % (kernel_item, self.repr_prio(kernel_item))
            result.append(line)
        if self._closure:
            result.append(" -- closure")
            for closure_item in Lr1Item.sorted(self._closure.values()):
                line = " %r %s" % (
                    closure_item, self.repr_prio(closure_item))
                result.append(line)
        if self.actions:
            result.append(" -- actions")
            for token in sorted(self.actions, key=lambda action: str(action)):
                actions = self.actions[token]
                # After disambiguation a token maps to a single Action;
                # before it, to a collection of candidate actions.
                if isinstance(actions, Action):
                    lst = repr(actions)
                else:
                    lst = ', '.join(sorted(repr(act) for act in actions))
                result.append(" %s: %s" % (token, lst,))
        if self.gotos:
            result.append(" -- gotos")
            for symbol in sorted(self.gotos, key=lambda symbol: str(symbol)):
                result.append(" %s: %s" % (symbol, self.gotos[symbol],))
        return '\n'.join(result)

    # Render the priority dict of an item as "{prio:[lookaheads], ...}".
    def repr_prio(self, item):
        result = []
        for prio, lookahead in sorted(self.prio[item.lr0_item].items()):
            result.append("%r:%s" % (prio, str_symbol_set(lookahead),))
        return '{%s}' % ', '.join(result)

    #
    # Return all lhs tokens we generate.
    #
    def rules(self):
        return set(item.production.lhs.get_rule() for item in self)
    rules = property(rules)

    #
    # Compute the closure for the passed items.  Ie, given the item
    # {A ::= a . B c}, add {B ::= . C d} and repeat for B.  In other words,
    # the kernel plus its closure contains every production we can be
    # expanding, and the position in them.
    #
    #   X ::= a ^ b c, [l]
    #
    #   b ::= ^ f g, [follow(c)]
    #
    def _close_kernel_items(self, items, cache):
        # Returns True if any closure item gained new lookahead tokens.
        modified = False
        queue = collections.deque(items)
        empty_token = cache['__empty__']
        while queue:
            #
            # Find the next symbol that will be consumed by item.
            #
            item = queue.popleft()
            rhs, dot_pos = item.production.rhs, item.dot_pos
            if dot_pos >= len(rhs):
                continue
            symbol = rhs[dot_pos]
            if isinstance(symbol, TokenSymbol):
                continue
            #
            # Find all tokens we could possibly see after consuming that
            # symbol.
            #
            first_set = self.symbol_seq_first_set(rhs[dot_pos + 1:], cache)
            if empty_token not in first_set:
                lookahead = set(first_set)
                no_empty = first_set
            else:
                # The remainder of the rhs can derive empty, so the item's
                # own lookahead also applies.
                no_empty = first_set - empty_token.first_set
                lookahead = no_empty | item.lookahead
            #
            # If the next symbol is a nonterm, then add all of its productions
            # to the closure.
            #
            for production in symbol.productions:
                lr0_item = Lr0Item(production, 0, cache)
                existing = self._closure.get(lr0_item, None)
                if existing is not None:
                    extra = lookahead - existing.lookahead
                    existing.lookahead |= extra
                    closure = existing
                else:
                    extra = lookahead
                    closure = Lr1Item(lr0_item, lookahead)
                    self._closure[lr0_item] = closure
                    self.prio[closure.lr0_item] = {}
                if not extra:
                    continue
                # The closure item changed; it must be reprocessed.
                queue.append(closure)
                modified = True
                #
                # Push the priority through the closure.  In other words, if
                # the rhs of this production assigned this lhs a priority,
                # then add this priority to the list we inherited from the
                # ItemSet's that goto us.  Eg, given:
                #
                #   START = Prio(b,c) __end_of_input__, b.prio==0 and c.prio==1
                #   b = Prio(d+'X', e+'Y'), so d.prio==0 and e.prio==1
                #   c = Prio(e+'X', d+'Y'), so d.prio==1 and e.prio==0
                #   d = 'T'
                #   e = 'T'
                #
                # The prio of START is always (), so b's prio will be (0,)
                # ie "() + (0,)", and c's prio will be (1,) and after that
                # we expect both to be followed by __end_of_input__.  From b's
                # production d's prio will be b's plus 0, ie
                # "(0,) + (0,) = (0,0,)", but only when followed by a 'X' (ie
                # the lookahead is 'X').  From c's production d's prio will be
                # (1,1), but only when followed by a 'Y'.
                #
                # Thus for 'd' we end up with these priorities:
                #
                #   { (0,0,): set('X'), (0,1,): set('Y') }
                #
                # and for 'e' we end up with:
                #
                #   { (1,0): set('X'), (1,1): set('Y') }
                #
                # If the resulting parser is given the input string:
                #
                #   'T' 'Y' __end_of_input__.
                #
                # First it has to decide whether 'T' is a 'd' or an 'e' as
                # both will resolve to 'T'.  A normal LR(1) would be stuck
                # with a reduce/reduce conflict, but since the d's priority
                # for 'Y' (0,1) < e's priority (1,1), we chose 'd'.  We don't
                # do that choosing here, but we create the priorities so it
                # can happen should a conflict arise.
                #
                rank = closure.production.lhs.rank
                priority = closure.production.lhs.priority
                if priority is not None:
                    rank -= 1
                append = (0,) * max(0, rank - item.production.lhs.rank)
                if priority is not None:
                    append += (priority,)
                prio_items = self.prio[item.lr0_item].items()
                for itemset_prio, item_prio in prio_items:
                    existing_sets = self.prio[closure.lr0_item]
                    if empty_token not in first_set:
                        add = no_empty
                    else:
                        add = no_empty | extra & item_prio
                    existing_prio = itemset_prio[:rank] + append
                    if existing_prio not in existing_sets:
                        existing_sets[existing_prio] = add
                    else:
                        existing_sets[existing_prio] |= add
        return modified

    #
    # Compute the first_set for a sequence of Symbols.  If the sequence is:
    #
    #   sym0 sym1 sym2
    #
    # The first_set is the same as the first_set of sym0, and if the sym0
    # can be empty then the first_set of sym1 and so on.
# def symbol_seq_first_set(cls, symbol_sequence, cache): syms = tuple(symbol_sequence) result = cache.get(syms, None) if result is None: empty_token = cache['__empty__'] first_set = set(empty_token.first_set) for symbol in syms: first_set |= symbol.first_set if empty_token not in symbol.first_set: first_set -= empty_token.first_set break result = frozenset(first_set) cache[syms] = result return result symbol_seq_first_set = classmethod(symbol_seq_first_set) # # Iterating over us returns the Lr1Item's in our kernel. # def __iter__(self): return iter(self._kernel.values()) # # Return a generator for the kernel + closure. # def all_items(self): return itertools.chain(self, self._closure.values()) # # Compute the closure for ourselves. # def compute_closure(self, cache): if self._closure is None: empty_token = cache['__empty__'] for lr0_item, old_prio_dict in self.prio.items(): prio_dict = {} last_lookahead = empty_token.first_set for prio, lookahead in sorted(old_prio_dict.items()): if lookahead != last_lookahead: prio_dict[prio] = lookahead last_lookahead = lookahead self.prio[lr0_item] = prio_dict self._closure = {} self._close_kernel_items(iter(self), cache) # # Calculate the kernel of the goto set, given a particular symbol. # def goto_sets(self, cache): if self._goto_cache is None: dot_symbols = collections.defaultdict(set) for item in self.all_items(): dot_pos, rhs = item.dot_pos, item.production.rhs if dot_pos < len(rhs): dot_symbols[rhs[dot_pos]].add(item) self._goto_cache = {} for symbol in sorted(dot_symbols, key=lambda symbol: symbol.id): gen = ( (item.lr1_shift(cache), self.prio[item.lr0_item]) for item in dot_symbols[symbol]) item_set = ItemSet(gen, cache) self._goto_cache[symbol] = item_set return self._goto_cache # # Check for reduce/reduce compatibility between ItemSet's. The two # ItemSet's passed must have the same same lr0 kernel. ItemSet's with # different lr0 kernels are never compatible. 
# # When to reduce and what to is determined by the lookahead: # # S ::= W; S ::= X; S ::= Y; S ::= Z # W ::= a P i # X ::= a Q j # Y ::= b P j # Z ::= b Q i # P ::= c # Q ::= c # # Produces these ItemSets, among others: # # WX: P -> c ., [i] (i-->reduce(P)); Q -> c ., [j] (j-->reduce(Q)) # YZ: P -> c ., [j] (j-->reduce(Q)); Q -> c ., [i] (i-->reduce(P)) # # In this case merging WX and YZ would produce reduce conflicts because if # we see the token i, we can't both reduce(P) and reduce(Q). # def compatible(self, other): # # Compatible() isn't always called, so lazily evaluate # ItemSet._lhs_prio. # def lhs_prio(item_set): if item_set._lhs_prio is None: all_lookaheads = None lhs_prio = {} for lr0_item in item_set._kernel: for prio, lookahead in item_set.prio[lr0_item].items(): key = (lr0_item.production.lhs, prio) if key not in lhs_prio: lhs_prio[key] = lookahead else: lhs_prio[key] |= lookahead if all_lookaheads is None: all_lookaheads = lookahead else: all_lookaheads |= lookahead item_set._lhs_prio = (lhs_prio, all_lookaheads) return item_set._lhs_prio if self is other: return True # # For an LR(1) scheme we are saying for a given lhs in self its # lookahead set can only share lookahead tokens with the same lhs in # other. # # For us it gets a little more complex, as we are carrying # ItemSet.prio's and they have to be compatible as well. However, # turns out this reduces to insisting that (lhs, prio) combinations # can't share lookaheads suffices. # self_lhs_prio, self_all_lookaheads = lhs_prio(self) other_lhs_prio, other_all_lookaheads = lhs_prio(other) common = self_all_lookaheads & other_all_lookaheads for key, self_lookaheads in self_lhs_prio.items(): other_lookaheads = other_lhs_prio.get(key, None) if other_lookaheads is not None: if (self_lookaheads & common) != (other_lookaheads & common): return False return True # # Merge two ItemSet's. They must be ItemSet.compatible(). Return True # if the merge altered 'self', so it needs a new closure computed. 
# def merge(self, other, cache): closure_items = [] for item in self: other_item = other._kernel[item.lr0_item] modified = False item_prio = self.prio[item.lr0_item] expanded = False for prio, other_lookahead in other.prio[item.lr0_item].items(): self_lookahead = self.prio[item.lr0_item].get(prio, None) if self_lookahead is None: expanded = expanded or item_prio.copy() expanded[prio] = other_lookahead elif not self_lookahead >= other_lookahead: expanded = expanded or item_prio.copy() expanded[prio] |= other_lookahead if expanded: self.prio[item.lr0_item] = expanded modified = True if modified: item.lookahead |= other_item.lookahead closure_items.append(item) if not closure_items: return False self._goto_cache = None self._lhs_prio = None self._close_kernel_items(closure_items, cache) return True def sorted(cls, table): return sorted(table, key=lambda item_set: item_set.id) sorted = classmethod(sorted) # # LR parser actions. These have to be tuples because that is what the # lr1_parser expects in the optimised case. These are for the non-optimised # case, so we carry more information for debugging and priority resolution. # Nonetheless these must be backwards compatible with what # Grammar.optimise_parsing_table() produces. # class Action(tuple): __slots__ = () # # Actions for an owning item are placed into a set. Identical actions # can't be in that set, so __hash__ and __eq__ must be implemented # accordingly. # def __hash__(self): raise NotImplementedError() def __eq__(self, other): raise NotImplementedError() # # Sort order. Only important for tests where we need a repeatable test # outcomes. # def __lt__(self, other): return self.key() < other.key() # # This function returns three things: # # (lhs, low, high) # # where: # # lhs Is the symbol we will reduce to, or None if there could be # several. This is used to compare associativity. # # low The lowest priority for the passed token. # # high The highest priority for the passed token. 
def precedence(self, token, item_set): raise NotImplementedError() # # A Shift action - consume a token, ie move it onto the stack. # class ShiftAction(Action): __slots__ = () def __new__(cls, *next_state): result = Action.__new__(cls, next_state) return result def __repr__(self): return "shift %s" % self[0] def __hash__(self): return hash(self[0]) def __eq__(self, other): return isinstance(other, ShiftAction) and self[0] == other[0] # # Key used for sorting. Used only to get repeatable tests. # def key(self): return "shift", self[0].id # # In a shift: # # lhs Is the lhs of all items if they are the same, otherwise None. # # low The lowest prio for the token in the kernel. # # high The highest prio for the token in the kernel. # def precedence(self, token, item_set): my_item_set = self[0] lhs = next(iter(my_item_set)).production.lhs low = (1e100,) high = () for lr1_item in my_item_set: if lhs != lr1_item.production.lhs: lhs = None items = my_item_set.prio[lr1_item.lr0_item].items() for prio, lookahead in items: if token in lookahead: if low > prio: low = prio if high < prio: high = prio return lhs, low, high # # A reduce action - ie the top of the stack is a production we recognise. # Replace it with it's lhs. # class ReduceAction(Action): __slots__ = () def __new__(cls, lr1_item): lhs = lr1_item.production.lhs output = lhs if isinstance(lhs, Rule) and lhs.name[0] != '_' else None me = (lhs, len(lr1_item.production.rhs), output, lr1_item) result = Action.__new__(cls, me) return result def __repr__(self): return "reduce %s = %s" % ( self[0], ' '.join(str(sym) for sym in self[3].production.rhs)) def __hash__(self): return hash(self[:3]) def __eq__(self, other): return isinstance(other, ReduceAction) and self[:3] == other[:3] # # Key used for sorting. Used only to get repeatable tests. # def key(self): return "reduce", self[0].id, self[3].dot_pos # # In a reduce: # # lhs Is the lhs of the target production. # # low The lowest prio for the token for this Lr1Item. 
# # high The highest prio for the token for this Lr1Item. # def precedence(self, token, item_set): lr1_item = self[3] lhs = lr1_item.production.lhs low = (1e100,) high = () for prio, lookahead in item_set.prio[lr1_item.lr0_item].items(): if token in lookahead: if low > prio: low = prio if high < prio: high = prio return lhs, low, high # # The thing that implements the grammar - the Parser. # class Parser(object): VERSION = "0.1" comments = None # object, TokenSymbol(), list of tokens. empty_token = None # object, MetaToken("__empty__") eoi_token = None # object, MetaToken("__end_of_input__") epoch_symbol = None # object, Rule(), = START __end_of_input__ parser_name = None # string, Name of the parser parsing_table = None # tuple, (Lr1State(), ...) rules = None # dict, {"name": Rule(), ...} token_registry = None # object, TokenRegistry() unused_symbols = None # set, SymbolSet(Rule(), ...) whitespace = None # string, Characters defined to be whitespace def __init__(self, parser_name, dct): def new_meta(name): meta = MetaToken(name) return meta.resolve_symbol(name, self.rules, self.token_registry) # # Step 1 is to replace all productions with a Rule() equivalent and # find the TokenRegistry. # self.parser_name = parser_name rule_symbols, self.rules, token_registry = ( self.catalogue_symbols(dct)) if token_registry is None: self.token_registry = TokenRegistry() else: token_registry.restore_dicts() self.token_registry = token_registry() for name in sorted(self.rules): self.resolve_rule(self.rules[name], rule_symbols) self.empty_token = new_meta("__empty__") self.eoi_token = new_meta("__end_of_input__") for name in sorted(self.rules): self.resolve_symbol(self.rules[name]) if name in self.token_registry: msg = "A token and symbol share the same name %r" raise GrammarError(msg % name) # # Create the starting production for the grammar. 
# start_symbol = dct.get("START", None) if start_symbol is None: raise GrammarError("No START symbol defined") if not isinstance(start_symbol, Rule): raise GrammarError("START is not a Nonterm") if len(start_symbol.dict) != 0: raise GrammarError("START symbol may not have dictionary elements") epoch_symbol = Sequence() epoch_symbol.nested = [start_symbol, self.eoi_token] self.epoch_symbol = Rule('<%s>' % self.parser_name, epoch_symbol) epoch_symbol.parent = self.epoch_symbol if self.epoch_symbol.name in self.rules: msg = "Symbol name %r is reserved" % self.epoch_symbol.name raise GrammarError(msg) self.rules[self.epoch_symbol.name] = self.epoch_symbol for rule in self.rules.values(): rule.resolved = True # # Get the special cased tokens. # self.whitespace = dct.get("WHITESPACE", None) self.token_registry.compile_tokens(self.whitespace) comments = dct.get("COMMENTS", None) if comments is None: self.comment_tokens = None else: error = not isinstance(comments, Rule) if not error: comment_symbol = comments.nested[0] if isinstance(comment_symbol, TokenSymbol): self.comment_tokens = comments.nested elif isinstance(comment_symbol, Choice): error = all( not isinstance(sym, TokenSymbol) for sym in comments.nested[0].nested) if not error: self.comment_tokens = comments.nested[0].nested if error: raise GrammarError("COMMENTS must be Token | Token ...") del self.rules['COMMENTS'] # # Compile the grammar into an LR(1) parse_table, or raise a GrammarError # if there is a problem with the grammar. # def compile_grammar(self): if self.parsing_table: return # # Initialise the grammar by creating the start production. Then # compile it. # used_symbols = self.epoch_symbol.compile_grammar(self.empty_token) self.unused_symbols = frozenset( symbol for symbol in self.rules.values() if symbol not in used_symbols) # # Resolve first sets. # self.calc_first_sets(self.epoch_symbol) # # Create the parser. 
# start_state, lr0_item_sets = self.compute_lr1_items(self.epoch_symbol) table = self.compute_parsing_table(lr0_item_sets) self.parsing_table = (start_state, table) self.normalise_item_set_id(self.parsing_table) self.disambiguate(table) # # Return the optimised_grammar suitable for passing to compile_grammar(). # def pre_compile_grammar(self, grammar_class, pre_compiled=None): def from_flat(i, flat): return Lr1State.from_flat(i, flat, self.rules, self.token_registry) # # If it has already been pre compiled just return. # if self.parsing_table is not None: if isinstance(self.parsing_table[0], int): return None # # If we don't have a table see if we can use the pre_compiled version. # if pre_compiled: if isinstance(pre_compiled, string_types): pre_compiled = ast.literal_eval(pre_compiled) if pre_compiled[0] == self.grammar_hash(): optimised_start_state = pre_compiled[1] optimised_parsing_table = tuple( from_flat(i - 2, pre_compiled[i]) for i in range(2, len(pre_compiled))) self.parsing_table = ( optimised_start_state, optimised_parsing_table) return None # # Optimise it. # self.compile_grammar() self.parsing_table = self.optimise_parsing_table(self.parsing_table) flattened = tuple( state.to_flat(grammar_class) for state in self.parsing_table[1]) return repr((self.grammar_hash(), self.parsing_table[0]) + flattened) # # Make the start state item_set.id==0 and the remainder following # sequentially. # def normalise_item_set_id(cls, parsing_table): start_state, table = parsing_table mapping = iter(zip(ItemSet.sorted(table), itertools.count(0))) first = next(mapping) for item_set, id in itertools.chain((first,), mapping): item_set.id = id start_state.id, first[0].id = first[0].id, start_state.id normalise_item_set_id = classmethod(normalise_item_set_id) # # Debug dump of nonterms. 
# def repr_productions(self): def r(nonterm): all_nonterms.add(nonterm) for symbol in nonterm.nested: if isinstance(symbol, Nonterm) and symbol not in all_nonterms: r(symbol) all_nonterms = set() for rule in self.rules.values(): r(rule) all_nonterms = [n for n in sorted(all_nonterms, key=lambda t: str(t))] result = [] i = 0 for nonterm in all_nonterms: if not nonterm.productions: continue rank = ".%d" % nonterm.rank if nonterm.rank else "" result.append( "%-6s: %r" % ("%d%s" % (i, rank), nonterm.productions[0],)) for p in nonterm.productions[1:]: result.append(" %r" % (p,)) i += 1 return '\n'.join(result) # # Debug dump of the parsing table. # def repr_parse_table(self, state=None): if self.parsing_table is None: return '' a_state = next(iter(self.parsing_table[1])) if state is None: func = lambda item_set: True elif state >= 0: func = lambda item_set: item_set.id == state else: is_state = lambda s: (s if isinstance(s, int) else s.id) == -state action_state = lambda act: is_state(tuple(act)[0]) func = lambda item_set: ( is_state(item_set) or any(is_state(g) for g in item_set.gotos.values()) or any(action_state(a) for a in item_set.actions.values())) item_sets = (i for i in self.parsing_table[1] if func(i)) result = [] for item_set in a_state.sorted(item_sets): result.append(repr(item_set)) result.append("") if result: del result[-1] return '\n'.join(result) # # Dump the grammar. # def repr_grammar(self): result = [repr(rule) for name, rule in sorted(self.rules.items())] return '\n'.join(result) # # Parse a feed. # def parse(self, input, tree_factory=None, on_error=None, log=None): if self.parsing_table is None: self.compile_grammar() token_feed = self.token_registry.tokeniser(input, self.whitespace) return self.lr1_parser(token_feed, tree_factory, on_error, log) # # Build up a catalogue of all symbols. # def catalogue_symbols(cls, dct): # # Move one symbols dict to another. 
# def move_dict(to, frm): for key in list(frm.dict): to.dict[key] = frm.dict[key] del frm.dict[key] # # Create a new Rule() for this rule. # def catalogue(name, field): rule = Rule(name, field) rule_symbols[field] = rule symbols[name] = rule dct[name] = rule return rule rule_symbols = {} symbols = {} token_registry = None for name, field in sorted(dct.items()): if isinstance(field, Symbol): if isinstance(field, Ref): raise GrammarError("Ref(%r) hasn't been defined" % name) if field not in rule_symbols: move_dict(catalogue(name, field), field) elif name == "START": catalogue(name, rule_symbols[field]) elif rule_symbols[field].name == "START": rule = rule_symbols[field] field_rule = catalogue(name, field) move_dict(field_rule, rule) catalogue(rule.name, field_rule) else: msg = ( "You have \"%s = %s\" or " + "\"%s = TokenRegistry.tok; " + "%s = TokenRegistry.tok\".\n" + "Only START maybe assigned directly to " + "another Symbol. A workaround is %s = %s * 1" ) raise GrammarError(msg % ( name, rule_symbols[field], name, rule_symbols[field], name, rule_symbols[field],)) elif isinstance(field, type) and TokenRegistry in field.__bases__: if token_registry is None: token_registry = field else: msg = "Can't have more than one %s" raise GrammarError(msg, TokenRegistry.__name__) return rule_symbols, symbols, token_registry catalogue_symbols = classmethod(catalogue_symbols) # # Replace all occurrences of a declared symbol with it's Rule(). 
# def resolve_rule(self, rule, rule_symbols): def r(nested): for i, symbol in zip(itertools.count(), nested): if not isinstance(symbol, Rule): resolved = rule_symbols.get(symbol, None) if resolved is not None: nested[i] = resolved else: r(symbol.nested) for i, symbol in zip(itertools.count(), rule.nested): # resolved = rule_symbols[symbol] # if resolved is not rule: # rule.nested[i] = resolved assert rule_symbols[symbol] is rule if not isinstance(symbol, Rule): r(symbol.nested) # # Scan the entire grammar, allowing nodes in the parse tree to replace # themselves with other nodes. Eg, Ref's with the Symbol they are # referencing. # def resolve_symbol(self, rule): def r(parent, symbol): resolved = symbol.resolve_symbol(name, self.rules, token_registry) if isinstance(resolved, Rule): return resolved resolved.parent = parent for i, sym in zip(itertools.count(), resolved.nested): if not isinstance(sym, Rule): resolved.nested[i] = r(resolved, sym) return resolved name = rule.name token_registry = self.token_registry for i, sym in zip(itertools.count(), rule.nested): rule.nested[i] = r(rule, sym) # # Compute the first sets for all symbols. # def calc_first_sets(self, epoch_symbol): # # Collect all nonterminals used by the grammar. # def r(nonterm): for prod in nonterm.productions: for sym in prod.rhs: if isinstance(sym, Nonterm) and sym not in nonterms: nonterms.add(sym) nonterm_list.append(sym) r(sym) # # We use nonterm_list to make it deterministic. # nonterms = set() nonterm_list = [] r(epoch_symbol) changed = True while changed: changed = any(sym.merge_first_set(self) for sym in nonterm_list) # # Compute the collection of sets of LR(1) items. # # Wikipedia is a good reference: # http://en.wikipedia.org/wiki/Canonical_LR_parser # def compute_lr1_items(self, epoch_symbol): # # Initialise the parse table by creating the start ItemSet. 
It # contains one item: # # FINISH ::= ^ START $., # cache = {'__empty__': self.empty_token} start_production = epoch_symbol.productions[0] start_item_lr0 = Lr0Item(start_production, 0, cache) start_item_lookahead = set(self.empty_token.first_set) start_item_lr1 = Lr1Item(start_item_lr0, start_item_lookahead) start_item_set = ItemSet(((start_item_lr1, {(): set()}),), cache) start_item_set.compute_closure(cache) lr0_item_sets = collections.defaultdict(list) lr0_item_sets[start_item_set.lr0_kernel].append(start_item_set) # # Now compute new ItemSet's from the ones we have created, until we # all we create is duplicates. # worklist = collections.deque([start_item_set]) while worklist: item_set = worklist.popleft() goto_sets = item_set.goto_sets(cache).items() for symbol, goto_set in sorted(goto_sets, key=lambda i: i[0].id): lr1_merge = None for lr1_item_set in lr0_item_sets[goto_set.lr0_kernel]: if lr1_item_set.compatible(goto_set): lr1_merge = lr1_item_set break if lr1_merge is None: goto_set.compute_closure(cache) worklist.append(goto_set) lr0_item_sets[goto_set.lr0_kernel].append(goto_set) elif lr1_merge.merge(goto_set, cache): if lr1_merge in worklist: worklist.remove(lr1_merge) worklist.appendleft(lr1_merge) return start_item_set, lr0_item_sets # # Compute LR(1) actions. # def compute_parsing_table(cls, lr0_item_sets): # # First assign a unique ID to each item_set. This ID will be it's # index into actions[] and gotos[]. # table = dict( (item_set, item_set) for item_set_list in lr0_item_sets.values() for item_set in item_set_list) for item_set in table: # # Compute actions. 
# actions = collections.defaultdict(set) goto_sets = item_set.goto_sets(None) for item in item_set.all_items(): dot_pos, rhs = item.dot_pos, item.production.rhs if dot_pos == len(rhs): for token in item.lookahead: actions[token].add(ReduceAction(item)) elif isinstance(rhs[item.dot_pos], TokenSymbol): token = rhs[item.dot_pos] goto_set = goto_sets[token] found = False for lr1_item_set in lr0_item_sets[goto_set.lr0_kernel]: if lr1_item_set.compatible(goto_set): actions[token].add(ShiftAction(lr1_item_set)) found = True break assert found, repr(token) # # Turn the action lists into tuples. # item_set.actions = dict(actions) for token in actions: item_set.actions[token] = tuple(item_set.actions[token]) # # Compute goto's. # gotos = item_set.gotos for symbol in goto_sets: if not isinstance(symbol, Nonterm): continue goto_set = goto_sets[symbol] for lr1_item_set in lr0_item_sets[goto_set.lr0_kernel]: if lr1_item_set.compatible(goto_set): assert symbol not in gotos gotos[symbol] = lr1_item_set break return table compute_parsing_table = classmethod(compute_parsing_table) # # Look for action ambiguities and resolve them if possible. # def disambiguate(cls, item_sets): action_list = [ (item_set, act) for item_set in item_sets for act in item_set.actions.items()] for item_set, (token, actions) in action_list: if len(actions) == 1: item_set.actions[token] = actions[0] continue # # Multiple actions are ambiguities. Compare every action with # all others in the hope that Prio() and Assoc() can eliminate # all bar one. 
# new_actions = sorted(actions) # Repeatability for testing i = 0 while i < len(new_actions) - 1: act_0 = new_actions[i] j = i + 1 while j < len(new_actions): act_1 = new_actions[j] keep = cls.resolve_ambiguity(item_set, token, act_0, act_1) if "1" not in keep: del new_actions[j] j -= 1 j += 1 if "0" not in keep: del new_actions[i] i -= 1 break i += 1 # # Since we don't support GLR(1) grammars yet (ie, we don't do # Split()), resolve_ambiguity() must have not more than one result # left. It's possible Nonassoc eliminates all of them. # if not new_actions: del item_set.actions[token] else: assert len(new_actions) == 1 item_set.actions[token] = new_actions[0] disambiguate = classmethod(disambiguate) # # Compute how to resolve an action conflict. Returns the actions to # keep, or "err". # def resolve_ambiguity(cls, item_set, token, action_0, action_1): # # Print a nice and hopefully useful error message when we can't # resolve a conflict. # def err(reason): # # Print an action in a nice looking way. # def explain_action(action): if isinstance(action, ReduceAction): rhs = ' '.join(str(s) for s in action[3].production.rhs) lhs = action[3].production.lhs return [" replace the sequence [%s] with %s" % (rhs, lhs)] next_state_lr1_items = list(action[0]) one_of = "" if len(next_state_lr1_items) == 1 else " one of" msg = [ " accept the %s in the hope it will match%s:" % (token, one_of) ] msg.extend([ " " + repr(lr1_item.lr0_item) for lr1_item in Lr1Item.sorted(next_state_lr1_items)]) return msg # # Is the passed item relevant to the action? # def relevant(item, action): if isinstance(action, ReduceAction): return action[3] is item return (item.dot_pos < len(item.production.rhs) and item.production.rhs[item.dot_pos] == token) # # Produce a nicely formatted error message. 
# msg = ["Conflict: %s" % (reason,)] msg.append("While trying to recognise state %d:" % (item_set.id,)) for item in item_set.all_items(): if relevant(item, action_0) or relevant(item, action_1): msg.append(" %r" % (item.lr0_item,)) msg.append("on seeing a %s I could not decide between:" % (token,)) msg.extend(explain_action(action_0)) msg.append("and") msg.extend(explain_action(action_1)) msg.append("Please remove this ambiguity from your grammar") raise GrammarError('\n'.join(msg)) # # Decide the associativity of a symbol. # def assoc(nonterm): if isinstance(nonterm, Assoc): return nonterm return None # # Ambiguities can arise from productions like this: # # e = e op e # op = op1 | op2 # # When confronted with the token string: # # e op1 e ^ op2 e # # We get a shift / reduce conflict at the indicated position. # The choice is really between two different parse trees: # # Shift: (e op1 (e op2 e)) # Reduce: ((e op1 e) op2 e) # # This can be resolved in two ways. If priorities are allocated to # the clashing productions, then we can choose the action the # grammar writer preferred based on which of 'op1' or 'op2' was # used: # # e = Prio(e op1 e, e op2 e) # which is equivalent to: # e = e op1 e, Priority=0 # e = e op2 e, Priority=1 # # Or, the grammar writer can specify associativity, ie saying he always # wants ((e 2) e) regardless of 'op' (left associative) or he # wants (e op (e op e)) regardless of 'op' (right associative). For # left associativity it is like this: # # e = e << op << e # # A reduce/reduce conflict can happen if the clash happens higher up # in the parse tree: # # e0 = Prio(e1, e2) # e1 = 'n' # e2 = 'n' # # When parsing: # # 'n' ^ # # We have two reduction choices, e1='n' and e2='n'. These productions # don't have priorities allocated directly, but the parse table builder # will have pushed the priorities e0 allocated down to them. Nested # priorities are ranked (see allocate_rank()), and this ranking makes # priorities globally comparable. 
# lhs_0, low_0, high_0 = action_0.precedence(token, item_set) lhs_1, low_1, high_1 = action_1.precedence(token, item_set) if low_0 < low_1: return "0" if low_0 > low_1: return "1" # # The priorities are the same. Try using the associativity. # assert ( not isinstance(action_0, ShiftAction) or not isinstance(action_1, ShiftAction)) if (isinstance(action_0, ReduceAction) and isinstance(action_1, ReduceAction)): return err("Reduce/Reduce") lhs_0_assoc, lhs_1_assoc = assoc(lhs_0), assoc(lhs_1) lhs = lhs_0_assoc if lhs_0_assoc is not None else lhs_1_assoc if not isinstance(lhs, Assoc): return err("Shift/Reduce and no associativity") if isinstance(lhs_1_assoc, Assoc) and lhs.assoc != lhs_1_assoc.assoc: return err("Shift/Reduce and conflicting associativity") # # The actions have met the associativity pre-conditions so we can # resolve the conflict. # if lhs.assoc == 'l': # Left() return "1" if isinstance(action_0, ShiftAction) else "0" if lhs.assoc == 'r': # Right() return "0" if isinstance(action_0, ShiftAction) else "1" assert lhs.assoc == 'n', lhs_0.assoc return "" # Nonassoc() - Association is a parse error resolve_ambiguity = classmethod(resolve_ambiguity) # # This optional step creates a parsing_table with all the information # in the ItemSet() pruned. # def optimise_parsing_table(self, parsing_table): start_state, table = parsing_table # # Create a mapping of item_set: int, with start_state mapping to 0. # state_number = dict(zip(ItemSet.sorted(table), itertools.count())) item_set_0 = next(i for i in state_number if state_number[i] == 0) state_number[item_set_0] = state_number[start_state] state_number[start_state] = 0 # # Map each item set to it's number. 
# optimised = [] for item_set in sorted(table, key=lambda i: state_number[i]): actions = {} for token in item_set.actions: action = item_set.actions[token] if isinstance(action, ShiftAction): new_action = (state_number[action[0]],) else: nonterm_nr = action[0].id len_rhs = action[1] output = action[2] new_action = (nonterm_nr, len_rhs, output, None) actions[token] = new_action gotos = dict( (nonterm.id, state_number[itm_set]) for nonterm, itm_set in item_set.gotos.items()) state_id = len(optimised) lr1_state = Lr1State(state_id, actions, gotos, item_set.rules) optimised.append(lr1_state) # # And we are done. # return 0, optimised # # The grammar hash. # def grammar_hash(self): grammar_tree = '; '.join(sorted( repr(self.rules[name]) for name in self.rules)) hsh = hashlib.sha512() hsh.update(grammar_tree.encode()) hsh.update(self.VERSION.encode()) return hsh.hexdigest() # # The LR(1) table driven parser. # def lr1_parser(self, token_feed, tree_factory, on_error, log): # # Print the current stack, for log. # def print_stack(stk): return " ".join( "%s=%s" % (s[0], s[1][0][0] if s[1] else '()') for s in stk) # # The lengths I am prepared to go in the name of fast path # efficiency frightens me a times. 
# def insert_error_recovery_tuple(): while recovery_stack: input_tuple = next(recovery_stack[-1], EOF) if input_tuple is not EOF: yield input_tuple else: recovery_stack.pop() iterator[0] = original_input yield next(iterator[0], EOF) EOF = object() recovery_stack = [] original_input = itertools.chain(token_feed, ((self.eoi_token,),)) iterator = [original_input] start_state, table = self.parsing_table state = table[start_state] stack = [(state, ((self.empty_token,),))] if not self.comment_tokens: comments = () else: comments = frozenset(self.comment_tokens) input_tuple = next(iterator[0], EOF) while input_tuple is not EOF: token = input_tuple[0] if token in comments: input_tuple = next(iterator[0], EOF) continue while True: try: action = state.actions[token] except KeyError: # # A Parse error. Does he want to do error recovery? # if on_error is None: raise ParseError(input_tuple, stack) insert = on_error(iterator[0], input_tuple, stack) if insert is None: raise ParseError(input_tuple, stack) recovery_stack.append(iter(insert)) iterator = [insert_error_recovery_tuple()] break # # A shift? # if len(action) == 1: if token is self.eoi_token: break if tree_factory: input_tuple = tree_factory(input_tuple) state = table[action[0]] stack.append((state, (input_tuple,))) if log: log("shift %s; %s" % (token, print_stack(stack[1:]),)) break # # A reduce. # goto, pop_count, output, _ = action if pop_count == 0: tail = () nodes = () else: tail = stack[-pop_count:] del stack[-pop_count:] nodes = sum((s[1] for s in tail), ()) if output is not None: nodes = (output,) + nodes if tree_factory: nodes = tree_factory(nodes) nodes = (nodes,) state = table[stack[-1][0].gotos[goto]] stack.append((state, nodes)) if log: log( "reduce %s; %s -- %s" % (token, print_stack(tail), print_stack(stack[1:]))) input_tuple = next(iterator[0], EOF) return stack[-1][1][0] # # The base class for Symbol's in the grammar: a token or a non-terminal. 
#
class Symbol(object):
    # The dict protocol methods forwarded to self.dict (see __init__).
    DICT_METHODS = (
        "__contains__", "__delitem__", "__getitem__",
        "__iter__", "__len__", "__setitem__")
    __slots__ = ('dict', 'id', 'first_set', 'nested', 'parent') + DICT_METHODS
    # Used by repr() to decide where parentheses are needed.
    SYMBOL_PRECEDENCE = 0

    def __init__(self):
        self.dict = {}
        self.first_set = frozenset()
        self.nested = ()
        self.parent = None
        #
        # The original idea was to inherit from dict.  Doing so meant Symbol
        # wasn't hashable and since it is used extensively in sets and as
        # keys to dict's overriding __hash__ so it was hash'able caused a
        # 10% slowdown in generating the parse table.  So now we just emulate
        # a dict.
        #
        self.__contains__ = self.dict.__contains__
        self.__delitem__ = self.dict.__delitem__
        self.__getitem__ = self.dict.__getitem__
        self.__iter__ = self.dict.__iter__
        self.__len__ = self.dict.__len__
        self.__setitem__ = self.dict.__setitem__

    #
    # Operator overloads implementing the grammar DSL.  Each returns a
    # placeholder node (OpPlus, Repeat, ...) that resolve_symbol() later
    # turns into a real Nonterm.
    #
    def __add__(self, other):
        return OpPlus(self, other)

    def __radd__(self, other):
        return OpPlus(other, self)

    def __mul__(self, other):
        if other is Opt or other is Some or other is Many or other is Repeat:
            return other(self)
        if isinstance(other, int):
            return Repeat(self, other, other)
        if (not isinstance(other, tuple) or len(other) > 2 or
                any(not isinstance(a, int) for a in other)):
            msg = (
                "right operand of * must be one of: " +
                "Opt, Some, Many, Repeat, (), (min,), (min,max)")
            raise GrammarError(msg)
        return Repeat(self, *other)

    def __rmul__(self, other):
        if other is Opt or other is Some or other is Many or other is Repeat:
            return other(self)
        if isinstance(other, int):
            return Repeat(self, other, other)
        if (not isinstance(other, tuple) or len(other) > 2 or
                any(not isinstance(a, int) for a in other)):
            msg = (
                "left operand of * must be one of: " +
                "Opt, Some, Many, Repeat, (), (min,), (min,max)")
            raise GrammarError(msg)
        return Repeat(self, *other)

    def __lshift__(self, other):
        return OpLshift(self, other)

    def __rlshift__(self, other):
        return OpLshift(other, self)

    def __rshift__(self, other):
        return OpRshift(self, other)

    def __rrshift__(self, other):
        return OpRshift(other, self)

    def __or__(self, other):
        return OpOr(self, other)

    def __ror__(self, other):
        return OpOr(other, self)

    # Symbols are always truthy, even though they emulate a (possibly
    # empty) dict.
    def __nonzero__(self):
        return True
    __bool__ = __nonzero__      # For Python3

    # Coerce value to a Symbol using the CAST table.
    # NOTE(review): CAST isn't defined in this chunk -- presumably a class
    # attribute assigned elsewhere in the file; confirm before relying on it.
    def cast(cls, value):
        value = cls.CAST.get(type(value), lambda x: x)(value)
        if not isinstance(value, Symbol):
            raise GrammarError("%r can't be a Symbol" % value)
        return value

    # Turn this symbol into its productions; subclasses must override.
    def compile_symbol(self, comp):
        raise NotImplementedError()

    # Hook allowing a node to replace itself during grammar resolution;
    # the default is to stay put.
    def resolve_symbol(self, name, rules, token_registry):
        return self

    #
    # Return a unique name for the symbol.
    #
    def __str__(self):
        if self.parent is None:
            return self.__class__.__name__
        # Disambiguate siblings of the same class with an index suffix.
        same_as_me = tuple(
            sym for sym in self.parent.nested
            if sym.__class__.__name__ == self.__class__.__name__)
        if len(same_as_me) < 2:
            my_name = self.__class__.__name__
        else:
            index = same_as_me.index(self)
            my_name = "%s%d" % (self.__class__.__name__, index)
        return "%s.%s" % (self.parent, my_name,)

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.repr_nested())

    def repr_nested(self):
        return ', '.join(sym.nested_repr() for sym in self.nested)

    def nested_repr(self):
        return repr(self)

    # Return the Rule() at the top of this symbol's parent chain.
    def get_rule(self):
        rule = self
        while rule.parent is not None:
            rule = rule.parent
        assert isinstance(rule, Rule)
        return rule


#
# A forward reference to a symbol that will be defined later.
#
class Ref(Symbol):
    __slots__ = ('referenced_name',)

    def __init__(self, referenced_name):
        super(Ref, self).__init__()
        self.referenced_name = referenced_name

    # Replace the Ref with the Rule() it names; raises GrammarError if
    # the name was never defined or the Ref carries dict elements.
    def resolve_symbol(self, name, rules, token_registry):
        if self.referenced_name not in rules:
            raise GrammarError("%s references undefined %r" % (name, self))
        if len(self.dict) != 0:
            msg = "Ref(%s) may not have dictionary elements"
            raise GrammarError(msg % (self.referenced_name,))
        return rules[self.referenced_name]

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.referenced_name)


#
# A reference to the Rule() of the current production.
# class This(Symbol): __slots__ = () def resolve_symbol(self, name, rules, token_registry): return rules[name] THIS = This() # # Combine several lists productions as a sequence. In other words, say we # had several nonterms which were combined as a sequence: # # a = a0 + a1 + a2 # # a0, a1 and a2 have their own lists of productions. We must produce the # productions for a from those lists. So: # # Given: # a0 = [(a,),(b,)] # a1 = [(c,d),(e,f)] # a2 = [(g,)] # # Return: # [(a,c,d,g), (a,e,f,g), (b,c,d,g), (b,e,f,g)] # def seq(*args): if not isinstance(args[-1], int): repeats = 1 else: repeats = args[-1] args = args[:-1] args = [[(p,)] if isinstance(p, Symbol) else p for p in args] result = list( tuple(itertools.chain.from_iterable(prod)) for prod in itertools.product(*args, repeat=repeats)) return result # # A non-terminal: A symbol that appears on the left hand side of a rule. # class Nonterm(Symbol): __slots__ = ('rank', 'productions', 'priority') def __init__(self, *args): super(Nonterm, self).__init__() for symbol in args: if isinstance(symbol, Rule) and symbol.resolved: msg = "Can not import %s from another Grammar" % ( symbol.name,) raise GrammarError(msg) if isinstance(symbol, MetaToken): msg = "%s can not be used in a production" % (symbol.name,) raise GrammarError(msg) self.nested = [self.cast(symbol) for symbol in args] self.priority = None self.productions = () self.rank = None # # Compile the grammar. In other words turn it into a series of LR # productions. Start from the start symbol and work our way down. # Record what symbols were referenced, as unreferenced ones are # probably an error. # def compile_grammar(self, empty_token): # # A recursive function that visits all symbols in the grammar, # compiling them. 
# def comp(symbol): if not isinstance(symbol, Rule): return symbol.compile_symbol(comp) if symbol not in seen: seen.add(symbol) symbol.emit(symbol.compile_symbol(comp)) return [(symbol,)] seen = set() comp(self) self.allocate_symbol_id(empty_token) self.allocate_rank() return seen # # Allocate Nonterm.rank, so priorities can be calculated during the # parser generator process. Lets say we have: # # START = Prio(a, b) # a = Prio(b, c) # b = 'n' # c = Prio(START, a) # # So START has allocated 'b' a priority of 1, but 'a' has allocated 'b' a # priority of 0. The question is how do we compare these? The answer # used here is to rank the priorities based on how close they are to # the START symbol. START is 0 away from itself, so it has the most # preferred rank. 'a' and 'b' have ranks of 1, because they are referred # to directly by START, and 'c' has a rank of 2. # # The parser uses this rank to create a priority tuple: (p0, p1, p2, ...), # where p0 is the assigned by rank 0, p1 is the priority assigned by rank 1 # and so on. # # So in answering the question earlier, START's priority for 'b' is: # (START = b,) which is (1,), # and 'a's priority for 'b' is: # (START = a, a = b,) which is (0,0,). # # When parsing this sequence of symbols: # # 'n' __end_of_input__ # # The parser will go through these states. # # STACK INPUT POSSIBLE ACTIONS # [] 'n' shift 'n' # ['n'] __end_of_input__ reduce b = 'n' # [b] __end_of_input__ reduce a = b, reduce START = b # # The priority of a = b is (0,0), and the priority of START = b is (1,). # Since (0,0) < (1,), the parser will chose a = b. # def allocate_rank(self): # # First determine the height of each Nonterm in the production tree # by doing a a breadth first pass over it. Collect the tree structure # as we go. 
# height = 0 refers = collections.defaultdict(set) refers[self].add(self) symbol_heights = {self: height} height += 1 queue = [self] while queue: queue, process = [], queue for nonterm in process: refers[nonterm] |= frozenset() for production in nonterm.productions: for symbol in production.rhs: if isinstance(symbol, Nonterm): if nonterm is not symbol: refers[symbol].add(nonterm) if symbol not in symbol_heights: symbol_heights[symbol] = height queue.append(symbol) height += 1 # # Turn it into a hierarchy. The idea is we do a topological sort on # the graph using the tree structure we accumulated, breaking any # cycles using height. # def remove(no_refers): no_prio = set( s for s in no_refers if not isinstance(s, Prio.Prioritised)) if not no_prio: rank[0] += 1 else: no_refers = no_prio for s in no_refers: s.rank = rank[0] del refers[s] for s in refers: refers[s] -= no_refers rank = [0] while refers: no_refers = set(s for s, r in refers.items() if not r) while refers and no_refers: remove(no_refers) no_refers = set(s for s, r in refers.items() if not r) # # If that didn't consume every token we have a cycle. Break the # cycle by considering the node closest to the START symbol (ie # lowest height) as havingt the heighest priority. # if refers: low = min(symbol_heights[s] for s in refers) lowest = set(s for s in refers if symbol_heights[s] == low) remove(lowest) # print sorted("%d %s" % (sym.rank, sym) for sym in symbol_heights) # # Allocate each symbol a unique id. This is used to make things # deterministic when we iterate through hash tables and sets. # def allocate_symbol_id(self, empty_token): queue = collections.deque([self, empty_token]) all_symbols = set() while queue: symbol = queue.popleft() if symbol in all_symbols: continue symbol.id = len(all_symbols) all_symbols.add(symbol) if isinstance(symbol, Nonterm): for production in symbol.productions: queue.extend(production.rhs) # # Generate the Production() objects for the Nonterm(). 
# def emit(self, productions): self.productions = [Production(self, p) for p in productions] # # Merge the first_sets of all productions into ours. Return True if # the first_set was changed. # def merge_first_set(self, parser): want_empty = False have_empty = parser.empty_token in self.first_set result = self.first_set for production in self.productions: # # For all A = a + b, merge first(a) into first(A). # for symbol in production.rhs: result |= symbol.first_set if parser.empty_token not in symbol.first_set: break have_empty = True else: want_empty = True if want_empty: if not have_empty: result |= parser.empty_token.first_set else: if have_empty: result -= parser.empty_token.first_set if len(result) != len(self.first_set): self.first_set = result return True return False # # For nonterms that accept lists, print in a nice way. # def nonterm_repr(self, delimiter): def repr_sym(symbol): if (len(symbol.nested) < 2 or self.SYMBOL_PRECEDENCE <= symbol.SYMBOL_PRECEDENCE): return symbol.nested_repr() return '(' + symbol.nested_repr() + ')' if len(self.nested) < 2: return "%s(%s)" % ( self.__class__.__name__, ', '.join(sym.nested_repr() for sym in self.nested),) nested = (repr_sym(symbol) for symbol in self.nested) return (' ' + delimiter + ' ').join(nested) # # A symbol on the left hand side of a grammar rule. # class Rule(Nonterm): __slots__ = ('name', 'resolved') def __init__(self, name, symbol): super(Rule, self).__init__(symbol) self.name = name self.resolved = False def compile_symbol(self, comp): return comp(*self.nested) def __str__(self): return self.name def __repr__(self): if isinstance(self.nested[0], Rule): return "%s = %s" % (self.name, self.nested[0].name) return "%s = %r" % (self.name, self.nested[0]) def nested_repr(self): return "%s" % (self.name,) # # The arguments are a single production. 
# Thus:
#
#   L = Sequence(sym0, sym1, sym2)
#
# Yields the production:
#
#   L ::= sym0 sym1 sym2
#
class Sequence(Nonterm):
    __slots__ = ()
    SYMBOL_PRECEDENCE = 3

    def __init__(self, *args):
        super(Sequence, self).__init__(*args)

    # Compile each child, then combine them into one production list
    # via the cartesian helper seq().
    def compile_symbol(self, comp):
        compiled_children = [comp(child) for child in self.nested]
        return seq(*compiled_children)

    def __repr__(self):
        return self.nonterm_repr('+')


#
# The arguments are alternate productions.  Thus
#
#   L = Choice(sym0, sym1, sym2)
#
# Yields the productions:
#
#   L ::= sym0
#   L ::= sym1
#   L ::= sym2
#
class Choice(Nonterm):
    __slots__ = ()
    SYMBOL_PRECEDENCE = 1

    def __init__(self, *args):
        super(Choice, self).__init__(*args)

    # Each child contributes its own productions; simply concatenate
    # every child's production list.
    def compile_symbol(self, comp):
        alternatives = []
        for child in self.nested:
            alternatives.extend(comp(child))
        return alternatives

    def __repr__(self):
        return self.nonterm_repr('|')


#
# Binary operations.  Repeated applications of the same binary operation
# yield a single list.  The op is then applied to that list.  Thus:
#
#   L = sym0 + sym1 + sym2 | sym3 | sym4
#
# Gets turned into:
#
#   L = Alternate(Sequence(sym0, sym1, sym2), sym3, sym4).
#
class BinOp(Nonterm):
    __slots__ = ()

    def __init__(self, arg1, arg2):
        super(BinOp, self).__init__(arg1, arg2)

    # Flatten a chain of identical binary operators into one flat list
    # of operands, preserving left-to-right order.
    def combine(self):
        def flatten(node):
            if type(node) == type(self):
                left, right = node.nested
                return flatten(left) + flatten(right)
            return [node]
        left, right = self.nested
        return flatten(left) + flatten(right)


#
# Constructed for: sym | sym.
#
class OpOr(BinOp):
    __slots__ = ()

    def resolve_symbol(self, name, rules, token_registry):
        operands = self.combine()
        return Choice(*operands)


#
# Constructed for: sym + sym.
#
class OpPlus(BinOp):
    __slots__ = ()

    def resolve_symbol(self, name, rules, token_registry):
        operands = self.combine()
        return Sequence(*operands)


#
# Constructed for: sym << sym.
#
class OpLshift(BinOp):
    __slots__ = ()

    def resolve_symbol(self, name, rules, token_registry):
        operands = self.combine()
        return Left(*operands)


#
# Constructed for: sym >> sym.
#
class OpRshift(BinOp):
    __slots__ = ()

    def resolve_symbol(self, name, rules, token_registry):
        operands = self.combine()
        return Right(*operands)


#
# A priority list.
# A priority list is passed a list of symbols:
#   S = Prio(sym0, sym1, sym2)
# is identical to:
#   S = sym0 | sym1 | sym2
# with the side effect that in the event of a conflict we will choose
# sym0 over sym1 over sym2.
#
class Prio(Nonterm):
    #
    # This node holds priorities.  One Prioritised wraps each alternative,
    # recording its position in the Prio() list.
    #
    class Prioritised(Nonterm):
        __slots__ = ()

        def __init__(self, priority, symbol):
            super(Prio.Prioritised, self).__init__(symbol)
            # priority slot is inherited from Nonterm.
            self.priority = priority

        def compile_symbol(self, comp):
            self.emit(comp(*self.nested))
            return [(self,)]

    __slots__ = ()

    def __init__(self, *args):
        super(Prio, self).__init__(*args)

    #
    # Priority nodes allocate a separate node for each sub-node.  We
    # subsume nested Prio's.
    #
    def resolve_symbol(self, name, rules, token_registry):
        # Depth-first walk so nested Prio alternatives keep their
        # relative ordering in the flattened priority numbering.
        def r(prio):
            for symbol in prio.nested:
                if isinstance(symbol, Prio):
                    r(symbol)
                else:
                    children.append(self.Prioritised(next(index), symbol))
        index = itertools.count()
        children = []
        r(self)
        self.nested = children
        return self

    def compile_symbol(self, comp):
        self.emit(sum([comp(sym) for sym in self.nested], []))
        return [(self,)]

    def __repr__(self):
        if len(self.nested) < 2:
            return "%s(%s)" % (
                self.__class__.__name__,
                ', '.join(sym.nested_repr() for sym in self.nested))
        return "(%s)" % (', '.join(sym.nested_repr() for sym in self.nested),)


#
# This node holds associativity: ie left, right or not allowed.
#
class Assoc(Nonterm):
    __slots__ = ('assoc',)

    def __init__(self, assoc, *args):
        # assoc is a single character: 'l'eft, 'n'one or 'r'ight.
        if assoc not in 'lnr':
            raise GrammarError("Unknown associativity %r" % (assoc,))
        super(Assoc, self).__init__(*args)
        self.assoc = assoc

    def compile_symbol(self, comp):
        self.emit(seq(*[comp(symbol) for symbol in self.nested]))
        return [(self,)]

    def __repr__(self):
        # Subclasses (Left/Right/Nonassoc) have their own repr forms.
        if type(self) is not Assoc:
            return super(Assoc, self).__repr__()
        return "%s(%s, %s)" % (
            self.__class__.__name__, self.assoc, self.repr_nested(),)


#
# Force left associativity.
#
class Left(Assoc):
    __slots__ = ()
    SYMBOL_PRECEDENCE = 2

    def __init__(self, *args):
        super(Left, self).__init__('l', *args)

    def __repr__(self):
        return self.nonterm_repr('<<')


#
# Force right associativity.
#
class Right(Assoc):
    __slots__ = ()
    SYMBOL_PRECEDENCE = 2

    def __init__(self, *args):
        super(Right, self).__init__('r', *args)

    def __repr__(self):
        return self.nonterm_repr('>>')


#
# Force non associative.
#
class Nonassoc(Assoc):
    __slots__ = ()

    def __init__(self, *args):
        super(Nonassoc, self).__init__('n', *args)


#
# Construct a list of productions separated by a delimiter.
#
class List(Nonterm):
    __slots__ = ('max', 'min', 'opt',)

    # symbol: the list element; delimiter: what separates elements;
    # min/max: bounds on the element count; opt: allow a trailing
    # delimiter.
    def __init__(self, symbol, delimiter, min=None, max=None, opt=None):
        super(List, self).__init__(symbol, delimiter)
        self.min = 0 if min is None else min
        self.max = max
        self.opt = opt
        if max is not None and max < self.min:
            raise GrammarError("min may not be greater than max")

    def compile_symbol(self, comp):
        symbol = comp(self.nested[0])
        delimiter = comp(self.nested[1])
        productions = []
        if self.min == 0:
            productions.append(())
        #
        # A fixed max means we can just list all the possibilities like this:
        #
        #   [(), (sym,), (sym,delim,sym), (sym,delim,sym,delim,sym)]
        #
        if self.max is not None:
            for repeat in range(max(0, self.min - 1), self.max):
                prod = seq(symbol, seq(delimiter, symbol, repeat))
                productions.extend(prod)
                if self.opt:
                    # Also allow a trailing delimiter.
                    productions.extend(seq(prod, delimiter))
            return productions
        #
        # There is no upper maximum, so we need recursion:
        #
        # For right assoc:
        #   [(sym,delim,sym,delim,S)]
        #   S ::= [(sym,), (sym,delim,S)]
        #
        # For left assoc:
        #   [(S,delim,sym,delim,sym)]
        #   S ::= [(sym,), (S,delim,sym)]
        #
        my_productions = list(symbol)
        if self.parent is None or not isinstance(self.parent, Assoc):
            assoc = None
        else:
            assoc = self.parent.assoc
        repeat = max(0, self.min - 1)
        if assoc == 'r':
            prod = seq(seq(symbol, delimiter, repeat), self)
            productions.extend(prod)
            my_productions.extend(seq(symbol, delimiter, self))
            if self.opt:
                my_productions.extend(seq(symbol, delimiter))
        elif assoc is None or assoc == 'l':
            prod = seq(self, seq(delimiter, symbol, repeat))
            productions.extend(prod)
            if self.opt:
                productions.extend(seq(prod, delimiter))
            my_productions.extend(seq(self, delimiter, symbol))
        else:
            msg = "Can't implement %s associativity on %r"
            raise GrammarError(msg % (assoc, self))
        self.emit(my_productions)
        return productions

    def __repr__(self):
        if self.opt is not None:
            return "%s(%s, %r, %r, %r)" % (
                self.__class__.__name__, self.repr_nested(),
                self.min, self.max, self.opt)
        if self.max is not None:
            return "%s(%s, %r, %r)" % (
                self.__class__.__name__, self.repr_nested(),
                self.min, self.max)
        if self.min != 0:
            return "%s(%s, %r)" % (
                self.__class__.__name__, self.repr_nested(), self.min)
        return "%s(%s)" % (self.__class__.__name__, self.repr_nested())


#
# Repeats.  This class handles all forms of repeats.
#
class Repeat(Nonterm):
    __slots__ = ('min', 'max',)

    def __init__(self, symbol, min=None, max=None):
        super(Repeat, self).__init__(symbol)
        self.min = 0 if min is None else min
        self.max = max
        if max is not None and max < self.min:
            raise GrammarError("min may not be greater than max")

    def compile_symbol(self, comp):
        symbol = comp(*self.nested)
        #
        # If we have both min and max repeats we can just enumerate the
        # results like this:
        #
        #   [ (), (symbol,), (symbol,symbol,), ... ]
        #
        if self.max is not None:
            prods = (seq(symbol, rpt) for rpt in range(self.min, self.max + 1))
            return sum(prods, [])
        #
        # There is no maximum, so we must use recursion.
        #
        # For right associative:
        #   [ (symbol,symbol,S) ]
        #   S ::= [symbol, (symbol, S) ]
        #
        # For left associative:
        #   [ (S,symbol,symbol) ]
        #   S ::= [symbol, (S,symbol) ]
        #
        if self.parent is None or not isinstance(self.parent, Assoc):
            assoc = None
        else:
            assoc = self.parent.assoc
        if self.min == 0:
            productions = [(), (self,)]
        else:
            productions = seq(seq(symbol, max(0, self.min - 1)), self)
        if assoc == 'r':
            my_productions = symbol + seq(symbol, self)
        elif assoc is None or assoc == 'l':
            my_productions = symbol + seq(self, symbol)
        else:
            msg = "Can't implement %s associativity on %r"
            raise GrammarError(msg % (assoc, self))
        self.emit(my_productions)
        return productions

    def __repr__(self):
        nested = self.nested[0].nested_repr()
        if len(self.nested[0].nested) >= 2:
            nested = '(%s)' % (nested,)
        if type(self) is not Repeat:
            # Opt/Some/Many print as "sym * Opt" etc.
            return "%s * %s" % (nested, self.__class__.__name__)
        if self.min == self.max:
            return "%s * %r" % (nested, self.min)
        if self.max is not None:
            return "%s * (%r, %r)" % (nested, self.min, self.max)
        if self.min != 0:
            return "%s * (%r,)" % (nested, self.min)
        return "%s * ()" % (nested,)


#
# Optional - ie 0 or 1.
#
class Opt(Repeat):
    __slots__ = ()

    def __init__(self, symbol):
        super(Opt, self).__init__(symbol, 0, 1)


#
# 1 or more.
#
class Some(Repeat):
    __slots__ = ()

    def __init__(self, symbol):
        super(Some, self).__init__(symbol, 1, None)


#
# 0 or more.
#
class Many(Repeat):
    __slots__ = ()

    def __init__(self, symbol):
        super(Many, self).__init__(symbol, 0, None)


#
# Generate tokens by splitting a string.
# class Tokens(Nonterm): __slots__ = () def __init__(self, literals, keywords=None, case=None): tokens = [] if literals and literals.strip(): tokens.extend([ Token(literal, case=case) for literal in literals.strip().split()]) if keywords and keywords.strip(): tokens.extend([ Keyword(keyword, case) for keyword in keywords.strip().split()]) super(Tokens, self).__init__(*tokens) def resolve_symbol(self, name, rules, token_registry): return Choice(*self.nested) # # The Tokeniser() breaks up input into tokens. # class Tokeniser(object): literals = None # dict, {"literal": token, ...} regex = None # object, re.compile() - compiled token recognisers re_groups = None # tuple, (int, ...) re_list = None # tuple, (Token(), ...) registry = None # object, TokenRegistry() that owns us unrecognised = None # object, The UnrecognisedToken() re_flags = re.DOTALL | re.MULTILINE ANCHOR_RE = re.compile(r'(?:[^[\\]|\\.|\[\^?\]?(?:\\.|[^]\\])*\])*\\[AZ]') BACKREF_RE = re.compile( r'(?:[^[\\]|\\[^0-9]|\[\^?\]?(?:\\.|[^]\\])*\])*(\\[0-9]+)') CAPTURE_RE = re.compile(r'[^[\\(]|\\.|\[\^?\]?(?:\\.|[^]\\])*\]|\(\?') def compile_tokens(self, token_registry, whitespace): self.registry = token_registry # # Whitespace must be a string. # if whitespace is not None: if not isinstance(whitespace, string_types): raise GrammarError("WHITESPACE must be a string") all_tokens = ( token for token in self.registry.values() if isinstance(token, Token)) key = lambda t: (to_str(t.literal), to_str(t.re)) all_tokens = sorted(all_tokens, key=key, reverse=True) patterns = [token for token in all_tokens if token.re is not None] self.literals = {} self.unrecognised = next( (t for t in all_tokens if t.re is None and t.literal is None), None) # # It's amazing what unit testing turns up. Would anybody really use # the inbuilt tokeniser without recognising a single token? 
# self.re_list = [] self.re_groups = [] if not patterns: pattern = "x(?<=y)" # An re that never matches else: # # Currently Python's re module returns the first re that matches # when given the sequence a|b|c. We always want it to match the # longest possible literal. In Python 2.7 putting the longest # literals first makes that happen. # # Backreferences are allowed, but since grammar writer has no idea # what order we will put them in we have to renumber them. # longest = lambda token: (token.literal is not None, -len(token.re)) ordered_patterns = [] backref_base = 0 for token in sorted(patterns, key=longest): base = backref_base backref_base += 1 token_re = '(?:()(?:%s))' % token.re self.re_groups.append(backref_base) self.re_list.append(token) backref_matches = tuple(self.BACKREF_RE.finditer(token_re)) for match in reversed(backref_matches): backref_no = int(match.group(1)[1:], 10) + backref_base token_re = "%s\\%d%s" % ( token_re[:match.start(1)], backref_no, token_re[match.end(1):]) backref_base = base + len(self.CAPTURE_RE.sub('', token_re)) ordered_patterns.append(token_re) pattern = '|'.join(ordered_patterns) self.regex = re.compile(pattern, self.re_flags) self.re_list = tuple(self.re_list) self.re_groups = tuple(self.re_groups) # # Gather all literals and checking there are no duplicates. # all_re = {} for token in all_tokens: # # Ensure the re isn't duplicated. # if token.re is not None: if token.re in all_re: msg = "Token's %r and %r define the same re" raise GrammarError(msg % (token, all_re[token.re])) all_re[token.re] = token if token.literal is not None: self.literals[token.literal] = token # # Ensure the literal is matched by exactly one Token. # This also ensures there are no duplicate literals. 
# matches = [] for re_tok in patterns: if self.ANCHOR_RE.match(re_tok.re): continue match = re.match(re_tok.re, token.literal) if match is not None: if (re_tok.literal is None or match.group() == token.literal): matches.append((re_tok, match)) if not matches: msg = "Keyword %r does not match any re" raise GrammarError(msg % (token,)) if len(matches) == 1: token.owner = matches[0][0] else: re_matches = [ m[0] for m in matches if m[0].literal is None] if re_matches: msg = ( "Literal token %s should be a Keyword " + "as it matches re token %s") res = ', '.join(str(match) for match in re_matches) raise GrammarError(msg % (token, res)) matches = [m for m in matches if m is not token] msg = "duplicate literal %r and %r" raise GrammarError(msg % (matches[0][0], token,)) if not any(m.group() == token.literal for t, m in matches): msg = "Token.re %r partially matches %r of Keyword %r" raise GrammarError(msg % (matches[0], match, token)) # # This generator is a filter. It takes a string generator as an argument # and generates tokens ready to be fed into the parser. The strings # returned by the generator are assumed to be entire tokens. # def tokeniser(self, input, whitespace=None): # # pos = [position_in_stream, line_number, column_number] # def update_position(data): ldata = len(data) pos[0] += ldata matches = list(re.finditer("(?:\n\r?|\r\n?)", data)) if not matches: pos[2] += ldata else: pos[1] += len(matches) pos[2] = ldata - matches[-1].end() + 1 pos = [0, 1, 1] # # Normalise the parameters. # iterator = iter((input,) if isinstance(input, string_types) else input) if whitespace == "": is_whitespace = lambda s: False last_whitespace = lambda s: len(s) else: spaces = " \f\n\r\t\v" if whitespace is None else whitespace is_whitespace = lambda s: not s.lstrip(spaces) trans = string_maketrans(spaces, spaces[0] * len(spaces)) last_whitespace = lambda s: s.translate(trans).rfind(spaces[0]) # # Loop until end of the stream. 
# cur_tok = next(iterator, None) cur_isstr = isinstance(cur_tok, string_types) nxt_tok = next(iterator, None) nxt_isstr = isinstance(nxt_tok, string_types) while cur_tok is not None: buf = "" # # Loop while the current token is a string. # while cur_isstr: # # If the next token isn't a string we must parse all of the # input, otherwise we only parse up to the last space. This # somewhat reduces the chance of tokens being truncated across # iterator boundaries. # last = last_whitespace(cur_tok) if nxt_isstr else len(cur_tok) if last == -1: buf += cur_tok else: last += len(buf) buf += cur_tok offset = 0 while offset < last: match = self.regex.search(buf, offset, last) if match is not None: start, end = match.span() else: if nxt_isstr: break start, end = last, last # # The only thing that can separate one token and the # next is whitespace. # if offset < start: in_between = buf[offset:start] if not is_whitespace(in_between): if self.unrecognised is None: msg = ( "Unrecognised token %r " + "at line %d column %d" ) raise TokenError( msg % (in_between, pos[1], pos[2]), in_between, pos[0], pos[1], pos[2]) yield ( self.unrecognised, in_between, pos[0], pos[1], pos[2]) update_position(in_between) if start == last: break # # Found some data that matches a token. Identify what # token it matches. # data = match.group() try: token = self.literals[data] except KeyError: try: token = self.literals[data.lower()] if token.case: token = None except KeyError: token = None if token is None: idx = match.group(*self.re_groups).index('') token = self.re_list[idx] if token.refine is not None: token = token.refine(self.registry, data) yield token, data, pos[0], pos[1], pos[2] update_position(data) offset = end buf = buf[offset:] cur_tok = nxt_tok cur_isstr = nxt_isstr nxt_tok = next(iterator, None) nxt_isstr = isinstance(nxt_tok, string_types) # # If we are given non-strings pass it straight on. 
# while cur_tok is not None and not cur_isstr: yield cur_tok cur_tok = nxt_tok cur_isstr = nxt_isstr nxt_tok = next(iterator, None) nxt_isstr = isinstance(nxt_tok, string_types) # # Meta class that does the work for a TokenRegistry. # class TokenRegistryMeta(type): def __new__(cls, name, bases, dct): registry = super(TokenRegistryMeta, cls).__new__(cls, name, bases, dct) if dct.get("__metaclass__", None) is not cls: registry.save_dicts() return registry # # Put your token definitions in a class that inherits from this one. # class TokenRegistry(dict): __dicts = None __metaclass__ = TokenRegistryMeta __tokeniser = None def __init__(self): for name, token_symbol in self.__class__.__dict__.items(): if isinstance(token_symbol, TokenSymbol): qualified_name = "%s.%s" % (self.__class__.__name__, name) token_symbol.set_name(qualified_name) self._resolve_token_(token_symbol) # # Save the registered token's dicts, as they may be overwritten # by the Grammar. This is called by the meta class, so it happens # before the Grammar has a chance to get it's fingers into the pie. # def save_dicts(cls): cls.__dicts = {} for token_symbol in cls.__dict__.values(): if isinstance(token_symbol, TokenSymbol): cls.__dicts[token_symbol] = token_symbol.dict token_symbol.dict = {} save_dicts = classmethod(save_dicts) # # Return the dict's of all registered tokens. # def restore_dicts(cls): for token_symbol, token_dict in cls.__dicts.items(): token_symbol.dict = token_dict del cls.__dicts restore_dicts = classmethod(restore_dicts) # # Resolve duplicate tokens. # def _resolve_token_(self, token_symbol): alias = self.get(token_symbol.name, None) if alias is None: alias = token_symbol super(TokenRegistry, self).__setitem__(alias.name, alias) if alias is not token_symbol: alias.merge(token_symbol) return alias # # Compile the re that recognises the tokens. 
# def compile_tokens(self, whitespace=None): self.__tokeniser = Tokeniser() self.__tokeniser.compile_tokens(self, whitespace) # # Return a generator for tokens in the grammar. # def tokeniser(self, input, whitespace=None): return self.__tokeniser.tokeniser(input, whitespace) def __setitem__(self, key, value): raise NotImplementedError() def __delitem__(self, key): raise NotImplementedError() TokenRegistry = python3_metaclass(TokenRegistry) # # A Token in the grammar. An instance of a TokenSymbol() defines one kind # of token. # class TokenSymbol(Symbol): __slots__ = ('name', 'named',) def __init__(self, name): super(TokenSymbol, self).__init__() self.name = name self.named = False self.first_set = frozenset((self,)) def resolve_symbol(self, name, rules, token_definitions): return token_definitions._resolve_token_(self) # # Absorb another token definition into this one. # def merge(self, other): msg = "Token %r doesn't support merging with token %r" raise GrammarError(msg % (self, other)) def compile_symbol(self, comp): return [(self,)] # # Set the name of this token. # def set_name(self, name): if self.name is None: self.name = name elif self.name != name: msg = "Can not rename token %r to %r" % (self.name, name) raise GrammarError(msg) self.named = True def __repr__(self): return str(self) def __str__(self): return str(self.name) # # Given a tuple returned by the tokeniser, return an English # description of where we are in the input stream. # def position(self, token_tuple): return None # # MetaToken's are used internally by the Parser. They are re-usable # by multiple grammar's. # class MetaToken(TokenSymbol): __slots__ = () def __init__(self, name): super(MetaToken, self).__init__(name) def __repr__(self): return str(self.name) def __str__(self): return self.name def position(self, token_tuple): return self.name # # A user generated token. 
#
class UserToken(TokenSymbol):
    __slots__ = ()

    def __init__(self, name=None):
        super(UserToken, self).__init__(name)

    #
    # A UserToken is only usable once a TokenRegistry has named it.
    #
    def resolve_symbol(self, name, rules, token_definitions):
        if self.name is None:
            msg = "A %s must be assigned a name using a %s" % (
                self.__class__.__name__, TokenRegistry.__name__)
            raise GrammarError(msg)
        return super(UserToken, self).resolve_symbol(
            name, rules, token_definitions)

    def merge(self, other):
        # Merging with anything but another UserToken is an error.
        if not isinstance(other, UserToken):
            super(UserToken, self).merge(other)


#
# A Token() built by the inbuilt tokeniser.  A token comes in two
# varieties:
#
#  - A token defined by a regular expression.
#  - A keyword, which is a special purposed token.
#
class Token(TokenSymbol):
    _RE = re                    # So "re" can be used as a parameter name
    KEYWORD = object()          # kind marker, see Keyword()
    UNRECOGNISED = object()     # kind marker, see UnrecognisedToken()
    __slots__ = ('case', 'literal', 'owner', 're', 'refine')

    def __init__(
            self, literal=None, re=None, case=None, kind=None, refine=None):
        #
        # Validate the combination of arguments allowed for each kind.
        #
        if kind is self.KEYWORD:
            if re is not None:
                raise GrammarError("A keyword must not have an re")
            if refine is not None:
                raise GrammarError("A keyword can not be refined")
            if literal is None:
                raise GrammarError("A keyword must have a literal")
        elif kind is self.UNRECOGNISED:
            if literal is not None or re is not None:
                msg = "The UnrecognisedToken can't have a literal or re"
                raise GrammarError(msg)
        elif kind is not None:
            raise GrammarError("Unrecognised Token kind %r" % (kind,))
        else:
            if literal is None and re is None:
                raise GrammarError("A Token must have a literal or an re")
            if literal is not None:
                if re is not None:
                    msg = "A Token can't have both a literal and a re"
                    raise GrammarError(msg)
                if refine is not None:
                    raise GrammarError("A literal can't be refined")
        self.literal = literal
        self.re = re
        self.named = False
        super(Token, self).__init__(str(self))
        self.case = case if case is not None else True
        self.refine = refine
        self.owner = self
        if kind is not None:
            pass
        elif re is not None:
            # Fail early if the grammar writer supplied an invalid re.
            self._RE.compile(re)
        else:
            # A plain literal is recognised via its escaped re.
            self.re = self._RE.escape(self.literal)
            if not self.case and self.literal.lower() != self.literal.upper():
                # Case insensitive: turn each letter into a [Xx] class.
                def either_case(match):
                    char = match.group()
                    return "[%c%c]" % (char.upper(), char.lower())
                self.re = self._RE.sub("[a-zA-Z]", either_case, self.re)

    def __repr__(self):
        if self.literal is not None:
            result = repr(self.literal)
        elif self.re is not None:
            result = self.repr_re()
        else:
            result = "%s()" % (UnrecognisedToken.__name__,)
        if self.named:
            return '%s=%s' % (self, result,)
        return result

    def __str__(self):
        if self.named:
            return super(Token, self).__str__()
        if self.literal:
            return repr(self.literal)
        if self.re is not None:
            return self.repr_re()
        return ''

    def repr_re(self):
        return "/%s/" % (repr(self.re)[1:-1].replace("/", "\\/"),)

    def merge(self, other):
        #
        # We can only merge with our own kind.
        #
        if not isinstance(other, Token):
            TokenSymbol.merge(self, other)
        if self.refine is None:
            self.refine = other.refine
        elif other.refine is not None and other.refine != self.refine:
            msg = "Token %r defined with conflicting refine's %r and %r"
            raise GrammarError(msg % (self.name, self.refine, other.refine))

    #
    # Set the name of this token.
    #
    def set_name(self, name):
        if self.named and self.name != name:
            msg = "Can not rename token %r to %r" % (self.name, name)
            raise GrammarError(msg)
        self.name = name
        self.named = True

    #
    # Our tokeniser puts the line and column in the tuple.
    #
    def position(self, token_tuple):
        if len(token_tuple) < 5 or None in token_tuple[3:5]:
            return super(Token, self).position(token_tuple)
        return "line %d column %d" % token_tuple[3:5]


#
# The Unrecognised Token.
#
def UnrecognisedToken():
    return Token(kind=Token.UNRECOGNISED)


#
# A Keyword is literal without a regexp, ie it must match an existing regexp.
#
def Keyword(literal, case=None):
    return Token(literal, case=case, kind=Token.KEYWORD)


#
# How we handle non-Symbol types in Symbol expression.
#
Symbol.CAST = {
    str: Token,
    tuple: lambda t: Prio(*t),
}
if sys.version_info < (3,):
    Symbol.CAST[unicode] = Token


#
# One production of the grammar: lhs = rhs[0] rhs[1] ...
#
class Production(object):
    __slots__ = ("lhs", "rhs")

    def __init__(self, lhs, rhs):
        self.lhs = lhs
        self.rhs = tuple(rhs)

    def __repr__(self):
        return (
            "%s = %s" % (self.lhs, " ".join(str(elm) for elm in self.rhs),))


#
# Parser Constructor.
#
class GrammarMeta(type):
    def __new__(cls, name, bases, dct):
        if "_parser_" in dct:
            raise GrammarError("_parser_ is reserved in Gramma's.")
        # Only subclasses (not the Grammar base, which carries the
        # __metaclass__ attribute) get a Parser built for them.
        if dct.get("__metaclass__", None) is not cls:
            dct["_parser_"] = Parser(name, dct)
        return super(GrammarMeta, cls).__new__(cls, name, bases, dct)


#
# The base class for Parsers.  Each classmethod below simply forwards to
# the module level function of the same name (inside a method body the
# bare name resolves to the global function, not the method).
#
class Grammar(object):
    __metaclass__ = GrammarMeta

    def compile_grammar(cls):
        compile_grammar(cls)
    compile_grammar = classmethod(compile_grammar)

    def epoch_symbol(cls):
        return epoch_symbol(cls)
    epoch_symbol = classmethod(epoch_symbol)

    def parse(cls, input, tree_factory=None, on_error=None, log=None):
        return parse(cls, input, tree_factory, on_error, log)
    parse = classmethod(parse)

    def pre_compile_grammar(cls, pre_compiled=None):
        return pre_compile_grammar(cls, pre_compiled)
    pre_compile_grammar = classmethod(pre_compile_grammar)

    def repr_grammar(cls):
        return repr_grammar(cls)
    repr_grammar = classmethod(repr_grammar)

    def repr_parse_table(cls, state=None):
        return repr_parse_table(cls, state)
    repr_parse_table = classmethod(repr_parse_table)

    def repr_parse_tree(cls, tree, indent=None):
        return repr_parse_tree(tree, indent)
    repr_parse_tree = classmethod(repr_parse_tree)

    def repr_productions(cls):
        return repr_productions(cls)
    repr_productions = classmethod(repr_productions)

    def unused_rules(cls):
        return unused_rules(cls)
    unused_rules = classmethod(unused_rules)
Grammar = python3_metaclass(Grammar)


#
# Module level equivalents of the Grammar classmethods.  They all delegate
# to the Parser instance the GrammarMeta stored in _parser_.
#
def compile_grammar(grammar):
    grammar._parser_.compile_grammar()


def epoch_symbol(grammar):
    return grammar._parser_.epoch_symbol


def parse(grammar, input, tree_factory=None, on_error=None, log=None):
    return grammar._parser_.parse(input, tree_factory, on_error, log)


def pre_compile_grammar(grammar, pre_compiled=None):
    return grammar._parser_.pre_compile_grammar(grammar, pre_compiled)


def repr_grammar(grammar):
    return grammar._parser_.repr_grammar()


def repr_parse_table(grammar, state=None):
    return grammar._parser_.repr_parse_table(state)


def repr_productions(grammar):
    return grammar._parser_.repr_productions()


def unused_rules(grammar):
    return grammar._parser_.unused_symbols


#
# Pretty print a parse tree.  "indent" is the per-level indent string
# (False means everything on one line).
#
def repr_parse_tree(tree, indent=None):
    def indent_tree(tree, padding):
        #
        # Append tokens and empty productions to the current line.
        #
        def extend_line(line, prod):
            while True:
                while prod and isinstance(prod[0][0], TokenSymbol):
                    line.append(repr_token(prod.popleft()))
                if not prod or len(prod[0]) != 1:
                    break
                line.append("(%s)" % (prod.popleft()[0],))
        repr_token = (
            lambda t: repr(t[1]) if isinstance(t[0], Token) else str(t[0]))
        #
        # Were we passed a token?
        #
        if isinstance(tree[0], TokenSymbol):
            return [repr_token(tree)]
        #
        # List singleton productions (ie productions of the form
        # rule1 = rule2) on the same line.
        #
        line, result = [], []
        nesting = 1
        while len(tree) == 2 and not isinstance(tree[1][0], TokenSymbol):
            line.append("(%s" % (tree[0],))
            nesting += 1
            tree = tree[1]
        line.append("(%s" % (tree[0],))
        #
        # If the remainder of the symbols are tokens just list them as well.
        #
        prod = collections.deque(tree[1:])
        extend_line(line, prod)
        if not prod:
            result.append('%s%s' % (padding, ' '.join(line)))
        else:
            #
            # If we have rule1 = ... ^ rule2, then list them as:
            #
            #   (rule1 ... rule2
            #     rule2-child0
            #     ...)
            #
            # If we have rule1 = ... ^ rule2 ... then list them as:
            #
            #   (rule1 ...
            #     (rule2
            #       ...)
            #     ...)
            #
            if len(prod) > 1:
                result.append('%s%s' % (padding, ' '.join(line)))
            elif (len(prod) == 1 and
                    all(isinstance(t[0], TokenSymbol) for t in prod[0][1:])):
                last = prod.popleft()
                line.append("(%s" % last[0])
                line.extend(repr_token(t) for t in last[1:])
                line[-1] += ")"
                result.append('%s%s' % (padding, ' '.join(line)))
            else:
                line.append('(%s' % (prod[0][0],))
                result.append('%s%s' % (padding, ' '.join(line)))
                prod = collections.deque(prod[0][1:])
                if len(prod[0]) == 1 or isinstance(prod[0][0], TokenSymbol):
                    line = []
                    extend_line(line, prod)
                    result.append('%s%s' % (padding + indent, ' '.join(line)))
        while prod:
            result.extend(indent_tree(prod.popleft(), padding + indent))
            line = [result[-1]]
            extend_line(line, prod)
            result[-1] = ' '.join(line)
        result[-1] += ')' * nesting
        return result
    eol = ' ' if indent is False else '\n'
    # NOTE(review): the default indent string below may have been collapsed
    # by formatting - upstream may use a two space default; confirm.
    indent = " " if indent is None else indent or ""
    return eol.join(indent_tree(tree, ""))

# vim: set shiftwidth=4 expandtab softtabstop=8 :