Author: Carl Friedrich Bolz-Tereick <cfb...@gmx.de> Branch: Changeset: r94210:045483a22d66 Date: 2018-04-01 12:36 +0200 http://bitbucket.org/pypy/pypy/changeset/045483a22d66/
Log: merge pyparser-improvements: - speed up python parser - create slightly more helpful SyntaxError message (only in very clear and obvious cases) diff --git a/lib-python/2.7/test/test_genexps.py b/lib-python/2.7/test/test_genexps.py --- a/lib-python/2.7/test/test_genexps.py +++ b/lib-python/2.7/test/test_genexps.py @@ -87,7 +87,7 @@ >>> dict(a = i for i in xrange(10)) Traceback (most recent call last): ... - SyntaxError: invalid syntax + SyntaxError: invalid syntax (expected ')') Verify that parenthesis are required when used as a keyword argument value diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst --- a/pypy/doc/whatsnew-head.rst +++ b/pypy/doc/whatsnew-head.rst @@ -86,3 +86,8 @@ that jit hooks are currently not enabled at all. in that case, the list of ops does not have to be created in the case of the on_abort hook (which is expensive). + + +.. branch: pyparser-improvements + +Improve speed of Python parser, improve ParseError messages slightly. diff --git a/pypy/interpreter/pyparser/metaparser.py b/pypy/interpreter/pyparser/metaparser.py --- a/pypy/interpreter/pyparser/metaparser.py +++ b/pypy/interpreter/pyparser/metaparser.py @@ -147,8 +147,10 @@ for label, next in state.arcs.iteritems(): arcs.append((self.make_label(gram, label), dfa.index(next))) states.append((arcs, state.is_final)) - gram.dfas.append((states, self.make_first(gram, name))) - assert len(gram.dfas) - 1 == gram.symbol_ids[name] - 256 + symbol_id = gram.symbol_ids[name] + dfa = parser.DFA(symbol_id, states, self.make_first(gram, name)) + gram.dfas.append(dfa) + assert len(gram.dfas) - 1 == symbol_id - 256 gram.start = gram.symbol_ids[self.start_symbol] return gram @@ -162,6 +164,13 @@ else: gram.labels.append(gram.symbol_ids[label]) gram.symbol_to_label[label] = label_index + first = self.first[label] + if len(first) == 1: + first, = first + if not first[0].isupper(): + first = first.strip("\"'") + assert label_index not in gram.token_to_error_string + 
gram.token_to_error_string[label_index] = first return label_index elif label.isupper(): token_index = gram.TOKENS[label] @@ -183,7 +192,7 @@ else: gram.labels.append(gram.KEYWORD_TOKEN) gram.keyword_ids[value] = label_index - return label_index + result = label_index else: try: token_index = gram.OPERATOR_MAP[value] @@ -194,7 +203,10 @@ else: gram.labels.append(token_index) gram.token_ids[token_index] = label_index - return label_index + result = label_index + assert result not in gram.token_to_error_string + gram.token_to_error_string[result] = value + return result def make_first(self, gram, name): original_firsts = self.first[name] diff --git a/pypy/interpreter/pyparser/parser.py b/pypy/interpreter/pyparser/parser.py --- a/pypy/interpreter/pyparser/parser.py +++ b/pypy/interpreter/pyparser/parser.py @@ -1,6 +1,7 @@ """ A CPython inspired RPython parser. """ +from rpython.rlib.objectmodel import not_rpython class Grammar(object): @@ -16,6 +17,7 @@ self.symbol_names = {} self.symbol_to_label = {} self.keyword_ids = {} + self.token_to_error_string = {} self.dfas = [] self.labels = [0] self.token_ids = {} @@ -41,6 +43,27 @@ pass return True +class DFA(object): + def __init__(self, symbol_id, states, first): + self.symbol_id = symbol_id + self.states = states + self.first = self._first_to_string(first) + + def could_match_token(self, label_index): + pos = label_index >> 3 + bit = 1 << (label_index & 0b111) + return bool(ord(self.first[label_index >> 3]) & bit) + + @staticmethod + @not_rpython + def _first_to_string(first): + l = sorted(first.keys()) + b = bytearray(32) + for label_index in l: + pos = label_index >> 3 + bit = 1 << (label_index & 0b111) + b[pos] |= bit + return str(b) class Node(object): @@ -127,14 +150,17 @@ class Nonterminal(AbstractNonterminal): __slots__ = ("_children", ) - def __init__(self, type, children): + def __init__(self, type, children=None): Node.__init__(self, type) + if children is None: + children = [] self._children = children def 
__repr__(self): return "Nonterminal(type=%s, children=%r)" % (self.type, self._children) def get_child(self, i): + assert self._children is not None return self._children[i] def num_children(self): @@ -168,7 +194,7 @@ class ParseError(Exception): def __init__(self, msg, token_type, value, lineno, column, line, - expected=-1): + expected=-1, expected_str=None): self.msg = msg self.token_type = token_type self.value = value @@ -176,17 +202,41 @@ self.column = column self.line = line self.expected = expected + self.expected_str = expected_str def __str__(self): return "ParserError(%s, %r)" % (self.token_type, self.value) +class StackEntry(object): + def __init__(self, next, dfa, state): + self.next = next + self.dfa = dfa + self.state = state + self.node = None + + def push(self, dfa, state): + return StackEntry(self, dfa, state) + + def pop(self): + return self.next + + def node_append_child(self, child): + node = self.node + if node is None: + self.node = Nonterminal1(self.dfa.symbol_id, child) + elif isinstance(node, Nonterminal1): + newnode = self.node = Nonterminal( + self.dfa.symbol_id, [node._child, child]) + else: + self.node.append_child(child) + + class Parser(object): def __init__(self, grammar): self.grammar = grammar self.root = None - self.stack = None def prepare(self, start=-1): """Setup the parser for parsing. 
@@ -196,16 +246,15 @@ if start == -1: start = self.grammar.start self.root = None - current_node = Nonterminal(start, []) - self.stack = [] - self.stack.append((self.grammar.dfas[start - 256], 0, current_node)) + self.stack = StackEntry(None, self.grammar.dfas[start - 256], 0) def add_token(self, token_type, value, lineno, column, line): label_index = self.classify(token_type, value, lineno, column, line) sym_id = 0 # for the annotator while True: - dfa, state_index, node = self.stack[-1] - states, first = dfa + dfa = self.stack.dfa + state_index = self.stack.state + states = dfa.states arcs, is_accepting = states[state_index] for i, next_state in arcs: sym_id = self.grammar.labels[i] @@ -217,16 +266,17 @@ # the stack. while state[1] and not state[0]: self.pop() - if not self.stack: + if self.stack is None: # Parsing is done. return True - dfa, state_index, node = self.stack[-1] - state = dfa[0][state_index] + dfa = self.stack.dfa + state_index = self.stack.state + state = dfa.states[state_index] return False elif sym_id >= 256: sub_node_dfa = self.grammar.dfas[sym_id - 256] # Check if this token can start a child node. - if label_index in sub_node_dfa[1]: + if sub_node_dfa.could_match_token(label_index): self.push(sub_node_dfa, next_state, sym_id, lineno, column) break @@ -235,7 +285,7 @@ # state is accepting, it's invalid input. if is_accepting: self.pop() - if not self.stack: + if self.stack is None: raise ParseError("too much input", token_type, value, lineno, column, line) else: @@ -243,10 +293,13 @@ # error. 
if len(arcs) == 1: expected = sym_id + expected_str = self.grammar.token_to_error_string.get( + arcs[0][0], None) else: expected = -1 + expected_str = None raise ParseError("bad input", token_type, value, lineno, - column, line, expected) + column, line, expected, expected_str) def classify(self, token_type, value, lineno, column, line): """Find the label for a token.""" @@ -262,26 +315,22 @@ def shift(self, next_state, token_type, value, lineno, column): """Shift a non-terminal and prepare for the next state.""" - dfa, state, node = self.stack[-1] new_node = Terminal(token_type, value, lineno, column) - node.append_child(new_node) - self.stack[-1] = (dfa, next_state, node) + self.stack.node_append_child(new_node) + self.stack.state = next_state def push(self, next_dfa, next_state, node_type, lineno, column): """Push a terminal and adjust the current state.""" - dfa, state, node = self.stack[-1] - new_node = Nonterminal(node_type, []) - self.stack[-1] = (dfa, next_state, node) - self.stack.append((next_dfa, 0, new_node)) + self.stack.state = next_state + self.stack = self.stack.push(next_dfa, 0) def pop(self): """Pop an entry off the stack and make its node a child of the last.""" - dfa, state, node = self.stack.pop() + top = self.stack + self.stack = top.pop() + node = top.node + assert node is not None if self.stack: - # we are now done with node, so we can store it more efficiently if - # it has just one child - if node.num_children() == 1: - node = Nonterminal1(node.type, node.get_child(0)) - self.stack[-1][2].append_child(node) + self.stack.node_append_child(node) else: self.root = node diff --git a/pypy/interpreter/pyparser/pyparse.py b/pypy/interpreter/pyparser/pyparse.py --- a/pypy/interpreter/pyparser/pyparse.py +++ b/pypy/interpreter/pyparser/pyparse.py @@ -132,7 +132,11 @@ w_message = space.str(e.get_w_value(space)) raise error.SyntaxError(space.text_w(w_message)) raise + if enc is not None: + compile_info.encoding = enc + return self._parse(textsrc, 
compile_info) + def _parse(self, textsrc, compile_info): flags = compile_info.flags # The tokenizer is very picky about how it wants its input. @@ -181,6 +185,9 @@ else: new_err = error.SyntaxError msg = "invalid syntax" + if e.expected_str is not None: + msg += " (expected '%s')" % e.expected_str + raise new_err(msg, e.lineno, e.column, e.line, compile_info.filename) else: @@ -188,6 +195,4 @@ finally: # Avoid hanging onto the tree. self.root = None - if enc is not None: - compile_info.encoding = enc return tree diff --git a/pypy/interpreter/pyparser/test/targetparse.py b/pypy/interpreter/pyparser/test/targetparse.py new file mode 100644 --- /dev/null +++ b/pypy/interpreter/pyparser/test/targetparse.py @@ -0,0 +1,39 @@ +import sys +import os +ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) +print ROOT +sys.path.insert(0, str(ROOT)) +import time +from pypy.interpreter.pyparser import pyparse + + + +with file("../../../rpython/rlib/unicodedata/unicodedb_5_2_0.py") as f: + s = f.read() + +class FakeSpace(object): + pass + +fakespace = FakeSpace() + +def bench(title): + a = time.clock() + info = pyparse.CompileInfo("<string>", "exec") + parser = pyparse.PythonParser(fakespace) + tree = parser._parse(s, info) + b = time.clock() + print title, (b-a) + + +def entry_point(argv): + bench("foo") + + return 0 + +# _____ Define and setup target ___ + +def target(*args): + return entry_point, None + +if __name__ == '__main__': + entry_point(sys.argv) diff --git a/pypy/interpreter/pyparser/test/test_metaparser.py b/pypy/interpreter/pyparser/test/test_metaparser.py --- a/pypy/interpreter/pyparser/test/test_metaparser.py +++ b/pypy/interpreter/pyparser/test/test_metaparser.py @@ -34,8 +34,8 @@ assert len(g.dfas) == 1 eval_sym = g.symbol_ids["eval"] assert g.start == eval_sym - states, first = g.dfas[eval_sym - 256] - assert states == [([(1, 1)], False), ([], True)] + dfa = g.dfas[eval_sym - 256] + assert dfa.states == [([(1, 
1)], False), ([], True)] assert g.labels[0] == 0 def test_load_python_grammars(self): @@ -51,7 +51,7 @@ def test_items(self): g = self.gram_for("foo: NAME STRING OP '+'") assert len(g.dfas) == 1 - states = g.dfas[g.symbol_ids["foo"] - 256][0] + states = g.dfas[g.symbol_ids["foo"] - 256].states last = states[0][0][0][1] for state in states[1:-1]: assert last < state[0][0][1] diff --git a/pypy/interpreter/pyparser/test/test_parser.py b/pypy/interpreter/pyparser/test/test_parser.py --- a/pypy/interpreter/pyparser/test/test_parser.py +++ b/pypy/interpreter/pyparser/test/test_parser.py @@ -7,6 +7,12 @@ from pypy.interpreter.pyparser.test.test_metaparser import MyGrammar +def test_char_set(): + first = {5: None, 9: None, 100: None, 255:None} + p = parser.DFA(None, None, first) + for i in range(256): + assert p.could_match_token(i) == (i in first) + class SimpleParser(parser.Parser): def parse(self, input): @@ -55,8 +61,7 @@ n = parser.Terminal(tp, value, 0, 0) else: tp = gram.symbol_ids[data[0]] - children = [] - n = parser.Nonterminal(tp, children) + n = parser.Nonterminal(tp) new_indent = count_indent(line) if new_indent >= last_indent: if new_indent == last_indent and node_stack: @@ -291,3 +296,37 @@ NEWLINE ENDMARKER""" assert tree_from_string(expected, gram) == p.parse("hi 42 end") + + + def test_optimized_terminal(self): + gram = """foo: bar baz 'end' NEWLINE ENDMARKER +bar: NAME +baz: NUMBER +""" + p, gram = self.parser_for(gram, False) + expected = """ + foo + bar + NAME "a_name" + baz + NUMBER "42" + NAME "end" + NEWLINE + ENDMARKER""" + input = "a_name 42 end" + tree = p.parse(input) + assert tree_from_string(expected, gram) == tree + assert isinstance(tree, parser.Nonterminal) + assert isinstance(tree.get_child(0), parser.Nonterminal1) + assert isinstance(tree.get_child(1), parser.Nonterminal1) + + + def test_error_string(self): + p, gram = self.parser_for( + "foo: 'if' NUMBER '+' NUMBER" + ) + info = py.test.raises(parser.ParseError, p.parse, "if 42") + 
assert info.value.expected_str is None + info = py.test.raises(parser.ParseError, p.parse, "if 42 42") + assert info.value.expected_str == '+' + diff --git a/pypy/interpreter/pyparser/test/test_pyparse.py b/pypy/interpreter/pyparser/test/test_pyparse.py --- a/pypy/interpreter/pyparser/test/test_pyparse.py +++ b/pypy/interpreter/pyparser/test/test_pyparse.py @@ -165,3 +165,11 @@ for linefeed in ["\r\n","\r"]: tree = self.parse(fmt % linefeed) assert expected_tree == tree + + def test_error_forgotten_chars(self): + info = py.test.raises(SyntaxError, self.parse, "if 1\n print 4") + assert "(expected ':')" in info.value.msg + info = py.test.raises(SyntaxError, self.parse, "for i in range(10)\n print i") + assert "(expected ':')" in info.value.msg + info = py.test.raises(SyntaxError, self.parse, "def f:\n print 1") + assert "(expected '(')" in info.value.msg diff --git a/pypy/module/parser/pyparser.py b/pypy/module/parser/pyparser.py --- a/pypy/module/parser/pyparser.py +++ b/pypy/module/parser/pyparser.py @@ -152,7 +152,7 @@ # Raise an exception now and be done with it. raise parser_error(space, w_tuple, "Illegal syntax-tree; cannot start with terminal symbol.") - node = pyparse.parser.Nonterminal(type, []) + node = pyparse.parser.Nonterminal(type) build_node_children(space, w_tuple, node, node_state) return node @@ -171,7 +171,7 @@ strn = space.text_w(w_obj) child = pyparse.parser.Terminal(type, strn, node_state.lineno, 0) else: - child = pyparse.parser.Nonterminal(type, []) + child = pyparse.parser.Nonterminal(type) node.append_child(child) if type >= 256: # Nonterminal node build_node_children(space, w_elem, child, node_state) @@ -187,8 +187,7 @@ raise parse_error(space, "Unrecognized node type %d."
% type) dfa = parser.grammar.dfas[type] # Run the DFA for this nonterminal - states, first = dfa - arcs, is_accepting = states[0] + arcs, is_accepting = dfa.states[0] for pos in range(tree.num_children()): ch = tree.get_child(pos) for i, next_state in arcs: @@ -198,7 +197,7 @@ if ch.type >= 256: validate_node(space, ch) # Update the state, and move on to the next child. - arcs, is_accepting = states[next_state] + arcs, is_accepting = dfa.states[next_state] break else: raise parse_error(space, "Illegal node") _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit