Author: Carl Friedrich Bolz-Tereick <cfb...@gmx.de>
Branch: 
Changeset: r94210:045483a22d66
Date: 2018-04-01 12:36 +0200
http://bitbucket.org/pypy/pypy/changeset/045483a22d66/

Log:    merge pyparser-improvements:

        - speed up python parser
        - create slightly more helpful SyntaxError message (only in very clear
        and obvious cases)

diff --git a/lib-python/2.7/test/test_genexps.py 
b/lib-python/2.7/test/test_genexps.py
--- a/lib-python/2.7/test/test_genexps.py
+++ b/lib-python/2.7/test/test_genexps.py
@@ -87,7 +87,7 @@
     >>> dict(a = i for i in xrange(10))
     Traceback (most recent call last):
        ...
-    SyntaxError: invalid syntax
+    SyntaxError: invalid syntax (expected ')')
 
 Verify that parenthesis are required when used as a keyword argument value
 
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -86,3 +86,8 @@
 that jit hooks are currently not enabled at all. in that case, the list of ops
 does not have to be created in the case of the on_abort hook (which is
 expensive).
+
+
+.. branch: pyparser-improvements
+
+Improve speed of Python parser, improve ParseError messages slightly.
diff --git a/pypy/interpreter/pyparser/metaparser.py 
b/pypy/interpreter/pyparser/metaparser.py
--- a/pypy/interpreter/pyparser/metaparser.py
+++ b/pypy/interpreter/pyparser/metaparser.py
@@ -147,8 +147,10 @@
                 for label, next in state.arcs.iteritems():
                     arcs.append((self.make_label(gram, label), 
dfa.index(next)))
                 states.append((arcs, state.is_final))
-            gram.dfas.append((states, self.make_first(gram, name)))
-            assert len(gram.dfas) - 1 == gram.symbol_ids[name] - 256
+            symbol_id = gram.symbol_ids[name]
+            dfa = parser.DFA(symbol_id, states, self.make_first(gram, name))
+            gram.dfas.append(dfa)
+            assert len(gram.dfas) - 1 == symbol_id - 256
         gram.start = gram.symbol_ids[self.start_symbol]
         return gram
 
@@ -162,6 +164,13 @@
                 else:
                     gram.labels.append(gram.symbol_ids[label])
                     gram.symbol_to_label[label] = label_index
+                    first = self.first[label]
+                    if len(first) == 1:
+                        first, = first
+                        if not first[0].isupper():
+                            first = first.strip("\"'")
+                            assert label_index not in 
gram.token_to_error_string
+                            gram.token_to_error_string[label_index] = first
                     return label_index
             elif label.isupper():
                 token_index = gram.TOKENS[label]
@@ -183,7 +192,7 @@
                 else:
                     gram.labels.append(gram.KEYWORD_TOKEN)
                     gram.keyword_ids[value] = label_index
-                    return label_index
+                    result = label_index
             else:
                 try:
                     token_index = gram.OPERATOR_MAP[value]
@@ -194,7 +203,10 @@
                 else:
                     gram.labels.append(token_index)
                     gram.token_ids[token_index] = label_index
-                    return label_index
+                    result = label_index
+            assert result not in gram.token_to_error_string
+            gram.token_to_error_string[result] = value
+            return result
 
     def make_first(self, gram, name):
         original_firsts = self.first[name]
diff --git a/pypy/interpreter/pyparser/parser.py 
b/pypy/interpreter/pyparser/parser.py
--- a/pypy/interpreter/pyparser/parser.py
+++ b/pypy/interpreter/pyparser/parser.py
@@ -1,6 +1,7 @@
 """
 A CPython inspired RPython parser.
 """
+from rpython.rlib.objectmodel import not_rpython
 
 
 class Grammar(object):
@@ -16,6 +17,7 @@
         self.symbol_names = {}
         self.symbol_to_label = {}
         self.keyword_ids = {}
+        self.token_to_error_string = {}
         self.dfas = []
         self.labels = [0]
         self.token_ids = {}
@@ -41,6 +43,27 @@
             pass
         return True
 
+class DFA(object):
+    def __init__(self, symbol_id, states, first):
+        self.symbol_id = symbol_id
+        self.states = states
+        self.first = self._first_to_string(first)
+
+    def could_match_token(self, label_index):
+        pos = label_index >> 3
+        bit = 1 << (label_index & 0b111)
+        return bool(ord(self.first[label_index >> 3]) & bit)
+
+    @staticmethod
+    @not_rpython
+    def _first_to_string(first):
+        l = sorted(first.keys())
+        b = bytearray(32)
+        for label_index in l:
+            pos = label_index >> 3
+            bit = 1 << (label_index & 0b111)
+            b[pos] |= bit
+        return str(b)
 
 class Node(object):
 
@@ -127,14 +150,17 @@
 
 class Nonterminal(AbstractNonterminal):
     __slots__ = ("_children", )
-    def __init__(self, type, children):
+    def __init__(self, type, children=None):
         Node.__init__(self, type)
+        if children is None:
+            children = []
         self._children = children
 
     def __repr__(self):
         return "Nonterminal(type=%s, children=%r)" % (self.type, 
self._children)
 
     def get_child(self, i):
+        assert self._children is not None
         return self._children[i]
 
     def num_children(self):
@@ -168,7 +194,7 @@
 class ParseError(Exception):
 
     def __init__(self, msg, token_type, value, lineno, column, line,
-                 expected=-1):
+                 expected=-1, expected_str=None):
         self.msg = msg
         self.token_type = token_type
         self.value = value
@@ -176,17 +202,41 @@
         self.column = column
         self.line = line
         self.expected = expected
+        self.expected_str = expected_str
 
     def __str__(self):
         return "ParserError(%s, %r)" % (self.token_type, self.value)
 
 
+class StackEntry(object):
+    def __init__(self, next, dfa, state):
+        self.next = next
+        self.dfa = dfa
+        self.state = state
+        self.node = None
+
+    def push(self, dfa, state):
+        return StackEntry(self, dfa, state)
+
+    def pop(self):
+        return self.next
+
+    def node_append_child(self, child):
+        node = self.node
+        if node is None:
+            self.node = Nonterminal1(self.dfa.symbol_id, child)
+        elif isinstance(node, Nonterminal1):
+            newnode = self.node = Nonterminal(
+                    self.dfa.symbol_id, [node._child, child])
+        else:
+            self.node.append_child(child)
+
+
 class Parser(object):
 
     def __init__(self, grammar):
         self.grammar = grammar
         self.root = None
-        self.stack = None
 
     def prepare(self, start=-1):
         """Setup the parser for parsing.
@@ -196,16 +246,15 @@
         if start == -1:
             start = self.grammar.start
         self.root = None
-        current_node = Nonterminal(start, [])
-        self.stack = []
-        self.stack.append((self.grammar.dfas[start - 256], 0, current_node))
+        self.stack = StackEntry(None, self.grammar.dfas[start - 256], 0)
 
     def add_token(self, token_type, value, lineno, column, line):
         label_index = self.classify(token_type, value, lineno, column, line)
         sym_id = 0 # for the annotator
         while True:
-            dfa, state_index, node = self.stack[-1]
-            states, first = dfa
+            dfa = self.stack.dfa
+            state_index = self.stack.state
+            states = dfa.states
             arcs, is_accepting = states[state_index]
             for i, next_state in arcs:
                 sym_id = self.grammar.labels[i]
@@ -217,16 +266,17 @@
                     # the stack.
                     while state[1] and not state[0]:
                         self.pop()
-                        if not self.stack:
+                        if self.stack is None:
                             # Parsing is done.
                             return True
-                        dfa, state_index, node = self.stack[-1]
-                        state = dfa[0][state_index]
+                        dfa = self.stack.dfa
+                        state_index = self.stack.state
+                        state = dfa.states[state_index]
                     return False
                 elif sym_id >= 256:
                     sub_node_dfa = self.grammar.dfas[sym_id - 256]
                     # Check if this token can start a child node.
-                    if label_index in sub_node_dfa[1]:
+                    if sub_node_dfa.could_match_token(label_index):
                         self.push(sub_node_dfa, next_state, sym_id, lineno,
                                   column)
                         break
@@ -235,7 +285,7 @@
                 # state is accepting, it's invalid input.
                 if is_accepting:
                     self.pop()
-                    if not self.stack:
+                    if self.stack is None:
                         raise ParseError("too much input", token_type, value,
                                          lineno, column, line)
                 else:
@@ -243,10 +293,13 @@
                     # error.
                     if len(arcs) == 1:
                         expected = sym_id
+                        expected_str = self.grammar.token_to_error_string.get(
+                                arcs[0][0], None)
                     else:
                         expected = -1
+                        expected_str = None
                     raise ParseError("bad input", token_type, value, lineno,
-                                     column, line, expected)
+                                     column, line, expected, expected_str)
 
     def classify(self, token_type, value, lineno, column, line):
         """Find the label for a token."""
@@ -262,26 +315,22 @@
 
     def shift(self, next_state, token_type, value, lineno, column):
         """Shift a non-terminal and prepare for the next state."""
-        dfa, state, node = self.stack[-1]
         new_node = Terminal(token_type, value, lineno, column)
-        node.append_child(new_node)
-        self.stack[-1] = (dfa, next_state, node)
+        self.stack.node_append_child(new_node)
+        self.stack.state = next_state
 
     def push(self, next_dfa, next_state, node_type, lineno, column):
         """Push a terminal and adjust the current state."""
-        dfa, state, node = self.stack[-1]
-        new_node = Nonterminal(node_type, [])
-        self.stack[-1] = (dfa, next_state, node)
-        self.stack.append((next_dfa, 0, new_node))
+        self.stack.state = next_state
+        self.stack = self.stack.push(next_dfa, 0)
 
     def pop(self):
         """Pop an entry off the stack and make its node a child of the last."""
-        dfa, state, node = self.stack.pop()
+        top = self.stack
+        self.stack = top.pop()
+        node = top.node
+        assert node is not None
         if self.stack:
-            # we are now done with node, so we can store it more efficiently if
-            # it has just one child
-            if node.num_children() == 1:
-                node = Nonterminal1(node.type, node.get_child(0))
-            self.stack[-1][2].append_child(node)
+            self.stack.node_append_child(node)
         else:
             self.root = node
diff --git a/pypy/interpreter/pyparser/pyparse.py 
b/pypy/interpreter/pyparser/pyparse.py
--- a/pypy/interpreter/pyparser/pyparse.py
+++ b/pypy/interpreter/pyparser/pyparse.py
@@ -132,7 +132,11 @@
                         w_message = space.str(e.get_w_value(space))
                         raise error.SyntaxError(space.text_w(w_message))
                     raise
+        if enc is not None:
+            compile_info.encoding = enc
+        return self._parse(textsrc, compile_info)
 
+    def _parse(self, textsrc, compile_info):
         flags = compile_info.flags
 
         # The tokenizer is very picky about how it wants its input.
@@ -181,6 +185,9 @@
                 else:
                     new_err = error.SyntaxError
                     msg = "invalid syntax"
+                    if e.expected_str is not None:
+                        msg += " (expected '%s')" % e.expected_str
+
                 raise new_err(msg, e.lineno, e.column, e.line,
                               compile_info.filename)
             else:
@@ -188,6 +195,4 @@
         finally:
             # Avoid hanging onto the tree.
             self.root = None
-        if enc is not None:
-            compile_info.encoding = enc
         return tree
diff --git a/pypy/interpreter/pyparser/test/targetparse.py 
b/pypy/interpreter/pyparser/test/targetparse.py
new file mode 100644
--- /dev/null
+++ b/pypy/interpreter/pyparser/test/targetparse.py
@@ -0,0 +1,39 @@
+import sys
+import os
+ROOT =  
os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+print ROOT
+sys.path.insert(0, str(ROOT))
+import time
+from pypy.interpreter.pyparser import pyparse
+
+
+
+with file("../../../rpython/rlib/unicodedata/unicodedb_5_2_0.py") as f:
+    s = f.read()
+
+class FakeSpace(object):
+    pass
+
+fakespace = FakeSpace()
+
+def bench(title):
+    a = time.clock()
+    info = pyparse.CompileInfo("<string>", "exec")
+    parser = pyparse.PythonParser(fakespace)
+    tree = parser._parse(s, info)
+    b = time.clock()
+    print title, (b-a)
+
+
+def entry_point(argv):
+    bench("foo")
+
+    return 0
+
+# _____ Define and setup target ___
+
+def target(*args):
+    return entry_point, None
+
+if __name__ == '__main__':
+    entry_point(sys.argv)
diff --git a/pypy/interpreter/pyparser/test/test_metaparser.py 
b/pypy/interpreter/pyparser/test/test_metaparser.py
--- a/pypy/interpreter/pyparser/test/test_metaparser.py
+++ b/pypy/interpreter/pyparser/test/test_metaparser.py
@@ -34,8 +34,8 @@
         assert len(g.dfas) == 1
         eval_sym = g.symbol_ids["eval"]
         assert g.start == eval_sym
-        states, first = g.dfas[eval_sym - 256]
-        assert states == [([(1, 1)], False), ([], True)]
+        dfa = g.dfas[eval_sym - 256]
+        assert dfa.states == [([(1, 1)], False), ([], True)]
         assert g.labels[0] == 0
 
     def test_load_python_grammars(self):
@@ -51,7 +51,7 @@
     def test_items(self):
         g = self.gram_for("foo: NAME STRING OP '+'")
         assert len(g.dfas) == 1
-        states = g.dfas[g.symbol_ids["foo"] - 256][0]
+        states = g.dfas[g.symbol_ids["foo"] - 256].states
         last = states[0][0][0][1]
         for state in states[1:-1]:
             assert last < state[0][0][1]
diff --git a/pypy/interpreter/pyparser/test/test_parser.py 
b/pypy/interpreter/pyparser/test/test_parser.py
--- a/pypy/interpreter/pyparser/test/test_parser.py
+++ b/pypy/interpreter/pyparser/test/test_parser.py
@@ -7,6 +7,12 @@
 from pypy.interpreter.pyparser.test.test_metaparser import MyGrammar
 
 
+def test_char_set():
+    first = {5: None, 9: None, 100: None, 255:None}
+    p = parser.DFA(None, None, first)
+    for i in range(256):
+        assert p.could_match_token(i) == (i in first)
+
 class SimpleParser(parser.Parser):
 
     def parse(self, input):
@@ -55,8 +61,7 @@
             n = parser.Terminal(tp, value, 0, 0)
         else:
             tp = gram.symbol_ids[data[0]]
-            children = []
-            n = parser.Nonterminal(tp, children)
+            n = parser.Nonterminal(tp)
         new_indent = count_indent(line)
         if new_indent >= last_indent:
             if new_indent == last_indent and node_stack:
@@ -291,3 +296,37 @@
             NEWLINE
             ENDMARKER"""
         assert tree_from_string(expected, gram) == p.parse("hi 42 end")
+
+
+    def test_optimized_terminal(self):
+        gram = """foo: bar baz 'end' NEWLINE ENDMARKER
+bar: NAME
+baz: NUMBER
+"""
+        p, gram = self.parser_for(gram, False)
+        expected = """
+        foo
+            bar
+                NAME "a_name"
+            baz
+                NUMBER "42"
+            NAME "end"
+            NEWLINE
+            ENDMARKER"""
+        input = "a_name 42 end"
+        tree = p.parse(input)
+        assert tree_from_string(expected, gram) == tree
+        assert isinstance(tree, parser.Nonterminal)
+        assert isinstance(tree.get_child(0), parser.Nonterminal1)
+        assert isinstance(tree.get_child(1), parser.Nonterminal1)
+
+
+    def test_error_string(self):
+        p, gram = self.parser_for(
+            "foo: 'if' NUMBER '+' NUMBER"
+        )
+        info = py.test.raises(parser.ParseError, p.parse, "if 42")
+        info.value.expected_str is None
+        info = py.test.raises(parser.ParseError, p.parse, "if 42 42")
+        info.value.expected_str == '+'
+
diff --git a/pypy/interpreter/pyparser/test/test_pyparse.py 
b/pypy/interpreter/pyparser/test/test_pyparse.py
--- a/pypy/interpreter/pyparser/test/test_pyparse.py
+++ b/pypy/interpreter/pyparser/test/test_pyparse.py
@@ -165,3 +165,11 @@
         for linefeed in ["\r\n","\r"]:
             tree = self.parse(fmt % linefeed)
             assert expected_tree == tree
+
+    def test_error_forgotten_chars(self):
+        info = py.test.raises(SyntaxError, self.parse, "if 1\n    print 4")
+        assert "(expected ':')" in info.value.msg
+        info = py.test.raises(SyntaxError, self.parse, "for i in range(10)\n   
 print i")
+        assert "(expected ':')" in info.value.msg
+        info = py.test.raises(SyntaxError, self.parse, "def f:\n print 1")
+        assert "(expected '(')" in info.value.msg
diff --git a/pypy/module/parser/pyparser.py b/pypy/module/parser/pyparser.py
--- a/pypy/module/parser/pyparser.py
+++ b/pypy/module/parser/pyparser.py
@@ -152,7 +152,7 @@
         # Raise an exception now and be done with it.
         raise parser_error(space, w_tuple,
                            "Illegal syntax-tree; cannot start with terminal 
symbol.")
-    node = pyparse.parser.Nonterminal(type, [])
+    node = pyparse.parser.Nonterminal(type)
     build_node_children(space, w_tuple, node, node_state)
     return node
 
@@ -171,7 +171,7 @@
             strn = space.text_w(w_obj)
             child = pyparse.parser.Terminal(type, strn, node_state.lineno, 0)
         else:
-            child = pyparse.parser.Nonterminal(type, [])
+            child = pyparse.parser.Nonterminal(type)
         node.append_child(child)
         if type >= 256:  # Nonterminal node
             build_node_children(space, w_elem, child, node_state)
@@ -187,8 +187,7 @@
         raise parse_error(space, "Unrecognized node type %d." % type)
     dfa = parser.grammar.dfas[type]
     # Run the DFA for this nonterminal
-    states, first = dfa
-    arcs, is_accepting = states[0]
+    arcs, is_accepting = dfa.states[0]
     for pos in range(tree.num_children()):
         ch = tree.get_child(pos)
         for i, next_state in arcs:
@@ -198,7 +197,7 @@
                 if ch.type >= 256:
                     validate_node(space, ch)
                 # Update the state, and move on to the next child.
-                arcs, is_accepting = states[next_state]
+                arcs, is_accepting = dfa.states[next_state]
                 break
         else:
             raise parse_error(space, "Illegal node")
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to