[pypy-commit] pypy py3k: pep3131: support non-ascii identifiers. thanks amaury for most of the tokenizer

pjenvey Mon, 11 Mar 2013 18:43:21 -0700

Author: Philip Jenvey <[email protected]>
Branch: py3k
Changeset: r62303:78c50cd0ed82
Date: 2013-03-11 18:37 -0700
http://bitbucket.org/pypy/pypy/changeset/78c50cd0ed82/


Log:    pep3131: support non-ascii identifiers. thanks amaury for most of
        the tokenizer work

diff --git a/pypy/interpreter/astcompiler/ast.py b/pypy/interpreter/astcompiler/ast.py
--- a/pypy/interpreter/astcompiler/ast.py
+++ b/pypy/interpreter/astcompiler/ast.py
@@ -6190,7 +6190,8 @@
     if not w_self.initialization_state & 4:
         typename = space.type(w_self).getname(space)
         raise operationerrfmt(space.w_AttributeError, "'%s' object has no attribute '%s'", typename, 'id')
-    return space.wrap(w_self.id)
+    id_ = w_self.id.decode('utf-8')
+    return space.wrap(id_)
 
 def Name_set_id(space, w_self, w_new_value):
     try:
diff --git a/pypy/interpreter/astcompiler/test/test_astbuilder.py b/pypy/interpreter/astcompiler/test/test_astbuilder.py
--- a/pypy/interpreter/astcompiler/test/test_astbuilder.py
+++ b/pypy/interpreter/astcompiler/test/test_astbuilder.py
@@ -1135,6 +1135,11 @@
         assert isinstance(s, ast.Str)
         assert space.eq_w(s.s, space.wrap(japan))
 
+    def test_pep3131(self):
+        assign = self.get_first_stmt("日本 = 32").targets[0]
+        assert isinstance(assign, ast.Name)
+        assert assign.id == u"日本".encode('utf-8')
+
     def test_issue3574(self):
         space = self.space
         source = u'# coding: Latin-1\nu = "Ç"\n'
diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -90,7 +90,9 @@
         self.co_flags = flags
         self.co_code = code
         self.co_consts_w = consts
-        self.co_names_w = [space.new_interned_str(aname) for aname in names]
+        self.co_names_w = [
+            space.new_interned_w_str(space.wrap(aname.decode('utf-8')))
+            for aname in names]
         self.co_varnames = varnames
         self.co_freevars = freevars
         self.co_cellvars = cellvars
diff --git a/pypy/interpreter/pyopcode.py b/pypy/interpreter/pyopcode.py
--- a/pypy/interpreter/pyopcode.py
+++ b/pypy/interpreter/pyopcode.py
@@ -575,9 +575,9 @@
         self.pushvalue(w_build_class)
 
     def STORE_NAME(self, varindex, next_instr):
-        varname = self.getname_u(varindex)
+        w_varname = self.getname_w(varindex)
         w_newvalue = self.popvalue()
-        self.space.setitem_str(self.w_locals, varname, w_newvalue)
+        self.space.setitem(self.w_locals, w_varname, w_newvalue)
 
     def DELETE_NAME(self, varindex, next_instr):
         w_varname = self.getname_w(varindex)
@@ -656,31 +656,33 @@
                 self.pushvalue(w_value)
                 return
         # fall-back
-        varname = self.space.str_w(w_varname)
-        w_value = self._load_global(varname)
+        w_value = self._load_global(w_varname)
         if w_value is None:
-            message = "name '%s' is not defined"
-            raise operationerrfmt(self.space.w_NameError, message, varname)
+            message = "name '%8' is not defined"
+            raise operationerrfmt(self.space.w_NameError, message,
+                                  self.space.identifier_w(w_varname))
         self.pushvalue(w_value)
 
-    def _load_global(self, varname):
-        w_value = self.space.finditem_str(self.w_globals, varname)
+    def _load_global(self, w_varname):
+        w_value = self.space.finditem(self.w_globals, w_varname)
         if w_value is None:
             # not in the globals, now look in the built-ins
-            w_value = self.get_builtin().getdictvalue(self.space, varname)
+            w_value = self.get_builtin().getdictvalue(
+                self.space, self.space.identifier_w(w_varname))
         return w_value
     _load_global._always_inline_ = True
 
-    def _load_global_failed(self, varname):
-        message = "global name '%s' is not defined"
-        raise operationerrfmt(self.space.w_NameError, message, varname)
+    def _load_global_failed(self, w_varname):
+        message = "global name '%8' is not defined"
+        raise operationerrfmt(self.space.w_NameError, message,
+                              self.space.identifier_w(w_varname))
     _load_global_failed._dont_inline_ = True
 
     def LOAD_GLOBAL(self, nameindex, next_instr):
-        varname = self.getname_u(nameindex)
-        w_value = self._load_global(varname)
+        w_varname = self.getname_w(nameindex)
+        w_value = self._load_global(w_varname)
         if w_value is None:
-            self._load_global_failed(varname)
+            self._load_global_failed(w_varname)
         self.pushvalue(w_value)
     LOAD_GLOBAL._always_inline_ = True
 
diff --git a/pypy/interpreter/pyparser/automata.py b/pypy/interpreter/pyparser/automata.py
--- a/pypy/interpreter/pyparser/automata.py
+++ b/pypy/interpreter/pyparser/automata.py
@@ -36,6 +36,8 @@
         i = pos
         for i in range(pos, len(inVec)):
             item = inVec[i]
+            if ord(item) > 0x80:
+                item = "\x80"  # NON_ASCII
             # arcMap, accept = self.states[crntState]
             arcMap = self.states[crntState]
             accept = self.accepts[crntState]
diff --git a/pypy/interpreter/pyparser/dfa_generated.py b/pypy/interpreter/pyparser/dfa_generated.py
--- a/pypy/interpreter/pyparser/dfa_generated.py
+++ b/pypy/interpreter/pyparser/dfa_generated.py
@@ -37,7 +37,7 @@
      'q': 1, 'r': 3, 's': 1, 't': 1,
      'u': 1, 'v': 1, 'w': 1, 'x': 1,
      'y': 1, 'z': 1, '{': 14, '|': 13,
-     '}': 14, '~': 14},
+     '}': 14, '~': 14, '\x80': 1},
     # 1
     {'0': 1, '1': 1, '2': 1, '3': 1,
      '4': 1, '5': 1, '6': 1, '7': 1,
@@ -54,7 +54,7 @@
      'l': 1, 'm': 1, 'n': 1, 'o': 1,
      'p': 1, 'q': 1, 'r': 1, 's': 1,
      't': 1, 'u': 1, 'v': 1, 'w': 1,
-     'x': 1, 'y': 1, 'z': 1},
+     'x': 1, 'y': 1, 'z': 1, '\x80': 1},
     # 2
     {'"': 17, "'": 16, '0': 1, '1': 1,
      '2': 1, '3': 1, '4': 1, '5': 1,
@@ -72,7 +72,7 @@
      'n': 1, 'o': 1, 'p': 1, 'q': 1,
      'r': 3, 's': 1, 't': 1, 'u': 1,
      'v': 1, 'w': 1, 'x': 1, 'y': 1,
-     'z': 1},
+     'z': 1, '\x80': 1},
     # 3
     {'"': 17, "'": 16, '0': 1, '1': 1,
      '2': 1, '3': 1, '4': 1, '5': 1,
@@ -90,7 +90,7 @@
      'n': 1, 'o': 1, 'p': 1, 'q': 1,
      'r': 1, 's': 1, 't': 1, 'u': 1,
      'v': 1, 'w': 1, 'x': 1, 'y': 1,
-     'z': 1},
+     'z': 1, '\x80': 1},
     # 4
     {'.': 25, '0': 23, '1': 24, '2': 24,
      '3': 24, '4': 24, '5': 24, '6': 24,
diff --git a/pypy/interpreter/pyparser/gendfa.py b/pypy/interpreter/pyparser/gendfa.py
--- a/pypy/interpreter/pyparser/gendfa.py
+++ b/pypy/interpreter/pyparser/gendfa.py
@@ -17,6 +17,8 @@
 from pypy.interpreter.pyparser.pylexer import *
 from pypy.interpreter.pyparser.automata import NonGreedyDFA, DFA, DEFAULT
 
+NON_ASCII = "\x80"
+
 def makePyPseudoDFA ():
     import string
     states = []
@@ -50,9 +52,10 @@
     # ____________________________________________________________
     # Names
     name = chain(states,
-                 groupStr(states, string.letters + "_"),
+                 groupStr(states, string.letters + "_" + NON_ASCII),
                  any(states, groupStr(states,
-                                      string.letters + string.digits + "_")))
+                                      string.letters + string.digits + "_" +
+                                      NON_ASCII)))
     # ____________________________________________________________
     # Digits
     def makeDigits ():
diff --git a/pypy/interpreter/pyparser/pytokenizer.py b/pypy/interpreter/pyparser/pytokenizer.py
--- a/pypy/interpreter/pyparser/pytokenizer.py
+++ b/pypy/interpreter/pyparser/pytokenizer.py
@@ -44,6 +44,20 @@
     return None
 
 
+def verify_identifier(token):
+    for c in token:
+        if ord(c) > 0x80:
+            break
+    else:
+        return True
+    try:
+        u = token.decode('utf-8')
+    except UnicodeDecodeError:
+        return False
+    from pypy.objspace.std.unicodeobject import _isidentifier
+    return _isidentifier(u)
+
+
 def generate_tokens(lines, flags):
     """
     This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since
@@ -231,7 +245,11 @@
                         tok = (tokens.STRING, token, lnum, start, line)
                         token_list.append(tok)
                         last_comment = ''
-                elif initial in namechars:                 # ordinary name
+                elif (initial in namechars or              # ordinary name
+                      ord(initial) >= 0x80):               # unicode identifier
+                    if not verify_identifier(token):
+                        raise TokenError("invalid character in identifier",
+                                         line, lnum, start + 1, token_list)
                     token_list.append((tokens.NAME, token, lnum, start, line))
                     last_comment = ''
                 elif initial == '\\':                      # continued stmt
diff --git a/pypy/interpreter/pyparser/test/test_pyparse.py b/pypy/interpreter/pyparser/test/test_pyparse.py
--- a/pypy/interpreter/pyparser/test/test_pyparse.py
+++ b/pypy/interpreter/pyparser/test/test_pyparse.py
@@ -36,6 +36,10 @@
         tree = self.parse("""foo = '日本'""", info=info)
         assert info.encoding == 'utf-8'
 
+    def test_unicode_identifier(self):
+        tree = self.parse("a日本 = 32")
+        tree = self.parse("日本 = 32")
+
     def test_syntax_error(self):
         parse = self.parse
         exc = py.test.raises(SyntaxError, parse, "name another for").value
diff --git a/pypy/interpreter/test/test_compiler.py b/pypy/interpreter/test/test_compiler.py
--- a/pypy/interpreter/test/test_compiler.py
+++ b/pypy/interpreter/test/test_compiler.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
 import __future__
 import py, sys
 from pypy.interpreter.pycompiler import PythonAstCompiler
@@ -798,6 +799,29 @@
         s = '\udcff'
         raises(UnicodeEncodeError, compile, s, 'foo', 'exec')
 
+    def test_unicode_identifier(self):
+        c = compile("# coding=latin-1\n\u00c6 = '\u00c6'", "dummy", "exec")
+        d = {}
+        exec(c, d)
+        assert d['\xc6'] == '\xc6'
+        c = compile("日本 = 8; 日本2 = 日本 + 1; del 日本;", "dummy", "exec")
+        exec(c, d)
+        assert '日本2' in d
+        assert d['日本2'] == 9
+        assert '日本' not in d
+
+        raises(SyntaxError, eval, b'\xff\x20')
+        raises(SyntaxError, eval, b'\xef\xbb\x20')
+
+    def test_cpython_issue2301(self):
+        skip('XXX')
+        try:
+            compile(b"# coding: utf7\nprint '+XnQ-'", "dummy", "exec")
+        except SyntaxError as v:
+            assert v.text ==  "print '\u5e74'\n"
+        else:
+            assert False, "Expected SyntaxError"
+
     def test_ast_equality(self):
         import _ast
         sample_code = [
diff --git a/pypy/module/_ast/test/test_ast.py b/pypy/module/_ast/test/test_ast.py
--- a/pypy/module/_ast/test/test_ast.py
+++ b/pypy/module/_ast/test/test_ast.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
 import py
 
 
@@ -85,6 +86,10 @@
         name.id = "hi"
         assert name.id == "hi"
 
+    def test_unicode_identifier(self):
+        name = self.get_ast("日本", "eval").body
+        assert name.id == "日本"
+
     @py.test.mark.skipif("py.test.config.option.runappdirect")
     def test_object(self):
         ast = self.ast
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -302,9 +302,11 @@
     return space.newbool(cased)
 
 def unicode_isidentifier__Unicode(space, w_unicode):
-    v = w_unicode._value
-    if len(v) == 0:
-        return space.w_False
+    return space.newbool(_isidentifier(w_unicode._value))
+
+def _isidentifier(u):
+    if not u:
+        return False
 
     # PEP 3131 says that the first character must be in XID_Start and
     # subsequent characters in XID_Continue, and for the ASCII range,
@@ -313,14 +315,14 @@
     # current definition of XID_Start and XID_Continue, it is
     # sufficient to check just for these, except that _ must be
     # allowed as starting an identifier.
-    first = v[0]
+    first = u[0]
     if not (unicodedb.isxidstart(ord(first)) or first == u'_'):
-        return space.w_False
+        return False
 
-    for i in range(1, len(v)):
-        if not unicodedb.isxidcontinue(ord(v[i])):
-            return space.w_False
-    return space.w_True
+    for i in range(1, len(u)):
+        if not unicodedb.isxidcontinue(ord(u[i])):
+            return False
+    return True
 
 def unicode_isprintable__Unicode(space, w_unicode):
     for uchar in w_unicode._value:
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy py3k: pep3131: support non-ascii identifiers. thanks amaury for most of the tokenizer

Reply via email to