Author: Philip Jenvey <[email protected]>
Branch: py3k
Changeset: r62303:78c50cd0ed82
Date: 2013-03-11 18:37 -0700
http://bitbucket.org/pypy/pypy/changeset/78c50cd0ed82/
Log: pep3131: support non-ascii identifiers. thanks amaury for most of
the tokenizer work
diff --git a/pypy/interpreter/astcompiler/ast.py b/pypy/interpreter/astcompiler/ast.py
--- a/pypy/interpreter/astcompiler/ast.py
+++ b/pypy/interpreter/astcompiler/ast.py
@@ -6190,7 +6190,8 @@
if not w_self.initialization_state & 4:
typename = space.type(w_self).getname(space)
raise operationerrfmt(space.w_AttributeError, "'%s' object has no attribute '%s'", typename, 'id')
- return space.wrap(w_self.id)
+ id_ = w_self.id.decode('utf-8')
+ return space.wrap(id_)
def Name_set_id(space, w_self, w_new_value):
try:
diff --git a/pypy/interpreter/astcompiler/test/test_astbuilder.py b/pypy/interpreter/astcompiler/test/test_astbuilder.py
--- a/pypy/interpreter/astcompiler/test/test_astbuilder.py
+++ b/pypy/interpreter/astcompiler/test/test_astbuilder.py
@@ -1135,6 +1135,11 @@
assert isinstance(s, ast.Str)
assert space.eq_w(s.s, space.wrap(japan))
+ def test_pep3131(self):
+ assign = self.get_first_stmt("日本 = 32").targets[0]
+ assert isinstance(assign, ast.Name)
+ assert assign.id == u"日本".encode('utf-8')
+
def test_issue3574(self):
space = self.space
source = u'# coding: Latin-1\nu = "Ç"\n'
diff --git a/pypy/interpreter/pycode.py b/pypy/interpreter/pycode.py
--- a/pypy/interpreter/pycode.py
+++ b/pypy/interpreter/pycode.py
@@ -90,7 +90,9 @@
self.co_flags = flags
self.co_code = code
self.co_consts_w = consts
- self.co_names_w = [space.new_interned_str(aname) for aname in names]
+ self.co_names_w = [
+ space.new_interned_w_str(space.wrap(aname.decode('utf-8')))
+ for aname in names]
self.co_varnames = varnames
self.co_freevars = freevars
self.co_cellvars = cellvars
diff --git a/pypy/interpreter/pyopcode.py b/pypy/interpreter/pyopcode.py
--- a/pypy/interpreter/pyopcode.py
+++ b/pypy/interpreter/pyopcode.py
@@ -575,9 +575,9 @@
self.pushvalue(w_build_class)
def STORE_NAME(self, varindex, next_instr):
- varname = self.getname_u(varindex)
+ w_varname = self.getname_w(varindex)
w_newvalue = self.popvalue()
- self.space.setitem_str(self.w_locals, varname, w_newvalue)
+ self.space.setitem(self.w_locals, w_varname, w_newvalue)
def DELETE_NAME(self, varindex, next_instr):
w_varname = self.getname_w(varindex)
@@ -656,31 +656,33 @@
self.pushvalue(w_value)
return
# fall-back
- varname = self.space.str_w(w_varname)
- w_value = self._load_global(varname)
+ w_value = self._load_global(w_varname)
if w_value is None:
- message = "name '%s' is not defined"
- raise operationerrfmt(self.space.w_NameError, message, varname)
+ message = "name '%8' is not defined"
+ raise operationerrfmt(self.space.w_NameError, message,
+ self.space.identifier_w(w_varname))
self.pushvalue(w_value)
- def _load_global(self, varname):
- w_value = self.space.finditem_str(self.w_globals, varname)
+ def _load_global(self, w_varname):
+ w_value = self.space.finditem(self.w_globals, w_varname)
if w_value is None:
# not in the globals, now look in the built-ins
- w_value = self.get_builtin().getdictvalue(self.space, varname)
+ w_value = self.get_builtin().getdictvalue(
+ self.space, self.space.identifier_w(w_varname))
return w_value
_load_global._always_inline_ = True
- def _load_global_failed(self, varname):
- message = "global name '%s' is not defined"
- raise operationerrfmt(self.space.w_NameError, message, varname)
+ def _load_global_failed(self, w_varname):
+ message = "global name '%8' is not defined"
+ raise operationerrfmt(self.space.w_NameError, message,
+ self.space.identifier_w(w_varname))
_load_global_failed._dont_inline_ = True
def LOAD_GLOBAL(self, nameindex, next_instr):
- varname = self.getname_u(nameindex)
- w_value = self._load_global(varname)
+ w_varname = self.getname_w(nameindex)
+ w_value = self._load_global(w_varname)
if w_value is None:
- self._load_global_failed(varname)
+ self._load_global_failed(w_varname)
self.pushvalue(w_value)
LOAD_GLOBAL._always_inline_ = True
diff --git a/pypy/interpreter/pyparser/automata.py b/pypy/interpreter/pyparser/automata.py
--- a/pypy/interpreter/pyparser/automata.py
+++ b/pypy/interpreter/pyparser/automata.py
@@ -36,6 +36,8 @@
i = pos
for i in range(pos, len(inVec)):
item = inVec[i]
+ if ord(item) > 0x80:
+ item = "\x80" # NON_ASCII
# arcMap, accept = self.states[crntState]
arcMap = self.states[crntState]
accept = self.accepts[crntState]
diff --git a/pypy/interpreter/pyparser/dfa_generated.py b/pypy/interpreter/pyparser/dfa_generated.py
--- a/pypy/interpreter/pyparser/dfa_generated.py
+++ b/pypy/interpreter/pyparser/dfa_generated.py
@@ -37,7 +37,7 @@
'q': 1, 'r': 3, 's': 1, 't': 1,
'u': 1, 'v': 1, 'w': 1, 'x': 1,
'y': 1, 'z': 1, '{': 14, '|': 13,
- '}': 14, '~': 14},
+ '}': 14, '~': 14, '\x80': 1},
# 1
{'0': 1, '1': 1, '2': 1, '3': 1,
'4': 1, '5': 1, '6': 1, '7': 1,
@@ -54,7 +54,7 @@
'l': 1, 'm': 1, 'n': 1, 'o': 1,
'p': 1, 'q': 1, 'r': 1, 's': 1,
't': 1, 'u': 1, 'v': 1, 'w': 1,
- 'x': 1, 'y': 1, 'z': 1},
+ 'x': 1, 'y': 1, 'z': 1, '\x80': 1},
# 2
{'"': 17, "'": 16, '0': 1, '1': 1,
'2': 1, '3': 1, '4': 1, '5': 1,
@@ -72,7 +72,7 @@
'n': 1, 'o': 1, 'p': 1, 'q': 1,
'r': 3, 's': 1, 't': 1, 'u': 1,
'v': 1, 'w': 1, 'x': 1, 'y': 1,
- 'z': 1},
+ 'z': 1, '\x80': 1},
# 3
{'"': 17, "'": 16, '0': 1, '1': 1,
'2': 1, '3': 1, '4': 1, '5': 1,
@@ -90,7 +90,7 @@
'n': 1, 'o': 1, 'p': 1, 'q': 1,
'r': 1, 's': 1, 't': 1, 'u': 1,
'v': 1, 'w': 1, 'x': 1, 'y': 1,
- 'z': 1},
+ 'z': 1, '\x80': 1},
# 4
{'.': 25, '0': 23, '1': 24, '2': 24,
'3': 24, '4': 24, '5': 24, '6': 24,
diff --git a/pypy/interpreter/pyparser/gendfa.py b/pypy/interpreter/pyparser/gendfa.py
--- a/pypy/interpreter/pyparser/gendfa.py
+++ b/pypy/interpreter/pyparser/gendfa.py
@@ -17,6 +17,8 @@
from pypy.interpreter.pyparser.pylexer import *
from pypy.interpreter.pyparser.automata import NonGreedyDFA, DFA, DEFAULT
+NON_ASCII = "\x80"
+
def makePyPseudoDFA ():
import string
states = []
@@ -50,9 +52,10 @@
# ____________________________________________________________
# Names
name = chain(states,
- groupStr(states, string.letters + "_"),
+ groupStr(states, string.letters + "_" + NON_ASCII),
any(states, groupStr(states,
- string.letters + string.digits + "_")))
+ string.letters + string.digits + "_" +
+ NON_ASCII)))
# ____________________________________________________________
# Digits
def makeDigits ():
diff --git a/pypy/interpreter/pyparser/pytokenizer.py b/pypy/interpreter/pyparser/pytokenizer.py
--- a/pypy/interpreter/pyparser/pytokenizer.py
+++ b/pypy/interpreter/pyparser/pytokenizer.py
@@ -44,6 +44,20 @@
return None
+def verify_identifier(token):
+ for c in token:
+ if ord(c) > 0x80:
+ break
+ else:
+ return True
+ try:
+ u = token.decode('utf-8')
+ except UnicodeDecodeError:
+ return False
+ from pypy.objspace.std.unicodeobject import _isidentifier
+ return _isidentifier(u)
+
+
def generate_tokens(lines, flags):
"""
This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since
@@ -231,7 +245,11 @@
tok = (tokens.STRING, token, lnum, start, line)
token_list.append(tok)
last_comment = ''
- elif initial in namechars: # ordinary name
+ elif (initial in namechars or # ordinary name
+ ord(initial) >= 0x80): # unicode identifier
+ if not verify_identifier(token):
+ raise TokenError("invalid character in identifier",
+ line, lnum, start + 1, token_list)
token_list.append((tokens.NAME, token, lnum, start, line))
last_comment = ''
elif initial == '\\': # continued stmt
diff --git a/pypy/interpreter/pyparser/test/test_pyparse.py b/pypy/interpreter/pyparser/test/test_pyparse.py
--- a/pypy/interpreter/pyparser/test/test_pyparse.py
+++ b/pypy/interpreter/pyparser/test/test_pyparse.py
@@ -36,6 +36,10 @@
tree = self.parse("""foo = '日本'""", info=info)
assert info.encoding == 'utf-8'
+ def test_unicode_identifier(self):
+ tree = self.parse("a日本 = 32")
+ tree = self.parse("日本 = 32")
+
def test_syntax_error(self):
parse = self.parse
exc = py.test.raises(SyntaxError, parse, "name another for").value
diff --git a/pypy/interpreter/test/test_compiler.py b/pypy/interpreter/test/test_compiler.py
--- a/pypy/interpreter/test/test_compiler.py
+++ b/pypy/interpreter/test/test_compiler.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
import __future__
import py, sys
from pypy.interpreter.pycompiler import PythonAstCompiler
@@ -798,6 +799,29 @@
s = '\udcff'
raises(UnicodeEncodeError, compile, s, 'foo', 'exec')
+ def test_unicode_identifier(self):
+ c = compile("# coding=latin-1\n\u00c6 = '\u00c6'", "dummy", "exec")
+ d = {}
+ exec(c, d)
+ assert d['\xc6'] == '\xc6'
+ c = compile("日本 = 8; 日本2 = 日本 + 1; del 日本;", "dummy", "exec")
+ exec(c, d)
+ assert '日本2' in d
+ assert d['日本2'] == 9
+ assert '日本' not in d
+
+ raises(SyntaxError, eval, b'\xff\x20')
+ raises(SyntaxError, eval, b'\xef\xbb\x20')
+
+ def test_cpython_issue2301(self):
+ skip('XXX')
+ try:
+ compile(b"# coding: utf7\nprint '+XnQ-'", "dummy", "exec")
+ except SyntaxError as v:
+ assert v.text == "print '\u5e74'\n"
+ else:
+ assert False, "Expected SyntaxError"
+
def test_ast_equality(self):
import _ast
sample_code = [
diff --git a/pypy/module/_ast/test/test_ast.py b/pypy/module/_ast/test/test_ast.py
--- a/pypy/module/_ast/test/test_ast.py
+++ b/pypy/module/_ast/test/test_ast.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
import py
@@ -85,6 +86,10 @@
name.id = "hi"
assert name.id == "hi"
+ def test_unicode_identifier(self):
+ name = self.get_ast("日本", "eval").body
+ assert name.id == "日本"
+
@py.test.mark.skipif("py.test.config.option.runappdirect")
def test_object(self):
ast = self.ast
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -302,9 +302,11 @@
return space.newbool(cased)
def unicode_isidentifier__Unicode(space, w_unicode):
- v = w_unicode._value
- if len(v) == 0:
- return space.w_False
+ return space.newbool(_isidentifier(w_unicode._value))
+
+def _isidentifier(u):
+ if not u:
+ return False
# PEP 3131 says that the first character must be in XID_Start and
# subsequent characters in XID_Continue, and for the ASCII range,
@@ -313,14 +315,14 @@
# current definition of XID_Start and XID_Continue, it is
# sufficient to check just for these, except that _ must be
# allowed as starting an identifier.
- first = v[0]
+ first = u[0]
if not (unicodedb.isxidstart(ord(first)) or first == u'_'):
- return space.w_False
+ return False
- for i in range(1, len(v)):
- if not unicodedb.isxidcontinue(ord(v[i])):
- return space.w_False
- return space.w_True
+ for i in range(1, len(u)):
+ if not unicodedb.isxidcontinue(ord(u[i])):
+ return False
+ return True
def unicode_isprintable__Unicode(space, w_unicode):
for uchar in w_unicode._value:
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit