Author: Armin Rigo <[email protected]>
Branch:
Changeset: r68838:574aa48e4875
Date: 2014-01-22 10:33 +0100
http://bitbucket.org/pypy/pypy/changeset/574aa48e4875/
Log: Test and fix: the parser would use UTF-16 and produce surrogates
from the source code, even if we're a UTF-32 version of pypy.
diff --git a/pypy/interpreter/astcompiler/test/test_compiler.py
b/pypy/interpreter/astcompiler/test/test_compiler.py
--- a/pypy/interpreter/astcompiler/test/test_compiler.py
+++ b/pypy/interpreter/astcompiler/test/test_compiler.py
@@ -1,4 +1,4 @@
-import py
+import py, sys
from pypy.interpreter.astcompiler import codegen, astbuilder, symtable,
optimize
from pypy.interpreter.pyparser import pyparse
from pypy.interpreter.pyparser.test import expressions
@@ -867,6 +867,9 @@
class AppTestCompiler:
+ def setup_class(cls):
+ cls.w_maxunicode = cls.space.wrap(sys.maxunicode)
+
def test_docstring_not_loaded(self):
import StringIO, dis, sys
ns = {}
@@ -911,7 +914,17 @@
l = [a for a in Foo()]
assert hint_called[0]
assert l == list(range(5))
-
+
+ def test_unicode_in_source(self):
+ import sys
+ d = {}
+ exec '# -*- coding: utf-8 -*-\n\nu = u"\xf0\x9f\x92\x8b"' in d
+ if sys.maxunicode > 65535 and self.maxunicode > 65535:
+ expected_length = 1
+ else:
+ expected_length = 2
+ assert len(d['u']) == expected_length
+
class TestOptimizations:
def count_instructions(self, source):
diff --git a/pypy/interpreter/pyparser/parsestring.py
b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -15,7 +15,6 @@
Yes, it's very inefficient.
Yes, CPython has very similar code.
"""
-
# we use ps as "pointer to s"
# q is the virtual last char index of the string
ps = 0
@@ -54,42 +53,10 @@
if unicode_literal: # XXX Py_UnicodeFlag is ignored for now
if encoding is None or encoding == "iso-8859-1":
# 'unicode_escape' expects latin-1 bytes, string is ready.
- buf = s
- bufp = ps
- bufq = q
- u = None
+ assert 0 <= ps <= q
+ substr = s[ps:q]
else:
- # String is utf8-encoded, but 'unicode_escape' expects
- # latin-1; So multibyte sequences must be escaped.
- lis = [] # using a list to assemble the value
- end = q
- # Worst case: "\XX" may become "\u005c\uHHLL" (12 bytes)
- while ps < end:
- if s[ps] == '\\':
- lis.append(s[ps])
- ps += 1
- if ord(s[ps]) & 0x80:
- # A multibyte sequence will follow, it will be
- # escaped like \u1234. To avoid confusion with
- # the backslash we just wrote, we emit "\u005c"
- # instead.
- lis.append("u005c")
- if ord(s[ps]) & 0x80: # XXX inefficient
- w, ps = decode_utf8(space, s, ps, end, "utf-16-be")
- rn = len(w)
- assert rn % 2 == 0
- for i in range(0, rn, 2):
- lis.append('\\u')
- lis.append(hexbyte(ord(w[i])))
- lis.append(hexbyte(ord(w[i+1])))
- else:
- lis.append(s[ps])
- ps += 1
- buf = ''.join(lis)
- bufp = 0
- bufq = len(buf)
- assert 0 <= bufp <= bufq
- substr = buf[bufp:bufq]
+ substr = decode_unicode_utf8(space, s, ps, q)
if rawmode:
v = unicodehelper.decode_raw_unicode_escape(space, substr)
else:
@@ -121,6 +88,39 @@
result = "0" + result
return result
+def decode_unicode_utf8(space, s, ps, q):
+ # ****The Python 2.7 version, producing UTF-32 escapes****
+ # String is utf8-encoded, but 'unicode_escape' expects
+ # latin-1; So multibyte sequences must be escaped.
+ lis = [] # using a list to assemble the value
+ end = q
+ # Worst case:
+ # "<92><195><164>" may become "\u005c\U000000E4" (16 bytes)
+ while ps < end:
+ if s[ps] == '\\':
+ lis.append(s[ps])
+ ps += 1
+ if ord(s[ps]) & 0x80:
+ # A multibyte sequence will follow; it will be
+ # escaped like \U0001f48b. To avoid confusion with
+ # the backslash we just wrote, we emit "\u005c"
+ # instead.
+ lis.append("u005c")
+ if ord(s[ps]) & 0x80: # XXX inefficient
+ w, ps = decode_utf8(space, s, ps, end, "utf-32-be")
+ rn = len(w)
+ assert rn % 4 == 0
+ for i in range(0, rn, 4):
+ lis.append('\\U')
+ lis.append(hexbyte(ord(w[i])))
+ lis.append(hexbyte(ord(w[i+1])))
+ lis.append(hexbyte(ord(w[i+2])))
+ lis.append(hexbyte(ord(w[i+3])))
+ else:
+ lis.append(s[ps])
+ ps += 1
+ return ''.join(lis)
+
def PyString_DecodeEscape(space, s, recode_encoding):
"""
Unescape a backslash-escaped string. If recode_encoding is non-zero,
diff --git a/pypy/interpreter/pyparser/test/test_parsestring.py
b/pypy/interpreter/pyparser/test/test_parsestring.py
--- a/pypy/interpreter/pyparser/test/test_parsestring.py
+++ b/pypy/interpreter/pyparser/test/test_parsestring.py
@@ -1,5 +1,5 @@
from pypy.interpreter.pyparser import parsestring
-import py
+import py, sys
class TestParsetring:
def parse_and_compare(self, literal, value):
@@ -91,3 +91,18 @@
input = ["'", 'x', ' ', chr(0xc3), chr(0xa9), ' ', chr(92), 'n', "'"]
w_ret = parsestring.parsestr(space, 'utf8', ''.join(input))
assert space.str_w(w_ret) == ''.join(expected)
+
+ def test_wide_unicode_in_source(self):
+ if sys.maxunicode == 65535:
+ py.test.skip("requires a wide-unicode host")
+ self.parse_and_compare('u"\xf0\x9f\x92\x8b"',
+ unichr(0x1f48b),
+ encoding='utf-8')
+
+ def test_decode_unicode_utf8(self):
+ buf = parsestring.decode_unicode_utf8(self.space,
+ 'u"\xf0\x9f\x92\x8b"', 2, 6)
+ if sys.maxunicode == 65535:
+ assert buf == r"\U0000d83d\U0000dc8b"
+ else:
+ assert buf == r"\U0001f48b"
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit