[pypy-commit] pypy default: Test and fix: the parser would use UTF-16 and produce surrogates

arigo Wed, 22 Jan 2014 01:37:54 -0800

Author: Armin Rigo <[email protected]>
Branch: 
Changeset: r68838:574aa48e4875
Date: 2014-01-22 10:33 +0100
http://bitbucket.org/pypy/pypy/changeset/574aa48e4875/


Log:    Test and fix: the parser would use UTF-16 and produce surrogates
        from the source code, even if we're a UTF-32 version of pypy.

diff --git a/pypy/interpreter/astcompiler/test/test_compiler.py b/pypy/interpreter/astcompiler/test/test_compiler.py
--- a/pypy/interpreter/astcompiler/test/test_compiler.py
+++ b/pypy/interpreter/astcompiler/test/test_compiler.py
@@ -1,4 +1,4 @@
-import py
+import py, sys
 from pypy.interpreter.astcompiler import codegen, astbuilder, symtable, optimize
 from pypy.interpreter.pyparser import pyparse
 from pypy.interpreter.pyparser.test import expressions
@@ -867,6 +867,9 @@
 
 class AppTestCompiler:
 
+    def setup_class(cls):
+        cls.w_maxunicode = cls.space.wrap(sys.maxunicode)
+
     def test_docstring_not_loaded(self):
         import StringIO, dis, sys
         ns = {}
@@ -911,7 +914,17 @@
         l = [a for a in Foo()]
         assert hint_called[0]
         assert l == list(range(5))
-        
+
+    def test_unicode_in_source(self):
+        import sys
+        d = {}
+        exec '# -*- coding: utf-8 -*-\n\nu = u"\xf0\x9f\x92\x8b"' in d
+        if sys.maxunicode > 65535 and self.maxunicode > 65535:
+            expected_length = 1
+        else:
+            expected_length = 2
+        assert len(d['u']) == expected_length
+
 
 class TestOptimizations:
     def count_instructions(self, source):
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -15,7 +15,6 @@
     Yes, it's very inefficient.
     Yes, CPython has very similar code.
     """
-
     # we use ps as "pointer to s"
     # q is the virtual last char index of the string
     ps = 0
@@ -54,42 +53,10 @@
     if unicode_literal: # XXX Py_UnicodeFlag is ignored for now
         if encoding is None or encoding == "iso-8859-1":
             # 'unicode_escape' expects latin-1 bytes, string is ready.
-            buf = s
-            bufp = ps
-            bufq = q
-            u = None
+            assert 0 <= ps <= q
+            substr = s[ps:q]
         else:
-            # String is utf8-encoded, but 'unicode_escape' expects
-            # latin-1; So multibyte sequences must be escaped.
-            lis = [] # using a list to assemble the value
-            end = q
-            # Worst case: "\XX" may become "\u005c\uHHLL" (12 bytes)
-            while ps < end:
-                if s[ps] == '\\':
-                    lis.append(s[ps])
-                    ps += 1
-                    if ord(s[ps]) & 0x80:
-                        # A multibyte sequence will follow, it will be
-                        # escaped like \u1234. To avoid confusion with
-                        # the backslash we just wrote, we emit "\u005c"
-                        # instead.
-                        lis.append("u005c")
-                if ord(s[ps]) & 0x80: # XXX inefficient
-                    w, ps = decode_utf8(space, s, ps, end, "utf-16-be")
-                    rn = len(w)
-                    assert rn % 2 == 0
-                    for i in range(0, rn, 2):
-                        lis.append('\\u')
-                        lis.append(hexbyte(ord(w[i])))
-                        lis.append(hexbyte(ord(w[i+1])))
-                else:
-                    lis.append(s[ps])
-                    ps += 1
-            buf = ''.join(lis)
-            bufp = 0
-            bufq = len(buf)
-        assert 0 <= bufp <= bufq
-        substr = buf[bufp:bufq]
+            substr = decode_unicode_utf8(space, s, ps, q)
         if rawmode:
             v = unicodehelper.decode_raw_unicode_escape(space, substr)
         else:
@@ -121,6 +88,39 @@
         result = "0" + result
     return result
 
+def decode_unicode_utf8(space, s, ps, q):
+    # ****The Python 2.7 version, producing UTF-32 escapes****
+    # String is utf8-encoded, but 'unicode_escape' expects
+    # latin-1; So multibyte sequences must be escaped.
+    lis = [] # using a list to assemble the value
+    end = q
+    # Worst case:
+    # "<92><195><164>" may become "\u005c\U000000E4" (16 bytes)
+    while ps < end:
+        if s[ps] == '\\':
+            lis.append(s[ps])
+            ps += 1
+            if ord(s[ps]) & 0x80:
+                # A multibyte sequence will follow, it will be
+                # escaped like \u1234. To avoid confusion with
+                # the backslash we just wrote, we emit "\u005c"
+                # instead.
+                lis.append("u005c")
+        if ord(s[ps]) & 0x80: # XXX inefficient
+            w, ps = decode_utf8(space, s, ps, end, "utf-32-be")
+            rn = len(w)
+            assert rn % 4 == 0
+            for i in range(0, rn, 4):
+                lis.append('\\U')
+                lis.append(hexbyte(ord(w[i])))
+                lis.append(hexbyte(ord(w[i+1])))
+                lis.append(hexbyte(ord(w[i+2])))
+                lis.append(hexbyte(ord(w[i+3])))
+        else:
+            lis.append(s[ps])
+            ps += 1
+    return ''.join(lis)
+
 def PyString_DecodeEscape(space, s, recode_encoding):
     """
     Unescape a backslash-escaped string. If recode_encoding is non-zero,
diff --git a/pypy/interpreter/pyparser/test/test_parsestring.py b/pypy/interpreter/pyparser/test/test_parsestring.py
--- a/pypy/interpreter/pyparser/test/test_parsestring.py
+++ b/pypy/interpreter/pyparser/test/test_parsestring.py
@@ -1,5 +1,5 @@
 from pypy.interpreter.pyparser import parsestring
-import py
+import py, sys
 
 class TestParsetring:
     def parse_and_compare(self, literal, value):
@@ -91,3 +91,18 @@
         input = ["'", 'x', ' ', chr(0xc3), chr(0xa9), ' ', chr(92), 'n', "'"]
         w_ret = parsestring.parsestr(space, 'utf8', ''.join(input))
         assert space.str_w(w_ret) == ''.join(expected)
+
+    def test_wide_unicode_in_source(self):
+        if sys.maxunicode == 65535:
+            py.test.skip("requires a wide-unicode host")
+        self.parse_and_compare('u"\xf0\x9f\x92\x8b"',
+                               unichr(0x1f48b),
+                               encoding='utf-8')
+
+    def test_decode_unicode_utf8(self):
+        buf = parsestring.decode_unicode_utf8(self.space,
+                                              'u"\xf0\x9f\x92\x8b"', 2, 6)
+        if sys.maxunicode == 65535:
+            assert buf == r"\U0000d83d\U0000dc8b"
+        else:
+            assert buf == r"\U0001f48b"
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy default: Test and fix: the parser would use UTF-16 and produce surrogates

Reply via email to