fix_test_codecs (pull request #612)

cfbolz Mon, 28 May 2018 23:09:01 -0700

Author: Carl Friedrich Bolz-Tereick <cfb...@gmx.de>
Branch: py3.6
Changeset: r94705:24d343241901
Date: 2018-05-29 06:07 +0000
http://bitbucket.org/pypy/pypy/changeset/24d343241901/


Log:    Merged in toumorokoshi/pypy/fix_test_codecs (pull request #612)

        Fix test codecs

diff --git a/lib-python/3/test/test_codecs.py b/lib-python/3/test/test_codecs.py
--- a/lib-python/3/test/test_codecs.py
+++ b/lib-python/3/test/test_codecs.py
@@ -2468,7 +2468,8 @@
                 with self.assertWarns(DeprecationWarning):
                     check(b"\\" + b, "\\" + chr(i))
             if b.upper() not in b'UN':
-                with self.assertWarns(DeprecationWarning):
+                with self.assertWarns(DeprecationWarning,
+                                      msg="character {} did not raise an exception".format(i)):
                     check(b"\\" + b.upper(), "\\" + chr(i-32))
         with self.assertWarns(DeprecationWarning):
             check(br"\8", "\\8")
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -117,12 +117,6 @@
     v, first_escape_error_char = PyString_DecodeEscape(
         space, substr, 'strict', encoding)
 
-    if first_escape_error_char != '':
-        space.warn(
-            space.newtext("invalid escape sequence '\\%s'"
-                          % first_escape_error_char),
-            space.w_DeprecationWarning)
-
     return space.newbytes(v)
 
 def decode_unicode_utf8(space, s, ps, q):
@@ -252,6 +246,13 @@
             # an arbitry number of unescaped UTF-8 bytes may follow.
 
     buf = builder.build()
+
+    if first_escape_error_char != '':
+        space.warn(
+            space.newtext("invalid escape sequence '\\%s'"
+                          % first_escape_error_char),
+            space.w_DeprecationWarning)
+
     return buf, first_escape_error_char
 
 
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -3,7 +3,10 @@
 import struct
 import sys
 from pypy.interpreter.unicodehelper import (
-    encode_utf8, decode_utf8, unicode_encode_utf_32_be, str_decode_utf_32_be)
+    encode_utf8, decode_utf8,
+    unicode_encode_utf_8,
+    unicode_encode_utf_32_be, str_decode_utf_32_be
+)
 from pypy.interpreter.unicodehelper import encode_utf8sp, decode_utf8sp
 
 
@@ -28,6 +31,35 @@
     c = u"\udc00"
     py.test.raises(Hit, encode_utf8, space, u"\ud800" + c)
 
+
+def test_encode_utf_8_combine_surrogates():
+    """
+    In the case of a surrogate pair, the error handler should
+    return back a start and stop position of the full surrogate
+    pair (new behavior inherited from python3.6)
+    """
+    u = u"\udc80\ud800\udfff"
+
+    handler_num = 0
+
+    def errorhandler(errors, encoding, msg, s, start, end):
+        """
+        This handler will be called twice, so asserting both times:
+
+        1. the first time, 0xDC80 will be handled as a single surrogate,
+           since it is a standalone character and an invalid surrogate.
+        2. the second time, the characters will be 0xD800 and 0xDFFF, since
+           that is a valid surrogate pair.
+        """
+        assert s[start:end] in [u'\udc80', u'\uD800\uDFFF']
+        return [], None, end
+
+    unicode_encode_utf_8(
+        u, len(u), True,
+        errorhandler=errorhandler,
+        allow_surrogates=False
+    )
+
 def test_encode_utf8_allow_surrogates():
     sp = FakeSpace()
     assert encode_utf8(sp, u"\ud800", allow_surrogates=True) == "\xed\xa0\x80"
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,12 +1,13 @@
 import sys
 from pypy.interpreter.error import OperationError, oefmt
-from rpython.rlib.objectmodel import specialize
-from rpython.rlib.rarithmetic import intmask
+from rpython.rlib.objectmodel import specialize, we_are_translated
+from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
-from rpython.rlib import runicode
+from rpython.rlib import runicode, jit, nonconst
 from rpython.rlib.runicode import (
     default_unicode_error_encode, default_unicode_error_decode,
     MAXUNICODE, BYTEORDER, BYTEORDER2, UNICHR)
+from rpython.tool.sourcetools import func_with_new_name
 
 _WIN32 = sys.platform == 'win32'
 _MACOSX = sys.platform == 'darwin'
@@ -85,7 +86,7 @@
                                     force_replace=False)
     elif _MACOSX:
         uni = space.unicode_w(w_uni)
-        bytes = runicode.unicode_encode_utf_8_impl(
+        bytes = unicode_encode_utf_8_impl(
             uni, len(uni), 'surrogateescape',
             errorhandler=state.encode_error_handler,
             allow_surrogates=False)
@@ -117,12 +118,176 @@
     from pypy.module._codecs import interp_codecs
     state = space.fromcache(interp_codecs.CodecState)
     unicodedata_handler = state.get_unicodedata_handler(space)
-    result, consumed = runicode.str_decode_unicode_escape(
+    result, consumed, first_escape_error_char = str_decode_unicode_escape(
         string, len(string), "strict",
         final=True, errorhandler=decode_error_handler(space),
         unicodedata_handler=unicodedata_handler)
     return result
 
+
+hexdigits = "0123456789ABCDEFabcdef"
+
+
+def hexescape(builder, s, pos, digits,
+              encoding, errorhandler, message, errors):
+    chr = 0
+    if pos + digits > len(s):
+        endinpos = pos
+        while endinpos < len(s) and s[endinpos] in hexdigits:
+            endinpos += 1
+        res, pos = errorhandler(errors, encoding,
+                                message, s, pos-2, endinpos)
+        builder.append(res)
+    else:
+        try:
+            chr = r_uint(int(s[pos:pos+digits], 16))
+        except ValueError:
+            endinpos = pos
+            while s[endinpos] in hexdigits:
+                endinpos += 1
+            res, pos = errorhandler(errors, encoding,
+                                    message, s, pos-2, endinpos)
+            builder.append(res)
+        else:
+            # when we get here, chr is a 32-bit unicode character
+            if chr <= MAXUNICODE:
+                builder.append(UNICHR(chr))
+                pos += digits
+
+            elif chr <= 0x10ffff:
+                chr -= 0x10000L
+                builder.append(unichr(0xD800 + (chr >> 10)))
+                builder.append(unichr(0xDC00 + (chr & 0x03FF)))
+                pos += digits
+            else:
+                message = "illegal Unicode character"
+                res, pos = errorhandler(errors, encoding,
+                                        message, s, pos-2, pos+digits)
+                builder.append(res)
+    return pos
+
+
+def str_decode_unicode_escape(s, size, errors, final=False,
+                              errorhandler=None,
+                              unicodedata_handler=None):
+    if errorhandler is None:
+        errorhandler = default_unicode_error_decode
+
+    if size == 0:
+        return u'', 0, None
+
+    builder = UnicodeBuilder(size)
+    pos = 0
+    first_escape_error_char = None
+    while pos < size:
+        ch = s[pos]
+
+        # Non-escape characters are interpreted as Unicode ordinals
+        if ch != '\\':
+            builder.append(unichr(ord(ch)))
+            pos += 1
+            continue
+
+        # - Escapes
+        pos += 1
+        if pos >= size:
+            message = "\\ at end of string"
+            res, pos = errorhandler(errors, "unicodeescape",
+                                    message, s, pos-1, size)
+            builder.append(res)
+            continue
+
+        ch = s[pos]
+        pos += 1
+        # \x escapes
+        if ch == '\n': pass
+        elif ch == '\\': builder.append(u'\\')
+        elif ch == '\'': builder.append(u'\'')
+        elif ch == '\"': builder.append(u'\"')
+        elif ch == 'b' : builder.append(u'\b')
+        elif ch == 'f' : builder.append(u'\f')
+        elif ch == 't' : builder.append(u'\t')
+        elif ch == 'n' : builder.append(u'\n')
+        elif ch == 'r' : builder.append(u'\r')
+        elif ch == 'v' : builder.append(u'\v')
+        elif ch == 'a' : builder.append(u'\a')
+        elif '0' <= ch <= '7':
+            x = ord(ch) - ord('0')
+            if pos < size:
+                ch = s[pos]
+                if '0' <= ch <= '7':
+                    pos += 1
+                    x = (x<<3) + ord(ch) - ord('0')
+                    if pos < size:
+                        ch = s[pos]
+                        if '0' <= ch <= '7':
+                            pos += 1
+                            x = (x<<3) + ord(ch) - ord('0')
+            builder.append(unichr(x))
+        # hex escapes
+        # \xXX
+        elif ch == 'x':
+            digits = 2
+            message = "truncated \\xXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+
+        # \uXXXX
+        elif ch == 'u':
+            digits = 4
+            message = "truncated \\uXXXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+
+        #  \UXXXXXXXX
+        elif ch == 'U':
+            digits = 8
+            message = "truncated \\UXXXXXXXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+
+        # \N{name}
+        elif ch == 'N' and unicodedata_handler is not None:
+            message = "malformed \\N character escape"
+            look = pos
+
+            if look < size and s[look] == '{':
+                # look for the closing brace
+                while look < size and s[look] != '}':
+                    look += 1
+                if look < size and s[look] == '}':
+                    # found a name.  look it up in the unicode database
+                    message = "unknown Unicode character name"
+                    name = s[pos+1:look]
+                    code = unicodedata_handler.call(name)
+                    if code < 0:
+                        res, pos = errorhandler(errors, "unicodeescape",
+                                                message, s, pos-1, look+1)
+                        builder.append(res)
+                        continue
+                    pos = look + 1
+                    if code <= MAXUNICODE:
+                        builder.append(UNICHR(code))
+                    else:
+                        code -= 0x10000L
+                        builder.append(unichr(0xD800 + (code >> 10)))
+                        builder.append(unichr(0xDC00 + (code & 0x03FF)))
+                else:
+                    res, pos = errorhandler(errors, "unicodeescape",
+                                            message, s, pos-1, look+1)
+                    builder.append(res)
+            else:
+                res, pos = errorhandler(errors, "unicodeescape",
+                                        message, s, pos-1, look+1)
+                builder.append(res)
+        else:
+            first_escape_error_char = unichr(ord(ch))
+            builder.append(u'\\')
+            builder.append(unichr(ord(ch)))
+
+    return builder.build(), pos, first_escape_error_char
+
+
 def decode_raw_unicode_escape(space, string):
     result, consumed = runicode.str_decode_raw_unicode_escape(
         string, len(string), "strict",
@@ -149,11 +314,109 @@
     # allowed, either paired or lone.  A paired surrogate is considered
     # like the non-BMP character it stands for.  See also *_utf8sp().
     assert isinstance(uni, unicode)
-    return runicode.unicode_encode_utf_8(
+    return unicode_encode_utf_8(
         uni, len(uni), "strict",
         errorhandler=encode_error_handler(space),
         allow_surrogates=allow_surrogates)
 
+@jit.elidable
+def unicode_encode_utf_8(s, size, errors, errorhandler=None,
+                         allow_surrogates=False):
+    # In this function, allow_surrogates can be:
+    #
+    #  * True:  surrogates are always allowed.  A valid surrogate pair
+    #           is replaced with the non-BMP unicode char it stands for,
+    #           which is then encoded as 4 bytes.
+    #
+    #  * False: surrogates are always forbidden.
+    #
+    # See also unicode_encode_utf8sp().
+    #
+    if errorhandler is None:
+        errorhandler = default_unicode_error_encode
+    return unicode_encode_utf_8_elidable(s, size, errors, errorhandler,
+                                         allow_surrogates=allow_surrogates)
+
+def unicode_encode_utf_8_impl(s, size, errors, errorhandler,
+                              allow_surrogates=False):
+    assert(size >= 0)
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = ord(s[pos])
+        pos += 1
+        if ch < 0x80:
+            # Encode ASCII
+            result.append(chr(ch))
+        elif ch < 0x0800:
+            # Encode Latin-1
+            result.append(chr((0xc0 | (ch >> 6))))
+            result.append(chr((0x80 | (ch & 0x3f))))
+        else:
+            # Encode UCS2 Unicode ordinals
+            if ch < 0x10000:
+                # Special case: check for surrogates
+                if 0xD800 <= ch <= 0xDFFF:
+                    error_start_pos = pos - 1
+                    if pos != size:
+                        ch2 = ord(s[pos])
+                        # check if the first character is a high surrogate,
+                        # and the second character is a low surrogate. If so,
+                        # they should be handled collectively.
+                        if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFFF:
+                            # pos should be incremented regardless.
+                            # by doing so, it ensures the lower surrogate
+                            # is also included in the characters considered
+                            # in the errorhandler.
+                            pos += 1
+                            # if we allow surrogates, we should combine
+                            # the two and form a UCS4 value
+                            if allow_surrogates or MAXUNICODE < 65535 or is_narrow_host():
+                                ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
+                                assert ch3 >= 0
+                                _encodeUCS4(result, ch3)
+                                continue
+                    # note: if the program only ever calls this with
+                    # allow_surrogates=True, then we'll never annotate
+                    # the following block of code, and errorhandler()
+                    # will never be called.  This causes RPython
+                    # problems.  Avoid it with the nonconst hack.
+                    if not allow_surrogates or nonconst.NonConstant(False):
+                        ru, rs, pos = errorhandler(errors, 'utf8',
+                                                   'surrogates not allowed',
+                                                   s, error_start_pos, pos)
+                        if rs is not None:
+                            # py3k only
+                            result.append(rs)
+                            continue
+                        for ch in ru:
+                            if ord(ch) < 0x80:
+                                result.append(chr(ord(ch)))
+                            else:
+                                errorhandler('strict', 'utf8',
+                                             'surrogates not allowed',
+                                             s, pos - 1 , pos)
+                        continue
+                    # else: Fall through and handles isolated high surrogates
+                result.append((chr((0xe0 | (ch >> 12)))))
+                result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+                result.append((chr((0x80 | (ch & 0x3f)))))
+            else:
+                _encodeUCS4(result, ch)
+    return result.build()
+unicode_encode_utf_8_elidable = jit.elidable(
+    func_with_new_name(unicode_encode_utf_8_impl,
+                       "unicode_encode_utf_8_elidable"))
+
+
+def _encodeUCS4(result, ch):
+    # Encode UCS4 Unicode ordinals
+    result.append((chr((0xf0 | (ch >> 18)))))
+    result.append((chr((0x80 | ((ch >> 12) & 0x3f)))))
+    result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+    result.append((chr((0x80 | (ch & 0x3f)))))
+
+
 def encode_utf8sp(space, uni):
     # Surrogate-preserving utf-8 encoding.  Any surrogate character
     # turns into its 3-bytes encoding, whether it is paired or not.
@@ -586,3 +849,7 @@
     return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
                                         allow_surrogates, "little",
                                         'utf-32-le')
+
+
+def is_narrow_host():
+    return not we_are_translated() and sys.maxunicode == 0xFFFF
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -10,6 +10,9 @@
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
 from pypy.interpreter import unicodehelper
+from pypy.interpreter.unicodehelper import (
+    unicode_encode_utf_8_impl,
+    str_decode_unicode_escape)
 from pypy.module.unicodedata import unicodedb
 
 
@@ -735,7 +738,7 @@
     # NB. can't call unicode_encode_utf_8() directly because that's
     # an @elidable function nowadays.  Instead, we need the _impl().
     # (The problem is the errorhandler, which calls arbitrary Python.)
-    result = runicode.unicode_encode_utf_8_impl(
+    result = unicode_encode_utf_8_impl(
         uni, len(uni), errors, state.encode_error_handler,
         allow_surrogates=False)
     return space.newtuple([space.newbytes(result), space.newint(len(uni))])
@@ -947,11 +950,18 @@
 
     unicode_name_handler = state.get_unicodedata_handler(space)
 
-    result, consumed = runicode.str_decode_unicode_escape(
+    result, consumed, first_escape_error_char = str_decode_unicode_escape(
         string, len(string), errors,
         final, state.decode_error_handler,
         unicode_name_handler)
 
+    if first_escape_error_char is not None:
+        space.warn(
+            space.newtext("invalid escape sequence '\\%s'"
+                          % str(first_escape_error_char)),
+            space.w_DeprecationWarning
+        )
+
     return space.newtuple([space.newunicode(result), space.newint(consumed)])
 
 # ____________________________________________________________
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -796,6 +796,15 @@
             test_sequence = before_sequence + ill_surrogate + after_sequence
             raises(UnicodeDecodeError, test_sequence.decode, encoding)
 
+    def test_lone_surrogates_utf_8(self):
+        """
+        utf-8 should not longer allow surrogates,
+        and should return back full surrogate pairs.
+        """
+        e = raises(UnicodeEncodeError, u"\udc80\ud800\udfff".encode, "utf-8",
+                   "surrogateescape").value
+        assert e.object[e.start:e.end] == u'\ud800\udfff'
+
     def test_charmap_encode(self):
         assert 'xxx'.encode('charmap') == b'xxx'
 
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -6,7 +6,7 @@
 from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
 from rpython.rlib.runicode import (
     make_unicode_escape_function, str_decode_ascii, str_decode_utf_8,
-    unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii,
+    unicode_encode_ascii, fast_str_decode_ascii,
     unicode_encode_utf8_forbid_surrogates, SurrogateError)
 from rpython.rlib import jit
 
@@ -564,7 +564,7 @@
         if encoding is None or encoding == 'utf-8':
             u = space.unicode_w(w_object)
             eh = unicodehelper.encode_error_handler(space)
-            return space.newbytes(unicode_encode_utf_8(
+            return space.newbytes(unicodehelper.unicode_encode_utf_8(
                     u, len(u), errors, errorhandler=eh))
         elif encoding == 'ascii':
             u = space.unicode_w(w_object)
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy py3.6: Merged in toumorokoshi/pypy/fix_test_codecs (pull request #612)

Reply via email to