Author: Carl Friedrich Bolz-Tereick <cfb...@gmx.de> Branch: py3.6 Changeset: r94705:24d343241901 Date: 2018-05-29 06:07 +0000 http://bitbucket.org/pypy/pypy/changeset/24d343241901/
Log: Merged in toumorokoshi/pypy/fix_test_codecs (pull request #612) Fix test codecs diff --git a/lib-python/3/test/test_codecs.py b/lib-python/3/test/test_codecs.py --- a/lib-python/3/test/test_codecs.py +++ b/lib-python/3/test/test_codecs.py @@ -2468,7 +2468,8 @@ with self.assertWarns(DeprecationWarning): check(b"\\" + b, "\\" + chr(i)) if b.upper() not in b'UN': - with self.assertWarns(DeprecationWarning): + with self.assertWarns(DeprecationWarning, + msg="character {} did not raise an exception".format(i)): check(b"\\" + b.upper(), "\\" + chr(i-32)) with self.assertWarns(DeprecationWarning): check(br"\8", "\\8") diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py --- a/pypy/interpreter/pyparser/parsestring.py +++ b/pypy/interpreter/pyparser/parsestring.py @@ -117,12 +117,6 @@ v, first_escape_error_char = PyString_DecodeEscape( space, substr, 'strict', encoding) - if first_escape_error_char != '': - space.warn( - space.newtext("invalid escape sequence '\\%s'" - % first_escape_error_char), - space.w_DeprecationWarning) - return space.newbytes(v) def decode_unicode_utf8(space, s, ps, q): @@ -252,6 +246,13 @@ # an arbitry number of unescaped UTF-8 bytes may follow. 
buf = builder.build() + + if first_escape_error_char != '': + space.warn( + space.newtext("invalid escape sequence '\\%s'" + % first_escape_error_char), + space.w_DeprecationWarning) + return buf, first_escape_error_char diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py --- a/pypy/interpreter/test/test_unicodehelper.py +++ b/pypy/interpreter/test/test_unicodehelper.py @@ -3,7 +3,10 @@ import struct import sys from pypy.interpreter.unicodehelper import ( - encode_utf8, decode_utf8, unicode_encode_utf_32_be, str_decode_utf_32_be) + encode_utf8, decode_utf8, + unicode_encode_utf_8, + unicode_encode_utf_32_be, str_decode_utf_32_be +) from pypy.interpreter.unicodehelper import encode_utf8sp, decode_utf8sp @@ -28,6 +31,35 @@ c = u"\udc00" py.test.raises(Hit, encode_utf8, space, u"\ud800" + c) + +def test_encode_utf_8_combine_surrogates(): + """ + In the case of a surrogate pair, the error handler should + return back a start and stop position of the full surrogate + pair (new behavior inherited from python3.6) + """ + u = u"\udc80\ud800\udfff" + + handler_num = 0 + + def errorhandler(errors, encoding, msg, s, start, end): + """ + This handler will be called twice, so asserting both times: + + 1. the first time, 0xDC80 will be handled as a single surrogate, + since it is a standalone character and an invalid surrogate. + 2. the second time, the characters will be 0xD800 and 0xDFFF, since + that is a valid surrogate pair. 
+ """ + assert s[start:end] in [u'\udc80', u'\uD800\uDFFF'] + return [], None, end + + unicode_encode_utf_8( + u, len(u), True, + errorhandler=errorhandler, + allow_surrogates=False + ) + def test_encode_utf8_allow_surrogates(): sp = FakeSpace() assert encode_utf8(sp, u"\ud800", allow_surrogates=True) == "\xed\xa0\x80" diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -1,12 +1,13 @@ import sys from pypy.interpreter.error import OperationError, oefmt -from rpython.rlib.objectmodel import specialize -from rpython.rlib.rarithmetic import intmask +from rpython.rlib.objectmodel import specialize, we_are_translated +from rpython.rlib.rarithmetic import r_uint, intmask from rpython.rlib.rstring import StringBuilder, UnicodeBuilder -from rpython.rlib import runicode +from rpython.rlib import runicode, jit, nonconst from rpython.rlib.runicode import ( default_unicode_error_encode, default_unicode_error_decode, MAXUNICODE, BYTEORDER, BYTEORDER2, UNICHR) +from rpython.tool.sourcetools import func_with_new_name _WIN32 = sys.platform == 'win32' _MACOSX = sys.platform == 'darwin' @@ -85,7 +86,7 @@ force_replace=False) elif _MACOSX: uni = space.unicode_w(w_uni) - bytes = runicode.unicode_encode_utf_8_impl( + bytes = unicode_encode_utf_8_impl( uni, len(uni), 'surrogateescape', errorhandler=state.encode_error_handler, allow_surrogates=False) @@ -117,12 +118,176 @@ from pypy.module._codecs import interp_codecs state = space.fromcache(interp_codecs.CodecState) unicodedata_handler = state.get_unicodedata_handler(space) - result, consumed = runicode.str_decode_unicode_escape( + result, consumed, first_escape_error_char = str_decode_unicode_escape( string, len(string), "strict", final=True, errorhandler=decode_error_handler(space), unicodedata_handler=unicodedata_handler) return result + +hexdigits = "0123456789ABCDEFabcdef" + + +def hexescape(builder, s, pos, digits, + 
encoding, errorhandler, message, errors): + chr = 0 + if pos + digits > len(s): + endinpos = pos + while endinpos < len(s) and s[endinpos] in hexdigits: + endinpos += 1 + res, pos = errorhandler(errors, encoding, + message, s, pos-2, endinpos) + builder.append(res) + else: + try: + chr = r_uint(int(s[pos:pos+digits], 16)) + except ValueError: + endinpos = pos + while s[endinpos] in hexdigits: + endinpos += 1 + res, pos = errorhandler(errors, encoding, + message, s, pos-2, endinpos) + builder.append(res) + else: + # when we get here, chr is a 32-bit unicode character + if chr <= MAXUNICODE: + builder.append(UNICHR(chr)) + pos += digits + + elif chr <= 0x10ffff: + chr -= 0x10000L + builder.append(unichr(0xD800 + (chr >> 10))) + builder.append(unichr(0xDC00 + (chr & 0x03FF))) + pos += digits + else: + message = "illegal Unicode character" + res, pos = errorhandler(errors, encoding, + message, s, pos-2, pos+digits) + builder.append(res) + return pos + + +def str_decode_unicode_escape(s, size, errors, final=False, + errorhandler=None, + unicodedata_handler=None): + if errorhandler is None: + errorhandler = default_unicode_error_decode + + if size == 0: + return u'', 0, None + + builder = UnicodeBuilder(size) + pos = 0 + first_escape_error_char = None + while pos < size: + ch = s[pos] + + # Non-escape characters are interpreted as Unicode ordinals + if ch != '\\': + builder.append(unichr(ord(ch))) + pos += 1 + continue + + # - Escapes + pos += 1 + if pos >= size: + message = "\\ at end of string" + res, pos = errorhandler(errors, "unicodeescape", + message, s, pos-1, size) + builder.append(res) + continue + + ch = s[pos] + pos += 1 + # \x escapes + if ch == '\n': pass + elif ch == '\\': builder.append(u'\\') + elif ch == '\'': builder.append(u'\'') + elif ch == '\"': builder.append(u'\"') + elif ch == 'b' : builder.append(u'\b') + elif ch == 'f' : builder.append(u'\f') + elif ch == 't' : builder.append(u'\t') + elif ch == 'n' : builder.append(u'\n') + elif ch == 'r' : 
builder.append(u'\r') + elif ch == 'v' : builder.append(u'\v') + elif ch == 'a' : builder.append(u'\a') + elif '0' <= ch <= '7': + x = ord(ch) - ord('0') + if pos < size: + ch = s[pos] + if '0' <= ch <= '7': + pos += 1 + x = (x<<3) + ord(ch) - ord('0') + if pos < size: + ch = s[pos] + if '0' <= ch <= '7': + pos += 1 + x = (x<<3) + ord(ch) - ord('0') + builder.append(unichr(x)) + # hex escapes + # \xXX + elif ch == 'x': + digits = 2 + message = "truncated \\xXX escape" + pos = hexescape(builder, s, pos, digits, + "unicodeescape", errorhandler, message, errors) + + # \uXXXX + elif ch == 'u': + digits = 4 + message = "truncated \\uXXXX escape" + pos = hexescape(builder, s, pos, digits, + "unicodeescape", errorhandler, message, errors) + + # \UXXXXXXXX + elif ch == 'U': + digits = 8 + message = "truncated \\UXXXXXXXX escape" + pos = hexescape(builder, s, pos, digits, + "unicodeescape", errorhandler, message, errors) + + # \N{name} + elif ch == 'N' and unicodedata_handler is not None: + message = "malformed \\N character escape" + look = pos + + if look < size and s[look] == '{': + # look for the closing brace + while look < size and s[look] != '}': + look += 1 + if look < size and s[look] == '}': + # found a name. 
look it up in the unicode database + message = "unknown Unicode character name" + name = s[pos+1:look] + code = unicodedata_handler.call(name) + if code < 0: + res, pos = errorhandler(errors, "unicodeescape", + message, s, pos-1, look+1) + builder.append(res) + continue + pos = look + 1 + if code <= MAXUNICODE: + builder.append(UNICHR(code)) + else: + code -= 0x10000L + builder.append(unichr(0xD800 + (code >> 10))) + builder.append(unichr(0xDC00 + (code & 0x03FF))) + else: + res, pos = errorhandler(errors, "unicodeescape", + message, s, pos-1, look+1) + builder.append(res) + else: + res, pos = errorhandler(errors, "unicodeescape", + message, s, pos-1, look+1) + builder.append(res) + else: + if first_escape_error_char is None: first_escape_error_char = unichr(ord(ch))  # remember only the first invalid escape + builder.append(u'\\') + builder.append(unichr(ord(ch))) + + return builder.build(), pos, first_escape_error_char + + def decode_raw_unicode_escape(space, string): result, consumed = runicode.str_decode_raw_unicode_escape( string, len(string), "strict", @@ -149,11 +314,109 @@ # allowed, either paired or lone. A paired surrogate is considered # like the non-BMP character it stands for. See also *_utf8sp(). assert isinstance(uni, unicode) - return runicode.unicode_encode_utf_8( + return unicode_encode_utf_8( uni, len(uni), "strict", errorhandler=encode_error_handler(space), allow_surrogates=allow_surrogates) +@jit.elidable +def unicode_encode_utf_8(s, size, errors, errorhandler=None, + allow_surrogates=False): + # In this function, allow_surrogates can be: + # + # * True: surrogates are always allowed. A valid surrogate pair + # is replaced with the non-BMP unicode char it stands for, + # which is then encoded as 4 bytes. + # + # * False: surrogates are always forbidden. + # + # See also unicode_encode_utf8sp(). 
+ # + if errorhandler is None: + errorhandler = default_unicode_error_encode + return unicode_encode_utf_8_elidable(s, size, errors, errorhandler, + allow_surrogates=allow_surrogates) + +def unicode_encode_utf_8_impl(s, size, errors, errorhandler, + allow_surrogates=False): + assert(size >= 0) + result = StringBuilder(size) + pos = 0 + while pos < size: + ch = ord(s[pos]) + pos += 1 + if ch < 0x80: + # Encode ASCII + result.append(chr(ch)) + elif ch < 0x0800: + # Encode Latin-1 + result.append(chr((0xc0 | (ch >> 6)))) + result.append(chr((0x80 | (ch & 0x3f)))) + else: + # Encode UCS2 Unicode ordinals + if ch < 0x10000: + # Special case: check for surrogates + if 0xD800 <= ch <= 0xDFFF: + error_start_pos = pos - 1 + if pos != size: + ch2 = ord(s[pos]) + # check if the first character is a high surrogate, + # and the second character is a low surrogate. If so, + # they should be handled collectively. + if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF: + # pos should be incremented regardless. + # by doing so, it ensures the lower surrogate + # is also included in the characters considered + # in the errorhandler. + pos += 1 + # if we allow surrogates, we should combine + # the two and form a UCS4 value + if allow_surrogates or MAXUNICODE < 65535 or is_narrow_host(): + ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000 + assert ch3 >= 0 + _encodeUCS4(result, ch3) + continue + # note: if the program only ever calls this with + # allow_surrogates=True, then we'll never annotate + # the following block of code, and errorhandler() + # will never be called. This causes RPython + # problems. Avoid it with the nonconst hack. 
+ if not allow_surrogates or nonconst.NonConstant(False): + ru, rs, pos = errorhandler(errors, 'utf8', + 'surrogates not allowed', + s, error_start_pos, pos) + if rs is not None: + # py3k only + result.append(rs) + continue + for ch in ru: + if ord(ch) < 0x80: + result.append(chr(ord(ch))) + else: + errorhandler('strict', 'utf8', + 'surrogates not allowed', + s, pos - 1 , pos) + continue + # else: Fall through and handles isolated high surrogates + result.append((chr((0xe0 | (ch >> 12))))) + result.append((chr((0x80 | ((ch >> 6) & 0x3f))))) + result.append((chr((0x80 | (ch & 0x3f))))) + else: + _encodeUCS4(result, ch) + return result.build() +unicode_encode_utf_8_elidable = jit.elidable( + func_with_new_name(unicode_encode_utf_8_impl, + "unicode_encode_utf_8_elidable")) + + +def _encodeUCS4(result, ch): + # Encode UCS4 Unicode ordinals + result.append((chr((0xf0 | (ch >> 18))))) + result.append((chr((0x80 | ((ch >> 12) & 0x3f))))) + result.append((chr((0x80 | ((ch >> 6) & 0x3f))))) + result.append((chr((0x80 | (ch & 0x3f))))) + + def encode_utf8sp(space, uni): # Surrogate-preserving utf-8 encoding. Any surrogate character # turns into its 3-bytes encoding, whether it is paired or not. @@ -586,3 +849,7 @@ return unicode_encode_utf_32_helper(s, size, errors, errorhandler, allow_surrogates, "little", 'utf-32-le') + + +def is_narrow_host(): + return not we_are_translated() and sys.maxunicode == 0xFFFF diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -10,6 +10,9 @@ from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault from pypy.interpreter import unicodehelper +from pypy.interpreter.unicodehelper import ( + unicode_encode_utf_8_impl, + str_decode_unicode_escape) from pypy.module.unicodedata import unicodedb @@ -735,7 +738,7 @@ # NB. 
can't call unicode_encode_utf_8() directly because that's # an @elidable function nowadays. Instead, we need the _impl(). # (The problem is the errorhandler, which calls arbitrary Python.) - result = runicode.unicode_encode_utf_8_impl( + result = unicode_encode_utf_8_impl( uni, len(uni), errors, state.encode_error_handler, allow_surrogates=False) return space.newtuple([space.newbytes(result), space.newint(len(uni))]) @@ -947,11 +950,18 @@ unicode_name_handler = state.get_unicodedata_handler(space) - result, consumed = runicode.str_decode_unicode_escape( + result, consumed, first_escape_error_char = str_decode_unicode_escape( string, len(string), errors, final, state.decode_error_handler, unicode_name_handler) + if first_escape_error_char is not None: + space.warn( + space.newtext("invalid escape sequence '\\%s'" + % str(first_escape_error_char)), + space.w_DeprecationWarning + ) + return space.newtuple([space.newunicode(result), space.newint(consumed)]) # ____________________________________________________________ diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py --- a/pypy/module/_codecs/test/test_codecs.py +++ b/pypy/module/_codecs/test/test_codecs.py @@ -796,6 +796,15 @@ test_sequence = before_sequence + ill_surrogate + after_sequence raises(UnicodeDecodeError, test_sequence.decode, encoding) + def test_lone_surrogates_utf_8(self): + """ + utf-8 should no longer allow surrogates, + and should report full surrogate pairs. 
+ """ + e = raises(UnicodeEncodeError, u"\udc80\ud800\udfff".encode, "utf-8", + "surrogateescape").value + assert e.object[e.start:e.end] == u'\ud800\udfff' + def test_charmap_encode(self): assert 'xxx'.encode('charmap') == b'xxx' diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -6,7 +6,7 @@ from rpython.rlib.rstring import StringBuilder, UnicodeBuilder from rpython.rlib.runicode import ( make_unicode_escape_function, str_decode_ascii, str_decode_utf_8, - unicode_encode_ascii, unicode_encode_utf_8, fast_str_decode_ascii, + unicode_encode_ascii, fast_str_decode_ascii, unicode_encode_utf8_forbid_surrogates, SurrogateError) from rpython.rlib import jit @@ -564,7 +564,7 @@ if encoding is None or encoding == 'utf-8': u = space.unicode_w(w_object) eh = unicodehelper.encode_error_handler(space) - return space.newbytes(unicode_encode_utf_8( + return space.newbytes(unicodehelper.unicode_encode_utf_8( u, len(u), errors, errorhandler=eh)) elif encoding == 'ascii': u = space.unicode_w(w_object) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit