Author: Armin Rigo <ar...@tunes.org>
Branch: unicode-utf8
Changeset: r93136:dc6582a05b85
Date: 2017-11-23 10:48 +0100
http://bitbucket.org/pypy/pypy/changeset/dc6582a05b85/
Log: Review for surrogates

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -370,14 +370,15 @@
             builder.append(res)
         else:
             # when we get here, chr is a 32-bit unicode character
-            if chr > 0x10ffff:
+            try:
+                rutf8.unichr_as_utf8_append(builder, intmask(chr), True)
+            except ValueError:
                 message = "illegal Unicode character"
                 res, pos = errorhandler(errors, encoding,
                                         message, s, pos-2, pos+digits)
                 size, flag = rutf8.check_utf8(res, True)
                 builder.append(res)
             else:
-                rutf8.unichr_as_utf8_append(builder, intmask(chr), True)
                 flag = rutf8.get_flag_from_code(intmask(chr))
                 pos += digits
                 size = 1
@@ -466,7 +467,7 @@
                         pos += 1
                         x = (x<<3) + ord(ch) - ord('0')
             outsize += 1
-            if x >= 0x7F:
+            if x > 0x7F:
                 rutf8.unichr_as_utf8_append(builder, x)
                 flag = combine_flags(rutf8.FLAG_REGULAR, flag)
             else:
@@ -524,7 +525,9 @@
                     pos = look + 1
                     outsize += 1
                     flag = combine_flags(flag, rutf8.get_flag_from_code(code))
-                    rutf8.unichr_as_utf8_append(builder, code)
+                    rutf8.unichr_as_utf8_append(builder, code,
+                                                allow_surrogates=True)
+                    # xxx 'code' is probably always within range here...
                 else:
                     res, pos = errorhandler(errors, "unicodeescape",
                                             message, s, pos-1, look+1)
@@ -772,7 +775,8 @@
                     surrogate = 0
                     continue
                 else:
-                    rutf8.unichr_as_utf8_append(result, surrogate)
+                    rutf8.unichr_as_utf8_append(result, surrogate,
+                                                allow_surrogates=True)
                     flag = rutf8.FLAG_HAS_SURROGATES
                     outsize += 1
                     surrogate = 0
@@ -1236,7 +1240,7 @@
             result.append(r)
             continue
 
-        rutf8.unichr_as_utf8_append(result, ch)
+        rutf8.unichr_as_utf8_append(result, ch, allow_surrogates=True)
         pos += 4
     r = result.build()
     lgt, flag = rutf8.check_utf8(r, True)
@@ -1360,7 +1364,7 @@
                                     s, pos, pos + unicode_bytes)
             result.append(res)
             continue
-        rutf8.unichr_as_utf8_append(result, intmask(t))
+        rutf8.unichr_as_utf8_append(result, intmask(t), allow_surrogates=True)
         pos += unicode_bytes
     r = result.build()
     lgt, flag = rutf8.check_utf8(r, True)
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -127,7 +127,7 @@
                                     errorcb, namecb, stringdata)
         src = pypy_cjk_dec_outbuf(decodebuf)
         length = pypy_cjk_dec_outlen(decodebuf)
-        return rffi.wcharpsize2utf8(src, length)
+        return rffi.wcharpsize2utf8(src, length) # assumes no out-of-range chars
 
 def multibytecodec_decerror(decodebuf, e, errors,
                             errorcb, namecb, stringdata):
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1012,6 +1012,7 @@
 def wcharpsize2utf8(w, size):
     """ Helper to convert WCHARP pointer to utf8 in one go.
     Equivalent to wcharpsize2unicode().encode("utf8")
+    Raises ValueError if characters are outside range(0x110000)!
     """
     from rpython.rlib import rutf8
 
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit