Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95339:9dc3de0b34d5
Date: 2018-11-18 18:36 -0800
http://bitbucket.org/pypy/pypy/changeset/9dc3de0b34d5/
Log: distinguish between errorhandler returning unicode and bytes
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -27,7 +27,7 @@
def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
assert startingpos >= 0
ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]]
- return ''.join(ux), endingpos
+ return ''.join(ux), endingpos, 'b'
@specialize.memo()
def encode_error_handler(space):
@@ -199,7 +199,7 @@
while i < len(s):
ch = s[i]
if ord(ch) > 0x7F:
- r, i = errorhandler(errors, 'ascii', 'ordinal not in range(128)',
+ r, i, rettype = errorhandler(errors, 'ascii', 'ordinal not in
range(128)',
s, i, i + 1)
res.append(r)
else:
@@ -242,7 +242,7 @@
assert pos >= 0
start = s[:pos]
upos = rutf8.codepoints_in_utf8(s, end=pos)
- ru, lgt = errorhandler(errors, 'utf8',
+ ru, lgt, rettype = errorhandler(errors, 'utf8',
'surrogates not allowed', s, upos, upos + 1)
end = utf8_encode_utf_8(s[pos+3:], errors, errorhandler,
allow_surrogates=allow_surrogates)
@@ -275,13 +275,20 @@
pos = rutf8.next_codepoint_pos(s, pos)
index += 1
msg = "ordinal not in range(256)"
- res_8, newindex = errorhandler(
+ res, newindex, rettype = errorhandler(
errors, 'latin1', msg, s, startindex, index)
- for ch in res_8:
- if ord(ch) > 0xFF:
- errorhandler("strict", 'latin1', msg, s, startindex, index)
- raise RuntimeError('error handler should not have
returned')
- result.append(ch)
+ if rettype == 'u':
+ for cp in rutf8.Utf8StringIterator(res):
+ if cp > 0xFF:
+ errorhandler("strict", 'latin1', msg, s, startindex,
index)
+ raise RuntimeError('error handler should not have
returned')
+ result.append(chr(cp))
+ else:
+ for ch in res:
+ if ord(ch) > 0xFF:
+ errorhandler("strict", 'latin1', msg, s, startindex,
index)
+ raise RuntimeError('error handler should not have
returned')
+ result.append(ch)
if index != newindex: # Should be uncommon
index = newindex
pos = rutf8._pos_at_index(s, newindex)
@@ -309,9 +316,20 @@
pos = rutf8.next_codepoint_pos(s, pos)
index += 1
msg = "ordinal not in range(128)"
- res_8, newindex = errorhandler(
+ res, newindex, rettype = errorhandler(
errors, 'ascii', msg, s, startindex, index)
- result.append(res_8)
+ if rettype == 'u':
+ for cp in rutf8.Utf8StringIterator(res):
+ if cp > 0x80:
+ errorhandler("strict", 'ascii', msg, s, startindex,
index)
+ raise RuntimeError('error handler should not have
returned')
+ result.append(chr(cp))
+ else:
+ for ch in res:
+ if ord(ch) > 0x80:
+ errorhandler("strict", 'ascii', msg, s, startindex,
index)
+ raise RuntimeError('error handler should not have
returned')
+ result.append(ch)
pos = rutf8._pos_at_index(s, newindex)
return result.build()
@@ -346,7 +364,7 @@
continue
if ordch1 <= 0xC1:
- r, pos = errorhandler(errors, "utf8", "invalid start byte",
+ r, pos, rettype = errorhandler(errors, "utf8", "invalid start
byte",
s, pos, pos + 1)
res.append(r)
continue
@@ -358,14 +376,14 @@
if not final:
pos -= 1
break
- r, pos = errorhandler(errors, "utf8", "unexpected end of data",
+ r, pos, rettype = errorhandler(errors, "utf8", "unexpected end
of data",
s, pos - 1, pos)
res.append(r)
continue
ordch2 = ord(s[pos])
if rutf8._invalid_byte_2_of_2(ordch2):
- r, pos = errorhandler(errors, "utf8", "invalid continuation
byte",
+ r, pos, rettype = errorhandler(errors, "utf8", "invalid
continuation byte",
s, pos - 1, pos)
res.append(r)
continue
@@ -380,7 +398,7 @@
if not final:
pos -= 1
break
- r, pos = errorhandler(errors, "utf8", "unexpected end of data",
+ r, pos, rettype = errorhandler(errors, "utf8", "unexpected end
of data",
s, pos - 1, pos + 1)
res.append(r)
continue
@@ -388,12 +406,12 @@
ordch3 = ord(s[pos + 1])
if rutf8._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
- r, pos = errorhandler(errors, "utf8", "invalid continuation
byte",
+ r, pos, rettype = errorhandler(errors, "utf8", "invalid
continuation byte",
s, pos - 1, pos)
res.append(r)
continue
elif rutf8._invalid_byte_3_of_3(ordch3):
- r, pos = errorhandler(errors, "utf8", "invalid continuation
byte",
+ r, pos, rettype = errorhandler(errors, "utf8", "invalid
continuation byte",
s, pos - 1, pos + 1)
res.append(r)
continue
@@ -410,26 +428,25 @@
if not final:
pos -= 1
break
- r, pos = errorhandler(errors, "utf8", "unexpected end of data",
+ r, pos, rettype = errorhandler(errors, "utf8", "unexpected end
of data",
s, pos - 1, pos)
- res.append(r)
continue
ordch2 = ord(s[pos])
ordch3 = ord(s[pos + 1])
ordch4 = ord(s[pos + 2])
if rutf8._invalid_byte_2_of_4(ordch1, ordch2):
- r, pos = errorhandler(errors, "utf8", "invalid continuation
byte",
+ r, pos, rettype = errorhandler(errors, "utf8", "invalid
continuation byte",
s, pos - 1, pos)
res.append(r)
continue
elif rutf8._invalid_byte_3_of_4(ordch3):
- r, pos = errorhandler(errors, "utf8", "invalid continuation
byte",
+ r, pos, rettype = errorhandler(errors, "utf8", "invalid
continuation byte",
s, pos - 1, pos + 1)
res.append(r)
continue
elif rutf8._invalid_byte_4_of_4(ordch4):
- r, pos = errorhandler(errors, "utf8", "invalid continuation
byte",
+ r, pos, rettype = errorhandler(errors, "utf8", "invalid
continuation byte",
s, pos - 1, pos + 2)
res.append(r)
continue
@@ -442,7 +459,7 @@
res.append(chr(ordch4))
continue
- r, pos = errorhandler(errors, "utf8", "invalid start byte",
+ r, pos, rettype = errorhandler(errors, "utf8", "invalid start byte",
s, pos - 1, pos)
res.append(r)
@@ -458,9 +475,9 @@
endinpos = pos
while endinpos < len(s) and s[endinpos] in hexdigits:
endinpos += 1
- res, pos = errorhandler(
+ r, pos, rettype = errorhandler(
errors, encoding, message, s, pos - 2, endinpos)
- builder.append(res)
+ builder.append(r)
else:
try:
chr = int(s[pos:pos + digits], 16)
@@ -468,9 +485,9 @@
endinpos = pos
while s[endinpos] in hexdigits:
endinpos += 1
- res, pos = errorhandler(
+ r, pos, rettype = errorhandler(
errors, encoding, message, s, pos - 2, endinpos)
- builder.append(res)
+ builder.append(r)
else:
# when we get here, chr is a 32-bit unicode character
try:
@@ -478,9 +495,9 @@
pos += digits
except ValueError:
message = "illegal Unicode character"
- res, pos = errorhandler(
+ r, pos, rettype = errorhandler(
errors, encoding, message, s, pos - 2, pos + digits)
- builder.append(res)
+ builder.append(r)
return pos
def str_decode_unicode_escape(s, errors, final, errorhandler, ud_handler):
@@ -506,9 +523,9 @@
pos += 1
if pos >= size:
message = "\\ at end of string"
- res, pos = errorhandler(errors, "unicodeescape",
+ r, pos, rettype = errorhandler(errors, "unicodeescape",
message, s, pos - 1, size)
- builder.append(res)
+ builder.append(r)
continue
ch = s[pos]
@@ -586,21 +603,21 @@
name = s[pos + 1:look]
code = ud_handler.call(name)
if code < 0:
- res, pos = errorhandler(
+ r, pos, rettype = errorhandler(
errors, "unicodeescape", message,
s, pos - 1, look + 1)
- builder.append(res)
+ builder.append(r)
continue
pos = look + 1
builder.append_code(code)
else:
- res, pos = errorhandler(errors, "unicodeescape",
+ r, pos, rettype = errorhandler(errors, "unicodeescape",
message, s, pos - 1, look + 1)
- builder.append(res)
+ builder.append(r)
else:
- res, pos = errorhandler(errors, "unicodeescape",
+ r, pos, rettype = errorhandler(errors, "unicodeescape",
message, s, pos - 1, look + 1)
- builder.append(res)
+ builder.append(r)
else:
builder.append_char('\\')
builder.append_code(ord(ch))
@@ -867,22 +884,22 @@
# We've seen at least one base-64 character
pos += 1
msg = "partial character in shift sequence"
- res, pos = errorhandler(errors, 'utf7',
+ r, pos, rettype = errorhandler(errors, 'utf7',
msg, s, pos-1, pos)
- reslen = rutf8.check_utf8(res, True)
+ reslen = rutf8.check_utf8(r, True)
outsize += reslen
- result.append(res)
+ result.append(r)
continue
else:
# Some bits remain; they should be zero
if base64buffer != 0:
pos += 1
msg = "non-zero padding bits in shift sequence"
- res, pos = errorhandler(errors, 'utf7',
+ r, pos, rettype = errorhandler(errors, 'utf7',
msg, s, pos-1, pos)
- reslen = rutf8.check_utf8(res, True)
+ reslen = rutf8.check_utf8(r, True)
outsize += reslen
- result.append(res)
+ result.append(r)
continue
if surrogate and _utf7_DECODE_DIRECT(ord(ch)):
@@ -917,10 +934,10 @@
startinpos = pos
pos += 1
msg = "unexpected special character"
- res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos)
- reslen = rutf8.check_utf8(res, True)
+ r, pos, rettype = errorhandler(errors, 'utf7', msg, s, pos-1, pos)
+ reslen = rutf8.check_utf8(r, True)
outsize += reslen
- result.append(res)
+ result.append(r)
# end of string
final_length = result.getlength()
@@ -931,10 +948,10 @@
base64bits >= 6 or
(base64bits > 0 and base64buffer != 0)):
msg = "unterminated shift sequence"
- res, pos = errorhandler(errors, 'utf7', msg, s, shiftOutStartPos,
pos)
- reslen = rutf8.check_utf8(res, True)
+ r, pos, rettype = errorhandler(errors, 'utf7', msg, s,
shiftOutStartPos, pos)
+ reslen = rutf8.check_utf8(r, True)
outsize += reslen
- result.append(res)
+ result.append(r)
final_length = result.getlength()
elif inShift:
pos = startinpos
@@ -1101,7 +1118,7 @@
if len(s) - pos < 2:
if not final:
break
- r, pos = errorhandler(errors, public_encoding_name,
+ r, pos, rettype = errorhandler(errors, public_encoding_name,
"truncated data",
s, pos, len(s))
result.append(r)
@@ -1118,7 +1135,7 @@
if not final:
break
errmsg = "unexpected end of data"
- r, pos = errorhandler(errors, public_encoding_name,
+ r, pos, rettype = errorhandler(errors, public_encoding_name,
errmsg, s, pos, len(s))
result.append(r)
if len(s) - pos < 2:
@@ -1131,12 +1148,12 @@
rutf8.unichr_as_utf8_append(result, ch)
continue
else:
- r, pos = errorhandler(errors, public_encoding_name,
+ r, pos, rettype = errorhandler(errors, public_encoding_name,
"illegal UTF-16 surrogate",
s, pos - 4, pos - 2)
result.append(r)
else:
- r, pos = errorhandler(errors, public_encoding_name,
+ r, pos, rettype = errorhandler(errors, public_encoding_name,
"illegal encoding",
s, pos - 2, pos)
result.append(r)
@@ -1176,44 +1193,62 @@
index = 0
while pos < size:
try:
- ch = rutf8.codepoint_at_pos(s, pos)
+ cp = rutf8.codepoint_at_pos(s, pos)
except IndexError:
# malformed codepoint, blindly use ch
- ch = ord(s[pos])
pos += 1
if errorhandler:
- res_8, newindex = errorhandler(
+ r, newindex, rettype = errorhandler(
errors, public_encoding_name, 'malformed unicode',
s, pos - 1, pos)
- for cp in rutf8.Utf8StringIterator(res_8):
- if cp < 0xD800:
+ if rettype == 'u':
+ for cp in rutf8.Utf8StringIterator(r):
+ if cp < 0xD800:
+ _STORECHAR(result, cp, byteorder)
+ else:
+ errorhandler('strict', public_encoding_name,
+ 'malformed unicode',
+ s, pos-1, pos)
+ else:
+ for ch in r:
+ cp = ord(ch)
+ if cp < 0xD800:
+ _STORECHAR(result, cp, byteorder)
+ else:
+ errorhandler('strict', public_encoding_name,
+ 'malformed unicode',
+ s, pos-1, pos)
+ else:
+ cp = ord(s[pos])
+ _STORECHAR(result, cp, byteorder)
+ continue
+ if cp < 0xD800:
+ _STORECHAR(result, cp, byteorder)
+ elif cp >= 0x10000:
+ _STORECHAR(result, 0xD800 | ((cp-0x10000) >> 10), byteorder)
+ _STORECHAR(result, 0xDC00 | ((cp-0x10000) & 0x3FF), byteorder)
+ elif cp >= 0xE000 or allow_surrogates:
+ _STORECHAR(result, cp, byteorder)
+ else:
+ r, newindex, rettype = errorhandler(
+ errors, public_encoding_name, 'surrogates not allowed',
+ s, pos, pos+1)
+ if rettype == 'u':
+ for cp in rutf8.Utf8StringIterator(r):
+ if cp < 0xD800 or allow_surrogates:
_STORECHAR(result, cp, byteorder)
else:
errorhandler('strict', public_encoding_name,
- 'malformed unicode',
- s, pos-1, pos)
+ 'surrogates not allowed',
+ s, pos, pos+1)
else:
- _STORECHAR(result, ch, byteorder)
- continue
- if ch < 0xD800:
- _STORECHAR(result, ch, byteorder)
- elif ch >= 0x10000:
- _STORECHAR(result, 0xD800 | ((ch-0x10000) >> 10), byteorder)
- _STORECHAR(result, 0xDC00 | ((ch-0x10000) & 0x3FF), byteorder)
- elif ch >= 0xE000 or allow_surrogates:
- _STORECHAR(result, ch, byteorder)
- else:
- res_8, newindex = errorhandler(
- errors, public_encoding_name, 'surrogates not allowed',
- s, pos, pos+1)
- #for cp in rutf8.Utf8StringIterator(res_8):
- for ch in res_8:
- cp = ord(ch)
- if cp < 0xD800 or allow_surrogates:
- _STORECHAR(result, cp, byteorder)
- else:
- errorhandler('strict', public_encoding_name,
- 'surrogates not allowed',
+ for ch in r:
+ cp = ord(ch)
+ if cp < 0xD800 or allow_surrogates:
+ _STORECHAR(result, cp, byteorder)
+ else:
+ errorhandler('strict', public_encoding_name,
+ 'surrogates not allowed',
s, pos, pos+1)
if index != newindex: # Should be uncommon
index = newindex
@@ -1329,7 +1364,7 @@
if len(s) - pos < 4:
if not final:
break
- r, pos = errorhandler(errors, public_encoding_name,
+ r, pos, rettype = errorhandler(errors, public_encoding_name,
"truncated data",
s, pos, len(s))
result.append(r)
@@ -1339,14 +1374,14 @@
ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) <<
16) |
(ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
if not allow_surrogates and 0xD800 <= ch <= 0xDFFF:
- r, pos = errorhandler(errors, public_encoding_name,
+ r, pos, rettype = errorhandler(errors, public_encoding_name,
"code point in surrogate code point "
"range(0xd800, 0xe000)",
s, pos, pos + 4)
result.append(r)
continue
elif ch >= 0x110000:
- r, pos = errorhandler(errors, public_encoding_name,
+ r, pos, rettype = errorhandler(errors, public_encoding_name,
"codepoint not in range(0x110000)",
s, pos, len(s))
result.append(r)
@@ -1404,11 +1439,20 @@
ch = ord(s[pos])
pos += 1
if errorhandler:
- res_8, newindex = errorhandler(
+ r, newindex, rettype = errorhandler(
errors, public_encoding_name, 'malformed unicode',
s, index, index+1)
- if res_8:
- for cp in rutf8.Utf8StringIterator(res_8):
+ if rettype == 'u' and r:
+ for cp in rutf8.Utf8StringIterator(r):
+ if cp < 0xD800:
+ _STORECHAR32(result, cp, byteorder)
+ else:
+ errorhandler('strict', public_encoding_name,
+ 'malformed unicode',
+ s, index, index+1)
+ elif r:
+ for ch in r:
+ cp = ord(ch)
if cp < 0xD800:
_STORECHAR32(result, cp, byteorder)
else:
@@ -1422,16 +1466,26 @@
index += 1
continue
if not allow_surrogates and 0xD800 <= ch < 0xE000:
- res_8, newindex = errorhandler(
+ r, newindex, rettype = errorhandler(
errors, public_encoding_name, 'surrogates not allowed',
s, index, index+1)
- for ch in rutf8.Utf8StringIterator(res_8):
- if ch < 0xD800:
- _STORECHAR32(result, ch, byteorder)
- else:
- errorhandler(
- 'strict', public_encoding_name, 'surrogates not
allowed',
- s, index, index+1)
+ if rettype == 'u':
+ for ch in rutf8.Utf8StringIterator(r):
+ if ch < 0xD800:
+ _STORECHAR32(result, ch, byteorder)
+ else:
+ errorhandler(
+ 'strict', public_encoding_name, 'surrogates not
allowed',
+ s, index, index+1)
+ else:
+ for ch in r:
+ cp = ord(ch)
+ if cp < 0xD800:
+ _STORECHAR32(result, cp, byteorder)
+ else:
+ errorhandler(
+ 'strict', public_encoding_name, 'surrogates not
allowed',
+ s, index, index+1)
if index != newindex: # Should be uncommon
index = newindex
pos = rutf8._pos_at_index(s, newindex)
@@ -1471,11 +1525,20 @@
ch = ord(s[pos])
pos += 1
if errorhandler:
- res_8, newindex = errorhandler(
+ r, newindex, rettype = errorhandler(
errors, public_encoding_name, 'malformed unicode',
s, pos - 1, pos)
- if res_8:
- for cp in rutf8.Utf8StringIterator(res_8):
+ if rettype == 'u' and r:
+ for cp in rutf8.Utf8StringIterator(r):
+ if cp < 0xD800:
+ _STORECHAR32(result, cp, byteorder)
+ else:
+ errorhandler('strict', public_encoding_name,
+ 'malformed unicode',
+ s, pos-1, pos)
+ elif r:
+ for ch in r:
+ cp = ord(ch)
if cp < 0xD800:
_STORECHAR32(result, cp, byteorder)
else:
@@ -1489,16 +1552,26 @@
index += 1
continue
if not allow_surrogates and 0xD800 <= ch < 0xE000:
- res_8, newindex = errorhandler(
+ r, newindex, rettype = errorhandler(
errors, public_encoding_name, 'surrogates not allowed',
s, pos - 1, pos)
- for ch in rutf8.Utf8StringIterator(res_8):
- if ch < 0xD800:
- _STORECHAR32(result, ch, byteorder)
- else:
- errorhandler(
- 'strict', public_encoding_name, 'surrogates not
allowed',
- s, pos - 1, pos)
+ if rettype == 'u':
+ for ch in rutf8.Utf8StringIterator(res_8):
+ if ch < 0xD800:
+ _STORECHAR32(result, ch, byteorder)
+ else:
+ errorhandler(
+ 'strict', public_encoding_name, 'surrogates not
allowed',
+ s, pos - 1, pos)
+ else:
+ for ch in res_8:
+ cp = ord(ch)
+ if cp < 0xD800:
+ _STORECHAR32(result, cp, byteorder)
+ else:
+ errorhandler(
+ 'strict', public_encoding_name, 'surrogates not
allowed',
+ s, pos - 1, pos)
if index != newindex: # Should be uncommon
index = newindex
pos = rutf8._pos_at_index(s, newindex)
@@ -1551,10 +1624,10 @@
pos = 0
while pos < size:
if pos > size - unicode_bytes:
- res, pos = errorhandler(errors, "unicode_internal",
+ r, pos, rettype = errorhandler(errors, "unicode_internal",
"truncated input",
s, pos, size)
- result.append(res)
+ result.append(r)
continue
t = r_uint(0)
h = 0
@@ -1562,10 +1635,10 @@
t += r_uint(ord(s[pos + j])) << (h*8)
h += 1
if t > runicode.MAXUNICODE:
- res, pos = errorhandler(errors, "unicode_internal",
+ r, pos, rettype = errorhandler(errors, "unicode_internal",
"unichr(%d) not in range" % (t,),
s, pos, pos + unicode_bytes)
- result.append(res)
+ result.append(r)
continue
rutf8.unichr_as_utf8_append(result, intmask(t), allow_surrogates=True)
pos += unicode_bytes
@@ -1627,7 +1700,7 @@
c = mapping.get(ord(ch), ERROR_CHAR)
if c == ERROR_CHAR:
- r, pos = errorhandler(errors, "charmap",
+ r, pos, rettype = errorhandler(errors, "charmap",
"character maps to <undefined>",
s, pos, pos + 1)
result.append(r)
@@ -1659,10 +1732,10 @@
mapping.get(rutf8.codepoint_at_pos(s, pos), '') == ''):
pos = rutf8.next_codepoint_pos(s, pos)
index += 1
- res_8, newindex = errorhandler(errors, "charmap",
+ r, newindex, rettype = errorhandler(errors, "charmap",
"character maps to <undefined>",
s, startindex, index)
- for cp2 in rutf8.Utf8StringIterator(res_8):
+ for cp2 in rutf8.Utf8StringIterator(r):
ch2 = mapping.get(cp2, '')
if not ch2:
errorhandler(
@@ -1727,7 +1800,7 @@
i += 1
end_index = i
msg = "invalid decimal Unicode string"
- r, pos = errorhandler(
+ r, pos, retype = errorhandler(
errors, 'decimal', msg, s, start_index, end_index)
for ch in rutf8.Utf8StringIterator(r):
if unicodedb.isspace(ch):
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -72,8 +72,11 @@
raise OperationError(space.w_TypeError, space.newtext(msg))
w_replace, w_newpos = space.fixedview(w_res, 2)
- if not (space.isinstance_w(w_replace, space.w_unicode) or
- (not decode and space.isinstance_w(w_replace, space.w_bytes))):
+ if space.isinstance_w(w_replace, space.w_unicode):
+ rettype = 'u'
+ elif encode and space.isinstance_w(w_replace, space.w_bytes):
+ rettype = 'b'
+ else:
if decode:
msg = ("decoding error handler must return "
"(str, int) tuple")
@@ -94,7 +97,7 @@
raise oefmt(space.w_IndexError,
"position %d from error handler out of bounds",
newpos)
- return space.utf8_w(w_replace), newpos
+ return space.utf8_w(w_replace), newpos, rettype
return call_errorhandler
def make_decode_errorhandler(self, space):
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -826,11 +826,14 @@
repl = "\u00E9"
s = "\u5678".encode("latin-1", "test.bad_handler")
assert s == b'\xe9'
+ raises(UnicodeEncodeError, "\u5678".encode, "ascii",
+ "test.bad_handler")
def test_lone_surrogates(self):
encodings = ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
'utf-32', 'utf-32-le', 'utf-32-be')
for encoding in encodings:
+ print('encoding', encoding)
raises(UnicodeEncodeError, u'\ud800'.encode, encoding)
assert (u'[\udc80]'.encode(encoding, "backslashreplace") ==
'[\\udc80]'.encode(encoding))
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1200,14 +1200,16 @@
if errors is None or errors == 'strict':
utf8 = space.utf8_w(w_object)
if encoding is None or encoding == 'utf-8':
- #if rutf8.has_surrogates(utf8):
- # utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
+ if rutf8.has_surrogates(utf8):
+ # slow path
+ return encode_text(space, w_object, encoding, errors)
return space.newbytes(utf8)
elif encoding == 'ascii':
try:
rutf8.check_ascii(utf8)
except rutf8.CheckError as a:
- eh = unicodehelper.encode_error_handler(space)
+ state = space.fromcache(CodecState)
+ eh = state.encode_error_handler
eh(None, "ascii", "ordinal not in range(128)", utf8,
a.pos, a.pos + 1)
assert False, "always raises"
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit