Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95335:de06359bbf5c
Date: 2018-11-16 12:28 -0800
http://bitbucket.org/pypy/pypy/changeset/de06359bbf5c/
Log: refactor builtin error handlers to use utf8 indices, add failing
test
diff --git a/pypy/module/_codecs/interp_codecs.py
b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -379,25 +379,23 @@
def surrogatepass_errors(space, w_exc):
check_exception(space, w_exc)
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
- utf8 = space.utf8_w(space.getattr(w_exc, space.newtext('object')))
+ w_obj = space.getattr(w_exc, space.newtext('object'))
start = space.int_w(space.getattr(w_exc, space.newtext('start')))
w_end = space.getattr(w_exc, space.newtext('end'))
encoding = space.text_w(space.getattr(w_exc,
space.newtext('encoding')))
- msg = space.text_w(space.getattr(w_exc, space.newtext('reason')))
bytelength, code = get_standard_encoding(encoding)
if code == ENC_UNKNOWN:
# Not supported, fail with original exception
raise OperationError(space.type(w_exc), w_exc)
end = space.int_w(w_end)
builder = StringBuilder()
+ start = w_obj._index_to_byte(start)
+ end = w_obj._index_to_byte(end)
+ obj = w_obj._utf8
pos = start
- # start, end are in codepoint indices
- itr = rutf8.Utf8StringIterator(utf8)
- for i in range(pos):
- itr.next()
while pos < end:
- ch = itr.next()
- pos += 1
+ ch = rutf8.codepoint_at_pos(obj, pos)
+ pos = rutf8.next_codepoint_pos(obj, pos)
if ch < 0xd800 or ch > 0xdfff:
# Not a surrogate, fail with original exception
raise OperationError(space.type(w_exc), w_exc)
@@ -465,22 +463,22 @@
def surrogateescape_errors(space, w_exc):
check_exception(space, w_exc)
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
- utf8 = space.utf8_w(space.getattr(w_exc, space.newtext('object')))
+ w_obj = space.getattr(w_exc, space.newtext('object'))
start = space.int_w(space.getattr(w_exc, space.newtext('start')))
w_end = space.getattr(w_exc, space.newtext('end'))
end = space.int_w(w_end)
res = ''
+ start = w_obj._index_to_byte(start)
+ end = w_obj._index_to_byte(end)
+ obj = w_obj._utf8
pos = start
- itr = rutf8.Utf8StringIterator(utf8)
- for i in range(pos):
- itr.next()
while pos < end:
- ch = itr.next()
- pos += 1
- if ch < 0xdc80 or ch > 0xdcff:
+ code = rutf8.codepoint_at_pos(obj, pos)
+ if code < 0xdc80 or code > 0xdcff:
# Not a UTF-8b surrogate, fail with original exception
raise OperationError(space.type(w_exc), w_exc)
- res += chr(ch - 0xdc00)
+ res += chr(code - 0xdc00)
+ pos = rutf8.next_codepoint_pos(obj, pos)
return space.newtuple([space.newbytes(res), w_end])
elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
consumed = 0
diff --git a/pypy/module/_codecs/test/test_codecs.py
b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -625,6 +625,8 @@
assert '[\uDC80]'.encode('utf-8', 'namereplace') == b'[\\udc80]'
def test_surrogateescape(self):
+ assert "\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1",
+ "surrogateescape") == b"\xe4\xeb\xef\xf6\xfc"
assert b'a\x80b'.decode('utf-8', 'surrogateescape') == 'a\udc80b'
assert 'a\udc80b'.encode('utf-8', 'surrogateescape') == b'a\x80b'
for enc in ('utf-8', 'ascii', 'latin-1', 'charmap'):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit