[pypy-commit] pypy unicode-utf8-py3: refactor builtin error handlers to use utf8 indices, add failing test

mattip Sun, 18 Nov 2018 19:36:50 -0800

Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95335:de06359bbf5c
Date: 2018-11-16 12:28 -0800
http://bitbucket.org/pypy/pypy/changeset/de06359bbf5c/


Log:    refactor builtin error handlers to use utf8 indices, add failing
        test

diff --git a/pypy/module/_codecs/interp_codecs.py 
b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -379,25 +379,23 @@
 def surrogatepass_errors(space, w_exc):
     check_exception(space, w_exc)
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
-        utf8 = space.utf8_w(space.getattr(w_exc, space.newtext('object')))
+        w_obj = space.getattr(w_exc, space.newtext('object'))
         start = space.int_w(space.getattr(w_exc, space.newtext('start')))
         w_end = space.getattr(w_exc, space.newtext('end'))
         encoding = space.text_w(space.getattr(w_exc, 
space.newtext('encoding')))
-        msg = space.text_w(space.getattr(w_exc, space.newtext('reason')))
         bytelength, code = get_standard_encoding(encoding)
         if code == ENC_UNKNOWN:
             # Not supported, fail with original exception
             raise OperationError(space.type(w_exc), w_exc)
         end = space.int_w(w_end)
         builder = StringBuilder()
+        start = w_obj._index_to_byte(start)
+        end = w_obj._index_to_byte(end)
+        obj = w_obj._utf8
         pos = start
-        # start, end are in codepoint indices
-        itr = rutf8.Utf8StringIterator(utf8)
-        for i in range(pos):
-            itr.next()
         while pos < end:
-            ch = itr.next()
-            pos += 1
+            ch = rutf8.codepoint_at_pos(obj, pos)
+            pos = rutf8.next_codepoint_pos(obj, pos)
             if ch < 0xd800 or ch > 0xdfff:
                 # Not a surrogate, fail with original exception
                 raise OperationError(space.type(w_exc), w_exc)
@@ -465,22 +463,22 @@
 def surrogateescape_errors(space, w_exc):
     check_exception(space, w_exc)
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
-        utf8 = space.utf8_w(space.getattr(w_exc, space.newtext('object')))
+        w_obj = space.getattr(w_exc, space.newtext('object'))
         start = space.int_w(space.getattr(w_exc, space.newtext('start')))
         w_end = space.getattr(w_exc, space.newtext('end'))
         end = space.int_w(w_end)
         res = ''
+        start = w_obj._index_to_byte(start)
+        end = w_obj._index_to_byte(end)
+        obj = w_obj._utf8
         pos = start
-        itr = rutf8.Utf8StringIterator(utf8)
-        for i in range(pos):
-            itr.next()
         while pos < end:
-            ch = itr.next()
-            pos += 1
-            if ch < 0xdc80 or ch > 0xdcff:
+            code = rutf8.codepoint_at_pos(obj, pos)
+            if code < 0xdc80 or code > 0xdcff:
                 # Not a UTF-8b surrogate, fail with original exception
                 raise OperationError(space.type(w_exc), w_exc)
-            res += chr(ch - 0xdc00)
+            res += chr(code - 0xdc00)
+            pos = rutf8.next_codepoint_pos(obj, pos)
         return space.newtuple([space.newbytes(res), w_end])
     elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
         consumed = 0
diff --git a/pypy/module/_codecs/test/test_codecs.py 
b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -625,6 +625,8 @@
         assert '[\uDC80]'.encode('utf-8', 'namereplace') == b'[\\udc80]'
 
     def test_surrogateescape(self):
+        assert "\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1",
+                             "surrogateescape") == b"\xe4\xeb\xef\xf6\xfc"
         assert b'a\x80b'.decode('utf-8', 'surrogateescape') == 'a\udc80b'
         assert 'a\udc80b'.encode('utf-8', 'surrogateescape') == b'a\x80b'
         for enc in ('utf-8', 'ascii', 'latin-1', 'charmap'):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8-py3: refactor builtin error handlers to use utf8 indices, add failing test

Reply via email to