[pypy-commit] pypy unicode-utf8: * Improve ascii/utf8 codecs and unicode escape

fijal Mon, 20 Nov 2017 04:58:01 -0800

Author: fijal
Branch: unicode-utf8
Changeset: r93091:4668380f4c79
Date: 2017-11-20 13:56 +0100
http://bitbucket.org/pypy/pypy/changeset/4668380f4c79/


Log:    * Improve ascii/utf8 codecs and unicode escape
        * Raise instead of looping infinitely when errorhandler returns
        nonsense

diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -33,25 +33,33 @@
     assert lst == [("??", "ascii", input, 0, 2),
                    ("??", "ascii", input, 5, 7)]
 
+@given(strategies.text())
+def test_utf8_encode_ascii_2(u):
+    def eh(errors, encoding, reason, p, start, end):
+        return "?" * (end - start), end
+
+    assert utf8_encode_ascii(u.encode("utf8"), "replace", eh) == u.encode("ascii", "replace")
+
 def test_str_decode_ascii():
-    assert str_decode_ascii("abc", 3, "??", True, "??") == ("abc", 3, 3)
+    assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3, rutf8.FLAG_ASCII)
     def eh(errors, encoding, reason, p, start, end):
         lst.append((errors, encoding, p, start, end))
-        return u"\u1234\u5678", end
+        return u"\u1234\u5678".encode("utf8"), end
     lst = []
     input = "\xe8"
     exp = u"\u1234\u5678".encode("utf8")
-    assert str_decode_ascii(input, 1, "??", True, eh) == (exp, 1, 2)
+    assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2, rutf8.FLAG_REGULAR)
     assert lst == [("??", "ascii", input, 0, 1)]
     lst = []
     input = "\xe8\xe9abc\xea\xeb"
-    assert str_decode_ascii(input, 7, "??", True, eh) == (
-        exp + exp + "abc" + exp + exp, 7, 11)
+    assert str_decode_ascii(input, "??", True, eh) == (
+        exp + exp + "abc" + exp + exp, 7, 11, rutf8.FLAG_REGULAR)
     assert lst == [("??", "ascii", input, 0, 1),
                    ("??", "ascii", input, 1, 2),
                    ("??", "ascii", input, 5, 6),
                    ("??", "ascii", input, 6, 7)]
 
-@given(strategies.binary())
-def test_unicode_raw_escape(s):
-    uh.utf8_encode_raw_unicode_escape(s, 'strict')
+@given(strategies.text())
+def test_unicode_raw_escape(u):
+    r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict')
+    assert r == u.encode("raw-unicode-escape")
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -158,6 +158,7 @@
                 res.append(chr(oc))
                 i += 1
             else:
+                XXX
                 r, pos = errorhandler(errors, 'latin1',
                                       'ordinal not in range(256)', s, cur,
                                       cur + 1)
@@ -179,10 +180,15 @@
     pos = 0
     while i < len(utf8):
         ch = rutf8.codepoint_at_pos(utf8, i)
-        if ch >= 0x7F:
+        if ch > 0x7F:
+            endpos = pos + 1
+            end_i = rutf8.next_codepoint_pos(utf8, i)
+            while end_i < len(utf8) and rutf8.codepoint_at_pos(utf8, end_i) > 0x7F:
+                endpos += 1
+                end_i = rutf8.next_codepoint_pos(utf8, end_i)
             msg = "ordinal not in range(128)"
             r, newpos = errorhandler(errors, 'ascii', msg, utf8,
-                pos, pos + 1)
+                pos, endpos)
             for _ in range(newpos - pos):
                 i = rutf8.next_codepoint_pos(utf8, i)
             pos = newpos
@@ -603,13 +609,13 @@
     result = StringBuilder(size)
     pos = 0
     while pos < size:
-        oc = ord(s[pos])
+        oc = rutf8.codepoint_at_pos(s, pos)
 
         if oc < 0x100:
             result.append(chr(oc))
         else:
             raw_unicode_escape_helper(result, oc)
-        pos += 1
+        pos = rutf8.next_codepoint_pos(s, pos)
 
     return result.build()
 
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -71,6 +71,9 @@
                 raise oefmt(space.w_IndexError,
                             "position %d from error handler out of bounds",
                             newpos)
+            if newpos < startpos:
+                raise oefmt(space.w_IndexError,
+                    "position %d from error handler did not progress", newpos)
             w_replace = space.convert_to_w_unicode(w_replace)
             return w_replace._utf8, newpos
         return call_errorhandler
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: * Improve ascii/utf8 codecs and unicode escape

Reply via email to