[pypy-commit] pypy unicode-utf8-py3: fix for MAXUNICODE < 65536

mattip Tue, 23 Oct 2018 21:24:00 -0700

Author: Matti Picus <matti.pi...@gmail.com>
Branch: unicode-utf8-py3
Changeset: r95233:26082fc25722
Date: 2018-10-24 07:22 +0300
http://bitbucket.org/pypy/pypy/changeset/26082fc25722/


Log:    fix for MAXUNICODE < 65536

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1530,7 +1530,10 @@
     if size == 0:
         return '', 0
 
-    unicode_bytes = 4
+    if runicode.MAXUNICODE < 65536:
+        unicode_bytes = 2
+    else:
+        unicode_bytes = 4
     if BYTEORDER == "little":
         start = 0
         stop = unicode_bytes
@@ -1554,7 +1557,7 @@
         for j in range(start, stop, step):
             t += r_uint(ord(s[pos + j])) << (h*8)
             h += 1
-        if t > 0x10ffff:
+        if t > runicode.MAXUNICODE:
             res, pos = errorhandler(errors, "unicode_internal",
                                     "unichr(%d) not in range" % (t,),
                                     s, pos, pos + unicode_bytes)
@@ -1571,18 +1574,24 @@
     if size == 0:
         return ''
 
-    result = StringBuilder(size * 4)
+    if runicode.MAXUNICODE < 65536:
+        unicode_bytes = 2
+    else:
+        unicode_bytes = 4
+    result = StringBuilder(size * unicode_bytes)
     pos = 0
     while pos < size:
         oc = rutf8.codepoint_at_pos(s, pos)
         if BYTEORDER == "little":
             result.append(chr(oc       & 0xFF))
             result.append(chr(oc >>  8 & 0xFF))
-            result.append(chr(oc >> 16 & 0xFF))
-            result.append(chr(oc >> 24 & 0xFF))
+            if unicode_bytes > 2:
+                result.append(chr(oc >> 16 & 0xFF))
+                result.append(chr(oc >> 24 & 0xFF))
         else:
-            result.append(chr(oc >> 24 & 0xFF))
-            result.append(chr(oc >> 16 & 0xFF))
+            if unicode_bytes > 2:
+                result.append(chr(oc >> 24 & 0xFF))
+                result.append(chr(oc >> 16 & 0xFF))
             result.append(chr(oc >>  8 & 0xFF))
             result.append(chr(oc       & 0xFF))
         pos = rutf8.next_codepoint_pos(s, pos)
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -248,7 +248,7 @@
     check_exception(space, w_exc)
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
         w_obj = space.getattr(w_exc, space.newtext('object'))
-        space.realutf8_w(w_obj) # weeoes
+        space.realutf8_w(w_obj) # for errors
         w_obj = space.convert_arg_to_w_unicode(w_obj)
         start = space.int_w(space.getattr(w_exc, space.newtext('start')))
         w_end = space.getattr(w_exc, space.newtext('end'))
@@ -275,17 +275,22 @@
 
     check_exception(space, w_exc)
     if (space.isinstance_w(w_exc, space.w_UnicodeEncodeError) or
-        space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
-        obj = space.realunicode_w(space.getattr(w_exc, space.newtext('object')))
+            space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
+        w_obj = space.getattr(w_exc, space.newtext('object'))
+        space.realutf8_w(w_obj) # for errors
+        w_obj = space.convert_arg_to_w_unicode(w_obj)
         start = space.int_w(space.getattr(w_exc, space.newtext('start')))
         w_end = space.getattr(w_exc, space.newtext('end'))
         end = space.int_w(w_end)
+        start = w_obj._index_to_byte(start)
+        end = w_obj._index_to_byte(end)
         builder = StringBuilder()
         pos = start
+        obj = w_obj._utf8
         while pos < end:
-            oc = ord(obj[pos])
-            raw_unicode_escape_helper(builder, oc)
-            pos += 1
+            code = rutf8.codepoint_at_pos(obj, pos)
+            raw_unicode_escape_helper(builder, code)
+            pos = rutf8.next_codepoint_pos(obj, pos)
         return space.newtuple([space.newtext(builder.build()), w_end])
     elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
         obj = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8-py3: fix for MAXUNICODE < 65536

Reply via email to