Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95233:26082fc25722
Date: 2018-10-24 07:22 +0300
http://bitbucket.org/pypy/pypy/changeset/26082fc25722/
Log: fix for MAXUNICODE < 65536
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1530,7 +1530,10 @@
if size == 0:
return '', 0
- unicode_bytes = 4
+ if runicode.MAXUNICODE < 65536:
+ unicode_bytes = 2
+ else:
+ unicode_bytes = 4
if BYTEORDER == "little":
start = 0
stop = unicode_bytes
@@ -1554,7 +1557,7 @@
for j in range(start, stop, step):
t += r_uint(ord(s[pos + j])) << (h*8)
h += 1
- if t > 0x10ffff:
+ if t > runicode.MAXUNICODE:
res, pos = errorhandler(errors, "unicode_internal",
"unichr(%d) not in range" % (t,),
s, pos, pos + unicode_bytes)
@@ -1571,18 +1574,24 @@
if size == 0:
return ''
- result = StringBuilder(size * 4)
+ if runicode.MAXUNICODE < 65536:
+ unicode_bytes = 2
+ else:
+ unicode_bytes = 4
+ result = StringBuilder(size * unicode_bytes)
pos = 0
while pos < size:
oc = rutf8.codepoint_at_pos(s, pos)
if BYTEORDER == "little":
result.append(chr(oc & 0xFF))
result.append(chr(oc >> 8 & 0xFF))
- result.append(chr(oc >> 16 & 0xFF))
- result.append(chr(oc >> 24 & 0xFF))
+ if unicode_bytes > 2:
+ result.append(chr(oc >> 16 & 0xFF))
+ result.append(chr(oc >> 24 & 0xFF))
else:
- result.append(chr(oc >> 24 & 0xFF))
- result.append(chr(oc >> 16 & 0xFF))
+ if unicode_bytes > 2:
+ result.append(chr(oc >> 24 & 0xFF))
+ result.append(chr(oc >> 16 & 0xFF))
result.append(chr(oc >> 8 & 0xFF))
result.append(chr(oc & 0xFF))
pos = rutf8.next_codepoint_pos(s, pos)
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -248,7 +248,7 @@
check_exception(space, w_exc)
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
w_obj = space.getattr(w_exc, space.newtext('object'))
- space.realutf8_w(w_obj) # weeoes
+ space.realutf8_w(w_obj) # for errors
w_obj = space.convert_arg_to_w_unicode(w_obj)
start = space.int_w(space.getattr(w_exc, space.newtext('start')))
w_end = space.getattr(w_exc, space.newtext('end'))
@@ -275,17 +275,22 @@
check_exception(space, w_exc)
if (space.isinstance_w(w_exc, space.w_UnicodeEncodeError) or
- space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
- obj = space.realunicode_w(space.getattr(w_exc, space.newtext('object')))
+ space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
+ w_obj = space.getattr(w_exc, space.newtext('object'))
+ space.realutf8_w(w_obj) # for errors
+ w_obj = space.convert_arg_to_w_unicode(w_obj)
start = space.int_w(space.getattr(w_exc, space.newtext('start')))
w_end = space.getattr(w_exc, space.newtext('end'))
end = space.int_w(w_end)
+ start = w_obj._index_to_byte(start)
+ end = w_obj._index_to_byte(end)
builder = StringBuilder()
pos = start
+ obj = w_obj._utf8
while pos < end:
- oc = ord(obj[pos])
- raw_unicode_escape_helper(builder, oc)
- pos += 1
+ code = rutf8.codepoint_at_pos(obj, pos)
+ raw_unicode_escape_helper(builder, code)
+ pos = rutf8.next_codepoint_pos(obj, pos)
return space.newtuple([space.newtext(builder.build()), w_end])
elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
obj = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit