Author: Matti Picus <matti.pi...@gmail.com> Branch: unicode-utf8-py3 Changeset: r95233:26082fc25722 Date: 2018-10-24 07:22 +0300 http://bitbucket.org/pypy/pypy/changeset/26082fc25722/
Log: fix for MAXUNICODE < 65536 diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -1530,7 +1530,10 @@ if size == 0: return '', 0 - unicode_bytes = 4 + if runicode.MAXUNICODE < 65536: + unicode_bytes = 2 + else: + unicode_bytes = 4 if BYTEORDER == "little": start = 0 stop = unicode_bytes @@ -1554,7 +1557,7 @@ for j in range(start, stop, step): t += r_uint(ord(s[pos + j])) << (h*8) h += 1 - if t > 0x10ffff: + if t > runicode.MAXUNICODE: res, pos = errorhandler(errors, "unicode_internal", "unichr(%d) not in range" % (t,), s, pos, pos + unicode_bytes) @@ -1571,18 +1574,24 @@ if size == 0: return '' - result = StringBuilder(size * 4) + if runicode.MAXUNICODE < 65536: + unicode_bytes = 2 + else: + unicode_bytes = 4 + result = StringBuilder(size * unicode_bytes) pos = 0 while pos < size: oc = rutf8.codepoint_at_pos(s, pos) if BYTEORDER == "little": result.append(chr(oc & 0xFF)) result.append(chr(oc >> 8 & 0xFF)) - result.append(chr(oc >> 16 & 0xFF)) - result.append(chr(oc >> 24 & 0xFF)) + if unicode_bytes > 2: + result.append(chr(oc >> 16 & 0xFF)) + result.append(chr(oc >> 24 & 0xFF)) else: - result.append(chr(oc >> 24 & 0xFF)) - result.append(chr(oc >> 16 & 0xFF)) + if unicode_bytes > 2: + result.append(chr(oc >> 24 & 0xFF)) + result.append(chr(oc >> 16 & 0xFF)) result.append(chr(oc >> 8 & 0xFF)) result.append(chr(oc & 0xFF)) pos = rutf8.next_codepoint_pos(s, pos) diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -248,7 +248,7 @@ check_exception(space, w_exc) if space.isinstance_w(w_exc, space.w_UnicodeEncodeError): w_obj = space.getattr(w_exc, space.newtext('object')) - space.realutf8_w(w_obj) # weeoes + space.realutf8_w(w_obj) # for errors w_obj = space.convert_arg_to_w_unicode(w_obj) start = space.int_w(space.getattr(w_exc, space.newtext('start'))) w_end = space.getattr(w_exc, space.newtext('end')) @@ -275,17 +275,22 @@ check_exception(space, w_exc) if (space.isinstance_w(w_exc, space.w_UnicodeEncodeError) or - space.isinstance_w(w_exc, space.w_UnicodeTranslateError)): - obj = space.realunicode_w(space.getattr(w_exc, space.newtext('object'))) + space.isinstance_w(w_exc, space.w_UnicodeTranslateError)): + w_obj = space.getattr(w_exc, space.newtext('object')) + space.realutf8_w(w_obj) # for errors + w_obj = space.convert_arg_to_w_unicode(w_obj) start = space.int_w(space.getattr(w_exc, space.newtext('start'))) w_end = space.getattr(w_exc, space.newtext('end')) end = space.int_w(w_end) + start = w_obj._index_to_byte(start) + end = w_obj._index_to_byte(end) builder = StringBuilder() pos = start + obj = w_obj._utf8 while pos < end: - oc = ord(obj[pos]) - raw_unicode_escape_helper(builder, oc) - pos += 1 + code = rutf8.codepoint_at_pos(obj, pos) + raw_unicode_escape_helper(builder, code) + pos = rutf8.next_codepoint_pos(obj, pos) return space.newtuple([space.newtext(builder.build()), w_end]) elif space.isinstance_w(w_exc, space.w_UnicodeDecodeError): obj = space.bytes_w(space.getattr(w_exc, space.newtext('object'))) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit