Author: Ronan Lamy <ronan.l...@gmail.com> Branch: unicode-utf8 Changeset: r93319:ac75e33e51bb Date: 2017-12-09 01:36 +0000 http://bitbucket.org/pypy/pypy/changeset/ac75e33e51bb/
Log: Add utf8-based replacement for runicode.unicode_encode_decimal() to unicodehelper and fix PyUnicode_EncodeDecimal() diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py --- a/pypy/interpreter/test/test_unicodehelper.py +++ b/pypy/interpreter/test/test_unicodehelper.py @@ -1,3 +1,4 @@ +import pytest from hypothesis import given, strategies from rpython.rlib import rutf8 @@ -5,6 +6,7 @@ from pypy.interpreter.unicodehelper import str_decode_utf8 from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii from pypy.interpreter import unicodehelper as uh +from pypy.module._codecs.interp_codecs import CodecState def decode_utf8(u): return str_decode_utf8(u, True, "strict", None) @@ -68,3 +70,16 @@ def test_unicode_escape(u): r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None) assert r == u.encode("unicode-escape") + +def test_encode_decimal(space): + assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 ' + with pytest.raises(ValueError): + uh.unicode_encode_decimal(u' 12, \u1234 '.encode('utf8'), None) + state = space.fromcache(CodecState) + handler = state.encode_error_handler + assert uh.unicode_encode_decimal( + u'u\u1234\u1235v'.encode('utf8'), 'replace', handler) == 'u??v' + + result = uh.unicode_encode_decimal( + u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler) + assert result == '12&#4660;' diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -7,6 +7,7 @@ from rpython.rlib.rstring import StringBuilder from rpython.rtyper.lltypesystem import rffi from pypy.module._codecs import interp_codecs +from pypy.module.unicodedata import unicodedb @specialize.memo() def decode_error_handler(space): @@ -35,6 +36,16 @@ space.newtext(msg)])) return raise_unicode_exception_encode +def default_error_encode( + errors, encoding, msg, u, startingpos, endingpos): + """A 
default handler, for tests""" + assert endingpos >= 0 + if errors == 'replace': + return '?', endingpos + if errors == 'ignore': + return '', endingpos + raise ValueError + def convert_arg_to_w_unicode(space, w_arg, strict=None): return space.convert_arg_to_w_unicode(w_arg) @@ -1458,3 +1469,70 @@ pos = rutf8.next_codepoint_pos(s, pos) return result.build() +# ____________________________________________________________ +# Decimal Encoder +def unicode_encode_decimal(s, errors, errorhandler=None): + """Converts whitespace to ' ', decimal characters to their + corresponding ASCII digit and all other Latin-1 characters except + \0 as-is. Characters outside this range (Unicode ordinals 1-256) + are treated as errors. This includes embedded NULL bytes. + """ + if errorhandler is None: + errorhandler = default_error_encode + result = StringBuilder(len(s)) + pos = 0 + i = 0 + it = rutf8.Utf8StringIterator(s) + for ch in it: + if unicodedb.isspace(ch): + result.append(' ') + i += 1 + continue + try: + decimal = unicodedb.decimal(ch) + except KeyError: + pass + else: + result.append(chr(48 + decimal)) + i += 1 + continue + if 0 < ch < 256: + result.append(chr(ch)) + i += 1 + continue + # All other characters are considered unencodable + start_index = i + i += 1 + while not it.done(): + ch = rutf8.codepoint_at_pos(s, it.get_pos()) + try: + if (0 < ch < 256 or unicodedb.isspace(ch) or + unicodedb.decimal(ch) >= 0): + break + except KeyError: + # not a decimal + pass + if it.done(): + break + ch = next(it) + i += 1 + end_index = i + msg = "invalid decimal Unicode string" + r, pos = errorhandler( + errors, 'decimal', msg, s, start_index, end_index) + for ch in rutf8.Utf8StringIterator(r): + if unicodedb.isspace(ch): + result.append(' ') + continue + try: + decimal = unicodedb.decimal(ch) + except KeyError: + pass + else: + result.append(chr(48 + decimal)) + continue + if 0 < ch < 256: + result.append(chr(ch)) + continue + errorhandler('strict', 'decimal', msg, s, start_index, 
end_index) + return result.build() diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py --- a/pypy/module/cpyext/unicodeobject.py +++ b/pypy/module/cpyext/unicodeobject.py @@ -4,7 +4,8 @@ from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter.unicodehelper import ( - wcharpsize2utf8, str_decode_utf_16_helper, str_decode_utf_32_helper) + wcharpsize2utf8, str_decode_utf_16_helper, str_decode_utf_32_helper, + unicode_encode_decimal) from pypy.module.unicodedata import unicodedb from pypy.module.cpyext.api import ( CANNOT_FAIL, Py_ssize_t, build_type_checkers_flags, cpython_api, @@ -643,14 +644,13 @@ Returns 0 on success, -1 on failure. """ - u = rffi.wcharpsize2unicode(s, length) + u = rffi.wcharpsize2utf8(s, length) if llerrors: errors = rffi.charp2str(llerrors) else: errors = None state = space.fromcache(CodecState) - result = runicode.unicode_encode_decimal(u, length, errors, - state.encode_error_handler) + result = unicode_encode_decimal(u, errors, state.encode_error_handler) i = len(result) output[i] = '\0' i -= 1 _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit