Author: Ronan Lamy <ronan.l...@gmail.com> Branch: unicode-utf8-test Changeset: r93324:e6db8eec731a Date: 2017-12-09 02:46 +0000 http://bitbucket.org/pypy/pypy/changeset/e6db8eec731a/
Log: hg merge unicode-utf8 diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py --- a/pypy/interpreter/test/test_unicodehelper.py +++ b/pypy/interpreter/test/test_unicodehelper.py @@ -1,3 +1,4 @@ +import pytest from hypothesis import given, strategies from rpython.rlib import rutf8 @@ -5,6 +6,7 @@ from pypy.interpreter.unicodehelper import str_decode_utf8 from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii from pypy.interpreter import unicodehelper as uh +from pypy.module._codecs.interp_codecs import CodecState def decode_utf8(u): return str_decode_utf8(u, True, "strict", None) @@ -68,3 +70,16 @@ def test_unicode_escape(u): r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None) assert r == u.encode("unicode-escape") + +def test_encode_decimal(space): + assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 ' + with pytest.raises(ValueError): + uh.unicode_encode_decimal(u' 12, \u1234 '.encode('utf8'), None) + state = space.fromcache(CodecState) + handler = state.encode_error_handler + assert uh.unicode_encode_decimal( + u'u\u1234\u1235v'.encode('utf8'), 'replace', handler) == 'u??v' + + result = uh.unicode_encode_decimal( + u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler) + assert result == '12&#4660;' diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -7,6 +7,7 @@ from rpython.rlib.rstring import StringBuilder from rpython.rtyper.lltypesystem import rffi from pypy.module._codecs import interp_codecs +from pypy.module.unicodedata import unicodedb @specialize.memo() def decode_error_handler(space): @@ -35,6 +36,16 @@ space.newtext(msg)])) return raise_unicode_exception_encode +def default_error_encode( + errors, encoding, msg, u, startingpos, endingpos): + """A default handler, for tests""" + assert endingpos >= 0 + if errors == 'replace': + return '?', 
endingpos + if errors == 'ignore': + return '', endingpos + raise ValueError + def convert_arg_to_w_unicode(space, w_arg, strict=None): return space.convert_arg_to_w_unicode(w_arg) @@ -1458,3 +1469,70 @@ pos = rutf8.next_codepoint_pos(s, pos) return result.build() +# ____________________________________________________________ +# Decimal Encoder +def unicode_encode_decimal(s, errors, errorhandler=None): + """Converts whitespace to ' ', decimal characters to their + corresponding ASCII digit and all other Latin-1 characters except + \0 as-is. Characters outside this range (Unicode ordinals 1-256) + are treated as errors. This includes embedded NULL bytes. + """ + if errorhandler is None: + errorhandler = default_error_encode + result = StringBuilder(len(s)) + pos = 0 + i = 0 + it = rutf8.Utf8StringIterator(s) + for ch in it: + if unicodedb.isspace(ch): + result.append(' ') + i += 1 + continue + try: + decimal = unicodedb.decimal(ch) + except KeyError: + pass + else: + result.append(chr(48 + decimal)) + i += 1 + continue + if 0 < ch < 256: + result.append(chr(ch)) + i += 1 + continue + # All other characters are considered unencodable + start_index = i + i += 1 + while not it.done(): + ch = rutf8.codepoint_at_pos(s, it.get_pos()) + try: + if (0 < ch < 256 or unicodedb.isspace(ch) or + unicodedb.decimal(ch) >= 0): + break + except KeyError: + # not a decimal + pass + if it.done(): + break + ch = next(it) + i += 1 + end_index = i + msg = "invalid decimal Unicode string" + r, pos = errorhandler( + errors, 'decimal', msg, s, start_index, end_index) + for ch in rutf8.Utf8StringIterator(r): + if unicodedb.isspace(ch): + result.append(' ') + continue + try: + decimal = unicodedb.decimal(ch) + except KeyError: + pass + else: + result.append(chr(48 + decimal)) + continue + if 0 < ch < 256: + result.append(chr(ch)) + continue + errorhandler('strict', 'decimal', msg, s, start_index, end_index) + return result.build() diff --git a/pypy/module/_pypyjson/interp_decoder.py 
b/pypy/module/_pypyjson/interp_decoder.py --- a/pypy/module/_pypyjson/interp_decoder.py +++ b/pypy/module/_pypyjson/interp_decoder.py @@ -3,6 +3,7 @@ from rpython.rlib.objectmodel import specialize, always_inline, r_dict from rpython.rlib import rfloat, runicode, rutf8 from rpython.rtyper.lltypesystem import lltype, rffi +from rpython.rlib.rarithmetic import r_uint from pypy.interpreter.error import oefmt from pypy.interpreter import unicodehelper @@ -366,7 +367,7 @@ return # help the annotator to know that we'll never go beyond # this point # - utf8_ch = rutf8.unichr_as_utf8(val, allow_surrogates=True) + utf8_ch = rutf8.unichr_as_utf8(r_uint(val), allow_surrogates=True) builder.append(utf8_ch) return i @@ -400,7 +401,7 @@ break elif ch == '\\' or ch < '\x20': self.pos = i-1 - return self.space.unicode_w(self.decode_string_escaped(start)) + return self.decode_string_escaped(start) strhash = intmask((1000003 * strhash) ^ ord(ll_chars[i])) bits |= ord(ch) length = i - start - 1 diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py --- a/pypy/module/_rawffi/alt/type_converter.py +++ b/pypy/module/_rawffi/alt/type_converter.py @@ -128,7 +128,7 @@ intval: lltype.Signed """ self.error(w_ffitype, w_obj) - + def handle_unichar(self, w_ffitype, w_obj, intval): """ intval: lltype.Signed @@ -174,7 +174,7 @@ def handle_struct_rawffi(self, w_ffitype, w_structinstance): """ This method should be killed as soon as we remove support for _rawffi structures - + w_structinstance: W_StructureInstance """ self.error(w_ffitype, w_structinstance) @@ -228,7 +228,7 @@ return space.newbytes(chr(ucharval)) elif w_ffitype.is_unichar(): wcharval = self.get_unichar(w_ffitype) - return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1) + return space.newutf8(rutf8.unichr_as_utf8(r_uint(wcharval)), 1) elif w_ffitype.is_double(): return self._float(w_ffitype) elif w_ffitype.is_singlefloat(): @@ -349,7 +349,7 @@ def get_struct_rawffi(self, w_ffitype, 
w_structdescr): """ This should be killed as soon as we kill support for _rawffi structures - + Return type: lltype.Unsigned (the address of the structure) """ diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py --- a/pypy/module/_rawffi/interp_rawffi.py +++ b/pypy/module/_rawffi/interp_rawffi.py @@ -596,9 +596,9 @@ return space.w_None wcharp_addr = rffi.cast(rffi.CWCHARP, address) if maxlength == -1: - s = rffi.wcharp2utf8(wcharp_addr) + s = rffi.wcharp2unicode(wcharp_addr) else: - s = rffi.wcharpsize2utf8(wcharp_addr, maxlength) + s = rffi.wcharp2unicoden(wcharp_addr, maxlength) return space.newunicode(s) @unwrap_spec(address=r_uint, maxlength=int) diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py --- a/pypy/module/array/interp_array.py +++ b/pypy/module/array/interp_array.py @@ -1,7 +1,7 @@ from rpython.rlib import jit, rgc, rutf8 from rpython.rlib.buffer import RawBuffer from rpython.rlib.objectmodel import keepalive_until_here -from rpython.rlib.rarithmetic import ovfcheck, widen +from rpython.rlib.rarithmetic import ovfcheck, widen, r_uint from rpython.rlib.unroll import unrolling_iterable from rpython.rtyper.annlowlevel import llstr from rpython.rtyper.lltypesystem import lltype, rffi @@ -1013,7 +1013,7 @@ elif mytype.typecode == 'c': return space.newbytes(item) elif mytype.typecode == 'u': - code = ord(item) + code = r_uint(ord(item)) return space.newutf8(rutf8.unichr_as_utf8(code), 1) assert 0, "unreachable" diff --git a/pypy/module/cpyext/longobject.py b/pypy/module/cpyext/longobject.py --- a/pypy/module/cpyext/longobject.py +++ b/pypy/module/cpyext/longobject.py @@ -4,6 +4,7 @@ CONST_STRING, ADDR, CANNOT_FAIL) from pypy.objspace.std.longobject import W_LongObject from pypy.interpreter.error import OperationError +from pypy.interpreter.unicodehelper import wcharpsize2utf8 from pypy.module.cpyext.intobject import PyInt_AsUnsignedLongMask from rpython.rlib.rbigint import rbigint @@ -191,7 
+192,7 @@ string, length gives the number of characters, and base is the radix for the conversion. The radix must be in the range [2, 36]; if it is out of range, ValueError will be raised.""" - w_value = space.newunicode(rffi.wcharpsize2unicode(u, length)) + w_value = space.newutf8(wcharpsize2utf8(space, u, length), length) w_base = space.newint(rffi.cast(lltype.Signed, base)) return space.call_function(space.w_long, w_value, w_base) diff --git a/pypy/module/cpyext/object.py b/pypy/module/cpyext/object.py --- a/pypy/module/cpyext/object.py +++ b/pypy/module/cpyext/object.py @@ -246,7 +246,7 @@ the Python expression unicode(o). Called by the unicode() built-in function.""" if w_obj is None: - return space.newunicode(u"<NULL>") + return space.newutf8("<NULL>", 6) return space.call_function(space.w_unicode, w_obj) @cpython_api([PyObject, PyObject], rffi.INT_real, error=-1) @@ -302,7 +302,7 @@ if opid == Py_EQ: return 1 if opid == Py_NE: - return 0 + return 0 w_res = PyObject_RichCompare(space, w_o1, w_o2, opid_int) return int(space.is_true(w_res)) diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py --- a/pypy/module/cpyext/unicodeobject.py +++ b/pypy/module/cpyext/unicodeobject.py @@ -3,7 +3,9 @@ from rpython.tool.sourcetools import func_renamer from pypy.interpreter.error import OperationError, oefmt -from pypy.interpreter.unicodehelper import wcharpsize2utf8 +from pypy.interpreter.unicodehelper import ( + wcharpsize2utf8, str_decode_utf_16_helper, str_decode_utf_32_helper, + unicode_encode_decimal) from pypy.module.unicodedata import unicodedb from pypy.module.cpyext.api import ( CANNOT_FAIL, Py_ssize_t, build_type_checkers_flags, cpython_api, @@ -568,15 +570,11 @@ else: errors = None - result, length, byteorder = runicode.str_decode_utf_16_helper( - string, size, errors, - True, # final ? false for multiple passes? 
- None, # errorhandler - byteorder) + result, _, length, byteorder = str_decode_utf_16_helper( + string, errors, final=True, errorhandler=None, byteorder=byteorder) if pbyteorder is not None: pbyteorder[0] = rffi.cast(rffi.INT, byteorder) - - return space.newunicode(result) + return space.newutf8(result, length) @cpython_api([CONST_STRING, Py_ssize_t, CONST_STRING, rffi.INTP], PyObject) def PyUnicode_DecodeUTF32(space, s, size, llerrors, pbyteorder): @@ -624,15 +622,11 @@ else: errors = None - result, length, byteorder = runicode.str_decode_utf_32_helper( - string, size, errors, - True, # final ? false for multiple passes? - None, # errorhandler - byteorder) + result, _, length, byteorder = str_decode_utf_32_helper( + string, errors, final=True, errorhandler=None, byteorder=byteorder) if pbyteorder is not None: pbyteorder[0] = rffi.cast(rffi.INT, byteorder) - - return space.newunicode(result) + return space.newutf8(result, length) @cpython_api([rffi.CWCHARP, Py_ssize_t, rffi.CCHARP, CONST_STRING], rffi.INT_real, error=-1) @@ -650,14 +644,13 @@ Returns 0 on success, -1 on failure. """ - u = rffi.wcharpsize2unicode(s, length) + u = rffi.wcharpsize2utf8(s, length) if llerrors: errors = rffi.charp2str(llerrors) else: errors = None state = space.fromcache(CodecState) - result = runicode.unicode_encode_decimal(u, length, errors, - state.encode_error_handler) + result = unicode_encode_decimal(u, errors, state.encode_error_handler) i = len(result) output[i] = '\0' i -= 1 @@ -710,12 +703,17 @@ """Return 1 if substr matches str[start:end] at the given tail end (direction == -1 means to do a prefix match, direction == 1 a suffix match), 0 otherwise. 
Return -1 if an error occurred.""" + space.utf8_w(w_str) # type check + space.utf8_w(w_substr) w_start = space.newint(start) w_end = space.newint(end) if rffi.cast(lltype.Signed, direction) <= 0: - return space.call_method(w_str, "startswith", w_substr, w_start, w_end) + w_result = space.call_method( + w_str, "startswith", w_substr, w_start, w_end) else: - return space.call_method(w_str, "endswith", w_substr, w_start, w_end) + w_result = space.call_method( + w_str, "endswith", w_substr, w_start, w_end) + return space.int_w(w_result) @cpython_api([PyObject, PyObject, Py_ssize_t, Py_ssize_t], Py_ssize_t, error=-1) def PyUnicode_Count(space, w_str, w_substr, start, end): diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py --- a/pypy/module/pyexpat/interp_pyexpat.py +++ b/pypy/module/pyexpat/interp_pyexpat.py @@ -483,7 +483,7 @@ except rutf8.CheckError: from pypy.interpreter import unicodehelper # get the correct error msg - unicodehelper.str_decode_utf8(s, len(s), 'string', True, + unicodehelper.str_decode_utf8(s, 'string', True, unicodehelper.decode_error_handler(space)) assert False, "always raises" else: diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py --- a/pypy/objspace/std/formatting.py +++ b/pypy/objspace/std/formatting.py @@ -3,7 +3,7 @@ from rpython.rlib import jit, rutf8 from rpython.rlib.objectmodel import specialize -from rpython.rlib.rarithmetic import INT_MAX +from rpython.rlib.rarithmetic import INT_MAX, r_uint from rpython.rlib.rfloat import DTSF_ALT, formatd, isnan, isinf from rpython.rlib.rstring import StringBuilder from rpython.rlib.unroll import unrolling_iterable @@ -330,7 +330,7 @@ space = self.space if do_unicode: cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1) - w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1) + w_s = space.newutf8(rutf8.unichr_as_utf8(r_uint(cp)), 1) else: cp = ord(self.fmt[self.fmtpos - 1]) w_s = space.newbytes(chr(cp)) @@ -466,7 +466,7 @@ n = 
space.int_w(w_value) if do_unicode: try: - c = rutf8.unichr_as_utf8(n) + c = rutf8.unichr_as_utf8(r_uint(n)) except ValueError: raise oefmt(space.w_OverflowError, "unicode character code out of range") _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit