Author: Armin Rigo <ar...@tunes.org> Branch: cffi-char16-char32 Changeset: r91503:d6d714960021 Date: 2017-06-04 07:01 +0200 http://bitbucket.org/pypy/pypy/changeset/d6d714960021/
Log: in-progress diff --git a/pypy/module/_cffi_backend/cffi_opcode.py b/pypy/module/_cffi_backend/cffi_opcode.py --- a/pypy/module/_cffi_backend/cffi_opcode.py +++ b/pypy/module/_cffi_backend/cffi_opcode.py @@ -107,8 +107,10 @@ PRIM_UINTMAX = 47 PRIM_FLOATCOMPLEX = 48 PRIM_DOUBLECOMPLEX = 49 +PRIM_CHAR16 = 50 +PRIM_CHAR32 = 51 -_NUM_PRIM = 50 +_NUM_PRIM = 52 _UNKNOWN_PRIM = -1 _UNKNOWN_FLOAT_PRIM = -2 _UNKNOWN_LONG_DOUBLE = -3 @@ -131,8 +133,12 @@ 'float': PRIM_FLOAT, 'double': PRIM_DOUBLE, 'long double': PRIM_LONGDOUBLE, + 'float _Complex': PRIM_FLOATCOMPLEX, + 'double _Complex': PRIM_DOUBLECOMPLEX, '_Bool': PRIM_BOOL, 'wchar_t': PRIM_WCHAR, + 'char16_t': PRIM_CHAR16, + 'char32_t': PRIM_CHAR32, 'int8_t': PRIM_INT8, 'uint8_t': PRIM_UINT8, 'int16_t': PRIM_INT16, diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py --- a/pypy/module/_cffi_backend/ctypeprim.py +++ b/pypy/module/_cffi_backend/ctypeprim.py @@ -42,6 +42,7 @@ def cast_unicode(self, w_ob): space = self.space s = space.unicode_w(w_ob) + XXXXXXXXXXXXXX if len(s) != 1: raise oefmt(space.w_TypeError, "cannot cast unicode string of length %d to ctype '%s'", @@ -149,15 +150,15 @@ class W_CTypePrimitiveUniChar(W_CTypePrimitiveCharOrUniChar): - _attrs_ = ['is_signed'] - _immutable_fields_ = ['is_signed'] + _attrs_ = ['is_signed_wchar'] + _immutable_fields_ = ['is_signed_wchar'] _wchar_is_signed = rfficache.signof_c_type('wchar_t') def __init__(self, space, size, name, name_position, align): - W_CTypePrimitiveUniChar.__init__(self, space, size, name, - name_position, align) - self.is_signed = self._wchar_is_signed and (name == "wchar_t") + W_CTypePrimitiveCharOrUniChar.__init__(self, space, size, name, + name_position, align) + self.is_signed_wchar = self._wchar_is_signed and (name == "wchar_t") # "char16_t" and "char32_t" are always unsigned def cast_to_int(self, cdata): @@ -185,32 +186,41 @@ w_res = self.convert_to_object(ptr) return w_res - def _convert_to_charN_t(self, w_ob, size): - # returns a r_uint. If size == 2, it is smaller than 0x10000 + def _convert_to_charN_t(self, w_ob): + # returns a r_uint. If self.size == 2, it is smaller than 0x10000 space = self.space if space.isinstance_w(w_ob, space.w_unicode): u = space.unicode_w(w_ob) - if len(u) == 1: - u = ord(u[0]) - if size == 2 and u > 0xffff: + try: + ordinal = wchar_helper.unicode_to_ordinal(u) + except ValueError: + pass + else: + if self.size == 2 and ordinal > 0xffff: raise self._convert_error("single character <= 0xFFFF", w_ob) - return r_uint(u) - elif size == 4 and len(u) == 2 and ... - + return ordinal elif (isinstance(w_ob, cdataobj.W_CData) and isinstance(w_ob.ctype, W_CTypePrimitiveUniChar) and - w_ob.ctype.size == 2): + w_ob.ctype.size == self.size): with w_ob as ptr: - return misc.read_raw_ulong_data(ptr, 2) + return misc.read_raw_ulong_data(ptr, self.size) raise self._convert_error("unicode string of length 1", w_ob) def convert_from_object(self, cdata, w_ob): - ordinal = self._convert_to_char16(w_ob, self.size) + ordinal = self._convert_to_charN_t(w_ob) misc.write_raw_unsigned_data(cdata, ordinal, self.size) def unpack_ptr(self, w_ctypeptr, ptr, length): - u = rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length) + if self.size == 2: + u = wchar_helper.unicode_from_char16(ptr, length) + else: + try: + u = wchar_helper.unicode_from_char32(ptr, length) + except OutOfRange as e: + raise oefmt(self.space.w_ValueError, + "char32_t out of range for " + "conversion to unicode: %s", hex(e.ordinal)) return self.space.newunicode(u) diff --git a/pypy/module/_cffi_backend/ctypeptr.py b/pypy/module/_cffi_backend/ctypeptr.py --- a/pypy/module/_cffi_backend/ctypeptr.py +++ b/pypy/module/_cffi_backend/ctypeptr.py @@ -91,11 +91,24 @@ if not space.isinstance_w(w_ob, space.w_unicode): raise self._convert_error("unicode or list or tuple", w_ob) s = space.unicode_w(w_ob) + XXXXXXXXXXXXXXX n = len(s) if self.length >= 0 and n > self.length: raise oefmt(space.w_IndexError, "initializer unicode string is too long for '%s' " "(got %d characters)", self.name, n) + + + + + if self.ctitem.size == 2: + length = wchar_helper.measure_length_16(ptr, length) + else: + length = wchar_helper.measure_length_32(ptr, length) + XXXX + + + unichardata = rffi.cast(rffi.CWCHARP, cdata) copy_unicode_to_raw(llunicode(s), unichardata, 0, n) if n != self.length: @@ -134,12 +147,12 @@ # # pointer to a wchar_t: builds and returns a unicode if self.is_unichar_ptr_or_array(): - cdata = rffi.cast(rffi.CWCHARP, ptr) - if length < 0: - u = rffi.wcharp2unicode(cdata) + from pypy.module._cffi_backend import wchar_helper + if self.ctitem.size == 2: + length = wchar_helper.measure_length_16(ptr, length) else: - u = rffi.wcharp2unicoden(cdata, length) - return space.newunicode(u) + length = wchar_helper.measure_length_32(ptr, length) + return self.ctitem.unpack_ptr(self, ptr, length) # return W_CType.string(self, cdataobj, maxlen) @@ -304,6 +317,7 @@ length = space.int_w(space.len(w_init)) elif space.isinstance_w(w_init, space.w_basestring): # from a string, we add the null terminator + XXXXXXXXXXXXXXX length = space.int_w(space.len(w_init)) + 1 elif self.is_file: result = self.prepare_file(w_init) diff --git a/pypy/module/_cffi_backend/misc.py b/pypy/module/_cffi_backend/misc.py --- a/pypy/module/_cffi_backend/misc.py +++ b/pypy/module/_cffi_backend/misc.py @@ -296,6 +296,7 @@ return (w_value, space.int_w(space.len(w_value))) elif space.isinstance_w(w_value, space.w_basestring): # from a string, we add the null terminator + XXXXXXXXXX return (w_value, space.int_w(space.len(w_value)) + 1) else: explicitlength = space.getindex_w(w_value, space.w_OverflowError) diff --git a/pypy/module/_cffi_backend/realize_c_type.py b/pypy/module/_cffi_backend/realize_c_type.py --- a/pypy/module/_cffi_backend/realize_c_type.py +++ b/pypy/module/_cffi_backend/realize_c_type.py @@ -73,6 +73,8 @@ "uintmax_t", "float _Complex", "double _Complex", + "char16_t", + "char32_t", ] assert len(NAMES) == cffi_opcode._NUM_PRIM diff --git a/pypy/module/_cffi_backend/src/parse_c_type.c b/pypy/module/_cffi_backend/src/parse_c_type.c --- a/pypy/module/_cffi_backend/src/parse_c_type.c +++ b/pypy/module/_cffi_backend/src/parse_c_type.c @@ -505,6 +505,7 @@ case '1': if (size == 8 && !memcmp(p, "uint16", 6)) return _CFFI_PRIM_UINT16; + if (size == 8 && !memcmp(p, "char16", 6)) return _CFFI_PRIM_CHAR16; break; case '2': @@ -513,6 +514,7 @@ case '3': if (size == 8 && !memcmp(p, "uint32", 6)) return _CFFI_PRIM_UINT32; + if (size == 8 && !memcmp(p, "char32", 6)) return _CFFI_PRIM_CHAR32; break; case '4': diff --git a/pypy/module/_cffi_backend/src/parse_c_type.h b/pypy/module/_cffi_backend/src/parse_c_type.h --- a/pypy/module/_cffi_backend/src/parse_c_type.h +++ b/pypy/module/_cffi_backend/src/parse_c_type.h @@ -80,8 +80,10 @@ #define _CFFI_PRIM_UINTMAX 47 #define _CFFI_PRIM_FLOATCOMPLEX 48 #define _CFFI_PRIM_DOUBLECOMPLEX 49 +#define _CFFI_PRIM_CHAR16 50 +#define _CFFI_PRIM_CHAR32 51 -#define _CFFI__NUM_PRIM 50 +#define _CFFI__NUM_PRIM 52 #define _CFFI__UNKNOWN_PRIM (-1) #define _CFFI__UNKNOWN_FLOAT_PRIM (-2) #define _CFFI__UNKNOWN_LONG_DOUBLE (-3) diff --git a/pypy/module/_cffi_backend/test/_backend_test_c.py b/pypy/module/_cffi_backend/test/_backend_test_c.py --- a/pypy/module/_cffi_backend/test/_backend_test_c.py +++ b/pypy/module/_cffi_backend/test/_backend_test_c.py @@ -1925,7 +1925,11 @@ assert string(a, 8).startswith(b'ABC') # may contain additional garbage def test_string_wchar(): - BWChar = new_primitive_type("wchar_t") + for typename in ["wchar_t", "char16_t", "char32_t"]: + _test_string_wchar_variant(typename) + +def _test_string_wchar_variant(typename): + BWChar = new_primitive_type(typename) assert string(cast(BWChar, 42)) == u+'*' assert string(cast(BWChar, 0x4253)) == u+'\u4253' assert string(cast(BWChar, 0)) == u+'\x00' @@ -2088,6 +2092,10 @@ def test_wchar(): _test_wchar_variant("wchar_t") + if sys.platform.startswith("linux"): + BWChar = new_primitive_type("wchar_t") + assert sizeof(BWChar) == 4 + assert int(cast(BWChar, -1)) == -1 # signed, on linux def test_char16(): BChar16 = new_primitive_type("char16_t") @@ -2231,6 +2239,22 @@ x = cast(BWChar, -1) py.test.raises(ValueError, string, x) +def test_wchar_variants_mix(): + BWChar = new_primitive_type("wchar_t") + BChar16 = new_primitive_type("char16_t") + BChar32 = new_primitive_type("char32_t") + assert int(cast(BChar32, cast(BChar16, -2))) == 0xfffe + assert int(cast(BWChar, cast(BChar16, -2))) == 0xfffe + assert int(cast(BChar16, cast(BChar32, 0x0001f345))) == 0xf345 + assert int(cast(BChar16, cast(BWChar, 0x0001f345))) == 0xf345 + # + BChar16A = new_array_type(new_pointer_type(BChar16), None) + BChar32A = new_array_type(new_pointer_type(BChar32), None) + x = cast(BChar32, 'A') + py.test.raises(TypeError, newp, BChar16A, [x]) + x = cast(BChar16, 'A') + py.test.raises(TypeError, newp, BChar32A, [x]) + def test_keepalive_struct(): # exception to the no-keepalive rule: p=newp(BStructPtr) returns a # pointer owning the memory, and p[0] returns a pointer to the @@ -3457,14 +3481,15 @@ py.test.raises(TypeError, "p[1:5] = u+'XYZT'") py.test.raises(TypeError, "p[1:5] = [1, 2, 3, 4]") # - BUniChar = new_primitive_type("wchar_t") - BArray = new_array_type(new_pointer_type(BUniChar), None) - p = newp(BArray, u+"foobar") - p[2:5] = [u+"*", u+"Z", u+"T"] - p[1:3] = u+"XY" - assert list(p) == [u+"f", u+"X", u+"Y", u+"Z", u+"T", u+"r", u+"\x00"] - py.test.raises(TypeError, "p[1:5] = b'XYZT'") - py.test.raises(TypeError, "p[1:5] = [1, 2, 3, 4]") + for typename in ["wchar_t", "char16_t", "char32_t"]: + BUniChar = new_primitive_type(typename) + BArray = new_array_type(new_pointer_type(BUniChar), None) + p = newp(BArray, u+"foobar") + p[2:5] = [u+"*", u+"Z", u+"T"] + p[1:3] = u+"XY" + assert list(p) == [u+"f", u+"X", u+"Y", u+"Z", u+"T", u+"r", u+"\x00"] + py.test.raises(TypeError, "p[1:5] = b'XYZT'") + py.test.raises(TypeError, "p[1:5] = [1, 2, 3, 4]") def test_void_p_arithmetic(): BVoid = new_void_type() @@ -3777,10 +3802,12 @@ p0 = p assert unpack(p, 10) == b"abc\x00def\x00\x00\x00" assert unpack(p+1, 5) == b"bc\x00de" - BWChar = new_primitive_type("wchar_t") - BArray = new_array_type(new_pointer_type(BWChar), 10) # wchar_t[10] - p = newp(BArray, u"abc\x00def") - assert unpack(p, 10) == u"abc\x00def\x00\x00\x00" + + for typename in ["wchar_t", "char16_t", "char32_t"]: + BWChar = new_primitive_type(typename) + BArray = new_array_type(new_pointer_type(BWChar), 10) # wchar_t[10] + p = newp(BArray, u"abc\x00def") + assert unpack(p, 10) == u"abc\x00def\x00\x00\x00" for typename, samples in [ ("uint8_t", [0, 2**8-1]), diff --git a/pypy/module/_cffi_backend/wchar_helper.py b/pypy/module/_cffi_backend/wchar_helper.py --- a/pypy/module/_cffi_backend/wchar_helper.py +++ b/pypy/module/_cffi_backend/wchar_helper.py @@ -1,7 +1,8 @@ +from rpython.rlib.objectmodel import specialize from rpython.rlib.rarithmetic import r_uint, r_ulonglong, intmask from rpython.rtyper.lltypesystem import lltype, rffi -SIZE_UNICHAR = rffi.sizeof(lltype.UniChar) +SIZE_UNICODE = rffi.sizeof(lltype.UniChar) if SIZE_UNICODE == 4: @@ -15,3 +16,102 @@ ordinal = intmask(ordinal - 0x10000) return (unichr(0xD800 | (ordinal >> 10)) + unichr(0xDC00 | (ordinal & 0x3FF))) + +def is_surrogate(u, index): + return (index + 1 < len(u) and + unichr(0xD800) <= u[index + 0] <= unichr(0xDBFF) and + unichr(0xDC00) <= u[index + 1] <= unichr(0xDFFF)) + +def as_surrogate(u, index): + ordinal = (ord(u[index + 0]) - 0xD800) << 10 + ordinal |= (ord(u[index + 1]) - 0xDC00) + return r_uint(ordinal + 0x10000) + +def unicode_to_ordinal(u): + if len(u) == 1: + u = ord(u[0]) + return r_uint(u) + elif SIZE_UNICODE == 2: + if len(u) == 2 and is_surrogate(u, 0): + return r_uint(as_surrogate(u, 0)) + raise ValueError + + +class OutOfRange(Exception): + def __init__(self, ordinal): + ordinal = intmask(rffi.cast(rffi.INT, ordinal)) + self.ordinal = ordinal + + +if SIZE_UNICODE == 2: + def unicode_from_char32(ptr, length): + ptr = rffi.cast(rffi.UINTP, ptr) + alloc = length + for i in range(length): + if rffi.cast(lltype.Unsigned, ptr[i]) > 0xFFFF: + alloc += 1 + + u = [u'\x00'] * alloc + j = 0 + for i in range(length): + ordinal = rffi.cast(lltype.Unsigned, ptr[i]) + if ordinal > 0xFFFF: + if ordinal > 0x10FFFF: + raise OutOfRange(ordinal) + ordinal = intmask(ordinal - 0x10000) + u[j] = unichr(0xD800 | (ordinal >> 10)) + j += 1 + u[j] = unichr(0xDC00 | (ordinal & 0x3FF)) + j += 1 + else: + u[j] = unichr(intmask(ordinal)) + j += 1 + assert j == len(u) + return u''.join(u) + + def unicode_from_char16(ptr, length): + return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length) + +else: + def unicode_from_char32(ptr, length): + return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length) + + def unicode_from_char16(ptr, length): + ptr = rffi.cast(rffi.USHORTP, ptr) + u = [u'\x00'] * length + i = 0 + j = 0 + while j < length: + ch = intmask(ptr[j]) + j += 1 + if 0xD800 <= ch <= 0xDBFF and j < length: + ch2 = intmask(ptr[j]) + if 0xDC00 <= ch2 <= 0xDFFF: + ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000 + j += 1 + u[i] = unichr(ch) + i += 1 + del u[i:] + return u''.join(u) + + +@specialize.ll() +def _measure_length(ptr, maxlen): + result = 0 + if maxlen < 0: + while intmask(ptr[result]) != 0: + result += 1 + else: + while result < maxlen and intmask(ptr[result]) != 0: + result += 1 + return result + +def measure_length_16(ptr, maxlen=-1): + return _measure_length(rffi.cast(rffi.USHORTP, ptr), maxlen) + +def measure_length_32(ptr, maxlen=-1): + return _measure_length(rffi.cast(rffi.UINTP, ptr), maxlen) + + +def unicode_to_char16(u, ptr): + XXX _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit