Author: Armin Rigo <ar...@tunes.org>
Branch: cffi-char16-char32
Changeset: r91504:8bc39f008ba8
Date: 2017-06-04 09:58 +0200
http://bitbucket.org/pypy/pypy/changeset/8bc39f008ba8/
Log: in-progress diff --git a/pypy/module/_cffi_backend/ctypearray.py b/pypy/module/_cffi_backend/ctypearray.py --- a/pypy/module/_cffi_backend/ctypearray.py +++ b/pypy/module/_cffi_backend/ctypearray.py @@ -36,8 +36,7 @@ datasize = self.size # if datasize < 0: - from pypy.module._cffi_backend import misc - w_init, length = misc.get_new_array_length(space, w_init) + w_init, length = self.get_new_array_length(w_init) try: datasize = ovfcheck(length * self.ctitem.size) except OverflowError: @@ -53,6 +52,29 @@ self.convert_from_object(ptr, w_init) return cdata + def get_new_array_length(self, w_value): + space = self.space + if (space.isinstance_w(w_value, space.w_list) or + space.isinstance_w(w_value, space.w_tuple)): + return (w_value, space.int_w(space.len(w_value))) + elif space.isinstance_w(w_value, space.w_bytes): + # from a string, we add the null terminator + s = space.bytes_w(w_value) + return (w_value, len(s) + 1) + elif space.isinstance_w(w_value, space.w_unicode): + from pypy.module._cffi_backend import wchar_helper + u = space.unicode_w(w_value) + if self.ctitem.size == 2: + length = wchar_helper.unicode_size_as_char16(u) + else: + length = wchar_helper.unicode_size_as_char32(u) + return (w_value, length + 1) + else: + explicitlength = space.getindex_w(w_value, space.w_OverflowError) + if explicitlength < 0: + raise oefmt(space.w_ValueError, "negative array length") + return (space.w_None, explicitlength) + def _check_subscript_index(self, w_cdata, i): space = self.space if i < 0: diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py --- a/pypy/module/_cffi_backend/ctypeprim.py +++ b/pypy/module/_cffi_backend/ctypeprim.py @@ -42,12 +42,13 @@ def cast_unicode(self, w_ob): space = self.space s = space.unicode_w(w_ob) - XXXXXXXXXXXXXX - if len(s) != 1: + try: + ordinal = wchar_helper.unicode_to_ordinal(s) + except ValueError: raise oefmt(space.w_TypeError, "cannot cast unicode string of length %d to ctype '%s'", len(s), 
self.name) - return ord(s[0]) + return intmask(ordinal) def cast(self, w_ob): from pypy.module._cffi_backend import ctypeptr diff --git a/pypy/module/_cffi_backend/ctypeptr.py b/pypy/module/_cffi_backend/ctypeptr.py --- a/pypy/module/_cffi_backend/ctypeptr.py +++ b/pypy/module/_cffi_backend/ctypeptr.py @@ -4,9 +4,9 @@ from rpython.rlib import rposix from rpython.rlib.rarithmetic import ovfcheck -from rpython.rtyper.annlowlevel import llstr, llunicode +from rpython.rtyper.annlowlevel import llstr from rpython.rtyper.lltypesystem import lltype, rffi -from rpython.rtyper.lltypesystem.rstr import copy_string_to_raw, copy_unicode_to_raw +from rpython.rtyper.lltypesystem.rstr import copy_string_to_raw from pypy.interpreter.error import OperationError, oefmt, wrap_oserror from pypy.module._cffi_backend import cdataobj, misc, ctypeprim, ctypevoid @@ -88,31 +88,23 @@ if n != self.length: cdata[n] = '\x00' elif isinstance(self.ctitem, ctypeprim.W_CTypePrimitiveUniChar): + from pypy.module._cffi_backend import wchar_helper if not space.isinstance_w(w_ob, space.w_unicode): raise self._convert_error("unicode or list or tuple", w_ob) s = space.unicode_w(w_ob) - XXXXXXXXXXXXXXX - n = len(s) + if self.ctitem.size == 2: + n = wchar_helper.unicode_size_as_char16(s) + else: + n = wchar_helper.unicode_size_as_char32(s) if self.length >= 0 and n > self.length: raise oefmt(space.w_IndexError, "initializer unicode string is too long for '%s' " "(got %d characters)", self.name, n) - - - - + add_final_zero = (n != self.length) if self.ctitem.size == 2: - length = wchar_helper.measure_length_16(ptr, length) + wchar_helper.unicode_to_char16(s, cdata, n, add_final_zero) else: - length = wchar_helper.measure_length_32(ptr, length) - XXXX - - - - unichardata = rffi.cast(rffi.CWCHARP, cdata) - copy_unicode_to_raw(llunicode(s), unichardata, 0, n) - if n != self.length: - unichardata[n] = u'\x00' + wchar_helper.unicode_to_char32(s, cdata, n, add_final_zero) else: raise self._convert_error("list or 
tuple", w_ob) @@ -315,10 +307,18 @@ if (space.isinstance_w(w_init, space.w_list) or space.isinstance_w(w_init, space.w_tuple)): length = space.int_w(space.len(w_init)) - elif space.isinstance_w(w_init, space.w_basestring): + elif space.isinstance_w(w_init, space.w_bytes): # from a string, we add the null terminator - XXXXXXXXXXXXXXX - length = space.int_w(space.len(w_init)) + 1 + s = space.bytes_w(w_init) + length = len(s) + 1 + elif space.isinstance_w(w_init, space.w_unicode): + from pypy.module._cffi_backend import wchar_helper + u = space.unicode_w(w_init) + if self.ctitem.size == 2: + length = wchar_helper.unicode_size_as_char16(u) + else: + length = wchar_helper.unicode_size_as_char32(u) + length += 1 elif self.is_file: result = self.prepare_file(w_init) if result: diff --git a/pypy/module/_cffi_backend/ctypestruct.py b/pypy/module/_cffi_backend/ctypestruct.py --- a/pypy/module/_cffi_backend/ctypestruct.py +++ b/pypy/module/_cffi_backend/ctypestruct.py @@ -244,7 +244,7 @@ ct = self.ctype if isinstance(ct, ctypearray.W_CTypeArray) and ct.length < 0: space = ct.space - w_ob, varsizelength = misc.get_new_array_length(space, w_ob) + w_ob, varsizelength = ct.get_new_array_length(w_ob) if optvarsize != -1: # in this mode, the only purpose of this function is to compute # the real size of the structure from a var-sized C99 array diff --git a/pypy/module/_cffi_backend/misc.py b/pypy/module/_cffi_backend/misc.py --- a/pypy/module/_cffi_backend/misc.py +++ b/pypy/module/_cffi_backend/misc.py @@ -290,22 +290,6 @@ # ____________________________________________________________ -def get_new_array_length(space, w_value): - if (space.isinstance_w(w_value, space.w_list) or - space.isinstance_w(w_value, space.w_tuple)): - return (w_value, space.int_w(space.len(w_value))) - elif space.isinstance_w(w_value, space.w_basestring): - # from a string, we add the null terminator - XXXXXXXXXX - return (w_value, space.int_w(space.len(w_value)) + 1) - else: - explicitlength = 
space.getindex_w(w_value, space.w_OverflowError) - if explicitlength < 0: - raise oefmt(space.w_ValueError, "negative array length") - return (space.w_None, explicitlength) - -# ____________________________________________________________ - @specialize.arg(0) def _raw_memcopy_tp(TPP, source, dest): # in its own function: LONGLONG may make the whole function jit-opaque diff --git a/pypy/module/_cffi_backend/wchar_helper.py b/pypy/module/_cffi_backend/wchar_helper.py --- a/pypy/module/_cffi_backend/wchar_helper.py +++ b/pypy/module/_cffi_backend/wchar_helper.py @@ -1,6 +1,8 @@ from rpython.rlib.objectmodel import specialize from rpython.rlib.rarithmetic import r_uint, r_ulonglong, intmask +from rpython.rtyper.annlowlevel import llunicode from rpython.rtyper.lltypesystem import lltype, rffi +from rpython.rtyper.lltypesystem.rstr import copy_unicode_to_raw SIZE_UNICODE = rffi.sizeof(lltype.UniChar) @@ -18,8 +20,7 @@ unichr(0xDC00 | (ordinal & 0x3FF))) def is_surrogate(u, index): - return (index + 1 < len(u) and - unichr(0xD800) <= u[index + 0] <= unichr(0xDBFF) and + return (unichr(0xD800) <= u[index + 0] <= unichr(0xDBFF) and unichr(0xDC00) <= u[index + 1] <= unichr(0xDFFF)) def as_surrogate(u, index): @@ -42,9 +43,13 @@ ordinal = intmask(rffi.cast(rffi.INT, ordinal)) self.ordinal = ordinal +def _unicode_from_wchar(ptr, length): + return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length) + if SIZE_UNICODE == 2: def unicode_from_char32(ptr, length): + # 'ptr' is a pointer to 'length' 32-bit integers ptr = rffi.cast(rffi.UINTP, ptr) alloc = length for i in range(length): @@ -69,14 +74,13 @@ assert j == len(u) return u''.join(u) - def unicode_from_char16(ptr, length): - return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length) + unicode_from_char16 = _unicode_from_wchar else: - def unicode_from_char32(ptr, length): - return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length) + unicode_from_char32 = _unicode_from_wchar def 
unicode_from_char16(ptr, length): + # 'ptr' is a pointer to 'length' 16-bit integers ptr = rffi.cast(rffi.USHORTP, ptr) u = [u'\x00'] * length i = 0 @@ -113,5 +117,71 @@ return _measure_length(rffi.cast(rffi.UINTP, ptr), maxlen) -def unicode_to_char16(u, ptr): - XXX +def unicode_size_as_char16(u): + result = len(u) + if SIZE_UNICODE == 4: + for i in range(result): + if ord(u[i]) > 0xFFFF: + result += 1 + return result + +def unicode_size_as_char32(u): + result = len(u) + if SIZE_UNICODE == 2 and result > 1: + for i in range(result - 1): + if is_surrogate(u, i): + result -= 1 + return result + + +def _unicode_to_wchar(u, target_ptr, target_length, add_final_zero): + # 'target_ptr' is a raw pointer to 'target_length' wchars; + # we assume here that target_length == len(u). + unichardata = rffi.cast(rffi.CWCHARP, target_ptr) + copy_unicode_to_raw(llunicode(u), unichardata, 0, target_length) + if add_final_zero: + unichardata[target_length] = u'\x00' + + +if SIZE_UNICODE == 2: + def unicode_to_char32(u, target_ptr, target_length, add_final_zero): + # 'target_ptr' is a raw pointer to 'target_length' 32-bit integers; + # we assume here that target_length == unicode_size_as_char32(u). + ptr = rffi.cast(rffi.UINTP, target_ptr) + src_index = 0 + for i in range(target_length): + if i < target_length - 1 and is_surrogate(u, src_index): + ordinal = as_surrogate(u, src_index) + src_index += 2 + else: + ordinal = r_uint(ord(u[src_index])) + src_index += 1 + ptr[i] = rffi.cast(rffi.UINT, ordinal) + if add_final_zero: + ptr[target_length] = rffi.cast(rffi.UINT, 0) + + unicode_to_char16 = _unicode_to_wchar + +else: + unicode_to_char32 = _unicode_to_wchar + + def unicode_to_char16(u, target_ptr, target_length, add_final_zero): + # 'target_ptr' is a raw pointer to 'target_length' 16-bit integers; + # we assume here that target_length == unicode_size_as_char16(u). + ptr = rffi.cast(rffi.USHORTP, target_ptr) + for uc in u: + ordinal = ord(uc) + if ordinal > 0xFFFF: + # NB. 
like CPython, ignore the problem of unicode string + # objects containing characters greater than sys.maxunicode + ordinal -= 0x10000 + ptr[0] = rffi.cast(rffi.USHORT, 0xD800 | (ordinal >> 10)) + ptr[1] = rffi.cast(rffi.USHORT, 0xDC00 | (ordinal & 0x3FF)) + ptr = rffi.ptradd(ptr, 2) + else: + ptr[0] = rffi.cast(rffi.USHORT, ordinal) + ptr = rffi.ptradd(ptr, 1) + assert ptr == ( + rffi.ptradd(rffi.cast(rffi.USHORTP, target_ptr), target_length)) + if add_final_zero: + ptr[0] = rffi.cast(rffi.USHORT, 0) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit