Author: Armin Rigo <ar...@tunes.org>
Branch: unicode-utf8
Changeset: r93137:a94b5860dbb3
Date: 2017-11-23 15:40 +0100
http://bitbucket.org/pypy/pypy/changeset/a94b5860dbb3/
Log: Fixes for _cffi_backend diff --git a/pypy/module/_cffi_backend/ctypearray.py b/pypy/module/_cffi_backend/ctypearray.py --- a/pypy/module/_cffi_backend/ctypearray.py +++ b/pypy/module/_cffi_backend/ctypearray.py @@ -64,13 +64,10 @@ elif space.isinstance_w(w_value, space.w_unicode): from pypy.module._cffi_backend import wchar_helper w_u = space.convert_arg_to_w_unicode(w_value) - if self.citem.size == 4: + if self.ctitem.size == 2: + length = wchar_helper.utf8_size_as_char16(w_u._utf8) + else: length = w_u._len() - else: - if not w_u._has_surrogates(): - length = w_u._len() - else: - length = wchar_helper.unicode_size_as_char16(w_u._utf8, w_u._len()) return (w_value, length + 1) else: explicitlength = space.getindex_w(w_value, space.w_OverflowError) diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py --- a/pypy/module/_cffi_backend/ctypeprim.py +++ b/pypy/module/_cffi_backend/ctypeprim.py @@ -40,16 +40,13 @@ return ord(s[0]) def cast_unicode(self, w_ob): - import pdb - pdb.set_trace() space = self.space w_u = space.convert_arg_to_w_unicode(w_ob) if w_u._len() != 1: raise oefmt(space.w_TypeError, "cannot cast unicode string of length %d to ctype '%s'", w_u._len(), self.name) - ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0) - return intmask(ordinal) + return rutf8.codepoint_at_pos(w_u._utf8, 0) def cast(self, w_ob): from pypy.module._cffi_backend import ctypeptr @@ -175,21 +172,19 @@ return self.space.newint(value) # r_uint => 'long' object def convert_to_object(self, cdata): - if self.is_signed_wchar: - code = ord(rffi.cast(rffi.CWCHARP, cdata)[0]) - return self.space.newutf8( - rutf8.unichr_as_utf8(code), 1, - rutf8.get_flag_from_code(code)) - else: - value = misc.read_raw_ulong_data(cdata, self.size) # r_uint - try: - u = wchar_helper.ordinal_to_unicode(value) - except wchar_helper.OutOfRange as e: - raise oefmt(self.space.w_ValueError, - "char32_t out of range for " - "conversion to unicode: %s", hex(e.ordinal)) - 
return self.space.newutf8(rutf8.unichr_as_utf8(ord(u)), 1, - rutf8.get_flag_from_code(ord(u))) + value = misc.read_raw_ulong_data(cdata, self.size) # r_uint + try: + utf8 = rutf8.unichr_as_utf8(value, allow_surrogates=True) + except ValueError: + if self.is_signed_wchar: + s = hex(intmask(value)) + else: + s = hex(value) + raise oefmt(self.space.w_ValueError, + "%s out of range for conversion to unicode: %s", + self.name, s) + flag = rutf8.get_flag_from_code(intmask(value)) + return self.space.newutf8(utf8, 1, flag) def string(self, cdataobj, maxlen): with cdataobj as ptr: @@ -200,7 +195,13 @@ # returns a r_uint. If self.size == 2, it is smaller than 0x10000 space = self.space if space.isinstance_w(w_ob, space.w_unicode): - return rutf8.codepoint_at_pos(space.utf8_w(w_ob), 0) + w_u = space.convert_arg_to_w_unicode(w_ob) + if w_u._len() != 1: + raise self._convert_error("single character", w_ob) + ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0) + if self.size == 2 and ordinal > 0xFFFF: + raise self._convert_error("single character <= 0xFFFF", w_ob) + return r_uint(ordinal) elif (isinstance(w_ob, cdataobj.W_CData) and isinstance(w_ob.ctype, W_CTypePrimitiveUniChar) and w_ob.ctype.size == self.size): @@ -214,15 +215,15 @@ def unpack_ptr(self, w_ctypeptr, ptr, length): if self.size == 2: - u = wchar_helper.unicode_from_char16(ptr, length) + utf8, lgt, flag = wchar_helper.utf8_from_char16(ptr, length) else: try: - u = wchar_helper.unicode_from_char32(ptr, length) + utf8, lgt, flag = wchar_helper.utf8_from_char32(ptr, length) except wchar_helper.OutOfRange as e: raise oefmt(self.space.w_ValueError, - "char32_t out of range for " - "conversion to unicode: %s", hex(e.ordinal)) - return self.space.newunicode(u) + "%s out of range for conversion to unicode: %s", + self.name, hex(e.ordinal)) + return self.space.newutf8(utf8, lgt, flag) class W_CTypePrimitiveSigned(W_CTypePrimitive): diff --git a/pypy/module/_cffi_backend/ctypeptr.py b/pypy/module/_cffi_backend/ctypeptr.py --- 
a/pypy/module/_cffi_backend/ctypeptr.py +++ b/pypy/module/_cffi_backend/ctypeptr.py @@ -92,28 +92,20 @@ if not space.isinstance_w(w_ob, space.w_unicode): raise self._convert_error("unicode or list or tuple", w_ob) w_u = space.convert_arg_to_w_unicode(w_ob) - if self.size == 4: + s = w_u._utf8 + if self.ctitem.size == 2: + n = wchar_helper.utf8_size_as_char16(s) + else: n = w_u._len() - else: - if not w_u._has_surrogates(): - n = w_u._len() - else: - n = wchar_helper.unicode_size_as_char16(w_u._utf8, - w_u._len()) if self.length >= 0 and n > self.length: raise oefmt(space.w_IndexError, "initializer unicode string is too long for '%s' " "(got %d characters)", self.name, n) add_final_zero = (n != self.length) if self.ctitem.size == 2: - try: - wchar_helper.unicode_to_char16(s, cdata, n, add_final_zero) - except wchar_helper.OutOfRange as e: - raise oefmt(self.space.w_ValueError, - "unicode character ouf of range for " - "conversion to char16_t: %s", hex(e.ordinal)) + wchar_helper.utf8_to_char16(s, cdata, n, add_final_zero) else: - wchar_helper.unicode_to_char32(s, cdata, n, add_final_zero) + wchar_helper.utf8_to_char32(s, cdata, n, add_final_zero) else: raise self._convert_error("list or tuple", w_ob) @@ -334,8 +326,7 @@ from pypy.module._cffi_backend import wchar_helper w_u = space.convert_arg_to_w_unicode(w_init) if self.ctitem.size == 2: - length = wchar_helper.unicode_size_as_char16(w_u._utf8, - w_u._len()) + length = wchar_helper.utf8_size_as_char16(w_u._utf8) else: length = w_u._len() length += 1 diff --git a/pypy/module/_cffi_backend/test/test_wchar_helper.py b/pypy/module/_cffi_backend/test/test_wchar_helper.py new file mode 100644 --- /dev/null +++ b/pypy/module/_cffi_backend/test/test_wchar_helper.py @@ -0,0 +1,10 @@ +from hypothesis import given, strategies +from pypy.module._cffi_backend.wchar_helper import utf8_size_as_char16 + + + +@given(strategies.text()) +def test_utf8_size_as_char16(u): + assert type(u) is unicode + length = 
utf8_size_as_char16(''.join(uc.encode('utf8') for uc in u)) + assert length == sum((1 if uc <= u'\uFFFF' else 2) for uc in u) diff --git a/pypy/module/_cffi_backend/wchar_helper.py b/pypy/module/_cffi_backend/wchar_helper.py --- a/pypy/module/_cffi_backend/wchar_helper.py +++ b/pypy/module/_cffi_backend/wchar_helper.py @@ -6,41 +6,6 @@ from rpython.rtyper.lltypesystem import lltype, rffi from rpython.rtyper.lltypesystem.rstr import copy_unicode_to_raw -SIZE_UNICODE = 4 - - -if SIZE_UNICODE == 4: - def ordinal_to_unicode(ordinal): # 'ordinal' is a r_uint - return unichr(intmask(ordinal)) -else: - def ordinal_to_unicode(ordinal): # 'ordinal' is a r_uint - if ordinal <= 0xffff: - return unichr(intmask(ordinal)) - elif ordinal <= 0x10ffff: - ordinal = intmask(ordinal - 0x10000) - return (unichr(0xD800 | (ordinal >> 10)) + - unichr(0xDC00 | (ordinal & 0x3FF))) - else: - raise OutOfRange(ordinal) - -def is_surrogate(u, index): - return (unichr(0xD800) <= u[index + 0] <= unichr(0xDBFF) and - unichr(0xDC00) <= u[index + 1] <= unichr(0xDFFF)) - -def as_surrogate(u, index): - ordinal = (ord(u[index + 0]) - 0xD800) << 10 - ordinal |= (ord(u[index + 1]) - 0xDC00) - return r_uint(ordinal + 0x10000) - -def unicode_to_ordinal(u): - if len(u) == 1: - u = ord(u[0]) - return r_uint(u) - elif SIZE_UNICODE == 2: - if len(u) == 2 and is_surrogate(u, 0): - return r_uint(as_surrogate(u, 0)) - raise ValueError - class OutOfRange(Exception): ordinal = 0 @@ -49,59 +14,41 @@ ordinal = intmask(rffi.cast(rffi.INT, ordinal)) self.ordinal = ordinal -def _unicode_from_wchar(ptr, length): - return rffi.wcharpsize2utf8(rffi.cast(rffi.CWCHARP, ptr), length) +def utf8_from_char32(ptr, length): + # 'ptr' is a pointer to 'length' 32-bit integers + ptr = rffi.cast(rffi.UINTP, ptr) + u = StringBuilder(length) + j = 0 + flag = rutf8.FLAG_ASCII + while j < length: + ch = intmask(ptr[j]) + j += 1 + flag = rutf8.combine_flags(flag, rutf8.get_flag_from_code(ch)) + try: + rutf8.unichr_as_utf8_append(u, ch, 
allow_surrogates=True) + except ValueError: + raise OutOfRange(ch) + return u.build(), length, flag - -if SIZE_UNICODE == 2: - def unicode_from_char32(ptr, length): - # 'ptr' is a pointer to 'length' 32-bit integers - ptr = rffi.cast(rffi.UINTP, ptr) - alloc = length - for i in range(length): - if rffi.cast(lltype.Unsigned, ptr[i]) > 0xFFFF: - alloc += 1 - - u = [u'\x00'] * alloc - j = 0 - for i in range(length): - ordinal = rffi.cast(lltype.Unsigned, ptr[i]) - if ordinal > 0xFFFF: - if ordinal > 0x10FFFF: - raise OutOfRange(ordinal) - ordinal = intmask(ordinal - 0x10000) - u[j] = unichr(0xD800 | (ordinal >> 10)) +def utf8_from_char16(ptr, length): + # 'ptr' is a pointer to 'length' 16-bit integers + ptr = rffi.cast(rffi.USHORTP, ptr) + u = StringBuilder(length) + j = 0 + result_length = length + flag = rutf8.FLAG_ASCII + while j < length: + ch = intmask(ptr[j]) + j += 1 + if 0xD800 <= ch <= 0xDBFF and j < length: + ch2 = intmask(ptr[j]) + if 0xDC00 <= ch2 <= 0xDFFF: + ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000 j += 1 - u[j] = unichr(0xDC00 | (ordinal & 0x3FF)) - j += 1 - else: - u[j] = unichr(intmask(ordinal)) - j += 1 - assert j == len(u) - return u''.join(u) - - unicode_from_char16 = _unicode_from_wchar - -else: - unicode_from_char32 = _unicode_from_wchar - - def unicode_from_char16(ptr, length): - # 'ptr' is a pointer to 'length' 16-bit integers - ptr = rffi.cast(rffi.USHORTP, ptr) - u = StringBuilder(length) - i = 0 - j = 0 - while j < length: - ch = intmask(ptr[j]) - j += 1 - if 0xD800 <= ch <= 0xDBFF and j < length: - ch2 = intmask(ptr[j]) - if 0xDC00 <= ch2 <= 0xDFFF: - ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000 - j += 1 - rutf8.unichr_as_utf8_append(u, ch) - i += 1 - return u.build() + result_length -= 1 + flag = rutf8.combine_flags(flag, rutf8.get_flag_from_code(ch)) + rutf8.unichr_as_utf8_append(u, ch, allow_surrogates=True) + return u.build(), result_length, flag @specialize.ll() @@ -122,65 +69,44 @@ return 
_measure_length(rffi.cast(rffi.UINTP, ptr), maxlen) -def unicode_size_as_char16(u, len): - result = len - i = 0 - while i < len(u): - code = rutf8.codepoint_at_pos(u, i) - if code > 0xFFFF: - result += 1 - i = rutf8.next_codepoint_pos(u, i) +def utf8_size_as_char16(u): + # Counts one per unichar in 'u', or two if they are greater than 0xffff. + TABLE = "\x01\x01\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x01\x01\x01\x02" + result = 0 + for c in u: + result += ord(TABLE[ord(c) >> 4]) return result -def _unicode_to_wchar(u, target_ptr, target_length, add_final_zero): - # 'target_ptr' is a raw pointer to 'target_length' wchars; - # we assume here that target_length == len(u). - unichardata = rffi.cast(rffi.CWCHARP, target_ptr) - copy_unicode_to_raw(llunicode(u), unichardata, 0, target_length) +def utf8_to_char32(utf8, target_ptr, target_length, add_final_zero): + # 'target_ptr' is a raw pointer to 'target_length' 32-bit integers; + # we assume (and check) that target_length == number of unichars in utf8. + unichardata = rffi.cast(rffi.UINTP, target_ptr) + i = 0 + for j in range(target_length): + code = rutf8.codepoint_at_pos(utf8, i) + unichardata[j] = rffi.cast(rffi.UINT, code) + i = rutf8.next_codepoint_pos(utf8, i) + assert i == len(utf8) if add_final_zero: - unichardata[target_length] = u'\x00' + unichardata[target_length] = rffi.cast(rffi.UINT, 0) - -if SIZE_UNICODE == 2: - def unicode_to_char32(u, target_ptr, target_length, add_final_zero): - # 'target_ptr' is a raw pointer to 'target_length' 32-bit integers; - # we assume here that target_length == unicode_size_as_char32(u). 
- ptr = rffi.cast(rffi.UINTP, target_ptr) - src_index = 0 - last_surrogate_pos = len(u) - 2 - for i in range(target_length): - if src_index <= last_surrogate_pos and is_surrogate(u, src_index): - ordinal = as_surrogate(u, src_index) - src_index += 2 - else: - ordinal = r_uint(ord(u[src_index])) - src_index += 1 - ptr[i] = rffi.cast(rffi.UINT, ordinal) - if add_final_zero: - ptr[target_length] = rffi.cast(rffi.UINT, 0) - - unicode_to_char16 = _unicode_to_wchar - -else: - unicode_to_char32 = _unicode_to_wchar - - def unicode_to_char16(u, target_ptr, target_length, add_final_zero): - # 'target_ptr' is a raw pointer to 'target_length' 16-bit integers; - # we assume here that target_length == unicode_size_as_char16(u). - ptr = rffi.cast(rffi.USHORTP, target_ptr) - for uc in u: - ordinal = ord(uc) - if ordinal > 0xFFFF: - if ordinal > 0x10FFFF: - raise OutOfRange(ordinal) - ordinal -= 0x10000 - ptr[0] = rffi.cast(rffi.USHORT, 0xD800 | (ordinal >> 10)) - ptr[1] = rffi.cast(rffi.USHORT, 0xDC00 | (ordinal & 0x3FF)) - ptr = rffi.ptradd(ptr, 2) - else: - ptr[0] = rffi.cast(rffi.USHORT, ordinal) - ptr = rffi.ptradd(ptr, 1) - assert ptr == ( - rffi.ptradd(rffi.cast(rffi.USHORTP, target_ptr), target_length)) - if add_final_zero: - ptr[0] = rffi.cast(rffi.USHORT, 0) +def utf8_to_char16(utf8, target_ptr, target_length, add_final_zero): + # 'target_ptr' is a raw pointer to 'target_length' 16-bit integers; + # we assume (and check) that target_length == utf8_size_as_char16(utf8). 
+ ptr = rffi.cast(rffi.USHORTP, target_ptr) + i = 0 + while i < len(utf8): + ordinal = rutf8.codepoint_at_pos(utf8, i) + if ordinal > 0xFFFF: + ordinal -= 0x10000 + ptr[0] = rffi.cast(rffi.USHORT, 0xD800 | (ordinal >> 10)) + ptr[1] = rffi.cast(rffi.USHORT, 0xDC00 | (ordinal & 0x3FF)) + ptr = rffi.ptradd(ptr, 2) + else: + ptr[0] = rffi.cast(rffi.USHORT, ordinal) + ptr = rffi.ptradd(ptr, 1) + i = rutf8.next_codepoint_pos(utf8, i) + assert ptr == ( + rffi.ptradd(rffi.cast(rffi.USHORTP, target_ptr), target_length)) + if add_final_zero: + ptr[0] = rffi.cast(rffi.USHORT, 0) diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -453,6 +453,7 @@ ))))) def get_flag_from_code(oc): + assert isinstance(oc, int) if oc <= 0x7F: return FLAG_ASCII if 0xD800 <= oc <= 0xDFFF: _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit