Author: Tyler Wade <[email protected]>
Branch: utf8-unicode2
Changeset: r72447:cc1160f9014e
Date: 2014-07-13 07:34 -0500
http://bitbucket.org/pypy/pypy/changeset/cc1160f9014e/
Log: Fix _rawffi module
diff --git a/pypy/interpreter/test/test_utf8.py
b/pypy/interpreter/test/test_utf8.py
--- a/pypy/interpreter/test/test_utf8.py
+++ b/pypy/interpreter/test/test_utf8.py
@@ -51,7 +51,7 @@
if sys.maxunicode < 65536:
assert l[:3] == [u'A', u'\u010F', u'\u20AC']
else:
- assert l == [u'A', u'\u010F', u'\u20AC', u'\U00001F63D']
+ assert l == [u'A', u'\u010F', u'\u20AC', u'\U0001F63D']
def test_reverse_iterator():
s = build_utf8str()
@@ -197,7 +197,7 @@
def test_copy_to_wcharp():
s = build_utf8str()
- if sys.maxunicode < 0x10000:
+ if sys.maxunicode < 0x10000 and rffi.sizeof(rffi.WCHAR_T) == 4:
# The last character requires a surrogate pair on narrow builds and
# so won't be converted correctly by rffi.wcharp2unicode
s = s[:-1]
@@ -206,3 +206,27 @@
u = rffi.wcharp2unicode(wcharp)
rffi.free_wcharp(wcharp)
assert s == u
+
+def test_from_wcharp():
+    def check(u):
+        wcharp = rffi.unicode2wcharp(u)
+        s = Utf8Str.from_wcharp(wcharp)
+        rffi.free_wcharp(wcharp)
+        assert s == u
+    check(u'A\u010F\u20AC\U0001F63D')
+    check(u'\uDCC0 ')  # lone low surrogate followed by an ordinary character
+    check(u'\uDCC0')   # lone low surrogate as the last unit before the NUL
+
+def test_from_wcharpn():
+    text = u'A\u010F\u20AC\U0001F63D'
+    raw = rffi.unicode2wcharp(text)
+    # The first three characters are all in the BMP: one unit each.
+    assert Utf8Str.from_wcharpn(raw, 3) == text[:3]
+
+    decoded = Utf8Str.from_wcharpn(raw, 4)
+    if sys.maxunicode != 0xFFFF:
+        assert decoded == text
+    else:
+        assert decoded == text[:4]
+
+    rffi.free_wcharp(raw)
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -4,6 +4,14 @@
from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
from rpython.rlib.rarithmetic import r_uint
from rpython.rtyper.lltypesystem import rffi
+from rpython.rtyper.lltypesystem import lltype
+
+wchar_rint = rffi.r_uint
+WCHAR_INTP = rffi.UINTP
+if rffi.sizeof(rffi.WCHAR_T) == 2:
+ wchar_rint = rffi.r_ushort
+ WCHAR_INTP = rffi.USHORTP
+
def utf8chr(value):
# Like unichr, but returns a Utf8Str object
@@ -415,15 +423,89 @@
byte_pos -= 1
return byte_pos
- def copy_to_wcharp(self):
- # XXX Temporary solution. This won't work on correctly on systems
- # where sizeof(wchar_t) == 2. Also, it copies twice.
- from pypy.interpreter.utf8_codecs import
unicode_encode_unicode_internal
- from rpython.rlib.runicode import MAXUNICODE
- bytes = unicode_encode_unicode_internal(self, len(self), 'strict')
- return rffi.cast(rffi.CWCHARP, rffi.str2charp(bytes))
+    def copy_to_wcharp(self, track_allocation=True):
+        """Return a new raw, NUL-terminated wchar_t copy of this string."""
+        from pypy.interpreter.utf8_codecs import create_surrogate_pair
+        two_byte_wchar = rffi.sizeof(rffi.WCHAR_T) == 2
+        # +1 for the terminator; 2-byte wchar_t needs two slots above 0xFFFF
+        n_units = len(self) + 1
+        if two_byte_wchar:
+            for codepoint in self.codepoint_iter():
+                if codepoint > 0xFFFF:
+                    n_units += 1
+        buf = lltype.malloc(WCHAR_INTP.TO, n_units, flavor='raw',
+                            track_allocation=track_allocation)
+        pos = 0
+        for codepoint in self.codepoint_iter():
+            if not two_byte_wchar:
+                buf[pos] = wchar_rint(codepoint)
+            else:
+                first, second = create_surrogate_pair(codepoint)
+                buf[pos] = wchar_rint(first)
+                if second:
+                    pos += 1
+                    buf[pos] = wchar_rint(second)
+            pos += 1
+        buf[pos] = wchar_rint(0)
+        return rffi.cast(rffi.CWCHARP, buf)
+
+    @staticmethod
+    def from_wcharp(wcharp):
+        """Build a Utf8Str from a NUL-terminated wchar_t buffer.
+
+        With a 2-byte wchar_t the buffer is read as UTF-16: a high surrogate
+        directly followed by a low surrogate becomes one codepoint, and any
+        unpaired surrogate is kept as a codepoint of its own.
+        """
+        array = rffi.cast(WCHAR_INTP, wcharp)
+        builder = Utf8Builder()
+        i = 0
+        while True:
+            c = int(array[i])
+            if c == 0:
+                break
+            i += 1
+            if rffi.sizeof(rffi.WCHAR_T) == 2 and 0xD800 <= c <= 0xDBFF:
+                # Peek at the next unit and consume it only if it completes
+                # the pair; otherwise the next pass decodes it from scratch,
+                # so a high surrogate that follows an unpaired one can still
+                # be joined with its own low surrogate.
+                c2 = int(array[i])
+                if 0xDC00 <= c2 <= 0xDFFF:
+                    c = (((c & 0x3FF) << 10) | (c2 & 0x3FF)) + 0x10000
+                    i += 1
+            builder.append(c)
+        return builder.build()
+
+    @staticmethod
+    def from_wcharpn(wcharp, size):
+        """Build a Utf8Str from at most `size` wchar_t units of a buffer.
+
+        Decoding stops early at a NUL unit.  With a 2-byte wchar_t the units
+        are read as UTF-16; a high surrogate in the last permitted unit is
+        kept unpaired rather than reading past `size`.
+        """
+        array = rffi.cast(WCHAR_INTP, wcharp)
+        builder = Utf8Builder()
+        i = 0
+        while i < size:
+            c = int(array[i])
+            if c == 0:
+                break
+            i += 1
+            if (rffi.sizeof(rffi.WCHAR_T) == 2 and i < size and
+                    0xD800 <= c <= 0xDBFF):
+                # Consume the next unit only if it completes the pair;
+                # otherwise the next pass decodes it from scratch, so no
+                # high surrogate loses its chance to be paired.
+                c2 = int(array[i])
+                if 0xDC00 <= c2 <= 0xDFFF:
+                    c = (((c & 0x3FF) << 10) | (c2 & 0x3FF)) + 0x10000
+                    i += 1
+            builder.append(c)
+        return builder.build()
class Utf8Builder(object):
@specialize.argtype(1)
diff --git a/pypy/interpreter/utf8_codecs.py b/pypy/interpreter/utf8_codecs.py
--- a/pypy/interpreter/utf8_codecs.py
+++ b/pypy/interpreter/utf8_codecs.py
@@ -784,6 +784,13 @@
result.append(r)
return result.build(), pos, bo
+def create_surrogate_pair(val):
+    """Return (high, low) UTF-16 units for val; low is 0 inside the BMP."""
+    if val < 0x10000:
+        return val, 0
+    offset = val - 0x10000
+    return 0xD800 | (offset >> 10), 0xDC00 | (offset & 0x3FF)
+
def unicode_encode_utf_16_helper(s, size, errors,
errorhandler=None,
byteorder='little'):
@@ -803,10 +810,7 @@
while i < size:
ch = utf8ord(s, i)
i += 1
- ch2 = 0
- if ch >= 0x10000:
- ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF)
- ch = 0xD800 | ((ch-0x10000) >> 10)
+ ch, ch2 = create_surrogate_pair(ch)
_STORECHAR(result, ch, byteorder)
if ch2:
diff --git a/pypy/module/_rawffi/alt/interp_funcptr.py
b/pypy/module/_rawffi/alt/interp_funcptr.py
--- a/pypy/module/_rawffi/alt/interp_funcptr.py
+++ b/pypy/module/_rawffi/alt/interp_funcptr.py
@@ -168,7 +168,7 @@
self.argchain.arg(addr)
def handle_unichar_p(self, w_ffitype, w_obj, unicodeval):
- buf = rffi.unicode2wcharp(unicodeval)
+ buf = unicodeval.copy_to_wcharp()
self.w_func.to_free.append(rffi.cast(rffi.VOIDP, buf))
addr = rffi.cast(rffi.ULONG, buf)
self.argchain.arg(addr)
diff --git a/pypy/module/_rawffi/alt/test/test_type_converter.py
b/pypy/module/_rawffi/alt/test/test_type_converter.py
--- a/pypy/module/_rawffi/alt/test/test_type_converter.py
+++ b/pypy/module/_rawffi/alt/test/test_type_converter.py
@@ -1,6 +1,7 @@
import sys
from rpython.rlib.rarithmetic import r_uint, r_singlefloat, r_longlong,
r_ulonglong
from rpython.rlib.libffi import IS_32_BIT
+from pypy.interpreter.utf8 import Utf8Str
from pypy.module._rawffi.alt.interp_ffitype import app_types, descr_new_pointer
from pypy.module._rawffi.alt.type_converter import FromAppLevelConverter,
ToAppLevelConverter
@@ -58,7 +59,8 @@
def test_char(self):
space = self.space
self.check(app_types.char, space.wrap('a'), ord('a'))
- self.check(app_types.unichar, space.wrap(u'\u1234'), 0x1234)
+ self.check(app_types.unichar,
+ space.wrap(Utf8Str.from_unicode(u'\u1234')), 0x1234)
def test_signed_longlong(self):
space = self.space
@@ -120,8 +122,11 @@
def test_strings(self):
# first, try automatic conversion from applevel
self.check(app_types.char_p, self.space.wrap('foo'), 'foo')
- self.check(app_types.unichar_p, self.space.wrap(u'foo\u1234'),
u'foo\u1234')
- self.check(app_types.unichar_p, self.space.wrap('foo'), u'foo')
+ self.check(app_types.unichar_p,
+ self.space.wrap(Utf8Str.from_unicode(u'foo\u1234')),
+ Utf8Str.from_unicode(u'foo\u1234'))
+ self.check(app_types.unichar_p, self.space.wrap('foo'),
+ Utf8Str.from_unicode(u'foo'))
# then, try to pass explicit pointers
self.check(app_types.char_p, self.space.wrap(42), 42)
self.check(app_types.unichar_p, self.space.wrap(42), 42)
diff --git a/pypy/module/_rawffi/alt/type_converter.py
b/pypy/module/_rawffi/alt/type_converter.py
--- a/pypy/module/_rawffi/alt/type_converter.py
+++ b/pypy/module/_rawffi/alt/type_converter.py
@@ -2,6 +2,7 @@
from rpython.rlib import jit
from rpython.rlib.rarithmetic import r_uint
from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.utf8 import utf8chr
from pypy.module._rawffi.structure import W_StructureInstance, W_Structure
from pypy.module._rawffi.alt.interp_ffitype import app_types
@@ -228,7 +229,7 @@
return space.wrap(chr(ucharval))
elif w_ffitype.is_unichar():
wcharval = self.get_unichar(w_ffitype)
- return space.wrap(unichr(wcharval))
+ return space.wrap(utf8chr(int(wcharval)))
elif w_ffitype.is_double():
return self._float(w_ffitype)
elif w_ffitype.is_singlefloat():
diff --git a/pypy/module/_rawffi/array.py b/pypy/module/_rawffi/array.py
--- a/pypy/module/_rawffi/array.py
+++ b/pypy/module/_rawffi/array.py
@@ -42,14 +42,27 @@
if not space.is_none(w_items):
items_w = space.unpackiterable(w_items)
iterlength = len(items_w)
- if iterlength > length:
+
+ double_length_items = 0
+ if rffi.sizeof(rffi.WCHAR_T) == 2:
+     # On systems where sizeof(wchar_t) == 2, the resulting array
+     # needs to be encoded in utf-16. As a result, codepoints larger
+     # than 0xFFFF will occupy two array values
+     for w_i in items_w:
+         if space.isinstance_w(w_i, space.w_unicode):
+             u = space.unicode_w(w_i)
+             if len(u) == 1 and utf8ord(u) > 0xFFFF:
+                 double_length_items += 1
+
+ if iterlength + double_length_items > length:
raise OperationError(space.w_ValueError,
space.wrap("too many items for specified"
" array length"))
- for num in range(iterlength):
- w_item = items_w[num]
- unwrap_value(space, write_ptr, result.ll_buffer, num,
- self.itemcode, w_item)
+ i = 0
+ for w_item in items_w:
+ i += unwrap_value(space, write_ptr, result.ll_buffer, i,
+ self.itemcode, w_item)
+
return space.wrap(result)
def descr_repr(self, space):
diff --git a/pypy/module/_rawffi/interp_rawffi.py
b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -2,6 +2,9 @@
from pypy.interpreter.error import OperationError, oefmt, wrap_oserror
from pypy.interpreter.gateway import interp2app, unwrap_spec
from pypy.interpreter.typedef import TypeDef, GetSetProperty
+from pypy.interpreter.utf8 import (
+ Utf8Str, utf8ord, utf8chr, WCHAR_INTP, wchar_rint)
+from pypy.interpreter.utf8_codecs import create_surrogate_pair
from rpython.rlib.clibffi import *
from rpython.rtyper.lltypesystem import lltype, rffi
@@ -85,6 +88,7 @@
LL_TYPEMAP['X'] = rffi.CCHARP
LL_TYPEMAP['v'] = rffi.SHORT
+
def letter2tp(space, key):
from pypy.module._rawffi.array import PRIMITIVE_ARRAY_TYPES
try:
@@ -269,6 +273,8 @@
ptr_val = t_array[0]
return ptr_val
else:
+ if T is rffi.CWCHARP:
+ return utf8chr(int(rffi.cast(WCHAR_INTP, ptr)[ofs]))
return rffi.cast(T, ptr)[ofs]
read_ptr._annspecialcase_ = 'specialize:arg(2)'
@@ -382,14 +388,18 @@
else:
ptr = unwrap_truncate_int(rffi.VOIDP, space, w_arg)
push_func(add_arg, argdesc, ptr)
+ return 1
elif letter == "d":
push_func(add_arg, argdesc, space.float_w(w_arg))
+ return 1
elif letter == "f":
push_func(add_arg, argdesc, rffi.cast(rffi.FLOAT,
space.float_w(w_arg)))
+ return 1
elif letter == "g":
push_func(add_arg, argdesc, rffi.cast(rffi.LONGDOUBLE,
space.float_w(w_arg)))
+ return 1
elif letter == "c":
s = space.str_w(w_arg)
if len(s) != 1:
@@ -397,20 +407,31 @@
"Expected string of length one as character"))
val = s[0]
push_func(add_arg, argdesc, val)
+ return 1
elif letter == 'u':
s = space.unicode_w(w_arg)
if len(s) != 1:
raise OperationError(space.w_TypeError, w(
"Expected unicode string of length one as wide character"))
- val = s[0]
- push_func(add_arg, argdesc, val)
+
+ val = utf8ord(s)
+ if rffi.sizeof(rffi.WCHAR_T) == 2 and val > 0xFFFF:
+ # Utf-16 must be used on systems with a 2 byte wchar_t to
+ # encode codepoints > 0xFFFF
+ c1, c2 = create_surrogate_pair(val)
+ push_func(add_arg, argdesc, wchar_rint(c1))
+ push_func(add_arg, argdesc+1, wchar_rint(c2))
+ return 2
+ else:
+ push_func(add_arg, argdesc, wchar_rint(val))
+ return 1
else:
for c in unroll_letters_for_numbers:
if letter == c:
TP = LL_TYPEMAP[c]
val = unwrap_truncate_int(TP, space, w_arg)
push_func(add_arg, argdesc, val)
- return
+ return 1
else:
raise OperationError(space.w_TypeError,
space.wrap("cannot directly write value"))
@@ -559,9 +580,9 @@
return space.w_None
wcharp_addr = rffi.cast(rffi.CWCHARP, address)
if maxlength == -1:
- s = rffi.wcharp2unicode(wcharp_addr)
+ s = Utf8Str.from_wcharp(wcharp_addr)
else:
- s = rffi.wcharp2unicoden(wcharp_addr, maxlength)
+ s = Utf8Str.from_wcharpn(wcharp_addr, maxlength)
return space.wrap(s)
@unwrap_spec(address=r_uint, maxlength=int)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit