Author: fijal
Branch: unicode-utf8
Changeset: r93338:93560a4f1a42
Date: 2017-12-09 21:35 +0200
http://bitbucket.org/pypy/pypy/changeset/93560a4f1a42/
Log: fix _rawffi and add a todo item
diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -12,3 +12,4 @@
* improve performance of splitlines
* fix _pypyjson to not use a wrapped dict when decoding an object
+* make sure we review all the places that call ord(unichr) to check for
ValueErrors
\ No newline at end of file
diff --git a/pypy/module/_locale/interp_locale.py
b/pypy/module/_locale/interp_locale.py
--- a/pypy/module/_locale/interp_locale.py
+++ b/pypy/module/_locale/interp_locale.py
@@ -133,10 +133,11 @@
rffi.free_charp(s1_c)
rffi.free_charp(s2_c)
- s1, s2 = space.unicode_w(w_s1), space.unicode_w(w_s2)
+ s1, l1 = space.utf8_len_w(w_s1)
+ s2, l2 = space.utf8_len_w(w_s2)
- s1_c = rffi.unicode2wcharp(s1)
- s2_c = rffi.unicode2wcharp(s2)
+ s1_c = rffi.utf82wcharp(s1, l1)
+ s2_c = rffi.utf82wcharp(s2, l2)
try:
result = _wcscoll(s1_c, s2_c)
finally:
diff --git a/pypy/module/_rawffi/alt/type_converter.py
b/pypy/module/_rawffi/alt/type_converter.py
--- a/pypy/module/_rawffi/alt/type_converter.py
+++ b/pypy/module/_rawffi/alt/type_converter.py
@@ -227,8 +227,8 @@
ucharval = self.get_char(w_ffitype)
return space.newbytes(chr(ucharval))
elif w_ffitype.is_unichar():
- wcharval = self.get_unichar(w_ffitype)
- return space.newutf8(rutf8.unichr_as_utf8(r_uint(wcharval)), 1)
+ wcharval = r_uint(self.get_unichar(w_ffitype))
+ return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1)
elif w_ffitype.is_double():
return self._float(w_ffitype)
elif w_ffitype.is_singlefloat():
diff --git a/pypy/module/_rawffi/interp_rawffi.py
b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -448,7 +448,8 @@
elif c == 'c':
return space.newbytes(func(add_arg, argdesc, ll_type))
elif c == 'u':
- return space.newunicode(func(add_arg, argdesc, ll_type))
+ return space.newutf8(rutf8.unichr_as_utf8(
+ ord(func(add_arg, argdesc, ll_type))), 1)
elif c == 'f' or c == 'd' or c == 'g':
return space.newfloat(float(func(add_arg, argdesc, ll_type)))
else:
@@ -596,10 +597,10 @@
return space.w_None
wcharp_addr = rffi.cast(rffi.CWCHARP, address)
if maxlength == -1:
- s = rffi.wcharp2unicode(wcharp_addr)
+ s, lgt = rffi.wcharp2utf8(wcharp_addr)
else:
- s = rffi.wcharp2unicoden(wcharp_addr, maxlength)
- return space.newunicode(s)
+ s, lgt = rffi.wcharp2utf8n(wcharp_addr, maxlength)
+ return space.newutf8(s, lgt)
@unwrap_spec(address=r_uint, maxlength=int)
def charp2rawstring(space, address, maxlength=-1):
@@ -612,8 +613,8 @@
def wcharp2rawunicode(space, address, maxlength=-1):
if maxlength == -1:
return wcharp2unicode(space, address)
- s = rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, address), maxlength)
- return space.newunicode(s)
+ s = rffi.wcharpsize2utf8(rffi.cast(rffi.CWCHARP, address), maxlength)
+ return space.newutf8(s, maxlength)
@unwrap_spec(address=r_uint, newcontent='bufferstr')
def rawstring2charp(space, address, newcontent):
diff --git a/rpython/annotator/unaryop.py b/rpython/annotator/unaryop.py
--- a/rpython/annotator/unaryop.py
+++ b/rpython/annotator/unaryop.py
@@ -792,7 +792,7 @@
def ord(self):
# warning, on 32-bit with 32-bit unichars, this might return
# negative numbers
- return SomeInteger()
+ return SomeInteger(nonneg=True)
class __extend__(SomeIterator):
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -19,7 +19,7 @@
from rpython.rlib.objectmodel import enforceargs, we_are_translated, specialize
from rpython.rlib.objectmodel import always_inline, dont_inline, try_inline
from rpython.rlib.rstring import StringBuilder
-from rpython.rlib import jit
+from rpython.rlib import jit, types
from rpython.rlib.signature import signature
from rpython.rlib.types import char, none
from rpython.rlib.rarithmetic import r_uint
@@ -27,6 +27,8 @@
from rpython.rtyper.lltypesystem import lltype, rffi
+# we need a way to accept both r_uint and int(nonneg=True)
+#@signature(types.int_nonneg(), types.bool(), returns=types.str())
def unichr_as_utf8(code, allow_surrogates=False):
"""Encode code (numeric value) as utf8 encoded string
"""
diff --git a/rpython/rlib/types.py b/rpython/rlib/types.py
--- a/rpython/rlib/types.py
+++ b/rpython/rlib/types.py
@@ -26,6 +26,8 @@
def int():
return model.SomeInteger()
+def int_nonneg():
+ """Return a SomeInteger annotation constructed with nonneg=True."""
+ return model.SomeInteger(nonneg=True)
def bool():
return model.SomeBool()
diff --git a/rpython/rtyper/lltypesystem/rffi.py
b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1019,7 +1019,27 @@
s = StringBuilder(size)
for i in range(size):
rutf8.unichr_as_utf8_append(s, ord(w[i]))
- return s.build()
+ return s.build()
+
+def wcharp2utf8(w):
+ """Build a string from the elements of `w` up to the first zero element.
+
+ `w` is indexed element by element (callers in this changeset pass an
+ rffi.CWCHARP). Each element's ordinal is appended to a
+ rutf8.Utf8StringBuilder via append_code().
+
+ Returns a tuple (built_string, count) where count is the number of
+ elements appended, i.e. the index of the terminating zero element.
+ """
+ from rpython.rlib import rutf8
+
+ s = rutf8.Utf8StringBuilder()
+ i = 0
+ # stop at the first element whose ordinal is 0; there is no length bound
+ while ord(w[i]):
+ s.append_code(ord(w[i]))
+ i += 1
+ return s.build(), i
+
+def wcharp2utf8n(w, maxlen):
+ """Bounded variant of wcharp2utf8: read at most `maxlen` elements of `w`.
+
+ Elements are appended to a rutf8.Utf8StringBuilder (pre-sized with
+ `maxlen`) until either `maxlen` elements have been consumed or an
+ element whose ordinal is 0 is reached, whichever comes first.
+
+ Returns a tuple (built_string, count) where count is the number of
+ elements appended.
+ """
+ from rpython.rlib import rutf8
+
+ s = rutf8.Utf8StringBuilder(maxlen)
+ i = 0
+ # Test the ordinal, exactly as wcharp2utf8 does. Testing `w[i]` itself
+ # is wrong: a one-character unicode value is truthy even when it is
+ # u'\x00', so the loop would run past the terminator up to maxlen.
+ while i < maxlen and ord(w[i]):
+ s.append_code(ord(w[i]))
+ i += 1
+ return s.build(), i
def utf82wcharp(utf8, utf8len):
from rpython.rlib import rutf8
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit