Author: fijal Branch: unicode-utf8 Changeset: r93338:93560a4f1a42 Date: 2017-12-09 21:35 +0200 http://bitbucket.org/pypy/pypy/changeset/93560a4f1a42/
Log: fix _rawffi and add a todo item diff --git a/TODO b/TODO --- a/TODO +++ b/TODO @@ -12,3 +12,4 @@ * improve performance of splitlines * fix _pypyjson to not use a wrapped dict when decoding an object +* make sure we review all the places that call ord(unichr) to check for ValueErrors \ No newline at end of file diff --git a/pypy/module/_locale/interp_locale.py b/pypy/module/_locale/interp_locale.py --- a/pypy/module/_locale/interp_locale.py +++ b/pypy/module/_locale/interp_locale.py @@ -133,10 +133,11 @@ rffi.free_charp(s1_c) rffi.free_charp(s2_c) - s1, s2 = space.unicode_w(w_s1), space.unicode_w(w_s2) + s1, l1 = space.utf8_len_w(w_s1) + s2, l2 = space.utf8_len_w(w_s2) - s1_c = rffi.unicode2wcharp(s1) - s2_c = rffi.unicode2wcharp(s2) + s1_c = rffi.utf82wcharp(s1, l1) + s2_c = rffi.utf82wcharp(s2, l2) try: result = _wcscoll(s1_c, s2_c) finally: diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py --- a/pypy/module/_rawffi/alt/type_converter.py +++ b/pypy/module/_rawffi/alt/type_converter.py @@ -227,8 +227,8 @@ ucharval = self.get_char(w_ffitype) return space.newbytes(chr(ucharval)) elif w_ffitype.is_unichar(): - wcharval = self.get_unichar(w_ffitype) - return space.newutf8(rutf8.unichr_as_utf8(r_uint(wcharval)), 1) + wcharval = r_uint(self.get_unichar(w_ffitype)) + return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1) elif w_ffitype.is_double(): return self._float(w_ffitype) elif w_ffitype.is_singlefloat(): diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py --- a/pypy/module/_rawffi/interp_rawffi.py +++ b/pypy/module/_rawffi/interp_rawffi.py @@ -448,7 +448,8 @@ elif c == 'c': return space.newbytes(func(add_arg, argdesc, ll_type)) elif c == 'u': - return space.newunicode(func(add_arg, argdesc, ll_type)) + return space.newutf8(rutf8.unichr_as_utf8( + ord(func(add_arg, argdesc, ll_type))), 1) elif c == 'f' or c == 'd' or c == 'g': return space.newfloat(float(func(add_arg, argdesc, ll_type))) else: @@ -596,10 +597,10 @@ return space.w_None wcharp_addr = rffi.cast(rffi.CWCHARP, address) if maxlength == -1: - s = rffi.wcharp2unicode(wcharp_addr) + s, lgt = rffi.wcharp2utf8(wcharp_addr) else: - s = rffi.wcharp2unicoden(wcharp_addr, maxlength) - return space.newunicode(s) + s, lgt = rffi.wcharp2utf8n(wcharp_addr, maxlength) + return space.newutf8(s, lgt) @unwrap_spec(address=r_uint, maxlength=int) def charp2rawstring(space, address, maxlength=-1): @@ -612,8 +613,8 @@ def wcharp2rawunicode(space, address, maxlength=-1): if maxlength == -1: return wcharp2unicode(space, address) - s = rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, address), maxlength) - return space.newunicode(s) + s = rffi.wcharpsize2utf8(rffi.cast(rffi.CWCHARP, address), maxlength) + return space.newutf8(s, maxlength) @unwrap_spec(address=r_uint, newcontent='bufferstr') def rawstring2charp(space, address, newcontent): diff --git a/rpython/annotator/unaryop.py b/rpython/annotator/unaryop.py --- a/rpython/annotator/unaryop.py +++ b/rpython/annotator/unaryop.py @@ -792,7 +792,7 @@ def ord(self): # warning, on 32-bit with 32-bit unichars, this might return # negative numbers - return SomeInteger() + return SomeInteger(nonneg=True) class __extend__(SomeIterator): diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -19,7 +19,7 @@ from rpython.rlib.objectmodel import enforceargs, we_are_translated, specialize from rpython.rlib.objectmodel import always_inline, dont_inline, try_inline from rpython.rlib.rstring import StringBuilder -from rpython.rlib import jit +from rpython.rlib import jit, types from rpython.rlib.signature import signature from rpython.rlib.types import char, none from rpython.rlib.rarithmetic import r_uint @@ -27,6 +27,8 @@ from rpython.rtyper.lltypesystem import lltype, rffi +# we need a way to accept both r_uint and int(nonneg=True) +#@signature(types.int_nonneg(), types.bool(), returns=types.str()) def unichr_as_utf8(code, allow_surrogates=False): """Encode code (numeric value) as utf8 encoded string """ diff --git a/rpython/rlib/types.py b/rpython/rlib/types.py --- a/rpython/rlib/types.py +++ b/rpython/rlib/types.py @@ -26,6 +26,8 @@ def int(): return model.SomeInteger() +def int_nonneg(): + return model.SomeInteger(nonneg=True) def bool(): return model.SomeBool() diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py --- a/rpython/rtyper/lltypesystem/rffi.py +++ b/rpython/rtyper/lltypesystem/rffi.py @@ -1019,7 +1019,27 @@ s = StringBuilder(size) for i in range(size): rutf8.unichr_as_utf8_append(s, ord(w[i])) - return s.build() + return s.build() + +def wcharp2utf8(w): + from rpython.rlib import rutf8 + + s = rutf8.Utf8StringBuilder() + i = 0 + while ord(w[i]): + s.append_code(ord(w[i])) + i += 1 + return s.build(), i + +def wcharp2utf8n(w, maxlen): + from rpython.rlib import rutf8 + + s = rutf8.Utf8StringBuilder(maxlen) + i = 0 + while i < maxlen and w[i]: + s.append_code(ord(w[i])) + i += 1 + return s.build(), i def utf82wcharp(utf8, utf8len): from rpython.rlib import rutf8 _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit