Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95082:7dd0a62dbf67
Date: 2018-09-05 15:09 +0300
http://bitbucket.org/pypy/pypy/changeset/7dd0a62dbf67/
Log: use unicodehelper methods instead of rffi calls
diff --git a/pypy/module/cpyext/test1/test_unicodeobject.py b/pypy/module/cpyext/test1/test_unicodeobject.py
--- a/pypy/module/cpyext/test1/test_unicodeobject.py
+++ b/pypy/module/cpyext/test1/test_unicodeobject.py
@@ -659,9 +659,9 @@
b_text = rffi.str2charp('caf\x82xx')
b_encoding = rffi.str2charp('cp437')
b_errors = rffi.str2charp('strict')
- assert space.utf8_w(PyUnicode_Decode(
- space, b_text, 4, b_encoding, b_errors)).decode() == u'caf\xe9'
- assert (space.utf8_w(
+ assert space.text_w(PyUnicode_Decode(
+ space, b_text, 4, b_encoding, b_errors)).decode('utf8') == u'caf\xe9'
+ assert (space.text_w(
PyUnicode_Decode(space, b_text, 4, b_encoding, None)) ==
u'caf\xe9'.encode("utf-8"))
@@ -681,7 +681,7 @@
def test_decode_null_encoding(self, space):
null_charp = lltype.nullptr(rffi.CCHARP.TO)
u_text = u'abcdefg'
- s_text = space.text_w(PyUnicode_AsEncodedString(space, space.wrap(u_text), null_charp, null_charp))
+ s_text = space.bytes_w(PyUnicode_AsEncodedString(space, space.wrap(u_text), null_charp, null_charp))
b_text = rffi.str2charp(s_text)
assert (space.utf8_w(PyUnicode_Decode(
space, b_text, len(s_text), null_charp, null_charp)) ==
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -1,13 +1,13 @@
from rpython.rtyper.lltypesystem import rffi, lltype
-from rpython.rlib.runicode import unicode_encode_latin_1, unicode_encode_utf_16_helper
from rpython.rlib.rarithmetic import widen
-from rpython.rlib import rstring, runicode
+from rpython.rlib import rstring, runicode, rutf8
from rpython.tool.sourcetools import func_renamer
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.unicodehelper import (
wcharpsize2utf8, str_decode_utf_16_helper, str_decode_utf_32_helper,
- unicode_encode_decimal)
+ unicode_encode_decimal, utf8_encode_utf_16_helper, BYTEORDER,
+ utf8_encode_utf_32_helper)
from pypy.module.unicodedata import unicodedb
from pypy.module.cpyext.api import (
CANNOT_FAIL, Py_ssize_t, build_type_checkers, cpython_api,
@@ -71,7 +71,7 @@
def unicode_attach(space, py_obj, w_obj, w_userdata=None):
"Fills a newly allocated PyUnicodeObject with a unicode string"
- value = space.utf8_w(w_obj).decode('utf8')
+ value = space.utf8_w(w_obj)
set_wsize(py_obj, len(value))
set_wbuffer(py_obj, lltype.nullptr(rffi.CWCHARP.TO))
_readify(space, py_obj, value)
@@ -271,20 +271,19 @@
assert isinstance(w_obj, unicodeobject.W_UnicodeObject)
py_obj = as_pyobj(space, w_obj)
assert get_kind(py_obj) == WCHAR_KIND
- return _readify(space, py_obj, space.utf8_w(w_obj).decode('utf8'))
+ return _readify(space, py_obj, space.utf8_w(w_obj))
def _readify(space, py_obj, value):
maxchar = 0
- for c in value:
- if ord(c) > maxchar:
- maxchar = ord(c)
+ for c in rutf8.Utf8StringIterator(value):
+ if c > maxchar:
+ maxchar = c
if maxchar > MAX_UNICODE:
raise oefmt(space.w_ValueError,
"Character U+%d is not in range [U+0000; U+10ffff]",
maxchar)
if maxchar < 256:
- ucs1_data = rffi.str2charp(unicode_encode_latin_1(
- value, len(value), errors='strict'))
+ ucs1_data = rffi.str2charp(value)
set_data(py_obj, cts.cast('void*', ucs1_data))
set_kind(py_obj, _1BYTE_KIND)
set_len(py_obj, get_wsize(py_obj))
@@ -298,9 +297,9 @@
set_utf8_len(py_obj, 0)
elif maxchar < 65536:
# XXX: assumes that sizeof(wchar_t) == 4
- ucs2_str = unicode_encode_utf_16_helper(
- value, len(value), errors='strict',
- byteorder=runicode.BYTEORDER)
+ ucs2_str = utf8_encode_utf_16_helper(
+ value, 'strict',
+ byteorder=BYTEORDER)
ucs2_data = cts.cast('Py_UCS2 *', rffi.str2charp(ucs2_str))
set_data(py_obj, cts.cast('void*', ucs2_data))
set_len(py_obj, get_wsize(py_obj))
@@ -309,10 +308,14 @@
set_utf8_len(py_obj, 0)
else:
# XXX: assumes that sizeof(wchar_t) == 4
+ ucs4_str = utf8_encode_utf_32_helper(
+ value, 'strict',
+ byteorder=BYTEORDER)
if not get_wbuffer(py_obj):
# Copy unicode buffer
- set_wbuffer(py_obj, rffi.unicode2wcharp(value))
- set_wsize(py_obj, len(value))
+ wchar = cts.cast('wchar_t*', rffi.str2charp(ucs4_str))
+ set_wbuffer(py_obj, wchar)
+ set_wsize(py_obj, len(ucs4_str) // 4)
ucs4_data = get_wbuffer(py_obj)
set_data(py_obj, cts.cast('void*', ucs4_data))
set_len(py_obj, get_wsize(py_obj))
@@ -493,9 +496,10 @@
the codec."""
if not encoding:
# This tracks CPython 2.7, in CPython 3.4 'utf-8' is hardcoded instead
- encoding = PyUnicode_GetDefaultEncoding(space)
+ w_encoding = space.newtext('utf-8')
+ else:
+ w_encoding = space.newtext(rffi.charp2str(encoding))
w_str = space.newbytes(rffi.charpsize2str(s, size))
- w_encoding = space.newtext(rffi.charp2str(encoding))
if errors:
w_errors = space.newtext(rffi.charp2str(errors))
else:
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit