[pypy-commit] pypy unicode-utf8-py3: use unicodehelper methods instead of rffi calls

mattip Wed, 05 Sep 2018 05:11:38 -0700

Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95082:7dd0a62dbf67
Date: 2018-09-05 15:09 +0300
http://bitbucket.org/pypy/pypy/changeset/7dd0a62dbf67/


Log:    use unicodehelper methods instead of rffi calls

diff --git a/pypy/module/cpyext/test1/test_unicodeobject.py b/pypy/module/cpyext/test1/test_unicodeobject.py
--- a/pypy/module/cpyext/test1/test_unicodeobject.py
+++ b/pypy/module/cpyext/test1/test_unicodeobject.py
@@ -659,9 +659,9 @@
         b_text = rffi.str2charp('caf\x82xx')
         b_encoding = rffi.str2charp('cp437')
         b_errors = rffi.str2charp('strict')
-        assert space.utf8_w(PyUnicode_Decode(
-            space, b_text, 4, b_encoding, b_errors)).decode() == u'caf\xe9'
-        assert (space.utf8_w(
+        assert space.text_w(PyUnicode_Decode(
+            space, b_text, 4, b_encoding, b_errors)).decode('utf8') == u'caf\xe9'
+        assert (space.text_w(
             PyUnicode_Decode(space, b_text, 4, b_encoding, None)) ==
             u'caf\xe9'.encode("utf-8"))
 
@@ -681,7 +681,7 @@
     def test_decode_null_encoding(self, space):
         null_charp = lltype.nullptr(rffi.CCHARP.TO)
         u_text = u'abcdefg'
-        s_text = space.text_w(PyUnicode_AsEncodedString(space, space.wrap(u_text), null_charp, null_charp))
+        s_text = space.bytes_w(PyUnicode_AsEncodedString(space, space.wrap(u_text), null_charp, null_charp))
         b_text = rffi.str2charp(s_text)
         assert (space.utf8_w(PyUnicode_Decode(
             space, b_text, len(s_text), null_charp, null_charp)) ==
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -1,13 +1,13 @@
 from rpython.rtyper.lltypesystem import rffi, lltype
-from rpython.rlib.runicode import unicode_encode_latin_1, unicode_encode_utf_16_helper
 from rpython.rlib.rarithmetic import widen
-from rpython.rlib import rstring, runicode
+from rpython.rlib import rstring, runicode, rutf8
 from rpython.tool.sourcetools import func_renamer
 
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.unicodehelper import (
     wcharpsize2utf8, str_decode_utf_16_helper, str_decode_utf_32_helper,
-    unicode_encode_decimal)
+    unicode_encode_decimal, utf8_encode_utf_16_helper, BYTEORDER,
+    utf8_encode_utf_32_helper)
 from pypy.module.unicodedata import unicodedb
 from pypy.module.cpyext.api import (
     CANNOT_FAIL, Py_ssize_t, build_type_checkers, cpython_api,
@@ -71,7 +71,7 @@
 
 def unicode_attach(space, py_obj, w_obj, w_userdata=None):
     "Fills a newly allocated PyUnicodeObject with a unicode string"
-    value = space.utf8_w(w_obj).decode('utf8')
+    value = space.utf8_w(w_obj)
     set_wsize(py_obj, len(value))
     set_wbuffer(py_obj, lltype.nullptr(rffi.CWCHARP.TO))
     _readify(space, py_obj, value)
@@ -271,20 +271,19 @@
     assert isinstance(w_obj, unicodeobject.W_UnicodeObject)
     py_obj = as_pyobj(space, w_obj)
     assert get_kind(py_obj) == WCHAR_KIND
-    return _readify(space, py_obj, space.utf8_w(w_obj).decode('utf8'))
+    return _readify(space, py_obj, space.utf8_w(w_obj))
 
 def _readify(space, py_obj, value):
     maxchar = 0
-    for c in value:
-        if ord(c) > maxchar:
-            maxchar = ord(c)
+    for c in rutf8.Utf8StringIterator(value):
+        if c > maxchar:
+            maxchar = c
             if maxchar > MAX_UNICODE:
                 raise oefmt(space.w_ValueError,
                     "Character U+%d is not in range [U+0000; U+10ffff]",
                     maxchar)
     if maxchar < 256:
-        ucs1_data = rffi.str2charp(unicode_encode_latin_1(
-            value, len(value), errors='strict'))
+        ucs1_data = rffi.str2charp(value)
         set_data(py_obj, cts.cast('void*', ucs1_data))
         set_kind(py_obj, _1BYTE_KIND)
         set_len(py_obj, get_wsize(py_obj))
@@ -298,9 +297,9 @@
             set_utf8_len(py_obj, 0)
     elif maxchar < 65536:
         # XXX: assumes that sizeof(wchar_t) == 4
-        ucs2_str = unicode_encode_utf_16_helper(
-            value, len(value), errors='strict',
-            byteorder=runicode.BYTEORDER)
+        ucs2_str = utf8_encode_utf_16_helper(
+            value, 'strict',
+            byteorder=BYTEORDER)
         ucs2_data = cts.cast('Py_UCS2 *', rffi.str2charp(ucs2_str))
         set_data(py_obj, cts.cast('void*', ucs2_data))
         set_len(py_obj, get_wsize(py_obj))
@@ -309,10 +308,14 @@
         set_utf8_len(py_obj, 0)
     else:
         # XXX: assumes that sizeof(wchar_t) == 4
+        ucs4_str = utf8_encode_utf_32_helper(
+            value, 'strict',
+            byteorder=BYTEORDER)
         if not get_wbuffer(py_obj):
             # Copy unicode buffer
-            set_wbuffer(py_obj, rffi.unicode2wcharp(value))
-            set_wsize(py_obj, len(value))
+            wchar = cts.cast('wchar_t*', rffi.str2charp(ucs4_str))
+            set_wbuffer(py_obj, wchar)
+            set_wsize(py_obj, len(ucs4_str) // 4)
         ucs4_data = get_wbuffer(py_obj)
         set_data(py_obj, cts.cast('void*', ucs4_data))
         set_len(py_obj, get_wsize(py_obj))
@@ -493,9 +496,10 @@
     the codec."""
     if not encoding:
         # This tracks CPython 2.7, in CPython 3.4 'utf-8' is hardcoded instead
-        encoding = PyUnicode_GetDefaultEncoding(space)
+        w_encoding = space.newtext('utf-8')
+    else:
+        w_encoding = space.newtext(rffi.charp2str(encoding))
     w_str = space.newbytes(rffi.charpsize2str(s, size))
-    w_encoding = space.newtext(rffi.charp2str(encoding))
     if errors:
         w_errors = space.newtext(rffi.charp2str(errors))
     else:
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8-py3: use unicodehelper methods instead of rffi calls

Reply via email to