Author: Matti Picus <[email protected]>
Branch: better-PyDict_Next
Changeset: r89024:d9b07fbc433c
Date: 2016-12-12 16:23 +0200
http://bitbucket.org/pypy/pypy/changeset/d9b07fbc433c/

Log:    merge default into branch

diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
new file mode 100644
--- /dev/null
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -0,0 +1,26 @@
+from pypy.interpreter.unicodehelper import encode_utf8, decode_utf8
+
+class FakeSpace:
+    pass
+
+def test_encode_utf8():
+    space = FakeSpace()
+    assert encode_utf8(space, u"abc") == "abc"
+    assert encode_utf8(space, u"\u1234") == "\xe1\x88\xb4"
+    assert encode_utf8(space, u"\ud800") == "\xed\xa0\x80"
+    assert encode_utf8(space, u"\udc00") == "\xed\xb0\x80"
+    # for the following test, go to lengths to avoid CPython's optimizer
+    # and .pyc file storage, which collapse the two surrogates into one
+    c = u"\udc00"
+    assert encode_utf8(space, u"\ud800" + c) == "\xf0\x90\x80\x80"
+
+def test_decode_utf8():
+    space = FakeSpace()
+    assert decode_utf8(space, "abc") == u"abc"
+    assert decode_utf8(space, "\xe1\x88\xb4") == u"\u1234"
+    assert decode_utf8(space, "\xed\xa0\x80") == u"\ud800"
+    assert decode_utf8(space, "\xed\xb0\x80") == u"\udc00"
+    got = decode_utf8(space, "\xed\xa0\x80\xed\xb0\x80")
+    assert map(ord, got) == [0xd800, 0xdc00]
+    got = decode_utf8(space, "\xf0\x90\x80\x80")
+    assert map(ord, got) == [0x10000]
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -51,6 +51,10 @@
     return result
 
 def decode_utf8(space, string):
+    # Surrogates are accepted and not treated specially at all.
+    # If there happen to be two 3-bytes encoding a pair of surrogates,
+    # you still get two surrogate unicode characters in the result.
+    # These are the Python2 rules; Python3 differs.
     result, consumed = runicode.str_decode_utf_8(
         string, len(string), "strict",
         final=True, errorhandler=decode_error_handler(space),
@@ -59,8 +63,9 @@
 
 def encode_utf8(space, uni):
     # Note that this function never raises UnicodeEncodeError,
-    # since surrogate pairs are allowed.
-    # This is not the case with Python3.
+    # since surrogates are allowed, either paired or lone.
+    # A paired surrogate is considered like the non-BMP character
+    # it stands for.  These are the Python2 rules; Python3 differs.
     return runicode.unicode_encode_utf_8(
         uni, len(uni), "strict",
         errorhandler=raise_unicode_exception_encode,
diff --git a/pypy/module/cpyext/pyobject.py b/pypy/module/cpyext/pyobject.py
--- a/pypy/module/cpyext/pyobject.py
+++ b/pypy/module/cpyext/pyobject.py
@@ -25,11 +25,9 @@
     basestruct = PyObject.TO
     W_BaseObject = W_ObjectObject
 
-    def get_dealloc(self, space):
+    def get_dealloc(self):
         from pypy.module.cpyext.typeobject import subtype_dealloc
-        return llhelper(
-            subtype_dealloc.api_func.functype,
-            subtype_dealloc.api_func.get_wrapper(space))
+        return subtype_dealloc
 
     def allocate(self, space, w_type, itemcount=0):
         # similar to PyType_GenericAlloc?
@@ -109,10 +107,8 @@
                 return tp_alloc(space, w_type, itemcount)
 
         if tp_dealloc:
-            def get_dealloc(self, space):
-                return llhelper(
-                    tp_dealloc.api_func.functype,
-                    tp_dealloc.api_func.get_wrapper(space))
+            def get_dealloc(self):
+                return tp_dealloc
 
         if tp_attach:
             def attach(self, space, pyobj, w_obj, w_userdata=None):
diff --git a/pypy/module/cpyext/slotdefs.py b/pypy/module/cpyext/slotdefs.py
--- a/pypy/module/cpyext/slotdefs.py
+++ b/pypy/module/cpyext/slotdefs.py
@@ -59,6 +59,9 @@
                 "expected %d-%d arguments, got %d",
                 low, high, space.len_w(w_ob))
 
+def llslot(space, func):
+    return llhelper(func.api_func.functype, func.api_func.get_wrapper(space))
+
 def wrap_init(space, w_self, w_args, func, w_kwargs):
     func_init = rffi.cast(initproc, func)
     res = generic_cpy_call(space, func_init, w_self, w_args, w_kwargs)
@@ -106,7 +109,7 @@
     args_w = space.fixedview(w_args)
     arg3 = space.w_None
     if len(args_w) > 1:
-        arg3 = args_w[1] 
+        arg3 = args_w[1]
     return generic_cpy_call(space, func_ternary, w_self, args_w[0], arg3)
 
 def wrap_ternaryfunc_r(space, w_self, w_args, func):
@@ -121,7 +124,7 @@
     Py_DecRef(space, ref)
     arg3 = space.w_None
     if len(args_w) > 1:
-        arg3 = args_w[1] 
+        arg3 = args_w[1]
     return generic_cpy_call(space, func_ternary, args_w[0], w_self, arg3)
 
 
@@ -322,7 +325,7 @@
             self.strides = [1]
         else:
             self.strides = strides
-        self.ndim = ndim 
+        self.ndim = ndim
         self.itemsize = itemsize
         self.readonly = readonly
 
@@ -472,7 +475,6 @@
             @func_renamer("cpyext_%s_%s" % (name.replace('.', '_'), typedef.name))
             def slot_func(space, w_self):
                 return space.call_function(slot_fn, w_self)
-            api_func = slot_func.api_func
             handled = True
 
     # binary functions
@@ -499,7 +501,6 @@
             @func_renamer("cpyext_%s_%s" % (name.replace('.', '_'), typedef.name))
             def slot_func(space, w_self, w_arg):
                 return space.call_function(slot_fn, w_self, w_arg)
-            api_func = slot_func.api_func
             handled = True
 
     # binary-with-Py_ssize_t-type
@@ -517,7 +518,6 @@
             @func_renamer("cpyext_%s_%s" % (name.replace('.', '_'), typedef.name))
             def slot_func(space, w_self, arg):
                 return space.call_function(slot_fn, w_self, space.wrap(arg))
-            api_func = slot_func.api_func
             handled = True
 
     # ternary functions
@@ -532,7 +532,6 @@
             @func_renamer("cpyext_%s_%s" % (name.replace('.', '_'), typedef.name))
             def slot_func(space, w_self, w_arg1, w_arg2):
                 return space.call_function(slot_fn, w_self, w_arg1, w_arg2)
-            api_func = slot_func.api_func
             handled = True
 
     if handled:
@@ -552,7 +551,7 @@
             else:
                 space.call_function(delattr_fn, w_self, w_name)
             return 0
-        api_func = slot_tp_setattro.api_func
+        slot_func = slot_tp_setattro
     elif name == 'tp_getattro':
         getattr_fn = w_type.getdictvalue(space, '__getattribute__')
         if getattr_fn is None:
@@ -562,7 +561,7 @@
         @func_renamer("cpyext_tp_getattro_%s" % (typedef.name,))
         def slot_tp_getattro(space, w_self, w_name):
             return space.call_function(getattr_fn, w_self, w_name)
-        api_func = slot_tp_getattro.api_func
+        slot_func = slot_tp_getattro
     elif name == 'tp_call':
         call_fn = w_type.getdictvalue(space, '__call__')
         if call_fn is None:
@@ -574,7 +573,7 @@
             args = Arguments(space, [w_self],
                              w_stararg=w_args, w_starstararg=w_kwds)
             return space.call_args(call_fn, args)
-        api_func = slot_tp_call.api_func
+        slot_func = slot_tp_call
 
     elif name == 'tp_iternext':
         iternext_fn = w_type.getdictvalue(space, 'next')
@@ -590,7 +589,7 @@
                 if not e.match(space, space.w_StopIteration):
                     raise
                 return None
-        api_func = slot_tp_iternext.api_func
+        slot_func = slot_tp_iternext
 
     elif name == 'tp_init':
         init_fn = w_type.getdictvalue(space, '__init__')
@@ -605,7 +604,7 @@
                              w_stararg=w_args, w_starstararg=w_kwds)
             space.call_args(init_fn, args)
             return 0
-        api_func = slot_tp_init.api_func
+        slot_func = slot_tp_init
     elif name == 'tp_new':
         new_fn = w_type.getdictvalue(space, '__new__')
         if new_fn is None:
@@ -617,12 +616,12 @@
             args = Arguments(space, [w_self],
                              w_stararg=w_args, w_starstararg=w_kwds)
             return space.call_args(space.get(new_fn, w_self), args)
-        api_func = slot_tp_new.api_func
+        slot_func = slot_tp_new
     elif name == 'tp_as_buffer.c_bf_getbuffer':
         buff_fn = w_type.getdictvalue(space, '__buffer__')
         if buff_fn is None:
             return
-        @cpython_api([PyObject, Py_bufferP, rffi.INT_real], 
+        @cpython_api([PyObject, Py_bufferP, rffi.INT_real],
                 rffi.INT_real, header=None, error=-1)
         @func_renamer("cpyext_%s_%s" % (name.replace('.', '_'), typedef.name))
         def buff_w(space, w_self, view, flags):
@@ -646,14 +645,14 @@
             return 0
         # XXX remove this when it no longer crashes a translated PyPy
         return
-        api_func = buff_w.api_func
+        slot_func = buff_w
     else:
         # missing: tp_as_number.nb_nonzero, tp_as_number.nb_coerce
         # tp_as_sequence.c_sq_contains, tp_as_sequence.c_sq_length
         # richcmpfunc(s)
         return
 
-    return lambda: llhelper(api_func.functype, api_func.get_wrapper(space))
+    return slot_func
 
 PyWrapperFlag_KEYWORDS = 1
 
diff --git a/pypy/module/cpyext/typeobject.py b/pypy/module/cpyext/typeobject.py
--- a/pypy/module/cpyext/typeobject.py
+++ b/pypy/module/cpyext/typeobject.py
@@ -3,7 +3,6 @@
 from rpython.rlib import jit
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib.rstring import rsplit
-from rpython.rtyper.annlowlevel import llhelper
 from rpython.rtyper.lltypesystem import rffi, lltype
 
 from pypy.interpreter.baseobjspace import W_Root, DescrMismatch
@@ -28,7 +27,8 @@
     PyObject, make_ref, create_ref, from_ref, get_typedescr, make_typedescr,
     track_reference, Py_DecRef, as_pyobj)
 from pypy.module.cpyext.slotdefs import (
-    slotdefs_for_tp_slots, slotdefs_for_wrappers, get_slot_tp_function)
+    slotdefs_for_tp_slots, slotdefs_for_wrappers, get_slot_tp_function,
+    llslot)
 from pypy.module.cpyext.state import State
 from pypy.module.cpyext.structmember import PyMember_GetOne, PyMember_SetOne
 from pypy.module.cpyext.typeobjectdefs import (
@@ -273,21 +273,14 @@
             # XXX special case iternext
             continue
 
-        slot_func_helper = None
-
         if slot_func is None and typedef is not None:
-            get_slot = get_slot_tp_function(space, typedef, slot_name)
-            if get_slot:
-                slot_func_helper = get_slot()
-        elif slot_func:
-            slot_func_helper = llhelper(slot_func.api_func.functype,
-                                        slot_func.api_func.get_wrapper(space))
-
-        if slot_func_helper is None:
+            slot_func = get_slot_tp_function(space, typedef, slot_name)
+        if not slot_func:
             if WARN_ABOUT_MISSING_SLOT_FUNCTIONS:
                 os.write(2, "%s defined by %s but no slot function defined!\n" % (
                         method_name, w_type.getname(space)))
             continue
+        slot_func_helper = llslot(space, slot_func)
 
         # XXX special case wrapper-functions and use a "specific" slot func
 
@@ -393,9 +386,8 @@
 
 def setup_new_method_def(space):
     ptr = get_new_method_def(space)
-    ptr.c_ml_meth = rffi.cast(PyCFunction_typedef,
-        llhelper(tp_new_wrapper.api_func.functype,
-                 tp_new_wrapper.api_func.get_wrapper(space)))
+    ptr.c_ml_meth = rffi.cast(
+        PyCFunction_typedef, llslot(space, tp_new_wrapper))
 
 def add_tp_new_wrapper(space, dict_w, pto):
     if "__new__" in dict_w:
@@ -518,8 +510,7 @@
 def subtype_dealloc(space, obj):
     pto = obj.c_ob_type
     base = pto
-    this_func_ptr = llhelper(subtype_dealloc.api_func.functype,
-            subtype_dealloc.api_func.get_wrapper(space))
+    this_func_ptr = llslot(space, subtype_dealloc)
     while base.c_tp_dealloc == this_func_ptr:
         base = base.c_tp_base
         assert base
@@ -621,46 +612,31 @@
         return
     c_buf = lltype.malloc(PyBufferProcs, flavor='raw', zero=True)
     lltype.render_immortal(c_buf)
-    c_buf.c_bf_getsegcount = llhelper(bf_segcount.api_func.functype,
-                                      bf_segcount.api_func.get_wrapper(space))
+    c_buf.c_bf_getsegcount = llslot(space, bf_segcount)
     if space.is_w(w_type, space.w_str):
         # Special case: str doesn't support get_raw_address(), so we have a
         # custom get*buffer that instead gives the address of the char* in the
         # PyBytesObject*!
-        c_buf.c_bf_getreadbuffer = llhelper(
-            str_getreadbuffer.api_func.functype,
-            str_getreadbuffer.api_func.get_wrapper(space))
-        c_buf.c_bf_getcharbuffer = llhelper(
-            str_getcharbuffer.api_func.functype,
-            str_getcharbuffer.api_func.get_wrapper(space))
+        c_buf.c_bf_getreadbuffer = llslot(space, str_getreadbuffer)
+        c_buf.c_bf_getcharbuffer = llslot(space, str_getcharbuffer)
     elif space.is_w(w_type, space.w_unicode):
         # Special case: unicode doesn't support get_raw_address(), so we have a
         # custom get*buffer that instead gives the address of the char* in the
         # PyUnicodeObject*!
-        c_buf.c_bf_getreadbuffer = llhelper(
-            unicode_getreadbuffer.api_func.functype,
-            unicode_getreadbuffer.api_func.get_wrapper(space))
+        c_buf.c_bf_getreadbuffer = llslot(space, unicode_getreadbuffer)
     elif space.is_w(w_type, space.w_buffer):
         # Special case: we store a permanent address on the cpyext wrapper,
         # so we'll reuse that.
         # Note: we could instead store a permanent address on the buffer object,
         # and use get_raw_address()
-        c_buf.c_bf_getreadbuffer = llhelper(
-            buf_getreadbuffer.api_func.functype,
-            buf_getreadbuffer.api_func.get_wrapper(space))
-        c_buf.c_bf_getcharbuffer = llhelper(
-            buf_getcharbuffer.api_func.functype,
-            buf_getcharbuffer.api_func.get_wrapper(space))
+        c_buf.c_bf_getreadbuffer = llslot(space, buf_getreadbuffer)
+        c_buf.c_bf_getcharbuffer = llslot(space, buf_getcharbuffer)
     else:
         # use get_raw_address()
-        c_buf.c_bf_getreadbuffer = llhelper(bf_getreadbuffer.api_func.functype,
-                                    bf_getreadbuffer.api_func.get_wrapper(space))
-        c_buf.c_bf_getcharbuffer = llhelper(bf_getcharbuffer.api_func.functype,
-                                    bf_getcharbuffer.api_func.get_wrapper(space))
+        c_buf.c_bf_getreadbuffer = llslot(space, bf_getreadbuffer)
+        c_buf.c_bf_getcharbuffer = llslot(space, bf_getcharbuffer)
         if bufspec == 'read-write':
-            c_buf.c_bf_getwritebuffer = llhelper(
-                bf_getwritebuffer.api_func.functype,
-                bf_getwritebuffer.api_func.get_wrapper(space))
+            c_buf.c_bf_getwritebuffer = llslot(space, bf_getwritebuffer)
     pto.c_tp_as_buffer = c_buf
     pto.c_tp_flags |= Py_TPFLAGS_HAVE_GETCHARBUFFER
     pto.c_tp_flags |= Py_TPFLAGS_HAVE_NEWBUFFER
@@ -721,12 +697,10 @@
     # dealloc
     if space.gettypeobject(w_type.layout.typedef) is w_type:
         # only for the exact type, like 'space.w_tuple' or 'space.w_list'
-        pto.c_tp_dealloc = typedescr.get_dealloc(space)
+        pto.c_tp_dealloc = llslot(space, typedescr.get_dealloc())
     else:
         # for all subtypes, use subtype_dealloc()
-        pto.c_tp_dealloc = llhelper(
-            subtype_dealloc.api_func.functype,
-            subtype_dealloc.api_func.get_wrapper(space))
+        pto.c_tp_dealloc = llslot(space, subtype_dealloc)
     if space.is_w(w_type, space.w_str):
         pto.c_tp_itemsize = 1
     elif space.is_w(w_type, space.w_tuple):
@@ -734,10 +708,8 @@
     # buffer protocol
     setup_buffer_procs(space, w_type, pto)
 
-    pto.c_tp_free = llhelper(PyObject_Free.api_func.functype,
-            PyObject_Free.api_func.get_wrapper(space))
-    pto.c_tp_alloc = llhelper(PyType_GenericAlloc.api_func.functype,
-            PyType_GenericAlloc.api_func.get_wrapper(space))
+    pto.c_tp_free = llslot(space, PyObject_Free)
+    pto.c_tp_alloc = llslot(space, PyType_GenericAlloc)
     builder = space.fromcache(StaticObjectBuilder)
     if ((pto.c_tp_flags & Py_TPFLAGS_HEAPTYPE) != 0
             and builder.cpyext_type_init is None):
@@ -928,15 +900,11 @@
 
     if not pto.c_tp_setattro:
         from pypy.module.cpyext.object import PyObject_GenericSetAttr
-        pto.c_tp_setattro = llhelper(
-            PyObject_GenericSetAttr.api_func.functype,
-            PyObject_GenericSetAttr.api_func.get_wrapper(space))
+        pto.c_tp_setattro = llslot(space, PyObject_GenericSetAttr)
 
     if not pto.c_tp_getattro:
         from pypy.module.cpyext.object import PyObject_GenericGetAttr
-        pto.c_tp_getattro = llhelper(
-            PyObject_GenericGetAttr.api_func.functype,
-            PyObject_GenericGetAttr.api_func.get_wrapper(space))
+        pto.c_tp_getattro = llslot(space, PyObject_GenericGetAttr)
 
     if w_obj.is_cpytype():
         Py_DecRef(space, pto.c_tp_dict)
diff --git a/rpython/jit/backend/ppc/regalloc.py b/rpython/jit/backend/ppc/regalloc.py
--- a/rpython/jit/backend/ppc/regalloc.py
+++ b/rpython/jit/backend/ppc/regalloc.py
@@ -1066,7 +1066,6 @@
 
     prepare_cond_call_value_r = prepare_cond_call_value_i
 
-
 def notimplemented(self, op):
     msg = '[PPC/regalloc] %s not implemented\n' % op.getopname()
     if we_are_translated():
diff --git a/rpython/jit/backend/zarch/opassembler.py b/rpython/jit/backend/zarch/opassembler.py
--- a/rpython/jit/backend/zarch/opassembler.py
+++ b/rpython/jit/backend/zarch/opassembler.py
@@ -374,10 +374,11 @@
     _COND_CALL_SAVE_REGS = [r.r11, r.r2, r.r3, r.r4, r.r5]
 
     def emit_cond_call(self, op, arglocs, regalloc):
+        resloc = arglocs[0]
+        arglocs = arglocs[1:]
         fcond = self.guard_success_cc
         self.guard_success_cc = c.cond_none
         assert fcond.value != c.cond_none.value
-        fcond = c.negate(fcond)
 
         jmp_adr = self.mc.get_relative_pos()
         self.mc.reserve_cond_jump() # patched later to a relative branch
@@ -411,6 +412,8 @@
         self.mc.BASR(r.r14, r.r14)
         # restoring the registers saved above, and doing pop_gcmap(), is left
         # to the cond_call_slowpath helper.  We never have any result value.
+        if resloc is not None:
+            self.mc.LGR(resloc, r.RES)
         relative_target = self.mc.currpos() - jmp_adr
         pmc = OverwritingBuilder(self.mc, jmp_adr, 1)
         pmc.BRCL(fcond, l.imm(relative_target))
@@ -419,6 +422,9 @@
         # guard_no_exception too
         self.previous_cond_call_jcond = jmp_adr, fcond
 
+    emit_cond_call_value_i = emit_cond_call
+    emit_cond_call_value_r = emit_cond_call
+
 class AllocOpAssembler(object):
     _mixin_ = True
 
diff --git a/rpython/jit/backend/zarch/regalloc.py b/rpython/jit/backend/zarch/regalloc.py
--- a/rpython/jit/backend/zarch/regalloc.py
+++ b/rpython/jit/backend/zarch/regalloc.py
@@ -1107,7 +1107,7 @@
 
     def prepare_cond_call(self, op):
         self.load_condition_into_cc(op.getarg(0))
-        locs = []
+        locs = [None]
         # support between 0 and 4 integer arguments
         assert 2 <= op.numargs() <= 2 + 4
         for i in range(1, op.numargs()):
@@ -1116,6 +1116,22 @@
             locs.append(loc)
         return locs
 
+    def prepare_cond_call_value_i(self, op):
+        x = self.ensure_reg(op.getarg(0))
+        self.load_condition_into_cc(op.getarg(0))
+        self.rm.force_allocate_reg(op, selected_reg=x)   # spilled if survives
+        # ^^^ if arg0!=0, we jump over the next block of code (the call)
+        locs = [x]
+        # support between 0 and 4 integer arguments
+        assert 2 <= op.numargs() <= 2 + 4
+        for i in range(1, op.numargs()):
+            loc = self.loc(op.getarg(i))
+            assert loc.type != FLOAT
+            locs.append(loc)
+        return locs     # [res, function, args...]
+
+    prepare_cond_call_value_r = prepare_cond_call_value_i
+
     def prepare_cond_call_gc_wb(self, op):
         arglocs = [self.ensure_reg(op.getarg(0))]
         return arglocs
diff --git a/rpython/jit/codewriter/support.py b/rpython/jit/codewriter/support.py
--- a/rpython/jit/codewriter/support.py
+++ b/rpython/jit/codewriter/support.py
@@ -142,10 +142,14 @@
             assert len(lst) == len(args_v), (
                 "not supported so far: 'greens' variables contain Void")
         # a crash here means that you have to reorder the variable named in
-        # the JitDriver.  Indeed, greens and reds must both be sorted: first
-        # all INTs, followed by all REFs, followed by all FLOATs.
+        # the JitDriver.  
         lst2 = sort_vars(lst)
-        assert lst == lst2
+        assert lst == lst2, ("You have to reorder the variables named in "
+            "the JitDriver (both the 'greens' and 'reds' independently). "
+            "They must be sorted like this: first all the integer-like, "
+            "then all the pointer-like, and finally the floats.\n"
+            "Got: %r\n"
+            "Expected: %r" % (lst, lst2))
         return lst
     #
     return (_sort(greens_v, True), _sort(reds_v, False))
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -327,6 +327,16 @@
 
 def unicode_encode_utf_8(s, size, errors, errorhandler=None,
                          allow_surrogates=allow_surrogate_by_default):
+    # In this function, allow_surrogates can be:
+    #
+    #  * True:  surrogates are always allowed.  A valid surrogate pair
+    #           is replaced with the non-BMP unicode char it stands for,
+    #           which is then encoded as 4 bytes.
+    #
+    #  * False: surrogates are always forbidden.
+    #
+    # See also unicode_encode_utf8sp().
+    #
     if errorhandler is None:
         errorhandler = default_unicode_error_encode
     return unicode_encode_utf_8_impl(s, size, errors, errorhandler,
@@ -391,6 +401,33 @@
                 _encodeUCS4(result, ch)
     return result.build()
 
+def unicode_encode_utf8sp(s, size):
+    # Surrogate-preserving utf-8 encoding.  Any surrogate character
+    # turns into its 3-bytes encoding, whether it is paired or not.
+    # This should always be reversible, and the reverse is the regular
+    # str_decode_utf_8() with allow_surrogates=True.
+    assert(size >= 0)
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = ord(s[pos])
+        pos += 1
+        if ch < 0x80:
+            # Encode ASCII
+            result.append(chr(ch))
+        elif ch < 0x0800:
+            # Encode Latin-1
+            result.append(chr((0xc0 | (ch >> 6))))
+            result.append(chr((0x80 | (ch & 0x3f))))
+        elif ch < 0x10000:
+            # Encode UCS2 Unicode ordinals, and surrogates
+            result.append((chr((0xe0 | (ch >> 12)))))
+            result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+            result.append((chr((0x80 | (ch & 0x3f)))))
+        else:
+            _encodeUCS4(result, ch)
+    return result.build()
+
 # ____________________________________________________________
 # utf-16
 
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -812,6 +812,21 @@
         py.test.raises(UnicodeEncodeError, encoder, u' 12, \u1234 ', 7, None)
         assert encoder(u'u\u1234', 2, 'replace') == 'u?'
 
+    def test_encode_utf8sp(self):
+        # for the following test, go to lengths to avoid CPython's optimizer
+        # and .pyc file storage, which collapse the two surrogates into one
+        c = u"\udc00"
+        for input, expected in [
+                (u"", ""),
+                (u"abc", "abc"),
+                (u"\u1234", "\xe1\x88\xb4"),
+                (u"\ud800", "\xed\xa0\x80"),
+                (u"\udc00", "\xed\xb0\x80"),
+                (u"\ud800" + c, "\xed\xa0\x80\xed\xb0\x80"),
+            ]:
+            got = runicode.unicode_encode_utf8sp(input, len(input))
+            assert got == expected
+
 
 class TestTranslation(object):
     def setup_class(cls):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to