[pypy-commit] pypy default: merge unicode-utf8 into default

mattip Wed, 13 Feb 2019 14:07:35 -0800

Author: Matti Picus <matti.pi...@gmail.com>
Branch: 
Changeset: r96003:ba081fb468f4
Date: 2019-02-13 23:11 +0200
http://bitbucket.org/pypy/pypy/changeset/ba081fb468f4/


Log:    merge unicode-utf8 into default

diff too long, truncating to 2000 out of 15164 lines

diff --git a/TODO b/TODO
new file mode 100644
--- /dev/null
+++ b/TODO
@@ -0,0 +1,4 @@
+* find a better way to run "find" without creating the index storage,
+  if one is not already readily available (understand cost now, improve after merge)
+* improve performance of splitlines
+* think about cost of utf8 list strategy (Armin and CF)
diff --git a/lib-python/2.7/test/test_memoryio.py b/lib-python/2.7/test/test_memoryio.py
--- a/lib-python/2.7/test/test_memoryio.py
+++ b/lib-python/2.7/test/test_memoryio.py
@@ -712,6 +712,7 @@
 
     # XXX: For the Python version of io.StringIO, this is highly
     # dependent on the encoding used for the underlying buffer.
+    @support.cpython_only
     def test_widechar(self):
         buf = self.buftype("\U0002030a\U00020347")
         memio = self.ioclass(buf)
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -29,7 +29,11 @@
 
 Improve register allocation in the JIT.
 
-
 .. branch: promote-unicode
 
 Implement rlib.jit.promote_unicode to complement promote_string
+
+.. branch: unicode-utf8
+
+Use utf8 internally to represent unicode, with the goal of never using rpython-level unicode
+
diff --git a/pypy/interpreter/argument.py b/pypy/interpreter/argument.py
--- a/pypy/interpreter/argument.py
+++ b/pypy/interpreter/argument.py
@@ -535,24 +535,26 @@
         if num_remainingkwds == 1:
             for i in range(len(keywords)):
                 if i not in kwds_mapping:
-                    name = keywords[i]
-                    if name is None:
-                        # We'll assume it's unicode. Encode it.
-                        # Careful, I *think* it should not be possible to
-                        # get an IndexError here but you never know.
-                        try:
-                            if keyword_names_w is None:
-                                raise IndexError
-                            # note: negative-based indexing from the end
-                            w_name = keyword_names_w[i - len(keywords)]
-                        except IndexError:
+                    name = '?'
+                    # We'll assume it's unicode. Encode it.
+                    # Careful, I *think* it should not be possible to
+                    # get an IndexError here but you never know.
+                    try:
+                        if keyword_names_w is None:
+                            raise IndexError
+                        # note: negative-based indexing from the end
+                        w_name = keyword_names_w[i - len(keywords)]
+                    except IndexError:
+                        if keywords is None:
                             name = '?'
                         else:
-                            w_enc = space.newtext(space.sys.defaultencoding)
-                            w_err = space.newtext("replace")
-                            w_name = space.call_method(w_name, "encode", w_enc,
-                                                       w_err)
-                            name = space.text_w(w_name)
+                            name = keywords[i]
+                    else:
+                        w_enc = space.newtext(space.sys.defaultencoding)
+                        w_err = space.newtext("replace")
+                        w_name = space.call_method(w_name, "encode", w_enc,
+                                                   w_err)
+                        name = space.text_w(w_name)
                     break
         self.kwd_name = name
 
diff --git a/pypy/interpreter/astcompiler/optimize.py b/pypy/interpreter/astcompiler/optimize.py
--- a/pypy/interpreter/astcompiler/optimize.py
+++ b/pypy/interpreter/astcompiler/optimize.py
@@ -5,7 +5,7 @@
 from pypy.tool import stdlib_opcode as ops
 from pypy.interpreter.error import OperationError
 from rpython.rlib.unroll import unrolling_iterable
-from rpython.rlib.runicode import MAXUNICODE
+from rpython.rlib.rutf8 import MAXUNICODE
 from rpython.rlib.objectmodel import specialize
 
 
diff --git a/pypy/interpreter/astcompiler/test/test_compiler.py b/pypy/interpreter/astcompiler/test/test_compiler.py
--- a/pypy/interpreter/astcompiler/test/test_compiler.py
+++ b/pypy/interpreter/astcompiler/test/test_compiler.py
@@ -975,9 +975,6 @@
 
 class AppTestCompiler:
 
-    def setup_class(cls):
-        cls.w_maxunicode = cls.space.wrap(sys.maxunicode)
-
     def test_docstring_not_loaded(self):
         import StringIO, dis, sys
         ns = {}
@@ -1027,7 +1024,7 @@
         import sys
         d = {}
         exec '# -*- coding: utf-8 -*-\n\nu = u"\xf0\x9f\x92\x8b"' in d
-        if sys.maxunicode > 65535 and self.maxunicode > 65535:
+        if sys.maxunicode > 65535:
             expected_length = 1
         else:
             expected_length = 2
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -3,7 +3,7 @@
 
 from rpython.rlib.cache import Cache
 from rpython.tool.uid import HUGEVAL_BYTES
-from rpython.rlib import jit, types
+from rpython.rlib import jit, types, rutf8
 from rpython.rlib.debug import make_sure_not_resized
 from rpython.rlib.objectmodel import (we_are_translated, newlist_hint,
      compute_unique_id, specialize, not_rpython)
@@ -283,7 +283,10 @@
     def str_w(self, space):
         self._typed_unwrap_error(space, "string")
 
-    def unicode_w(self, space):
+    def utf8_w(self, space):
+        self._typed_unwrap_error(space, "unicode")
+
+    def convert_to_w_unicode(self, space):
         self._typed_unwrap_error(space, "unicode")
 
     def bytearray_list_of_chars_w(self, space):
@@ -1103,7 +1106,7 @@
         """
         return None
 
-    def listview_unicode(self, w_list):
+    def listview_utf8(self, w_list):
         """ Return a list of unwrapped unicode out of a list of unicode. If the
         argument is not a list or does not contain only unicode, return None.
         May return None anyway.
@@ -1133,8 +1136,15 @@
     def newlist_bytes(self, list_s):
         return self.newlist([self.newbytes(s) for s in list_s])
 
-    def newlist_unicode(self, list_u):
-        return self.newlist([self.newunicode(u) for u in list_u])
+    def newlist_utf8(self, list_u, is_ascii):
+        l_w = [None] * len(list_u)
+        for i, item in enumerate(list_u):
+            if not is_ascii:
+                length = rutf8.check_utf8(item, True)
+            else:
+                length = len(item)
+            l_w[i] = self.newutf8(item, length)
+        return self.newlist(l_w)
 
     def newlist_int(self, list_i):
         return self.newlist([self.newint(i) for i in list_i])
@@ -1661,6 +1671,8 @@
         # needed because CPython has the same issue.  (Well, it's
         # unclear if there is any use at all for getting the bytes in
         # the unicode buffer.)
+        if self.isinstance_w(w_obj, self.w_unicode):
+            return w_obj.charbuf_w(self)
         try:
             return self.bytes_w(w_obj)
         except OperationError as e:
@@ -1802,27 +1814,38 @@
             raise oefmt(self.w_TypeError, "argument must be a string")
         return self.bytes_w(w_obj)
 
-    @specialize.argtype(1)
-    def unicode_w(self, w_obj):
-        assert w_obj is not None
-        return w_obj.unicode_w(self)
+    def utf8_w(self, w_obj):
+        return w_obj.utf8_w(self)
+
+    def convert_to_w_unicode(self, w_obj):
+        return w_obj.convert_to_w_unicode(self)
 
     def unicode0_w(self, w_obj):
         "Like unicode_w, but rejects strings with NUL bytes."
         from rpython.rlib import rstring
-        result = w_obj.unicode_w(self)
+        result = w_obj.utf8_w(self).decode('utf8')
         if u'\x00' in result:
             raise oefmt(self.w_TypeError,
                         "argument must be a unicode string without NUL "
                         "characters")
         return rstring.assert_str0(result)
 
-    def realunicode_w(self, w_obj):
-        # Like unicode_w(), but only works if w_obj is really of type
-        # 'unicode'.  On Python 3 this is the same as unicode_w().
+    def convert_arg_to_w_unicode(self, w_obj, strict=None):
+        # XXX why convert_to_w_unicode does something slightly different?
+        from pypy.objspace.std.unicodeobject import W_UnicodeObject
+        assert not hasattr(self, 'is_fake_objspace')
+        return W_UnicodeObject.convert_arg_to_w_unicode(self, w_obj, strict)
+
+    def utf8_len_w(self, w_obj):
+        w_obj = self.convert_arg_to_w_unicode(w_obj)
+        return w_obj._utf8, w_obj._len()
+
+    def realutf8_w(self, w_obj):
+        # Like utf8_w(), but only works if w_obj is really of type
+        # 'unicode'.  On Python 3 this is the same as utf8_w().
         if not self.isinstance_w(w_obj, self.w_unicode):
             raise oefmt(self.w_TypeError, "argument must be a unicode")
-        return self.unicode_w(w_obj)
+        return self.utf8_w(w_obj)
 
     def bool_w(self, w_obj):
         # Unwraps a bool, also accepting an int for compatibility.
@@ -2187,7 +2210,7 @@
     'float_w',
     'uint_w',
     'bigint_w',
-    'unicode_w',
+    'utf8_w',
     'unwrap',
     'is_true',
     'is_w',
diff --git a/pypy/interpreter/gateway.py b/pypy/interpreter/gateway.py
--- a/pypy/interpreter/gateway.py
+++ b/pypy/interpreter/gateway.py
@@ -160,6 +160,9 @@
     def visit_text0(self, el, app_sig):
         self.checked_space_method(el, app_sig)
 
+    def visit_utf8(self, el, app_sig):
+        self.checked_space_method(el, app_sig)
+
     def visit_fsencode(self, el, app_sig):
         self.checked_space_method(el, app_sig)
 
@@ -304,6 +307,9 @@
     def visit_text0(self, typ):
         self.run_args.append("space.text0_w(%s)" % (self.scopenext(),))
 
+    def visit_utf8(self, typ):
+        self.run_args.append("space.utf8_w(%s)" % (self.scopenext(),))
+
     def visit_fsencode(self, typ):
         self.run_args.append("space.fsencode_w(%s)" % (self.scopenext(),))
 
@@ -469,6 +475,9 @@
     def visit_text0(self, typ):
         self.unwrap.append("space.text0_w(%s)" % (self.nextarg(),))
 
+    def visit_utf8(self, typ):
+        self.unwrap.append("space.utf8_w(%s)" % (self.nextarg(),))
+
     def visit_fsencode(self, typ):
         self.unwrap.append("space.fsencode_w(%s)" % (self.nextarg(),))
 
@@ -533,10 +542,10 @@
 
 
 def int_unwrapping_space_method(typ):
-    assert typ in (int, str, float, unicode, r_longlong, r_uint, r_ulonglong, bool)
+    assert typ in (int, str, float, r_longlong, r_uint, r_ulonglong, bool)
     if typ is r_int is r_longlong:
         return 'gateway_r_longlong_w'
-    elif typ in (str, unicode, bool):
+    elif typ in (str, bool):
         return typ.__name__ + '_w'
     else:
         return 'gateway_' + typ.__name__ + '_w'
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -1,3 +1,4 @@
+from rpython.rlib import rutf8
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter import unicodehelper
 from rpython.rlib.rstring import StringBuilder
@@ -51,18 +52,20 @@
                                         'unmatched triple quotes in literal')
         q -= 2
 
-    if unicode_literal: # XXX Py_UnicodeFlag is ignored for now
+    if unicode_literal:
         if encoding is None or encoding == "iso-8859-1":
             # 'unicode_escape' expects latin-1 bytes, string is ready.
             assert 0 <= ps <= q
             substr = s[ps:q]
         else:
+            unicodehelper.check_utf8_or_raise(space, s, ps, q)
             substr = decode_unicode_utf8(space, s, ps, q)
         if rawmode:
-            v = unicodehelper.decode_raw_unicode_escape(space, substr)
+            r = unicodehelper.decode_raw_unicode_escape(space, substr)
         else:
-            v = unicodehelper.decode_unicode_escape(space, substr)
-        return space.newunicode(v)
+            r = unicodehelper.decode_unicode_escape(space, substr)
+        v, length = r
+        return space.newutf8(v, length)
 
     need_encoding = (encoding is not None and
                      encoding != "utf-8" and encoding != "utf8" and
@@ -71,7 +74,8 @@
     substr = s[ps : q]
     if rawmode or '\\' not in s[ps:]:
         if need_encoding:
-            w_u = space.newunicode(unicodehelper.decode_utf8(space, substr))
+            lgt = unicodehelper.check_utf8_or_raise(space, substr)
+            w_u = space.newutf8(substr, lgt)
             w_v = unicodehelper.encode(space, w_u, encoding)
             return w_v
         else:
@@ -101,15 +105,12 @@
                 # the backslash we just wrote, we emit "\u005c"
                 # instead.
                 lis.append("u005c")
-        if ord(s[ps]) & 0x80: # XXX inefficient
-            w, ps = decode_utf8(space, s, ps, end)
-            for c in w:
-                # The equivalent of %08x, which is not supported by RPython.
-                # 7 zeroes are enough for the unicode range, and the
-                # result still fits in 32-bit.
-                hexa = hex(ord(c) + 0x10000000)
-                lis.append('\\U0')
-                lis.append(hexa[3:])  # Skip 0x and the leading 1
+        if ord(s[ps]) & 0x80:
+            cp = rutf8.codepoint_at_pos(s, ps)
+            hexa = hex(cp + 0x10000000)
+            lis.append('\\U0')
+            lis.append(hexa[3:])  # Skip 0x and the leading 1
+            ps = rutf8.next_codepoint_pos(s, ps)
         else:
             lis.append(s[ps])
             ps += 1
@@ -215,20 +216,29 @@
             ch >= 'A' and ch <= 'F')
 
 
-def decode_utf8(space, s, ps, end):
+def check_utf8(space, s, ps, end):
     assert ps >= 0
     pt = ps
     # while (s < end && *s != '\\') s++; */ /* inefficient for u".."
     while ps < end and ord(s[ps]) & 0x80:
         ps += 1
-    u = unicodehelper.decode_utf8(space, s[pt:ps])
-    return u, ps
+    try:
+        rutf8.check_utf8(s, True, pt, ps)
+    except rutf8.CheckError as e:
+        lgt, flag = rutf8.check_utf8(s, True, pt, e.pos)
+        unicodehelper.decode_error_handler(space)('strict', 'utf8',
+            'invalid utf-8', s, pt + lgt, pt + lgt + 1)
+    return s[pt:ps]
 
 def decode_utf8_recode(space, s, ps, end, recode_encoding):
-    u, ps = decode_utf8(space, s, ps, end)
-    w_v = unicodehelper.encode(space, space.newunicode(u), recode_encoding)
+    p = ps
+    while p < end and ord(s[p]) & 0x80:
+        p += 1
+    lgt = unicodehelper.check_utf8_or_raise(space, s, ps, p)
+    w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt),
+                               recode_encoding)
     v = space.bytes_w(w_v)
-    return v, ps
+    return v, p
 
 def raise_app_valueerror(space, msg):
     raise OperationError(space.w_ValueError, space.newtext(msg))
diff --git a/pypy/interpreter/pyparser/test/test_parsestring.py b/pypy/interpreter/pyparser/test/test_parsestring.py
--- a/pypy/interpreter/pyparser/test/test_parsestring.py
+++ b/pypy/interpreter/pyparser/test/test_parsestring.py
@@ -10,7 +10,7 @@
             assert space.str_w(w_ret) == value
         elif isinstance(value, unicode):
             assert space.type(w_ret) == space.w_unicode
-            assert space.unicode_w(w_ret) == value
+            assert space.utf8_w(w_ret).decode('utf8') == value
         else:
             assert False
 
@@ -50,7 +50,7 @@
         s = "u'\x81'"
         s = s.decode("koi8-u").encode("utf8")
         w_ret = parsestring.parsestr(self.space, 'koi8-u', s)
-        ret = space.unwrap(w_ret)
+        ret = w_ret._utf8.decode('utf8')
         assert ret == eval("# -*- coding: koi8-u -*-\nu'\x81'")
 
     def test_unicode_literals(self):
@@ -102,7 +102,4 @@
     def test_decode_unicode_utf8(self):
         buf = parsestring.decode_unicode_utf8(self.space,
                                               'u"\xf0\x9f\x92\x8b"', 2, 6)
-        if sys.maxunicode == 65535:
-            assert buf == r"\U0000d83d\U0000dc8b"
-        else:
-            assert buf == r"\U0001f48b"
+        assert buf == r"\U0001f48b"
diff --git a/pypy/interpreter/test/test_argument.py b/pypy/interpreter/test/test_argument.py
--- a/pypy/interpreter/test/test_argument.py
+++ b/pypy/interpreter/test/test_argument.py
@@ -54,6 +54,9 @@
     pass
 
 class DummySpace(object):
+    class sys:
+        defaultencoding = 'utf-8'
+
     def newtuple(self, items):
         return tuple(items)
 
diff --git a/pypy/interpreter/test/test_gateway.py b/pypy/interpreter/test/test_gateway.py
--- a/pypy/interpreter/test/test_gateway.py
+++ b/pypy/interpreter/test/test_gateway.py
@@ -535,25 +535,33 @@
         w_app_g3_r = space.wrap(app_g3_r)
         raises(gateway.OperationError,space.call_function,w_app_g3_r,w(1.0))
 
-    def test_interp2app_unwrap_spec_unicode(self):
+    def test_interp2app_unwrap_spec_utf8(self):
         space = self.space
         w = space.wrap
-        def g3_u(space, uni):
-            return space.wrap(len(uni))
+        def g3_u(space, utf8):
+            return space.wrap(utf8)
         app_g3_u = gateway.interp2app_temp(g3_u,
                                          unwrap_spec=[gateway.ObjSpace,
-                                                      unicode])
+                                                      'utf8'])
         w_app_g3_u = space.wrap(app_g3_u)
+        encoded = u"gęść".encode('utf8')
         assert self.space.eq_w(
-            space.call_function(w_app_g3_u, w(u"foo")),
-            w(3))
+            space.call_function(w_app_g3_u, w(u"gęść")),
+            w(encoded))
         assert self.space.eq_w(
-            space.call_function(w_app_g3_u, w("baz")),
-            w(3))
+            space.call_function(w_app_g3_u, w("foo")),
+            w("foo"))
         raises(gateway.OperationError, space.call_function, w_app_g3_u,
                w(None))
         raises(gateway.OperationError, space.call_function, w_app_g3_u,
                w(42))
+        # XXX this part of the test seems wrong, why would "\x80" fail?
+        # w_ascii = space.appexec([], """():
+        #     import sys
+        #     return sys.getdefaultencoding() == 'ascii'""")
+        # if space.is_true(w_ascii):
+        #     raises(gateway.OperationError, space.call_function, w_app_g3_u,
+        #            w("\x80"))
 
     def test_interp2app_unwrap_spec_unwrapper(self):
         space = self.space
diff --git a/pypy/interpreter/test/test_objspace.py b/pypy/interpreter/test/test_objspace.py
--- a/pypy/interpreter/test/test_objspace.py
+++ b/pypy/interpreter/test/test_objspace.py
@@ -216,9 +216,7 @@
         space = self.space
         w = space.wrap
         assert space.text0_w(w("123")) == "123"
-        exc = space.raises_w(space.w_TypeError, space.text0_w, w("123\x004"))
-        assert space.unicode0_w(w(u"123")) == u"123"
-        exc = space.raises_w(space.w_TypeError, space.unicode0_w, w(u"123\x004"))
+        space.raises_w(space.w_TypeError, space.text0_w, w("123\x004"))
 
     def test_getindex_w(self):
         w_instance1 = self.space.appexec([], """():
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,53 +1,93 @@
 import pytest
+try:
+    from hypothesis import given, strategies
+    HAS_HYPOTHESIS = True
+except ImportError:
+    HAS_HYPOTHESIS = False
 import struct
 import sys
-from pypy.interpreter.unicodehelper import (
-    encode_utf8, decode_utf8, unicode_encode_utf_32_be)
 
-class FakeSpace:
-    pass
+from rpython.rlib import rutf8
 
-def test_encode_utf8():
-    space = FakeSpace()
-    assert encode_utf8(space, u"abc") == "abc"
-    assert encode_utf8(space, u"\u1234") == "\xe1\x88\xb4"
-    assert encode_utf8(space, u"\ud800") == "\xed\xa0\x80"
-    assert encode_utf8(space, u"\udc00") == "\xed\xb0\x80"
-    # for the following test, go to lengths to avoid CPython's optimizer
-    # and .pyc file storage, which collapse the two surrogates into one
-    c = u"\udc00"
-    assert encode_utf8(space, u"\ud800" + c) == "\xf0\x90\x80\x80"
+from pypy.interpreter.unicodehelper import str_decode_utf8
+from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
+from pypy.interpreter import unicodehelper as uh
+from pypy.module._codecs.interp_codecs import CodecState
+
+def decode_utf8(u):
+    return str_decode_utf8(u, True, "strict", None)
 
 def test_decode_utf8():
-    space = FakeSpace()
-    assert decode_utf8(space, "abc") == u"abc"
-    assert decode_utf8(space, "\xe1\x88\xb4") == u"\u1234"
-    assert decode_utf8(space, "\xed\xa0\x80") == u"\ud800"
-    assert decode_utf8(space, "\xed\xb0\x80") == u"\udc00"
-    got = decode_utf8(space, "\xed\xa0\x80\xed\xb0\x80")
-    assert map(ord, got) == [0xd800, 0xdc00]
-    got = decode_utf8(space, "\xf0\x90\x80\x80")
-    if sys.maxunicode > 65535:
-        assert map(ord, got) == [0x10000]
-    else:
-        assert map(ord, got) == [55296, 56320]
+    assert decode_utf8("abc") == ("abc", 3, 3)
+    assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 3, 1)
+    assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 3, 1)
+    assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 3, 1)
+    assert decode_utf8("\xed\xa0\x80\xed\xb0\x80") == (
+        "\xed\xa0\x80\xed\xb0\x80", 6, 2)
+    assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 4, 1)
 
-@pytest.mark.parametrize('unich', [u"\ud800", u"\udc80"])
-def test_utf32_surrogates(unich):
-    assert (unicode_encode_utf_32_be(unich, 1, None) ==
-            struct.pack('>i', ord(unich)))
-    with pytest.raises(UnicodeEncodeError):
-        unicode_encode_utf_32_be(unich, 1, None, allow_surrogates=False)
+def test_utf8_encode_ascii():
+    assert utf8_encode_ascii("abc", "??", "??") == "abc"
+    def eh(errors, encoding, reason, p, start, end):
+        lst.append((errors, encoding, p, start, end))
+        return "<FOO>", end
+    lst = []
+    input = u"\u1234".encode("utf8")
+    assert utf8_encode_ascii(input, "??", eh) == "<FOO>"
+    assert lst == [("??", "ascii", input, 0, 1)]
+    lst = []
+    input = u"\u1234\u5678abc\u8765\u4321".encode("utf8")
+    assert utf8_encode_ascii(input, "??", eh) == "<FOO>abc<FOO>"
+    assert lst == [("??", "ascii", input, 0, 2),
+                   ("??", "ascii", input, 5, 7)]
 
-    def replace_with(ru, rs):
-        def errorhandler(errors, enc, msg, u, startingpos, endingpos):
-            if errors == 'strict':
-                raise UnicodeEncodeError(enc, u, startingpos, endingpos, msg)
-            return ru, rs, endingpos
-        return unicode_encode_utf_32_be(
-            u"<%s>" % unich, 3, None,
-            errorhandler, allow_surrogates=False)
+if HAS_HYPOTHESIS:
+    @given(strategies.text())
+    def test_utf8_encode_ascii_2(u):
+        def eh(errors, encoding, reason, p, start, end):
+            return "?" * (end - start), end
 
-    assert replace_with(u'rep', None) == u'<rep>'.encode('utf-32-be')
-    assert (replace_with(None, '\xca\xfe\xca\xfe') ==
-            '\x00\x00\x00<\xca\xfe\xca\xfe\x00\x00\x00>')
+        assert utf8_encode_ascii(u.encode("utf8"),
+                                "replace", eh) == u.encode("ascii", "replace")
+
+def test_str_decode_ascii():
+    assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3)
+    def eh(errors, encoding, reason, p, start, end):
+        lst.append((errors, encoding, p, start, end))
+        return u"\u1234\u5678".encode("utf8"), end
+    lst = []
+    input = "\xe8"
+    exp = u"\u1234\u5678".encode("utf8")
+    assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2)
+    assert lst == [("??", "ascii", input, 0, 1)]
+    lst = []
+    input = "\xe8\xe9abc\xea\xeb"
+    assert str_decode_ascii(input, "??", True, eh) == (
+        exp + exp + "abc" + exp + exp, 7, 11)
+    assert lst == [("??", "ascii", input, 0, 1),
+                   ("??", "ascii", input, 1, 2),
+                   ("??", "ascii", input, 5, 6),
+                   ("??", "ascii", input, 6, 7)]
+if HAS_HYPOTHESIS:
+    @given(strategies.text())
+    def test_unicode_raw_escape(u):
+        r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict', None)
+        assert r == u.encode("raw-unicode-escape")
+
+    @given(strategies.text())
+    def test_unicode_escape(u):
+        r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None)
+        assert r == u.encode("unicode-escape")
+
+def test_encode_decimal(space):
+    assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 '
+    with pytest.raises(ValueError):
+        uh.unicode_encode_decimal(u' 12, \u1234 '.encode('utf8'), None)
+    state = space.fromcache(CodecState)
+    handler = state.encode_error_handler
+    assert uh.unicode_encode_decimal(
+        u'u\u1234\u1235v'.encode('utf8'), 'replace', handler) == 'u??v'
+
+    result = uh.unicode_encode_decimal(
+        u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler)
+    assert result == '12&#4660;'
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,11 +1,12 @@
+import sys
+
+from pypy.interpreter.error import OperationError, oefmt
 from rpython.rlib.objectmodel import specialize
-from rpython.rlib.rarithmetic import intmask
-from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
-from rpython.rlib import runicode
-from rpython.rlib.runicode import (
-    default_unicode_error_encode, default_unicode_error_decode,
-    MAXUNICODE, BYTEORDER, BYTEORDER2, UNICHR)
-from pypy.interpreter.error import OperationError
+from rpython.rlib.rstring import StringBuilder
+from rpython.rlib import rutf8
+from rpython.rlib.rarithmetic import r_uint, intmask
+from rpython.rtyper.lltypesystem import rffi
+from pypy.module.unicodedata import unicodedb
 
 @specialize.memo()
 def decode_error_handler(space):
@@ -20,90 +21,982 @@
                                              space.newtext(msg)]))
     return raise_unicode_exception_decode
 
+def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
+    assert startingpos >= 0
+    ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]]
+    return ''.join(ux), endingpos, 'b'
+
 @specialize.memo()
 def encode_error_handler(space):
     # Fast version of the "strict" errors handler.
-    def raise_unicode_exception_encode(errors, encoding, msg, u,
+    def raise_unicode_exception_encode(errors, encoding, msg, utf8,
                                        startingpos, endingpos):
+        u_len = rutf8.get_utf8_length(utf8)
         raise OperationError(space.w_UnicodeEncodeError,
                              space.newtuple([space.newtext(encoding),
-                                             space.newunicode(u),
+                                             space.newutf8(utf8, u_len),
                                              space.newint(startingpos),
                                              space.newint(endingpos),
                                              space.newtext(msg)]))
     return raise_unicode_exception_encode
 
+def default_error_encode(
+        errors, encoding, msg, u, startingpos, endingpos):
+    """A default handler, for tests"""
+    assert endingpos >= 0
+    if errors == 'replace':
+        return '?', endingpos
+    if errors == 'ignore':
+        return '', endingpos
+    raise ValueError
+
 # ____________________________________________________________
+_WIN32 = sys.platform == 'win32'
+_MACOSX = sys.platform == 'darwin'
+
 
 def encode(space, w_data, encoding=None, errors='strict'):
     from pypy.objspace.std.unicodeobject import encode_object
     return encode_object(space, w_data, encoding, errors)
 
-# These functions take and return unwrapped rpython strings and unicodes
+
+def _has_surrogate(u):
+    for c in u:
+        if 0xD800 <= ord(c) <= 0xDFFF:
+            return True
+    return False
+
+# These functions take and return unwrapped rpython strings
 def decode_unicode_escape(space, string):
     from pypy.module._codecs import interp_codecs
     state = space.fromcache(interp_codecs.CodecState)
     unicodedata_handler = state.get_unicodedata_handler(space)
-    result, consumed = runicode.str_decode_unicode_escape(
-        string, len(string), "strict",
-        final=True, errorhandler=decode_error_handler(space),
-        unicodedata_handler=unicodedata_handler)
-    return result
+    result_utf8, consumed, length = str_decode_unicode_escape(
+        string, "strict",
+        final=True,
+        errorhandler=decode_error_handler(space),
+        ud_handler=unicodedata_handler)
+    return result_utf8, length
 
 def decode_raw_unicode_escape(space, string):
-    result, consumed = runicode.str_decode_raw_unicode_escape(
-        string, len(string), "strict",
+    result_utf8, consumed, lgt = str_decode_raw_unicode_escape(
+        string, "strict",
         final=True, errorhandler=decode_error_handler(space))
-    return result
+    return result_utf8, lgt
 
-def decode_utf8(space, string):
+def check_ascii_or_raise(space, string):
+    try:
+        rutf8.check_ascii(string)
+    except rutf8.CheckError as e:
+        decode_error_handler(space)('strict', 'ascii',
+                                    'ordinal not in range(128)', string,
+                                    e.pos, e.pos + 1)
+        assert False, "unreachable"
+
+def check_utf8_or_raise(space, string, start=0, end=-1):
     # Surrogates are accepted and not treated specially at all.
     # If there happen to be two 3-bytes encoding a pair of surrogates,
     # you still get two surrogate unicode characters in the result.
     # These are the Python2 rules; Python3 differs.
-    result, consumed = runicode.str_decode_utf_8(
-        string, len(string), "strict",
-        final=True, errorhandler=decode_error_handler(space),
-        allow_surrogates=True)
-    return result
+    try:
+        length = rutf8.check_utf8(string, True, start, end)
+    except rutf8.CheckError as e:
+        # convert position into unicode position
+        lgt = rutf8.check_utf8(string, True, start, stop=e.pos)
+        decode_error_handler(space)('strict', 'utf8', 'invalid utf-8', string,
+                                    start + lgt, start + lgt + 1)
+        assert False, "unreachable"
+    return length
 
-def encode_utf8(space, uni):
-    # Note that this function never raises UnicodeEncodeError,
-    # since surrogates are allowed, either paired or lone.
-    # A paired surrogate is considered like the non-BMP character
-    # it stands for.  These are the Python2 rules; Python3 differs.
+def str_decode_ascii(s, errors, final, errorhandler):
+    try:
+        rutf8.check_ascii(s)
+        return s, len(s), len(s)
+    except rutf8.CheckError:
+        return _str_decode_ascii_slowpath(s, errors, final, errorhandler)
+
+def _str_decode_ascii_slowpath(s, errors, final, errorhandler):
+    i = 0
+    res = StringBuilder()
+    while i < len(s):
+        ch = s[i]
+        if ord(ch) > 0x7F:
+            r, i = errorhandler(errors, 'ascii', 'ordinal not in range(128)',
+                s, i, i + 1)
+            res.append(r)
+        else:
+            res.append(ch)
+            i += 1
+    ress = res.build()
+    lgt = rutf8.check_utf8(ress, True)
+    return ress, len(s), lgt
+
+def str_decode_latin_1(s, errors, final, errorhandler):
+    try:
+        rutf8.check_ascii(s)
+        return s, len(s), len(s)
+    except rutf8.CheckError:
+        return _str_decode_latin_1_slowpath(s, errors, final, errorhandler)
+
+def _str_decode_latin_1_slowpath(s, errors, final, errorhandler):
+    res = StringBuilder(len(s))
+    i = 0
+    while i < len(s):
+        if ord(s[i]) > 0x7F:
+            while i < len(s) and ord(s[i]) > 0x7F:
+                rutf8.unichr_as_utf8_append(res, ord(s[i]))
+                i += 1
+        else:
+            start = i
+            end = i + 1
+            while end < len(s) and ord(s[end]) <= 0x7F:
+                end += 1
+            res.append_slice(s, start, end)
+            i = end
+    # cannot be ASCII, cannot have surrogates, I believe
+    return res.build(), len(s), len(s)
+
+def utf8_encode_latin_1(s, errors, errorhandler):
+    try:
+        rutf8.check_ascii(s)
+        return s
+    except rutf8.CheckError:
+        return _utf8_encode_latin_1_slowpath(s, errors, errorhandler)
+
+def _utf8_encode_latin_1_slowpath(s, errors, errorhandler):
+    size = len(s)
+    result = StringBuilder(size)
+    index = 0
+    pos = 0
+    while pos < size:
+        ch = rutf8.codepoint_at_pos(s, pos)
+        if ch <= 0xFF:
+            result.append(chr(ch))
+            index += 1
+            pos = rutf8.next_codepoint_pos(s, pos)
+        else:
+            startindex = index
+            pos = rutf8.next_codepoint_pos(s, pos)
+            index += 1
+            while pos < size and rutf8.codepoint_at_pos(s, pos) > 0xFF:
+                pos = rutf8.next_codepoint_pos(s, pos)
+                index += 1
+            msg = "ordinal not in range(256)"
+            res_8, newindex = errorhandler(
+                errors, 'latin1', msg, s, startindex, index)
+            for cp in rutf8.Utf8StringIterator(res_8):
+                if cp > 0xFF:
+                    errorhandler("strict", 'latin1', msg, s, startindex, index)
+                result.append(chr(cp))
+            if index != newindex:  # Should be uncommon
+                index = newindex
+                pos = rutf8._pos_at_index(s, newindex)
+    return result.build()
+
+def utf8_encode_ascii(s, errors, errorhandler):
+    """ Don't be confused - this is a slowpath for errors e.g. "ignore"
+    or an obscure errorhandler
+    """
+    size = len(s)
+    result = StringBuilder(size)
+    index = 0
+    pos = 0
+    while pos < size:
+        ch = rutf8.codepoint_at_pos(s, pos)
+        if ch <= 0x7F:
+            result.append(chr(ch))
+            index += 1
+            pos = rutf8.next_codepoint_pos(s, pos)
+        else:
+            startindex = index
+            pos = rutf8.next_codepoint_pos(s, pos)
+            index += 1
+            while pos < size and rutf8.codepoint_at_pos(s, pos) > 0x7F:
+                pos = rutf8.next_codepoint_pos(s, pos)
+                index += 1
+            msg = "ordinal not in range(128)"
+            res_8, newindex = errorhandler(
+                errors, 'ascii', msg, s, startindex, index)
+            for cp in rutf8.Utf8StringIterator(res_8):
+                if cp > 0x7F:
+                    errorhandler("strict", 'ascii', msg, s, startindex, index)
+                result.append(chr(cp))
+            if index != newindex:  # Should be uncommon
+                index = newindex
+                pos = rutf8._pos_at_index(s, newindex)
+    return result.build()
+
+if sys.platform == 'win32':
+    def utf8_encode_mbcs(s, errors, errorhandler):
+        from rpython.rlib import runicode
+        s = s.decode('utf-8')
+        slen = len(s)
+        res = runicode.unicode_encode_mbcs(s, slen, errors, errorhandler)
+        return res
+        
+    def str_decode_mbcs(s, errors, final, errorhandler):
+        from rpython.rlib import runicode
+        slen = len(s)
+        res, size = runicode.str_decode_mbcs(s, slen, final=final, errors=errors,
+                                           errorhandler=errorhandler)
+        return res.encode('utf8'), size, len(res)
+
+def str_decode_utf8(s, errors, final, errorhandler):
+    """ Same as checking for the valid utf8, but we know the utf8 is not
+    valid so we're trying to either raise or pack stuff with error handler.
+    The key difference is that this is call_may_force
+    """
+    slen = len(s)
+    res = StringBuilder(slen)
+    pos = 0
+    end = len(s)
+    suppressing = False # we are in a chain of "bad" unicode, only emit one fix
+    while pos < end:
+        ordch1 = ord(s[pos])
+        # fast path for ASCII
+        if ordch1 <= 0x7F:
+            pos += 1
+            res.append(chr(ordch1))
+            suppressing = False
+            continue
+
+        if ordch1 <= 0xC1:
+            r, pos = errorhandler(errors, "utf8", "invalid start byte",
+                    s, pos, pos + 1)
+            if not suppressing:
+                res.append(r)
+            continue
+
+        pos += 1
+
+        if ordch1 <= 0xDF:
+            if pos >= end:
+                if not final:
+                    pos -= 1
+                    break
+                r, pos = errorhandler(errors, "utf8", "unexpected end of data",
+                    s, pos - 1, pos)
+                if not suppressing:
+                    res.append(r)
+                continue
+            ordch2 = ord(s[pos])
+
+            if rutf8._invalid_byte_2_of_2(ordch2):
+                r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
+                    s, pos - 1, pos)
+                if not suppressing:
+                    res.append(r)
+                continue
+            # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
+            pos += 1
+            res.append(chr(ordch1))
+            res.append(chr(ordch2))
+            continue
+
+        if ordch1 <= 0xEF:
+            if (pos + 2) > end:
+                if not final:
+                    pos -= 1
+                    break
+                r, pos = errorhandler(errors, "utf8", "unexpected end of data",
+                    s, pos - 1, pos)
+                res.append(r)
+                suppressing = True
+                continue
+            ordch2 = ord(s[pos])
+            ordch3 = ord(s[pos + 1])
+
+            if rutf8._invalid_byte_2_of_3(ordch1, ordch2, True):
+                r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
+                    s, pos - 1, pos)
+                if not suppressing:
+                    res.append(r)
+                continue
+            elif rutf8._invalid_byte_3_of_3(ordch3):
+                r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
+                    s, pos - 1, pos + 1)
+                if not suppressing:
+                    res.append(r)
+                continue
+            pos += 2
+
+            # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
+            res.append(chr(ordch1))
+            res.append(chr(ordch2))
+            res.append(chr(ordch3))
+            suppressing = False
+            continue
+
+        if ordch1 <= 0xF4:
+            if (pos + 3) > end:
+                if not final:
+                    pos -= 1
+                    break
+                r, pos = errorhandler(errors, "utf8", "unexpected end of data",
+                    s, pos - 1, pos)
+                res.append(r)
+                suppressing = True
+                continue
+            ordch2 = ord(s[pos])
+            ordch3 = ord(s[pos + 1])
+            ordch4 = ord(s[pos + 2])
+
+            if rutf8._invalid_byte_2_of_4(ordch1, ordch2):
+                r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
+                    s, pos - 1, pos)
+                if not suppressing:
+                    res.append(r)
+                continue
+            elif rutf8._invalid_byte_3_of_4(ordch3):
+                r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
+                    s, pos - 1, pos + 1)
+                res.append(r)
+                continue
+            elif rutf8._invalid_byte_4_of_4(ordch4):
+                r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
+                    s, pos - 1, pos + 2)
+                if not suppressing:
+                    res.append(r)
+                continue
+
+            pos += 3
+            # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
+            res.append(chr(ordch1))
+            res.append(chr(ordch2))
+            res.append(chr(ordch3))
+            res.append(chr(ordch4))
+            suppressing = False
+            continue
+
+        r, pos = errorhandler(errors, "utf8", "invalid start byte",
+                s, pos - 1, pos)
+        if not suppressing:
+            res.append(r)
+
+    r = res.build()
+    return r, pos, rutf8.check_utf8(r, True)
+
+hexdigits = "0123456789ABCDEFabcdef"
+
+def hexescape(builder, s, pos, digits,
+              encoding, errorhandler, message, errors):
+    chr = 0
+    if pos + digits > len(s):
+        endinpos = pos
+        while endinpos < len(s) and s[endinpos] in hexdigits:
+            endinpos += 1
+        res, pos = errorhandler(
+            errors, encoding, message, s, pos - 2, endinpos)
+        builder.append(res)
+    else:
+        try:
+            chr = int(s[pos:pos + digits], 16)
+        except ValueError:
+            endinpos = pos
+            while s[endinpos] in hexdigits:
+                endinpos += 1
+            res, pos = errorhandler(
+                errors, encoding, message, s, pos - 2, endinpos)
+            builder.append(res)
+        else:
+            # when we get here, chr is a 32-bit unicode character
+            try:
+                builder.append_code(chr)
+                pos += digits
+            except ValueError:
+                message = "illegal Unicode character"
+                res, pos = errorhandler(
+                    errors, encoding, message, s, pos - 2, pos + digits)
+                builder.append(res)
+    return pos
+
+def str_decode_unicode_escape(s, errors, final, errorhandler, ud_handler):
+    size = len(s)
+    if size == 0:
+        return '', 0, 0
+
+    builder = rutf8.Utf8StringBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+
+        # Non-escape characters are interpreted as Unicode ordinals
+        if ch != '\\':
+            if ord(ch) > 0x7F:
+                builder.append_code(ord(ch))
+            else:
+                builder.append(ch)
+            pos += 1
+            continue
+
+        # - Escapes
+        pos += 1
+        if pos >= size:
+            message = "\\ at end of string"
+            res, pos = errorhandler(errors, "unicodeescape",
+                                    message, s, pos - 1, size)
+            builder.append(res)
+            continue
+
+        ch = s[pos]
+        pos += 1
+        # \x escapes
+        if ch == '\n':
+            pass
+        elif ch == '\\':
+            builder.append_char('\\')
+        elif ch == '\'':
+            builder.append_char('\'')
+        elif ch == '\"':
+            builder.append_char('\"')
+        elif ch == 'b':
+            builder.append_char('\b')
+        elif ch == 'f':
+            builder.append_char('\f')
+        elif ch == 't':
+            builder.append_char('\t')
+        elif ch == 'n':
+            builder.append_char('\n')
+        elif ch == 'r':
+            builder.append_char('\r')
+        elif ch == 'v':
+            builder.append_char('\v')
+        elif ch == 'a':
+            builder.append_char('\a')
+        elif '0' <= ch <= '7':
+            x = ord(ch) - ord('0')
+            if pos < size:
+                ch = s[pos]
+                if '0' <= ch <= '7':
+                    pos += 1
+                    x = (x << 3) + ord(ch) - ord('0')
+                    if pos < size:
+                        ch = s[pos]
+                        if '0' <= ch <= '7':
+                            pos += 1
+                            x = (x << 3) + ord(ch) - ord('0')
+            if x > 0x7F:
+                builder.append_code(x)
+            else:
+                builder.append_char(chr(x))
+        # hex escapes
+        # \xXX
+        elif ch == 'x':
+            digits = 2
+            message = "truncated \\xXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+        # \uXXXX
+        elif ch == 'u':
+            digits = 4
+            message = "truncated \\uXXXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+        #  \UXXXXXXXX
+        elif ch == 'U':
+            digits = 8
+            message = "truncated \\UXXXXXXXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            "unicodeescape", errorhandler, message, errors)
+        # \N{name}
+        elif ch == 'N' and ud_handler is not None:
+            message = "malformed \\N character escape"
+            look = pos
+
+            if look < size and s[look] == '{':
+                # look for the closing brace
+                while look < size and s[look] != '}':
+                    look += 1
+                if look < size and s[look] == '}':
+                    # found a name.  look it up in the unicode database
+                    message = "unknown Unicode character name"
+                    name = s[pos + 1:look]
+                    code = ud_handler.call(name)
+                    if code < 0:
+                        res, pos = errorhandler(
+                            errors, "unicodeescape", message,
+                            s, pos - 1, look + 1)
+                        builder.append(res)
+                        continue
+                    pos = look + 1
+                    builder.append_code(code)
+                else:
+                    res, pos = errorhandler(errors, "unicodeescape",
+                                            message, s, pos - 1, look + 1)
+                    builder.append(res)
+            else:
+                res, pos = errorhandler(errors, "unicodeescape",
+                                        message, s, pos - 1, look + 1)
+                builder.append(res)
+        else:
+            builder.append_char('\\')
+            builder.append_code(ord(ch))
+
+    return builder.build(), pos, builder.getlength()
+
+def wcharpsize2utf8(space, wcharp, size):
+    """Safe version of rffi.wcharpsize2utf8.
+
+    Raises app-level ValueError if any wchar value is outside the valid
+    codepoint range.
+    """
+    try:
+        return rffi.wcharpsize2utf8(wcharp, size)
+    except ValueError:
+        raise oefmt(space.w_ValueError,
+            "character is not in range [U+0000; U+10ffff]")
+
+
+# ____________________________________________________________
+# Raw unicode escape
+
+def str_decode_raw_unicode_escape(s, errors, final=False,
+                                  errorhandler=None):
+    size = len(s)
+    if size == 0:
+        return '', 0, 0
+
+    builder = rutf8.Utf8StringBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+
+        # Non-escape characters are interpreted as Unicode ordinals
+        if ch != '\\':
+            builder.append_code(ord(ch))
+            pos += 1
+            continue
+
+        # \u-escapes are only interpreted iff the number of leading
+        # backslashes is odd
+        bs = pos
+        while pos < size:
+            pos += 1
+            if pos == size or s[pos] != '\\':
+                break
+            builder.append_char('\\')
+
+        # we have a backslash at the end of the string, stop here
+        if pos >= size:
+            builder.append_char('\\')
+            break
+
+        if ((pos - bs) & 1 == 0 or pos >= size or
+                (s[pos] != 'u' and s[pos] != 'U')):
+            builder.append_char('\\')
+            builder.append_code(ord(s[pos]))
+            pos += 1
+            continue
+
+        digits = 4 if s[pos] == 'u' else 8
+        message = "truncated \\uXXXX"
+        pos += 1
+        pos = hexescape(builder, s, pos, digits,
+                           "rawunicodeescape", errorhandler, message, errors)
+
+    return builder.build(), pos, builder.getlength()
+
+_utf8_encode_unicode_escape = rutf8.make_utf8_escape_function()
+
+
+TABLE = '0123456789abcdef'
+
+def raw_unicode_escape_helper(result, char):
+    if char >= 0x10000 or char < 0:
+        result.append("\\U")
+        zeros = 8
+    elif char >= 0x100:
+        result.append("\\u")
+        zeros = 4
+    else:
+        result.append("\\x")
+        zeros = 2
+    for i in range(zeros-1, -1, -1):
+        result.append(TABLE[(char >> (4 * i)) & 0x0f])
+
+def utf8_encode_raw_unicode_escape(s, errors, errorhandler):
+    # errorhandler is not used: this function cannot cause Unicode errors
+    size = len(s)
+    if size == 0:
+        return ''
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        oc = rutf8.codepoint_at_pos(s, pos)
+
+        if oc < 0x100:
+            result.append(chr(oc))
+        else:
+            raw_unicode_escape_helper(result, oc)
+        pos = rutf8.next_codepoint_pos(s, pos)
+
+    return result.build()
+
+
+def utf8_encode_unicode_escape(s, errors, errorhandler):
+    return _utf8_encode_unicode_escape(s)
+
+# ____________________________________________________________
+# utf-7
+
+# Three simple macros defining base-64
+
+def _utf7_IS_BASE64(oc):
+    "Is c a base-64 character?"
+    c = chr(oc)
+    return c.isalnum() or c == '+' or c == '/'
+def _utf7_TO_BASE64(n):
+    "Returns the base-64 character of the bottom 6 bits of n"
+    return "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[n & 0x3f]
+def _utf7_FROM_BASE64(c):
+    "given that c is a base-64 character, what is its base-64 value?"
+    if c >= 'a':
+        return ord(c) - 71
+    elif c >= 'A':
+        return ord(c) - 65
+    elif c >= '0':
+        return ord(c) + 4
+    elif c == '+':
+        return 62
+    else: # c == '/'
+        return 63
+
+def _utf7_DECODE_DIRECT(oc):
+    return oc <= 127 and oc != ord('+')
+
+# The UTF-7 encoder treats ASCII characters differently according to
+# whether they are Set D, Set O, Whitespace, or special (i.e. none of
+# the above).  See RFC2152.  This array identifies these different
+# sets:
+# 0 : "Set D"
+#      alphanumeric and '(),-./:?
+# 1 : "Set O"
+#     !"#$%&*;<=>@[]^_`{|}
+# 2 : "whitespace"
+#     ht nl cr sp
+# 3 : special (must be base64 encoded)
+#     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
+
+utf7_category = [
+#  nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
+#  dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+#  sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /
+    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
+#   0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
+#   @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
+    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+#   P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
+#   `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o
+    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+#   p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
+]
+
+# ENCODE_DIRECT: this character should be encoded as itself.  The
+# answer depends on whether we are encoding set O as itself, and also
+# on whether we are encoding whitespace as itself.  RFC2152 makes it
+# clear that the answers to these questions vary between
+# applications, so this code needs to be flexible.
+
+def _utf7_ENCODE_DIRECT(oc, directO, directWS):
+    return(oc < 128 and oc > 0 and
+           (utf7_category[oc] == 0 or
+            (directWS and utf7_category[oc] == 2) or
+            (directO and utf7_category[oc] == 1)))
+
+def _utf7_ENCODE_CHAR(result, oc, base64bits, base64buffer):
+    if oc >= 0x10000:
+        # code first surrogate
+        base64bits += 16
+        base64buffer = (base64buffer << 16) | 0xd800 | ((oc-0x10000) >> 10)
+        while base64bits >= 6:
+            result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6)))
+            base64bits -= 6
+        # prepare second surrogate
+        oc = 0xDC00 | ((oc-0x10000) & 0x3FF)
+    base64bits += 16
+    base64buffer = (base64buffer << 16) | oc
+    while base64bits >= 6:
+        result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6)))
+        base64bits -= 6
+    return base64bits, base64buffer
+
+def str_decode_utf_7(s, errors, final=False,
+                     errorhandler=None):
+    size = len(s)
+    if size == 0:
+        return '', 0, 0
+
+    inShift = False
+    base64bits = 0
+    base64buffer = 0
+    surrogate = 0
+    outsize = 0
+
+    result = StringBuilder(size)
+    pos = 0
+    shiftOutStartPos = 0
+    startinpos = 0
+    while pos < size:
+        ch = s[pos]
+
+        if inShift: # in a base-64 section
+            if _utf7_IS_BASE64(ord(ch)): #consume a base-64 character
+                base64buffer = (base64buffer << 6) | _utf7_FROM_BASE64(ch)
+                assert base64buffer >= 0
+                base64bits += 6
+                pos += 1
+
+                if base64bits >= 16:
+                    # enough bits for a UTF-16 value
+                    outCh = base64buffer >> (base64bits - 16)
+                    assert outCh >= 0
+                    base64bits -= 16
+                    base64buffer &= (1 << base64bits) - 1 # clear high bits
+                    assert outCh <= 0xffff
+                    if surrogate:
+                        # expecting a second surrogate
+                        if outCh >= 0xDC00 and outCh <= 0xDFFF:
+                            code = (((surrogate & 0x3FF)<<10) |
+                                        (outCh & 0x3FF)) + 0x10000
+                            rutf8.unichr_as_utf8_append(result, code)
+                            outsize += 1
+                            surrogate = 0
+                            continue
+                        else:
+                            rutf8.unichr_as_utf8_append(result, surrogate,
+                                                        allow_surrogates=True)
+                            outsize += 1
+                            surrogate = 0
+                            # Not done with outCh: falls back to next line
+                    if outCh >= 0xD800 and outCh <= 0xDBFF:
+                        # first surrogate
+                        surrogate = outCh
+                    else:
+                        outsize += 1
+                        assert outCh >= 0
+                        rutf8.unichr_as_utf8_append(result, outCh, True)
+
+            else:
+                # now leaving a base-64 section
+                inShift = False
+
+                if base64bits > 0: # left-over bits
+                    if base64bits >= 6:
+                        # We've seen at least one base-64 character
+                        pos += 1
+                        msg = "partial character in shift sequence"
+                        res, pos = errorhandler(errors, 'utf7',
+                                                msg, s, pos-1, pos)
+                        reslen = rutf8.check_utf8(res, True)
+                        outsize += reslen
+                        result.append(res)
+                        continue
+                    else:
+                        # Some bits remain; they should be zero
+                        if base64buffer != 0:
+                            pos += 1
+                            msg = "non-zero padding bits in shift sequence"
+                            res, pos = errorhandler(errors, 'utf7',
+                                                    msg, s, pos-1, pos)
+                            reslen = rutf8.check_utf8(res, True)
+                            outsize += reslen
+                            result.append(res)
+                            continue
+
+                if surrogate and _utf7_DECODE_DIRECT(ord(ch)):
+                    outsize += 1
+                    rutf8.unichr_as_utf8_append(result, surrogate, True)
+                surrogate = 0
+
+                if ch == '-':
+                    # '-' is absorbed; other terminating characters are
+                    # preserved
+                    pos += 1
+
+        elif ch == '+':
+            startinpos = pos
+            pos += 1 # consume '+'
+            if pos < size and s[pos] == '-': # '+-' encodes '+'
+                pos += 1
+                result.append('+')
+                outsize += 1
+            else: # begin base64-encoded section
+                inShift = 1
+                surrogate = 0
+                shiftOutStartPos = result.getlength()
+                base64bits = 0
+                base64buffer = 0
+
+        elif _utf7_DECODE_DIRECT(ord(ch)): # character decodes at itself
+            result.append(ch)
+            outsize += 1
+            pos += 1
+        else:
+            startinpos = pos
+            pos += 1
+            msg = "unexpected special character"
+            res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos)
+            reslen = rutf8.check_utf8(res, True)
+            outsize += reslen
+            result.append(res)
+
+    # end of string
+    final_length = result.getlength()
+    if inShift and final: # in shift sequence, no more to follow
+        # if we're in an inconsistent state, that's an error
+        inShift = 0
+        if (surrogate or
+            base64bits >= 6 or
+            (base64bits > 0 and base64buffer != 0)):
+            msg = "unterminated shift sequence"
+            res, pos = errorhandler(errors, 'utf7', msg, s, shiftOutStartPos, pos)
+            reslen = rutf8.check_utf8(res, True)
+            outsize += reslen
+            result.append(res)
+            final_length = result.getlength()
+    elif inShift:
+        pos = startinpos
+        final_length = shiftOutStartPos # back off output
+
+    assert final_length >= 0
+    return result.build()[:final_length], pos, outsize
+
+def utf8_encode_utf_7(s, errors, errorhandler):
+    size = len(s)
+    if size == 0:
+        return ''
+    result = StringBuilder(size)
+
+    encodeSetO = encodeWhiteSpace = False
+
+    inShift = False
+    base64bits = 0
+    base64buffer = 0
+
+    pos = 0
+    while pos < size:
+        oc = rutf8.codepoint_at_pos(s, pos)
+        if not inShift:
+            if oc == ord('+'):
+                result.append('+-')
+            elif _utf7_ENCODE_DIRECT(oc, not encodeSetO, not encodeWhiteSpace):
+                result.append(chr(oc))
+            else:
+                result.append('+')
+                inShift = True
+                base64bits, base64buffer = _utf7_ENCODE_CHAR(
+                    result, oc, base64bits, base64buffer)
+        else:
+            if _utf7_ENCODE_DIRECT(oc, not encodeSetO, not encodeWhiteSpace):
+                # shifting out
+                if base64bits: # output remaining bits
+                    result.append(_utf7_TO_BASE64(base64buffer << (6-base64bits)))
+                    base64buffer = 0
+                    base64bits = 0
+
+                inShift = False
+                ## Characters not in the BASE64 set implicitly unshift the
+                ## sequence so no '-' is required, except if the character is
+                ## itself a '-'
+                if _utf7_IS_BASE64(oc) or oc == ord('-'):
+                    result.append('-')
+                result.append(chr(oc))
+            else:
+                base64bits, base64buffer = _utf7_ENCODE_CHAR(
+                    result, oc, base64bits, base64buffer)
+        pos = rutf8.next_codepoint_pos(s, pos)
+
+    if base64bits:
+        result.append(_utf7_TO_BASE64(base64buffer << (6 - base64bits)))
+    if inShift:
+        result.append('-')
+
+    return result.build()
+
+@specialize.memo()
+def _encode_unicode_error_handler(space):
+    # Fast version of the "strict" errors handler.
+    from rpython.rlib import runicode
+    def raise_unicode_exception_encode(errors, encoding, msg, uni,
+                                       startingpos, endingpos):
+        assert isinstance(uni, unicode)
+        u_len = len(uni)
+        utf8 = runicode.unicode_encode_utf8sp(uni, u_len)
+        raise OperationError(space.w_UnicodeEncodeError,
+                             space.newtuple([space.newtext(encoding),
+                                             space.newtext(utf8, u_len),
+                                             space.newint(startingpos),
+                                             space.newint(endingpos),
+                                             space.newtext(msg)]))
+        return u'', None, 0
+    return raise_unicode_exception_encode
+
+
+def encode_utf8(space, uni, allow_surrogates=False):
+    # Note that Python3 tends to forbid *all* surrogates in utf-8.
+    # If allow_surrogates=True, then revert to the Python 2 behavior
+    # which never raises UnicodeEncodeError.  Surrogate pairs are then
+    # allowed, either paired or lone.  A paired surrogate is considered
+    # like the non-BMP character it stands for.  See also *_utf8sp().
+    from rpython.rlib import runicode
+    assert isinstance(uni, unicode)
     return runicode.unicode_encode_utf_8(
         uni, len(uni), "strict",
-        errorhandler=None,
-        allow_surrogates=True)
+        errorhandler=_encode_unicode_error_handler(space),
+        allow_surrogates=allow_surrogates)
+
+def encode_utf8sp(space, uni, allow_surrogates=True):
+    xxx
+    # Surrogate-preserving utf-8 encoding.  Any surrogate character
+    # turns into its 3-bytes encoding, whether it is paired or not.
+    # This should always be reversible, and the reverse is
+    # decode_utf8sp().
+    from rpython.rlib import runicode
+    return runicode.unicode_encode_utf8sp(uni, len(uni))
+
+def decode_utf8sp(space, string):
+    # Surrogate-preserving utf-8 decoding.  Assuming there is no
+    # encoding error, it should always be reversible, and the reverse is
+    # encode_utf8sp().
+    return str_decode_utf8(string, "string", True, decode_never_raise,
+                           allow_surrogates=True)
+
 
 # ____________________________________________________________
 # utf-16
 
-def str_decode_utf_16(s, size, errors, final=True,
+BYTEORDER = sys.byteorder
+BYTEORDER2 = BYTEORDER[0] + 'e'      # either "le" or "be"
+assert BYTEORDER2 in ('le', 'be')
+
+def str_decode_utf_16(s, errors, final=True,
                       errorhandler=None):
-    result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
+    result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final,
                                                          errorhandler, "native")
-    return result, length
+    return result, c, lgt
 
-def str_decode_utf_16_be(s, size, errors, final=True,
+def str_decode_utf_16_be(s, errors, final=True,
+                        errorhandler=None):
+    result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final,
+                                                         errorhandler, "big")
+    return result, c, lgt
+
+def str_decode_utf_16_le(s, errors, final=True,
                          errorhandler=None):
-    result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
-                                                         errorhandler, "big")
-    return result, length
+    result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final,
+                                                         errorhandler, "little")
+    return result, c, lgt
 
-def str_decode_utf_16_le(s, size, errors, final=True,
-                         errorhandler=None):
-    result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
-                                                         errorhandler, "little")
-    return result, length
-
-def str_decode_utf_16_helper(s, size, errors, final=True,
+def str_decode_utf_16_helper(s, errors, final=True,
                              errorhandler=None,
                              byteorder="native",
                              public_encoding_name='utf16'):
-    if errorhandler is None:
-        errorhandler = default_unicode_error_decode
+    size = len(s)
     bo = 0
 
     if BYTEORDER == 'little':
@@ -140,7 +1033,7 @@
     else:
         bo = 1
     if size == 0:
-        return u'', 0, bo
+        return '', 0, 0, bo
     if bo == -1:
         # force little endian
         ihi = 1
@@ -151,7 +1044,7 @@
         ihi = 0
         ilo = 1
 
-    result = UnicodeBuilder(size // 2)
+    result = StringBuilder(size // 2)
 
     #XXX I think the errors are not correctly handled here
     while pos < size:
@@ -168,7 +1061,7 @@
         ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
         pos += 2
         if ch < 0xD800 or ch > 0xDFFF:
-            result.append(unichr(ch))
+            rutf8.unichr_as_utf8_append(result, ch)
             continue
         # UTF-16 code pair:
         if len(s) - pos < 2:
@@ -185,12 +1078,8 @@
             ch2 = (ord(s[pos+ihi]) << 8) | ord(s[pos+ilo])
             pos += 2
             if 0xDC00 <= ch2 <= 0xDFFF:
-                if MAXUNICODE < 65536:
-                    result.append(unichr(ch))
-                    result.append(unichr(ch2))
-                else:
-                    result.append(UNICHR((((ch & 0x3FF)<<10) |
-                                           (ch2 & 0x3FF)) + 0x10000))
+                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000
+                rutf8.unichr_as_utf8_append(result, ch)
                 continue
             else:
                 r, pos = errorhandler(errors, public_encoding_name,
@@ -202,7 +1091,9 @@
                                   "illegal encoding",
                                   s, pos - 2, pos)
             result.append(r)
-    return result.build(), pos, bo
+    r = result.build()
+    lgt = rutf8.check_utf8(r, True)
+    return result.build(), pos, lgt, bo
 
 def _STORECHAR(result, CH, byteorder):
     hi = chr(((CH) >> 8) & 0xff)
@@ -214,13 +1105,12 @@
         result.append(hi)
         result.append(lo)
 
-def unicode_encode_utf_16_helper(s, size, errors,
+def unicode_encode_utf_16_helper(s, errors,
                                  errorhandler=None,
                                  allow_surrogates=True,
                                  byteorder='little',
                                  public_encoding_name='utf16'):
-    if errorhandler is None:
-        errorhandler = default_unicode_error_encode
+    size = len(s)
     if size == 0:
         if byteorder == 'native':
             result = StringBuilder(2)
@@ -234,9 +1124,9 @@
         byteorder = BYTEORDER
 
     pos = 0
+    index = 0
     while pos < size:
-        ch = ord(s[pos])
-        pos += 1
+        ch = rutf8.codepoint_at_pos(s, pos)
 
         if ch < 0xD800:
             _STORECHAR(result, ch, byteorder)
@@ -246,78 +1136,76 @@
         elif ch >= 0xE000 or allow_surrogates:
             _STORECHAR(result, ch, byteorder)
         else:
-            ru, rs, pos = errorhandler(errors, public_encoding_name,
-                                       'surrogates not allowed',
-                                       s, pos-1, pos)
-            if rs is not None:
-                # py3k only
-                if len(rs) % 2 != 0:
-                    errorhandler('strict', public_encoding_name,
-                                 'surrogates not allowed',
-                                 s, pos-1, pos)
-                result.append(rs)
-                continue
-            for ch in ru:
-                if ord(ch) < 0xD800:
-                    _STORECHAR(result, ord(ch), byteorder)
+            res_8, newindex = errorhandler(
+                errors, public_encoding_name, 'surrogates not allowed',
+                s, pos, pos+1)
+            for cp in rutf8.Utf8StringIterator(res_8):
+                if cp < 0xD800:
+                    _STORECHAR(result, cp, byteorder)
                 else:
                     errorhandler('strict', public_encoding_name,
                                  'surrogates not allowed',
-                                 s, pos-1, pos)
+                                 s, pos, pos+1)
+            if index != newindex:  # Should be uncommon
+                index = newindex
+                pos = rutf8._pos_at_index(s, newindex)
             continue
 
+        pos = rutf8.next_codepoint_pos(s, pos)
+        index += 1
+
     return result.build()
 
-def unicode_encode_utf_16(s, size, errors,
+def utf8_encode_utf_16(s, errors,
                           errorhandler=None,
                           allow_surrogates=True):
-    return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
+    return unicode_encode_utf_16_helper(s, errors, errorhandler,
                                         allow_surrogates, "native")
 
-def unicode_encode_utf_16_be(s, size, errors,
+def utf8_encode_utf_16_be(s, errors,
                              errorhandler=None,
                              allow_surrogates=True):
-    return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
+    return unicode_encode_utf_16_helper(s, errors, errorhandler,
                                         allow_surrogates, "big")
 
-def unicode_encode_utf_16_le(s, size, errors,
+def utf8_encode_utf_16_le(s, errors,
                              errorhandler=None,
                              allow_surrogates=True):
-    return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
+    return unicode_encode_utf_16_helper(s, errors, errorhandler,
                                         allow_surrogates, "little")
 
-
 # ____________________________________________________________
 # utf-32
 
-def str_decode_utf_32(s, size, errors, final=True,
+def str_decode_utf_32(s, errors, final=True,
                       errorhandler=None):
-    result, length, byteorder = str_decode_utf_32_helper(
-        s, size, errors, final, errorhandler, "native")
-    return result, length
+    result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final,
+                                                         errorhandler, "native")
+    return result, c, lgt
 
-def str_decode_utf_32_be(s, size, errors, final=True,
+def str_decode_utf_32_be(s, errors, final=True,
                          errorhandler=None):
-    result, length, byteorder = str_decode_utf_32_helper(
-        s, size, errors, final, errorhandler, "big")
-    return result, length
+    result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final,
+                                                         errorhandler, "big")
+    return result, c, lgt
 
-def str_decode_utf_32_le(s, size, errors, final=True,
+def str_decode_utf_32_le(s, errors, final=True,
                          errorhandler=None):
-    result, length, byteorder = str_decode_utf_32_helper(
-        s, size, errors, final, errorhandler, "little")
-    return result, length
+    result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final,
+                                                         errorhandler, "little")
+    return result, c, lgt
 
-BOM32_DIRECT = intmask(0x0000FEFF)
+BOM32_DIRECT  = intmask(0x0000FEFF)
 BOM32_REVERSE = intmask(0xFFFE0000)
 
-def str_decode_utf_32_helper(s, size, errors, final=True,
-                             errorhandler=None,
+def str_decode_utf_32_helper(s, errors, final,
+                             errorhandler,
                              byteorder="native",
-                             public_encoding_name='utf32'):
-    if errorhandler is None:
-        errorhandler = default_unicode_error_decode
+                             public_encoding_name='utf32',
+                             allow_surrogates=True):
+    assert errorhandler is not None
     bo = 0
+    size = len(s)
 
     if BYTEORDER == 'little':
         iorder = [0, 1, 2, 3]
@@ -353,7 +1241,7 @@
     else:
         bo = 1
     if size == 0:
-        return u'', 0, bo
+        return '', 0, 0, bo
     if bo == -1:
         # force little endian
         iorder = [0, 1, 2, 3]
@@ -361,7 +1249,7 @@
         # force big endian
         iorder = [3, 2, 1, 0]
 
-    result = UnicodeBuilder(size // 4)
+    result = StringBuilder(size // 4)
 
     while pos < size:
         # remaining bytes at the end? (size should be divisible by 4)
@@ -376,22 +1264,26 @@
                 break
             continue
         ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) |
-            (ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
-        if ch >= 0x110000:
+              (ord(s[pos + iorder[1]]) << 8)  | ord(s[pos + iorder[0]]))
+        if not allow_surrogates and 0xD800 <= ch <= 0xDFFF:
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "code point in surrogate code point "
+                                  "range(0xd800, 0xe000)",
+                                  s, pos, pos + 4)
+            result.append(r)
+            continue
+        elif ch >= 0x110000:
             r, pos = errorhandler(errors, public_encoding_name,
                                   "codepoint not in range(0x110000)",
                                   s, pos, len(s))
             result.append(r)
             continue
 
-        if MAXUNICODE < 65536 and ch >= 0x10000:
-            ch -= 0x10000L
-            result.append(unichr(0xD800 + (ch >> 10)))
-            result.append(unichr(0xDC00 + (ch & 0x03FF)))
-        else:
-            result.append(UNICHR(ch))
+        rutf8.unichr_as_utf8_append(result, ch, allow_surrogates=allow_surrogates)
         pos += 4
-    return result.build(), pos, bo
+    r = result.build()
+    lgt = rutf8.check_utf8(r, True)
+    return r, pos, lgt, bo
 
 def _STORECHAR32(result, CH, byteorder):
     c0 = chr(((CH) >> 24) & 0xff)
@@ -409,13 +1301,12 @@
         result.append(c2)
         result.append(c3)
 
-def unicode_encode_utf_32_helper(s, size, errors,
+def unicode_encode_utf_32_helper(s, errors,
                                  errorhandler=None,
                                  allow_surrogates=True,
                                  byteorder='little',
                                  public_encoding_name='utf32'):
-    if errorhandler is None:
-        errorhandler = default_unicode_error_encode
+    size = len(s)
     if size == 0:
         if byteorder == 'native':
             result = StringBuilder(4)
@@ -429,50 +1320,253 @@
         byteorder = BYTEORDER
 
     pos = 0
+    index = 0
     while pos < size:
-        ch = ord(s[pos])
-        pos += 1
-        ch2 = 0
+        ch = rutf8.codepoint_at_pos(s, pos)
+        pos = rutf8.next_codepoint_pos(s, pos)
         if not allow_surrogates and 0xD800 <= ch < 0xE000:
-            ru, rs, pos = errorhandler(
+            res_8, newindex = errorhandler(
                 errors, public_encoding_name, 'surrogates not allowed',
                 s, pos - 1, pos)
-            if rs is not None:
-                # py3k only
-                if len(rs) % 4 != 0:
+            for ch in rutf8.Utf8StringIterator(res_8):
+                if ch < 0xD800:
+                    _STORECHAR32(result, ch, byteorder)
+                else:
                     errorhandler(
                         'strict', public_encoding_name, 'surrogates not allowed',
                         s, pos - 1, pos)
-                result.append(rs)
-                continue
-            for ch in ru:
-                if ord(ch) < 0xD800:
-                    _STORECHAR32(result, ord(ch), byteorder)
-                else:
-                    errorhandler(
-                        'strict', public_encoding_name,
-                        'surrogates not allowed', s, pos - 1, pos)
+            if index != newindex:  # Should be uncommon
+                index = newindex
+                pos = rutf8._pos_at_index(s, newindex)
             continue
-        if 0xD800 <= ch < 0xDC00 and MAXUNICODE < 65536 and pos < size:
-            ch2 = ord(s[pos])
-            if 0xDC00 <= ch2 < 0xE000:
-                ch = (((ch & 0x3FF) << 10) | (ch2 & 0x3FF)) + 0x10000
-                pos += 1
         _STORECHAR32(result, ch, byteorder)
+        index += 1
 
     return result.build()
 
-def unicode_encode_utf_32(s, size, errors,
+def utf8_encode_utf_32(s, errors,
                           errorhandler=None, allow_surrogates=True):
-    return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+    return unicode_encode_utf_32_helper(s, errors, errorhandler,
                                         allow_surrogates, "native")
 
-def unicode_encode_utf_32_be(s, size, errors,
+def utf8_encode_utf_32_be(s, errors,
                              errorhandler=None, allow_surrogates=True):
-    return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+    return unicode_encode_utf_32_helper(s, errors, errorhandler,
                                         allow_surrogates, "big")
 
-def unicode_encode_utf_32_le(s, size, errors,
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy default: merge unicode-utf8 into default

Reply via email to