[pypy-commit] pypy unicode-utf8-re: hg merge unicode-utf8

arigo Sat, 09 Dec 2017 23:31:51 -0800

Author: Armin Rigo <ar...@tunes.org>
Branch: unicode-utf8-re
Changeset: r93347:58b6fedc39bc
Date: 2017-12-10 08:27 +0100
http://bitbucket.org/pypy/pypy/changeset/58b6fedc39bc/


Log:    hg merge unicode-utf8

diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -12,3 +12,4 @@
 * improve performance of splitlines
 
 * fix _pypyjson to not use a wrapped dict when decoding an object
+* make sure we review all the places that call ord(unichr) to check for ValueErrors
\ No newline at end of file
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,3 +1,4 @@
+import pytest
 from hypothesis import given, strategies
 
 from rpython.rlib import rutf8
@@ -5,6 +6,7 @@
 from pypy.interpreter.unicodehelper import str_decode_utf8
 from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
 from pypy.interpreter import unicodehelper as uh
+from pypy.module._codecs.interp_codecs import CodecState
 
 def decode_utf8(u):
     return str_decode_utf8(u, True, "strict", None)
@@ -68,3 +70,16 @@
 def test_unicode_escape(u):
     r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None)
     assert r == u.encode("unicode-escape")
+
+def test_encode_decimal(space):
+    assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 '
+    with pytest.raises(ValueError):
+        uh.unicode_encode_decimal(u' 12, \u1234 '.encode('utf8'), None)
+    state = space.fromcache(CodecState)
+    handler = state.encode_error_handler
+    assert uh.unicode_encode_decimal(
+        u'u\u1234\u1235v'.encode('utf8'), 'replace', handler) == 'u??v'
+
+    result = uh.unicode_encode_decimal(
+        u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler)
+    assert result == '12&#4660;'
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,11 +1,13 @@
 import sys
 
-from pypy.interpreter.error import OperationError
+from pypy.interpreter.error import OperationError, oefmt
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib import rutf8
 from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rlib.rstring import StringBuilder
+from rpython.rtyper.lltypesystem import rffi
 from pypy.module._codecs import interp_codecs
+from pypy.module.unicodedata import unicodedb
 
 @specialize.memo()
 def decode_error_handler(space):
@@ -34,6 +36,16 @@
                                              space.newtext(msg)]))
     return raise_unicode_exception_encode
 
+def default_error_encode(
+        errors, encoding, msg, u, startingpos, endingpos):
+    """A default handler, for tests"""
+    assert endingpos >= 0
+    if errors == 'replace':
+        return '?', endingpos
+    if errors == 'ignore':
+        return '', endingpos
+    raise ValueError
+
 def convert_arg_to_w_unicode(space, w_arg, strict=None):
     return space.convert_arg_to_w_unicode(w_arg)
 
@@ -204,7 +216,7 @@
                 if c > 0x7F:
                     errorhandler("strict", 'ascii',
                                  'ordinal not in range(128)', utf8,
-                                 pos, pos + 1)  
+                                 pos, pos + 1)
                 j = rutf8.next_codepoint_pos(r, j)
             pos = newpos
             res.append(r)
@@ -530,6 +542,19 @@
 
     return builder.build(), pos, outsize
 
+def wcharpsize2utf8(space, wcharp, size):
+    """Safe version of rffi.wcharpsize2utf8.
+
+    Raises app-level ValueError if any wchar value is outside the valid
+    codepoint range.
+    """
+    try:
+        return rffi.wcharpsize2utf8(wcharp, size)
+    except ValueError:
+        raise oefmt(space.w_ValueError,
+            "character is not in range [U+0000; U+10ffff]")
+
+
 # ____________________________________________________________
 # Raw unicode escape
 
@@ -575,8 +600,8 @@
         digits = 4 if s[pos] == 'u' else 8
         message = "truncated \\uXXXX"
         pos += 1
-        pos, _, _ = hexescape(result, s, pos, digits,
-                        "rawunicodeescape", errorhandler, message, errors)
+        pos, _ = hexescape(result, s, pos, digits,
+                           "rawunicodeescape", errorhandler, message, errors)
 
     r = result.build()
     lgt = rutf8.check_utf8(r, True)
@@ -1073,22 +1098,19 @@
         elif ch >= 0xE000 or allow_surrogates:
             _STORECHAR(result, ch, byteorder)
         else:
-            ru, newindex = errorhandler(errors, public_encoding_name,
-                                   'surrogates not allowed',
-                                    s, pos-1, pos)
-            for j in range(newindex - index):
-                pos = rutf8.next_codepoint_pos(s, pos)
-            j = 0
-            while j < len(ru):
-                ch = rutf8.codepoint_at_pos(ru, j)
-                if ord(ch) < 0xD800:
-                    _STORECHAR(result, ord(ch), byteorder)
+            res_8, newindex = errorhandler(
+                errors, public_encoding_name, 'surrogates not allowed',
+                s, pos - 1, pos)
+            for cp in rutf8.Utf8StringIterator(res_8):
+                if cp < 0xD800:
+                    _STORECHAR(result, cp, byteorder)
                 else:
                     errorhandler('strict', public_encoding_name,
                                  'surrogates not allowed',
                                  s, pos-1, pos)
-                j = rutf8.next_codepoint_pos(ru, j)
-            index = newindex
+            if index != newindex:  # Should be uncommon
+                index = newindex
+                pos = rutf8._pos_at_index(s, newindex)
             continue
 
         pos = rutf8.next_codepoint_pos(s, pos)
@@ -1257,22 +1279,19 @@
         ch = rutf8.codepoint_at_pos(s, pos)
         pos = rutf8.next_codepoint_pos(s, pos)
         if not allow_surrogates and 0xD800 <= ch < 0xE000:
-            ru, newindex = errorhandler(errors, public_encoding_name,
-                                        'surrogates not allowed',
-                                        s, pos-1, pos)
-            for j in range(newindex - index):
-                pos = rutf8.next_codepoint_pos(s, pos)
-            j = 0
-            while j < len(ru):
-                ch = rutf8.codepoint_at_pos(ru, j)
-                if ord(ch) < 0xD800:
-                    _STORECHAR32(result, ord(ch), byteorder)
+            res_8, newindex = errorhandler(
+                errors, public_encoding_name, 'surrogates not allowed',
+                s, pos - 1, pos)
+            for ch in rutf8.Utf8StringIterator(res_8):
+                if ch < 0xD800:
+                    _STORECHAR32(result, ch, byteorder)
                 else:
-                    errorhandler('strict', public_encoding_name,
-                                 'surrogates not allowed',
-                                 s, pos-1, pos)
-                j = rutf8.next_codepoint_pos(ru, j)
-            index = newindex
+                    errorhandler(
+                        'strict', public_encoding_name, 'surrogates not allowed',
+                        s, pos - 1, pos)
+            if index != newindex:  # Should be uncommon
+                index = newindex
+                pos = rutf8._pos_at_index(s, newindex)
             continue
         _STORECHAR32(result, ch, byteorder)
         index += 1
@@ -1400,8 +1419,7 @@
     lgt = rutf8.check_utf8(r, True)
     return r, pos, lgt
 
-def utf8_encode_charmap(s, errors, errorhandler=None,
-                           mapping=None):
+def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None):
     size = len(s)
     if mapping is None:
         return utf8_encode_latin_1(s, errors, errorhandler=errorhandler)
@@ -1413,34 +1431,99 @@
     index = 0
     while pos < size:
         ch = rutf8.codepoint_at_pos(s, pos)
-
         c = mapping.get(ch, '')
         if len(c) == 0:
-            # collect all unencodable chars. Important for narrow builds.
-            collend = rutf8.next_codepoint_pos(s, pos)
-            endindex = index + 1
-            while collend < size and mapping.get(rutf8.codepoint_at_pos(s, collend), '') == '':
-                collend = rutf8.next_codepoint_pos(s, collend)
-                endindex += 1
-            rs, endindex = errorhandler(errors, "charmap",
+            # collect all unencodable chars.
+            startindex = index
+            pos = rutf8.next_codepoint_pos(s, pos)
+            index += 1
+            while (pos < size and
+                   mapping.get(rutf8.codepoint_at_pos(s, pos), '') == ''):
+                pos = rutf8.next_codepoint_pos(s, pos)
+                index += 1
+            res_8, newindex = errorhandler(errors, "charmap",
                                    "character maps to <undefined>",
-                                   s, index, endindex)
-            j = 0
-            for _ in range(endindex - index):
-                ch2 = rutf8.codepoint_at_pos(rs, j)
-                ch2 = mapping.get(ch2, '')
+                                   s, startindex, index)
+            for cp2 in rutf8.Utf8StringIterator(res_8):
+                ch2 = mapping.get(cp2, '')
                 if not ch2:
                     errorhandler(
-                        "strict", "charmap",
-                        "character maps to <undefined>",
-                        s,  index, index + 1)
+                        "strict", "charmap", "character maps to <undefined>",
+                        s,  startindex, index)
                 result.append(ch2)
-                index += 1
-                j = rutf8.next_codepoint_pos(rs, j)
-                pos = rutf8.next_codepoint_pos(s, pos)
+            if index != newindex:  # Should be uncommon
+                index = newindex
+                pos = rutf8._pos_at_index(s, newindex)
             continue
         result.append(c)
         index += 1
         pos = rutf8.next_codepoint_pos(s, pos)
     return result.build()
 
+# ____________________________________________________________
+# Decimal Encoder
+def unicode_encode_decimal(s, errors, errorhandler=None):
+    """Converts whitespace to ' ', decimal characters to their
+    corresponding ASCII digit and all other Latin-1 characters except
+    \0 as-is. Characters outside this range (Unicode ordinals 1-256)
+    are treated as errors. This includes embedded NULL bytes.
+    """
+    if errorhandler is None:
+        errorhandler = default_error_encode
+    result = StringBuilder(len(s))
+    pos = 0
+    i = 0
+    it = rutf8.Utf8StringIterator(s)
+    for ch in it:
+        if unicodedb.isspace(ch):
+            result.append(' ')
+            i += 1
+            continue
+        try:
+            decimal = unicodedb.decimal(ch)
+        except KeyError:
+            pass
+        else:
+            result.append(chr(48 + decimal))
+            i += 1
+            continue
+        if 0 < ch < 256:
+            result.append(chr(ch))
+            i += 1
+            continue
+        # All other characters are considered unencodable
+        start_index = i
+        i += 1
+        while not it.done():
+            ch = rutf8.codepoint_at_pos(s, it.get_pos())
+            try:
+                if (0 < ch < 256 or unicodedb.isspace(ch) or
+                        unicodedb.decimal(ch) >= 0):
+                    break
+            except KeyError:
+                # not a decimal
+                pass
+            if it.done():
+                break
+            ch = next(it)
+            i += 1
+        end_index = i
+        msg = "invalid decimal Unicode string"
+        r, pos = errorhandler(
+            errors, 'decimal', msg, s, start_index, end_index)
+        for ch in rutf8.Utf8StringIterator(r):
+            if unicodedb.isspace(ch):
+                result.append(' ')
+                continue
+            try:
+                decimal = unicodedb.decimal(ch)
+            except KeyError:
+                pass
+            else:
+                result.append(chr(48 + decimal))
+                continue
+            if 0 < ch < 256:
+                result.append(chr(ch))
+                continue
+            errorhandler('strict', 'decimal', msg, s, start_index, end_index)
+    return result.build()
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -70,9 +70,6 @@
                 raise oefmt(space.w_IndexError,
                             "position %d from error handler out of bounds",
                             newpos)
-            if newpos < startpos:
-                raise oefmt(space.w_IndexError,
-                    "position %d from error handler did not progress", newpos)
             w_replace = space.convert_to_w_unicode(w_replace)
             return w_replace._utf8, newpos
         return call_errorhandler
@@ -226,7 +223,7 @@
         w_end = space.getattr(w_exc, space.newtext('end'))
         end = space.int_w(w_end)
         start = w_obj._index_to_byte(start)
-        end = w_obj._index_to_byte(end)        
+        end = w_obj._index_to_byte(end)
         builder = StringBuilder()
         pos = start
         obj = w_obj._utf8
@@ -460,22 +457,12 @@
 
 # utf-8 functions are not regular, because we have to pass
 # "allow_surrogates=True"
-@unwrap_spec(utf8='utf8', errors='text_or_none')
-def utf_8_encode(space, utf8, errors="strict"):
-    length, _ = rutf8.check_utf8(utf8, allow_surrogates=True)
-    return space.newtuple([space.newbytes(utf8), space.newint(length)])
-#@unwrap_spec(uni=unicode, errors='text_or_none')
-#def utf_8_encode(space, uni, errors="strict"):
-#    if errors is None:
-#        errors = 'strict'
-#    state = space.fromcache(CodecState)
-#    # NB. can't call unicode_encode_utf_8() directly because that's
-#    # an @elidable function nowadays.  Instead, we need the _impl().
-#    # (The problem is the errorhandler, which calls arbitrary Python.)
-#    result = runicode.unicode_encode_utf_8_impl(
-#        uni, len(uni), errors, state.encode_error_handler,
-#        allow_surrogates=True)
-#    return space.newtuple([space.newbytes(result), space.newint(len(uni))])
+@unwrap_spec(errors='text_or_none')
+def utf_8_encode(space, w_obj, errors="strict"):
+    utf8, lgt = space.utf8_len_w(w_obj)
+    if rutf8.has_surrogates(utf8):
+        utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
+    return space.newtuple([space.newbytes(utf8), space.newint(lgt)])
 
 @unwrap_spec(string='bufferstr', errors='text_or_none',
              w_final = WrappedDefault(False))
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -537,8 +537,12 @@
         assert '\xff'.decode('utf-7', 'ignore') == ''
         assert '\x00'.decode('unicode-internal', 'ignore') == ''
 
-    def test_backslahreplace(self):
-        assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == 'a\\xac\u1234\u20ac\u8000'
+    def test_backslashreplace(self):
+        sin = u"a\xac\u1234\u20ac\u8000\U0010ffff"
+        expected = "a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
+        assert sin.encode('ascii', 'backslashreplace') == expected
+        expected = "a\xac\\u1234\xa4\\u8000\\U0010ffff"
+        assert sin.encode("iso-8859-15", "backslashreplace") == expected
 
     def test_badhandler(self):
         import codecs
diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -1,3 +1,5 @@
+from rpython.rlib.rutf8 import get_utf8_length
+
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.typedef import (
     TypeDef, generic_new_descr, GetSetProperty)
@@ -152,7 +154,7 @@
         if self.readnl is None:
             w_readnl = space.w_None
         else:
-            w_readnl = space.str(space.new_from_utf8(self.readnl))  # YYY
+            w_readnl = space.str(space.newutf8(self.readnl, get_utf8_length(self.readnl)))  # YYY
         return space.newtuple([
             w_initialval, w_readnl, space.newint(self.buf.pos), w_dict
         ])
@@ -215,7 +217,8 @@
         if self.writenl:
             w_decoded = space.call_method(
                 w_decoded, "replace",
-                space.newtext("\n"), space.new_from_utf8(self.writenl))
+                space.newtext("\n"), space.newutf8(self.writenl,
+                    get_utf8_length(self.writenl)))
         string = space.utf8_w(w_decoded)
         if string:
             self.buf.write(string)
@@ -225,7 +228,9 @@
     def read_w(self, space, w_size=None):
         self._check_closed(space)
         size = convert_size(space, w_size)
-        return space.new_from_utf8(self.buf.read(size))
+        v = self.buf.read(size)
+        lgt = get_utf8_length(v)
+        return space.newutf8(v, lgt)
 
     def readline_w(self, space, w_limit=None):
         self._check_closed(space)
@@ -239,7 +244,8 @@
             else:
                 newline = self.readnl
             result = self.buf.readline(newline, limit)
-        return space.new_from_utf8(result)
+        resultlen = get_utf8_length(result)
+        return space.newutf8(result, resultlen)
 
 
     @unwrap_spec(pos=int, mode=int)
@@ -276,7 +282,9 @@
 
     def getvalue_w(self, space):
         self._check_closed(space)
-        return space.new_from_utf8(self.buf.getvalue())
+        v = self.buf.getvalue()
+        lgt = get_utf8_length(v)
+        return space.newutf8(v, lgt)
 
     def readable_w(self, space):
         self._check_closed(space)
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -12,7 +12,8 @@
 from rpython.rlib.rbigint import rbigint
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.rutf8 import (check_utf8, next_codepoint_pos,
-                                codepoints_in_utf8)
+                                codepoints_in_utf8, get_utf8_length,
+                                Utf8StringBuilder)
 
 
 STATE_ZERO, STATE_OK, STATE_DETACHED = range(3)
@@ -684,13 +685,15 @@
             w_bytes = space.call_method(self.w_buffer, "read")
             w_decoded = space.call_method(self.w_decoder, "decode", w_bytes, space.w_True)
             check_decoded(space, w_decoded)
-            w_result = space.new_from_utf8(self.decoded.get_chars(-1))
+            chars = self.decoded.get_chars(-1)
+            lgt = get_utf8_length(chars)
+            w_result = space.newutf8(chars, lgt)
             w_final = space.add(w_result, w_decoded)
             self.snapshot = None
             return w_final
 
         remaining = size
-        builder = StringBuilder(size)
+        builder = Utf8StringBuilder(size)
 
         # Keep reading chunks until we have n characters to return
         while remaining > 0:
@@ -700,7 +703,7 @@
             builder.append(data)
             remaining -= len(data)
 
-        return space.new_from_utf8(builder.build())
+        return space.newutf8(builder.build(), builder.get_length())
 
     def _scan_line_ending(self, limit):
         if self.readuniversal:
@@ -725,6 +728,7 @@
         limit = convert_size(space, w_limit)
         remnant = None
         builder = StringBuilder()
+        # XXX maybe use Utf8StringBuilder instead?
         while True:
             # First, get some data if necessary
             has_data = self._ensure_data(space)
@@ -771,7 +775,8 @@
             self.decoded.reset()
 
         result = builder.build()
-        return space.new_from_utf8(result)
+        lgt = get_utf8_length(result)
+        return space.newutf8(result, lgt)
 
     # _____________________________________________________________
     # write methods
@@ -794,8 +799,8 @@
             if text.find('\n') >= 0:
                 haslf = True
         if haslf and self.writetranslate and self.writenl:
-            w_text = space.call_method(w_text, "replace", space.new_from_utf8('\n'),
-                                       space.new_from_utf8(self.writenl))
+            w_text = space.call_method(w_text, "replace", space.newutf8('\n', 1),
+                                       space.newutf8(self.writenl, get_utf8_length(self.writenl)))
             text = space.utf8_w(w_text)
 
         needflush = False
diff --git a/pypy/module/_locale/interp_locale.py b/pypy/module/_locale/interp_locale.py
--- a/pypy/module/_locale/interp_locale.py
+++ b/pypy/module/_locale/interp_locale.py
@@ -133,10 +133,11 @@
             rffi.free_charp(s1_c)
             rffi.free_charp(s2_c)
 
-    s1, s2 = space.unicode_w(w_s1), space.unicode_w(w_s2)
+    s1, l1 = space.utf8_len_w(w_s1)
+    s2, l2 = space.utf8_len_w(w_s2)
 
-    s1_c = rffi.unicode2wcharp(s1)
-    s2_c = rffi.unicode2wcharp(s2)
+    s1_c = rffi.utf82wcharp(s1, l1)
+    s2_c = rffi.utf82wcharp(s2, l2)
     try:
         result = _wcscoll(s1_c, s2_c)
     finally:
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -157,7 +157,7 @@
         replace, end = errorcb(errors, namecb, reason,
                                stringdata, start, end)
         # 'replace' is RPython unicode here
-    lgt, _ = rutf8.check_utf8(replace, True)
+    lgt = rutf8.get_utf8_length(replace)
     inbuf = rffi.utf82wcharp(replace, lgt)
     try:
         r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end)
@@ -268,7 +268,7 @@
         rets, end = errorcb(errors, namecb, reason,
                             unicodedata, start, end)
         codec = pypy_cjk_enc_getcodec(encodebuf)
-        lgt, _ = rutf8.get_utf8_length_flag(rets)
+        lgt = rutf8.get_utf8_length(rets)
         replace = encode(codec, rets, lgt, "strict", errorcb, namecb)
     with rffi.scoped_nonmovingbuffer(replace) as inbuf:
         r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -66,7 +66,7 @@
         pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
         assert 0 <= pos <= len(object)
         self.pending = object[pos:]
-        lgt = rutf8.get_utf8_length_flag(output)
+        lgt = rutf8.get_utf8_length(output)
         return space.newutf8(output, lgt)
 
 
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -27,8 +27,8 @@
             raise wrap_unicodedecodeerror(space, e, input, self.name)
         except RuntimeError:
             raise wrap_runtimeerror(space)
-        lgt, flag = rutf8.check_utf8(utf8_output, True)
-        return space.newtuple([space.newutf8(utf8_output, lgt, flag),
+        lgt = rutf8.get_utf8_length(utf8_output)
+        return space.newtuple([space.newutf8(utf8_output, lgt),
                                space.newint(len(input))])
 
     @unwrap_spec(errors="text_or_none")
diff --git a/pypy/module/_multibytecodec/test/test_translation.py b/pypy/module/_multibytecodec/test/test_translation.py
--- a/pypy/module/_multibytecodec/test/test_translation.py
+++ b/pypy/module/_multibytecodec/test/test_translation.py
@@ -14,7 +14,7 @@
             codecname, string = argv[1], argv[2]
             c = c_codecs.getcodec(codecname)
             u = c_codecs.decode(c, string)
-            lgt, _ = rutf8.get_utf8_length_flag(u)
+            lgt = rutf8.get_utf8_length(u)
             r = c_codecs.encode(c, u, lgt)
             print r
             return 0
diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -3,6 +3,7 @@
 from rpython.rlib.objectmodel import specialize, always_inline, r_dict
 from rpython.rlib import rfloat, runicode, rutf8
 from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.rlib.rarithmetic import r_uint
 from pypy.interpreter.error import oefmt
 from pypy.interpreter import unicodehelper
 
@@ -366,7 +367,7 @@
             return # help the annotator to know that we'll never go beyond
                    # this point
         #
-        utf8_ch = rutf8.unichr_as_utf8(val, allow_surrogates=True)
+        utf8_ch = rutf8.unichr_as_utf8(r_uint(val), allow_surrogates=True)
         builder.append(utf8_ch)
         return i
 
@@ -400,7 +401,7 @@
                 break
             elif ch == '\\' or ch < '\x20':
                 self.pos = i-1
-                return self.space.unicode_w(self.decode_string_escaped(start))
+                return self.decode_string_escaped(start)
             strhash = intmask((1000003 * strhash) ^ ord(ll_chars[i]))
             bits |= ord(ch)
         length = i - start - 1
diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py
--- a/pypy/module/_rawffi/alt/type_converter.py
+++ b/pypy/module/_rawffi/alt/type_converter.py
@@ -128,7 +128,7 @@
         intval: lltype.Signed
         """
         self.error(w_ffitype, w_obj)
-        
+
     def handle_unichar(self, w_ffitype, w_obj, intval):
         """
         intval: lltype.Signed
@@ -174,7 +174,7 @@
     def handle_struct_rawffi(self, w_ffitype, w_structinstance):
         """
         This method should be killed as soon as we remove support for _rawffi structures
-        
+
         w_structinstance: W_StructureInstance
         """
         self.error(w_ffitype, w_structinstance)
@@ -227,7 +227,7 @@
             ucharval = self.get_char(w_ffitype)
             return space.newbytes(chr(ucharval))
         elif w_ffitype.is_unichar():
-            wcharval = self.get_unichar(w_ffitype)
+            wcharval = r_uint(self.get_unichar(w_ffitype))
             return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1)
         elif w_ffitype.is_double():
             return self._float(w_ffitype)
@@ -349,7 +349,7 @@
     def get_struct_rawffi(self, w_ffitype, w_structdescr):
         """
         This should be killed as soon as we kill support for _rawffi structures
-        
+
         Return type: lltype.Unsigned
         (the address of the structure)
         """
diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -448,7 +448,8 @@
             elif c == 'c':
                 return space.newbytes(func(add_arg, argdesc, ll_type))
             elif c == 'u':
-                return space.newunicode(func(add_arg, argdesc, ll_type))
+                return space.newutf8(rutf8.unichr_as_utf8(
+                    ord(func(add_arg, argdesc, ll_type))), 1)
             elif c == 'f' or c == 'd' or c == 'g':
                 return space.newfloat(float(func(add_arg, argdesc, ll_type)))
             else:
@@ -596,10 +597,10 @@
         return space.w_None
     wcharp_addr = rffi.cast(rffi.CWCHARP, address)
     if maxlength == -1:
-        s = rffi.wcharp2utf8(wcharp_addr)
+        s, lgt = rffi.wcharp2utf8(wcharp_addr)
     else:
-        s = rffi.wcharpsize2utf8(wcharp_addr, maxlength)
-    return space.newunicode(s)
+        s, lgt = rffi.wcharp2utf8n(wcharp_addr, maxlength)
+    return space.newutf8(s, lgt)
 
 @unwrap_spec(address=r_uint, maxlength=int)
 def charp2rawstring(space, address, maxlength=-1):
@@ -612,8 +613,8 @@
 def wcharp2rawunicode(space, address, maxlength=-1):
     if maxlength == -1:
         return wcharp2unicode(space, address)
-    s = rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, address), maxlength)
-    return space.newunicode(s)
+    s = rffi.wcharpsize2utf8(rffi.cast(rffi.CWCHARP, address), maxlength)
+    return space.newutf8(s, maxlength)
 
 @unwrap_spec(address=r_uint, newcontent='bufferstr')
 def rawstring2charp(space, address, newcontent):
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -1,7 +1,7 @@
 from rpython.rlib import jit, rgc, rutf8
 from rpython.rlib.buffer import RawBuffer
 from rpython.rlib.objectmodel import keepalive_until_here
-from rpython.rlib.rarithmetic import ovfcheck, widen
+from rpython.rlib.rarithmetic import ovfcheck, widen, r_uint
 from rpython.rlib.unroll import unrolling_iterable
 from rpython.rtyper.annlowlevel import llstr
 from rpython.rtyper.lltypesystem import lltype, rffi
@@ -380,6 +380,7 @@
         if len(s) % self.itemsize != 0:
             raise oefmt(self.space.w_ValueError,
                         "string length not a multiple of item size")
+        self.check_valid_unicode(space, s) # empty for non-u arrays
         oldlen = self.len
         new = len(s) / self.itemsize
         if not new:
@@ -451,7 +452,7 @@
         """
         if self.typecode == 'u':
             buf = rffi.cast(UNICODE_ARRAY, self._buffer_as_unsigned())
-            return space.newutf8(rffi.wcharpsize2unicode(buf, self.len))
+            return space.newutf8(rffi.wcharpsize2utf8(buf, self.len), self.len)
         else:
             raise oefmt(space.w_ValueError,
                         "tounicode() may only be called on type 'u' arrays")
@@ -710,6 +711,9 @@
             s = "array('%s', %s)" % (self.typecode, space.text_w(r))
             return space.newtext(s)
 
+    def check_valid_unicode(self, space, s):
+        pass # overwritten by u
+
 W_ArrayBase.typedef = TypeDef(
     'array.array',
     __new__ = interp2app(w_array),
@@ -870,6 +874,18 @@
         def get_buffer(self):
             return rffi.cast(mytype.arrayptrtype, self._buffer)
 
+        if mytype.unwrap == 'utf8_len_w':
+            def check_valid_unicode(self, space, s):
+                i = 0
+                while i < len(s):
+                    if s[i] != '\x00' or ord(s[i + 1]) > 0x10:
+                        v = ((ord(s[i]) << 24) + (ord(s[i + 1]) << 16) +
+                             (ord(s[i + 2]) << 8) + ord(s[i + 3]))
+                        raise oefmt(space.w_ValueError,
+                            "Character U+%s is not in range [U+0000, U+10ffff]",
+                            hex(v)[2:])
+                    i += 4
+
         def item_w(self, w_item):
             space = self.space
             unwrap = getattr(space, mytype.unwrap)
@@ -1013,7 +1029,7 @@
             elif mytype.typecode == 'c':
                 return space.newbytes(item)
             elif mytype.typecode == 'u':
-                code = ord(item)
+                code = r_uint(ord(item))
                 return space.newutf8(rutf8.unichr_as_utf8(code), 1)
             assert 0, "unreachable"
 
diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py
--- a/pypy/module/array/test/test_array.py
+++ b/pypy/module/array/test/test_array.py
@@ -844,13 +844,7 @@
         import sys
         if sys.maxunicode == 0xffff:
             skip("test for 32-bit unicodes")
-        a = self.array('u', b'\xff\xff\xff\xff')
-        assert len(a) == 1
-        assert repr(a[0]) == "u'\Uffffffff'"
-        if sys.maxint == 2147483647:
-            assert ord(a[0]) == -1
-        else:
-            assert ord(a[0]) == 4294967295
+        raises(ValueError, self.array, 'u', b'\xff\xff\xff\xff')
 
     def test_weakref(self):
         import weakref
diff --git a/pypy/module/cpyext/longobject.py b/pypy/module/cpyext/longobject.py
--- a/pypy/module/cpyext/longobject.py
+++ b/pypy/module/cpyext/longobject.py
@@ -4,6 +4,7 @@
     CONST_STRING, ADDR, CANNOT_FAIL)
 from pypy.objspace.std.longobject import W_LongObject
 from pypy.interpreter.error import OperationError
+from pypy.interpreter.unicodehelper import wcharpsize2utf8
 from pypy.module.cpyext.intobject import PyInt_AsUnsignedLongMask
 from rpython.rlib.rbigint import rbigint
 
@@ -191,7 +192,7 @@
     string, length gives the number of characters, and base is the radix
     for the conversion.  The radix must be in the range [2, 36]; if it is
     out of range, ValueError will be raised."""
-    w_value = space.newunicode(rffi.wcharpsize2unicode(u, length))
+    w_value = space.newutf8(wcharpsize2utf8(space, u, length), length)
     w_base = space.newint(rffi.cast(lltype.Signed, base))
     return space.call_function(space.w_long, w_value, w_base)
 
diff --git a/pypy/module/cpyext/object.py b/pypy/module/cpyext/object.py
--- a/pypy/module/cpyext/object.py
+++ b/pypy/module/cpyext/object.py
@@ -246,7 +246,7 @@
     the Python expression unicode(o).  Called by the unicode() built-in
     function."""
     if w_obj is None:
-        return space.newunicode(u"<NULL>")
+        return space.newutf8("<NULL>", 6)
     return space.call_function(space.w_unicode, w_obj)
 
 @cpython_api([PyObject, PyObject], rffi.INT_real, error=-1)
@@ -302,7 +302,7 @@
         if opid == Py_EQ:
             return 1
         if opid == Py_NE:
-            return 0 
+            return 0
     w_res = PyObject_RichCompare(space, w_o1, w_o2, opid_int)
     return int(space.is_true(w_res))
 
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -1,5 +1,11 @@
+from rpython.rtyper.lltypesystem import rffi, lltype
+from rpython.rlib import rstring, runicode
+from rpython.tool.sourcetools import func_renamer
+
 from pypy.interpreter.error import OperationError, oefmt
-from rpython.rtyper.lltypesystem import rffi, lltype
+from pypy.interpreter.unicodehelper import (
+    wcharpsize2utf8, str_decode_utf_16_helper, str_decode_utf_32_helper,
+    unicode_encode_decimal)
 from pypy.module.unicodedata import unicodedb
 from pypy.module.cpyext.api import (
     CANNOT_FAIL, Py_ssize_t, build_type_checkers_flags, cpython_api,
@@ -13,8 +19,6 @@
 from pypy.module.sys.interp_encoding import setdefaultencoding
 from pypy.module._codecs.interp_codecs import CodecState
 from pypy.objspace.std import unicodeobject
-from rpython.rlib import rstring, runicode
-from rpython.tool.sourcetools import func_renamer
 import sys
 
 ## See comment in bytesobject.py.
@@ -61,10 +65,10 @@
 def unicode_attach(space, py_obj, w_obj, w_userdata=None):
     "Fills a newly allocated PyUnicodeObject with a unicode string"
     py_unicode = rffi.cast(PyUnicodeObject, py_obj)
-    s = space.unicode_w(w_obj)
-    py_unicode.c_length = len(s)
+    s, length = space.utf8_len_w(w_obj)
+    py_unicode.c_length = length
     py_unicode.c_str = lltype.nullptr(rffi.CWCHARP.TO)
-    py_unicode.c_hash = space.hash_w(space.newunicode(s))
+    py_unicode.c_hash = space.hash_w(space.newutf8(s, length))
     py_unicode.c_defenc = lltype.nullptr(PyObject.TO)
 
 def unicode_realize(space, py_obj):
@@ -73,11 +77,12 @@
     be modified after this call.
     """
     py_uni = rffi.cast(PyUnicodeObject, py_obj)
-    s = rffi.wcharpsize2unicode(py_uni.c_str, py_uni.c_length)
+    length = py_uni.c_length
+    s = wcharpsize2utf8(space, py_uni.c_str, length)
     w_type = from_ref(space, rffi.cast(PyObject, py_obj.c_ob_type))
     w_obj = space.allocate_instance(unicodeobject.W_UnicodeObject, w_type)
-    w_obj.__init__(s)
-    py_uni.c_hash = space.hash_w(space.newunicode(s))
+    w_obj.__init__(s, length)
+    py_uni.c_hash = space.hash_w(space.newutf8(s, length))
     track_reference(space, py_obj, w_obj)
     return w_obj
 
@@ -214,8 +219,8 @@
     if not ref_unicode.c_str:
         # Copy unicode buffer
         w_unicode = from_ref(space, rffi.cast(PyObject, ref))
-        u = space.unicode_w(w_unicode)
-        ref_unicode.c_str = rffi.unicode2wcharp(u)
+        u, length = space.utf8_len_w(w_unicode)
+        ref_unicode.c_str = rffi.utf82wcharp(u, length)
     return ref_unicode.c_str
 
 @cpython_api([PyObject], rffi.CWCHARP)
@@ -335,8 +340,8 @@
     Therefore, modification of the resulting Unicode object is only allowed when u
     is NULL."""
     if wchar_p:
-        s = rffi.wcharpsize2unicode(wchar_p, length)
-        return make_ref(space, space.newunicode(s))
+        s = wcharpsize2utf8(space, wchar_p, length)
+        return make_ref(space, space.newutf8(s, length))
     else:
         return rffi.cast(PyObject, new_empty_unicode(space, length))
 
@@ -506,7 +511,8 @@
         """Encode the Py_UNICODE buffer of the given size and return a
         Python string object.  Return NULL if an exception was raised
         by the codec."""
-        w_u = space.newunicode(rffi.wcharpsize2unicode(s, size))
+        u = wcharpsize2utf8(space, s, size)
+        w_u = space.newutf8(u, size)
         if errors:
             w_errors = space.newtext(rffi.charp2str(errors))
         else:
@@ -564,15 +570,11 @@
     else:
         errors = None
 
-    result, length, byteorder = runicode.str_decode_utf_16_helper(
-        string, size, errors,
-        True, # final ? false for multiple passes?
-        None, # errorhandler
-        byteorder)
+    result, _,  length, byteorder = str_decode_utf_16_helper(
+        string, errors, final=True, errorhandler=None, byteorder=byteorder)
     if pbyteorder is not None:
         pbyteorder[0] = rffi.cast(rffi.INT, byteorder)
-
-    return space.newunicode(result)
+    return space.newutf8(result, length)
 
 @cpython_api([CONST_STRING, Py_ssize_t, CONST_STRING, rffi.INTP], PyObject)
 def PyUnicode_DecodeUTF32(space, s, size, llerrors, pbyteorder):
@@ -620,15 +622,11 @@
     else:
         errors = None
 
-    result, length, byteorder = runicode.str_decode_utf_32_helper(
-        string, size, errors,
-        True, # final ? false for multiple passes?
-        None, # errorhandler
-        byteorder)
+    result, _,  length, byteorder = str_decode_utf_32_helper(
+        string, errors, final=True, errorhandler=None, byteorder=byteorder)
     if pbyteorder is not None:
         pbyteorder[0] = rffi.cast(rffi.INT, byteorder)
-
-    return space.newunicode(result)
+    return space.newutf8(result, length)
 
 @cpython_api([rffi.CWCHARP, Py_ssize_t, rffi.CCHARP, CONST_STRING],
              rffi.INT_real, error=-1)
@@ -646,14 +644,13 @@
 
     Returns 0 on success, -1 on failure.
     """
-    u = rffi.wcharpsize2unicode(s, length)
+    u = rffi.wcharpsize2utf8(s, length)
     if llerrors:
         errors = rffi.charp2str(llerrors)
     else:
         errors = None
     state = space.fromcache(CodecState)
-    result = runicode.unicode_encode_decimal(u, length, errors,
-                                             state.encode_error_handler)
+    result = unicode_encode_decimal(u, errors, state.encode_error_handler)
     i = len(result)
     output[i] = '\0'
     i -= 1
@@ -706,12 +703,17 @@
     """Return 1 if substr matches str[start:end] at the given tail end
     (direction == -1 means to do a prefix match, direction == 1 a
     suffix match), 0 otherwise. Return -1 if an error occurred."""
-    str = space.unicode_w(w_str)
-    substr = space.unicode_w(w_substr)
+    space.utf8_w(w_str)  # type check
+    space.utf8_w(w_substr)
+    w_start = space.newint(start)
+    w_end = space.newint(end)
     if rffi.cast(lltype.Signed, direction) <= 0:
-        return rstring.startswith(str, substr, start, end)
+        w_result = space.call_method(
+            w_str, "startswith", w_substr, w_start, w_end)
     else:
-        return rstring.endswith(str, substr, start, end)
+        w_result = space.call_method(
+            w_str, "endswith", w_substr, w_start, w_end)
+    return space.int_w(w_result)
 
 @cpython_api([PyObject, PyObject, Py_ssize_t, Py_ssize_t], Py_ssize_t, error=-1)
 def PyUnicode_Count(space, w_str, w_substr, start, end):
diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py
--- a/pypy/module/pyexpat/interp_pyexpat.py
+++ b/pypy/module/pyexpat/interp_pyexpat.py
@@ -483,7 +483,7 @@
             except rutf8.CheckError:
                 from pypy.interpreter import unicodehelper
                 # get the correct error msg
-                unicodehelper.str_decode_utf8(s, len(s), 'string', True,
+                unicodehelper.str_decode_utf8(s, 'string', True,
                     unicodehelper.decode_error_handler(space))
                 assert False, "always raises"
         else:
@@ -587,21 +587,22 @@
 
     def UnknownEncodingHandler(self, space, name, info):
         # Yes, supports only 8bit encodings
-        translationmap = space.unicode_w(
+        translationmap, lgt = space.utf8_len_w(
             space.call_method(
                 space.newbytes(self.all_chars), "decode",
                 space.newtext(name), space.newtext("replace")))
 
-        if len(translationmap) != 256:
+        if lgt != 256:
             raise oefmt(space.w_ValueError,
                         "multi-byte encodings are not supported")
 
-        for i in range(256):
-            c = translationmap[i]
-            if c == u'\ufffd':
+        i = 0
+        for c in rutf8.Utf8StringIterator(translationmap):
+            if c == 0xfffd:
                 info.c_map[i] = rffi.cast(rffi.INT, -1)
             else:
                 info.c_map[i] = rffi.cast(rffi.INT, c)
+            i += 1
         info.c_data = lltype.nullptr(rffi.VOIDP.TO)
         info.c_convert = lltype.nullptr(rffi.VOIDP.TO)
         info.c_release = lltype.nullptr(rffi.VOIDP.TO)
diff --git a/pypy/module/struct/formatiterator.py b/pypy/module/struct/formatiterator.py
--- a/pypy/module/struct/formatiterator.py
+++ b/pypy/module/struct/formatiterator.py
@@ -1,6 +1,6 @@
 from rpython.rlib.rarithmetic import (r_uint, r_ulonglong, r_longlong,
                                       maxint, intmask)
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib.rstruct.error import StructError
 from rpython.rlib.rstruct.formatiterator import FormatIterator
@@ -107,7 +107,7 @@
 
     def accept_unicode_arg(self):
         w_obj = self.accept_obj_arg()
-        return self.space.unicode_w(w_obj)
+        return self.space.utf8_len_w(w_obj)
 
     def accept_float_arg(self):
         w_obj = self.accept_obj_arg()
@@ -191,6 +191,10 @@
             assert 0, "unreachable"
         self.result_w.append(w_value)
 
+    def append_utf8(self, value):
+        w_ch = self.space.newutf8(rutf8.unichr_as_utf8(r_uint(value)), 1)
+        self.result_w.append(w_ch)
+
     def get_pos(self):
         return self.pos
 
diff --git a/pypy/module/unicodedata/interp_ucd.py b/pypy/module/unicodedata/interp_ucd.py
--- a/pypy/module/unicodedata/interp_ucd.py
+++ b/pypy/module/unicodedata/interp_ucd.py
@@ -7,11 +7,8 @@
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.typedef import TypeDef, interp_attrproperty
 from rpython.rlib.rarithmetic import r_longlong
-from rpython.rlib.objectmodel import we_are_translated
-from rpython.rlib.runicode import MAXUNICODE
 from rpython.rlib.unicodedata import unicodedb_5_2_0, unicodedb_3_2_0
-from rpython.rlib.runicode import code_to_unichr, ord_accepts_surrogate
-import sys
+from rpython.rlib.rutf8 import Utf8StringBuilder, unichr_as_utf8
 
 
 # Contants for Hangul characters
@@ -30,49 +27,17 @@
 # unicode code point.
 
 
-if MAXUNICODE > 0xFFFF:
-    # Target is wide build
-    def unichr_to_code_w(space, w_unichr):
-        if not space.isinstance_w(w_unichr, space.w_unicode):
-            raise oefmt(
-                space.w_TypeError, 'argument 1 must be unicode, not %T',
-                w_unichr)
+# Target is wide build
+def unichr_to_code_w(space, w_unichr):
+    if not space.isinstance_w(w_unichr, space.w_unicode):
+        raise oefmt(
+            space.w_TypeError, 'argument 1 must be unicode, not %T',
+            w_unichr)
 
-        if not we_are_translated() and sys.maxunicode == 0xFFFF:
-            # Host CPython is narrow build, accept surrogates
-            try:
-                return ord_accepts_surrogate(space.unicode_w(w_unichr))
-            except TypeError:
-                raise oefmt(space.w_TypeError,
-                            "need a single Unicode character as parameter")
-        else:
-            if not space.len_w(w_unichr) == 1:
-                raise oefmt(space.w_TypeError,
-                            "need a single Unicode character as parameter")
-            return space.int_w(space.ord(w_unichr))
-
-else:
-    # Target is narrow build
-    def unichr_to_code_w(space, w_unichr):
-        if not space.isinstance_w(w_unichr, space.w_unicode):
-            raise oefmt(
-                space.w_TypeError, 'argument 1 must be unicode, not %T',
-                w_unichr)
-
-        if not we_are_translated() and sys.maxunicode > 0xFFFF:
-            # Host CPython is wide build, forbid surrogates
-            if not space.len_w(w_unichr) == 1:
-                raise oefmt(space.w_TypeError,
-                            "need a single Unicode character as parameter")
-            return space.int_w(space.ord(w_unichr))
-
-        else:
-            # Accept surrogates
-            try:
-                return ord_accepts_surrogate(space.unicode_w(w_unichr))
-            except TypeError:
-                raise oefmt(space.w_TypeError,
-                            "need a single Unicode character as parameter")
+    if not space.len_w(w_unichr) == 1:
+        raise oefmt(space.w_TypeError,
+                    "need a single Unicode character as parameter")
+    return space.int_w(space.ord(w_unichr))
 
 
 class UCD(W_Root):
@@ -110,7 +75,8 @@
         except KeyError:
             msg = space.mod(space.newtext("undefined character name '%s'"), space.newtext(name))
             raise OperationError(space.w_KeyError, msg)
-        return space.newunicode(code_to_unichr(code))
+        assert code >= 0
+        return space.newutf8(unichr_as_utf8(code), 1)
 
     def name(self, space, w_unichr, w_default=None):
         code = unichr_to_code_w(space, w_unichr)
@@ -259,10 +225,10 @@
                 result[0] = ch
 
         if not composed: # If decomposed normalization we are done
-            return space.newunicode(u''.join([unichr(i) for i in result[:j]]))
+            return self.build(space, result, stop=j)
 
         if j <= 1:
-            return space.newunicode(u''.join([unichr(i) for i in result[:j]]))
+            return self.build(space, result, stop=j)
 
         current = result[0]
         starter_pos = 0
@@ -310,7 +276,13 @@
 
         result[starter_pos] = current
 
-        return space.newunicode(u''.join([unichr(i) for i in result[:next_insert]]))
+        return self.build(space, result, stop=next_insert)
+
+    def build(self, space, r, stop):
+        builder = Utf8StringBuilder(stop * 3)
+        for i in range(stop):
+            builder.append_code(r[i])
+        return space.newutf8(builder.build(), stop)
 
 
 methods = {}
diff --git a/pypy/module/unicodedata/test/test_hyp.py b/pypy/module/unicodedata/test/test_hyp.py
--- a/pypy/module/unicodedata/test/test_hyp.py
+++ b/pypy/module/unicodedata/test/test_hyp.py
@@ -1,3 +1,4 @@
+
 import pytest
 try:
     from hypothesis import given, strategies as st, example, settings
@@ -5,12 +6,14 @@
     pytest.skip("hypothesis required")
 
 from pypy.module.unicodedata.interp_ucd import ucd
+from rpython.rlib.rutf8 import get_utf8_length
 
 def make_normalization(space, NF_code):
     def normalize(s):
-        w_s = space.newunicode(s)
+        u = s.encode('utf8')
+        w_s = space.newutf8(u, get_utf8_length(u))
         w_res = ucd.normalize(space, NF_code, w_s)
-        return space.unicode_w(w_res)
+        return space.utf8_w(w_res).decode('utf8')
     return normalize
 
 all_forms = ['NFC', 'NFD', 'NFKC', 'NFKD']
diff --git a/pypy/objspace/fake/objspace.py b/pypy/objspace/fake/objspace.py
--- a/pypy/objspace/fake/objspace.py
+++ b/pypy/objspace/fake/objspace.py
@@ -212,9 +212,6 @@
     def newutf8(self, x, l):
         return w_some_obj()
 
-    def new_from_utf8(self, a):
-        return w_some_obj()
-
     def newunicode(self, a):
         return w_some_obj()
 
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -3,7 +3,7 @@
 
 from rpython.rlib import jit, rutf8
 from rpython.rlib.objectmodel import specialize
-from rpython.rlib.rarithmetic import INT_MAX
+from rpython.rlib.rarithmetic import INT_MAX, r_uint
 from rpython.rlib.rfloat import DTSF_ALT, formatd, isnan, isinf
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.unroll import unrolling_iterable
@@ -330,7 +330,7 @@
             space = self.space
             if do_unicode:
                 cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1)
-                w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1)
+                w_s = space.newutf8(rutf8.unichr_as_utf8(r_uint(cp)), 1)
             else:
                 cp = ord(self.fmt[self.fmtpos - 1])
                 w_s = space.newbytes(chr(cp))
@@ -466,7 +466,7 @@
                 n = space.int_w(w_value)
                 if do_unicode:
                     try:
-                        c = rutf8.unichr_as_utf8(n)
+                        c = rutf8.unichr_as_utf8(r_uint(n))
                     except ValueError:
                         raise oefmt(space.w_OverflowError,
                                     "unicode character code out of range")
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -741,6 +741,8 @@
         assert u'\u20ac'.encode('utf-8') == '\xe2\x82\xac'
         assert u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82'
         assert u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96'
+        assert u'\ud800\udc02'.encode('uTf-8') == '\xf0\x90\x80\x82'
+        assert u'\ud84d\udc56'.encode('Utf8') == '\xf0\xa3\x91\x96'
         assert u'\ud800'.encode('utf-8') == '\xed\xa0\x80'
         assert u'\udc00'.encode('utf-8') == '\xed\xb0\x80'
         assert (u'\ud800\udc02'*1000).encode('utf-8') == '\xf0\x90\x80\x82'*1000
diff --git a/rpython/annotator/unaryop.py b/rpython/annotator/unaryop.py
--- a/rpython/annotator/unaryop.py
+++ b/rpython/annotator/unaryop.py
@@ -792,7 +792,7 @@
     def ord(self):
         # warning, on 32-bit with 32-bit unichars, this might return
         # negative numbers
-        return SomeInteger()
+        return SomeInteger(nonneg=True)
 
 class __extend__(SomeIterator):
 
diff --git a/rpython/rlib/rstruct/nativefmttable.py b/rpython/rlib/rstruct/nativefmttable.py
--- a/rpython/rlib/rstruct/nativefmttable.py
+++ b/rpython/rlib/rstruct/nativefmttable.py
@@ -4,7 +4,7 @@
 """
 import struct
 
-from rpython.rlib import jit, longlong2float
+from rpython.rlib import rutf8, longlong2float
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib.rarithmetic import r_singlefloat, widen, intmask
 from rpython.rlib.rstruct import standardfmttable as std
@@ -139,17 +139,17 @@
 from rpython.rlib.rstruct import unichar
 
 def pack_unichar(fmtiter):
-    unistr = fmtiter.accept_unicode_arg()
-    if len(unistr) != 1:
+    utf8, lgt = fmtiter.accept_unicode_arg()
+    if lgt != 1:
         raise StructError("expected a unicode string of length 1")
-    c = unistr[0]   # string->char conversion for the annotator
-    unichar.pack_unichar(c, fmtiter.wbuf, fmtiter.pos)
+    uchr = rutf8.codepoint_at_pos(utf8, 0)
+    unichar.pack_codepoint(uchr, fmtiter.wbuf, fmtiter.pos)
     fmtiter.advance(unichar.UNICODE_SIZE)
 
 @specialize.argtype(0)
 def unpack_unichar(fmtiter):
     data = fmtiter.read(unichar.UNICODE_SIZE)
-    fmtiter.appendobj(unichar.unpack_unichar(data))
+    fmtiter.append_utf8(unichar.unpack_codepoint(data))
 
 native_fmttable['u'] = {'size': unichar.UNICODE_SIZE,
                         'alignment': unichar.UNICODE_SIZE,
diff --git a/rpython/rlib/rstruct/unichar.py b/rpython/rlib/rstruct/unichar.py
--- a/rpython/rlib/rstruct/unichar.py
+++ b/rpython/rlib/rstruct/unichar.py
@@ -3,12 +3,8 @@
 """
 
 import sys
-from rpython.rlib.runicode import MAXUNICODE
 
-if MAXUNICODE <= 65535:
-    UNICODE_SIZE = 2
-else:
-    UNICODE_SIZE = 4
+UNICODE_SIZE = 4
 BIGENDIAN = sys.byteorder == "big"
 
 def pack_unichar(unich, buf, pos):
@@ -34,7 +30,7 @@
             buf.setitem(pos+2, chr((unich >> 16) & 0xFF))
             buf.setitem(pos+3, chr(unich >> 24))
 
-def unpack_unichar(rawstring):
+def unpack_codepoint(rawstring):
     assert len(rawstring) == UNICODE_SIZE
     if UNICODE_SIZE == 2:
         if BIGENDIAN:
@@ -54,4 +50,7 @@
                  ord(rawstring[1]) << 8 |
                  ord(rawstring[2]) << 16 |
                  ord(rawstring[3]) << 24)
-    return unichr(n)
+    return n
+
+def unpack_unichar(rawstring):
+    return unichr(unpack_codepoint(rawstring))
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -19,7 +19,7 @@
 from rpython.rlib.objectmodel import enforceargs, we_are_translated, specialize
 from rpython.rlib.objectmodel import always_inline, dont_inline, try_inline
 from rpython.rlib.rstring import StringBuilder
-from rpython.rlib import jit
+from rpython.rlib import jit, types
 from rpython.rlib.signature import signature
 from rpython.rlib.types import char, none
 from rpython.rlib.rarithmetic import r_uint
@@ -27,6 +27,8 @@
 from rpython.rtyper.lltypesystem import lltype, rffi
 
 
+# we need a way to accept both r_uint and int(nonneg=True)
+#@signature(types.int_nonneg(), types.bool(), returns=types.str())
 def unichr_as_utf8(code, allow_surrogates=False):
     """Encode code (numeric value) as utf8 encoded string
     """
@@ -437,7 +439,7 @@
             low = codepoint_at_pos(utf8, i)
             if 0xDC00 <= low <= 0xDFFF:
                 uchr = 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
-                i = next_codepoint_pos(utf8, i)                
+                i = next_codepoint_pos(utf8, i)
             # else not really a surrogate pair, just append high
         else:
             i = next_codepoint_pos(utf8, i)
@@ -535,6 +537,13 @@
     else:
         return next_codepoint_pos(utf8, next_codepoint_pos(utf8, bytepos))
 
+def _pos_at_index(utf8, index):
+    # Slow!
+    pos = 0
+    for _ in range(index):
+        pos = next_codepoint_pos(utf8, pos)
+    return pos
+
 @jit.dont_look_inside
 def codepoint_at_index(utf8, storage, index):
     """ Return codepoint of a character inside utf8 encoded string, given
diff --git a/rpython/rlib/types.py b/rpython/rlib/types.py
--- a/rpython/rlib/types.py
+++ b/rpython/rlib/types.py
@@ -26,6 +26,8 @@
 def int():
     return model.SomeInteger()
 
+def int_nonneg():
+    return model.SomeInteger(nonneg=True)
 
 def bool():
     return model.SomeBool()
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1019,7 +1019,27 @@
     s = StringBuilder(size)
     for i in range(size):
         rutf8.unichr_as_utf8_append(s, ord(w[i]))
-    return s.build()    
+    return s.build()
+
+def wcharp2utf8(w):
+    from rpython.rlib import rutf8
+
+    s = rutf8.Utf8StringBuilder()
+    i = 0
+    while ord(w[i]):
+        s.append_code(ord(w[i]))
+        i += 1
+    return s.build(), i
+
+def wcharp2utf8n(w, maxlen):
+    from rpython.rlib import rutf8
+
+    s = rutf8.Utf8StringBuilder(maxlen)
+    i = 0
+    while i < maxlen and w[i]:
+        s.append_code(ord(w[i]))
+        i += 1
+    return s.build(), i
 
 def utf82wcharp(utf8, utf8len):
     from rpython.rlib import rutf8
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8-re: hg merge unicode-utf8

Reply via email to