Author: Matti Picus <[email protected]>
Branch:
Changeset: r96003:ba081fb468f4
Date: 2019-02-13 23:11 +0200
http://bitbucket.org/pypy/pypy/changeset/ba081fb468f4/
Log: merge unicode-utf8 into default
diff too long, truncating to 2000 out of 15164 lines
diff --git a/TODO b/TODO
new file mode 100644
--- /dev/null
+++ b/TODO
@@ -0,0 +1,4 @@
+* find a better way to run "find" without creating the index storage,
+ if one is not already readily available (understand cost now, improve after
merge)
+* improve performance of splitlines
+* think about cost of utf8 list strategy (Armin and CF)
diff --git a/lib-python/2.7/test/test_memoryio.py
b/lib-python/2.7/test/test_memoryio.py
--- a/lib-python/2.7/test/test_memoryio.py
+++ b/lib-python/2.7/test/test_memoryio.py
@@ -712,6 +712,7 @@
# XXX: For the Python version of io.StringIO, this is highly
# dependent on the encoding used for the underlying buffer.
+ @support.cpython_only
def test_widechar(self):
buf = self.buftype("\U0002030a\U00020347")
memio = self.ioclass(buf)
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -29,7 +29,11 @@
Improve register allocation in the JIT.
-
.. branch: promote-unicode
Implement rlib.jit.promote_unicode to complement promote_string
+
+.. branch: unicode-utf8
+
+Use utf8 internally to represent unicode, with the goal of never using
rpython-level unicode
+
diff --git a/pypy/interpreter/argument.py b/pypy/interpreter/argument.py
--- a/pypy/interpreter/argument.py
+++ b/pypy/interpreter/argument.py
@@ -535,24 +535,26 @@
if num_remainingkwds == 1:
for i in range(len(keywords)):
if i not in kwds_mapping:
- name = keywords[i]
- if name is None:
- # We'll assume it's unicode. Encode it.
- # Careful, I *think* it should not be possible to
- # get an IndexError here but you never know.
- try:
- if keyword_names_w is None:
- raise IndexError
- # note: negative-based indexing from the end
- w_name = keyword_names_w[i - len(keywords)]
- except IndexError:
+ name = '?'
+ # We'll assume it's unicode. Encode it.
+ # Careful, I *think* it should not be possible to
+ # get an IndexError here but you never know.
+ try:
+ if keyword_names_w is None:
+ raise IndexError
+ # note: negative-based indexing from the end
+ w_name = keyword_names_w[i - len(keywords)]
+ except IndexError:
+ if keywords is None:
name = '?'
else:
- w_enc = space.newtext(space.sys.defaultencoding)
- w_err = space.newtext("replace")
- w_name = space.call_method(w_name, "encode", w_enc,
- w_err)
- name = space.text_w(w_name)
+ name = keywords[i]
+ else:
+ w_enc = space.newtext(space.sys.defaultencoding)
+ w_err = space.newtext("replace")
+ w_name = space.call_method(w_name, "encode", w_enc,
+ w_err)
+ name = space.text_w(w_name)
break
self.kwd_name = name
diff --git a/pypy/interpreter/astcompiler/optimize.py
b/pypy/interpreter/astcompiler/optimize.py
--- a/pypy/interpreter/astcompiler/optimize.py
+++ b/pypy/interpreter/astcompiler/optimize.py
@@ -5,7 +5,7 @@
from pypy.tool import stdlib_opcode as ops
from pypy.interpreter.error import OperationError
from rpython.rlib.unroll import unrolling_iterable
-from rpython.rlib.runicode import MAXUNICODE
+from rpython.rlib.rutf8 import MAXUNICODE
from rpython.rlib.objectmodel import specialize
diff --git a/pypy/interpreter/astcompiler/test/test_compiler.py
b/pypy/interpreter/astcompiler/test/test_compiler.py
--- a/pypy/interpreter/astcompiler/test/test_compiler.py
+++ b/pypy/interpreter/astcompiler/test/test_compiler.py
@@ -975,9 +975,6 @@
class AppTestCompiler:
- def setup_class(cls):
- cls.w_maxunicode = cls.space.wrap(sys.maxunicode)
-
def test_docstring_not_loaded(self):
import StringIO, dis, sys
ns = {}
@@ -1027,7 +1024,7 @@
import sys
d = {}
exec '# -*- coding: utf-8 -*-\n\nu = u"\xf0\x9f\x92\x8b"' in d
- if sys.maxunicode > 65535 and self.maxunicode > 65535:
+ if sys.maxunicode > 65535:
expected_length = 1
else:
expected_length = 2
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -3,7 +3,7 @@
from rpython.rlib.cache import Cache
from rpython.tool.uid import HUGEVAL_BYTES
-from rpython.rlib import jit, types
+from rpython.rlib import jit, types, rutf8
from rpython.rlib.debug import make_sure_not_resized
from rpython.rlib.objectmodel import (we_are_translated, newlist_hint,
compute_unique_id, specialize, not_rpython)
@@ -283,7 +283,10 @@
def str_w(self, space):
self._typed_unwrap_error(space, "string")
- def unicode_w(self, space):
+ def utf8_w(self, space):
+ self._typed_unwrap_error(space, "unicode")
+
+ def convert_to_w_unicode(self, space):
self._typed_unwrap_error(space, "unicode")
def bytearray_list_of_chars_w(self, space):
@@ -1103,7 +1106,7 @@
"""
return None
- def listview_unicode(self, w_list):
+ def listview_utf8(self, w_list):
""" Return a list of unwrapped unicode out of a list of unicode. If the
argument is not a list or does not contain only unicode, return None.
May return None anyway.
@@ -1133,8 +1136,15 @@
def newlist_bytes(self, list_s):
return self.newlist([self.newbytes(s) for s in list_s])
- def newlist_unicode(self, list_u):
- return self.newlist([self.newunicode(u) for u in list_u])
+ def newlist_utf8(self, list_u, is_ascii):
+ l_w = [None] * len(list_u)
+ for i, item in enumerate(list_u):
+ if not is_ascii:
+ length = rutf8.check_utf8(item, True)
+ else:
+ length = len(item)
+ l_w[i] = self.newutf8(item, length)
+ return self.newlist(l_w)
def newlist_int(self, list_i):
return self.newlist([self.newint(i) for i in list_i])
@@ -1661,6 +1671,8 @@
# needed because CPython has the same issue. (Well, it's
# unclear if there is any use at all for getting the bytes in
# the unicode buffer.)
+ if self.isinstance_w(w_obj, self.w_unicode):
+ return w_obj.charbuf_w(self)
try:
return self.bytes_w(w_obj)
except OperationError as e:
@@ -1802,27 +1814,38 @@
raise oefmt(self.w_TypeError, "argument must be a string")
return self.bytes_w(w_obj)
- @specialize.argtype(1)
- def unicode_w(self, w_obj):
- assert w_obj is not None
- return w_obj.unicode_w(self)
+ def utf8_w(self, w_obj):
+ return w_obj.utf8_w(self)
+
+ def convert_to_w_unicode(self, w_obj):
+ return w_obj.convert_to_w_unicode(self)
def unicode0_w(self, w_obj):
"Like unicode_w, but rejects strings with NUL bytes."
from rpython.rlib import rstring
- result = w_obj.unicode_w(self)
+ result = w_obj.utf8_w(self).decode('utf8')
if u'\x00' in result:
raise oefmt(self.w_TypeError,
"argument must be a unicode string without NUL "
"characters")
return rstring.assert_str0(result)
- def realunicode_w(self, w_obj):
- # Like unicode_w(), but only works if w_obj is really of type
- # 'unicode'. On Python 3 this is the same as unicode_w().
+ def convert_arg_to_w_unicode(self, w_obj, strict=None):
+ # XXX why convert_to_w_unicode does something slightly different?
+ from pypy.objspace.std.unicodeobject import W_UnicodeObject
+ assert not hasattr(self, 'is_fake_objspace')
+ return W_UnicodeObject.convert_arg_to_w_unicode(self, w_obj, strict)
+
+ def utf8_len_w(self, w_obj):
+ w_obj = self.convert_arg_to_w_unicode(w_obj)
+ return w_obj._utf8, w_obj._len()
+
+ def realutf8_w(self, w_obj):
+ # Like utf8_w(), but only works if w_obj is really of type
+ # 'unicode'. On Python 3 this is the same as utf8_w().
if not self.isinstance_w(w_obj, self.w_unicode):
raise oefmt(self.w_TypeError, "argument must be a unicode")
- return self.unicode_w(w_obj)
+ return self.utf8_w(w_obj)
def bool_w(self, w_obj):
# Unwraps a bool, also accepting an int for compatibility.
@@ -2187,7 +2210,7 @@
'float_w',
'uint_w',
'bigint_w',
- 'unicode_w',
+ 'utf8_w',
'unwrap',
'is_true',
'is_w',
diff --git a/pypy/interpreter/gateway.py b/pypy/interpreter/gateway.py
--- a/pypy/interpreter/gateway.py
+++ b/pypy/interpreter/gateway.py
@@ -160,6 +160,9 @@
def visit_text0(self, el, app_sig):
self.checked_space_method(el, app_sig)
+ def visit_utf8(self, el, app_sig):
+ self.checked_space_method(el, app_sig)
+
def visit_fsencode(self, el, app_sig):
self.checked_space_method(el, app_sig)
@@ -304,6 +307,9 @@
def visit_text0(self, typ):
self.run_args.append("space.text0_w(%s)" % (self.scopenext(),))
+ def visit_utf8(self, typ):
+ self.run_args.append("space.utf8_w(%s)" % (self.scopenext(),))
+
def visit_fsencode(self, typ):
self.run_args.append("space.fsencode_w(%s)" % (self.scopenext(),))
@@ -469,6 +475,9 @@
def visit_text0(self, typ):
self.unwrap.append("space.text0_w(%s)" % (self.nextarg(),))
+ def visit_utf8(self, typ):
+ self.unwrap.append("space.utf8_w(%s)" % (self.nextarg(),))
+
def visit_fsencode(self, typ):
self.unwrap.append("space.fsencode_w(%s)" % (self.nextarg(),))
@@ -533,10 +542,10 @@
def int_unwrapping_space_method(typ):
- assert typ in (int, str, float, unicode, r_longlong, r_uint, r_ulonglong,
bool)
+ assert typ in (int, str, float, r_longlong, r_uint, r_ulonglong, bool)
if typ is r_int is r_longlong:
return 'gateway_r_longlong_w'
- elif typ in (str, unicode, bool):
+ elif typ in (str, bool):
return typ.__name__ + '_w'
else:
return 'gateway_' + typ.__name__ + '_w'
diff --git a/pypy/interpreter/pyparser/parsestring.py
b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -1,3 +1,4 @@
+from rpython.rlib import rutf8
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter import unicodehelper
from rpython.rlib.rstring import StringBuilder
@@ -51,18 +52,20 @@
'unmatched triple quotes in literal')
q -= 2
- if unicode_literal: # XXX Py_UnicodeFlag is ignored for now
+ if unicode_literal:
if encoding is None or encoding == "iso-8859-1":
# 'unicode_escape' expects latin-1 bytes, string is ready.
assert 0 <= ps <= q
substr = s[ps:q]
else:
+ unicodehelper.check_utf8_or_raise(space, s, ps, q)
substr = decode_unicode_utf8(space, s, ps, q)
if rawmode:
- v = unicodehelper.decode_raw_unicode_escape(space, substr)
+ r = unicodehelper.decode_raw_unicode_escape(space, substr)
else:
- v = unicodehelper.decode_unicode_escape(space, substr)
- return space.newunicode(v)
+ r = unicodehelper.decode_unicode_escape(space, substr)
+ v, length = r
+ return space.newutf8(v, length)
need_encoding = (encoding is not None and
encoding != "utf-8" and encoding != "utf8" and
@@ -71,7 +74,8 @@
substr = s[ps : q]
if rawmode or '\\' not in s[ps:]:
if need_encoding:
- w_u = space.newunicode(unicodehelper.decode_utf8(space, substr))
+ lgt = unicodehelper.check_utf8_or_raise(space, substr)
+ w_u = space.newutf8(substr, lgt)
w_v = unicodehelper.encode(space, w_u, encoding)
return w_v
else:
@@ -101,15 +105,12 @@
# the backslash we just wrote, we emit "\u005c"
# instead.
lis.append("u005c")
- if ord(s[ps]) & 0x80: # XXX inefficient
- w, ps = decode_utf8(space, s, ps, end)
- for c in w:
- # The equivalent of %08x, which is not supported by RPython.
- # 7 zeroes are enough for the unicode range, and the
- # result still fits in 32-bit.
- hexa = hex(ord(c) + 0x10000000)
- lis.append('\\U0')
- lis.append(hexa[3:]) # Skip 0x and the leading 1
+ if ord(s[ps]) & 0x80:
+ cp = rutf8.codepoint_at_pos(s, ps)
+ hexa = hex(cp + 0x10000000)
+ lis.append('\\U0')
+ lis.append(hexa[3:]) # Skip 0x and the leading 1
+ ps = rutf8.next_codepoint_pos(s, ps)
else:
lis.append(s[ps])
ps += 1
@@ -215,20 +216,29 @@
ch >= 'A' and ch <= 'F')
-def decode_utf8(space, s, ps, end):
+def check_utf8(space, s, ps, end):
assert ps >= 0
pt = ps
# while (s < end && *s != '\\') s++; */ /* inefficient for u".."
while ps < end and ord(s[ps]) & 0x80:
ps += 1
- u = unicodehelper.decode_utf8(space, s[pt:ps])
- return u, ps
+ try:
+ rutf8.check_utf8(s, True, pt, ps)
+ except rutf8.CheckError as e:
+ lgt, flag = rutf8.check_utf8(s, True, pt, e.pos)
+ unicodehelper.decode_error_handler(space)('strict', 'utf8',
+ 'invalid utf-8', s, pt + lgt, pt + lgt + 1)
+ return s[pt:ps]
def decode_utf8_recode(space, s, ps, end, recode_encoding):
- u, ps = decode_utf8(space, s, ps, end)
- w_v = unicodehelper.encode(space, space.newunicode(u), recode_encoding)
+ p = ps
+ while p < end and ord(s[p]) & 0x80:
+ p += 1
+ lgt = unicodehelper.check_utf8_or_raise(space, s, ps, p)
+ w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt),
+ recode_encoding)
v = space.bytes_w(w_v)
- return v, ps
+ return v, p
def raise_app_valueerror(space, msg):
raise OperationError(space.w_ValueError, space.newtext(msg))
diff --git a/pypy/interpreter/pyparser/test/test_parsestring.py
b/pypy/interpreter/pyparser/test/test_parsestring.py
--- a/pypy/interpreter/pyparser/test/test_parsestring.py
+++ b/pypy/interpreter/pyparser/test/test_parsestring.py
@@ -10,7 +10,7 @@
assert space.str_w(w_ret) == value
elif isinstance(value, unicode):
assert space.type(w_ret) == space.w_unicode
- assert space.unicode_w(w_ret) == value
+ assert space.utf8_w(w_ret).decode('utf8') == value
else:
assert False
@@ -50,7 +50,7 @@
s = "u'\x81'"
s = s.decode("koi8-u").encode("utf8")
w_ret = parsestring.parsestr(self.space, 'koi8-u', s)
- ret = space.unwrap(w_ret)
+ ret = w_ret._utf8.decode('utf8')
assert ret == eval("# -*- coding: koi8-u -*-\nu'\x81'")
def test_unicode_literals(self):
@@ -102,7 +102,4 @@
def test_decode_unicode_utf8(self):
buf = parsestring.decode_unicode_utf8(self.space,
'u"\xf0\x9f\x92\x8b"', 2, 6)
- if sys.maxunicode == 65535:
- assert buf == r"\U0000d83d\U0000dc8b"
- else:
- assert buf == r"\U0001f48b"
+ assert buf == r"\U0001f48b"
diff --git a/pypy/interpreter/test/test_argument.py
b/pypy/interpreter/test/test_argument.py
--- a/pypy/interpreter/test/test_argument.py
+++ b/pypy/interpreter/test/test_argument.py
@@ -54,6 +54,9 @@
pass
class DummySpace(object):
+ class sys:
+ defaultencoding = 'utf-8'
+
def newtuple(self, items):
return tuple(items)
diff --git a/pypy/interpreter/test/test_gateway.py
b/pypy/interpreter/test/test_gateway.py
--- a/pypy/interpreter/test/test_gateway.py
+++ b/pypy/interpreter/test/test_gateway.py
@@ -535,25 +535,33 @@
w_app_g3_r = space.wrap(app_g3_r)
raises(gateway.OperationError,space.call_function,w_app_g3_r,w(1.0))
- def test_interp2app_unwrap_spec_unicode(self):
+ def test_interp2app_unwrap_spec_utf8(self):
space = self.space
w = space.wrap
- def g3_u(space, uni):
- return space.wrap(len(uni))
+ def g3_u(space, utf8):
+ return space.wrap(utf8)
app_g3_u = gateway.interp2app_temp(g3_u,
unwrap_spec=[gateway.ObjSpace,
- unicode])
+ 'utf8'])
w_app_g3_u = space.wrap(app_g3_u)
+ encoded = u"gęść".encode('utf8')
assert self.space.eq_w(
- space.call_function(w_app_g3_u, w(u"foo")),
- w(3))
+ space.call_function(w_app_g3_u, w(u"gęść")),
+ w(encoded))
assert self.space.eq_w(
- space.call_function(w_app_g3_u, w("baz")),
- w(3))
+ space.call_function(w_app_g3_u, w("foo")),
+ w("foo"))
raises(gateway.OperationError, space.call_function, w_app_g3_u,
w(None))
raises(gateway.OperationError, space.call_function, w_app_g3_u,
w(42))
+ # XXX this part of the test seems wrong, why would "\x80" fail?
+ # w_ascii = space.appexec([], """():
+ # import sys
+ # return sys.getdefaultencoding() == 'ascii'""")
+ # if space.is_true(w_ascii):
+ # raises(gateway.OperationError, space.call_function, w_app_g3_u,
+ # w("\x80"))
def test_interp2app_unwrap_spec_unwrapper(self):
space = self.space
diff --git a/pypy/interpreter/test/test_objspace.py
b/pypy/interpreter/test/test_objspace.py
--- a/pypy/interpreter/test/test_objspace.py
+++ b/pypy/interpreter/test/test_objspace.py
@@ -216,9 +216,7 @@
space = self.space
w = space.wrap
assert space.text0_w(w("123")) == "123"
- exc = space.raises_w(space.w_TypeError, space.text0_w, w("123\x004"))
- assert space.unicode0_w(w(u"123")) == u"123"
- exc = space.raises_w(space.w_TypeError, space.unicode0_w,
w(u"123\x004"))
+ space.raises_w(space.w_TypeError, space.text0_w, w("123\x004"))
def test_getindex_w(self):
w_instance1 = self.space.appexec([], """():
diff --git a/pypy/interpreter/test/test_unicodehelper.py
b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,53 +1,93 @@
import pytest
+try:
+ from hypothesis import given, strategies
+ HAS_HYPOTHESIS = True
+except ImportError:
+ HAS_HYPOTHESIS = False
import struct
import sys
-from pypy.interpreter.unicodehelper import (
- encode_utf8, decode_utf8, unicode_encode_utf_32_be)
-class FakeSpace:
- pass
+from rpython.rlib import rutf8
-def test_encode_utf8():
- space = FakeSpace()
- assert encode_utf8(space, u"abc") == "abc"
- assert encode_utf8(space, u"\u1234") == "\xe1\x88\xb4"
- assert encode_utf8(space, u"\ud800") == "\xed\xa0\x80"
- assert encode_utf8(space, u"\udc00") == "\xed\xb0\x80"
- # for the following test, go to lengths to avoid CPython's optimizer
- # and .pyc file storage, which collapse the two surrogates into one
- c = u"\udc00"
- assert encode_utf8(space, u"\ud800" + c) == "\xf0\x90\x80\x80"
+from pypy.interpreter.unicodehelper import str_decode_utf8
+from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
+from pypy.interpreter import unicodehelper as uh
+from pypy.module._codecs.interp_codecs import CodecState
+
+def decode_utf8(u):
+ return str_decode_utf8(u, True, "strict", None)
def test_decode_utf8():
- space = FakeSpace()
- assert decode_utf8(space, "abc") == u"abc"
- assert decode_utf8(space, "\xe1\x88\xb4") == u"\u1234"
- assert decode_utf8(space, "\xed\xa0\x80") == u"\ud800"
- assert decode_utf8(space, "\xed\xb0\x80") == u"\udc00"
- got = decode_utf8(space, "\xed\xa0\x80\xed\xb0\x80")
- assert map(ord, got) == [0xd800, 0xdc00]
- got = decode_utf8(space, "\xf0\x90\x80\x80")
- if sys.maxunicode > 65535:
- assert map(ord, got) == [0x10000]
- else:
- assert map(ord, got) == [55296, 56320]
+ assert decode_utf8("abc") == ("abc", 3, 3)
+ assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 3, 1)
+ assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 3, 1)
+ assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 3, 1)
+ assert decode_utf8("\xed\xa0\x80\xed\xb0\x80") == (
+ "\xed\xa0\x80\xed\xb0\x80", 6, 2)
+ assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 4, 1)
[email protected]('unich', [u"\ud800", u"\udc80"])
-def test_utf32_surrogates(unich):
- assert (unicode_encode_utf_32_be(unich, 1, None) ==
- struct.pack('>i', ord(unich)))
- with pytest.raises(UnicodeEncodeError):
- unicode_encode_utf_32_be(unich, 1, None, allow_surrogates=False)
+def test_utf8_encode_ascii():
+ assert utf8_encode_ascii("abc", "??", "??") == "abc"
+ def eh(errors, encoding, reason, p, start, end):
+ lst.append((errors, encoding, p, start, end))
+ return "<FOO>", end
+ lst = []
+ input = u"\u1234".encode("utf8")
+ assert utf8_encode_ascii(input, "??", eh) == "<FOO>"
+ assert lst == [("??", "ascii", input, 0, 1)]
+ lst = []
+ input = u"\u1234\u5678abc\u8765\u4321".encode("utf8")
+ assert utf8_encode_ascii(input, "??", eh) == "<FOO>abc<FOO>"
+ assert lst == [("??", "ascii", input, 0, 2),
+ ("??", "ascii", input, 5, 7)]
- def replace_with(ru, rs):
- def errorhandler(errors, enc, msg, u, startingpos, endingpos):
- if errors == 'strict':
- raise UnicodeEncodeError(enc, u, startingpos, endingpos, msg)
- return ru, rs, endingpos
- return unicode_encode_utf_32_be(
- u"<%s>" % unich, 3, None,
- errorhandler, allow_surrogates=False)
+if HAS_HYPOTHESIS:
+ @given(strategies.text())
+ def test_utf8_encode_ascii_2(u):
+ def eh(errors, encoding, reason, p, start, end):
+ return "?" * (end - start), end
- assert replace_with(u'rep', None) == u'<rep>'.encode('utf-32-be')
- assert (replace_with(None, '\xca\xfe\xca\xfe') ==
- '\x00\x00\x00<\xca\xfe\xca\xfe\x00\x00\x00>')
+ assert utf8_encode_ascii(u.encode("utf8"),
+ "replace", eh) == u.encode("ascii", "replace")
+
+def test_str_decode_ascii():
+ assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3)
+ def eh(errors, encoding, reason, p, start, end):
+ lst.append((errors, encoding, p, start, end))
+ return u"\u1234\u5678".encode("utf8"), end
+ lst = []
+ input = "\xe8"
+ exp = u"\u1234\u5678".encode("utf8")
+ assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2)
+ assert lst == [("??", "ascii", input, 0, 1)]
+ lst = []
+ input = "\xe8\xe9abc\xea\xeb"
+ assert str_decode_ascii(input, "??", True, eh) == (
+ exp + exp + "abc" + exp + exp, 7, 11)
+ assert lst == [("??", "ascii", input, 0, 1),
+ ("??", "ascii", input, 1, 2),
+ ("??", "ascii", input, 5, 6),
+ ("??", "ascii", input, 6, 7)]
+if HAS_HYPOTHESIS:
+ @given(strategies.text())
+ def test_unicode_raw_escape(u):
+ r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict', None)
+ assert r == u.encode("raw-unicode-escape")
+
+ @given(strategies.text())
+ def test_unicode_escape(u):
+ r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None)
+ assert r == u.encode("unicode-escape")
+
+def test_encode_decimal(space):
+ assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 '
+ with pytest.raises(ValueError):
+ uh.unicode_encode_decimal(u' 12, \u1234 '.encode('utf8'), None)
+ state = space.fromcache(CodecState)
+ handler = state.encode_error_handler
+ assert uh.unicode_encode_decimal(
+ u'u\u1234\u1235v'.encode('utf8'), 'replace', handler) == 'u??v'
+
+ result = uh.unicode_encode_decimal(
+ u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler)
+ assert result == '12ሴ'
diff --git a/pypy/interpreter/unicodehelper.py
b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,11 +1,12 @@
+import sys
+
+from pypy.interpreter.error import OperationError, oefmt
from rpython.rlib.objectmodel import specialize
-from rpython.rlib.rarithmetic import intmask
-from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
-from rpython.rlib import runicode
-from rpython.rlib.runicode import (
- default_unicode_error_encode, default_unicode_error_decode,
- MAXUNICODE, BYTEORDER, BYTEORDER2, UNICHR)
-from pypy.interpreter.error import OperationError
+from rpython.rlib.rstring import StringBuilder
+from rpython.rlib import rutf8
+from rpython.rlib.rarithmetic import r_uint, intmask
+from rpython.rtyper.lltypesystem import rffi
+from pypy.module.unicodedata import unicodedb
@specialize.memo()
def decode_error_handler(space):
@@ -20,90 +21,982 @@
space.newtext(msg)]))
return raise_unicode_exception_decode
+def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
+ assert startingpos >= 0
+ ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]]
+ return ''.join(ux), endingpos, 'b'
+
@specialize.memo()
def encode_error_handler(space):
# Fast version of the "strict" errors handler.
- def raise_unicode_exception_encode(errors, encoding, msg, u,
+ def raise_unicode_exception_encode(errors, encoding, msg, utf8,
startingpos, endingpos):
+ u_len = rutf8.get_utf8_length(utf8)
raise OperationError(space.w_UnicodeEncodeError,
space.newtuple([space.newtext(encoding),
- space.newunicode(u),
+ space.newutf8(utf8, u_len),
space.newint(startingpos),
space.newint(endingpos),
space.newtext(msg)]))
return raise_unicode_exception_encode
+def default_error_encode(
+ errors, encoding, msg, u, startingpos, endingpos):
+ """A default handler, for tests"""
+ assert endingpos >= 0
+ if errors == 'replace':
+ return '?', endingpos
+ if errors == 'ignore':
+ return '', endingpos
+ raise ValueError
+
# ____________________________________________________________
+_WIN32 = sys.platform == 'win32'
+_MACOSX = sys.platform == 'darwin'
+
def encode(space, w_data, encoding=None, errors='strict'):
from pypy.objspace.std.unicodeobject import encode_object
return encode_object(space, w_data, encoding, errors)
-# These functions take and return unwrapped rpython strings and unicodes
+
+def _has_surrogate(u):
+ for c in u:
+ if 0xD800 <= ord(c) <= 0xDFFF:
+ return True
+ return False
+
+# These functions take and return unwrapped rpython strings
def decode_unicode_escape(space, string):
from pypy.module._codecs import interp_codecs
state = space.fromcache(interp_codecs.CodecState)
unicodedata_handler = state.get_unicodedata_handler(space)
- result, consumed = runicode.str_decode_unicode_escape(
- string, len(string), "strict",
- final=True, errorhandler=decode_error_handler(space),
- unicodedata_handler=unicodedata_handler)
- return result
+ result_utf8, consumed, length = str_decode_unicode_escape(
+ string, "strict",
+ final=True,
+ errorhandler=decode_error_handler(space),
+ ud_handler=unicodedata_handler)
+ return result_utf8, length
def decode_raw_unicode_escape(space, string):
- result, consumed = runicode.str_decode_raw_unicode_escape(
- string, len(string), "strict",
+ result_utf8, consumed, lgt = str_decode_raw_unicode_escape(
+ string, "strict",
final=True, errorhandler=decode_error_handler(space))
- return result
+ return result_utf8, lgt
-def decode_utf8(space, string):
+def check_ascii_or_raise(space, string):
+ try:
+ rutf8.check_ascii(string)
+ except rutf8.CheckError as e:
+ decode_error_handler(space)('strict', 'ascii',
+ 'ordinal not in range(128)', string,
+ e.pos, e.pos + 1)
+ assert False, "unreachable"
+
+def check_utf8_or_raise(space, string, start=0, end=-1):
# Surrogates are accepted and not treated specially at all.
# If there happen to be two 3-bytes encoding a pair of surrogates,
# you still get two surrogate unicode characters in the result.
# These are the Python2 rules; Python3 differs.
- result, consumed = runicode.str_decode_utf_8(
- string, len(string), "strict",
- final=True, errorhandler=decode_error_handler(space),
- allow_surrogates=True)
- return result
+ try:
+ length = rutf8.check_utf8(string, True, start, end)
+ except rutf8.CheckError as e:
+ # convert position into unicode position
+ lgt = rutf8.check_utf8(string, True, start, stop=e.pos)
+ decode_error_handler(space)('strict', 'utf8', 'invalid utf-8', string,
+ start + lgt, start + lgt + 1)
+ assert False, "unreachable"
+ return length
-def encode_utf8(space, uni):
- # Note that this function never raises UnicodeEncodeError,
- # since surrogates are allowed, either paired or lone.
- # A paired surrogate is considered like the non-BMP character
- # it stands for. These are the Python2 rules; Python3 differs.
+def str_decode_ascii(s, errors, final, errorhandler):
+ try:
+ rutf8.check_ascii(s)
+ return s, len(s), len(s)
+ except rutf8.CheckError:
+ return _str_decode_ascii_slowpath(s, errors, final, errorhandler)
+
+def _str_decode_ascii_slowpath(s, errors, final, errorhandler):
+ i = 0
+ res = StringBuilder()
+ while i < len(s):
+ ch = s[i]
+ if ord(ch) > 0x7F:
+ r, i = errorhandler(errors, 'ascii', 'ordinal not in range(128)',
+ s, i, i + 1)
+ res.append(r)
+ else:
+ res.append(ch)
+ i += 1
+ ress = res.build()
+ lgt = rutf8.check_utf8(ress, True)
+ return ress, len(s), lgt
+
+def str_decode_latin_1(s, errors, final, errorhandler):
+ try:
+ rutf8.check_ascii(s)
+ return s, len(s), len(s)
+ except rutf8.CheckError:
+ return _str_decode_latin_1_slowpath(s, errors, final, errorhandler)
+
+def _str_decode_latin_1_slowpath(s, errors, final, errorhandler):
+ res = StringBuilder(len(s))
+ i = 0
+ while i < len(s):
+ if ord(s[i]) > 0x7F:
+ while i < len(s) and ord(s[i]) > 0x7F:
+ rutf8.unichr_as_utf8_append(res, ord(s[i]))
+ i += 1
+ else:
+ start = i
+ end = i + 1
+ while end < len(s) and ord(s[end]) <= 0x7F:
+ end += 1
+ res.append_slice(s, start, end)
+ i = end
+ # cannot be ASCII, cannot have surrogates, I believe
+ return res.build(), len(s), len(s)
+
+def utf8_encode_latin_1(s, errors, errorhandler):
+ try:
+ rutf8.check_ascii(s)
+ return s
+ except rutf8.CheckError:
+ return _utf8_encode_latin_1_slowpath(s, errors, errorhandler)
+
+def _utf8_encode_latin_1_slowpath(s, errors, errorhandler):
+ size = len(s)
+ result = StringBuilder(size)
+ index = 0
+ pos = 0
+ while pos < size:
+ ch = rutf8.codepoint_at_pos(s, pos)
+ if ch <= 0xFF:
+ result.append(chr(ch))
+ index += 1
+ pos = rutf8.next_codepoint_pos(s, pos)
+ else:
+ startindex = index
+ pos = rutf8.next_codepoint_pos(s, pos)
+ index += 1
+ while pos < size and rutf8.codepoint_at_pos(s, pos) > 0xFF:
+ pos = rutf8.next_codepoint_pos(s, pos)
+ index += 1
+ msg = "ordinal not in range(256)"
+ res_8, newindex = errorhandler(
+ errors, 'latin1', msg, s, startindex, index)
+ for cp in rutf8.Utf8StringIterator(res_8):
+ if cp > 0xFF:
+ errorhandler("strict", 'latin1', msg, s, startindex, index)
+ result.append(chr(cp))
+ if index != newindex: # Should be uncommon
+ index = newindex
+ pos = rutf8._pos_at_index(s, newindex)
+ return result.build()
+
+def utf8_encode_ascii(s, errors, errorhandler):
+ """ Don't be confused - this is a slowpath for errors e.g. "ignore"
+ or an obscure errorhandler
+ """
+ size = len(s)
+ result = StringBuilder(size)
+ index = 0
+ pos = 0
+ while pos < size:
+ ch = rutf8.codepoint_at_pos(s, pos)
+ if ch <= 0x7F:
+ result.append(chr(ch))
+ index += 1
+ pos = rutf8.next_codepoint_pos(s, pos)
+ else:
+ startindex = index
+ pos = rutf8.next_codepoint_pos(s, pos)
+ index += 1
+ while pos < size and rutf8.codepoint_at_pos(s, pos) > 0x7F:
+ pos = rutf8.next_codepoint_pos(s, pos)
+ index += 1
+ msg = "ordinal not in range(128)"
+ res_8, newindex = errorhandler(
+ errors, 'ascii', msg, s, startindex, index)
+ for cp in rutf8.Utf8StringIterator(res_8):
+ if cp > 0x7F:
+ errorhandler("strict", 'ascii', msg, s, startindex, index)
+ result.append(chr(cp))
+ if index != newindex: # Should be uncommon
+ index = newindex
+ pos = rutf8._pos_at_index(s, newindex)
+ return result.build()
+
+if sys.platform == 'win32':
+ def utf8_encode_mbcs(s, errors, errorhandler):
+ from rpython.rlib import runicode
+ s = s.decode('utf-8')
+ slen = len(s)
+ res = runicode.unicode_encode_mbcs(s, slen, errors, errorhandler)
+ return res
+
+ def str_decode_mbcs(s, errors, final, errorhandler):
+ from rpython.rlib import runicode
+ slen = len(s)
+ res, size = runicode.str_decode_mbcs(s, slen, final=final,
errors=errors,
+ errorhandler=errorhandler)
+ return res.encode('utf8'), size, len(res)
+
+def str_decode_utf8(s, errors, final, errorhandler):
+ """ Same as checking for the valid utf8, but we know the utf8 is not
+ valid so we're trying to either raise or pack stuff with error handler.
+ The key difference is that this is call_may_force
+ """
+ slen = len(s)
+ res = StringBuilder(slen)
+ pos = 0
+ end = len(s)
+ suppressing = False # we are in a chain of "bad" unicode, only emit one fix
+ while pos < end:
+ ordch1 = ord(s[pos])
+ # fast path for ASCII
+ if ordch1 <= 0x7F:
+ pos += 1
+ res.append(chr(ordch1))
+ suppressing = False
+ continue
+
+ if ordch1 <= 0xC1:
+ r, pos = errorhandler(errors, "utf8", "invalid start byte",
+ s, pos, pos + 1)
+ if not suppressing:
+ res.append(r)
+ continue
+
+ pos += 1
+
+ if ordch1 <= 0xDF:
+ if pos >= end:
+ if not final:
+ pos -= 1
+ break
+ r, pos = errorhandler(errors, "utf8", "unexpected end of data",
+ s, pos - 1, pos)
+ if not suppressing:
+ res.append(r)
+ continue
+ ordch2 = ord(s[pos])
+
+ if rutf8._invalid_byte_2_of_2(ordch2):
+ r, pos = errorhandler(errors, "utf8", "invalid continuation
byte",
+ s, pos - 1, pos)
+ if not suppressing:
+ res.append(r)
+ continue
+ # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
+ pos += 1
+ res.append(chr(ordch1))
+ res.append(chr(ordch2))
+ continue
+
+ if ordch1 <= 0xEF:
+ if (pos + 2) > end:
+ if not final:
+ pos -= 1
+ break
+ r, pos = errorhandler(errors, "utf8", "unexpected end of data",
+ s, pos - 1, pos)
+ res.append(r)
+ suppressing = True
+ continue
+ ordch2 = ord(s[pos])
+ ordch3 = ord(s[pos + 1])
+
+ if rutf8._invalid_byte_2_of_3(ordch1, ordch2, True):
+ r, pos = errorhandler(errors, "utf8", "invalid continuation
byte",
+ s, pos - 1, pos)
+ if not suppressing:
+ res.append(r)
+ continue
+ elif rutf8._invalid_byte_3_of_3(ordch3):
+ r, pos = errorhandler(errors, "utf8", "invalid continuation
byte",
+ s, pos - 1, pos + 1)
+ if not suppressing:
+ res.append(r)
+ continue
+ pos += 2
+
+ # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
+ res.append(chr(ordch1))
+ res.append(chr(ordch2))
+ res.append(chr(ordch3))
+ suppressing = False
+ continue
+
+ if ordch1 <= 0xF4:
+ if (pos + 3) > end:
+ if not final:
+ pos -= 1
+ break
+ r, pos = errorhandler(errors, "utf8", "unexpected end of data",
+ s, pos - 1, pos)
+ res.append(r)
+ suppressing = True
+ continue
+ ordch2 = ord(s[pos])
+ ordch3 = ord(s[pos + 1])
+ ordch4 = ord(s[pos + 2])
+
+ if rutf8._invalid_byte_2_of_4(ordch1, ordch2):
+ r, pos = errorhandler(errors, "utf8", "invalid continuation
byte",
+ s, pos - 1, pos)
+ if not suppressing:
+ res.append(r)
+ continue
+ elif rutf8._invalid_byte_3_of_4(ordch3):
+ r, pos = errorhandler(errors, "utf8", "invalid continuation
byte",
+ s, pos - 1, pos + 1)
+ res.append(r)
+ continue
+ elif rutf8._invalid_byte_4_of_4(ordch4):
+ r, pos = errorhandler(errors, "utf8", "invalid continuation
byte",
+ s, pos - 1, pos + 2)
+ if not suppressing:
+ res.append(r)
+ continue
+
+ pos += 3
+ # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz
+ res.append(chr(ordch1))
+ res.append(chr(ordch2))
+ res.append(chr(ordch3))
+ res.append(chr(ordch4))
+ suppressing = False
+ continue
+
+ r, pos = errorhandler(errors, "utf8", "invalid start byte",
+ s, pos - 1, pos)
+ if not suppressing:
+ res.append(r)
+
+ r = res.build()
+ return r, pos, rutf8.check_utf8(r, True)
+
hexdigits = "0123456789ABCDEFabcdef"

def hexescape(builder, s, pos, digits,
              encoding, errorhandler, message, errors):
    """Parse `digits` hex digits of `s` starting at `pos` and append the
    resulting codepoint to `builder`.

    On truncated or malformed input the error handler is called with a
    span starting two characters earlier, at the backslash escape the
    caller already consumed.  Returns the new position in `s`.
    """
    # Renamed from 'chr' to avoid shadowing the builtin of the same name.
    code = 0
    if pos + digits > len(s):
        # Truncated escape: report up to the last hex digit present.
        endinpos = pos
        while endinpos < len(s) and s[endinpos] in hexdigits:
            endinpos += 1
        res, pos = errorhandler(
            errors, encoding, message, s, pos - 2, endinpos)
        builder.append(res)
    else:
        try:
            code = int(s[pos:pos + digits], 16)
        except ValueError:
            # A non-hex character appeared inside the escape.  Bound the
            # scan explicitly, for consistency with the branch above.
            endinpos = pos
            while endinpos < len(s) and s[endinpos] in hexdigits:
                endinpos += 1
            res, pos = errorhandler(
                errors, encoding, message, s, pos - 2, endinpos)
            builder.append(res)
        else:
            # when we get here, code is a 32-bit unicode character
            try:
                builder.append_code(code)
                pos += digits
            except ValueError:
                # the builder rejected an out-of-range codepoint
                message = "illegal Unicode character"
                res, pos = errorhandler(
                    errors, encoding, message, s, pos - 2, pos + digits)
                builder.append(res)
    return pos
+
def str_decode_unicode_escape(s, errors, final, errorhandler, ud_handler):
    """Decode a unicode-escape byte string `s` into utf8.

    Returns (utf8 result, final position in `s`, result length in
    codepoints).  `ud_handler`, if not None, is an object whose .call(name)
    returns the codepoint for a \\N{name} escape, or a negative value if
    the name is unknown.
    NOTE(review): `final` is accepted but never read in this body.
    """
    size = len(s)
    if size == 0:
        return '', 0, 0

    builder = rutf8.Utf8StringBuilder(size)
    pos = 0
    while pos < size:
        ch = s[pos]

        # Non-escape characters are interpreted as Unicode ordinals
        if ch != '\\':
            if ord(ch) > 0x7F:
                builder.append_code(ord(ch))
            else:
                builder.append(ch)
            pos += 1
            continue

        # - Escapes
        pos += 1
        if pos >= size:
            # lone backslash at end of input
            message = "\\ at end of string"
            res, pos = errorhandler(errors, "unicodeescape",
                                    message, s, pos - 1, size)
            builder.append(res)
            continue

        ch = s[pos]
        pos += 1
        # \x escapes
        if ch == '\n':
            # escaped newline is swallowed entirely
            pass
        elif ch == '\\':
            builder.append_char('\\')
        elif ch == '\'':
            builder.append_char('\'')
        elif ch == '\"':
            builder.append_char('\"')
        elif ch == 'b':
            builder.append_char('\b')
        elif ch == 'f':
            builder.append_char('\f')
        elif ch == 't':
            builder.append_char('\t')
        elif ch == 'n':
            builder.append_char('\n')
        elif ch == 'r':
            builder.append_char('\r')
        elif ch == 'v':
            builder.append_char('\v')
        elif ch == 'a':
            builder.append_char('\a')
        elif '0' <= ch <= '7':
            # octal escape: up to three octal digits
            x = ord(ch) - ord('0')
            if pos < size:
                ch = s[pos]
                if '0' <= ch <= '7':
                    pos += 1
                    x = (x << 3) + ord(ch) - ord('0')
                    if pos < size:
                        ch = s[pos]
                        if '0' <= ch <= '7':
                            pos += 1
                            x = (x << 3) + ord(ch) - ord('0')
            if x > 0x7F:
                builder.append_code(x)
            else:
                builder.append_char(chr(x))
        # hex escapes
        # \xXX
        elif ch == 'x':
            digits = 2
            message = "truncated \\xXX escape"
            pos = hexescape(builder, s, pos, digits,
                            "unicodeescape", errorhandler, message, errors)
        # \uXXXX
        elif ch == 'u':
            digits = 4
            message = "truncated \\uXXXX escape"
            pos = hexescape(builder, s, pos, digits,
                            "unicodeescape", errorhandler, message, errors)
        # \UXXXXXXXX
        elif ch == 'U':
            digits = 8
            message = "truncated \\UXXXXXXXX escape"
            pos = hexescape(builder, s, pos, digits,
                            "unicodeescape", errorhandler, message, errors)
        # \N{name}
        elif ch == 'N' and ud_handler is not None:
            message = "malformed \\N character escape"
            look = pos

            if look < size and s[look] == '{':
                # look for the closing brace
                while look < size and s[look] != '}':
                    look += 1
                if look < size and s[look] == '}':
                    # found a name. look it up in the unicode database
                    message = "unknown Unicode character name"
                    name = s[pos + 1:look]
                    code = ud_handler.call(name)
                    if code < 0:
                        res, pos = errorhandler(
                            errors, "unicodeescape", message,
                            s, pos - 1, look + 1)
                        builder.append(res)
                        continue
                    pos = look + 1
                    builder.append_code(code)
                else:
                    # no closing '}' before end of input
                    res, pos = errorhandler(errors, "unicodeescape",
                                            message, s, pos - 1, look + 1)
                    builder.append(res)
            else:
                # no opening '{' after \N
                res, pos = errorhandler(errors, "unicodeescape",
                                        message, s, pos - 1, look + 1)
                builder.append(res)
        else:
            # unknown escape: keep the backslash and the character
            builder.append_char('\\')
            builder.append_code(ord(ch))

    return builder.build(), pos, builder.getlength()
+
def wcharpsize2utf8(space, wcharp, size):
    """Safe version of rffi.wcharpsize2utf8.

    Raises app-level ValueError if any wchar value is outside the valid
    codepoint range.
    """
    try:
        result = rffi.wcharpsize2utf8(wcharp, size)
    except ValueError:
        # translate the RPython-level error into an app-level one
        raise oefmt(space.w_ValueError,
                    "character is not in range [U+0000; U+10ffff]")
    return result
+
+
+# ____________________________________________________________
+# Raw unicode escape
+
def str_decode_raw_unicode_escape(s, errors, final=False,
                                  errorhandler=None):
    """Decode a raw-unicode-escape byte string `s` into utf8.

    Only \\uXXXX / \\UXXXXXXXX escapes are interpreted, and only when
    preceded by an odd number of backslashes.  Returns (utf8 result,
    final position, result length in codepoints).
    """
    size = len(s)
    if size == 0:
        return '', 0, 0

    builder = rutf8.Utf8StringBuilder(size)
    pos = 0
    while pos < size:
        ch = s[pos]

        # Non-escape characters are interpreted as Unicode ordinals
        if ch != '\\':
            builder.append_code(ord(ch))
            pos += 1
            continue

        # \u-escapes are only interpreted iff the number of leading
        # backslashes is odd
        bs = pos
        while pos < size:
            pos += 1
            if pos == size or s[pos] != '\\':
                break
            # each paired backslash in the run is emitted literally
            builder.append_char('\\')

        # we have a backslash at the end of the string, stop here
        if pos >= size:
            builder.append_char('\\')
            break

        if ((pos - bs) & 1 == 0 or pos >= size or
                (s[pos] != 'u' and s[pos] != 'U')):
            # even run of backslashes, or not followed by u/U:
            # the final backslash and next character are literal
            builder.append_char('\\')
            builder.append_code(ord(s[pos]))
            pos += 1
            continue

        digits = 4 if s[pos] == 'u' else 8
        message = "truncated \\uXXXX"
        pos += 1
        pos = hexescape(builder, s, pos, digits,
                        "rawunicodeescape", errorhandler, message, errors)

    return builder.build(), pos, builder.getlength()
+
+_utf8_encode_unicode_escape = rutf8.make_utf8_escape_function()
+
+
TABLE = '0123456789abcdef'

def raw_unicode_escape_helper(result, char):
    """Append the \\xXX / \\uXXXX / \\UXXXXXXXX escape for codepoint
    `char` to `result`, choosing the narrowest form that fits."""
    if char >= 0x10000 or char < 0:
        prefix, zeros = "\\U", 8
    elif char >= 0x100:
        prefix, zeros = "\\u", 4
    else:
        prefix, zeros = "\\x", 2
    result.append(prefix)
    # emit the hex digits most-significant-nibble first
    shift = 4 * (zeros - 1)
    while shift >= 0:
        result.append(TABLE[(char >> shift) & 0x0f])
        shift -= 4
+
def utf8_encode_raw_unicode_escape(s, errors, errorhandler):
    # errorhandler is not used: this function cannot cause Unicode errors
    size = len(s)
    if size == 0:
        return ''
    result = StringBuilder(size)
    pos = 0
    while pos < size:
        cp = rutf8.codepoint_at_pos(s, pos)
        if cp >= 0x100:
            # non-latin-1 codepoints become \u / \U escapes
            raw_unicode_escape_helper(result, cp)
        else:
            result.append(chr(cp))
        pos = rutf8.next_codepoint_pos(s, pos)
    return result.build()
+
+
def utf8_encode_unicode_escape(s, errors, errorhandler):
    # 'errors' and 'errorhandler' are accepted only for signature
    # compatibility with the other utf8_encode_* functions; they are not
    # used here (presumably the generated escape function cannot fail --
    # cf. utf8_encode_raw_unicode_escape above; confirm).
    return _utf8_encode_unicode_escape(s)
+
+# ____________________________________________________________
+# utf-7
+
+# Three simple macros defining base-64
+
+def _utf7_IS_BASE64(oc):
+ "Is c a base-64 character?"
+ c = chr(oc)
+ return c.isalnum() or c == '+' or c == '/'
+def _utf7_TO_BASE64(n):
+ "Returns the base-64 character of the bottom 6 bits of n"
+ return
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[n & 0x3f]
+def _utf7_FROM_BASE64(c):
+ "given that c is a base-64 character, what is its base-64 value?"
+ if c >= 'a':
+ return ord(c) - 71
+ elif c >= 'A':
+ return ord(c) - 65
+ elif c >= '0':
+ return ord(c) + 4
+ elif c == '+':
+ return 62
+ else: # c == '/'
+ return 63
+
+def _utf7_DECODE_DIRECT(oc):
+ return oc <= 127 and oc != ord('+')
+
+# The UTF-7 encoder treats ASCII characters differently according to
+# whether they are Set D, Set O, Whitespace, or special (i.e. none of
+# the above). See RFC2152. This array identifies these different
+# sets:
+# 0 : "Set D"
+# alphanumeric and '(),-./:?
+# 1 : "Set O"
+# !"#$%&*;<=>@[]^_`{|}
+# 2 : "whitespace"
+# ht nl cr sp
+# 3 : special (must be base64 encoded)
+# everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
+
+utf7_category = [
+# nul soh stx etx eot enq ack bel bs ht nl vt np cr so si
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
+# dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+# sp ! " # $ % & ' ( ) * + , - . /
+ 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
+# 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
+# @ A B C D E F G H I J K L M N O
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+# P Q R S T U V W X Y Z [ \ ] ^ _
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
+# ` a b c d e f g h i j k l m n o
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+# p q r s t u v w x y z { | } ~ del
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
+]
+
+# ENCODE_DIRECT: this character should be encoded as itself. The
+# answer depends on whether we are encoding set O as itself, and also
+# on whether we are encoding whitespace as itself. RFC2152 makes it
+# clear that the answers to these questions vary between
+# applications, so this code needs to be flexible.
+
def _utf7_ENCODE_DIRECT(oc, directO, directWS):
    # Encode as itself when in "Set D", or optionally when whitespace
    # (directWS) or in "Set O" (directO).  See the utf7_category table.
    if oc <= 0 or oc >= 128:
        return False
    cat = utf7_category[oc]
    if cat == 0:
        return True
    if directWS and cat == 2:
        return True
    return directO and cat == 1
+
def _utf7_ENCODE_CHAR(result, oc, base64bits, base64buffer):
    # Feed one codepoint `oc` into the base-64 accumulator, flushing every
    # complete 6-bit group into `result`.  Codepoints >= 0x10000 are first
    # split into a UTF-16 surrogate pair.  Returns the updated
    # (base64bits, base64buffer) pair.
    if oc >= 0x10000:
        # code first surrogate
        base64bits += 16
        base64buffer = (base64buffer << 16) | 0xd800 | ((oc-0x10000) >> 10)
        while base64bits >= 6:
            result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6)))
            base64bits -= 6
        # prepare second surrogate
        oc = 0xDC00 | ((oc-0x10000) & 0x3FF)
    base64bits += 16
    base64buffer = (base64buffer << 16) | oc
    while base64bits >= 6:
        result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6)))
        base64bits -= 6
    return base64bits, base64buffer
+
def str_decode_utf_7(s, errors, final=False,
                     errorhandler=None):
    """Decode the UTF-7 byte string `s` into utf8.

    Returns (utf8 result, position consumed in `s`, result length in
    codepoints).  When `final` is false and the input ends inside an
    unterminated base-64 shift sequence, the sequence is backed out and
    its input left unconsumed for the next call.
    """
    size = len(s)
    if size == 0:
        return '', 0, 0

    inShift = False        # inside a '+'...base-64 section?
    base64bits = 0         # pending bit count in base64buffer
    base64buffer = 0       # pending bits, most significant first
    surrogate = 0          # pending UTF-16 high surrogate, or 0
    outsize = 0            # result length in codepoints

    result = StringBuilder(size)
    pos = 0
    shiftOutStartPos = 0   # result length when the current shift began
    startinpos = 0         # input position of the current '+' / bad char
    while pos < size:
        ch = s[pos]

        if inShift: # in a base-64 section
            if _utf7_IS_BASE64(ord(ch)): #consume a base-64 character
                base64buffer = (base64buffer << 6) | _utf7_FROM_BASE64(ch)
                assert base64buffer >= 0
                base64bits += 6
                pos += 1

                if base64bits >= 16:
                    # enough bits for a UTF-16 value
                    outCh = base64buffer >> (base64bits - 16)
                    assert outCh >= 0
                    base64bits -= 16
                    base64buffer &= (1 << base64bits) - 1 # clear high bits
                    assert outCh <= 0xffff
                    if surrogate:
                        # expecting a second surrogate
                        if outCh >= 0xDC00 and outCh <= 0xDFFF:
                            # valid pair: combine into one codepoint
                            code = (((surrogate & 0x3FF)<<10) |
                                    (outCh & 0x3FF)) + 0x10000
                            rutf8.unichr_as_utf8_append(result, code)
                            outsize += 1
                            surrogate = 0
                            continue
                        else:
                            # lone high surrogate is emitted as-is
                            rutf8.unichr_as_utf8_append(result, surrogate,
                                                        allow_surrogates=True)
                            outsize += 1
                            surrogate = 0
                            # Not done with outCh: falls through below
                    if outCh >= 0xD800 and outCh <= 0xDBFF:
                        # first surrogate
                        surrogate = outCh
                    else:
                        outsize += 1
                        assert outCh >= 0
                        rutf8.unichr_as_utf8_append(result, outCh, True)

            else:
                # now leaving a base-64 section
                inShift = False

                if base64bits > 0: # left-over bits
                    if base64bits >= 6:
                        # We've seen at least one base-64 character
                        pos += 1
                        msg = "partial character in shift sequence"
                        res, pos = errorhandler(errors, 'utf7',
                                                msg, s, pos-1, pos)
                        reslen = rutf8.check_utf8(res, True)
                        outsize += reslen
                        result.append(res)
                        continue
                    else:
                        # Some bits remain; they should be zero
                        if base64buffer != 0:
                            pos += 1
                            msg = "non-zero padding bits in shift sequence"
                            res, pos = errorhandler(errors, 'utf7',
                                                    msg, s, pos-1, pos)
                            reslen = rutf8.check_utf8(res, True)
                            outsize += reslen
                            result.append(res)
                            continue

                if surrogate and _utf7_DECODE_DIRECT(ord(ch)):
                    # flush a pending lone surrogate
                    outsize += 1
                    rutf8.unichr_as_utf8_append(result, surrogate, True)
                    surrogate = 0

                if ch == '-':
                    # '-' is absorbed; other terminating characters are
                    # preserved
                    pos += 1

        elif ch == '+':
            startinpos = pos
            pos += 1 # consume '+'
            if pos < size and s[pos] == '-': # '+-' encodes '+'
                pos += 1
                result.append('+')
                outsize += 1
            else: # begin base64-encoded section
                inShift = 1
                surrogate = 0
                shiftOutStartPos = result.getlength()
                base64bits = 0
                base64buffer = 0

        elif _utf7_DECODE_DIRECT(ord(ch)): # character decodes at itself
            result.append(ch)
            outsize += 1
            pos += 1
        else:
            startinpos = pos
            pos += 1
            msg = "unexpected special character"
            res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos)
            reslen = rutf8.check_utf8(res, True)
            outsize += reslen
            result.append(res)

    # end of string
    final_length = result.getlength()
    if inShift and final: # in shift sequence, no more to follow
        # if we're in an inconsistent state, that's an error
        inShift = 0
        if (surrogate or
            base64bits >= 6 or
            (base64bits > 0 and base64buffer != 0)):
            msg = "unterminated shift sequence"
            res, pos = errorhandler(errors, 'utf7', msg, s, shiftOutStartPos,
                                    pos)
            reslen = rutf8.check_utf8(res, True)
            outsize += reslen
            result.append(res)
            final_length = result.getlength()
    elif inShift:
        # not final: back out the partial shift sequence entirely
        pos = startinpos
        final_length = shiftOutStartPos # back off output

    assert final_length >= 0
    return result.build()[:final_length], pos, outsize
+
def utf8_encode_utf_7(s, errors, errorhandler):
    """Encode the utf8 string `s` as UTF-7 bytes.

    `errors` and `errorhandler` are accepted for signature compatibility
    but are never consulted in this body.
    """
    size = len(s)
    if size == 0:
        return ''
    result = StringBuilder(size)

    # Whether to encode "Set O" / whitespace directly; both disabled here.
    encodeSetO = encodeWhiteSpace = False

    inShift = False        # currently inside a '+'...base-64 section?
    base64bits = 0         # pending bit count in base64buffer
    base64buffer = 0       # pending bits, most significant first

    pos = 0
    while pos < size:
        oc = rutf8.codepoint_at_pos(s, pos)
        if not inShift:
            if oc == ord('+'):
                # literal '+' is written as '+-'
                result.append('+-')
            elif _utf7_ENCODE_DIRECT(oc, not encodeSetO, not encodeWhiteSpace):
                result.append(chr(oc))
            else:
                # open a base-64 section
                result.append('+')
                inShift = True
                base64bits, base64buffer = _utf7_ENCODE_CHAR(
                    result, oc, base64bits, base64buffer)
        else:
            if _utf7_ENCODE_DIRECT(oc, not encodeSetO, not encodeWhiteSpace):
                # shifting out
                if base64bits: # output remaining bits
                    result.append(_utf7_TO_BASE64(base64buffer <<
                                                  (6-base64bits)))
                    base64buffer = 0
                    base64bits = 0

                inShift = False
                ## Characters not in the BASE64 set implicitly unshift the
                ## sequence so no '-' is required, except if the character is
                ## itself a '-'
                if _utf7_IS_BASE64(oc) or oc == ord('-'):
                    result.append('-')
                result.append(chr(oc))
            else:
                base64bits, base64buffer = _utf7_ENCODE_CHAR(
                    result, oc, base64bits, base64buffer)
        pos = rutf8.next_codepoint_pos(s, pos)

    # flush any pending bits and close an open base-64 section
    if base64bits:
        result.append(_utf7_TO_BASE64(base64buffer << (6 - base64bits)))
    if inShift:
        result.append('-')

    return result.build()
+
@specialize.memo()
def _encode_unicode_error_handler(space):
    # Fast version of the "strict" errors handler: always raises an
    # app-level UnicodeEncodeError.  Memoized per-space by RPython.
    from rpython.rlib import runicode
    def raise_unicode_exception_encode(errors, encoding, msg, uni,
                                       startingpos, endingpos):
        assert isinstance(uni, unicode)
        u_len = len(uni)
        # surrogate-preserving encoding of the offending text for the
        # exception object
        utf8 = runicode.unicode_encode_utf8sp(uni, u_len)
        raise OperationError(space.w_UnicodeEncodeError,
                             space.newtuple([space.newtext(encoding),
                                             space.newtext(utf8, u_len),
                                             space.newint(startingpos),
                                             space.newint(endingpos),
                                             space.newtext(msg)]))
        # NOTE(review): unreachable after the raise above -- presumably
        # kept so the RPython annotator sees a return type; confirm.
        return u'', None, 0
    return raise_unicode_exception_encode
+
+
+def encode_utf8(space, uni, allow_surrogates=False):
+ # Note that Python3 tends to forbid *all* surrogates in utf-8.
+ # If allow_surrogates=True, then revert to the Python 2 behavior
+ # which never raises UnicodeEncodeError. Surrogate pairs are then
+ # allowed, either paired or lone. A paired surrogate is considered
+ # like the non-BMP character it stands for. See also *_utf8sp().
+ from rpython.rlib import runicode
+ assert isinstance(uni, unicode)
return runicode.unicode_encode_utf_8(
uni, len(uni), "strict",
- errorhandler=None,
- allow_surrogates=True)
+ errorhandler=_encode_unicode_error_handler(space),
+ allow_surrogates=allow_surrogates)
+
def encode_utf8sp(space, uni, allow_surrogates=True):
    # NOTE(review): 'xxx' below is an unbound name -- calling this function
    # raises NameError at runtime (and fails RPython translation if it is
    # reachable).  It looks like a deliberate merge placeholder from the
    # unicode-utf8 branch; confirm before relying on this function.
    xxx
    # Surrogate-preserving utf-8 encoding. Any surrogate character
    # turns into its 3-bytes encoding, whether it is paired or not.
    # This should always be reversible, and the reverse is
    # decode_utf8sp().
    from rpython.rlib import runicode
    return runicode.unicode_encode_utf8sp(uni, len(uni))
+
def decode_utf8sp(space, string):
    # Surrogate-preserving utf-8 decoding.  Assuming there is no
    # encoding error, it should always be reversible, and the reverse is
    # encode_utf8sp().
    # Decodes with allow_surrogates=True; decode_never_raise presumably
    # never raises, so malformed bytes are handled in-band -- confirm.
    return str_decode_utf8(string, "string", True, decode_never_raise,
                           allow_surrogates=True)
+
# ____________________________________________________________
# utf-16
-def str_decode_utf_16(s, size, errors, final=True,
+BYTEORDER = sys.byteorder
+BYTEORDER2 = BYTEORDER[0] + 'e' # either "le" or "be"
+assert BYTEORDER2 in ('le', 'be')
+
+def str_decode_utf_16(s, errors, final=True,
errorhandler=None):
- result, length, byteorder = str_decode_utf_16_helper(s, size, errors,
final,
+ result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final,
errorhandler,
"native")
- return result, length
+ return result, c, lgt
-def str_decode_utf_16_be(s, size, errors, final=True,
+def str_decode_utf_16_be(s, errors, final=True,
+ errorhandler=None):
+ result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final,
+ errorhandler, "big")
+ return result, c, lgt
+
+def str_decode_utf_16_le(s, errors, final=True,
errorhandler=None):
- result, length, byteorder = str_decode_utf_16_helper(s, size, errors,
final,
- errorhandler, "big")
- return result, length
+ result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final,
+ errorhandler,
"little")
+ return result, c, lgt
-def str_decode_utf_16_le(s, size, errors, final=True,
- errorhandler=None):
- result, length, byteorder = str_decode_utf_16_helper(s, size, errors,
final,
- errorhandler,
"little")
- return result, length
-
-def str_decode_utf_16_helper(s, size, errors, final=True,
+def str_decode_utf_16_helper(s, errors, final=True,
errorhandler=None,
byteorder="native",
public_encoding_name='utf16'):
- if errorhandler is None:
- errorhandler = default_unicode_error_decode
+ size = len(s)
bo = 0
if BYTEORDER == 'little':
@@ -140,7 +1033,7 @@
else:
bo = 1
if size == 0:
- return u'', 0, bo
+ return '', 0, 0, bo
if bo == -1:
# force little endian
ihi = 1
@@ -151,7 +1044,7 @@
ihi = 0
ilo = 1
- result = UnicodeBuilder(size // 2)
+ result = StringBuilder(size // 2)
#XXX I think the errors are not correctly handled here
while pos < size:
@@ -168,7 +1061,7 @@
ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
pos += 2
if ch < 0xD800 or ch > 0xDFFF:
- result.append(unichr(ch))
+ rutf8.unichr_as_utf8_append(result, ch)
continue
# UTF-16 code pair:
if len(s) - pos < 2:
@@ -185,12 +1078,8 @@
ch2 = (ord(s[pos+ihi]) << 8) | ord(s[pos+ilo])
pos += 2
if 0xDC00 <= ch2 <= 0xDFFF:
- if MAXUNICODE < 65536:
- result.append(unichr(ch))
- result.append(unichr(ch2))
- else:
- result.append(UNICHR((((ch & 0x3FF)<<10) |
- (ch2 & 0x3FF)) + 0x10000))
+ ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000
+ rutf8.unichr_as_utf8_append(result, ch)
continue
else:
r, pos = errorhandler(errors, public_encoding_name,
@@ -202,7 +1091,9 @@
"illegal encoding",
s, pos - 2, pos)
result.append(r)
- return result.build(), pos, bo
+ r = result.build()
+ lgt = rutf8.check_utf8(r, True)
+ return result.build(), pos, lgt, bo
def _STORECHAR(result, CH, byteorder):
hi = chr(((CH) >> 8) & 0xff)
@@ -214,13 +1105,12 @@
result.append(hi)
result.append(lo)
-def unicode_encode_utf_16_helper(s, size, errors,
+def unicode_encode_utf_16_helper(s, errors,
errorhandler=None,
allow_surrogates=True,
byteorder='little',
public_encoding_name='utf16'):
- if errorhandler is None:
- errorhandler = default_unicode_error_encode
+ size = len(s)
if size == 0:
if byteorder == 'native':
result = StringBuilder(2)
@@ -234,9 +1124,9 @@
byteorder = BYTEORDER
pos = 0
+ index = 0
while pos < size:
- ch = ord(s[pos])
- pos += 1
+ ch = rutf8.codepoint_at_pos(s, pos)
if ch < 0xD800:
_STORECHAR(result, ch, byteorder)
@@ -246,78 +1136,76 @@
elif ch >= 0xE000 or allow_surrogates:
_STORECHAR(result, ch, byteorder)
else:
- ru, rs, pos = errorhandler(errors, public_encoding_name,
- 'surrogates not allowed',
- s, pos-1, pos)
- if rs is not None:
- # py3k only
- if len(rs) % 2 != 0:
- errorhandler('strict', public_encoding_name,
- 'surrogates not allowed',
- s, pos-1, pos)
- result.append(rs)
- continue
- for ch in ru:
- if ord(ch) < 0xD800:
- _STORECHAR(result, ord(ch), byteorder)
+ res_8, newindex = errorhandler(
+ errors, public_encoding_name, 'surrogates not allowed',
+ s, pos, pos+1)
+ for cp in rutf8.Utf8StringIterator(res_8):
+ if cp < 0xD800:
+ _STORECHAR(result, cp, byteorder)
else:
errorhandler('strict', public_encoding_name,
'surrogates not allowed',
- s, pos-1, pos)
+ s, pos, pos+1)
+ if index != newindex: # Should be uncommon
+ index = newindex
+ pos = rutf8._pos_at_index(s, newindex)
continue
+ pos = rutf8.next_codepoint_pos(s, pos)
+ index += 1
+
return result.build()
-def unicode_encode_utf_16(s, size, errors,
+def utf8_encode_utf_16(s, errors,
errorhandler=None,
allow_surrogates=True):
- return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
+ return unicode_encode_utf_16_helper(s, errors, errorhandler,
allow_surrogates, "native")
-def unicode_encode_utf_16_be(s, size, errors,
+def utf8_encode_utf_16_be(s, errors,
errorhandler=None,
allow_surrogates=True):
- return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
+ return unicode_encode_utf_16_helper(s, errors, errorhandler,
allow_surrogates, "big")
-def unicode_encode_utf_16_le(s, size, errors,
+def utf8_encode_utf_16_le(s, errors,
errorhandler=None,
allow_surrogates=True):
- return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
+ return unicode_encode_utf_16_helper(s, errors, errorhandler,
allow_surrogates, "little")
-
# ____________________________________________________________
# utf-32
-def str_decode_utf_32(s, size, errors, final=True,
+def str_decode_utf_32(s, errors, final=True,
errorhandler=None):
- result, length, byteorder = str_decode_utf_32_helper(
- s, size, errors, final, errorhandler, "native")
- return result, length
+ result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final,
+ errorhandler,
"native")
+ return result, c, lgt
-def str_decode_utf_32_be(s, size, errors, final=True,
+def str_decode_utf_32_be(s, errors, final=True,
errorhandler=None):
- result, length, byteorder = str_decode_utf_32_helper(
- s, size, errors, final, errorhandler, "big")
- return result, length
+ result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final,
+ errorhandler, "big")
+ return result, c, lgt
-def str_decode_utf_32_le(s, size, errors, final=True,
+def str_decode_utf_32_le(s, errors, final=True,
errorhandler=None):
- result, length, byteorder = str_decode_utf_32_helper(
- s, size, errors, final, errorhandler, "little")
- return result, length
+ result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final,
+ errorhandler,
"little")
+ return result, c, lgt
-BOM32_DIRECT = intmask(0x0000FEFF)
+BOM32_DIRECT = intmask(0x0000FEFF)
BOM32_REVERSE = intmask(0xFFFE0000)
-def str_decode_utf_32_helper(s, size, errors, final=True,
- errorhandler=None,
+def str_decode_utf_32_helper(s, errors, final,
+ errorhandler,
byteorder="native",
- public_encoding_name='utf32'):
- if errorhandler is None:
- errorhandler = default_unicode_error_decode
+ public_encoding_name='utf32',
+ allow_surrogates=True):
+ assert errorhandler is not None
bo = 0
+ size = len(s)
if BYTEORDER == 'little':
iorder = [0, 1, 2, 3]
@@ -353,7 +1241,7 @@
else:
bo = 1
if size == 0:
- return u'', 0, bo
+ return '', 0, 0, bo
if bo == -1:
# force little endian
iorder = [0, 1, 2, 3]
@@ -361,7 +1249,7 @@
# force big endian
iorder = [3, 2, 1, 0]
- result = UnicodeBuilder(size // 4)
+ result = StringBuilder(size // 4)
while pos < size:
# remaining bytes at the end? (size should be divisible by 4)
@@ -376,22 +1264,26 @@
break
continue
ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) <<
16) |
- (ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
- if ch >= 0x110000:
+ (ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
+ if not allow_surrogates and 0xD800 <= ch <= 0xDFFF:
+ r, pos = errorhandler(errors, public_encoding_name,
+ "code point in surrogate code point "
+ "range(0xd800, 0xe000)",
+ s, pos, pos + 4)
+ result.append(r)
+ continue
+ elif ch >= 0x110000:
r, pos = errorhandler(errors, public_encoding_name,
"codepoint not in range(0x110000)",
s, pos, len(s))
result.append(r)
continue
- if MAXUNICODE < 65536 and ch >= 0x10000:
- ch -= 0x10000L
- result.append(unichr(0xD800 + (ch >> 10)))
- result.append(unichr(0xDC00 + (ch & 0x03FF)))
- else:
- result.append(UNICHR(ch))
+ rutf8.unichr_as_utf8_append(result, ch,
allow_surrogates=allow_surrogates)
pos += 4
- return result.build(), pos, bo
+ r = result.build()
+ lgt = rutf8.check_utf8(r, True)
+ return r, pos, lgt, bo
def _STORECHAR32(result, CH, byteorder):
c0 = chr(((CH) >> 24) & 0xff)
@@ -409,13 +1301,12 @@
result.append(c2)
result.append(c3)
-def unicode_encode_utf_32_helper(s, size, errors,
+def unicode_encode_utf_32_helper(s, errors,
errorhandler=None,
allow_surrogates=True,
byteorder='little',
public_encoding_name='utf32'):
- if errorhandler is None:
- errorhandler = default_unicode_error_encode
+ size = len(s)
if size == 0:
if byteorder == 'native':
result = StringBuilder(4)
@@ -429,50 +1320,253 @@
byteorder = BYTEORDER
pos = 0
+ index = 0
while pos < size:
- ch = ord(s[pos])
- pos += 1
- ch2 = 0
+ ch = rutf8.codepoint_at_pos(s, pos)
+ pos = rutf8.next_codepoint_pos(s, pos)
if not allow_surrogates and 0xD800 <= ch < 0xE000:
- ru, rs, pos = errorhandler(
+ res_8, newindex = errorhandler(
errors, public_encoding_name, 'surrogates not allowed',
s, pos - 1, pos)
- if rs is not None:
- # py3k only
- if len(rs) % 4 != 0:
+ for ch in rutf8.Utf8StringIterator(res_8):
+ if ch < 0xD800:
+ _STORECHAR32(result, ch, byteorder)
+ else:
errorhandler(
'strict', public_encoding_name, 'surrogates not
allowed',
s, pos - 1, pos)
- result.append(rs)
- continue
- for ch in ru:
- if ord(ch) < 0xD800:
- _STORECHAR32(result, ord(ch), byteorder)
- else:
- errorhandler(
- 'strict', public_encoding_name,
- 'surrogates not allowed', s, pos - 1, pos)
+ if index != newindex: # Should be uncommon
+ index = newindex
+ pos = rutf8._pos_at_index(s, newindex)
continue
- if 0xD800 <= ch < 0xDC00 and MAXUNICODE < 65536 and pos < size:
- ch2 = ord(s[pos])
- if 0xDC00 <= ch2 < 0xE000:
- ch = (((ch & 0x3FF) << 10) | (ch2 & 0x3FF)) + 0x10000
- pos += 1
_STORECHAR32(result, ch, byteorder)
+ index += 1
return result.build()
-def unicode_encode_utf_32(s, size, errors,
+def utf8_encode_utf_32(s, errors,
errorhandler=None, allow_surrogates=True):
- return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+ return unicode_encode_utf_32_helper(s, errors, errorhandler,
allow_surrogates, "native")
-def unicode_encode_utf_32_be(s, size, errors,
+def utf8_encode_utf_32_be(s, errors,
errorhandler=None, allow_surrogates=True):
- return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+ return unicode_encode_utf_32_helper(s, errors, errorhandler,
allow_surrogates, "big")
-def unicode_encode_utf_32_le(s, size, errors,
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit