Author: Matti Picus <matti.pi...@gmail.com> Branch: Changeset: r96003:ba081fb468f4 Date: 2019-02-13 23:11 +0200 http://bitbucket.org/pypy/pypy/changeset/ba081fb468f4/
Log: merge unicode-utf8 into default diff too long, truncating to 2000 out of 15164 lines diff --git a/TODO b/TODO new file mode 100644 --- /dev/null +++ b/TODO @@ -0,0 +1,4 @@ +* find a better way to run "find" without creating the index storage, + if one is not already readily available (understand cost now, improve after merge) +* improve performance of splitlines +* think about cost of utf8 list strategy (Armin and CF) diff --git a/lib-python/2.7/test/test_memoryio.py b/lib-python/2.7/test/test_memoryio.py --- a/lib-python/2.7/test/test_memoryio.py +++ b/lib-python/2.7/test/test_memoryio.py @@ -712,6 +712,7 @@ # XXX: For the Python version of io.StringIO, this is highly # dependent on the encoding used for the underlying buffer. + @support.cpython_only def test_widechar(self): buf = self.buftype("\U0002030a\U00020347") memio = self.ioclass(buf) diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst --- a/pypy/doc/whatsnew-head.rst +++ b/pypy/doc/whatsnew-head.rst @@ -29,7 +29,11 @@ Improve register allocation in the JIT. - .. branch: promote-unicode Implement rlib.jit.promote_unicode to complement promote_string + +.. branch: unicode-utf8 + +Use utf8 internally to represent unicode, with the goal of never using rpython-level unicode + diff --git a/pypy/interpreter/argument.py b/pypy/interpreter/argument.py --- a/pypy/interpreter/argument.py +++ b/pypy/interpreter/argument.py @@ -535,24 +535,26 @@ if num_remainingkwds == 1: for i in range(len(keywords)): if i not in kwds_mapping: - name = keywords[i] - if name is None: - # We'll assume it's unicode. Encode it. - # Careful, I *think* it should not be possible to - # get an IndexError here but you never know. - try: - if keyword_names_w is None: - raise IndexError - # note: negative-based indexing from the end - w_name = keyword_names_w[i - len(keywords)] - except IndexError: + name = '?' + # We'll assume it's unicode. Encode it. 
+ # Careful, I *think* it should not be possible to + # get an IndexError here but you never know. + try: + if keyword_names_w is None: + raise IndexError + # note: negative-based indexing from the end + w_name = keyword_names_w[i - len(keywords)] + except IndexError: + if keywords is None: name = '?' else: - w_enc = space.newtext(space.sys.defaultencoding) - w_err = space.newtext("replace") - w_name = space.call_method(w_name, "encode", w_enc, - w_err) - name = space.text_w(w_name) + name = keywords[i] + else: + w_enc = space.newtext(space.sys.defaultencoding) + w_err = space.newtext("replace") + w_name = space.call_method(w_name, "encode", w_enc, + w_err) + name = space.text_w(w_name) break self.kwd_name = name diff --git a/pypy/interpreter/astcompiler/optimize.py b/pypy/interpreter/astcompiler/optimize.py --- a/pypy/interpreter/astcompiler/optimize.py +++ b/pypy/interpreter/astcompiler/optimize.py @@ -5,7 +5,7 @@ from pypy.tool import stdlib_opcode as ops from pypy.interpreter.error import OperationError from rpython.rlib.unroll import unrolling_iterable -from rpython.rlib.runicode import MAXUNICODE +from rpython.rlib.rutf8 import MAXUNICODE from rpython.rlib.objectmodel import specialize diff --git a/pypy/interpreter/astcompiler/test/test_compiler.py b/pypy/interpreter/astcompiler/test/test_compiler.py --- a/pypy/interpreter/astcompiler/test/test_compiler.py +++ b/pypy/interpreter/astcompiler/test/test_compiler.py @@ -975,9 +975,6 @@ class AppTestCompiler: - def setup_class(cls): - cls.w_maxunicode = cls.space.wrap(sys.maxunicode) - def test_docstring_not_loaded(self): import StringIO, dis, sys ns = {} @@ -1027,7 +1024,7 @@ import sys d = {} exec '# -*- coding: utf-8 -*-\n\nu = u"\xf0\x9f\x92\x8b"' in d - if sys.maxunicode > 65535 and self.maxunicode > 65535: + if sys.maxunicode > 65535: expected_length = 1 else: expected_length = 2 diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py --- a/pypy/interpreter/baseobjspace.py +++ 
b/pypy/interpreter/baseobjspace.py @@ -3,7 +3,7 @@ from rpython.rlib.cache import Cache from rpython.tool.uid import HUGEVAL_BYTES -from rpython.rlib import jit, types +from rpython.rlib import jit, types, rutf8 from rpython.rlib.debug import make_sure_not_resized from rpython.rlib.objectmodel import (we_are_translated, newlist_hint, compute_unique_id, specialize, not_rpython) @@ -283,7 +283,10 @@ def str_w(self, space): self._typed_unwrap_error(space, "string") - def unicode_w(self, space): + def utf8_w(self, space): + self._typed_unwrap_error(space, "unicode") + + def convert_to_w_unicode(self, space): self._typed_unwrap_error(space, "unicode") def bytearray_list_of_chars_w(self, space): @@ -1103,7 +1106,7 @@ """ return None - def listview_unicode(self, w_list): + def listview_utf8(self, w_list): """ Return a list of unwrapped unicode out of a list of unicode. If the argument is not a list or does not contain only unicode, return None. May return None anyway. @@ -1133,8 +1136,15 @@ def newlist_bytes(self, list_s): return self.newlist([self.newbytes(s) for s in list_s]) - def newlist_unicode(self, list_u): - return self.newlist([self.newunicode(u) for u in list_u]) + def newlist_utf8(self, list_u, is_ascii): + l_w = [None] * len(list_u) + for i, item in enumerate(list_u): + if not is_ascii: + length = rutf8.check_utf8(item, True) + else: + length = len(item) + l_w[i] = self.newutf8(item, length) + return self.newlist(l_w) def newlist_int(self, list_i): return self.newlist([self.newint(i) for i in list_i]) @@ -1661,6 +1671,8 @@ # needed because CPython has the same issue. (Well, it's # unclear if there is any use at all for getting the bytes in # the unicode buffer.) 
+ if self.isinstance_w(w_obj, self.w_unicode): + return w_obj.charbuf_w(self) try: return self.bytes_w(w_obj) except OperationError as e: @@ -1802,27 +1814,38 @@ raise oefmt(self.w_TypeError, "argument must be a string") return self.bytes_w(w_obj) - @specialize.argtype(1) - def unicode_w(self, w_obj): - assert w_obj is not None - return w_obj.unicode_w(self) + def utf8_w(self, w_obj): + return w_obj.utf8_w(self) + + def convert_to_w_unicode(self, w_obj): + return w_obj.convert_to_w_unicode(self) def unicode0_w(self, w_obj): "Like unicode_w, but rejects strings with NUL bytes." from rpython.rlib import rstring - result = w_obj.unicode_w(self) + result = w_obj.utf8_w(self).decode('utf8') if u'\x00' in result: raise oefmt(self.w_TypeError, "argument must be a unicode string without NUL " "characters") return rstring.assert_str0(result) - def realunicode_w(self, w_obj): - # Like unicode_w(), but only works if w_obj is really of type - # 'unicode'. On Python 3 this is the same as unicode_w(). + def convert_arg_to_w_unicode(self, w_obj, strict=None): + # XXX why convert_to_w_unicode does something slightly different? + from pypy.objspace.std.unicodeobject import W_UnicodeObject + assert not hasattr(self, 'is_fake_objspace') + return W_UnicodeObject.convert_arg_to_w_unicode(self, w_obj, strict) + + def utf8_len_w(self, w_obj): + w_obj = self.convert_arg_to_w_unicode(w_obj) + return w_obj._utf8, w_obj._len() + + def realutf8_w(self, w_obj): + # Like utf8_w(), but only works if w_obj is really of type + # 'unicode'. On Python 3 this is the same as utf8_w(). if not self.isinstance_w(w_obj, self.w_unicode): raise oefmt(self.w_TypeError, "argument must be a unicode") - return self.unicode_w(w_obj) + return self.utf8_w(w_obj) def bool_w(self, w_obj): # Unwraps a bool, also accepting an int for compatibility. 
@@ -2187,7 +2210,7 @@ 'float_w', 'uint_w', 'bigint_w', - 'unicode_w', + 'utf8_w', 'unwrap', 'is_true', 'is_w', diff --git a/pypy/interpreter/gateway.py b/pypy/interpreter/gateway.py --- a/pypy/interpreter/gateway.py +++ b/pypy/interpreter/gateway.py @@ -160,6 +160,9 @@ def visit_text0(self, el, app_sig): self.checked_space_method(el, app_sig) + def visit_utf8(self, el, app_sig): + self.checked_space_method(el, app_sig) + def visit_fsencode(self, el, app_sig): self.checked_space_method(el, app_sig) @@ -304,6 +307,9 @@ def visit_text0(self, typ): self.run_args.append("space.text0_w(%s)" % (self.scopenext(),)) + def visit_utf8(self, typ): + self.run_args.append("space.utf8_w(%s)" % (self.scopenext(),)) + def visit_fsencode(self, typ): self.run_args.append("space.fsencode_w(%s)" % (self.scopenext(),)) @@ -469,6 +475,9 @@ def visit_text0(self, typ): self.unwrap.append("space.text0_w(%s)" % (self.nextarg(),)) + def visit_utf8(self, typ): + self.unwrap.append("space.utf8_w(%s)" % (self.nextarg(),)) + def visit_fsencode(self, typ): self.unwrap.append("space.fsencode_w(%s)" % (self.nextarg(),)) @@ -533,10 +542,10 @@ def int_unwrapping_space_method(typ): - assert typ in (int, str, float, unicode, r_longlong, r_uint, r_ulonglong, bool) + assert typ in (int, str, float, r_longlong, r_uint, r_ulonglong, bool) if typ is r_int is r_longlong: return 'gateway_r_longlong_w' - elif typ in (str, unicode, bool): + elif typ in (str, bool): return typ.__name__ + '_w' else: return 'gateway_' + typ.__name__ + '_w' diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py --- a/pypy/interpreter/pyparser/parsestring.py +++ b/pypy/interpreter/pyparser/parsestring.py @@ -1,3 +1,4 @@ +from rpython.rlib import rutf8 from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter import unicodehelper from rpython.rlib.rstring import StringBuilder @@ -51,18 +52,20 @@ 'unmatched triple quotes in literal') q -= 2 - if unicode_literal: # XXX 
Py_UnicodeFlag is ignored for now + if unicode_literal: if encoding is None or encoding == "iso-8859-1": # 'unicode_escape' expects latin-1 bytes, string is ready. assert 0 <= ps <= q substr = s[ps:q] else: + unicodehelper.check_utf8_or_raise(space, s, ps, q) substr = decode_unicode_utf8(space, s, ps, q) if rawmode: - v = unicodehelper.decode_raw_unicode_escape(space, substr) + r = unicodehelper.decode_raw_unicode_escape(space, substr) else: - v = unicodehelper.decode_unicode_escape(space, substr) - return space.newunicode(v) + r = unicodehelper.decode_unicode_escape(space, substr) + v, length = r + return space.newutf8(v, length) need_encoding = (encoding is not None and encoding != "utf-8" and encoding != "utf8" and @@ -71,7 +74,8 @@ substr = s[ps : q] if rawmode or '\\' not in s[ps:]: if need_encoding: - w_u = space.newunicode(unicodehelper.decode_utf8(space, substr)) + lgt = unicodehelper.check_utf8_or_raise(space, substr) + w_u = space.newutf8(substr, lgt) w_v = unicodehelper.encode(space, w_u, encoding) return w_v else: @@ -101,15 +105,12 @@ # the backslash we just wrote, we emit "\u005c" # instead. lis.append("u005c") - if ord(s[ps]) & 0x80: # XXX inefficient - w, ps = decode_utf8(space, s, ps, end) - for c in w: - # The equivalent of %08x, which is not supported by RPython. - # 7 zeroes are enough for the unicode range, and the - # result still fits in 32-bit. - hexa = hex(ord(c) + 0x10000000) - lis.append('\\U0') - lis.append(hexa[3:]) # Skip 0x and the leading 1 + if ord(s[ps]) & 0x80: + cp = rutf8.codepoint_at_pos(s, ps) + hexa = hex(cp + 0x10000000) + lis.append('\\U0') + lis.append(hexa[3:]) # Skip 0x and the leading 1 + ps = rutf8.next_codepoint_pos(s, ps) else: lis.append(s[ps]) ps += 1 @@ -215,20 +216,29 @@ ch >= 'A' and ch <= 'F') -def decode_utf8(space, s, ps, end): +def check_utf8(space, s, ps, end): assert ps >= 0 pt = ps # while (s < end && *s != '\\') s++; */ /* inefficient for u".." 
while ps < end and ord(s[ps]) & 0x80: ps += 1 - u = unicodehelper.decode_utf8(space, s[pt:ps]) - return u, ps + try: + rutf8.check_utf8(s, True, pt, ps) + except rutf8.CheckError as e: + lgt, flag = rutf8.check_utf8(s, True, pt, e.pos) + unicodehelper.decode_error_handler(space)('strict', 'utf8', + 'invalid utf-8', s, pt + lgt, pt + lgt + 1) + return s[pt:ps] def decode_utf8_recode(space, s, ps, end, recode_encoding): - u, ps = decode_utf8(space, s, ps, end) - w_v = unicodehelper.encode(space, space.newunicode(u), recode_encoding) + p = ps + while p < end and ord(s[p]) & 0x80: + p += 1 + lgt = unicodehelper.check_utf8_or_raise(space, s, ps, p) + w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt), + recode_encoding) v = space.bytes_w(w_v) - return v, ps + return v, p def raise_app_valueerror(space, msg): raise OperationError(space.w_ValueError, space.newtext(msg)) diff --git a/pypy/interpreter/pyparser/test/test_parsestring.py b/pypy/interpreter/pyparser/test/test_parsestring.py --- a/pypy/interpreter/pyparser/test/test_parsestring.py +++ b/pypy/interpreter/pyparser/test/test_parsestring.py @@ -10,7 +10,7 @@ assert space.str_w(w_ret) == value elif isinstance(value, unicode): assert space.type(w_ret) == space.w_unicode - assert space.unicode_w(w_ret) == value + assert space.utf8_w(w_ret).decode('utf8') == value else: assert False @@ -50,7 +50,7 @@ s = "u'\x81'" s = s.decode("koi8-u").encode("utf8") w_ret = parsestring.parsestr(self.space, 'koi8-u', s) - ret = space.unwrap(w_ret) + ret = w_ret._utf8.decode('utf8') assert ret == eval("# -*- coding: koi8-u -*-\nu'\x81'") def test_unicode_literals(self): @@ -102,7 +102,4 @@ def test_decode_unicode_utf8(self): buf = parsestring.decode_unicode_utf8(self.space, 'u"\xf0\x9f\x92\x8b"', 2, 6) - if sys.maxunicode == 65535: - assert buf == r"\U0000d83d\U0000dc8b" - else: - assert buf == r"\U0001f48b" + assert buf == r"\U0001f48b" diff --git a/pypy/interpreter/test/test_argument.py 
b/pypy/interpreter/test/test_argument.py --- a/pypy/interpreter/test/test_argument.py +++ b/pypy/interpreter/test/test_argument.py @@ -54,6 +54,9 @@ pass class DummySpace(object): + class sys: + defaultencoding = 'utf-8' + def newtuple(self, items): return tuple(items) diff --git a/pypy/interpreter/test/test_gateway.py b/pypy/interpreter/test/test_gateway.py --- a/pypy/interpreter/test/test_gateway.py +++ b/pypy/interpreter/test/test_gateway.py @@ -535,25 +535,33 @@ w_app_g3_r = space.wrap(app_g3_r) raises(gateway.OperationError,space.call_function,w_app_g3_r,w(1.0)) - def test_interp2app_unwrap_spec_unicode(self): + def test_interp2app_unwrap_spec_utf8(self): space = self.space w = space.wrap - def g3_u(space, uni): - return space.wrap(len(uni)) + def g3_u(space, utf8): + return space.wrap(utf8) app_g3_u = gateway.interp2app_temp(g3_u, unwrap_spec=[gateway.ObjSpace, - unicode]) + 'utf8']) w_app_g3_u = space.wrap(app_g3_u) + encoded = u"gęść".encode('utf8') assert self.space.eq_w( - space.call_function(w_app_g3_u, w(u"foo")), - w(3)) + space.call_function(w_app_g3_u, w(u"gęść")), + w(encoded)) assert self.space.eq_w( - space.call_function(w_app_g3_u, w("baz")), - w(3)) + space.call_function(w_app_g3_u, w("foo")), + w("foo")) raises(gateway.OperationError, space.call_function, w_app_g3_u, w(None)) raises(gateway.OperationError, space.call_function, w_app_g3_u, w(42)) + # XXX this part of the test seems wrong, why would "\x80" fail? 
+ # w_ascii = space.appexec([], """(): + # import sys + # return sys.getdefaultencoding() == 'ascii'""") + # if space.is_true(w_ascii): + # raises(gateway.OperationError, space.call_function, w_app_g3_u, + # w("\x80")) def test_interp2app_unwrap_spec_unwrapper(self): space = self.space diff --git a/pypy/interpreter/test/test_objspace.py b/pypy/interpreter/test/test_objspace.py --- a/pypy/interpreter/test/test_objspace.py +++ b/pypy/interpreter/test/test_objspace.py @@ -216,9 +216,7 @@ space = self.space w = space.wrap assert space.text0_w(w("123")) == "123" - exc = space.raises_w(space.w_TypeError, space.text0_w, w("123\x004")) - assert space.unicode0_w(w(u"123")) == u"123" - exc = space.raises_w(space.w_TypeError, space.unicode0_w, w(u"123\x004")) + space.raises_w(space.w_TypeError, space.text0_w, w("123\x004")) def test_getindex_w(self): w_instance1 = self.space.appexec([], """(): diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py --- a/pypy/interpreter/test/test_unicodehelper.py +++ b/pypy/interpreter/test/test_unicodehelper.py @@ -1,53 +1,93 @@ import pytest +try: + from hypothesis import given, strategies + HAS_HYPOTHESIS = True +except ImportError: + HAS_HYPOTHESIS = False import struct import sys -from pypy.interpreter.unicodehelper import ( - encode_utf8, decode_utf8, unicode_encode_utf_32_be) -class FakeSpace: - pass +from rpython.rlib import rutf8 -def test_encode_utf8(): - space = FakeSpace() - assert encode_utf8(space, u"abc") == "abc" - assert encode_utf8(space, u"\u1234") == "\xe1\x88\xb4" - assert encode_utf8(space, u"\ud800") == "\xed\xa0\x80" - assert encode_utf8(space, u"\udc00") == "\xed\xb0\x80" - # for the following test, go to lengths to avoid CPython's optimizer - # and .pyc file storage, which collapse the two surrogates into one - c = u"\udc00" - assert encode_utf8(space, u"\ud800" + c) == "\xf0\x90\x80\x80" +from pypy.interpreter.unicodehelper import str_decode_utf8 +from 
pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii +from pypy.interpreter import unicodehelper as uh +from pypy.module._codecs.interp_codecs import CodecState + +def decode_utf8(u): + return str_decode_utf8(u, True, "strict", None) def test_decode_utf8(): - space = FakeSpace() - assert decode_utf8(space, "abc") == u"abc" - assert decode_utf8(space, "\xe1\x88\xb4") == u"\u1234" - assert decode_utf8(space, "\xed\xa0\x80") == u"\ud800" - assert decode_utf8(space, "\xed\xb0\x80") == u"\udc00" - got = decode_utf8(space, "\xed\xa0\x80\xed\xb0\x80") - assert map(ord, got) == [0xd800, 0xdc00] - got = decode_utf8(space, "\xf0\x90\x80\x80") - if sys.maxunicode > 65535: - assert map(ord, got) == [0x10000] - else: - assert map(ord, got) == [55296, 56320] + assert decode_utf8("abc") == ("abc", 3, 3) + assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 3, 1) + assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 3, 1) + assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 3, 1) + assert decode_utf8("\xed\xa0\x80\xed\xb0\x80") == ( + "\xed\xa0\x80\xed\xb0\x80", 6, 2) + assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 4, 1) -@pytest.mark.parametrize('unich', [u"\ud800", u"\udc80"]) -def test_utf32_surrogates(unich): - assert (unicode_encode_utf_32_be(unich, 1, None) == - struct.pack('>i', ord(unich))) - with pytest.raises(UnicodeEncodeError): - unicode_encode_utf_32_be(unich, 1, None, allow_surrogates=False) +def test_utf8_encode_ascii(): + assert utf8_encode_ascii("abc", "??", "??") == "abc" + def eh(errors, encoding, reason, p, start, end): + lst.append((errors, encoding, p, start, end)) + return "<FOO>", end + lst = [] + input = u"\u1234".encode("utf8") + assert utf8_encode_ascii(input, "??", eh) == "<FOO>" + assert lst == [("??", "ascii", input, 0, 1)] + lst = [] + input = u"\u1234\u5678abc\u8765\u4321".encode("utf8") + assert utf8_encode_ascii(input, "??", eh) == "<FOO>abc<FOO>" + assert lst == [("??", "ascii", input, 0, 2), + 
("??", "ascii", input, 5, 7)] - def replace_with(ru, rs): - def errorhandler(errors, enc, msg, u, startingpos, endingpos): - if errors == 'strict': - raise UnicodeEncodeError(enc, u, startingpos, endingpos, msg) - return ru, rs, endingpos - return unicode_encode_utf_32_be( - u"<%s>" % unich, 3, None, - errorhandler, allow_surrogates=False) +if HAS_HYPOTHESIS: + @given(strategies.text()) + def test_utf8_encode_ascii_2(u): + def eh(errors, encoding, reason, p, start, end): + return "?" * (end - start), end - assert replace_with(u'rep', None) == u'<rep>'.encode('utf-32-be') - assert (replace_with(None, '\xca\xfe\xca\xfe') == - '\x00\x00\x00<\xca\xfe\xca\xfe\x00\x00\x00>') + assert utf8_encode_ascii(u.encode("utf8"), + "replace", eh) == u.encode("ascii", "replace") + +def test_str_decode_ascii(): + assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3) + def eh(errors, encoding, reason, p, start, end): + lst.append((errors, encoding, p, start, end)) + return u"\u1234\u5678".encode("utf8"), end + lst = [] + input = "\xe8" + exp = u"\u1234\u5678".encode("utf8") + assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2) + assert lst == [("??", "ascii", input, 0, 1)] + lst = [] + input = "\xe8\xe9abc\xea\xeb" + assert str_decode_ascii(input, "??", True, eh) == ( + exp + exp + "abc" + exp + exp, 7, 11) + assert lst == [("??", "ascii", input, 0, 1), + ("??", "ascii", input, 1, 2), + ("??", "ascii", input, 5, 6), + ("??", "ascii", input, 6, 7)] +if HAS_HYPOTHESIS: + @given(strategies.text()) + def test_unicode_raw_escape(u): + r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict', None) + assert r == u.encode("raw-unicode-escape") + + @given(strategies.text()) + def test_unicode_escape(u): + r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None) + assert r == u.encode("unicode-escape") + +def test_encode_decimal(space): + assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 ' + with pytest.raises(ValueError): + 
+ uh.unicode_encode_decimal(u' 12, \u1234 '.encode('utf8'), None) + state = space.fromcache(CodecState) + handler = state.encode_error_handler + assert uh.unicode_encode_decimal( + u'u\u1234\u1235v'.encode('utf8'), 'replace', handler) == 'u??v' + + result = uh.unicode_encode_decimal( + u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler) + assert result == '12&#4660;' diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -1,11 +1,12 @@ +import sys + +from pypy.interpreter.error import OperationError, oefmt from rpython.rlib.objectmodel import specialize -from rpython.rlib.rarithmetic import intmask -from rpython.rlib.rstring import StringBuilder, UnicodeBuilder -from rpython.rlib import runicode -from rpython.rlib.runicode import ( - default_unicode_error_encode, default_unicode_error_decode, - MAXUNICODE, BYTEORDER, BYTEORDER2, UNICHR) -from pypy.interpreter.error import OperationError +from rpython.rlib.rstring import StringBuilder +from rpython.rlib import rutf8 +from rpython.rlib.rarithmetic import r_uint, intmask +from rpython.rtyper.lltypesystem import rffi +from pypy.module.unicodedata import unicodedb @specialize.memo() def decode_error_handler(space): @@ -20,90 +21,982 @@ space.newtext(msg)])) return raise_unicode_exception_decode +def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos): + assert startingpos >= 0 + ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]] + return ''.join(ux), endingpos, 'b' + @specialize.memo() def encode_error_handler(space): # Fast version of the "strict" errors handler. 
- def raise_unicode_exception_encode(errors, encoding, msg, u, + def raise_unicode_exception_encode(errors, encoding, msg, utf8, startingpos, endingpos): + u_len = rutf8.get_utf8_length(utf8) raise OperationError(space.w_UnicodeEncodeError, space.newtuple([space.newtext(encoding), - space.newunicode(u), + space.newutf8(utf8, u_len), space.newint(startingpos), space.newint(endingpos), space.newtext(msg)])) return raise_unicode_exception_encode +def default_error_encode( + errors, encoding, msg, u, startingpos, endingpos): + """A default handler, for tests""" + assert endingpos >= 0 + if errors == 'replace': + return '?', endingpos + if errors == 'ignore': + return '', endingpos + raise ValueError + # ____________________________________________________________ +_WIN32 = sys.platform == 'win32' +_MACOSX = sys.platform == 'darwin' + def encode(space, w_data, encoding=None, errors='strict'): from pypy.objspace.std.unicodeobject import encode_object return encode_object(space, w_data, encoding, errors) -# These functions take and return unwrapped rpython strings and unicodes + +def _has_surrogate(u): + for c in u: + if 0xD800 <= ord(c) <= 0xDFFF: + return True + return False + +# These functions take and return unwrapped rpython strings def decode_unicode_escape(space, string): from pypy.module._codecs import interp_codecs state = space.fromcache(interp_codecs.CodecState) unicodedata_handler = state.get_unicodedata_handler(space) - result, consumed = runicode.str_decode_unicode_escape( - string, len(string), "strict", - final=True, errorhandler=decode_error_handler(space), - unicodedata_handler=unicodedata_handler) - return result + result_utf8, consumed, length = str_decode_unicode_escape( + string, "strict", + final=True, + errorhandler=decode_error_handler(space), + ud_handler=unicodedata_handler) + return result_utf8, length def decode_raw_unicode_escape(space, string): - result, consumed = runicode.str_decode_raw_unicode_escape( - string, len(string), "strict", + 
result_utf8, consumed, lgt = str_decode_raw_unicode_escape( + string, "strict", final=True, errorhandler=decode_error_handler(space)) - return result + return result_utf8, lgt -def decode_utf8(space, string): +def check_ascii_or_raise(space, string): + try: + rutf8.check_ascii(string) + except rutf8.CheckError as e: + decode_error_handler(space)('strict', 'ascii', + 'ordinal not in range(128)', string, + e.pos, e.pos + 1) + assert False, "unreachable" + +def check_utf8_or_raise(space, string, start=0, end=-1): # Surrogates are accepted and not treated specially at all. # If there happen to be two 3-bytes encoding a pair of surrogates, # you still get two surrogate unicode characters in the result. # These are the Python2 rules; Python3 differs. - result, consumed = runicode.str_decode_utf_8( - string, len(string), "strict", - final=True, errorhandler=decode_error_handler(space), - allow_surrogates=True) - return result + try: + length = rutf8.check_utf8(string, True, start, end) + except rutf8.CheckError as e: + # convert position into unicode position + lgt = rutf8.check_utf8(string, True, start, stop=e.pos) + decode_error_handler(space)('strict', 'utf8', 'invalid utf-8', string, + start + lgt, start + lgt + 1) + assert False, "unreachable" + return length -def encode_utf8(space, uni): - # Note that this function never raises UnicodeEncodeError, - # since surrogates are allowed, either paired or lone. - # A paired surrogate is considered like the non-BMP character - # it stands for. These are the Python2 rules; Python3 differs. 
+def str_decode_ascii(s, errors, final, errorhandler): + try: + rutf8.check_ascii(s) + return s, len(s), len(s) + except rutf8.CheckError: + return _str_decode_ascii_slowpath(s, errors, final, errorhandler) + +def _str_decode_ascii_slowpath(s, errors, final, errorhandler): + i = 0 + res = StringBuilder() + while i < len(s): + ch = s[i] + if ord(ch) > 0x7F: + r, i = errorhandler(errors, 'ascii', 'ordinal not in range(128)', + s, i, i + 1) + res.append(r) + else: + res.append(ch) + i += 1 + ress = res.build() + lgt = rutf8.check_utf8(ress, True) + return ress, len(s), lgt + +def str_decode_latin_1(s, errors, final, errorhandler): + try: + rutf8.check_ascii(s) + return s, len(s), len(s) + except rutf8.CheckError: + return _str_decode_latin_1_slowpath(s, errors, final, errorhandler) + +def _str_decode_latin_1_slowpath(s, errors, final, errorhandler): + res = StringBuilder(len(s)) + i = 0 + while i < len(s): + if ord(s[i]) > 0x7F: + while i < len(s) and ord(s[i]) > 0x7F: + rutf8.unichr_as_utf8_append(res, ord(s[i])) + i += 1 + else: + start = i + end = i + 1 + while end < len(s) and ord(s[end]) <= 0x7F: + end += 1 + res.append_slice(s, start, end) + i = end + # cannot be ASCII, cannot have surrogates, I believe + return res.build(), len(s), len(s) + +def utf8_encode_latin_1(s, errors, errorhandler): + try: + rutf8.check_ascii(s) + return s + except rutf8.CheckError: + return _utf8_encode_latin_1_slowpath(s, errors, errorhandler) + +def _utf8_encode_latin_1_slowpath(s, errors, errorhandler): + size = len(s) + result = StringBuilder(size) + index = 0 + pos = 0 + while pos < size: + ch = rutf8.codepoint_at_pos(s, pos) + if ch <= 0xFF: + result.append(chr(ch)) + index += 1 + pos = rutf8.next_codepoint_pos(s, pos) + else: + startindex = index + pos = rutf8.next_codepoint_pos(s, pos) + index += 1 + while pos < size and rutf8.codepoint_at_pos(s, pos) > 0xFF: + pos = rutf8.next_codepoint_pos(s, pos) + index += 1 + msg = "ordinal not in range(256)" + res_8, newindex = 
errorhandler( + errors, 'latin1', msg, s, startindex, index) + for cp in rutf8.Utf8StringIterator(res_8): + if cp > 0xFF: + errorhandler("strict", 'latin1', msg, s, startindex, index) + result.append(chr(cp)) + if index != newindex: # Should be uncommon + index = newindex + pos = rutf8._pos_at_index(s, newindex) + return result.build() + +def utf8_encode_ascii(s, errors, errorhandler): + """ Don't be confused - this is a slowpath for errors e.g. "ignore" + or an obscure errorhandler + """ + size = len(s) + result = StringBuilder(size) + index = 0 + pos = 0 + while pos < size: + ch = rutf8.codepoint_at_pos(s, pos) + if ch <= 0x7F: + result.append(chr(ch)) + index += 1 + pos = rutf8.next_codepoint_pos(s, pos) + else: + startindex = index + pos = rutf8.next_codepoint_pos(s, pos) + index += 1 + while pos < size and rutf8.codepoint_at_pos(s, pos) > 0x7F: + pos = rutf8.next_codepoint_pos(s, pos) + index += 1 + msg = "ordinal not in range(128)" + res_8, newindex = errorhandler( + errors, 'ascii', msg, s, startindex, index) + for cp in rutf8.Utf8StringIterator(res_8): + if cp > 0x7F: + errorhandler("strict", 'ascii', msg, s, startindex, index) + result.append(chr(cp)) + if index != newindex: # Should be uncommon + index = newindex + pos = rutf8._pos_at_index(s, newindex) + return result.build() + +if sys.platform == 'win32': + def utf8_encode_mbcs(s, errors, errorhandler): + from rpython.rlib import runicode + s = s.decode('utf-8') + slen = len(s) + res = runicode.unicode_encode_mbcs(s, slen, errors, errorhandler) + return res + + def str_decode_mbcs(s, errors, final, errorhandler): + from rpython.rlib import runicode + slen = len(s) + res, size = runicode.str_decode_mbcs(s, slen, final=final, errors=errors, + errorhandler=errorhandler) + return res.encode('utf8'), size, len(res) + +def str_decode_utf8(s, errors, final, errorhandler): + """ Same as checking for the valid utf8, but we know the utf8 is not + valid so we're trying to either raise or pack stuff with error 
handler. + The key difference is that this is call_may_force + """ + slen = len(s) + res = StringBuilder(slen) + pos = 0 + end = len(s) + suppressing = False # we are in a chain of "bad" unicode, only emit one fix + while pos < end: + ordch1 = ord(s[pos]) + # fast path for ASCII + if ordch1 <= 0x7F: + pos += 1 + res.append(chr(ordch1)) + suppressing = False + continue + + if ordch1 <= 0xC1: + r, pos = errorhandler(errors, "utf8", "invalid start byte", + s, pos, pos + 1) + if not suppressing: + res.append(r) + continue + + pos += 1 + + if ordch1 <= 0xDF: + if pos >= end: + if not final: + pos -= 1 + break + r, pos = errorhandler(errors, "utf8", "unexpected end of data", + s, pos - 1, pos) + if not suppressing: + res.append(r) + continue + ordch2 = ord(s[pos]) + + if rutf8._invalid_byte_2_of_2(ordch2): + r, pos = errorhandler(errors, "utf8", "invalid continuation byte", + s, pos - 1, pos) + if not suppressing: + res.append(r) + continue + # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz + pos += 1 + res.append(chr(ordch1)) + res.append(chr(ordch2)) + continue + + if ordch1 <= 0xEF: + if (pos + 2) > end: + if not final: + pos -= 1 + break + r, pos = errorhandler(errors, "utf8", "unexpected end of data", + s, pos - 1, pos) + res.append(r) + suppressing = True + continue + ordch2 = ord(s[pos]) + ordch3 = ord(s[pos + 1]) + + if rutf8._invalid_byte_2_of_3(ordch1, ordch2, True): + r, pos = errorhandler(errors, "utf8", "invalid continuation byte", + s, pos - 1, pos) + if not suppressing: + res.append(r) + continue + elif rutf8._invalid_byte_3_of_3(ordch3): + r, pos = errorhandler(errors, "utf8", "invalid continuation byte", + s, pos - 1, pos + 1) + if not suppressing: + res.append(r) + continue + pos += 2 + + # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz + res.append(chr(ordch1)) + res.append(chr(ordch2)) + res.append(chr(ordch3)) + suppressing = False + continue + + if ordch1 <= 0xF4: + if (pos + 3) > end: + if not final: + pos -= 1 + break + r, pos = 
errorhandler(errors, "utf8", "unexpected end of data", + s, pos - 1, pos) + res.append(r) + suppressing = True + continue + ordch2 = ord(s[pos]) + ordch3 = ord(s[pos + 1]) + ordch4 = ord(s[pos + 2]) + + if rutf8._invalid_byte_2_of_4(ordch1, ordch2): + r, pos = errorhandler(errors, "utf8", "invalid continuation byte", + s, pos - 1, pos) + if not suppressing: + res.append(r) + continue + elif rutf8._invalid_byte_3_of_4(ordch3): + r, pos = errorhandler(errors, "utf8", "invalid continuation byte", + s, pos - 1, pos + 1) + res.append(r) + continue + elif rutf8._invalid_byte_4_of_4(ordch4): + r, pos = errorhandler(errors, "utf8", "invalid continuation byte", + s, pos - 1, pos + 2) + if not suppressing: + res.append(r) + continue + + pos += 3 + # 11110www 10xxxxxx 10yyyyyy 10zzzzzz -> 000wwwxx xxxxyyyy yyzzzzzz + res.append(chr(ordch1)) + res.append(chr(ordch2)) + res.append(chr(ordch3)) + res.append(chr(ordch4)) + suppressing = False + continue + + r, pos = errorhandler(errors, "utf8", "invalid start byte", + s, pos - 1, pos) + if not suppressing: + res.append(r) + + r = res.build() + return r, pos, rutf8.check_utf8(r, True) + +hexdigits = "0123456789ABCDEFabcdef" + +def hexescape(builder, s, pos, digits, + encoding, errorhandler, message, errors): + chr = 0 + if pos + digits > len(s): + endinpos = pos + while endinpos < len(s) and s[endinpos] in hexdigits: + endinpos += 1 + res, pos = errorhandler( + errors, encoding, message, s, pos - 2, endinpos) + builder.append(res) + else: + try: + chr = int(s[pos:pos + digits], 16) + except ValueError: + endinpos = pos + while s[endinpos] in hexdigits: + endinpos += 1 + res, pos = errorhandler( + errors, encoding, message, s, pos - 2, endinpos) + builder.append(res) + else: + # when we get here, chr is a 32-bit unicode character + try: + builder.append_code(chr) + pos += digits + except ValueError: + message = "illegal Unicode character" + res, pos = errorhandler( + errors, encoding, message, s, pos - 2, pos + digits) + 
builder.append(res) + return pos + +def str_decode_unicode_escape(s, errors, final, errorhandler, ud_handler): + size = len(s) + if size == 0: + return '', 0, 0 + + builder = rutf8.Utf8StringBuilder(size) + pos = 0 + while pos < size: + ch = s[pos] + + # Non-escape characters are interpreted as Unicode ordinals + if ch != '\\': + if ord(ch) > 0x7F: + builder.append_code(ord(ch)) + else: + builder.append(ch) + pos += 1 + continue + + # - Escapes + pos += 1 + if pos >= size: + message = "\\ at end of string" + res, pos = errorhandler(errors, "unicodeescape", + message, s, pos - 1, size) + builder.append(res) + continue + + ch = s[pos] + pos += 1 + # \x escapes + if ch == '\n': + pass + elif ch == '\\': + builder.append_char('\\') + elif ch == '\'': + builder.append_char('\'') + elif ch == '\"': + builder.append_char('\"') + elif ch == 'b': + builder.append_char('\b') + elif ch == 'f': + builder.append_char('\f') + elif ch == 't': + builder.append_char('\t') + elif ch == 'n': + builder.append_char('\n') + elif ch == 'r': + builder.append_char('\r') + elif ch == 'v': + builder.append_char('\v') + elif ch == 'a': + builder.append_char('\a') + elif '0' <= ch <= '7': + x = ord(ch) - ord('0') + if pos < size: + ch = s[pos] + if '0' <= ch <= '7': + pos += 1 + x = (x << 3) + ord(ch) - ord('0') + if pos < size: + ch = s[pos] + if '0' <= ch <= '7': + pos += 1 + x = (x << 3) + ord(ch) - ord('0') + if x > 0x7F: + builder.append_code(x) + else: + builder.append_char(chr(x)) + # hex escapes + # \xXX + elif ch == 'x': + digits = 2 + message = "truncated \\xXX escape" + pos = hexescape(builder, s, pos, digits, + "unicodeescape", errorhandler, message, errors) + # \uXXXX + elif ch == 'u': + digits = 4 + message = "truncated \\uXXXX escape" + pos = hexescape(builder, s, pos, digits, + "unicodeescape", errorhandler, message, errors) + # \UXXXXXXXX + elif ch == 'U': + digits = 8 + message = "truncated \\UXXXXXXXX escape" + pos = hexescape(builder, s, pos, digits, + "unicodeescape", 
errorhandler, message, errors) + # \N{name} + elif ch == 'N' and ud_handler is not None: + message = "malformed \\N character escape" + look = pos + + if look < size and s[look] == '{': + # look for the closing brace + while look < size and s[look] != '}': + look += 1 + if look < size and s[look] == '}': + # found a name. look it up in the unicode database + message = "unknown Unicode character name" + name = s[pos + 1:look] + code = ud_handler.call(name) + if code < 0: + res, pos = errorhandler( + errors, "unicodeescape", message, + s, pos - 1, look + 1) + builder.append(res) + continue + pos = look + 1 + builder.append_code(code) + else: + res, pos = errorhandler(errors, "unicodeescape", + message, s, pos - 1, look + 1) + builder.append(res) + else: + res, pos = errorhandler(errors, "unicodeescape", + message, s, pos - 1, look + 1) + builder.append(res) + else: + builder.append_char('\\') + builder.append_code(ord(ch)) + + return builder.build(), pos, builder.getlength() + +def wcharpsize2utf8(space, wcharp, size): + """Safe version of rffi.wcharpsize2utf8. + + Raises app-level ValueError if any wchar value is outside the valid + codepoint range. 
+ """ + try: + return rffi.wcharpsize2utf8(wcharp, size) + except ValueError: + raise oefmt(space.w_ValueError, + "character is not in range [U+0000; U+10ffff]") + + +# ____________________________________________________________ +# Raw unicode escape + +def str_decode_raw_unicode_escape(s, errors, final=False, + errorhandler=None): + size = len(s) + if size == 0: + return '', 0, 0 + + builder = rutf8.Utf8StringBuilder(size) + pos = 0 + while pos < size: + ch = s[pos] + + # Non-escape characters are interpreted as Unicode ordinals + if ch != '\\': + builder.append_code(ord(ch)) + pos += 1 + continue + + # \u-escapes are only interpreted iff the number of leading + # backslashes is odd + bs = pos + while pos < size: + pos += 1 + if pos == size or s[pos] != '\\': + break + builder.append_char('\\') + + # we have a backslash at the end of the string, stop here + if pos >= size: + builder.append_char('\\') + break + + if ((pos - bs) & 1 == 0 or pos >= size or + (s[pos] != 'u' and s[pos] != 'U')): + builder.append_char('\\') + builder.append_code(ord(s[pos])) + pos += 1 + continue + + digits = 4 if s[pos] == 'u' else 8 + message = "truncated \\uXXXX" + pos += 1 + pos = hexescape(builder, s, pos, digits, + "rawunicodeescape", errorhandler, message, errors) + + return builder.build(), pos, builder.getlength() + +_utf8_encode_unicode_escape = rutf8.make_utf8_escape_function() + + +TABLE = '0123456789abcdef' + +def raw_unicode_escape_helper(result, char): + if char >= 0x10000 or char < 0: + result.append("\\U") + zeros = 8 + elif char >= 0x100: + result.append("\\u") + zeros = 4 + else: + result.append("\\x") + zeros = 2 + for i in range(zeros-1, -1, -1): + result.append(TABLE[(char >> (4 * i)) & 0x0f]) + +def utf8_encode_raw_unicode_escape(s, errors, errorhandler): + # errorhandler is not used: this function cannot cause Unicode errors + size = len(s) + if size == 0: + return '' + result = StringBuilder(size) + pos = 0 + while pos < size: + oc = rutf8.codepoint_at_pos(s, 
pos) + + if oc < 0x100: + result.append(chr(oc)) + else: + raw_unicode_escape_helper(result, oc) + pos = rutf8.next_codepoint_pos(s, pos) + + return result.build() + + +def utf8_encode_unicode_escape(s, errors, errorhandler): + return _utf8_encode_unicode_escape(s) + +# ____________________________________________________________ +# utf-7 + +# Three simple macros defining base-64 + +def _utf7_IS_BASE64(oc): + "Is c a base-64 character?" + c = chr(oc) + return c.isalnum() or c == '+' or c == '/' +def _utf7_TO_BASE64(n): + "Returns the base-64 character of the bottom 6 bits of n" + return "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[n & 0x3f] +def _utf7_FROM_BASE64(c): + "given that c is a base-64 character, what is its base-64 value?" + if c >= 'a': + return ord(c) - 71 + elif c >= 'A': + return ord(c) - 65 + elif c >= '0': + return ord(c) + 4 + elif c == '+': + return 62 + else: # c == '/' + return 63 + +def _utf7_DECODE_DIRECT(oc): + return oc <= 127 and oc != ord('+') + +# The UTF-7 encoder treats ASCII characters differently according to +# whether they are Set D, Set O, Whitespace, or special (i.e. none of +# the above). See RFC2152. This array identifies these different +# sets: +# 0 : "Set D" +# alphanumeric and '(),-./:? +# 1 : "Set O" +# !"#$%&*;<=>@[]^_`{|} +# 2 : "whitespace" +# ht nl cr sp +# 3 : special (must be base64 encoded) +# everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127) + +utf7_category = [ +# nul soh stx etx eot enq ack bel bs ht nl vt np cr so si + 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, +# dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +# sp ! " # $ % & ' ( ) * + , - . / + 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, +# 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, +# @ A B C D E F G H I J K L M N O + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +# P Q R S T U V W X Y Z [ \ ] ^ _ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, +# ` a b c d e f g h i j k l m n o + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +# p q r s t u v w x y z { | } ~ del + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, +] + +# ENCODE_DIRECT: this character should be encoded as itself. The +# answer depends on whether we are encoding set O as itself, and also +# on whether we are encoding whitespace as itself. RFC2152 makes it +# clear that the answers to these questions vary between +# applications, so this code needs to be flexible. + +def _utf7_ENCODE_DIRECT(oc, directO, directWS): + return(oc < 128 and oc > 0 and + (utf7_category[oc] == 0 or + (directWS and utf7_category[oc] == 2) or + (directO and utf7_category[oc] == 1))) + +def _utf7_ENCODE_CHAR(result, oc, base64bits, base64buffer): + if oc >= 0x10000: + # code first surrogate + base64bits += 16 + base64buffer = (base64buffer << 16) | 0xd800 | ((oc-0x10000) >> 10) + while base64bits >= 6: + result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6))) + base64bits -= 6 + # prepare second surrogate + oc = 0xDC00 | ((oc-0x10000) & 0x3FF) + base64bits += 16 + base64buffer = (base64buffer << 16) | oc + while base64bits >= 6: + result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6))) + base64bits -= 6 + return base64bits, base64buffer + +def str_decode_utf_7(s, errors, final=False, + errorhandler=None): + size = len(s) + if size == 0: + return '', 0, 0 + + inShift = False + base64bits = 0 + base64buffer = 0 + surrogate = 0 + outsize = 0 + + result = StringBuilder(size) + pos = 0 + shiftOutStartPos = 0 + startinpos = 0 + while pos < size: + ch = s[pos] + + if inShift: # in a base-64 section + if _utf7_IS_BASE64(ord(ch)): #consume a base-64 character + base64buffer = (base64buffer << 6) | _utf7_FROM_BASE64(ch) + assert base64buffer >= 0 + 
base64bits += 6 + pos += 1 + + if base64bits >= 16: + # enough bits for a UTF-16 value + outCh = base64buffer >> (base64bits - 16) + assert outCh >= 0 + base64bits -= 16 + base64buffer &= (1 << base64bits) - 1 # clear high bits + assert outCh <= 0xffff + if surrogate: + # expecting a second surrogate + if outCh >= 0xDC00 and outCh <= 0xDFFF: + code = (((surrogate & 0x3FF)<<10) | + (outCh & 0x3FF)) + 0x10000 + rutf8.unichr_as_utf8_append(result, code) + outsize += 1 + surrogate = 0 + continue + else: + rutf8.unichr_as_utf8_append(result, surrogate, + allow_surrogates=True) + outsize += 1 + surrogate = 0 + # Not done with outCh: falls back to next line + if outCh >= 0xD800 and outCh <= 0xDBFF: + # first surrogate + surrogate = outCh + else: + outsize += 1 + assert outCh >= 0 + rutf8.unichr_as_utf8_append(result, outCh, True) + + else: + # now leaving a base-64 section + inShift = False + + if base64bits > 0: # left-over bits + if base64bits >= 6: + # We've seen at least one base-64 character + pos += 1 + msg = "partial character in shift sequence" + res, pos = errorhandler(errors, 'utf7', + msg, s, pos-1, pos) + reslen = rutf8.check_utf8(res, True) + outsize += reslen + result.append(res) + continue + else: + # Some bits remain; they should be zero + if base64buffer != 0: + pos += 1 + msg = "non-zero padding bits in shift sequence" + res, pos = errorhandler(errors, 'utf7', + msg, s, pos-1, pos) + reslen = rutf8.check_utf8(res, True) + outsize += reslen + result.append(res) + continue + + if surrogate and _utf7_DECODE_DIRECT(ord(ch)): + outsize += 1 + rutf8.unichr_as_utf8_append(result, surrogate, True) + surrogate = 0 + + if ch == '-': + # '-' is absorbed; other terminating characters are + # preserved + pos += 1 + + elif ch == '+': + startinpos = pos + pos += 1 # consume '+' + if pos < size and s[pos] == '-': # '+-' encodes '+' + pos += 1 + result.append('+') + outsize += 1 + else: # begin base64-encoded section + inShift = 1 + surrogate = 0 + shiftOutStartPos = 
result.getlength() + base64bits = 0 + base64buffer = 0 + + elif _utf7_DECODE_DIRECT(ord(ch)): # character decodes at itself + result.append(ch) + outsize += 1 + pos += 1 + else: + startinpos = pos + pos += 1 + msg = "unexpected special character" + res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos) + reslen = rutf8.check_utf8(res, True) + outsize += reslen + result.append(res) + + # end of string + final_length = result.getlength() + if inShift and final: # in shift sequence, no more to follow + # if we're in an inconsistent state, that's an error + inShift = 0 + if (surrogate or + base64bits >= 6 or + (base64bits > 0 and base64buffer != 0)): + msg = "unterminated shift sequence" + res, pos = errorhandler(errors, 'utf7', msg, s, shiftOutStartPos, pos) + reslen = rutf8.check_utf8(res, True) + outsize += reslen + result.append(res) + final_length = result.getlength() + elif inShift: + pos = startinpos + final_length = shiftOutStartPos # back off output + + assert final_length >= 0 + return result.build()[:final_length], pos, outsize + +def utf8_encode_utf_7(s, errors, errorhandler): + size = len(s) + if size == 0: + return '' + result = StringBuilder(size) + + encodeSetO = encodeWhiteSpace = False + + inShift = False + base64bits = 0 + base64buffer = 0 + + pos = 0 + while pos < size: + oc = rutf8.codepoint_at_pos(s, pos) + if not inShift: + if oc == ord('+'): + result.append('+-') + elif _utf7_ENCODE_DIRECT(oc, not encodeSetO, not encodeWhiteSpace): + result.append(chr(oc)) + else: + result.append('+') + inShift = True + base64bits, base64buffer = _utf7_ENCODE_CHAR( + result, oc, base64bits, base64buffer) + else: + if _utf7_ENCODE_DIRECT(oc, not encodeSetO, not encodeWhiteSpace): + # shifting out + if base64bits: # output remaining bits + result.append(_utf7_TO_BASE64(base64buffer << (6-base64bits))) + base64buffer = 0 + base64bits = 0 + + inShift = False + ## Characters not in the BASE64 set implicitly unshift the + ## sequence so no '-' is required, except 
if the character is + ## itself a '-' + if _utf7_IS_BASE64(oc) or oc == ord('-'): + result.append('-') + result.append(chr(oc)) + else: + base64bits, base64buffer = _utf7_ENCODE_CHAR( + result, oc, base64bits, base64buffer) + pos = rutf8.next_codepoint_pos(s, pos) + + if base64bits: + result.append(_utf7_TO_BASE64(base64buffer << (6 - base64bits))) + if inShift: + result.append('-') + + return result.build() + +@specialize.memo() +def _encode_unicode_error_handler(space): + # Fast version of the "strict" errors handler. + from rpython.rlib import runicode + def raise_unicode_exception_encode(errors, encoding, msg, uni, + startingpos, endingpos): + assert isinstance(uni, unicode) + u_len = len(uni) + utf8 = runicode.unicode_encode_utf8sp(uni, u_len) + raise OperationError(space.w_UnicodeEncodeError, + space.newtuple([space.newtext(encoding), + space.newtext(utf8, u_len), + space.newint(startingpos), + space.newint(endingpos), + space.newtext(msg)])) + return u'', None, 0 + return raise_unicode_exception_encode + + +def encode_utf8(space, uni, allow_surrogates=False): + # Note that Python3 tends to forbid *all* surrogates in utf-8. + # If allow_surrogates=True, then revert to the Python 2 behavior + # which never raises UnicodeEncodeError. Surrogate pairs are then + # allowed, either paired or lone. A paired surrogate is considered + # like the non-BMP character it stands for. See also *_utf8sp(). + from rpython.rlib import runicode + assert isinstance(uni, unicode) return runicode.unicode_encode_utf_8( uni, len(uni), "strict", - errorhandler=None, - allow_surrogates=True) + errorhandler=_encode_unicode_error_handler(space), + allow_surrogates=allow_surrogates) + +def encode_utf8sp(space, uni, allow_surrogates=True): + xxx + # Surrogate-preserving utf-8 encoding. Any surrogate character + # turns into its 3-bytes encoding, whether it is paired or not. + # This should always be reversible, and the reverse is + # decode_utf8sp(). 
+ from rpython.rlib import runicode + return runicode.unicode_encode_utf8sp(uni, len(uni)) + +def decode_utf8sp(space, string): + # Surrogate-preserving utf-8 decoding. Assuming there is no + # encoding error, it should always be reversible, and the reverse is + # encode_utf8sp(). + return str_decode_utf8(string, "string", True, decode_never_raise, + allow_surrogates=True) + # ____________________________________________________________ # utf-16 -def str_decode_utf_16(s, size, errors, final=True, +BYTEORDER = sys.byteorder +BYTEORDER2 = BYTEORDER[0] + 'e' # either "le" or "be" +assert BYTEORDER2 in ('le', 'be') + +def str_decode_utf_16(s, errors, final=True, errorhandler=None): - result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final, + result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final, errorhandler, "native") - return result, length + return result, c, lgt -def str_decode_utf_16_be(s, size, errors, final=True, +def str_decode_utf_16_be(s, errors, final=True, + errorhandler=None): + result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final, + errorhandler, "big") + return result, c, lgt + +def str_decode_utf_16_le(s, errors, final=True, errorhandler=None): - result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final, - errorhandler, "big") - return result, length + result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final, + errorhandler, "little") + return result, c, lgt -def str_decode_utf_16_le(s, size, errors, final=True, - errorhandler=None): - result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final, - errorhandler, "little") - return result, length - -def str_decode_utf_16_helper(s, size, errors, final=True, +def str_decode_utf_16_helper(s, errors, final=True, errorhandler=None, byteorder="native", public_encoding_name='utf16'): - if errorhandler is None: - errorhandler = default_unicode_error_decode + size = len(s) bo = 0 if BYTEORDER == 'little': @@ -140,7 +1033,7 @@ else: bo = 1 
if size == 0: - return u'', 0, bo + return '', 0, 0, bo if bo == -1: # force little endian ihi = 1 @@ -151,7 +1044,7 @@ ihi = 0 ilo = 1 - result = UnicodeBuilder(size // 2) + result = StringBuilder(size // 2) #XXX I think the errors are not correctly handled here while pos < size: @@ -168,7 +1061,7 @@ ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo]) pos += 2 if ch < 0xD800 or ch > 0xDFFF: - result.append(unichr(ch)) + rutf8.unichr_as_utf8_append(result, ch) continue # UTF-16 code pair: if len(s) - pos < 2: @@ -185,12 +1078,8 @@ ch2 = (ord(s[pos+ihi]) << 8) | ord(s[pos+ilo]) pos += 2 if 0xDC00 <= ch2 <= 0xDFFF: - if MAXUNICODE < 65536: - result.append(unichr(ch)) - result.append(unichr(ch2)) - else: - result.append(UNICHR((((ch & 0x3FF)<<10) | - (ch2 & 0x3FF)) + 0x10000)) + ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000 + rutf8.unichr_as_utf8_append(result, ch) continue else: r, pos = errorhandler(errors, public_encoding_name, @@ -202,7 +1091,9 @@ "illegal encoding", s, pos - 2, pos) result.append(r) - return result.build(), pos, bo + r = result.build() + lgt = rutf8.check_utf8(r, True) + return result.build(), pos, lgt, bo def _STORECHAR(result, CH, byteorder): hi = chr(((CH) >> 8) & 0xff) @@ -214,13 +1105,12 @@ result.append(hi) result.append(lo) -def unicode_encode_utf_16_helper(s, size, errors, +def unicode_encode_utf_16_helper(s, errors, errorhandler=None, allow_surrogates=True, byteorder='little', public_encoding_name='utf16'): - if errorhandler is None: - errorhandler = default_unicode_error_encode + size = len(s) if size == 0: if byteorder == 'native': result = StringBuilder(2) @@ -234,9 +1124,9 @@ byteorder = BYTEORDER pos = 0 + index = 0 while pos < size: - ch = ord(s[pos]) - pos += 1 + ch = rutf8.codepoint_at_pos(s, pos) if ch < 0xD800: _STORECHAR(result, ch, byteorder) @@ -246,78 +1136,76 @@ elif ch >= 0xE000 or allow_surrogates: _STORECHAR(result, ch, byteorder) else: - ru, rs, pos = errorhandler(errors, public_encoding_name, - 'surrogates not 
allowed', - s, pos-1, pos) - if rs is not None: - # py3k only - if len(rs) % 2 != 0: - errorhandler('strict', public_encoding_name, - 'surrogates not allowed', - s, pos-1, pos) - result.append(rs) - continue - for ch in ru: - if ord(ch) < 0xD800: - _STORECHAR(result, ord(ch), byteorder) + res_8, newindex = errorhandler( + errors, public_encoding_name, 'surrogates not allowed', + s, pos, pos+1) + for cp in rutf8.Utf8StringIterator(res_8): + if cp < 0xD800: + _STORECHAR(result, cp, byteorder) else: errorhandler('strict', public_encoding_name, 'surrogates not allowed', - s, pos-1, pos) + s, pos, pos+1) + if index != newindex: # Should be uncommon + index = newindex + pos = rutf8._pos_at_index(s, newindex) continue + pos = rutf8.next_codepoint_pos(s, pos) + index += 1 + return result.build() -def unicode_encode_utf_16(s, size, errors, +def utf8_encode_utf_16(s, errors, errorhandler=None, allow_surrogates=True): - return unicode_encode_utf_16_helper(s, size, errors, errorhandler, + return unicode_encode_utf_16_helper(s, errors, errorhandler, allow_surrogates, "native") -def unicode_encode_utf_16_be(s, size, errors, +def utf8_encode_utf_16_be(s, errors, errorhandler=None, allow_surrogates=True): - return unicode_encode_utf_16_helper(s, size, errors, errorhandler, + return unicode_encode_utf_16_helper(s, errors, errorhandler, allow_surrogates, "big") -def unicode_encode_utf_16_le(s, size, errors, +def utf8_encode_utf_16_le(s, errors, errorhandler=None, allow_surrogates=True): - return unicode_encode_utf_16_helper(s, size, errors, errorhandler, + return unicode_encode_utf_16_helper(s, errors, errorhandler, allow_surrogates, "little") - # ____________________________________________________________ # utf-32 -def str_decode_utf_32(s, size, errors, final=True, +def str_decode_utf_32(s, errors, final=True, errorhandler=None): - result, length, byteorder = str_decode_utf_32_helper( - s, size, errors, final, errorhandler, "native") - return result, length + result, c, lgt, _ = 
str_decode_utf_32_helper(s, errors, final, + errorhandler, "native") + return result, c, lgt -def str_decode_utf_32_be(s, size, errors, final=True, +def str_decode_utf_32_be(s, errors, final=True, errorhandler=None): - result, length, byteorder = str_decode_utf_32_helper( - s, size, errors, final, errorhandler, "big") - return result, length + result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final, + errorhandler, "big") + return result, c, lgt -def str_decode_utf_32_le(s, size, errors, final=True, +def str_decode_utf_32_le(s, errors, final=True, errorhandler=None): - result, length, byteorder = str_decode_utf_32_helper( - s, size, errors, final, errorhandler, "little") - return result, length + result, c, lgt, _ = str_decode_utf_32_helper(s, errors, final, + errorhandler, "little") + return result, c, lgt -BOM32_DIRECT = intmask(0x0000FEFF) +BOM32_DIRECT = intmask(0x0000FEFF) BOM32_REVERSE = intmask(0xFFFE0000) -def str_decode_utf_32_helper(s, size, errors, final=True, - errorhandler=None, +def str_decode_utf_32_helper(s, errors, final, + errorhandler, byteorder="native", - public_encoding_name='utf32'): - if errorhandler is None: - errorhandler = default_unicode_error_decode + public_encoding_name='utf32', + allow_surrogates=True): + assert errorhandler is not None bo = 0 + size = len(s) if BYTEORDER == 'little': iorder = [0, 1, 2, 3] @@ -353,7 +1241,7 @@ else: bo = 1 if size == 0: - return u'', 0, bo + return '', 0, 0, bo if bo == -1: # force little endian iorder = [0, 1, 2, 3] @@ -361,7 +1249,7 @@ # force big endian iorder = [3, 2, 1, 0] - result = UnicodeBuilder(size // 4) + result = StringBuilder(size // 4) while pos < size: # remaining bytes at the end? 
(size should be divisible by 4) @@ -376,22 +1264,26 @@ break continue ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) | - (ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]])) - if ch >= 0x110000: + (ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]])) + if not allow_surrogates and 0xD800 <= ch <= 0xDFFF: + r, pos = errorhandler(errors, public_encoding_name, + "code point in surrogate code point " + "range(0xd800, 0xe000)", + s, pos, pos + 4) + result.append(r) + continue + elif ch >= 0x110000: r, pos = errorhandler(errors, public_encoding_name, "codepoint not in range(0x110000)", s, pos, len(s)) result.append(r) continue - if MAXUNICODE < 65536 and ch >= 0x10000: - ch -= 0x10000L - result.append(unichr(0xD800 + (ch >> 10))) - result.append(unichr(0xDC00 + (ch & 0x03FF))) - else: - result.append(UNICHR(ch)) + rutf8.unichr_as_utf8_append(result, ch, allow_surrogates=allow_surrogates) pos += 4 - return result.build(), pos, bo + r = result.build() + lgt = rutf8.check_utf8(r, True) + return r, pos, lgt, bo def _STORECHAR32(result, CH, byteorder): c0 = chr(((CH) >> 24) & 0xff) @@ -409,13 +1301,12 @@ result.append(c2) result.append(c3) -def unicode_encode_utf_32_helper(s, size, errors, +def unicode_encode_utf_32_helper(s, errors, errorhandler=None, allow_surrogates=True, byteorder='little', public_encoding_name='utf32'): - if errorhandler is None: - errorhandler = default_unicode_error_encode + size = len(s) if size == 0: if byteorder == 'native': result = StringBuilder(4) @@ -429,50 +1320,253 @@ byteorder = BYTEORDER pos = 0 + index = 0 while pos < size: - ch = ord(s[pos]) - pos += 1 - ch2 = 0 + ch = rutf8.codepoint_at_pos(s, pos) + pos = rutf8.next_codepoint_pos(s, pos) if not allow_surrogates and 0xD800 <= ch < 0xE000: - ru, rs, pos = errorhandler( + res_8, newindex = errorhandler( errors, public_encoding_name, 'surrogates not allowed', s, pos - 1, pos) - if rs is not None: - # py3k only - if len(rs) % 4 != 0: + for ch in 
rutf8.Utf8StringIterator(res_8): + if ch < 0xD800: + _STORECHAR32(result, ch, byteorder) + else: errorhandler( 'strict', public_encoding_name, 'surrogates not allowed', s, pos - 1, pos) - result.append(rs) - continue - for ch in ru: - if ord(ch) < 0xD800: - _STORECHAR32(result, ord(ch), byteorder) - else: - errorhandler( - 'strict', public_encoding_name, - 'surrogates not allowed', s, pos - 1, pos) + if index != newindex: # Should be uncommon + index = newindex + pos = rutf8._pos_at_index(s, newindex) continue - if 0xD800 <= ch < 0xDC00 and MAXUNICODE < 65536 and pos < size: - ch2 = ord(s[pos]) - if 0xDC00 <= ch2 < 0xE000: - ch = (((ch & 0x3FF) << 10) | (ch2 & 0x3FF)) + 0x10000 - pos += 1 _STORECHAR32(result, ch, byteorder) + index += 1 return result.build() -def unicode_encode_utf_32(s, size, errors, +def utf8_encode_utf_32(s, errors, errorhandler=None, allow_surrogates=True): - return unicode_encode_utf_32_helper(s, size, errors, errorhandler, + return unicode_encode_utf_32_helper(s, errors, errorhandler, allow_surrogates, "native") -def unicode_encode_utf_32_be(s, size, errors, +def utf8_encode_utf_32_be(s, errors, errorhandler=None, allow_surrogates=True): - return unicode_encode_utf_32_helper(s, size, errors, errorhandler, + return unicode_encode_utf_32_helper(s, errors, errorhandler, allow_surrogates, "big") -def unicode_encode_utf_32_le(s, size, errors, _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit