Author: fijal Branch: unicode-utf8 Changeset: r92937:0c93ee971f62 Date: 2017-11-04 19:07 +0100 http://bitbucket.org/pypy/pypy/changeset/0c93ee971f62/
Log: first attempt at fixing the unicode surrogate mess diff --git a/TODO b/TODO --- a/TODO +++ b/TODO @@ -1,4 +1,3 @@ -* unskip tests in test_unicodeobject.py * rutf8.prev_codepoint_pos should use r_uint * find a better way to run "find" without creating the index storage, if one is not already readily available @@ -9,3 +8,4 @@ * find all the fast-paths that we want to do with utf8 (we only do utf-8 now, not UTF8 or utf8) for decode/encode * encode_error_handler has XXX +* reenable list strategies for ascii-only unicode diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py --- a/pypy/interpreter/pyparser/parsestring.py +++ b/pypy/interpreter/pyparser/parsestring.py @@ -72,8 +72,8 @@ substr = s[ps : q] if rawmode or '\\' not in s[ps:]: if need_encoding: - utf, lgt = unicodehelper.decode_utf8(space, substr) - w_u = space.newutf8(utf, lgt) + utf, (lgt, flag) = unicodehelper.decode_utf8(space, substr) + w_u = space.newutf8(utf, lgt, flag) w_v = unicodehelper.encode(space, w_u, encoding) return w_v else: diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -45,14 +45,14 @@ def _has_surrogate(u): for c in u: - if 0xDB80 <= ord(c) <= 0xCBFF or 0xD800 <= ord(c) <= 0xDB7F: + if 0xD800 <= ord(c) <= 0xDFFF: return True return False def _get_flag(u): flag = rutf8.FLAG_ASCII for c in u: - if 0xDB80 <= ord(c) <= 0xCBFF or 0xD800 <= ord(c) <= 0xDB7F: + if 0xD800 <= ord(c) <= 0xDFFF: return rutf8.FLAG_HAS_SURROGATES if ord(c) >= 0x80: flag = rutf8.FLAG_REGULAR @@ -143,7 +143,7 @@ def str_decode_ascii(s, slen, errors, final, errorhandler): try: rutf8.check_ascii(s) - return s, slen, len(s) + return s, slen, len(s), rutf8.FLAG_ASCII except rutf8.CheckError: w = DecodeWrapper((errorhandler)) u, pos = runicode.str_decode_ascii(s, slen, errors, final, w.handle) diff --git a/pypy/module/__builtin__/operation.py 
b/pypy/module/__builtin__/operation.py --- a/pypy/module/__builtin__/operation.py +++ b/pypy/module/__builtin__/operation.py @@ -30,8 +30,8 @@ raise oefmt(space.w_ValueError, "unichr() arg out of range") if code < 0x80: flag = rutf8.FLAG_ASCII - elif 0xDB80 <= code <= 0xCBFF or 0xD800 <= code <= 0xDB7F: - flag = rutf8.FLAG_HAS_SURROGATE + elif 0xD800 <= code <= 0xDFFF: + flag = rutf8.FLAG_HAS_SURROGATES else: flag = rutf8.FLAG_REGULAR return space.newutf8(s, 1, flag) diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py --- a/pypy/objspace/std/objspace.py +++ b/pypy/objspace/std/objspace.py @@ -516,8 +516,9 @@ return w_obj.listview_unicode() if type(w_obj) is W_SetObject or type(w_obj) is W_FrozensetObject: return w_obj.listview_unicode() - #if isinstance(w_obj, W_UnicodeObject) and self._uni_uses_no_iter(w_obj): - # return w_obj.listview_unicode() + if (isinstance(w_obj, W_UnicodeObject) and self._uni_uses_no_iter(w_obj) + and w_obj.is_ascii()): + return w_obj.listview_unicode() if isinstance(w_obj, W_ListObject) and self._uses_list_iter(w_obj): return w_obj.getitems_unicode() return None diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py --- a/pypy/objspace/std/test/test_unicodeobject.py +++ b/pypy/objspace/std/test/test_unicodeobject.py @@ -27,7 +27,6 @@ assert len(warnings) == 2 def test_listview_unicode(self): - py.test.skip("skip for new") w_str = self.space.newutf8('abcd', 4, rutf8.FLAG_ASCII) assert self.space.listview_unicode(w_str) == list(u"abcd") @@ -662,7 +661,6 @@ assert unicode('+AB', 'utf-7', 'replace') == u'\ufffd' def test_codecs_utf8(self): - skip("unskip this before merge") assert u''.encode('utf-8') == '' assert u'\u20ac'.encode('utf-8') == '\xe2\x82\xac' assert u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82' @@ -695,7 +693,6 @@ assert unicode('\xe2\x82\xac', 'utf-8') == u'\u20ac' def test_codecs_errors(self): - skip("some nonsense in handling of ignore and 
replace") # Error handling (encoding) raises(UnicodeError, u'Andr\202 x'.encode, 'ascii') raises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict') diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -93,6 +93,8 @@ return space.text_w(space.str(self)) def utf8_w(self, space): + if self._has_surrogates(): + return rutf8.reencode_utf8_with_surrogates(self._utf8) return self._utf8 def readbuf_w(self, space): @@ -115,8 +117,8 @@ charbuf_w = str_w def listview_unicode(self): - XXX # fix at some point - return _create_list_from_unicode(self._value) + assert self.is_ascii() + return _create_list_from_unicode(self._utf8) def ord(self, space): if self._len() != 1: @@ -410,7 +412,7 @@ "or unicode") try: if codepoint >= 0x80: - flag = self._combine_flags(flag, rutf8.FLAG_NORMAL) + flag = self._combine_flags(flag, rutf8.FLAG_REGULAR) rutf8.unichr_as_utf8_append(result, codepoint, allow_surrogates=True) result_length += 1 @@ -632,7 +634,7 @@ return rutf8.FLAG_REGULAR def _get_flag(self): - if self._is_ascii(): + if self.is_ascii(): return rutf8.FLAG_ASCII elif self._has_surrogates(): return rutf8.FLAG_HAS_SURROGATES @@ -977,7 +979,7 @@ end = rutf8.next_codepoint_pos(self._utf8, start) return W_UnicodeObject(self._utf8[start:end], 1, self._get_flag()) - def _is_ascii(self): + def is_ascii(self): return self._index_storage is rutf8.UTF8_IS_ASCII def _has_surrogates(self): @@ -986,7 +988,8 @@ self._index_storage.flag == rutf8.FLAG_HAS_SURROGATES)) def _index_to_byte(self, index): - if self._is_ascii(): + if self.is_ascii(): + assert index >= 0 return index return rutf8.codepoint_position_at_index( self._utf8, self._get_index_storage(), index) @@ -1195,7 +1198,7 @@ assert False, "always raises" return space.newbytes(s) if ((encoding is None and space.sys.defaultencoding == 'utf8') or - encoding == 'utf-8'): + encoding == 'utf-8' or encoding == 'utf8'): return 
space.newbytes(space.utf8_w(w_object)) if w_encoder is None: from pypy.module._codecs.interp_codecs import lookup_codec diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -388,6 +388,34 @@ assert pos - continuation_bytes >= 0 return pos - continuation_bytes, flag +def reencode_utf8_with_surrogates(utf8): + """ Receiving valid UTF8 which contains surrogates, combine surrogate + pairs into correct UTF8 with pairs collapsed. This is a rare case + and you should not be using surrogate pairs in the first place, + so the performance here is a bit secondary + """ + s = StringBuilder(len(utf8)) + stop = len(utf8) + i = 0 + while i < stop: + uchr = codepoint_at_pos(utf8, i) + if 0xD800 <= uchr <= 0xDBFF: + high = uchr + i = next_codepoint_pos(utf8, i) + if i >= stop: + unichr_as_utf8_append(s, uchr, True) + break + low = codepoint_at_pos(utf8, i) + if 0xDC00 <= low <= 0xDFFF: + uchr = 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00) + i = next_codepoint_pos(utf8, i) + # else not really a surrogate pair, just append high + else: + i = next_codepoint_pos(utf8, i) + unichr_as_utf8_append(s, uchr, True) + return s.build() + + @jit.elidable def codepoints_in_utf8(value, start=0, end=sys.maxint): """Return the number of codepoints in the UTF-8 byte string diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py --- a/rpython/rlib/test/test_rutf8.py +++ b/rpython/rlib/test/test_rutf8.py @@ -40,9 +40,7 @@ def _test_check_utf8(s, allow_surrogates): def _has_surrogates(s): for u in s.decode('utf8'): - if 0xD800 <= ord(u) <= 0xDB7F: - return True - if 0xDC00 <= ord(u) <= 0xDBFF: + if 0xD800 <= ord(u) <= 0xDFFF: return True return False _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit