Author: fijal Branch: unicode-utf8 Changeset: r93298:8d468e08f3fe Date: 2017-12-07 18:03 +0200 http://bitbucket.org/pypy/pypy/changeset/8d468e08f3fe/
Log: whack a few more places, handle surrogates correctly diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py --- a/pypy/objspace/std/formatting.py +++ b/pypy/objspace/std/formatting.py @@ -330,8 +330,7 @@ space = self.space if do_unicode: cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1) - flag = rutf8.get_flag_from_code(cp) - w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1, flag) + w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1) else: cp = ord(self.fmt[self.fmtpos - 1]) w_s = space.newbytes(chr(cp)) diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -170,7 +170,8 @@ def _istitle(self, ch): return unicodedb.isupper(ch) or unicodedb.istitle(ch) - def _isspace(self, ch): + @staticmethod + def _isspace(ch): return unicodedb.isspace(ch) def _isalpha(self, ch): @@ -188,8 +189,8 @@ def _iscased(self, ch): return unicodedb.iscased(ch) - def _islinebreak(self, s, pos): - return rutf8.islinebreak(s, pos) + def _islinebreak(self, ch): + return unicodedb.islinebreak(ch) @staticmethod @unwrap_spec(w_string=WrappedDefault("")) @@ -610,7 +611,7 @@ while pos < length: sol = pos lgt = 0 - while pos < length and not self._islinebreak(value, pos): + while pos < length and not self._islinebreak(rutf8.codepoint_at_pos(value, pos)): pos = rutf8.next_codepoint_pos(value, pos) lgt += 1 eol = pos @@ -792,7 +793,7 @@ if pos < 0: return space.newtuple([self, self._empty(), self._empty()]) else: - lgt, _ = rutf8.check_utf8(value, True, stop=pos) + lgt = rutf8.check_utf8(value, True, stop=pos) return space.newtuple( [W_UnicodeObject(value[0:pos], lgt), w_sub, W_UnicodeObject(value[pos + len(sub._utf8):len(value)], @@ -810,7 +811,7 @@ if pos < 0: return space.newtuple([self._empty(), self._empty(), self]) else: - lgt, _ = rutf8.check_utf8(value, True, stop=pos) + lgt = rutf8.check_utf8(value, True, stop=pos) return space.newtuple( 
[W_UnicodeObject(value[0:pos], lgt), w_sub, W_UnicodeObject(value[pos + len(sub._utf8):len(value)], @@ -1087,7 +1088,10 @@ return space.newbytes(s) if ((encoding is None and space.sys.defaultencoding == 'utf8') or encoding == 'utf-8' or encoding == 'utf8' or encoding == 'UTF-8'): - return space.newbytes(space.utf8_w(w_object)) + utf8 = space.utf8_w(w_object) + if rutf8.has_surrogates(utf8): + utf8 = rutf8.reencode_utf8_with_surrogates(utf8) + return space.newbytes(utf8) if w_encoder is None: from pypy.module._codecs.interp_codecs import lookup_codec w_encoder = space.getitem(lookup_codec(space, encoding), space.newint(0)) @@ -1728,14 +1732,12 @@ result = ['\0'] * w_unistr._length digits = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] - i = 0 res_pos = 0 - while i < len(unistr): - uchr = rutf8.codepoint_at_pos(unistr, i) - if rutf8.isspace(unistr, i): + iter = rutf8.Utf8StringIterator(unistr) + for uchr in iter: + if W_UnicodeObject._isspace(uchr): result[res_pos] = ' ' res_pos += 1 - i = rutf8.next_codepoint_pos(unistr, i) continue try: result[res_pos] = digits[unicodedb.decimal(uchr)] @@ -1744,14 +1746,14 @@ result[res_pos] = chr(uchr) else: w_encoding = space.newtext('decimal') - w_start = space.newint(i) - w_end = space.newint(i+1) + pos = iter.get_pos() + w_start = space.newint(pos) + w_end = space.newint(pos+1) w_reason = space.newtext('invalid decimal Unicode string') raise OperationError(space.w_UnicodeEncodeError, space.newtuple([w_encoding, w_unistr, w_start, w_end, w_reason])) - i = rutf8.next_codepoint_pos(unistr, i) res_pos += 1 return ''.join(result) diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py --- a/rpython/rlib/rutf8.py +++ b/rpython/rlib/rutf8.py @@ -410,6 +410,13 @@ assert pos - continuation_bytes >= 0 return pos - continuation_bytes +def has_surrogates(utf8): + # XXX write a faster version maybe + for ch in Utf8StringIterator(utf8): + if 0xD800 <= ch <= 0xDBFF: + return True + return False + def 
reencode_utf8_with_surrogates(utf8): """ Receiving valid UTF8 which contains surrogates, combine surrogate pairs into correct UTF8 with pairs collapsed. This is a rare case _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit